{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 80000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 0.8566444516181946, "learning_rate": 2.25e-07, "loss": 1.4972, "step": 10 }, { "grad_norm": 1.052078127861023, "learning_rate": 4.75e-07, "loss": 1.5293, "step": 20 }, { "grad_norm": 1.0558305978775024, "learning_rate": 7.25e-07, "loss": 1.5131, "step": 30 }, { "grad_norm": 0.8465989232063293, "learning_rate": 9.75e-07, "loss": 1.5034, "step": 40 }, { "grad_norm": 0.8464053273200989, "learning_rate": 1.2250000000000001e-06, "loss": 1.5043, "step": 50 }, { "grad_norm": 1.080320119857788, "learning_rate": 1.475e-06, "loss": 1.4823, "step": 60 }, { "grad_norm": 1.712898850440979, "learning_rate": 1.7250000000000002e-06, "loss": 1.4301, "step": 70 }, { "grad_norm": 2.5787534713745117, "learning_rate": 1.975e-06, "loss": 1.3559, "step": 80 }, { "grad_norm": 1.4710171222686768, "learning_rate": 2.225e-06, "loss": 1.3244, "step": 90 }, { "grad_norm": 1.439327359199524, "learning_rate": 2.4750000000000004e-06, "loss": 1.3007, "step": 100 }, { "grad_norm": 1.5541735887527466, "learning_rate": 2.725e-06, "loss": 1.2361, "step": 110 }, { "grad_norm": 1.6562047004699707, "learning_rate": 2.975e-06, "loss": 1.1969, "step": 120 }, { "grad_norm": 1.7488754987716675, "learning_rate": 3.225e-06, "loss": 1.1812, "step": 130 }, { "grad_norm": 1.671818494796753, "learning_rate": 3.4750000000000006e-06, "loss": 1.1436, "step": 140 }, { "grad_norm": 1.5163180828094482, "learning_rate": 3.725e-06, "loss": 1.1363, "step": 150 }, { "grad_norm": 2.0778310298919678, "learning_rate": 3.975e-06, "loss": 1.0874, "step": 160 }, { "grad_norm": 1.4043035507202148, "learning_rate": 4.225e-06, "loss": 1.1041, "step": 170 }, { "grad_norm": 1.5369722843170166, "learning_rate": 4.475e-06, "loss": 1.1117, "step": 180 }, { "grad_norm": 1.5060052871704102, "learning_rate": 4.7250000000000005e-06, "loss": 1.099, "step": 190 }, { "grad_norm": 1.527758240699768, "learning_rate": 4.975000000000001e-06, "loss": 1.0867, "step": 200 }, { "grad_norm": 1.6469988822937012, "learning_rate": 5.225e-06, "loss": 1.0625, "step": 210 }, { "grad_norm": 1.411040186882019, "learning_rate": 5.475e-06, "loss": 1.0919, "step": 220 }, { "grad_norm": 1.713234305381775, "learning_rate": 5.725e-06, "loss": 1.0933, "step": 230 }, { "grad_norm": 1.3267306089401245, "learning_rate": 5.975e-06, "loss": 1.0634, "step": 240 }, { "grad_norm": 1.4911235570907593, "learning_rate": 6.2250000000000005e-06, "loss": 1.0539, "step": 250 }, { "grad_norm": 1.3073464632034302, "learning_rate": 6.475000000000001e-06, "loss": 1.0539, "step": 260 }, { "grad_norm": 1.7743191719055176, "learning_rate": 6.725000000000001e-06, "loss": 1.0706, "step": 270 }, { "grad_norm": 1.1513415575027466, "learning_rate": 6.975000000000001e-06, "loss": 1.0641, "step": 280 }, { "grad_norm": 1.8084874153137207, "learning_rate": 7.2249999999999994e-06, "loss": 1.0713, "step": 290 }, { "grad_norm": 1.5587002038955688, "learning_rate": 7.4750000000000004e-06, "loss": 1.0756, "step": 300 }, { "grad_norm": 1.5197229385375977, "learning_rate": 7.725e-06, "loss": 1.0702, "step": 310 }, { "grad_norm": 1.256421446800232, "learning_rate": 7.975e-06, "loss": 1.0808, "step": 320 }, { "grad_norm": 1.3807047605514526, "learning_rate": 8.225e-06, "loss": 1.0635, "step": 330 }, { "grad_norm": 1.3026821613311768, "learning_rate": 8.475000000000001e-06, "loss": 1.0538, "step": 340 }, { "grad_norm": 1.6104710102081299, "learning_rate": 8.725e-06, "loss": 1.052, "step": 350 }, { "grad_norm": 4.900639057159424, "learning_rate": 8.975e-06, "loss": 1.0462, "step": 360 }, { "grad_norm": 1.2930063009262085, "learning_rate": 9.225e-06, "loss": 1.0507, "step": 370 }, { "grad_norm": 1.373496413230896, "learning_rate": 9.475e-06, "loss": 1.0593, "step": 380 }, { "grad_norm": 1.0941420793533325, "learning_rate": 9.725000000000001e-06, "loss": 1.0342, "step": 390 }, { "grad_norm": 1.3661017417907715, "learning_rate": 9.975e-06, "loss": 1.0455, "step": 400 }, { "grad_norm": 1.6110551357269287, "learning_rate": 1.0225e-05, "loss": 1.0295, "step": 410 }, { "grad_norm": 1.2054811716079712, "learning_rate": 1.0475e-05, "loss": 1.0319, "step": 420 }, { "grad_norm": 1.7380324602127075, "learning_rate": 1.0725e-05, "loss": 1.0048, "step": 430 }, { "grad_norm": 1.737931728363037, "learning_rate": 1.0975e-05, "loss": 0.9872, "step": 440 }, { "grad_norm": 2.165952682495117, "learning_rate": 1.1225e-05, "loss": 0.9469, "step": 450 }, { "grad_norm": 2.0361502170562744, "learning_rate": 1.1475000000000001e-05, "loss": 0.9242, "step": 460 }, { "grad_norm": 2.1341609954833984, "learning_rate": 1.1725e-05, "loss": 0.9051, "step": 470 }, { "grad_norm": 2.3306849002838135, "learning_rate": 1.1975e-05, "loss": 0.8751, "step": 480 }, { "grad_norm": 2.3624191284179688, "learning_rate": 1.2225e-05, "loss": 0.8293, "step": 490 }, { "grad_norm": 2.481816053390503, "learning_rate": 1.2475e-05, "loss": 0.8011, "step": 500 }, { "grad_norm": 2.2493226528167725, "learning_rate": 1.2725000000000001e-05, "loss": 0.7819, "step": 510 }, { "grad_norm": 2.22247314453125, "learning_rate": 1.2975e-05, "loss": 0.7436, "step": 520 }, { "grad_norm": 2.200357675552368, "learning_rate": 1.3225000000000001e-05, "loss": 0.7195, "step": 530 }, { "grad_norm": 2.189655303955078, "learning_rate": 1.3475000000000002e-05, "loss": 0.6537, "step": 540 }, { "grad_norm": 2.2234089374542236, "learning_rate": 1.3725000000000002e-05, "loss": 0.6663, "step": 550 }, { "grad_norm": 2.5929200649261475, "learning_rate": 1.3975000000000003e-05, "loss": 0.6565, "step": 560 }, { "grad_norm": 2.4045796394348145, "learning_rate": 1.4225e-05, "loss": 0.5968, "step": 570 }, { "grad_norm": 2.8181729316711426, "learning_rate": 1.4475e-05, "loss": 0.5653, "step": 580 }, { "grad_norm": 2.589650869369507, "learning_rate": 1.4725e-05, "loss": 0.5644, "step": 590 }, { "grad_norm": 2.329069137573242, "learning_rate": 1.4975e-05, "loss": 0.555, "step": 600 }, { "grad_norm": 2.7081966400146484, "learning_rate": 1.5225e-05, "loss": 0.5345, "step": 610 }, { "grad_norm": 3.20296573638916, "learning_rate": 1.5475e-05, "loss": 0.53, "step": 620 }, { "grad_norm": 2.572547674179077, "learning_rate": 1.5725e-05, "loss": 0.4933, "step": 630 }, { "grad_norm": 2.887559652328491, "learning_rate": 1.5975000000000002e-05, "loss": 0.478, "step": 640 }, { "grad_norm": 2.592221260070801, "learning_rate": 1.6225e-05, "loss": 0.4475, "step": 650 }, { "grad_norm": 3.0392093658447266, "learning_rate": 1.6475e-05, "loss": 0.4481, "step": 660 }, { "grad_norm": 2.204176425933838, "learning_rate": 1.6725000000000003e-05, "loss": 0.4116, "step": 670 }, { "grad_norm": 2.795159101486206, "learning_rate": 1.6975000000000003e-05, "loss": 0.4124, "step": 680 }, { "grad_norm": 2.620204448699951, "learning_rate": 1.7225e-05, "loss": 0.4035, "step": 690 }, { "grad_norm": 2.677187204360962, "learning_rate": 1.7475e-05, "loss": 0.4086, "step": 700 }, { "grad_norm": 2.485642671585083, "learning_rate": 1.7725e-05, "loss": 0.3386, "step": 710 }, { "grad_norm": 3.008882999420166, "learning_rate": 1.7975e-05, "loss": 0.3929, "step": 720 }, { "grad_norm": 2.5245120525360107, "learning_rate": 1.8225e-05, "loss": 0.347, "step": 730 }, { "grad_norm": 2.4516313076019287, "learning_rate": 1.8475000000000002e-05, "loss": 0.3054, "step": 740 }, { "grad_norm": 2.6255953311920166, "learning_rate": 1.8725e-05, "loss": 0.3227, "step": 750 }, { "grad_norm": 2.6038966178894043, "learning_rate": 1.8975e-05, "loss": 0.3065, "step": 760 }, { "grad_norm": 2.0367722511291504, "learning_rate": 1.9225e-05, "loss": 0.2999, "step": 770 }, { "grad_norm": 3.137856960296631, "learning_rate": 1.9475000000000002e-05, "loss": 0.3105, "step": 780 }, { "grad_norm": 2.818552017211914, "learning_rate": 1.9725000000000002e-05, "loss": 0.2559, "step": 790 }, { "grad_norm": 2.6628899574279785, "learning_rate": 1.9975e-05, "loss": 0.2394, "step": 800 }, { "grad_norm": 2.6328117847442627, "learning_rate": 2.0225000000000004e-05, "loss": 0.247, "step": 810 }, { "grad_norm": 2.2496070861816406, "learning_rate": 2.0475e-05, "loss": 0.2474, "step": 820 }, { "grad_norm": 2.1063759326934814, "learning_rate": 2.0725e-05, "loss": 0.2593, "step": 830 }, { "grad_norm": 2.586805582046509, "learning_rate": 2.0975e-05, "loss": 0.2318, "step": 840 }, { "grad_norm": 2.083897113800049, "learning_rate": 2.1225e-05, "loss": 0.2307, "step": 850 }, { "grad_norm": 2.946162223815918, "learning_rate": 2.1475e-05, "loss": 0.2096, "step": 860 }, { "grad_norm": 2.0351502895355225, "learning_rate": 2.1725e-05, "loss": 0.2255, "step": 870 }, { "grad_norm": 2.9062485694885254, "learning_rate": 2.1975000000000002e-05, "loss": 0.2063, "step": 880 }, { "grad_norm": 2.3277430534362793, "learning_rate": 2.2225e-05, "loss": 0.2017, "step": 890 }, { "grad_norm": 2.0139076709747314, "learning_rate": 2.2475e-05, "loss": 0.2128, "step": 900 }, { "grad_norm": 2.68556547164917, "learning_rate": 2.2725000000000003e-05, "loss": 0.1955, "step": 910 }, { "grad_norm": 1.8197780847549438, "learning_rate": 2.2975000000000003e-05, "loss": 0.18, "step": 920 }, { "grad_norm": 3.055431365966797, "learning_rate": 2.3225000000000002e-05, "loss": 0.2008, "step": 930 }, { "grad_norm": 2.5528318881988525, "learning_rate": 2.3475e-05, "loss": 0.1844, "step": 940 }, { "grad_norm": 1.8346315622329712, "learning_rate": 2.3725e-05, "loss": 0.1686, "step": 950 }, { "grad_norm": 2.5486485958099365, "learning_rate": 2.3975e-05, "loss": 0.1781, "step": 960 }, { "grad_norm": 2.367338180541992, "learning_rate": 2.4225e-05, "loss": 0.2171, "step": 970 }, { "grad_norm": 1.5607470273971558, "learning_rate": 2.4475000000000002e-05, "loss": 0.1736, "step": 980 }, { "grad_norm": 2.464700222015381, "learning_rate": 2.4725e-05, "loss": 0.1471, "step": 990 }, { "grad_norm": 2.3309032917022705, "learning_rate": 2.4975e-05, "loss": 0.1603, "step": 1000 }, { "grad_norm": 1.7804707288742065, "learning_rate": 2.5225e-05, "loss": 0.1626, "step": 1010 }, { "grad_norm": 2.502549886703491, "learning_rate": 2.5475e-05, "loss": 0.2055, "step": 1020 }, { "grad_norm": 2.2387709617614746, "learning_rate": 2.5725e-05, "loss": 0.1497, "step": 1030 }, { "grad_norm": 2.6627652645111084, "learning_rate": 2.5974999999999998e-05, "loss": 0.1657, "step": 1040 }, { "grad_norm": 2.499324083328247, "learning_rate": 2.6225e-05, "loss": 0.1753, "step": 1050 }, { "grad_norm": 1.8046517372131348, "learning_rate": 2.6475e-05, "loss": 0.1813, "step": 1060 }, { "grad_norm": 2.121721029281616, "learning_rate": 2.6725e-05, "loss": 0.1992, "step": 1070 }, { "grad_norm": 2.066898822784424, "learning_rate": 2.6975000000000002e-05, "loss": 0.1931, "step": 1080 }, { "grad_norm": 2.096219301223755, "learning_rate": 2.7225e-05, "loss": 0.183, "step": 1090 }, { "grad_norm": 2.2575268745422363, "learning_rate": 2.7475e-05, "loss": 0.1368, "step": 1100 }, { "grad_norm": 2.472886800765991, "learning_rate": 2.7725e-05, "loss": 0.2119, "step": 1110 }, { "grad_norm": 1.5364497900009155, "learning_rate": 2.7975000000000002e-05, "loss": 0.1383, "step": 1120 }, { "grad_norm": 1.7759634256362915, "learning_rate": 2.8225e-05, "loss": 0.1641, "step": 1130 }, { "grad_norm": 2.386911630630493, "learning_rate": 2.8475e-05, "loss": 0.2093, "step": 1140 }, { "grad_norm": 2.0811383724212646, "learning_rate": 2.8725e-05, "loss": 0.2033, "step": 1150 }, { "grad_norm": 1.8869506120681763, "learning_rate": 2.8975000000000003e-05, "loss": 0.1645, "step": 1160 }, { "grad_norm": 1.5703043937683105, "learning_rate": 2.9225000000000002e-05, "loss": 0.1759, "step": 1170 }, { "grad_norm": 1.8086001873016357, "learning_rate": 2.9475e-05, "loss": 0.1445, "step": 1180 }, { "grad_norm": 1.9723494052886963, "learning_rate": 2.9725000000000004e-05, "loss": 0.1722, "step": 1190 }, { "grad_norm": 3.039930820465088, "learning_rate": 2.9975000000000004e-05, "loss": 0.1749, "step": 1200 }, { "grad_norm": 3.072293758392334, "learning_rate": 3.0225000000000003e-05, "loss": 0.193, "step": 1210 }, { "grad_norm": 1.9556934833526611, "learning_rate": 3.0475000000000002e-05, "loss": 0.1759, "step": 1220 }, { "grad_norm": 2.0926413536071777, "learning_rate": 3.0725e-05, "loss": 0.1697, "step": 1230 }, { "grad_norm": 1.7194592952728271, "learning_rate": 3.0975e-05, "loss": 0.1802, "step": 1240 }, { "grad_norm": 2.392340660095215, "learning_rate": 3.122500000000001e-05, "loss": 0.1591, "step": 1250 }, { "grad_norm": 1.9104856252670288, "learning_rate": 3.1475e-05, "loss": 0.1626, "step": 1260 }, { "grad_norm": 1.8004789352416992, "learning_rate": 3.1725e-05, "loss": 0.1458, "step": 1270 }, { "grad_norm": 1.1673132181167603, "learning_rate": 3.1975e-05, "loss": 0.1585, "step": 1280 }, { "grad_norm": 1.5528271198272705, "learning_rate": 3.2225e-05, "loss": 0.1724, "step": 1290 }, { "grad_norm": 1.7146399021148682, "learning_rate": 3.2474999999999997e-05, "loss": 0.1507, "step": 1300 }, { "grad_norm": 1.9470059871673584, "learning_rate": 3.2725e-05, "loss": 0.1508, "step": 1310 }, { "grad_norm": 1.6733075380325317, "learning_rate": 3.2975e-05, "loss": 0.1575, "step": 1320 }, { "grad_norm": 1.4112375974655151, "learning_rate": 3.3225e-05, "loss": 0.1326, "step": 1330 }, { "grad_norm": 1.9768624305725098, "learning_rate": 3.3475e-05, "loss": 0.133, "step": 1340 }, { "grad_norm": 2.3789350986480713, "learning_rate": 3.3725e-05, "loss": 0.1777, "step": 1350 }, { "grad_norm": 2.076359748840332, "learning_rate": 3.3975e-05, "loss": 0.1511, "step": 1360 }, { "grad_norm": 2.0208630561828613, "learning_rate": 3.4225e-05, "loss": 0.1362, "step": 1370 }, { "grad_norm": 2.2057301998138428, "learning_rate": 3.4475000000000005e-05, "loss": 0.1336, "step": 1380 }, { "grad_norm": 2.3049211502075195, "learning_rate": 3.4725000000000004e-05, "loss": 0.151, "step": 1390 }, { "grad_norm": 1.7244722843170166, "learning_rate": 3.4975e-05, "loss": 0.1358, "step": 1400 }, { "grad_norm": 1.30256187915802, "learning_rate": 3.5225e-05, "loss": 0.1469, "step": 1410 }, { "grad_norm": 1.9231311082839966, "learning_rate": 3.5475e-05, "loss": 0.1437, "step": 1420 }, { "grad_norm": 1.4121044874191284, "learning_rate": 3.5725e-05, "loss": 0.1209, "step": 1430 }, { "grad_norm": 1.7647475004196167, "learning_rate": 3.5975e-05, "loss": 0.1502, "step": 1440 }, { "grad_norm": 1.3415299654006958, "learning_rate": 3.6225000000000006e-05, "loss": 0.1644, "step": 1450 }, { "grad_norm": 1.5529747009277344, "learning_rate": 3.6475000000000006e-05, "loss": 0.1505, "step": 1460 }, { "grad_norm": 1.4107692241668701, "learning_rate": 3.6725000000000005e-05, "loss": 0.1492, "step": 1470 }, { "grad_norm": 1.5124170780181885, "learning_rate": 3.6975000000000004e-05, "loss": 0.1303, "step": 1480 }, { "grad_norm": 1.4635210037231445, "learning_rate": 3.7225000000000004e-05, "loss": 0.1435, "step": 1490 }, { "grad_norm": 1.0458636283874512, "learning_rate": 3.7475e-05, "loss": 0.1516, "step": 1500 }, { "grad_norm": 1.691277027130127, "learning_rate": 3.7725e-05, "loss": 0.1212, "step": 1510 }, { "grad_norm": 1.855945110321045, "learning_rate": 3.7975e-05, "loss": 0.1377, "step": 1520 }, { "grad_norm": 1.3603715896606445, "learning_rate": 3.8225e-05, "loss": 0.1492, "step": 1530 }, { "grad_norm": 1.5935473442077637, "learning_rate": 3.8475e-05, "loss": 0.1283, "step": 1540 }, { "grad_norm": 1.614700198173523, "learning_rate": 3.8725e-05, "loss": 0.1315, "step": 1550 }, { "grad_norm": 1.11336088180542, "learning_rate": 3.8975e-05, "loss": 0.1625, "step": 1560 }, { "grad_norm": 1.5134731531143188, "learning_rate": 3.9225e-05, "loss": 0.1266, "step": 1570 }, { "grad_norm": 1.4739716053009033, "learning_rate": 3.9475000000000004e-05, "loss": 0.1378, "step": 1580 }, { "grad_norm": 1.5415149927139282, "learning_rate": 3.9725e-05, "loss": 0.146, "step": 1590 }, { "grad_norm": 1.203621745109558, "learning_rate": 3.9975e-05, "loss": 0.1242, "step": 1600 }, { "grad_norm": 1.371915578842163, "learning_rate": 4.0225e-05, "loss": 0.1188, "step": 1610 }, { "grad_norm": 1.548456072807312, "learning_rate": 4.0475e-05, "loss": 0.1484, "step": 1620 }, { "grad_norm": 1.4837747812271118, "learning_rate": 4.0725e-05, "loss": 0.1837, "step": 1630 }, { "grad_norm": 1.2722891569137573, "learning_rate": 4.0975e-05, "loss": 0.141, "step": 1640 }, { "grad_norm": 1.508676290512085, "learning_rate": 4.1225e-05, "loss": 0.1512, "step": 1650 }, { "grad_norm": 1.4114471673965454, "learning_rate": 4.1475000000000005e-05, "loss": 0.1463, "step": 1660 }, { "grad_norm": 1.0855860710144043, "learning_rate": 4.1725000000000005e-05, "loss": 0.1172, "step": 1670 }, { "grad_norm": 1.4499181509017944, "learning_rate": 4.1975000000000004e-05, "loss": 0.124, "step": 1680 }, { "grad_norm": 1.3984365463256836, "learning_rate": 4.2225e-05, "loss": 0.1274, "step": 1690 }, { "grad_norm": 1.5261346101760864, "learning_rate": 4.2475e-05, "loss": 0.1395, "step": 1700 }, { "grad_norm": 1.8622990846633911, "learning_rate": 4.2725e-05, "loss": 0.171, "step": 1710 }, { "grad_norm": 1.4312527179718018, "learning_rate": 4.2975e-05, "loss": 0.1472, "step": 1720 }, { "grad_norm": 1.1331348419189453, "learning_rate": 4.322500000000001e-05, "loss": 0.1408, "step": 1730 }, { "grad_norm": 1.5758891105651855, "learning_rate": 4.3475000000000006e-05, "loss": 0.1364, "step": 1740 }, { "grad_norm": 1.5547208786010742, "learning_rate": 4.3725000000000006e-05, "loss": 0.1442, "step": 1750 }, { "grad_norm": 1.650541067123413, "learning_rate": 4.3975e-05, "loss": 0.1337, "step": 1760 }, { "grad_norm": 1.2564619779586792, "learning_rate": 4.4225e-05, "loss": 0.1294, "step": 1770 }, { "grad_norm": 1.4697320461273193, "learning_rate": 4.4475e-05, "loss": 0.1482, "step": 1780 }, { "grad_norm": 1.2501603364944458, "learning_rate": 4.4725e-05, "loss": 0.123, "step": 1790 }, { "grad_norm": 1.3848594427108765, "learning_rate": 4.4975e-05, "loss": 0.1359, "step": 1800 }, { "grad_norm": 1.213675618171692, "learning_rate": 4.5225e-05, "loss": 0.1284, "step": 1810 }, { "grad_norm": 1.4491252899169922, "learning_rate": 4.5475e-05, "loss": 0.1431, "step": 1820 }, { "grad_norm": 1.994454026222229, "learning_rate": 4.5725e-05, "loss": 0.1426, "step": 1830 }, { "grad_norm": 1.155197024345398, "learning_rate": 4.5975e-05, "loss": 0.1443, "step": 1840 }, { "grad_norm": 1.3587158918380737, "learning_rate": 4.6225e-05, "loss": 0.1574, "step": 1850 }, { "grad_norm": 1.7028276920318604, "learning_rate": 4.6475000000000005e-05, "loss": 0.1666, "step": 1860 }, { "grad_norm": 1.031579613685608, "learning_rate": 4.6725000000000004e-05, "loss": 0.1312, "step": 1870 }, { "grad_norm": 1.0434411764144897, "learning_rate": 4.6975000000000003e-05, "loss": 0.1153, "step": 1880 }, { "grad_norm": 1.3154933452606201, "learning_rate": 4.7225e-05, "loss": 0.1362, "step": 1890 }, { "grad_norm": 1.3980038166046143, "learning_rate": 4.7475e-05, "loss": 0.1275, "step": 1900 }, { "grad_norm": 1.6730084419250488, "learning_rate": 4.7725e-05, "loss": 0.168, "step": 1910 }, { "grad_norm": 1.2374154329299927, "learning_rate": 4.7975e-05, "loss": 0.1393, "step": 1920 }, { "grad_norm": 1.1759912967681885, "learning_rate": 4.822500000000001e-05, "loss": 0.1092, "step": 1930 }, { "grad_norm": 1.6462161540985107, "learning_rate": 4.8475000000000006e-05, "loss": 0.1338, "step": 1940 }, { "grad_norm": 1.1983445882797241, "learning_rate": 4.8725000000000005e-05, "loss": 0.1134, "step": 1950 }, { "grad_norm": 1.4946386814117432, "learning_rate": 4.8975000000000005e-05, "loss": 0.1545, "step": 1960 }, { "grad_norm": 1.4756296873092651, "learning_rate": 4.9225000000000004e-05, "loss": 0.1075, "step": 1970 }, { "grad_norm": 1.287211298942566, "learning_rate": 4.9475e-05, "loss": 0.1474, "step": 1980 }, { "grad_norm": 1.1569725275039673, "learning_rate": 4.9725e-05, "loss": 0.1109, "step": 1990 }, { "grad_norm": 1.2647931575775146, "learning_rate": 4.9975e-05, "loss": 0.1313, "step": 2000 }, { "grad_norm": 1.1894464492797852, "learning_rate": 5.0225e-05, "loss": 0.1673, "step": 2010 }, { "grad_norm": 1.6426259279251099, "learning_rate": 5.047500000000001e-05, "loss": 0.1357, "step": 2020 }, { "grad_norm": 1.2882579565048218, "learning_rate": 5.0725e-05, "loss": 0.1127, "step": 2030 }, { "grad_norm": 1.1565042734146118, "learning_rate": 5.0975000000000006e-05, "loss": 0.1352, "step": 2040 }, { "grad_norm": 1.435943365097046, "learning_rate": 5.1225e-05, "loss": 0.134, "step": 2050 }, { "grad_norm": 1.309027075767517, "learning_rate": 5.1475000000000004e-05, "loss": 0.124, "step": 2060 }, { "grad_norm": 0.8613530397415161, "learning_rate": 5.1725000000000004e-05, "loss": 0.1111, "step": 2070 }, { "grad_norm": 1.4724136590957642, "learning_rate": 5.197500000000001e-05, "loss": 0.1231, "step": 2080 }, { "grad_norm": 1.253044605255127, "learning_rate": 5.2225e-05, "loss": 0.1337, "step": 2090 }, { "grad_norm": 1.290144681930542, "learning_rate": 5.247500000000001e-05, "loss": 0.1263, "step": 2100 }, { "grad_norm": 0.9221688508987427, "learning_rate": 5.2725e-05, "loss": 0.1523, "step": 2110 }, { "grad_norm": 1.2253038883209229, "learning_rate": 5.297500000000001e-05, "loss": 0.1256, "step": 2120 }, { "grad_norm": 1.0585968494415283, "learning_rate": 5.3225e-05, "loss": 0.1168, "step": 2130 }, { "grad_norm": 0.9900386333465576, "learning_rate": 5.3475e-05, "loss": 0.1153, "step": 2140 }, { "grad_norm": 1.4635428190231323, "learning_rate": 5.3725000000000005e-05, "loss": 0.1579, "step": 2150 }, { "grad_norm": 1.2288469076156616, "learning_rate": 5.3975e-05, "loss": 0.1292, "step": 2160 }, { "grad_norm": 1.117184042930603, "learning_rate": 5.4225000000000003e-05, "loss": 0.1192, "step": 2170 }, { "grad_norm": 1.2069185972213745, "learning_rate": 5.4474999999999996e-05, "loss": 0.1381, "step": 2180 }, { "grad_norm": 1.1388211250305176, "learning_rate": 5.4725e-05, "loss": 0.1321, "step": 2190 }, { "grad_norm": 1.4022496938705444, "learning_rate": 5.4975e-05, "loss": 0.1261, "step": 2200 }, { "grad_norm": 1.0118855237960815, "learning_rate": 5.522500000000001e-05, "loss": 0.1296, "step": 2210 }, { "grad_norm": 1.197070837020874, "learning_rate": 5.5475e-05, "loss": 0.116, "step": 2220 }, { "grad_norm": 1.015507698059082, "learning_rate": 5.5725000000000006e-05, "loss": 0.1127, "step": 2230 }, { "grad_norm": 1.0297216176986694, "learning_rate": 5.5975e-05, "loss": 0.1237, "step": 2240 }, { "grad_norm": 1.2062488794326782, "learning_rate": 5.6225000000000005e-05, "loss": 0.1319, "step": 2250 }, { "grad_norm": 1.00223708152771, "learning_rate": 5.6475e-05, "loss": 0.1192, "step": 2260 }, { "grad_norm": 0.9938284158706665, "learning_rate": 5.6725e-05, "loss": 0.1161, "step": 2270 }, { "grad_norm": 1.0867445468902588, "learning_rate": 5.6975e-05, "loss": 0.1247, "step": 2280 }, { "grad_norm": 1.4874166250228882, "learning_rate": 5.722500000000001e-05, "loss": 0.1562, "step": 2290 }, { "grad_norm": 1.1856448650360107, "learning_rate": 5.7475e-05, "loss": 0.1144, "step": 2300 }, { "grad_norm": 0.9583752751350403, "learning_rate": 5.772500000000001e-05, "loss": 0.1167, "step": 2310 }, { "grad_norm": 0.729552149772644, "learning_rate": 5.7975e-05, "loss": 0.1, "step": 2320 }, { "grad_norm": 1.2534377574920654, "learning_rate": 5.8225000000000006e-05, "loss": 0.1343, "step": 2330 }, { "grad_norm": 0.8592886328697205, "learning_rate": 5.8475000000000005e-05, "loss": 0.1206, "step": 2340 }, { "grad_norm": 1.1068272590637207, "learning_rate": 5.8725000000000004e-05, "loss": 0.1252, "step": 2350 }, { "grad_norm": 0.9235677123069763, "learning_rate": 5.8975000000000004e-05, "loss": 0.1373, "step": 2360 }, { "grad_norm": 1.0849090814590454, "learning_rate": 5.922500000000001e-05, "loss": 0.1322, "step": 2370 }, { "grad_norm": 1.2575007677078247, "learning_rate": 5.9475e-05, "loss": 0.1266, "step": 2380 }, { "grad_norm": 1.0506086349487305, "learning_rate": 5.9724999999999995e-05, "loss": 0.1371, "step": 2390 }, { "grad_norm": 0.824508786201477, "learning_rate": 5.9975e-05, "loss": 0.1325, "step": 2400 }, { "grad_norm": 1.0877012014389038, "learning_rate": 6.0225e-05, "loss": 0.13, "step": 2410 }, { "grad_norm": 1.1890604496002197, "learning_rate": 6.0475000000000006e-05, "loss": 0.1604, "step": 2420 }, { "grad_norm": 1.2111625671386719, "learning_rate": 6.0725e-05, "loss": 0.1198, "step": 2430 }, { "grad_norm": 1.2719937562942505, "learning_rate": 6.0975000000000005e-05, "loss": 0.135, "step": 2440 }, { "grad_norm": 0.9964267015457153, "learning_rate": 6.1225e-05, "loss": 0.102, "step": 2450 }, { "grad_norm": 0.8452976942062378, "learning_rate": 6.1475e-05, "loss": 0.1131, "step": 2460 }, { "grad_norm": 1.1771998405456543, "learning_rate": 6.1725e-05, "loss": 0.1221, "step": 2470 }, { "grad_norm": 1.0269160270690918, "learning_rate": 6.1975e-05, "loss": 0.1475, "step": 2480 }, { "grad_norm": 1.1866605281829834, "learning_rate": 6.2225e-05, "loss": 0.1148, "step": 2490 }, { "grad_norm": 0.9720594882965088, "learning_rate": 6.2475e-05, "loss": 0.1027, "step": 2500 }, { "grad_norm": 0.9526267051696777, "learning_rate": 6.2725e-05, "loss": 0.106, "step": 2510 }, { "grad_norm": 0.7789490818977356, "learning_rate": 6.297500000000001e-05, "loss": 0.111, "step": 2520 }, { "grad_norm": 0.9073293805122375, "learning_rate": 6.3225e-05, "loss": 0.1099, "step": 2530 }, { "grad_norm": 0.9686797857284546, "learning_rate": 6.347500000000001e-05, "loss": 0.1169, "step": 2540 }, { "grad_norm": 1.067726731300354, "learning_rate": 6.3725e-05, "loss": 0.1229, "step": 2550 }, { "grad_norm": 1.0729447603225708, "learning_rate": 6.397500000000001e-05, "loss": 0.1354, "step": 2560 }, { "grad_norm": 1.0141679048538208, "learning_rate": 6.4225e-05, "loss": 0.1155, "step": 2570 }, { "grad_norm": 0.9481139779090881, "learning_rate": 6.447500000000001e-05, "loss": 0.1198, "step": 2580 }, { "grad_norm": 0.8647817373275757, "learning_rate": 6.4725e-05, "loss": 0.1098, "step": 2590 }, { "grad_norm": 0.7792224287986755, "learning_rate": 6.497500000000001e-05, "loss": 0.127, "step": 2600 }, { "grad_norm": 0.9152771234512329, "learning_rate": 6.5225e-05, "loss": 0.1205, "step": 2610 }, { "grad_norm": 0.8536548018455505, "learning_rate": 6.5475e-05, "loss": 0.107, "step": 2620 }, { "grad_norm": 1.120984673500061, "learning_rate": 6.5725e-05, "loss": 0.1009, "step": 2630 }, { "grad_norm": 0.9958048462867737, "learning_rate": 6.5975e-05, "loss": 0.1292, "step": 2640 }, { "grad_norm": 0.8403273820877075, "learning_rate": 6.6225e-05, "loss": 0.1241, "step": 2650 }, { "grad_norm": 0.9059581756591797, "learning_rate": 6.6475e-05, "loss": 0.1224, "step": 2660 }, { "grad_norm": 1.0263863801956177, "learning_rate": 6.672500000000001e-05, "loss": 0.1386, "step": 2670 }, { "grad_norm": 1.0336647033691406, "learning_rate": 6.6975e-05, "loss": 0.162, "step": 2680 }, { "grad_norm": 0.7713290452957153, "learning_rate": 6.722500000000001e-05, "loss": 0.1268, "step": 2690 }, { "grad_norm": 1.0892807245254517, "learning_rate": 6.7475e-05, "loss": 0.1081, "step": 2700 }, { "grad_norm": 0.7608943581581116, "learning_rate": 6.7725e-05, "loss": 0.1162, "step": 2710 }, { "grad_norm": 0.9948204159736633, "learning_rate": 6.7975e-05, "loss": 0.1507, "step": 2720 }, { "grad_norm": 1.1813359260559082, "learning_rate": 6.8225e-05, "loss": 0.1212, "step": 2730 }, { "grad_norm": 0.6962792873382568, "learning_rate": 6.8475e-05, "loss": 0.1141, "step": 2740 }, { "grad_norm": 0.8911656141281128, "learning_rate": 6.8725e-05, "loss": 0.1209, "step": 2750 }, { "grad_norm": 0.9426058530807495, "learning_rate": 6.8975e-05, "loss": 0.11, "step": 2760 }, { "grad_norm": 0.8397668600082397, "learning_rate": 6.9225e-05, "loss": 0.121, "step": 2770 }, { "grad_norm": 0.7752172350883484, "learning_rate": 6.9475e-05, "loss": 0.1237, "step": 2780 }, { "grad_norm": 0.9767682552337646, "learning_rate": 6.9725e-05, "loss": 0.1244, "step": 2790 }, { "grad_norm": 0.8171544075012207, "learning_rate": 6.997500000000001e-05, "loss": 0.109, "step": 2800 }, { "grad_norm": 1.1246387958526611, "learning_rate": 7.022500000000001e-05, "loss": 0.1506, "step": 2810 }, { "grad_norm": 1.2448110580444336, "learning_rate": 7.0475e-05, "loss": 0.1191, "step": 2820 }, { "grad_norm": 0.7032856941223145, "learning_rate": 7.072500000000001e-05, "loss": 0.109, "step": 2830 }, { "grad_norm": 0.8576553463935852, "learning_rate": 7.0975e-05, "loss": 0.14, "step": 2840 }, { "grad_norm": 1.0686527490615845, "learning_rate": 7.122500000000001e-05, "loss": 0.1265, "step": 2850 }, { "grad_norm": 0.9892820119857788, "learning_rate": 7.1475e-05, "loss": 0.1241, "step": 2860 }, { "grad_norm": 0.7837059497833252, "learning_rate": 7.172500000000001e-05, "loss": 0.0953, "step": 2870 }, { "grad_norm": 0.9075708389282227, "learning_rate": 7.1975e-05, "loss": 0.1111, "step": 2880 }, { "grad_norm": 0.9460199475288391, "learning_rate": 7.2225e-05, "loss": 0.1133, "step": 2890 }, { "grad_norm": 0.8519068360328674, "learning_rate": 7.2475e-05, "loss": 0.1244, "step": 2900 }, { "grad_norm": 0.9632498621940613, "learning_rate": 7.272499999999999e-05, "loss": 0.1097, "step": 2910 }, { "grad_norm": 0.8330392837524414, "learning_rate": 7.2975e-05, "loss": 0.1262, "step": 2920 }, { "grad_norm": 0.9332038164138794, "learning_rate": 7.3225e-05, "loss": 0.1033, "step": 2930 }, { "grad_norm": 0.9664743542671204, "learning_rate": 7.347500000000001e-05, "loss": 0.114, "step": 2940 }, { "grad_norm": 1.0465316772460938, "learning_rate": 7.3725e-05, "loss": 0.0982, "step": 2950 }, { "grad_norm": 0.6503188610076904, "learning_rate": 7.397500000000001e-05, "loss": 0.1021, "step": 2960 }, { "grad_norm": 0.8504334688186646, "learning_rate": 7.4225e-05, "loss": 0.0914, "step": 2970 }, { "grad_norm": 0.8423386216163635, "learning_rate": 7.447500000000001e-05, "loss": 0.1089, "step": 2980 }, { "grad_norm": 1.0958874225616455, "learning_rate": 7.4725e-05, "loss": 0.1196, "step": 2990 }, { "grad_norm": 0.8235782980918884, "learning_rate": 7.4975e-05, "loss": 0.1147, "step": 3000 }, { "grad_norm": 0.7361459136009216, "learning_rate": 7.5225e-05, "loss": 0.093, "step": 3010 }, { "grad_norm": 0.7590476274490356, "learning_rate": 7.5475e-05, "loss": 0.1049, "step": 3020 }, { "grad_norm": 0.8384370803833008, "learning_rate": 7.5725e-05, "loss": 0.1032, "step": 3030 }, { "grad_norm": 0.9068732261657715, "learning_rate": 7.5975e-05, "loss": 0.1041, "step": 3040 }, { "grad_norm": 0.9689406752586365, "learning_rate": 7.6225e-05, "loss": 0.1193, "step": 3050 }, { "grad_norm": 0.703710675239563, "learning_rate": 7.6475e-05, "loss": 0.1165, "step": 3060 }, { "grad_norm": 0.714146077632904, "learning_rate": 7.672500000000001e-05, "loss": 0.1068, "step": 3070 }, { "grad_norm": 0.7870296239852905, "learning_rate": 7.697500000000001e-05, "loss": 0.1151, "step": 3080 }, { "grad_norm": 1.0404621362686157, "learning_rate": 7.722500000000001e-05, "loss": 0.1298, "step": 3090 }, { "grad_norm": 1.1002635955810547, "learning_rate": 7.747500000000001e-05, "loss": 0.1044, "step": 3100 }, { "grad_norm": 0.6686312556266785, "learning_rate": 7.7725e-05, "loss": 0.0889, "step": 3110 }, { "grad_norm": 1.005028486251831, "learning_rate": 7.797500000000001e-05, "loss": 0.1024, "step": 3120 }, { "grad_norm": 0.7940409779548645, "learning_rate": 7.8225e-05, "loss": 0.0994, "step": 3130 }, { "grad_norm": 0.7729306817054749, "learning_rate": 7.8475e-05, "loss": 0.1161, "step": 3140 }, { "grad_norm": 0.8297701478004456, "learning_rate": 7.8725e-05, "loss": 0.1046, "step": 3150 }, { "grad_norm": 0.6328955292701721, "learning_rate": 7.8975e-05, "loss": 0.1103, "step": 3160 }, { "grad_norm": 0.6792614459991455, "learning_rate": 7.9225e-05, "loss": 0.0899, "step": 3170 }, { "grad_norm": 0.6784380078315735, "learning_rate": 7.9475e-05, "loss": 0.1208, "step": 3180 }, { "grad_norm": 0.7756112217903137, "learning_rate": 7.9725e-05, "loss": 0.112, "step": 3190 }, { "grad_norm": 0.7105650901794434, "learning_rate": 7.9975e-05, "loss": 0.1105, "step": 3200 }, { "grad_norm": 0.8879854679107666, "learning_rate": 8.022500000000001e-05, "loss": 0.0928, "step": 3210 }, { "grad_norm": 1.0180779695510864, "learning_rate": 8.0475e-05, "loss": 0.1325, "step": 3220 }, { "grad_norm": 0.6412224769592285, "learning_rate": 8.072500000000001e-05, "loss": 0.1148, "step": 3230 }, { "grad_norm": 0.712070882320404, "learning_rate": 8.0975e-05, "loss": 0.0932, "step": 3240 }, { "grad_norm": 0.8054584264755249, "learning_rate": 8.122500000000001e-05, "loss": 0.1163, "step": 3250 }, { "grad_norm": 0.8408918976783752, "learning_rate": 8.1475e-05, "loss": 0.1157, "step": 3260 }, { "grad_norm": 0.8645448684692383, "learning_rate": 8.172500000000001e-05, "loss": 0.1106, "step": 3270 }, { "grad_norm": 0.8799538016319275, "learning_rate": 8.1975e-05, "loss": 0.0983, "step": 3280 }, { "grad_norm": 0.8132560849189758, "learning_rate": 8.2225e-05, "loss": 0.102, "step": 3290 }, { "grad_norm": 0.661750853061676, "learning_rate": 8.2475e-05, "loss": 0.0955, "step": 3300 }, { "grad_norm": 0.9824926853179932, "learning_rate": 8.2725e-05, "loss": 0.1141, "step": 3310 }, { "grad_norm": 0.7666614055633545, "learning_rate": 8.2975e-05, "loss": 0.129, "step": 3320 }, { "grad_norm": 0.6857062578201294, "learning_rate": 8.3225e-05, "loss": 0.1108, "step": 3330 }, { "grad_norm": 1.0009814500808716, "learning_rate": 8.347500000000001e-05, "loss": 0.1117, "step": 3340 }, { "grad_norm": 0.7505905032157898, "learning_rate": 8.3725e-05, "loss": 0.0996, "step": 3350 }, { "grad_norm": 0.9107828140258789, "learning_rate": 8.397500000000001e-05, "loss": 0.1034, "step": 3360 }, { "grad_norm": 0.9461653828620911, "learning_rate": 8.422500000000001e-05, "loss": 0.1115, "step": 3370 }, { "grad_norm": 0.73564213514328, "learning_rate": 8.447500000000001e-05, "loss": 0.1546, "step": 3380 }, { "grad_norm": 0.87794029712677, "learning_rate": 8.4725e-05, "loss": 0.1019, "step": 3390 }, { "grad_norm": 0.8158360123634338, "learning_rate": 8.4975e-05, "loss": 0.1073, "step": 3400 }, { "grad_norm": 1.1356405019760132, "learning_rate": 8.5225e-05, "loss": 0.1126, "step": 3410 }, { "grad_norm": 0.8446733355522156, "learning_rate": 8.5475e-05, "loss": 0.0968, "step": 3420 }, { "grad_norm": 0.8232311606407166, "learning_rate": 8.5725e-05, "loss": 0.1113, "step": 3430 }, { "grad_norm": 0.775856077671051, "learning_rate": 8.5975e-05, "loss": 0.1232, "step": 3440 }, { "grad_norm": 0.6224350929260254, "learning_rate": 8.6225e-05, "loss": 0.1072, "step": 3450 }, { "grad_norm": 0.6721522808074951, "learning_rate": 8.6475e-05, "loss": 0.0912, "step": 3460 }, { "grad_norm": 0.5471897721290588, "learning_rate": 8.672500000000001e-05, "loss": 0.1174, "step": 3470 }, { "grad_norm": 0.8137038350105286, "learning_rate": 8.6975e-05, "loss": 0.1125, "step": 3480 }, { "grad_norm": 0.8667474389076233, "learning_rate": 8.7225e-05, "loss": 0.1165, "step": 3490 }, { "grad_norm": 0.7663674354553223, "learning_rate": 8.747500000000001e-05, "loss": 0.0932, "step": 3500 }, { "grad_norm": 1.0189703702926636, "learning_rate": 8.7725e-05, "loss": 0.1062, "step": 3510 }, { "grad_norm": 0.8456623554229736, "learning_rate": 8.797500000000001e-05, "loss": 0.1, "step": 3520 }, { "grad_norm": 0.872280478477478, "learning_rate": 8.8225e-05, "loss": 0.1047, "step": 3530 }, { "grad_norm": 0.8092300295829773, "learning_rate": 8.847500000000001e-05, "loss": 0.1116, "step": 3540 }, { "grad_norm": 0.739482045173645, "learning_rate": 8.8725e-05, "loss": 0.1066, "step": 3550 }, { "grad_norm": 0.6361083388328552, "learning_rate": 8.897500000000001e-05, "loss": 0.0861, "step": 3560 }, { "grad_norm": 0.7802761793136597, "learning_rate": 8.9225e-05, "loss": 0.1111, "step": 3570 }, { "grad_norm": 0.9123257994651794, "learning_rate": 8.9475e-05, "loss": 0.1235, "step": 3580 }, { "grad_norm": 0.7880210876464844, "learning_rate": 8.9725e-05, "loss": 0.1131, "step": 3590 }, { "grad_norm": 0.7353662848472595, "learning_rate": 8.9975e-05, "loss": 0.1057, "step": 3600 }, { "grad_norm": 0.7072981595993042, "learning_rate": 9.0225e-05, "loss": 0.0999, "step": 3610 }, { "grad_norm": 0.7792389988899231, "learning_rate": 9.0475e-05, "loss": 0.0987, "step": 3620 }, { "grad_norm": 0.5647854208946228, "learning_rate": 9.072500000000001e-05, "loss": 0.0909, "step": 3630 }, { "grad_norm": 0.7663183808326721, "learning_rate": 9.0975e-05, "loss": 0.093, "step": 3640 }, { "grad_norm": 0.7045959234237671, "learning_rate": 9.122500000000001e-05, "loss": 0.0984, "step": 3650 }, { "grad_norm": 0.6886898279190063, "learning_rate": 9.1475e-05, "loss": 0.1042, "step": 3660 }, { "grad_norm": 0.7177726626396179, "learning_rate": 9.172500000000001e-05, "loss": 0.121, "step": 3670 }, { "grad_norm": 0.7852405309677124, "learning_rate": 9.1975e-05, "loss": 0.1009, "step": 3680 }, { "grad_norm": 0.8155625462532043, "learning_rate": 9.2225e-05, "loss": 0.1035, "step": 3690 }, { "grad_norm": 0.6780592203140259, "learning_rate": 9.2475e-05, "loss": 0.0968, "step": 3700 }, { "grad_norm": 0.9664795398712158, "learning_rate": 9.2725e-05, "loss": 0.0945, "step": 3710 }, { "grad_norm": 0.6774467825889587, "learning_rate": 9.2975e-05, "loss": 0.1087, "step": 3720 }, { "grad_norm": 0.6875939965248108, "learning_rate": 9.3225e-05, "loss": 0.0948, "step": 3730 }, { "grad_norm": 0.6742399334907532, "learning_rate": 9.3475e-05, "loss": 0.1017, "step": 3740 }, { "grad_norm": 0.6886413097381592, "learning_rate": 9.3725e-05, "loss": 0.1214, "step": 3750 }, { "grad_norm": 0.6794500350952148, "learning_rate": 9.397500000000001e-05, "loss": 0.1065, "step": 3760 }, { "grad_norm": 0.7444162964820862, "learning_rate": 9.422500000000001e-05, "loss": 0.1089, "step": 3770 }, { "grad_norm": 0.8418692350387573, "learning_rate": 9.4475e-05, "loss": 0.1036, "step": 3780 }, { "grad_norm": 0.6365513801574707, "learning_rate": 9.472500000000001e-05, "loss": 0.1063, "step": 3790 }, { "grad_norm": 0.6338428258895874, "learning_rate": 9.4975e-05, "loss": 0.1347, "step": 3800 }, { "grad_norm": 0.6404892206192017, "learning_rate": 9.522500000000001e-05, "loss": 0.0918, "step": 3810 }, { "grad_norm": 0.8720124959945679, "learning_rate": 9.5475e-05, "loss": 0.1153, "step": 3820 }, { "grad_norm": 0.7741401195526123, "learning_rate": 9.572500000000001e-05, "loss": 0.0957, "step": 3830 }, { "grad_norm": 0.6871811151504517, "learning_rate": 9.5975e-05, "loss": 0.0925, "step": 3840 }, { "grad_norm": 0.7619557976722717, "learning_rate": 9.622500000000001e-05, "loss": 0.1163, "step": 3850 }, { "grad_norm": 1.0261647701263428, "learning_rate": 9.6475e-05, "loss": 0.0976, "step": 3860 }, { "grad_norm": 1.1623576879501343, "learning_rate": 9.6725e-05, "loss": 0.0957, "step": 3870 }, { "grad_norm": 0.6544579863548279, "learning_rate": 9.6975e-05, "loss": 0.0971, "step": 3880 }, { "grad_norm": 0.7102132439613342, "learning_rate": 9.7225e-05, "loss": 0.1035, "step": 3890 }, { "grad_norm": 0.7584795951843262, "learning_rate": 9.747500000000001e-05, "loss": 0.1104, "step": 3900 }, { "grad_norm": 0.7538645267486572, "learning_rate": 9.7725e-05, "loss": 0.1085, "step": 3910 }, { "grad_norm": 0.9404613971710205, "learning_rate": 9.797500000000001e-05, "loss": 0.1175, "step": 3920 }, { "grad_norm": 0.6639807224273682, "learning_rate": 9.8225e-05, "loss": 0.095, "step": 3930 }, { "grad_norm": 0.8156827092170715, "learning_rate": 9.847500000000001e-05, "loss": 0.1172, "step": 3940 }, { "grad_norm": 0.7430055141448975, "learning_rate": 9.8725e-05, "loss": 0.1064, "step": 3950 }, { "grad_norm": 0.8384851813316345, "learning_rate": 9.897500000000001e-05, "loss": 0.1307, "step": 3960 }, { "grad_norm": 0.8636842966079712, "learning_rate": 9.9225e-05, "loss": 0.1044, "step": 3970 }, { "grad_norm": 0.7145629525184631, "learning_rate": 9.9475e-05, "loss": 0.1082, "step": 3980 }, { "grad_norm": 0.7894698977470398, "learning_rate": 9.9725e-05, "loss": 0.1126, "step": 3990 }, { "grad_norm": 0.9782698750495911, "learning_rate": 9.9975e-05, "loss": 0.0998, "step": 4000 }, { "grad_norm": 0.9395438432693481, "learning_rate": 9.999999653982884e-05, "loss": 0.1059, "step": 4010 }, { "grad_norm": 0.5968555808067322, "learning_rate": 9.999998457874392e-05, "loss": 0.1047, "step": 4020 }, { "grad_norm": 1.401621699333191, "learning_rate": 9.999996407402913e-05, "loss": 0.109, "step": 4030 }, { "grad_norm": 0.5748519897460938, "learning_rate": 9.999993502568801e-05, "loss": 0.0787, "step": 4040 }, { "grad_norm": 0.6979372501373291, "learning_rate": 9.999989743372548e-05, "loss": 0.1036, "step": 4050 }, { "grad_norm": 0.8275598287582397, "learning_rate": 9.999985129814798e-05, "loss": 0.1006, "step": 4060 }, { "grad_norm": 0.6689101457595825, "learning_rate": 9.99997966189634e-05, "loss": 0.0906, "step": 4070 }, { "grad_norm": 0.8133144378662109, "learning_rate": 9.999973339618107e-05, "loss": 0.1163, "step": 4080 }, { "grad_norm": 0.7694265246391296, "learning_rate": 9.999966162981179e-05, "loss": 0.1039, "step": 4090 }, { "grad_norm": 0.886293888092041, "learning_rate": 9.999958131986784e-05, "loss": 0.1011, "step": 4100 }, { "grad_norm": 0.6149765253067017, "learning_rate": 9.999949246636293e-05, "loss": 0.1092, "step": 4110 }, { "grad_norm": 0.8998809456825256, "learning_rate": 9.999939506931224e-05, "loss": 0.1131, "step": 4120 }, { "grad_norm": 0.6959656476974487, "learning_rate": 9.999928912873243e-05, "loss": 0.0994, "step": 4130 }, { "grad_norm": 0.6853281259536743, "learning_rate": 9.999917464464159e-05, "loss": 0.1151, "step": 4140 }, { "grad_norm": 0.7998688817024231, "learning_rate": 9.999905161705929e-05, "loss": 0.116, "step": 4150 }, { "grad_norm": 0.6944754123687744, "learning_rate": 9.999892004600653e-05, "loss": 0.0985, "step": 4160 }, { "grad_norm": 0.8804993033409119, "learning_rate": 9.999877993150581e-05, "loss": 0.0963, "step": 4170 }, { "grad_norm": 0.5880829095840454, "learning_rate": 9.999863127358108e-05, "loss": 0.0921, "step": 4180 }, { "grad_norm": 0.7048696279525757, "learning_rate": 9.999847407225773e-05, "loss": 0.0922, "step": 4190 }, { "grad_norm": 0.6522114276885986, "learning_rate": 9.999830832756262e-05, "loss": 0.0945, "step": 4200 }, { "grad_norm": 0.6083471775054932, "learning_rate": 9.999813403952407e-05, "loss": 0.0943, "step": 4210 }, { "grad_norm": 0.6971043348312378, "learning_rate": 9.999795120817187e-05, "loss": 0.1133, "step": 4220 }, { "grad_norm": 0.761482298374176, "learning_rate": 9.999775983353725e-05, "loss": 0.1257, "step": 4230 }, { "grad_norm": 0.6328215003013611, "learning_rate": 9.999755991565292e-05, "loss": 0.1128, "step": 4240 }, { "grad_norm": 0.6867467164993286, "learning_rate": 9.999735145455303e-05, "loss": 0.1134, "step": 4250 }, { "grad_norm": 0.7732101082801819, "learning_rate": 9.99971344502732e-05, "loss": 0.121, "step": 4260 }, { "grad_norm": 0.6812613010406494, "learning_rate": 9.999690890285053e-05, "loss": 0.1081, "step": 4270 }, { "grad_norm": 0.6683989763259888, "learning_rate": 9.999667481232356e-05, "loss": 0.0905, "step": 4280 }, { "grad_norm": 0.6420314311981201, "learning_rate": 9.999643217873225e-05, "loss": 0.0918, "step": 4290 }, { "grad_norm": 0.5997201204299927, "learning_rate": 9.999618100211809e-05, "loss": 0.088, "step": 4300 }, { "grad_norm": 0.6233285665512085, "learning_rate": 9.999592128252402e-05, "loss": 0.1019, "step": 4310 }, { "grad_norm": 1.12154221534729, "learning_rate": 9.999565301999437e-05, "loss": 0.1345, "step": 4320 }, { "grad_norm": 0.5902978777885437, "learning_rate": 9.999537621457502e-05, "loss": 0.0923, "step": 4330 }, { "grad_norm": 0.6204026937484741, "learning_rate": 9.999509086631323e-05, "loss": 0.0765, "step": 4340 }, { "grad_norm": 0.5791992545127869, "learning_rate": 9.99947969752578e-05, "loss": 0.0784, "step": 4350 }, { "grad_norm": 0.7568576335906982, "learning_rate": 9.999449454145891e-05, "loss": 0.0848, "step": 4360 }, { "grad_norm": 0.9663796424865723, "learning_rate": 9.999418356496827e-05, "loss": 0.0918, "step": 4370 }, { "grad_norm": 0.5497902631759644, "learning_rate": 9.999386404583899e-05, "loss": 0.092, "step": 4380 }, { "grad_norm": 0.8964238166809082, "learning_rate": 9.999353598412568e-05, "loss": 0.0853, "step": 4390 }, { "grad_norm": 0.6785461902618408, "learning_rate": 9.999319937988442e-05, "loss": 0.1116, "step": 4400 }, { "grad_norm": 0.8033525347709656, "learning_rate": 9.999285423317268e-05, "loss": 0.1123, "step": 4410 }, { "grad_norm": 0.602044939994812, "learning_rate": 9.999250054404947e-05, "loss": 0.1139, "step": 4420 }, { "grad_norm": 0.7390519976615906, "learning_rate": 9.99921383125752e-05, "loss": 0.1074, "step": 4430 }, { "grad_norm": 0.5590643286705017, "learning_rate": 9.99917675388118e-05, "loss": 0.103, "step": 4440 }, { "grad_norm": 0.6482062935829163, "learning_rate": 9.99913882228226e-05, "loss": 0.1086, "step": 4450 }, { "grad_norm": 0.5862317681312561, "learning_rate": 9.999100036467242e-05, "loss": 0.0957, "step": 4460 }, { "grad_norm": 0.5239005088806152, "learning_rate": 9.999060396442753e-05, "loss": 0.0863, "step": 4470 }, { "grad_norm": 0.7254201173782349, "learning_rate": 9.999019902215566e-05, "loss": 0.1059, "step": 4480 }, { "grad_norm": 0.7194250226020813, "learning_rate": 9.998978553792602e-05, "loss": 0.0941, "step": 4490 }, { "grad_norm": 0.6066429018974304, "learning_rate": 9.998936351180926e-05, "loss": 0.0993, "step": 4500 }, { "grad_norm": 0.8544101715087891, "learning_rate": 9.998893294387747e-05, "loss": 0.1215, "step": 4510 }, { "grad_norm": 0.7941309809684753, "learning_rate": 9.998849383420426e-05, "loss": 0.0989, "step": 4520 }, { "grad_norm": 0.9260858297348022, "learning_rate": 9.998804618286465e-05, "loss": 0.1054, "step": 4530 }, { "grad_norm": 0.9364845752716064, "learning_rate": 9.99875899899351e-05, "loss": 0.116, "step": 4540 }, { "grad_norm": 0.7720131278038025, "learning_rate": 9.99871252554936e-05, "loss": 0.1104, "step": 4550 }, { "grad_norm": 0.6601158380508423, "learning_rate": 9.998665197961955e-05, "loss": 0.0913, "step": 4560 }, { "grad_norm": 0.8134888410568237, "learning_rate": 9.998617016239379e-05, "loss": 0.1274, "step": 4570 }, { "grad_norm": 0.975920557975769, "learning_rate": 9.998567980389869e-05, "loss": 0.1108, "step": 4580 }, { "grad_norm": 1.1326417922973633, "learning_rate": 9.998518090421802e-05, "loss": 0.1001, "step": 4590 }, { "grad_norm": 0.8385424613952637, "learning_rate": 9.998467346343703e-05, "loss": 0.1024, "step": 4600 }, { "grad_norm": 0.7343429923057556, "learning_rate": 9.998415748164243e-05, "loss": 0.103, "step": 4610 }, { "grad_norm": 0.659519612789154, "learning_rate": 9.998363295892238e-05, "loss": 0.0992, "step": 4620 }, { "grad_norm": 0.800759494304657, "learning_rate": 9.998309989536652e-05, "loss": 0.0992, "step": 4630 }, { "grad_norm": 0.6683587431907654, "learning_rate": 9.998255829106593e-05, "loss": 0.1181, "step": 4640 }, { "grad_norm": 0.6858997344970703, "learning_rate": 9.998200814611316e-05, "loss": 0.0944, "step": 4650 }, { "grad_norm": 0.7707213759422302, "learning_rate": 9.998144946060219e-05, "loss": 0.1007, "step": 4660 }, { "grad_norm": 0.8431375622749329, "learning_rate": 9.998088223462852e-05, "loss": 0.0881, "step": 4670 }, { "grad_norm": 0.8811302781105042, "learning_rate": 9.998030646828905e-05, "loss": 0.0954, "step": 4680 }, { "grad_norm": 0.7049357891082764, "learning_rate": 9.997972216168217e-05, "loss": 0.1022, "step": 4690 }, { "grad_norm": 0.7231827974319458, "learning_rate": 9.997912931490771e-05, "loss": 0.0934, "step": 4700 }, { "grad_norm": 0.8076702356338501, "learning_rate": 9.9978527928067e-05, "loss": 0.1002, "step": 4710 }, { "grad_norm": 0.5180054306983948, "learning_rate": 9.997791800126277e-05, "loss": 0.0844, "step": 4720 }, { "grad_norm": 0.6055232286453247, "learning_rate": 9.997729953459927e-05, "loss": 0.1092, "step": 4730 }, { "grad_norm": 0.5780087113380432, "learning_rate": 9.997667252818214e-05, "loss": 0.1011, "step": 4740 }, { "grad_norm": 0.8006765842437744, "learning_rate": 9.997603698211855e-05, "loss": 0.0915, "step": 4750 }, { "grad_norm": 0.8868600130081177, "learning_rate": 9.99753928965171e-05, "loss": 0.1096, "step": 4760 }, { "grad_norm": 0.6947904825210571, "learning_rate": 9.997474027148781e-05, "loss": 0.0996, "step": 4770 }, { "grad_norm": 0.7912223935127258, "learning_rate": 9.997407910714223e-05, "loss": 0.1102, "step": 4780 }, { "grad_norm": 0.6882911920547485, "learning_rate": 9.997340940359332e-05, "loss": 0.0814, "step": 4790 }, { "grad_norm": 0.741892397403717, "learning_rate": 9.997273116095552e-05, "loss": 0.0904, "step": 4800 }, { "grad_norm": 0.735978901386261, "learning_rate": 9.997204437934473e-05, "loss": 0.1017, "step": 4810 }, { "grad_norm": 0.6550512313842773, "learning_rate": 9.997134905887829e-05, "loss": 0.09, "step": 4820 }, { "grad_norm": 0.5389544367790222, "learning_rate": 9.997064519967501e-05, "loss": 0.0985, "step": 4830 }, { "grad_norm": 0.5578478574752808, "learning_rate": 9.996993280185517e-05, "loss": 0.0864, "step": 4840 }, { "grad_norm": 0.8112117648124695, "learning_rate": 9.99692118655405e-05, "loss": 0.0947, "step": 4850 }, { "grad_norm": 0.7575563192367554, "learning_rate": 9.996848239085417e-05, "loss": 0.0853, "step": 4860 }, { "grad_norm": 0.5553140044212341, "learning_rate": 9.996774437792085e-05, "loss": 0.0857, "step": 4870 }, { "grad_norm": 0.6277113556861877, "learning_rate": 9.996699782686664e-05, "loss": 0.0805, "step": 4880 }, { "grad_norm": 0.8783878684043884, "learning_rate": 9.996624273781909e-05, "loss": 0.108, "step": 4890 }, { "grad_norm": 0.7128214836120605, "learning_rate": 9.996547911090725e-05, "loss": 0.0878, "step": 4900 }, { "grad_norm": 0.7296270132064819, "learning_rate": 9.996470694626157e-05, "loss": 0.0956, "step": 4910 }, { "grad_norm": 0.5878462195396423, "learning_rate": 9.996392624401403e-05, "loss": 0.088, "step": 4920 }, { "grad_norm": 0.793500542640686, "learning_rate": 9.996313700429801e-05, "loss": 0.0907, "step": 4930 }, { "grad_norm": 0.630072295665741, "learning_rate": 9.996233922724836e-05, "loss": 0.0855, "step": 4940 }, { "grad_norm": 0.4852931499481201, "learning_rate": 9.996153291300141e-05, "loss": 0.1097, "step": 4950 }, { "grad_norm": 0.6370925307273865, "learning_rate": 9.996071806169494e-05, "loss": 0.0887, "step": 4960 }, { "grad_norm": 0.5234808325767517, "learning_rate": 9.995989467346817e-05, "loss": 0.0933, "step": 4970 }, { "grad_norm": 0.5112282037734985, "learning_rate": 9.995906274846183e-05, "loss": 0.0944, "step": 4980 }, { "grad_norm": 0.824572741985321, "learning_rate": 9.995822228681803e-05, "loss": 0.0935, "step": 4990 }, { "grad_norm": 0.5927644371986389, "learning_rate": 9.99573732886804e-05, "loss": 0.0952, "step": 5000 }, { "grad_norm": 0.6479384303092957, "learning_rate": 9.995651575419402e-05, "loss": 0.0919, "step": 5010 }, { "grad_norm": 0.7614251375198364, "learning_rate": 9.995564968350541e-05, "loss": 0.0944, "step": 5020 }, { "grad_norm": 0.6317470669746399, "learning_rate": 9.995477507676256e-05, "loss": 0.0795, "step": 5030 }, { "grad_norm": 0.57611083984375, "learning_rate": 9.995389193411493e-05, "loss": 0.0808, "step": 5040 }, { "grad_norm": 0.576521098613739, "learning_rate": 9.995300025571339e-05, "loss": 0.0971, "step": 5050 }, { "grad_norm": 0.5752301216125488, "learning_rate": 9.995210004171034e-05, "loss": 0.0801, "step": 5060 }, { "grad_norm": 0.5324503183364868, "learning_rate": 9.995119129225956e-05, "loss": 0.1067, "step": 5070 }, { "grad_norm": 0.5263371467590332, "learning_rate": 9.995027400751637e-05, "loss": 0.1019, "step": 5080 }, { "grad_norm": 0.6340789794921875, "learning_rate": 9.994934818763751e-05, "loss": 0.0839, "step": 5090 }, { "grad_norm": 0.8557195663452148, "learning_rate": 9.994841383278115e-05, "loss": 0.0916, "step": 5100 }, { "grad_norm": 0.6279283165931702, "learning_rate": 9.994747094310695e-05, "loss": 0.0931, "step": 5110 }, { "grad_norm": 0.7897734642028809, "learning_rate": 9.994651951877604e-05, "loss": 0.1175, "step": 5120 }, { "grad_norm": 0.8290451169013977, "learning_rate": 9.994555955995099e-05, "loss": 0.1032, "step": 5130 }, { "grad_norm": 0.6533148884773254, "learning_rate": 9.994459106679581e-05, "loss": 0.1003, "step": 5140 }, { "grad_norm": 0.5186514258384705, "learning_rate": 9.994361403947603e-05, "loss": 0.0827, "step": 5150 }, { "grad_norm": 0.5841973423957825, "learning_rate": 9.994262847815854e-05, "loss": 0.1166, "step": 5160 }, { "grad_norm": 0.7745619416236877, "learning_rate": 9.99416343830118e-05, "loss": 0.096, "step": 5170 }, { "grad_norm": 0.5785901546478271, "learning_rate": 9.994063175420565e-05, "loss": 0.0791, "step": 5180 }, { "grad_norm": 0.7756817936897278, "learning_rate": 9.99396205919114e-05, "loss": 0.1161, "step": 5190 }, { "grad_norm": 0.6601975560188293, "learning_rate": 9.993860089630185e-05, "loss": 0.0847, "step": 5200 }, { "grad_norm": 0.5972104072570801, "learning_rate": 9.993757266755123e-05, "loss": 0.0891, "step": 5210 }, { "grad_norm": 0.7150823473930359, "learning_rate": 9.993653590583522e-05, "loss": 0.0888, "step": 5220 }, { "grad_norm": 0.5095343589782715, "learning_rate": 9.993549061133102e-05, "loss": 0.081, "step": 5230 }, { "grad_norm": 0.649614691734314, "learning_rate": 9.993443678421719e-05, "loss": 0.0915, "step": 5240 }, { "grad_norm": 0.7014835476875305, "learning_rate": 9.993337442467384e-05, "loss": 0.1048, "step": 5250 }, { "grad_norm": 0.7221680879592896, "learning_rate": 9.993230353288248e-05, "loss": 0.0814, "step": 5260 }, { "grad_norm": 0.7052260637283325, "learning_rate": 9.993122410902608e-05, "loss": 0.0918, "step": 5270 }, { "grad_norm": 0.6487457156181335, "learning_rate": 9.993013615328912e-05, "loss": 0.0994, "step": 5280 }, { "grad_norm": 0.5955459475517273, "learning_rate": 9.992903966585747e-05, "loss": 0.1007, "step": 5290 }, { "grad_norm": 0.5775958299636841, "learning_rate": 9.992793464691852e-05, "loss": 0.0771, "step": 5300 }, { "grad_norm": 0.6087354421615601, "learning_rate": 9.992682109666105e-05, "loss": 0.0932, "step": 5310 }, { "grad_norm": 0.8038695454597473, "learning_rate": 9.992569901527538e-05, "loss": 0.0954, "step": 5320 }, { "grad_norm": 0.6683801412582397, "learning_rate": 9.99245684029532e-05, "loss": 0.1041, "step": 5330 }, { "grad_norm": 0.6920615434646606, "learning_rate": 9.992342925988774e-05, "loss": 0.0915, "step": 5340 }, { "grad_norm": 0.682948112487793, "learning_rate": 9.992228158627361e-05, "loss": 0.0865, "step": 5350 }, { "grad_norm": 0.6402882933616638, "learning_rate": 9.992112538230693e-05, "loss": 0.0917, "step": 5360 }, { "grad_norm": 0.6054399013519287, "learning_rate": 9.991996064818527e-05, "loss": 0.0909, "step": 5370 }, { "grad_norm": 0.6279029250144958, "learning_rate": 9.991878738410768e-05, "loss": 0.082, "step": 5380 }, { "grad_norm": 0.7772502303123474, "learning_rate": 9.991760559027457e-05, "loss": 0.084, "step": 5390 }, { "grad_norm": 0.5706392526626587, "learning_rate": 9.991641526688793e-05, "loss": 0.0942, "step": 5400 }, { "grad_norm": 0.7494158744812012, "learning_rate": 9.991521641415113e-05, "loss": 0.0849, "step": 5410 }, { "grad_norm": 0.49097758531570435, "learning_rate": 9.991400903226904e-05, "loss": 0.0792, "step": 5420 }, { "grad_norm": 0.7469417452812195, "learning_rate": 9.991279312144794e-05, "loss": 0.0896, "step": 5430 }, { "grad_norm": 0.8340598344802856, "learning_rate": 9.991156868189564e-05, "loss": 0.1059, "step": 5440 }, { "grad_norm": 0.5886636972427368, "learning_rate": 9.991033571382131e-05, "loss": 0.0683, "step": 5450 }, { "grad_norm": 0.5876349210739136, "learning_rate": 9.990909421743569e-05, "loss": 0.0757, "step": 5460 }, { "grad_norm": 0.7227078080177307, "learning_rate": 9.990784419295085e-05, "loss": 0.0789, "step": 5470 }, { "grad_norm": 0.5558689832687378, "learning_rate": 9.990658564058044e-05, "loss": 0.0793, "step": 5480 }, { "grad_norm": 0.7360127568244934, "learning_rate": 9.990531856053948e-05, "loss": 0.0968, "step": 5490 }, { "grad_norm": 0.6224015355110168, "learning_rate": 9.99040429530445e-05, "loss": 0.1021, "step": 5500 }, { "grad_norm": 0.6956664323806763, "learning_rate": 9.990275881831346e-05, "loss": 0.0927, "step": 5510 }, { "grad_norm": 0.5497816801071167, "learning_rate": 9.990146615656577e-05, "loss": 0.0806, "step": 5520 }, { "grad_norm": 0.5484920144081116, "learning_rate": 9.990016496802233e-05, "loss": 0.0781, "step": 5530 }, { "grad_norm": 0.6671008467674255, "learning_rate": 9.989885525290548e-05, "loss": 0.0702, "step": 5540 }, { "grad_norm": 0.6948413848876953, "learning_rate": 9.989753701143897e-05, "loss": 0.0785, "step": 5550 }, { "grad_norm": 0.5744526982307434, "learning_rate": 9.989621024384812e-05, "loss": 0.0821, "step": 5560 }, { "grad_norm": 0.6918222308158875, "learning_rate": 9.989487495035959e-05, "loss": 0.0875, "step": 5570 }, { "grad_norm": 0.7812965512275696, "learning_rate": 9.989353113120156e-05, "loss": 0.1038, "step": 5580 }, { "grad_norm": 0.5323149561882019, "learning_rate": 9.989217878660366e-05, "loss": 0.0812, "step": 5590 }, { "grad_norm": 0.7620360851287842, "learning_rate": 9.989081791679695e-05, "loss": 0.0926, "step": 5600 }, { "grad_norm": 0.8405633568763733, "learning_rate": 9.988944852201397e-05, "loss": 0.093, "step": 5610 }, { "grad_norm": 0.683945894241333, "learning_rate": 9.988807060248873e-05, "loss": 0.0805, "step": 5620 }, { "grad_norm": 0.6843264698982239, "learning_rate": 9.988668415845665e-05, "loss": 0.0752, "step": 5630 }, { "grad_norm": 0.7045908570289612, "learning_rate": 9.988528919015466e-05, "loss": 0.088, "step": 5640 }, { "grad_norm": 0.6740930080413818, "learning_rate": 9.988388569782112e-05, "loss": 0.1122, "step": 5650 }, { "grad_norm": 0.6316326260566711, "learning_rate": 9.988247368169583e-05, "loss": 0.0846, "step": 5660 }, { "grad_norm": 0.6323257088661194, "learning_rate": 9.988105314202007e-05, "loss": 0.0799, "step": 5670 }, { "grad_norm": 0.8100172281265259, "learning_rate": 9.987962407903659e-05, "loss": 0.0909, "step": 5680 }, { "grad_norm": 0.7992761135101318, "learning_rate": 9.987818649298957e-05, "loss": 0.0846, "step": 5690 }, { "grad_norm": 0.4743105173110962, "learning_rate": 9.987674038412465e-05, "loss": 0.0889, "step": 5700 }, { "grad_norm": 0.5580900311470032, "learning_rate": 9.987528575268891e-05, "loss": 0.0895, "step": 5710 }, { "grad_norm": 0.5444880127906799, "learning_rate": 9.987382259893095e-05, "loss": 0.0849, "step": 5720 }, { "grad_norm": 0.705169677734375, "learning_rate": 9.987235092310074e-05, "loss": 0.0804, "step": 5730 }, { "grad_norm": 0.5228917002677917, "learning_rate": 9.987087072544978e-05, "loss": 0.0946, "step": 5740 }, { "grad_norm": 0.5801391005516052, "learning_rate": 9.9869382006231e-05, "loss": 0.0805, "step": 5750 }, { "grad_norm": 0.6619839668273926, "learning_rate": 9.986788476569875e-05, "loss": 0.095, "step": 5760 }, { "grad_norm": 0.6981720328330994, "learning_rate": 9.986637900410887e-05, "loss": 0.0924, "step": 5770 }, { "grad_norm": 0.6766951680183411, "learning_rate": 9.986486472171869e-05, "loss": 0.0984, "step": 5780 }, { "grad_norm": 0.7668548822402954, "learning_rate": 9.986334191878692e-05, "loss": 0.0805, "step": 5790 }, { "grad_norm": 0.7369725704193115, "learning_rate": 9.986181059557378e-05, "loss": 0.1001, "step": 5800 }, { "grad_norm": 0.6186543107032776, "learning_rate": 9.986027075234094e-05, "loss": 0.087, "step": 5810 }, { "grad_norm": 0.6717166304588318, "learning_rate": 9.985872238935152e-05, "loss": 0.0864, "step": 5820 }, { "grad_norm": 0.5116910934448242, "learning_rate": 9.985716550687008e-05, "loss": 0.0903, "step": 5830 }, { "grad_norm": 0.515976071357727, "learning_rate": 9.985560010516264e-05, "loss": 0.0817, "step": 5840 }, { "grad_norm": 0.556938648223877, "learning_rate": 9.985402618449668e-05, "loss": 0.0819, "step": 5850 }, { "grad_norm": 0.62430739402771, "learning_rate": 9.985244374514118e-05, "loss": 0.1034, "step": 5860 }, { "grad_norm": 0.49266645312309265, "learning_rate": 9.985085278736651e-05, "loss": 0.0852, "step": 5870 }, { "grad_norm": 0.6841632723808289, "learning_rate": 9.984925331144452e-05, "loss": 0.0886, "step": 5880 }, { "grad_norm": 0.6660784482955933, "learning_rate": 9.984764531764851e-05, "loss": 0.0912, "step": 5890 }, { "grad_norm": 0.6641606688499451, "learning_rate": 9.984602880625326e-05, "loss": 0.1009, "step": 5900 }, { "grad_norm": 0.595757007598877, "learning_rate": 9.9844403777535e-05, "loss": 0.0926, "step": 5910 }, { "grad_norm": 0.6960862874984741, "learning_rate": 9.984277023177135e-05, "loss": 0.085, "step": 5920 }, { "grad_norm": 0.6049979329109192, "learning_rate": 9.984112816924148e-05, "loss": 0.0785, "step": 5930 }, { "grad_norm": 0.5524760484695435, "learning_rate": 9.983947759022596e-05, "loss": 0.087, "step": 5940 }, { "grad_norm": 0.5655296444892883, "learning_rate": 9.983781849500682e-05, "loss": 0.0755, "step": 5950 }, { "grad_norm": 0.6246306896209717, "learning_rate": 9.98361508838676e-05, "loss": 0.0794, "step": 5960 }, { "grad_norm": 0.7517223954200745, "learning_rate": 9.98344747570932e-05, "loss": 0.0863, "step": 5970 }, { "grad_norm": 0.6165030002593994, "learning_rate": 9.983279011497004e-05, "loss": 0.0928, "step": 5980 }, { "grad_norm": 0.7620031237602234, "learning_rate": 9.983109695778596e-05, "loss": 0.0991, "step": 5990 }, { "grad_norm": 0.6286477446556091, "learning_rate": 9.982939528583032e-05, "loss": 0.1114, "step": 6000 }, { "grad_norm": 0.6545832753181458, "learning_rate": 9.982768509939385e-05, "loss": 0.0932, "step": 6010 }, { "grad_norm": 0.641276478767395, "learning_rate": 9.982596639876879e-05, "loss": 0.0874, "step": 6020 }, { "grad_norm": 0.7061131000518799, "learning_rate": 9.982423918424881e-05, "loss": 0.0914, "step": 6030 }, { "grad_norm": 0.6645289659500122, "learning_rate": 9.982250345612908e-05, "loss": 0.0838, "step": 6040 }, { "grad_norm": 0.6894896626472473, "learning_rate": 9.982075921470611e-05, "loss": 0.0969, "step": 6050 }, { "grad_norm": 0.6992466449737549, "learning_rate": 9.981900646027802e-05, "loss": 0.0886, "step": 6060 }, { "grad_norm": 0.6641819477081299, "learning_rate": 9.981724519314425e-05, "loss": 0.082, "step": 6070 }, { "grad_norm": 0.4960598051548004, "learning_rate": 9.981547541360581e-05, "loss": 0.0899, "step": 6080 }, { "grad_norm": 0.6687148809432983, "learning_rate": 9.981369712196508e-05, "loss": 0.0891, "step": 6090 }, { "grad_norm": 0.5848655700683594, "learning_rate": 9.981191031852592e-05, "loss": 0.1014, "step": 6100 }, { "grad_norm": 0.5084607601165771, "learning_rate": 9.981011500359362e-05, "loss": 0.0759, "step": 6110 }, { "grad_norm": 0.5993052124977112, "learning_rate": 9.9808311177475e-05, "loss": 0.0847, "step": 6120 }, { "grad_norm": 0.6061128377914429, "learning_rate": 9.980649884047826e-05, "loss": 0.0668, "step": 6130 }, { "grad_norm": 0.8377283215522766, "learning_rate": 9.980467799291307e-05, "loss": 0.0737, "step": 6140 }, { "grad_norm": 0.6630624532699585, "learning_rate": 9.980284863509058e-05, "loss": 0.1127, "step": 6150 }, { "grad_norm": 0.714704692363739, "learning_rate": 9.980101076732334e-05, "loss": 0.1041, "step": 6160 }, { "grad_norm": 0.6556045413017273, "learning_rate": 9.979916438992544e-05, "loss": 0.0929, "step": 6170 }, { "grad_norm": 0.4563150703907013, "learning_rate": 9.979730950321237e-05, "loss": 0.0741, "step": 6180 }, { "grad_norm": 0.572694718837738, "learning_rate": 9.979544610750104e-05, "loss": 0.0889, "step": 6190 }, { "grad_norm": 0.40172266960144043, "learning_rate": 9.97935742031099e-05, "loss": 0.077, "step": 6200 }, { "grad_norm": 0.665533185005188, "learning_rate": 9.979169379035878e-05, "loss": 0.0861, "step": 6210 }, { "grad_norm": 0.6549365520477295, "learning_rate": 9.978980486956899e-05, "loss": 0.0776, "step": 6220 }, { "grad_norm": 0.39825084805488586, "learning_rate": 9.978790744106332e-05, "loss": 0.0789, "step": 6230 }, { "grad_norm": 0.5174660086631775, "learning_rate": 9.978600150516594e-05, "loss": 0.0869, "step": 6240 }, { "grad_norm": 0.5000942945480347, "learning_rate": 9.978408706220259e-05, "loss": 0.0837, "step": 6250 }, { "grad_norm": 0.4778214395046234, "learning_rate": 9.978216411250032e-05, "loss": 0.0736, "step": 6260 }, { "grad_norm": 0.6414165496826172, "learning_rate": 9.978023265638778e-05, "loss": 0.0845, "step": 6270 }, { "grad_norm": 0.5939429402351379, "learning_rate": 9.977829269419495e-05, "loss": 0.087, "step": 6280 }, { "grad_norm": 0.6921668648719788, "learning_rate": 9.977634422625335e-05, "loss": 0.0805, "step": 6290 }, { "grad_norm": 0.46726545691490173, "learning_rate": 9.97743872528959e-05, "loss": 0.0783, "step": 6300 }, { "grad_norm": 0.8982806205749512, "learning_rate": 9.9772421774457e-05, "loss": 0.0897, "step": 6310 }, { "grad_norm": 0.5197902917861938, "learning_rate": 9.977044779127252e-05, "loss": 0.079, "step": 6320 }, { "grad_norm": 0.6256740093231201, "learning_rate": 9.976846530367971e-05, "loss": 0.0769, "step": 6330 }, { "grad_norm": 0.7717055082321167, "learning_rate": 9.976647431201735e-05, "loss": 0.0972, "step": 6340 }, { "grad_norm": 0.543281614780426, "learning_rate": 9.976447481662568e-05, "loss": 0.0755, "step": 6350 }, { "grad_norm": 0.5701563358306885, "learning_rate": 9.976246681784629e-05, "loss": 0.086, "step": 6360 }, { "grad_norm": 0.521278977394104, "learning_rate": 9.976045031602234e-05, "loss": 0.0763, "step": 6370 }, { "grad_norm": 0.6630493998527527, "learning_rate": 9.975842531149837e-05, "loss": 0.0825, "step": 6380 }, { "grad_norm": 0.7824996113777161, "learning_rate": 9.975639180462043e-05, "loss": 0.0911, "step": 6390 }, { "grad_norm": 0.5555384159088135, "learning_rate": 9.975434979573596e-05, "loss": 0.0873, "step": 6400 }, { "grad_norm": 0.5395234823226929, "learning_rate": 9.97522992851939e-05, "loss": 0.0684, "step": 6410 }, { "grad_norm": 0.5292207598686218, "learning_rate": 9.975024027334461e-05, "loss": 0.0742, "step": 6420 }, { "grad_norm": 0.5102924108505249, "learning_rate": 9.974817276053993e-05, "loss": 0.0755, "step": 6430 }, { "grad_norm": 0.5511497259140015, "learning_rate": 9.974609674713315e-05, "loss": 0.0796, "step": 6440 }, { "grad_norm": 0.5663013458251953, "learning_rate": 9.9744012233479e-05, "loss": 0.0794, "step": 6450 }, { "grad_norm": 0.5795053243637085, "learning_rate": 9.974191921993366e-05, "loss": 0.0768, "step": 6460 }, { "grad_norm": 0.5679762363433838, "learning_rate": 9.973981770685474e-05, "loss": 0.0757, "step": 6470 }, { "grad_norm": 0.5937929153442383, "learning_rate": 9.97377076946014e-05, "loss": 0.0685, "step": 6480 }, { "grad_norm": 0.7179611921310425, "learning_rate": 9.973558918353412e-05, "loss": 0.0869, "step": 6490 }, { "grad_norm": 0.4802556335926056, "learning_rate": 9.973346217401494e-05, "loss": 0.0721, "step": 6500 }, { "grad_norm": 0.5837225914001465, "learning_rate": 9.973132666640726e-05, "loss": 0.0666, "step": 6510 }, { "grad_norm": 0.5305376648902893, "learning_rate": 9.972918266107602e-05, "loss": 0.0712, "step": 6520 }, { "grad_norm": 0.5546383261680603, "learning_rate": 9.972703015838756e-05, "loss": 0.0778, "step": 6530 }, { "grad_norm": 0.5884080529212952, "learning_rate": 9.97248691587097e-05, "loss": 0.0848, "step": 6540 }, { "grad_norm": 0.6735504269599915, "learning_rate": 9.972269966241166e-05, "loss": 0.0711, "step": 6550 }, { "grad_norm": 0.6120998859405518, "learning_rate": 9.972052166986417e-05, "loss": 0.0837, "step": 6560 }, { "grad_norm": 0.598736584186554, "learning_rate": 9.971833518143938e-05, "loss": 0.0803, "step": 6570 }, { "grad_norm": 0.5242087244987488, "learning_rate": 9.971614019751093e-05, "loss": 0.0693, "step": 6580 }, { "grad_norm": 0.6529421210289001, "learning_rate": 9.971393671845383e-05, "loss": 0.0819, "step": 6590 }, { "grad_norm": 0.48039865493774414, "learning_rate": 9.971172474464464e-05, "loss": 0.0903, "step": 6600 }, { "grad_norm": 0.612311601638794, "learning_rate": 9.97095042764613e-05, "loss": 0.0674, "step": 6610 }, { "grad_norm": 0.5499475598335266, "learning_rate": 9.970727531428324e-05, "loss": 0.0675, "step": 6620 }, { "grad_norm": 0.7114015817642212, "learning_rate": 9.970503785849132e-05, "loss": 0.0785, "step": 6630 }, { "grad_norm": 0.5258294939994812, "learning_rate": 9.970279190946788e-05, "loss": 0.0656, "step": 6640 }, { "grad_norm": 0.562099039554596, "learning_rate": 9.970053746759667e-05, "loss": 0.0757, "step": 6650 }, { "grad_norm": 0.48050370812416077, "learning_rate": 9.969827453326292e-05, "loss": 0.0707, "step": 6660 }, { "grad_norm": 0.5256308913230896, "learning_rate": 9.969600310685332e-05, "loss": 0.07, "step": 6670 }, { "grad_norm": 0.5210569500923157, "learning_rate": 9.969372318875596e-05, "loss": 0.0844, "step": 6680 }, { "grad_norm": 0.5725545883178711, "learning_rate": 9.969143477936043e-05, "loss": 0.0861, "step": 6690 }, { "grad_norm": 0.5495489835739136, "learning_rate": 9.968913787905775e-05, "loss": 0.0851, "step": 6700 }, { "grad_norm": 0.7121819853782654, "learning_rate": 9.968683248824045e-05, "loss": 0.0821, "step": 6710 }, { "grad_norm": 0.6612098813056946, "learning_rate": 9.968451860730238e-05, "loss": 0.0964, "step": 6720 }, { "grad_norm": 0.6445244550704956, "learning_rate": 9.968219623663896e-05, "loss": 0.0814, "step": 6730 }, { "grad_norm": 0.6360859870910645, "learning_rate": 9.967986537664702e-05, "loss": 0.1098, "step": 6740 }, { "grad_norm": 0.4677753150463104, "learning_rate": 9.967752602772483e-05, "loss": 0.0776, "step": 6750 }, { "grad_norm": 0.7404025793075562, "learning_rate": 9.967517819027212e-05, "loss": 0.0896, "step": 6760 }, { "grad_norm": 0.5820971131324768, "learning_rate": 9.967282186469009e-05, "loss": 0.069, "step": 6770 }, { "grad_norm": 0.5957365036010742, "learning_rate": 9.967045705138135e-05, "loss": 0.0814, "step": 6780 }, { "grad_norm": 0.5245500206947327, "learning_rate": 9.966808375074998e-05, "loss": 0.0722, "step": 6790 }, { "grad_norm": 0.4469815790653229, "learning_rate": 9.966570196320154e-05, "loss": 0.0764, "step": 6800 }, { "grad_norm": 0.5435115098953247, "learning_rate": 9.966331168914299e-05, "loss": 0.0737, "step": 6810 }, { "grad_norm": 0.6284081935882568, "learning_rate": 9.966091292898277e-05, "loss": 0.0842, "step": 6820 }, { "grad_norm": 0.6967406272888184, "learning_rate": 9.965850568313076e-05, "loss": 0.0734, "step": 6830 }, { "grad_norm": 0.7054814100265503, "learning_rate": 9.965608995199827e-05, "loss": 0.0907, "step": 6840 }, { "grad_norm": 0.4201411306858063, "learning_rate": 9.965366573599812e-05, "loss": 0.0837, "step": 6850 }, { "grad_norm": 0.5019124150276184, "learning_rate": 9.965123303554453e-05, "loss": 0.0792, "step": 6860 }, { "grad_norm": 0.6041797399520874, "learning_rate": 9.964879185105317e-05, "loss": 0.0906, "step": 6870 }, { "grad_norm": 0.4634665846824646, "learning_rate": 9.964634218294119e-05, "loss": 0.0775, "step": 6880 }, { "grad_norm": 0.6784645318984985, "learning_rate": 9.964388403162714e-05, "loss": 0.0879, "step": 6890 }, { "grad_norm": 0.8196770548820496, "learning_rate": 9.96414173975311e-05, "loss": 0.0891, "step": 6900 }, { "grad_norm": 0.5274406671524048, "learning_rate": 9.963894228107451e-05, "loss": 0.0714, "step": 6910 }, { "grad_norm": 0.4961077570915222, "learning_rate": 9.963645868268032e-05, "loss": 0.0811, "step": 6920 }, { "grad_norm": 0.5992028117179871, "learning_rate": 9.963396660277289e-05, "loss": 0.0841, "step": 6930 }, { "grad_norm": 0.6371963024139404, "learning_rate": 9.963146604177807e-05, "loss": 0.0694, "step": 6940 }, { "grad_norm": 0.6578781008720398, "learning_rate": 9.962895700012311e-05, "loss": 0.0743, "step": 6950 }, { "grad_norm": 0.583487868309021, "learning_rate": 9.962643947823677e-05, "loss": 0.0819, "step": 6960 }, { "grad_norm": 0.5523295402526855, "learning_rate": 9.962391347654921e-05, "loss": 0.092, "step": 6970 }, { "grad_norm": 0.660244345664978, "learning_rate": 9.962137899549204e-05, "loss": 0.0935, "step": 6980 }, { "grad_norm": 0.6012049913406372, "learning_rate": 9.961883603549835e-05, "loss": 0.0734, "step": 6990 }, { "grad_norm": 0.6125153303146362, "learning_rate": 9.961628459700267e-05, "loss": 0.0803, "step": 7000 }, { "grad_norm": 0.4813218414783478, "learning_rate": 9.961372468044095e-05, "loss": 0.0824, "step": 7010 }, { "grad_norm": 0.6742751598358154, "learning_rate": 9.961115628625062e-05, "loss": 0.0738, "step": 7020 }, { "grad_norm": 0.6300835609436035, "learning_rate": 9.960857941487056e-05, "loss": 0.0907, "step": 7030 }, { "grad_norm": 0.5639364123344421, "learning_rate": 9.960599406674106e-05, "loss": 0.0837, "step": 7040 }, { "grad_norm": 0.4899959862232208, "learning_rate": 9.960340024230393e-05, "loss": 0.0713, "step": 7050 }, { "grad_norm": 0.47215735912323, "learning_rate": 9.960079794200232e-05, "loss": 0.0717, "step": 7060 }, { "grad_norm": 0.9612568020820618, "learning_rate": 9.959818716628096e-05, "loss": 0.076, "step": 7070 }, { "grad_norm": 0.7235377430915833, "learning_rate": 9.95955679155859e-05, "loss": 0.0967, "step": 7080 }, { "grad_norm": 0.6620001196861267, "learning_rate": 9.959294019036472e-05, "loss": 0.0913, "step": 7090 }, { "grad_norm": 0.6814266443252563, "learning_rate": 9.959030399106646e-05, "loss": 0.0809, "step": 7100 }, { "grad_norm": 0.5925230979919434, "learning_rate": 9.958765931814153e-05, "loss": 0.0955, "step": 7110 }, { "grad_norm": 0.5488848090171814, "learning_rate": 9.958500617204184e-05, "loss": 0.0692, "step": 7120 }, { "grad_norm": 0.44855979084968567, "learning_rate": 9.958234455322075e-05, "loss": 0.0778, "step": 7130 }, { "grad_norm": 0.786207914352417, "learning_rate": 9.957967446213308e-05, "loss": 0.0989, "step": 7140 }, { "grad_norm": 0.5896354913711548, "learning_rate": 9.957699589923501e-05, "loss": 0.0944, "step": 7150 }, { "grad_norm": 0.7702515721321106, "learning_rate": 9.957430886498431e-05, "loss": 0.0852, "step": 7160 }, { "grad_norm": 0.5171547532081604, "learning_rate": 9.957161335984008e-05, "loss": 0.091, "step": 7170 }, { "grad_norm": 0.6412470936775208, "learning_rate": 9.956890938426291e-05, "loss": 0.0719, "step": 7180 }, { "grad_norm": 0.4749596416950226, "learning_rate": 9.956619693871482e-05, "loss": 0.0889, "step": 7190 }, { "grad_norm": 0.6670500040054321, "learning_rate": 9.956347602365934e-05, "loss": 0.0836, "step": 7200 }, { "grad_norm": 0.7603017091751099, "learning_rate": 9.956074663956135e-05, "loss": 0.0853, "step": 7210 }, { "grad_norm": 0.647438108921051, "learning_rate": 9.955800878688726e-05, "loss": 0.0756, "step": 7220 }, { "grad_norm": 0.7035764455795288, "learning_rate": 9.955526246610489e-05, "loss": 0.0787, "step": 7230 }, { "grad_norm": 0.5803048014640808, "learning_rate": 9.955250767768349e-05, "loss": 0.0831, "step": 7240 }, { "grad_norm": 0.46901997923851013, "learning_rate": 9.95497444220938e-05, "loss": 0.0693, "step": 7250 }, { "grad_norm": 0.5870178937911987, "learning_rate": 9.954697269980797e-05, "loss": 0.0777, "step": 7260 }, { "grad_norm": 0.5349048972129822, "learning_rate": 9.954419251129962e-05, "loss": 0.0773, "step": 7270 }, { "grad_norm": 0.5633330345153809, "learning_rate": 9.95414038570438e-05, "loss": 0.074, "step": 7280 }, { "grad_norm": 0.4122370481491089, "learning_rate": 9.953860673751703e-05, "loss": 0.0803, "step": 7290 }, { "grad_norm": 0.638259768486023, "learning_rate": 9.953580115319725e-05, "loss": 0.0829, "step": 7300 }, { "grad_norm": 0.5125757455825806, "learning_rate": 9.953298710456387e-05, "loss": 0.0908, "step": 7310 }, { "grad_norm": 0.6491495966911316, "learning_rate": 9.953016459209771e-05, "loss": 0.0836, "step": 7320 }, { "grad_norm": 0.534156322479248, "learning_rate": 9.952733361628108e-05, "loss": 0.0694, "step": 7330 }, { "grad_norm": 0.4131522476673126, "learning_rate": 9.952449417759772e-05, "loss": 0.0804, "step": 7340 }, { "grad_norm": 0.5887997150421143, "learning_rate": 9.952164627653279e-05, "loss": 0.0829, "step": 7350 }, { "grad_norm": 0.5840027332305908, "learning_rate": 9.951878991357292e-05, "loss": 0.0771, "step": 7360 }, { "grad_norm": 0.5345921516418457, "learning_rate": 9.951592508920622e-05, "loss": 0.0844, "step": 7370 }, { "grad_norm": 0.5947027802467346, "learning_rate": 9.951305180392219e-05, "loss": 0.0735, "step": 7380 }, { "grad_norm": 0.5444971323013306, "learning_rate": 9.951017005821178e-05, "loss": 0.0828, "step": 7390 }, { "grad_norm": 0.6058732867240906, "learning_rate": 9.95072798525674e-05, "loss": 0.0766, "step": 7400 }, { "grad_norm": 0.6537426710128784, "learning_rate": 9.950438118748293e-05, "loss": 0.078, "step": 7410 }, { "grad_norm": 0.5865012407302856, "learning_rate": 9.950147406345366e-05, "loss": 0.0957, "step": 7420 }, { "grad_norm": 0.5345022082328796, "learning_rate": 9.949855848097635e-05, "loss": 0.0848, "step": 7430 }, { "grad_norm": 0.5454451441764832, "learning_rate": 9.949563444054916e-05, "loss": 0.0828, "step": 7440 }, { "grad_norm": 0.5145485401153564, "learning_rate": 9.949270194267178e-05, "loss": 0.0729, "step": 7450 }, { "grad_norm": 0.7162746787071228, "learning_rate": 9.948976098784526e-05, "loss": 0.0757, "step": 7460 }, { "grad_norm": 0.5064693093299866, "learning_rate": 9.948681157657213e-05, "loss": 0.084, "step": 7470 }, { "grad_norm": 0.5145942568778992, "learning_rate": 9.948385370935638e-05, "loss": 0.0844, "step": 7480 }, { "grad_norm": 0.6642272472381592, "learning_rate": 9.94808873867034e-05, "loss": 0.0816, "step": 7490 }, { "grad_norm": 0.4546453058719635, "learning_rate": 9.947791260912009e-05, "loss": 0.0753, "step": 7500 }, { "grad_norm": 0.44852158427238464, "learning_rate": 9.947492937711474e-05, "loss": 0.0632, "step": 7510 }, { "grad_norm": 0.6819369196891785, "learning_rate": 9.947193769119707e-05, "loss": 0.084, "step": 7520 }, { "grad_norm": 0.5647487044334412, "learning_rate": 9.946893755187834e-05, "loss": 0.081, "step": 7530 }, { "grad_norm": 0.5558541417121887, "learning_rate": 9.946592895967115e-05, "loss": 0.0788, "step": 7540 }, { "grad_norm": 0.5256293416023254, "learning_rate": 9.94629119150896e-05, "loss": 0.0697, "step": 7550 }, { "grad_norm": 0.5223481059074402, "learning_rate": 9.94598864186492e-05, "loss": 0.078, "step": 7560 }, { "grad_norm": 0.6608245968818665, "learning_rate": 9.945685247086696e-05, "loss": 0.0864, "step": 7570 }, { "grad_norm": 0.7018124461174011, "learning_rate": 9.945381007226129e-05, "loss": 0.0887, "step": 7580 }, { "grad_norm": 0.5795349478721619, "learning_rate": 9.945075922335203e-05, "loss": 0.0811, "step": 7590 }, { "grad_norm": 0.5344424247741699, "learning_rate": 9.944769992466049e-05, "loss": 0.1004, "step": 7600 }, { "grad_norm": 0.5863407850265503, "learning_rate": 9.944463217670945e-05, "loss": 0.0852, "step": 7610 }, { "grad_norm": 0.5257319808006287, "learning_rate": 9.944155598002307e-05, "loss": 0.0752, "step": 7620 }, { "grad_norm": 0.6034574508666992, "learning_rate": 9.943847133512701e-05, "loss": 0.0768, "step": 7630 }, { "grad_norm": 0.5758699178695679, "learning_rate": 9.943537824254834e-05, "loss": 0.0712, "step": 7640 }, { "grad_norm": 0.5220390558242798, "learning_rate": 9.943227670281559e-05, "loss": 0.0698, "step": 7650 }, { "grad_norm": 0.4810178875923157, "learning_rate": 9.942916671645873e-05, "loss": 0.0626, "step": 7660 }, { "grad_norm": 0.6277997493743896, "learning_rate": 9.942604828400916e-05, "loss": 0.0776, "step": 7670 }, { "grad_norm": 0.543779194355011, "learning_rate": 9.942292140599975e-05, "loss": 0.0714, "step": 7680 }, { "grad_norm": 0.6571970582008362, "learning_rate": 9.94197860829648e-05, "loss": 0.0773, "step": 7690 }, { "grad_norm": 0.6270903944969177, "learning_rate": 9.941664231544004e-05, "loss": 0.067, "step": 7700 }, { "grad_norm": 0.6082995533943176, "learning_rate": 9.941349010396264e-05, "loss": 0.0897, "step": 7710 }, { "grad_norm": 0.6345779895782471, "learning_rate": 9.941032944907125e-05, "loss": 0.082, "step": 7720 }, { "grad_norm": 0.4864940047264099, "learning_rate": 9.940716035130596e-05, "loss": 0.0718, "step": 7730 }, { "grad_norm": 0.6063573956489563, "learning_rate": 9.940398281120821e-05, "loss": 0.0763, "step": 7740 }, { "grad_norm": 0.41293632984161377, "learning_rate": 9.940079682932102e-05, "loss": 0.0692, "step": 7750 }, { "grad_norm": 0.49428272247314453, "learning_rate": 9.939760240618877e-05, "loss": 0.0803, "step": 7760 }, { "grad_norm": 0.5525352954864502, "learning_rate": 9.939439954235729e-05, "loss": 0.0802, "step": 7770 }, { "grad_norm": 0.6079758405685425, "learning_rate": 9.939118823837387e-05, "loss": 0.0847, "step": 7780 }, { "grad_norm": 0.6707183718681335, "learning_rate": 9.938796849478725e-05, "loss": 0.0888, "step": 7790 }, { "grad_norm": 0.6252160668373108, "learning_rate": 9.938474031214755e-05, "loss": 0.0832, "step": 7800 }, { "grad_norm": 0.5688503980636597, "learning_rate": 9.938150369100643e-05, "loss": 0.0839, "step": 7810 }, { "grad_norm": 0.5247336030006409, "learning_rate": 9.93782586319169e-05, "loss": 0.0665, "step": 7820 }, { "grad_norm": 0.6053378582000732, "learning_rate": 9.937500513543348e-05, "loss": 0.0885, "step": 7830 }, { "grad_norm": 0.6640838980674744, "learning_rate": 9.937174320211207e-05, "loss": 0.0753, "step": 7840 }, { "grad_norm": 0.5992828607559204, "learning_rate": 9.936847283251009e-05, "loss": 0.0862, "step": 7850 }, { "grad_norm": 0.552062451839447, "learning_rate": 9.936519402718632e-05, "loss": 0.0749, "step": 7860 }, { "grad_norm": 0.5658836960792542, "learning_rate": 9.936190678670102e-05, "loss": 0.0953, "step": 7870 }, { "grad_norm": 0.6080424189567566, "learning_rate": 9.935861111161593e-05, "loss": 0.0932, "step": 7880 }, { "grad_norm": 0.46163392066955566, "learning_rate": 9.935530700249416e-05, "loss": 0.0809, "step": 7890 }, { "grad_norm": 0.5472280979156494, "learning_rate": 9.935199445990028e-05, "loss": 0.0906, "step": 7900 }, { "grad_norm": 0.5473344326019287, "learning_rate": 9.934867348440033e-05, "loss": 0.0922, "step": 7910 }, { "grad_norm": 0.5393911004066467, "learning_rate": 9.934534407656176e-05, "loss": 0.1034, "step": 7920 }, { "grad_norm": 0.7119646072387695, "learning_rate": 9.93420062369535e-05, "loss": 0.0747, "step": 7930 }, { "grad_norm": 0.6053966283798218, "learning_rate": 9.933865996614589e-05, "loss": 0.0822, "step": 7940 }, { "grad_norm": 0.6384376883506775, "learning_rate": 9.933530526471068e-05, "loss": 0.0752, "step": 7950 }, { "grad_norm": 0.4880998730659485, "learning_rate": 9.933194213322114e-05, "loss": 0.0676, "step": 7960 }, { "grad_norm": 0.6033698320388794, "learning_rate": 9.932857057225192e-05, "loss": 0.0749, "step": 7970 }, { "grad_norm": 0.4216141104698181, "learning_rate": 9.932519058237912e-05, "loss": 0.0667, "step": 7980 }, { "grad_norm": 0.4756056070327759, "learning_rate": 9.932180216418032e-05, "loss": 0.0775, "step": 7990 }, { "grad_norm": 0.5185341238975525, "learning_rate": 9.931840531823446e-05, "loss": 0.0712, "step": 8000 }, { "grad_norm": 0.7336075305938721, "learning_rate": 9.9315000045122e-05, "loss": 0.0746, "step": 8010 }, { "grad_norm": 0.6552649140357971, "learning_rate": 9.931158634542481e-05, "loss": 0.0747, "step": 8020 }, { "grad_norm": 0.47241249680519104, "learning_rate": 9.930816421972617e-05, "loss": 0.0739, "step": 8030 }, { "grad_norm": 0.6726285815238953, "learning_rate": 9.930473366861086e-05, "loss": 0.076, "step": 8040 }, { "grad_norm": 0.5947357416152954, "learning_rate": 9.930129469266505e-05, "loss": 0.0778, "step": 8050 }, { "grad_norm": 0.6444632411003113, "learning_rate": 9.929784729247638e-05, "loss": 0.0746, "step": 8060 }, { "grad_norm": 0.3752988278865814, "learning_rate": 9.929439146863389e-05, "loss": 0.0695, "step": 8070 }, { "grad_norm": 0.6311030983924866, "learning_rate": 9.92909272217281e-05, "loss": 0.0884, "step": 8080 }, { "grad_norm": 0.582111120223999, "learning_rate": 9.928745455235097e-05, "loss": 0.084, "step": 8090 }, { "grad_norm": 0.5776529908180237, "learning_rate": 9.928397346109588e-05, "loss": 0.0928, "step": 8100 }, { "grad_norm": 0.7851165533065796, "learning_rate": 9.928048394855762e-05, "loss": 0.0862, "step": 8110 }, { "grad_norm": 0.5811523795127869, "learning_rate": 9.92769860153325e-05, "loss": 0.0826, "step": 8120 }, { "grad_norm": 0.6158207654953003, "learning_rate": 9.927347966201819e-05, "loss": 0.0916, "step": 8130 }, { "grad_norm": 0.6241328716278076, "learning_rate": 9.926996488921383e-05, "loss": 0.0949, "step": 8140 }, { "grad_norm": 0.6622969508171082, "learning_rate": 9.926644169752001e-05, "loss": 0.0818, "step": 8150 }, { "grad_norm": 0.6560549736022949, "learning_rate": 9.926291008753875e-05, "loss": 0.0845, "step": 8160 }, { "grad_norm": 0.5953367948532104, "learning_rate": 9.92593700598735e-05, "loss": 0.0787, "step": 8170 }, { "grad_norm": 0.48978498578071594, "learning_rate": 9.925582161512915e-05, "loss": 0.0821, "step": 8180 }, { "grad_norm": 0.5452508926391602, "learning_rate": 9.925226475391205e-05, "loss": 0.0813, "step": 8190 }, { "grad_norm": 0.5918993353843689, "learning_rate": 9.924869947682993e-05, "loss": 0.0707, "step": 8200 }, { "grad_norm": 0.5062345862388611, "learning_rate": 9.924512578449204e-05, "loss": 0.0603, "step": 8210 }, { "grad_norm": 0.6024461984634399, "learning_rate": 9.924154367750901e-05, "loss": 0.0734, "step": 8220 }, { "grad_norm": 0.45956534147262573, "learning_rate": 9.923795315649293e-05, "loss": 0.0801, "step": 8230 }, { "grad_norm": 0.40152662992477417, "learning_rate": 9.92343542220573e-05, "loss": 0.0807, "step": 8240 }, { "grad_norm": 0.5154745578765869, "learning_rate": 9.92307468748171e-05, "loss": 0.0771, "step": 8250 }, { "grad_norm": 0.6546145677566528, "learning_rate": 9.922713111538873e-05, "loss": 0.067, "step": 8260 }, { "grad_norm": 0.44911089539527893, "learning_rate": 9.922350694439003e-05, "loss": 0.0632, "step": 8270 }, { "grad_norm": 0.5611553192138672, "learning_rate": 9.921987436244024e-05, "loss": 0.084, "step": 8280 }, { "grad_norm": 0.44501787424087524, "learning_rate": 9.921623337016008e-05, "loss": 0.0627, "step": 8290 }, { "grad_norm": 0.43497902154922485, "learning_rate": 9.921258396817172e-05, "loss": 0.0716, "step": 8300 }, { "grad_norm": 0.4734800159931183, "learning_rate": 9.920892615709874e-05, "loss": 0.0716, "step": 8310 }, { "grad_norm": 0.4928618371486664, "learning_rate": 9.920525993756612e-05, "loss": 0.0649, "step": 8320 }, { "grad_norm": 0.6678888201713562, "learning_rate": 9.920158531020036e-05, "loss": 0.0691, "step": 8330 }, { "grad_norm": 0.5597033500671387, "learning_rate": 9.919790227562933e-05, "loss": 0.0607, "step": 8340 }, { "grad_norm": 0.541186511516571, "learning_rate": 9.919421083448237e-05, "loss": 0.0792, "step": 8350 }, { "grad_norm": 0.6392624974250793, "learning_rate": 9.919051098739022e-05, "loss": 0.0646, "step": 8360 }, { "grad_norm": 0.4600302577018738, "learning_rate": 9.918680273498514e-05, "loss": 0.0649, "step": 8370 }, { "grad_norm": 0.4908701181411743, "learning_rate": 9.918308607790072e-05, "loss": 0.0712, "step": 8380 }, { "grad_norm": 0.5421652793884277, "learning_rate": 9.917936101677205e-05, "loss": 0.0629, "step": 8390 }, { "grad_norm": 0.5423133373260498, "learning_rate": 9.917562755223564e-05, "loss": 0.0645, "step": 8400 }, { "grad_norm": 0.6865993738174438, "learning_rate": 9.917188568492944e-05, "loss": 0.0707, "step": 8410 }, { "grad_norm": 0.443251371383667, "learning_rate": 9.916813541549283e-05, "loss": 0.066, "step": 8420 }, { "grad_norm": 0.6024317741394043, "learning_rate": 9.916437674456663e-05, "loss": 0.0707, "step": 8430 }, { "grad_norm": 0.5571625828742981, "learning_rate": 9.916060967279308e-05, "loss": 0.079, "step": 8440 }, { "grad_norm": 0.6472134590148926, "learning_rate": 9.91568342008159e-05, "loss": 0.0715, "step": 8450 }, { "grad_norm": 0.5414539575576782, "learning_rate": 9.915305032928019e-05, "loss": 0.0731, "step": 8460 }, { "grad_norm": 0.5334198474884033, "learning_rate": 9.914925805883253e-05, "loss": 0.0686, "step": 8470 }, { "grad_norm": 0.4812500476837158, "learning_rate": 9.914545739012088e-05, "loss": 0.067, "step": 8480 }, { "grad_norm": 0.6603078842163086, "learning_rate": 9.91416483237947e-05, "loss": 0.0697, "step": 8490 }, { "grad_norm": 0.6700253486633301, "learning_rate": 9.913783086050485e-05, "loss": 0.075, "step": 8500 }, { "grad_norm": 0.5269599556922913, "learning_rate": 9.913400500090364e-05, "loss": 0.0682, "step": 8510 }, { "grad_norm": 0.5117515921592712, "learning_rate": 9.913017074564479e-05, "loss": 0.0578, "step": 8520 }, { "grad_norm": 0.43821173906326294, "learning_rate": 9.912632809538348e-05, "loss": 0.0742, "step": 8530 }, { "grad_norm": 0.46659329533576965, "learning_rate": 9.912247705077629e-05, "loss": 0.0702, "step": 8540 }, { "grad_norm": 0.5670514106750488, "learning_rate": 9.911861761248127e-05, "loss": 0.0701, "step": 8550 }, { "grad_norm": 0.46762633323669434, "learning_rate": 9.91147497811579e-05, "loss": 0.0674, "step": 8560 }, { "grad_norm": 0.6000680327415466, "learning_rate": 9.911087355746709e-05, "loss": 0.1045, "step": 8570 }, { "grad_norm": 0.5994502305984497, "learning_rate": 9.910698894207117e-05, "loss": 0.0917, "step": 8580 }, { "grad_norm": 0.6225472092628479, "learning_rate": 9.910309593563392e-05, "loss": 0.0815, "step": 8590 }, { "grad_norm": 0.3814719319343567, "learning_rate": 9.909919453882057e-05, "loss": 0.0567, "step": 8600 }, { "grad_norm": 0.6434612274169922, "learning_rate": 9.90952847522977e-05, "loss": 0.0616, "step": 8610 }, { "grad_norm": 0.594108521938324, "learning_rate": 9.909136657673346e-05, "loss": 0.0856, "step": 8620 }, { "grad_norm": 0.5542135834693909, "learning_rate": 9.908744001279731e-05, "loss": 0.0761, "step": 8630 }, { "grad_norm": 0.5304981470108032, "learning_rate": 9.90835050611602e-05, "loss": 0.0861, "step": 8640 }, { "grad_norm": 0.5296944975852966, "learning_rate": 9.90795617224945e-05, "loss": 0.0871, "step": 8650 }, { "grad_norm": 0.779909610748291, "learning_rate": 9.907560999747405e-05, "loss": 0.0761, "step": 8660 }, { "grad_norm": 0.5125412344932556, "learning_rate": 9.907164988677408e-05, "loss": 0.0655, "step": 8670 }, { "grad_norm": 0.515066921710968, "learning_rate": 9.906768139107124e-05, "loss": 0.0702, "step": 8680 }, { "grad_norm": 0.5224674940109253, "learning_rate": 9.906370451104367e-05, "loss": 0.0718, "step": 8690 }, { "grad_norm": 0.538932204246521, "learning_rate": 9.905971924737088e-05, "loss": 0.0696, "step": 8700 }, { "grad_norm": 0.49121612310409546, "learning_rate": 9.905572560073387e-05, "loss": 0.0746, "step": 8710 }, { "grad_norm": 0.7217441201210022, "learning_rate": 9.905172357181501e-05, "loss": 0.0871, "step": 8720 }, { "grad_norm": 0.5550188422203064, "learning_rate": 9.904771316129817e-05, "loss": 0.0687, "step": 8730 }, { "grad_norm": 0.5372753739356995, "learning_rate": 9.904369436986862e-05, "loss": 0.0707, "step": 8740 }, { "grad_norm": 0.5187351107597351, "learning_rate": 9.903966719821303e-05, "loss": 0.0797, "step": 8750 }, { "grad_norm": 0.5818197727203369, "learning_rate": 9.903563164701956e-05, "loss": 0.0747, "step": 8760 }, { "grad_norm": 0.43288910388946533, "learning_rate": 9.903158771697778e-05, "loss": 0.071, "step": 8770 }, { "grad_norm": 0.4564231038093567, "learning_rate": 9.902753540877867e-05, "loss": 0.0697, "step": 8780 }, { "grad_norm": 0.4980979561805725, "learning_rate": 9.902347472311466e-05, "loss": 0.062, "step": 8790 }, { "grad_norm": 0.45232757925987244, "learning_rate": 9.901940566067962e-05, "loss": 0.0576, "step": 8800 }, { "grad_norm": 0.37579652667045593, "learning_rate": 9.901532822216883e-05, "loss": 0.0554, "step": 8810 }, { "grad_norm": 0.6114408373832703, "learning_rate": 9.901124240827904e-05, "loss": 0.0712, "step": 8820 }, { "grad_norm": 0.4763182997703552, "learning_rate": 9.900714821970835e-05, "loss": 0.0592, "step": 8830 }, { "grad_norm": 0.6974069476127625, "learning_rate": 9.900304565715641e-05, "loss": 0.0727, "step": 8840 }, { "grad_norm": 0.48879295587539673, "learning_rate": 9.899893472132419e-05, "loss": 0.0669, "step": 8850 }, { "grad_norm": 0.5510627031326294, "learning_rate": 9.899481541291415e-05, "loss": 0.0668, "step": 8860 }, { "grad_norm": 0.8395988345146179, "learning_rate": 9.899068773263016e-05, "loss": 0.0724, "step": 8870 }, { "grad_norm": 0.6579270958900452, "learning_rate": 9.898655168117754e-05, "loss": 0.0823, "step": 8880 }, { "grad_norm": 0.5676449537277222, "learning_rate": 9.898240725926302e-05, "loss": 0.0734, "step": 8890 }, { "grad_norm": 0.4908083975315094, "learning_rate": 9.897825446759478e-05, "loss": 0.0691, "step": 8900 }, { "grad_norm": 0.512637734413147, "learning_rate": 9.897409330688241e-05, "loss": 0.065, "step": 8910 }, { "grad_norm": 0.6759678721427917, "learning_rate": 9.896992377783692e-05, "loss": 0.0839, "step": 8920 }, { "grad_norm": 0.5710850358009338, "learning_rate": 9.89657458811708e-05, "loss": 0.0599, "step": 8930 }, { "grad_norm": 0.49472475051879883, "learning_rate": 9.896155961759792e-05, "loss": 0.0744, "step": 8940 }, { "grad_norm": 0.42063847184181213, "learning_rate": 9.895736498783361e-05, "loss": 0.071, "step": 8950 }, { "grad_norm": 0.49710312485694885, "learning_rate": 9.895316199259462e-05, "loss": 0.0618, "step": 8960 }, { "grad_norm": 0.49238476157188416, "learning_rate": 9.894895063259909e-05, "loss": 0.0572, "step": 8970 }, { "grad_norm": 0.5295642614364624, "learning_rate": 9.894473090856667e-05, "loss": 0.0675, "step": 8980 }, { "grad_norm": 0.6173418164253235, "learning_rate": 9.894050282121839e-05, "loss": 0.0967, "step": 8990 }, { "grad_norm": 0.8051798343658447, "learning_rate": 9.893626637127668e-05, "loss": 0.072, "step": 9000 }, { "grad_norm": 0.6721288561820984, "learning_rate": 9.893202155946546e-05, "loss": 0.0573, "step": 9010 }, { "grad_norm": 0.7106654047966003, "learning_rate": 9.892776838651006e-05, "loss": 0.0676, "step": 9020 }, { "grad_norm": 0.5048244595527649, "learning_rate": 9.892350685313722e-05, "loss": 0.0834, "step": 9030 }, { "grad_norm": 0.6409045457839966, "learning_rate": 9.891923696007513e-05, "loss": 0.0788, "step": 9040 }, { "grad_norm": 0.5113962292671204, "learning_rate": 9.891495870805336e-05, "loss": 0.0584, "step": 9050 }, { "grad_norm": 0.37682652473449707, "learning_rate": 9.891067209780298e-05, "loss": 0.0795, "step": 9060 }, { "grad_norm": 0.48223623633384705, "learning_rate": 9.890637713005646e-05, "loss": 0.0629, "step": 9070 }, { "grad_norm": 0.5244320034980774, "learning_rate": 9.890207380554767e-05, "loss": 0.0722, "step": 9080 }, { "grad_norm": 0.6954978704452515, "learning_rate": 9.889776212501196e-05, "loss": 0.0785, "step": 9090 }, { "grad_norm": 1.1287119388580322, "learning_rate": 9.889344208918605e-05, "loss": 0.0924, "step": 9100 }, { "grad_norm": 0.73579341173172, "learning_rate": 9.888911369880812e-05, "loss": 0.0854, "step": 9110 }, { "grad_norm": 0.4959702491760254, "learning_rate": 9.888477695461777e-05, "loss": 0.0805, "step": 9120 }, { "grad_norm": 0.642024040222168, "learning_rate": 9.888043185735607e-05, "loss": 0.0856, "step": 9130 }, { "grad_norm": 0.5178091526031494, "learning_rate": 9.887607840776542e-05, "loss": 0.0693, "step": 9140 }, { "grad_norm": 0.4942685663700104, "learning_rate": 9.887171660658975e-05, "loss": 0.0832, "step": 9150 }, { "grad_norm": 0.43152061104774475, "learning_rate": 9.886734645457435e-05, "loss": 0.0634, "step": 9160 }, { "grad_norm": 0.6355152726173401, "learning_rate": 9.886296795246597e-05, "loss": 0.0663, "step": 9170 }, { "grad_norm": 0.4967827796936035, "learning_rate": 9.885858110101276e-05, "loss": 0.0646, "step": 9180 }, { "grad_norm": 0.3480011224746704, "learning_rate": 9.885418590096434e-05, "loss": 0.0578, "step": 9190 }, { "grad_norm": 0.5714940428733826, "learning_rate": 9.88497823530717e-05, "loss": 0.0705, "step": 9200 }, { "grad_norm": 0.48751407861709595, "learning_rate": 9.884537045808732e-05, "loss": 0.085, "step": 9210 }, { "grad_norm": 0.7203940749168396, "learning_rate": 9.884095021676502e-05, "loss": 0.0885, "step": 9220 }, { "grad_norm": 0.612568199634552, "learning_rate": 9.883652162986017e-05, "loss": 0.0727, "step": 9230 }, { "grad_norm": 0.3774983584880829, "learning_rate": 9.883208469812943e-05, "loss": 0.078, "step": 9240 }, { "grad_norm": 0.6235118508338928, "learning_rate": 9.882763942233098e-05, "loss": 0.062, "step": 9250 }, { "grad_norm": 0.48246079683303833, "learning_rate": 9.882318580322441e-05, "loss": 0.0656, "step": 9260 }, { "grad_norm": 0.5446850657463074, "learning_rate": 9.881872384157067e-05, "loss": 0.07, "step": 9270 }, { "grad_norm": 0.603510320186615, "learning_rate": 9.881425353813225e-05, "loss": 0.0837, "step": 9280 }, { "grad_norm": 0.6341303586959839, "learning_rate": 9.880977489367296e-05, "loss": 0.0793, "step": 9290 }, { "grad_norm": 0.4608558416366577, "learning_rate": 9.88052879089581e-05, "loss": 0.0781, "step": 9300 }, { "grad_norm": 0.47998034954071045, "learning_rate": 9.880079258475434e-05, "loss": 0.0551, "step": 9310 }, { "grad_norm": 0.5838139057159424, "learning_rate": 9.879628892182985e-05, "loss": 0.0838, "step": 9320 }, { "grad_norm": 0.5037564039230347, "learning_rate": 9.879177692095416e-05, "loss": 0.0676, "step": 9330 }, { "grad_norm": 0.5623456835746765, "learning_rate": 9.878725658289825e-05, "loss": 0.0722, "step": 9340 }, { "grad_norm": 0.4526917636394501, "learning_rate": 9.878272790843454e-05, "loss": 0.0607, "step": 9350 }, { "grad_norm": 0.45705878734588623, "learning_rate": 9.877819089833682e-05, "loss": 0.0596, "step": 9360 }, { "grad_norm": 0.4997333884239197, "learning_rate": 9.877364555338038e-05, "loss": 0.0776, "step": 9370 }, { "grad_norm": 0.49549952149391174, "learning_rate": 9.876909187434186e-05, "loss": 0.0767, "step": 9380 }, { "grad_norm": 0.5832117795944214, "learning_rate": 9.876452986199939e-05, "loss": 0.0675, "step": 9390 }, { "grad_norm": 0.6102084517478943, "learning_rate": 9.875995951713248e-05, "loss": 0.078, "step": 9400 }, { "grad_norm": 0.5803288817405701, "learning_rate": 9.875538084052207e-05, "loss": 0.0766, "step": 9410 }, { "grad_norm": 0.6518296599388123, "learning_rate": 9.875079383295053e-05, "loss": 0.0687, "step": 9420 }, { "grad_norm": 0.47929784655570984, "learning_rate": 9.874619849520167e-05, "loss": 0.0605, "step": 9430 }, { "grad_norm": 0.4858376681804657, "learning_rate": 9.874159482806069e-05, "loss": 0.0688, "step": 9440 }, { "grad_norm": 0.5415228009223938, "learning_rate": 9.873698283231426e-05, "loss": 0.0646, "step": 9450 }, { "grad_norm": 0.5964908599853516, "learning_rate": 9.87323625087504e-05, "loss": 0.065, "step": 9460 }, { "grad_norm": 0.7430070042610168, "learning_rate": 9.872773385815863e-05, "loss": 0.0902, "step": 9470 }, { "grad_norm": 0.5829607844352722, "learning_rate": 9.872309688132986e-05, "loss": 0.0701, "step": 9480 }, { "grad_norm": 0.5070826411247253, "learning_rate": 9.871845157905639e-05, "loss": 0.0719, "step": 9490 }, { "grad_norm": 0.42030829191207886, "learning_rate": 9.871379795213201e-05, "loss": 0.0643, "step": 9500 }, { "grad_norm": 0.6071983575820923, "learning_rate": 9.87091360013519e-05, "loss": 0.0806, "step": 9510 }, { "grad_norm": 0.5571500062942505, "learning_rate": 9.870446572751262e-05, "loss": 0.0719, "step": 9520 }, { "grad_norm": 0.46850311756134033, "learning_rate": 9.869978713141224e-05, "loss": 0.0661, "step": 9530 }, { "grad_norm": 0.7364309430122375, "learning_rate": 9.869510021385016e-05, "loss": 0.0816, "step": 9540 }, { "grad_norm": 0.4975375533103943, "learning_rate": 9.869040497562727e-05, "loss": 0.0665, "step": 9550 }, { "grad_norm": 0.5496515035629272, "learning_rate": 9.868570141754587e-05, "loss": 0.0832, "step": 9560 }, { "grad_norm": 0.5905676484107971, "learning_rate": 9.868098954040965e-05, "loss": 0.0857, "step": 9570 }, { "grad_norm": 0.5726953744888306, "learning_rate": 9.867626934502374e-05, "loss": 0.0767, "step": 9580 }, { "grad_norm": 0.5666323900222778, "learning_rate": 9.86715408321947e-05, "loss": 0.0722, "step": 9590 }, { "grad_norm": 0.5547336339950562, "learning_rate": 9.86668040027305e-05, "loss": 0.0664, "step": 9600 }, { "grad_norm": 0.41620108485221863, "learning_rate": 9.866205885744053e-05, "loss": 0.0642, "step": 9610 }, { "grad_norm": 0.49861940741539, "learning_rate": 9.865730539713563e-05, "loss": 0.0742, "step": 9620 }, { "grad_norm": 0.49937838315963745, "learning_rate": 9.8652543622628e-05, "loss": 0.0638, "step": 9630 }, { "grad_norm": 0.4270590841770172, "learning_rate": 9.864777353473132e-05, "loss": 0.0627, "step": 9640 }, { "grad_norm": 0.4169381260871887, "learning_rate": 9.864299513426068e-05, "loss": 0.0599, "step": 9650 }, { "grad_norm": 0.4526083171367645, "learning_rate": 9.863820842203254e-05, "loss": 0.0645, "step": 9660 }, { "grad_norm": 0.5332221388816833, "learning_rate": 9.863341339886483e-05, "loss": 0.0709, "step": 9670 }, { "grad_norm": 0.3774122893810272, "learning_rate": 9.86286100655769e-05, "loss": 0.0704, "step": 9680 }, { "grad_norm": 0.63335782289505, "learning_rate": 9.862379842298953e-05, "loss": 0.0801, "step": 9690 }, { "grad_norm": 0.5507038831710815, "learning_rate": 9.861897847192485e-05, "loss": 0.06, "step": 9700 }, { "grad_norm": 0.5049829483032227, "learning_rate": 9.86141502132065e-05, "loss": 0.076, "step": 9710 }, { "grad_norm": 0.6465753316879272, "learning_rate": 9.860931364765946e-05, "loss": 0.0652, "step": 9720 }, { "grad_norm": 0.6848470568656921, "learning_rate": 9.860446877611021e-05, "loss": 0.0861, "step": 9730 }, { "grad_norm": 0.40645211935043335, "learning_rate": 9.859961559938655e-05, "loss": 0.071, "step": 9740 }, { "grad_norm": 0.7722731232643127, "learning_rate": 9.85947541183178e-05, "loss": 0.0741, "step": 9750 }, { "grad_norm": 0.6133878231048584, "learning_rate": 9.858988433373463e-05, "loss": 0.0726, "step": 9760 }, { "grad_norm": 0.4107014834880829, "learning_rate": 9.858500624646918e-05, "loss": 0.0622, "step": 9770 }, { "grad_norm": 0.4912766218185425, "learning_rate": 9.858011985735497e-05, "loss": 0.065, "step": 9780 }, { "grad_norm": 0.6418346762657166, "learning_rate": 9.857522516722693e-05, "loss": 0.0663, "step": 9790 }, { "grad_norm": 0.47870081663131714, "learning_rate": 9.857032217692145e-05, "loss": 0.0585, "step": 9800 }, { "grad_norm": 0.5832774639129639, "learning_rate": 9.856541088727631e-05, "loss": 0.0734, "step": 9810 }, { "grad_norm": 0.5065296292304993, "learning_rate": 9.856049129913072e-05, "loss": 0.0766, "step": 9820 }, { "grad_norm": 0.4514777958393097, "learning_rate": 9.85555634133253e-05, "loss": 0.0804, "step": 9830 }, { "grad_norm": 0.5096484422683716, "learning_rate": 9.855062723070208e-05, "loss": 0.076, "step": 9840 }, { "grad_norm": 0.5311128497123718, "learning_rate": 9.854568275210454e-05, "loss": 0.0722, "step": 9850 }, { "grad_norm": 0.4949483275413513, "learning_rate": 9.854072997837754e-05, "loss": 0.062, "step": 9860 }, { "grad_norm": 0.3600897789001465, "learning_rate": 9.853576891036737e-05, "loss": 0.0626, "step": 9870 }, { "grad_norm": 0.5488556027412415, "learning_rate": 9.853079954892177e-05, "loss": 0.0683, "step": 9880 }, { "grad_norm": 0.43747347593307495, "learning_rate": 9.852582189488983e-05, "loss": 0.0554, "step": 9890 }, { "grad_norm": 0.5295486450195312, "learning_rate": 9.852083594912212e-05, "loss": 0.0754, "step": 9900 }, { "grad_norm": 0.5644481182098389, "learning_rate": 9.851584171247058e-05, "loss": 0.0713, "step": 9910 }, { "grad_norm": 0.6369706392288208, "learning_rate": 9.851083918578863e-05, "loss": 0.078, "step": 9920 }, { "grad_norm": 0.7326534986495972, "learning_rate": 9.850582836993103e-05, "loss": 0.0639, "step": 9930 }, { "grad_norm": 0.4444167912006378, "learning_rate": 9.850080926575397e-05, "loss": 0.0666, "step": 9940 }, { "grad_norm": 0.3720998764038086, "learning_rate": 9.849578187411515e-05, "loss": 0.0625, "step": 9950 }, { "grad_norm": 0.3741411566734314, "learning_rate": 9.849074619587354e-05, "loss": 0.0547, "step": 9960 }, { "grad_norm": 0.5425230264663696, "learning_rate": 9.848570223188964e-05, "loss": 0.0697, "step": 9970 }, { "grad_norm": 0.510445773601532, "learning_rate": 9.848064998302531e-05, "loss": 0.0622, "step": 9980 }, { "grad_norm": 0.6347084045410156, "learning_rate": 9.847558945014386e-05, "loss": 0.0663, "step": 9990 }, { "grad_norm": 0.6618502736091614, "learning_rate": 9.847052063410996e-05, "loss": 0.076, "step": 10000 }, { "grad_norm": 0.4091278910636902, "learning_rate": 9.846544353578977e-05, "loss": 0.0677, "step": 10010 }, { "grad_norm": 0.4396703541278839, "learning_rate": 9.846035815605081e-05, "loss": 0.0698, "step": 10020 }, { "grad_norm": 0.5700857639312744, "learning_rate": 9.845526449576204e-05, "loss": 0.0584, "step": 10030 }, { "grad_norm": 0.5997193455696106, "learning_rate": 9.845016255579383e-05, "loss": 0.0742, "step": 10040 }, { "grad_norm": 0.6025073528289795, "learning_rate": 9.844505233701794e-05, "loss": 0.07, "step": 10050 }, { "grad_norm": 0.5162746906280518, "learning_rate": 9.843993384030757e-05, "loss": 0.0641, "step": 10060 }, { "grad_norm": 0.49940618872642517, "learning_rate": 9.843480706653737e-05, "loss": 0.0527, "step": 10070 }, { "grad_norm": 0.43470194935798645, "learning_rate": 9.84296720165833e-05, "loss": 0.0687, "step": 10080 }, { "grad_norm": 0.5716680884361267, "learning_rate": 9.842452869132286e-05, "loss": 0.0608, "step": 10090 }, { "grad_norm": 0.5751482844352722, "learning_rate": 9.841937709163489e-05, "loss": 0.0692, "step": 10100 }, { "grad_norm": 0.4945722222328186, "learning_rate": 9.841421721839962e-05, "loss": 0.0572, "step": 10110 }, { "grad_norm": 0.49780818819999695, "learning_rate": 9.840904907249879e-05, "loss": 0.0545, "step": 10120 }, { "grad_norm": 0.6120661497116089, "learning_rate": 9.840387265481545e-05, "loss": 0.0693, "step": 10130 }, { "grad_norm": 0.49387794733047485, "learning_rate": 9.839868796623411e-05, "loss": 0.0581, "step": 10140 }, { "grad_norm": 0.7472683787345886, "learning_rate": 9.839349500764072e-05, "loss": 0.0734, "step": 10150 }, { "grad_norm": 0.4986494779586792, "learning_rate": 9.83882937799226e-05, "loss": 0.0683, "step": 10160 }, { "grad_norm": 0.5343347787857056, "learning_rate": 9.838308428396849e-05, "loss": 0.0602, "step": 10170 }, { "grad_norm": 0.43264368176460266, "learning_rate": 9.837786652066854e-05, "loss": 0.077, "step": 10180 }, { "grad_norm": 0.5335564613342285, "learning_rate": 9.837264049091437e-05, "loss": 0.0863, "step": 10190 }, { "grad_norm": 0.5781038403511047, "learning_rate": 9.836740619559893e-05, "loss": 0.0663, "step": 10200 }, { "grad_norm": 0.5219928622245789, "learning_rate": 9.836216363561659e-05, "loss": 0.0612, "step": 10210 }, { "grad_norm": 0.42490968108177185, "learning_rate": 9.835691281186322e-05, "loss": 0.0642, "step": 10220 }, { "grad_norm": 0.5100446343421936, "learning_rate": 9.8351653725236e-05, "loss": 0.0637, "step": 10230 }, { "grad_norm": 0.5788230299949646, "learning_rate": 9.83463863766336e-05, "loss": 0.063, "step": 10240 }, { "grad_norm": 0.5297635793685913, "learning_rate": 9.834111076695602e-05, "loss": 0.0601, "step": 10250 }, { "grad_norm": 0.5334398150444031, "learning_rate": 9.833582689710477e-05, "loss": 0.0513, "step": 10260 }, { "grad_norm": 0.5317943692207336, "learning_rate": 9.833053476798268e-05, "loss": 0.0579, "step": 10270 }, { "grad_norm": 0.5237641334533691, "learning_rate": 9.832523438049404e-05, "loss": 0.0689, "step": 10280 }, { "grad_norm": 0.511309802532196, "learning_rate": 9.831992573554454e-05, "loss": 0.072, "step": 10290 }, { "grad_norm": 0.5886943340301514, "learning_rate": 9.831460883404128e-05, "loss": 0.0614, "step": 10300 }, { "grad_norm": 0.5211082100868225, "learning_rate": 9.830928367689278e-05, "loss": 0.0593, "step": 10310 }, { "grad_norm": 0.44764968752861023, "learning_rate": 9.830395026500896e-05, "loss": 0.06, "step": 10320 }, { "grad_norm": 0.47405296564102173, "learning_rate": 9.829860859930115e-05, "loss": 0.06, "step": 10330 }, { "grad_norm": 0.5120576024055481, "learning_rate": 9.829325868068212e-05, "loss": 0.0666, "step": 10340 }, { "grad_norm": 0.4373224079608917, "learning_rate": 9.8287900510066e-05, "loss": 0.0707, "step": 10350 }, { "grad_norm": 0.40297454595565796, "learning_rate": 9.828253408836834e-05, "loss": 0.0542, "step": 10360 }, { "grad_norm": 0.45834460854530334, "learning_rate": 9.827715941650615e-05, "loss": 0.0564, "step": 10370 }, { "grad_norm": 0.7817295789718628, "learning_rate": 9.82717764953978e-05, "loss": 0.0653, "step": 10380 }, { "grad_norm": 0.5011459589004517, "learning_rate": 9.826638532596308e-05, "loss": 0.0683, "step": 10390 }, { "grad_norm": 0.364704430103302, "learning_rate": 9.82609859091232e-05, "loss": 0.0688, "step": 10400 }, { "grad_norm": 0.4202499985694885, "learning_rate": 9.825557824580076e-05, "loss": 0.0607, "step": 10410 }, { "grad_norm": 0.6425885558128357, "learning_rate": 9.82501623369198e-05, "loss": 0.0657, "step": 10420 }, { "grad_norm": 0.39972034096717834, "learning_rate": 9.824473818340574e-05, "loss": 0.0631, "step": 10430 }, { "grad_norm": 0.4666995108127594, "learning_rate": 9.823930578618541e-05, "loss": 0.0556, "step": 10440 }, { "grad_norm": 0.5190655589103699, "learning_rate": 9.823386514618709e-05, "loss": 0.066, "step": 10450 }, { "grad_norm": 0.4803110361099243, "learning_rate": 9.82284162643404e-05, "loss": 0.0633, "step": 10460 }, { "grad_norm": 0.45321086049079895, "learning_rate": 9.822295914157642e-05, "loss": 0.0613, "step": 10470 }, { "grad_norm": 0.49006354808807373, "learning_rate": 9.821749377882763e-05, "loss": 0.0561, "step": 10480 }, { "grad_norm": 0.37744376063346863, "learning_rate": 9.821202017702791e-05, "loss": 0.0551, "step": 10490 }, { "grad_norm": 0.5051290988922119, "learning_rate": 9.820653833711253e-05, "loss": 0.0669, "step": 10500 }, { "grad_norm": 0.45523008704185486, "learning_rate": 9.820104826001822e-05, "loss": 0.0723, "step": 10510 }, { "grad_norm": 0.4781116843223572, "learning_rate": 9.819554994668305e-05, "loss": 0.0508, "step": 10520 }, { "grad_norm": 0.5672711133956909, "learning_rate": 9.819004339804654e-05, "loss": 0.0752, "step": 10530 }, { "grad_norm": 0.6387122869491577, "learning_rate": 9.818452861504961e-05, "loss": 0.0642, "step": 10540 }, { "grad_norm": 0.41960325837135315, "learning_rate": 9.81790055986346e-05, "loss": 0.0529, "step": 10550 }, { "grad_norm": 0.5333678126335144, "learning_rate": 9.817347434974523e-05, "loss": 0.054, "step": 10560 }, { "grad_norm": 0.5261216163635254, "learning_rate": 9.816793486932664e-05, "loss": 0.0482, "step": 10570 }, { "grad_norm": 0.38006317615509033, "learning_rate": 9.816238715832538e-05, "loss": 0.0563, "step": 10580 }, { "grad_norm": 0.5152315497398376, "learning_rate": 9.815683121768939e-05, "loss": 0.0652, "step": 10590 }, { "grad_norm": 0.5786665081977844, "learning_rate": 9.815126704836804e-05, "loss": 0.068, "step": 10600 }, { "grad_norm": 0.4801790118217468, "learning_rate": 9.81456946513121e-05, "loss": 0.0861, "step": 10610 }, { "grad_norm": 0.6892103552818298, "learning_rate": 9.814011402747373e-05, "loss": 0.0832, "step": 10620 }, { "grad_norm": 0.45143523812294006, "learning_rate": 9.813452517780651e-05, "loss": 0.0601, "step": 10630 }, { "grad_norm": 0.470048725605011, "learning_rate": 9.81289281032654e-05, "loss": 0.0649, "step": 10640 }, { "grad_norm": 0.38817164301872253, "learning_rate": 9.812332280480683e-05, "loss": 0.0689, "step": 10650 }, { "grad_norm": 0.5688868761062622, "learning_rate": 9.811770928338854e-05, "loss": 0.0672, "step": 10660 }, { "grad_norm": 0.5175865292549133, "learning_rate": 9.811208753996979e-05, "loss": 0.0613, "step": 10670 }, { "grad_norm": 0.7522755861282349, "learning_rate": 9.810645757551113e-05, "loss": 0.0549, "step": 10680 }, { "grad_norm": 0.4679219424724579, "learning_rate": 9.810081939097459e-05, "loss": 0.0632, "step": 10690 }, { "grad_norm": 0.5028095245361328, "learning_rate": 9.809517298732356e-05, "loss": 0.0861, "step": 10700 }, { "grad_norm": 0.5206220746040344, "learning_rate": 9.80895183655229e-05, "loss": 0.0693, "step": 10710 }, { "grad_norm": 0.5633223652839661, "learning_rate": 9.808385552653877e-05, "loss": 0.0617, "step": 10720 }, { "grad_norm": 0.46951043605804443, "learning_rate": 9.807818447133886e-05, "loss": 0.0535, "step": 10730 }, { "grad_norm": 0.5878501534461975, "learning_rate": 9.807250520089215e-05, "loss": 0.0639, "step": 10740 }, { "grad_norm": 0.51053386926651, "learning_rate": 9.806681771616908e-05, "loss": 0.0714, "step": 10750 }, { "grad_norm": 0.48155099153518677, "learning_rate": 9.80611220181415e-05, "loss": 0.0664, "step": 10760 }, { "grad_norm": 0.8090944886207581, "learning_rate": 9.805541810778264e-05, "loss": 0.0712, "step": 10770 }, { "grad_norm": 0.5876461267471313, "learning_rate": 9.804970598606716e-05, "loss": 0.0721, "step": 10780 }, { "grad_norm": 0.4594459533691406, "learning_rate": 9.804398565397106e-05, "loss": 0.0677, "step": 10790 }, { "grad_norm": 0.49279171228408813, "learning_rate": 9.803825711247183e-05, "loss": 0.068, "step": 10800 }, { "grad_norm": 0.6178396344184875, "learning_rate": 9.803252036254831e-05, "loss": 0.0664, "step": 10810 }, { "grad_norm": 0.4840087294578552, "learning_rate": 9.802677540518076e-05, "loss": 0.0719, "step": 10820 }, { "grad_norm": 0.45001333951950073, "learning_rate": 9.802102224135081e-05, "loss": 0.0592, "step": 10830 }, { "grad_norm": 0.5556122064590454, "learning_rate": 9.801526087204155e-05, "loss": 0.0715, "step": 10840 }, { "grad_norm": 0.5611178278923035, "learning_rate": 9.800949129823743e-05, "loss": 0.0695, "step": 10850 }, { "grad_norm": 0.5856145620346069, "learning_rate": 9.80037135209243e-05, "loss": 0.0577, "step": 10860 }, { "grad_norm": 0.5239263772964478, "learning_rate": 9.799792754108946e-05, "loss": 0.0578, "step": 10870 }, { "grad_norm": 0.5323117971420288, "learning_rate": 9.799213335972152e-05, "loss": 0.0631, "step": 10880 }, { "grad_norm": 0.5557563304901123, "learning_rate": 9.798633097781058e-05, "loss": 0.0589, "step": 10890 }, { "grad_norm": 0.4666772484779358, "learning_rate": 9.79805203963481e-05, "loss": 0.0641, "step": 10900 }, { "grad_norm": 0.4904639720916748, "learning_rate": 9.797470161632697e-05, "loss": 0.0582, "step": 10910 }, { "grad_norm": 0.663727879524231, "learning_rate": 9.796887463874145e-05, "loss": 0.0667, "step": 10920 }, { "grad_norm": 0.5077590942382812, "learning_rate": 9.796303946458718e-05, "loss": 0.0725, "step": 10930 }, { "grad_norm": 0.4980730414390564, "learning_rate": 9.795719609486127e-05, "loss": 0.0709, "step": 10940 }, { "grad_norm": 0.6474615931510925, "learning_rate": 9.795134453056219e-05, "loss": 0.0783, "step": 10950 }, { "grad_norm": 0.40167558193206787, "learning_rate": 9.794548477268979e-05, "loss": 0.0588, "step": 10960 }, { "grad_norm": 0.40887126326560974, "learning_rate": 9.793961682224537e-05, "loss": 0.0672, "step": 10970 }, { "grad_norm": 0.5863602757453918, "learning_rate": 9.793374068023156e-05, "loss": 0.0668, "step": 10980 }, { "grad_norm": 0.5960561633110046, "learning_rate": 9.792785634765247e-05, "loss": 0.0645, "step": 10990 }, { "grad_norm": 0.5374048352241516, "learning_rate": 9.792196382551357e-05, "loss": 0.0602, "step": 11000 }, { "grad_norm": 0.6917343139648438, "learning_rate": 9.791606311482171e-05, "loss": 0.0629, "step": 11010 }, { "grad_norm": 0.5328034162521362, "learning_rate": 9.791015421658518e-05, "loss": 0.059, "step": 11020 }, { "grad_norm": 0.629740834236145, "learning_rate": 9.790423713181362e-05, "loss": 0.0725, "step": 11030 }, { "grad_norm": 0.592491626739502, "learning_rate": 9.789831186151814e-05, "loss": 0.0665, "step": 11040 }, { "grad_norm": 0.4506857693195343, "learning_rate": 9.789237840671118e-05, "loss": 0.0559, "step": 11050 }, { "grad_norm": 0.4322627782821655, "learning_rate": 9.78864367684066e-05, "loss": 0.0656, "step": 11060 }, { "grad_norm": 0.4835537075996399, "learning_rate": 9.788048694761968e-05, "loss": 0.0551, "step": 11070 }, { "grad_norm": 0.5597334504127502, "learning_rate": 9.787452894536709e-05, "loss": 0.0613, "step": 11080 }, { "grad_norm": 0.44173315167427063, "learning_rate": 9.786856276266685e-05, "loss": 0.0568, "step": 11090 }, { "grad_norm": 0.4020325541496277, "learning_rate": 9.786258840053845e-05, "loss": 0.071, "step": 11100 }, { "grad_norm": 0.5139191746711731, "learning_rate": 9.785660586000273e-05, "loss": 0.0636, "step": 11110 }, { "grad_norm": 0.5339110493659973, "learning_rate": 9.785061514208196e-05, "loss": 0.0628, "step": 11120 }, { "grad_norm": 0.4409109950065613, "learning_rate": 9.784461624779977e-05, "loss": 0.057, "step": 11130 }, { "grad_norm": 0.414781391620636, "learning_rate": 9.783860917818123e-05, "loss": 0.0535, "step": 11140 }, { "grad_norm": 0.42393556237220764, "learning_rate": 9.783259393425277e-05, "loss": 0.0446, "step": 11150 }, { "grad_norm": 0.6049558520317078, "learning_rate": 9.782657051704221e-05, "loss": 0.0598, "step": 11160 }, { "grad_norm": 0.49230289459228516, "learning_rate": 9.782053892757883e-05, "loss": 0.0643, "step": 11170 }, { "grad_norm": 0.523532509803772, "learning_rate": 9.781449916689324e-05, "loss": 0.0579, "step": 11180 }, { "grad_norm": 0.541348397731781, "learning_rate": 9.780845123601746e-05, "loss": 0.0638, "step": 11190 }, { "grad_norm": 0.41640907526016235, "learning_rate": 9.780239513598492e-05, "loss": 0.0594, "step": 11200 }, { "grad_norm": 0.5862225890159607, "learning_rate": 9.779633086783047e-05, "loss": 0.0694, "step": 11210 }, { "grad_norm": 0.42599067091941833, "learning_rate": 9.779025843259031e-05, "loss": 0.0589, "step": 11220 }, { "grad_norm": 0.5728679299354553, "learning_rate": 9.778417783130204e-05, "loss": 0.0781, "step": 11230 }, { "grad_norm": 0.6704587936401367, "learning_rate": 9.777808906500468e-05, "loss": 0.0619, "step": 11240 }, { "grad_norm": 0.5196340084075928, "learning_rate": 9.777199213473862e-05, "loss": 0.0455, "step": 11250 }, { "grad_norm": 0.5066853165626526, "learning_rate": 9.77658870415457e-05, "loss": 0.0458, "step": 11260 }, { "grad_norm": 0.4003813862800598, "learning_rate": 9.775977378646906e-05, "loss": 0.0535, "step": 11270 }, { "grad_norm": 0.6289421916007996, "learning_rate": 9.775365237055331e-05, "loss": 0.0705, "step": 11280 }, { "grad_norm": 0.5156441330909729, "learning_rate": 9.774752279484445e-05, "loss": 0.0676, "step": 11290 }, { "grad_norm": 0.4693014323711395, "learning_rate": 9.774138506038984e-05, "loss": 0.0597, "step": 11300 }, { "grad_norm": 0.5440437197685242, "learning_rate": 9.773523916823826e-05, "loss": 0.059, "step": 11310 }, { "grad_norm": 0.4272075295448303, "learning_rate": 9.772908511943986e-05, "loss": 0.0566, "step": 11320 }, { "grad_norm": 0.4485940635204315, "learning_rate": 9.77229229150462e-05, "loss": 0.0529, "step": 11330 }, { "grad_norm": 0.4700075685977936, "learning_rate": 9.771675255611024e-05, "loss": 0.0534, "step": 11340 }, { "grad_norm": 0.4959406554698944, "learning_rate": 9.771057404368632e-05, "loss": 0.0903, "step": 11350 }, { "grad_norm": 0.4902646839618683, "learning_rate": 9.770438737883018e-05, "loss": 0.0653, "step": 11360 }, { "grad_norm": 0.6687435507774353, "learning_rate": 9.769819256259898e-05, "loss": 0.0554, "step": 11370 }, { "grad_norm": 0.4988503158092499, "learning_rate": 9.769198959605119e-05, "loss": 0.0648, "step": 11380 }, { "grad_norm": 0.5161327123641968, "learning_rate": 9.768577848024678e-05, "loss": 0.0631, "step": 11390 }, { "grad_norm": 0.5382257699966431, "learning_rate": 9.767955921624702e-05, "loss": 0.0702, "step": 11400 }, { "grad_norm": 0.581555962562561, "learning_rate": 9.767333180511465e-05, "loss": 0.0554, "step": 11410 }, { "grad_norm": 0.4277598261833191, "learning_rate": 9.766709624791373e-05, "loss": 0.0769, "step": 11420 }, { "grad_norm": 0.5249338150024414, "learning_rate": 9.766085254570975e-05, "loss": 0.0627, "step": 11430 }, { "grad_norm": 0.48978427052497864, "learning_rate": 9.76546006995696e-05, "loss": 0.0582, "step": 11440 }, { "grad_norm": 0.6361219882965088, "learning_rate": 9.764834071056155e-05, "loss": 0.0541, "step": 11450 }, { "grad_norm": 0.4529152810573578, "learning_rate": 9.764207257975526e-05, "loss": 0.0641, "step": 11460 }, { "grad_norm": 0.49457433819770813, "learning_rate": 9.763579630822179e-05, "loss": 0.0586, "step": 11470 }, { "grad_norm": 0.620780885219574, "learning_rate": 9.762951189703356e-05, "loss": 0.0629, "step": 11480 }, { "grad_norm": 0.4973123371601105, "learning_rate": 9.762321934726442e-05, "loss": 0.0692, "step": 11490 }, { "grad_norm": 0.4918985366821289, "learning_rate": 9.761691865998959e-05, "loss": 0.0619, "step": 11500 }, { "grad_norm": 0.3691946268081665, "learning_rate": 9.76106098362857e-05, "loss": 0.0619, "step": 11510 }, { "grad_norm": 0.3795381784439087, "learning_rate": 9.760429287723072e-05, "loss": 0.0676, "step": 11520 }, { "grad_norm": 0.4623118042945862, "learning_rate": 9.759796778390406e-05, "loss": 0.0594, "step": 11530 }, { "grad_norm": 0.3452932834625244, "learning_rate": 9.759163455738653e-05, "loss": 0.0601, "step": 11540 }, { "grad_norm": 0.46523287892341614, "learning_rate": 9.75852931987603e-05, "loss": 0.0569, "step": 11550 }, { "grad_norm": 0.39174529910087585, "learning_rate": 9.757894370910891e-05, "loss": 0.0541, "step": 11560 }, { "grad_norm": 0.3750211298465729, "learning_rate": 9.757258608951733e-05, "loss": 0.0528, "step": 11570 }, { "grad_norm": 0.5726413130760193, "learning_rate": 9.75662203410719e-05, "loss": 0.0625, "step": 11580 }, { "grad_norm": 0.522438108921051, "learning_rate": 9.755984646486034e-05, "loss": 0.0643, "step": 11590 }, { "grad_norm": 0.5638173818588257, "learning_rate": 9.75534644619718e-05, "loss": 0.0537, "step": 11600 }, { "grad_norm": 0.5112835168838501, "learning_rate": 9.754707433349676e-05, "loss": 0.0702, "step": 11610 }, { "grad_norm": 0.6159500479698181, "learning_rate": 9.754067608052715e-05, "loss": 0.0748, "step": 11620 }, { "grad_norm": 0.5188262462615967, "learning_rate": 9.753426970415622e-05, "loss": 0.0819, "step": 11630 }, { "grad_norm": 0.5930245518684387, "learning_rate": 9.752785520547868e-05, "loss": 0.0552, "step": 11640 }, { "grad_norm": 0.6035420298576355, "learning_rate": 9.752143258559056e-05, "loss": 0.0659, "step": 11650 }, { "grad_norm": 0.6282968521118164, "learning_rate": 9.751500184558933e-05, "loss": 0.0636, "step": 11660 }, { "grad_norm": 0.3814532160758972, "learning_rate": 9.750856298657383e-05, "loss": 0.0538, "step": 11670 }, { "grad_norm": 0.6258551478385925, "learning_rate": 9.750211600964428e-05, "loss": 0.0748, "step": 11680 }, { "grad_norm": 0.6197370290756226, "learning_rate": 9.749566091590226e-05, "loss": 0.0746, "step": 11690 }, { "grad_norm": 0.5483576655387878, "learning_rate": 9.748919770645083e-05, "loss": 0.0726, "step": 11700 }, { "grad_norm": 0.44345372915267944, "learning_rate": 9.748272638239432e-05, "loss": 0.0575, "step": 11710 }, { "grad_norm": 0.5411166548728943, "learning_rate": 9.747624694483855e-05, "loss": 0.0653, "step": 11720 }, { "grad_norm": 0.3842584490776062, "learning_rate": 9.746975939489065e-05, "loss": 0.0577, "step": 11730 }, { "grad_norm": 0.5061067938804626, "learning_rate": 9.746326373365918e-05, "loss": 0.0626, "step": 11740 }, { "grad_norm": 0.4759167730808258, "learning_rate": 9.745675996225403e-05, "loss": 0.0666, "step": 11750 }, { "grad_norm": 0.3666512966156006, "learning_rate": 9.745024808178657e-05, "loss": 0.0564, "step": 11760 }, { "grad_norm": 0.4994114339351654, "learning_rate": 9.744372809336947e-05, "loss": 0.0542, "step": 11770 }, { "grad_norm": 0.5859090685844421, "learning_rate": 9.743719999811682e-05, "loss": 0.0533, "step": 11780 }, { "grad_norm": 0.37819209694862366, "learning_rate": 9.743066379714412e-05, "loss": 0.0525, "step": 11790 }, { "grad_norm": 0.36415624618530273, "learning_rate": 9.74241194915682e-05, "loss": 0.0538, "step": 11800 }, { "grad_norm": 0.3965369760990143, "learning_rate": 9.741756708250731e-05, "loss": 0.0699, "step": 11810 }, { "grad_norm": 0.33564427495002747, "learning_rate": 9.741100657108109e-05, "loss": 0.0585, "step": 11820 }, { "grad_norm": 0.47318029403686523, "learning_rate": 9.740443795841054e-05, "loss": 0.0747, "step": 11830 }, { "grad_norm": 0.4517688751220703, "learning_rate": 9.739786124561805e-05, "loss": 0.0553, "step": 11840 }, { "grad_norm": 0.37956592440605164, "learning_rate": 9.73912764338274e-05, "loss": 0.0539, "step": 11850 }, { "grad_norm": 0.4490576684474945, "learning_rate": 9.738468352416377e-05, "loss": 0.0617, "step": 11860 }, { "grad_norm": 0.46689778566360474, "learning_rate": 9.737808251775369e-05, "loss": 0.057, "step": 11870 }, { "grad_norm": 0.46646490693092346, "learning_rate": 9.737147341572512e-05, "loss": 0.0675, "step": 11880 }, { "grad_norm": 0.6111024022102356, "learning_rate": 9.736485621920735e-05, "loss": 0.0535, "step": 11890 }, { "grad_norm": 0.6969670653343201, "learning_rate": 9.735823092933108e-05, "loss": 0.0649, "step": 11900 }, { "grad_norm": 0.4614640772342682, "learning_rate": 9.735159754722838e-05, "loss": 0.0695, "step": 11910 }, { "grad_norm": 0.4850657880306244, "learning_rate": 9.734495607403275e-05, "loss": 0.0557, "step": 11920 }, { "grad_norm": 0.4173281192779541, "learning_rate": 9.733830651087901e-05, "loss": 0.0603, "step": 11930 }, { "grad_norm": 0.4374813437461853, "learning_rate": 9.733164885890338e-05, "loss": 0.0537, "step": 11940 }, { "grad_norm": 0.3972770571708679, "learning_rate": 9.732498311924349e-05, "loss": 0.0413, "step": 11950 }, { "grad_norm": 0.5699617266654968, "learning_rate": 9.731830929303833e-05, "loss": 0.0493, "step": 11960 }, { "grad_norm": 0.6178339719772339, "learning_rate": 9.731162738142827e-05, "loss": 0.0651, "step": 11970 }, { "grad_norm": 0.5505210161209106, "learning_rate": 9.730493738555506e-05, "loss": 0.0552, "step": 11980 }, { "grad_norm": 0.4336758553981781, "learning_rate": 9.729823930656186e-05, "loss": 0.0589, "step": 11990 }, { "grad_norm": 0.4784168601036072, "learning_rate": 9.729153314559316e-05, "loss": 0.0699, "step": 12000 }, { "grad_norm": 0.508312463760376, "learning_rate": 9.728481890379486e-05, "loss": 0.06, "step": 12010 }, { "grad_norm": 0.3893534243106842, "learning_rate": 9.727809658231428e-05, "loss": 0.0552, "step": 12020 }, { "grad_norm": 0.29796102643013, "learning_rate": 9.727136618230003e-05, "loss": 0.0517, "step": 12030 }, { "grad_norm": 0.5339233875274658, "learning_rate": 9.726462770490219e-05, "loss": 0.0652, "step": 12040 }, { "grad_norm": 0.5633680820465088, "learning_rate": 9.725788115127214e-05, "loss": 0.0689, "step": 12050 }, { "grad_norm": 0.6238586902618408, "learning_rate": 9.725112652256274e-05, "loss": 0.0704, "step": 12060 }, { "grad_norm": 0.3771195113658905, "learning_rate": 9.724436381992812e-05, "loss": 0.0839, "step": 12070 }, { "grad_norm": 0.6174362897872925, "learning_rate": 9.723759304452387e-05, "loss": 0.0571, "step": 12080 }, { "grad_norm": 0.6503215432167053, "learning_rate": 9.72308141975069e-05, "loss": 0.0589, "step": 12090 }, { "grad_norm": 0.5385487079620361, "learning_rate": 9.722402728003557e-05, "loss": 0.0684, "step": 12100 }, { "grad_norm": 0.4951871633529663, "learning_rate": 9.721723229326953e-05, "loss": 0.0643, "step": 12110 }, { "grad_norm": 0.5088367462158203, "learning_rate": 9.721042923836992e-05, "loss": 0.0521, "step": 12120 }, { "grad_norm": 0.439734548330307, "learning_rate": 9.720361811649914e-05, "loss": 0.0586, "step": 12130 }, { "grad_norm": 0.5745687484741211, "learning_rate": 9.719679892882106e-05, "loss": 0.0536, "step": 12140 }, { "grad_norm": 0.6281094551086426, "learning_rate": 9.718997167650085e-05, "loss": 0.0565, "step": 12150 }, { "grad_norm": 0.4176163971424103, "learning_rate": 9.718313636070515e-05, "loss": 0.0437, "step": 12160 }, { "grad_norm": 0.451083242893219, "learning_rate": 9.717629298260192e-05, "loss": 0.0542, "step": 12170 }, { "grad_norm": 0.6961185932159424, "learning_rate": 9.716944154336047e-05, "loss": 0.0698, "step": 12180 }, { "grad_norm": 0.4768107831478119, "learning_rate": 9.716258204415157e-05, "loss": 0.0571, "step": 12190 }, { "grad_norm": 0.36736688017845154, "learning_rate": 9.715571448614728e-05, "loss": 0.0667, "step": 12200 }, { "grad_norm": 0.3951527774333954, "learning_rate": 9.71488388705211e-05, "loss": 0.0767, "step": 12210 }, { "grad_norm": 0.4301919937133789, "learning_rate": 9.714195519844788e-05, "loss": 0.0487, "step": 12220 }, { "grad_norm": 0.5247161984443665, "learning_rate": 9.713506347110386e-05, "loss": 0.0637, "step": 12230 }, { "grad_norm": 0.259634405374527, "learning_rate": 9.712816368966662e-05, "loss": 0.0502, "step": 12240 }, { "grad_norm": 0.45500341057777405, "learning_rate": 9.712125585531517e-05, "loss": 0.0735, "step": 12250 }, { "grad_norm": 0.5666537880897522, "learning_rate": 9.711433996922988e-05, "loss": 0.083, "step": 12260 }, { "grad_norm": 0.442716509103775, "learning_rate": 9.710741603259245e-05, "loss": 0.0669, "step": 12270 }, { "grad_norm": 0.4983271658420563, "learning_rate": 9.710048404658603e-05, "loss": 0.0582, "step": 12280 }, { "grad_norm": 0.3714739978313446, "learning_rate": 9.709354401239508e-05, "loss": 0.0596, "step": 12290 }, { "grad_norm": 0.47869664430618286, "learning_rate": 9.708659593120546e-05, "loss": 0.047, "step": 12300 }, { "grad_norm": 0.5367125868797302, "learning_rate": 9.707963980420443e-05, "loss": 0.0557, "step": 12310 }, { "grad_norm": 0.7279293537139893, "learning_rate": 9.707267563258058e-05, "loss": 0.066, "step": 12320 }, { "grad_norm": 0.358759343624115, "learning_rate": 9.70657034175239e-05, "loss": 0.0588, "step": 12330 }, { "grad_norm": 0.5555664300918579, "learning_rate": 9.705872316022577e-05, "loss": 0.0704, "step": 12340 }, { "grad_norm": 0.6790883541107178, "learning_rate": 9.705173486187891e-05, "loss": 0.0626, "step": 12350 }, { "grad_norm": 0.45110148191452026, "learning_rate": 9.704473852367741e-05, "loss": 0.0667, "step": 12360 }, { "grad_norm": 0.4314880073070526, "learning_rate": 9.70377341468168e-05, "loss": 0.0502, "step": 12370 }, { "grad_norm": 0.38417866826057434, "learning_rate": 9.703072173249389e-05, "loss": 0.0499, "step": 12380 }, { "grad_norm": 0.6005095839500427, "learning_rate": 9.702370128190693e-05, "loss": 0.0552, "step": 12390 }, { "grad_norm": 0.48568615317344666, "learning_rate": 9.701667279625552e-05, "loss": 0.06, "step": 12400 }, { "grad_norm": 0.5800198912620544, "learning_rate": 9.700963627674065e-05, "loss": 0.0651, "step": 12410 }, { "grad_norm": 0.6564385890960693, "learning_rate": 9.700259172456466e-05, "loss": 0.0614, "step": 12420 }, { "grad_norm": 0.5126976370811462, "learning_rate": 9.699553914093124e-05, "loss": 0.0686, "step": 12430 }, { "grad_norm": 0.5238712430000305, "learning_rate": 9.698847852704553e-05, "loss": 0.0679, "step": 12440 }, { "grad_norm": 0.4680565595626831, "learning_rate": 9.6981409884114e-05, "loss": 0.0561, "step": 12450 }, { "grad_norm": 0.4497414529323578, "learning_rate": 9.697433321334443e-05, "loss": 0.0633, "step": 12460 }, { "grad_norm": 0.42903217673301697, "learning_rate": 9.696724851594607e-05, "loss": 0.0611, "step": 12470 }, { "grad_norm": 0.5119521021842957, "learning_rate": 9.696015579312952e-05, "loss": 0.0498, "step": 12480 }, { "grad_norm": 0.38429024815559387, "learning_rate": 9.695305504610668e-05, "loss": 0.0517, "step": 12490 }, { "grad_norm": 0.3940727710723877, "learning_rate": 9.694594627609092e-05, "loss": 0.0539, "step": 12500 }, { "grad_norm": 0.46301254630088806, "learning_rate": 9.693882948429691e-05, "loss": 0.0666, "step": 12510 }, { "grad_norm": 0.4297737181186676, "learning_rate": 9.693170467194071e-05, "loss": 0.0471, "step": 12520 }, { "grad_norm": 0.2994825839996338, "learning_rate": 9.692457184023977e-05, "loss": 0.0523, "step": 12530 }, { "grad_norm": 0.525976300239563, "learning_rate": 9.691743099041291e-05, "loss": 0.0594, "step": 12540 }, { "grad_norm": 0.5679636597633362, "learning_rate": 9.691028212368027e-05, "loss": 0.0553, "step": 12550 }, { "grad_norm": 0.6155673265457153, "learning_rate": 9.690312524126342e-05, "loss": 0.0532, "step": 12560 }, { "grad_norm": 0.42327290773391724, "learning_rate": 9.689596034438527e-05, "loss": 0.0541, "step": 12570 }, { "grad_norm": 0.23242095112800598, "learning_rate": 9.688878743427012e-05, "loss": 0.0502, "step": 12580 }, { "grad_norm": 0.45328426361083984, "learning_rate": 9.688160651214359e-05, "loss": 0.0578, "step": 12590 }, { "grad_norm": 0.5501020550727844, "learning_rate": 9.687441757923273e-05, "loss": 0.0553, "step": 12600 }, { "grad_norm": 0.38956552743911743, "learning_rate": 9.68672206367659e-05, "loss": 0.0526, "step": 12610 }, { "grad_norm": 0.4872821867465973, "learning_rate": 9.686001568597291e-05, "loss": 0.052, "step": 12620 }, { "grad_norm": 0.5207077264785767, "learning_rate": 9.685280272808486e-05, "loss": 0.0512, "step": 12630 }, { "grad_norm": 0.6682066321372986, "learning_rate": 9.684558176433424e-05, "loss": 0.0706, "step": 12640 }, { "grad_norm": 0.6672905087471008, "learning_rate": 9.683835279595495e-05, "loss": 0.0623, "step": 12650 }, { "grad_norm": 0.4992046654224396, "learning_rate": 9.683111582418216e-05, "loss": 0.0655, "step": 12660 }, { "grad_norm": 0.4321530759334564, "learning_rate": 9.682387085025254e-05, "loss": 0.0511, "step": 12670 }, { "grad_norm": 0.5082467794418335, "learning_rate": 9.681661787540401e-05, "loss": 0.0651, "step": 12680 }, { "grad_norm": 0.5544316172599792, "learning_rate": 9.680935690087593e-05, "loss": 0.0587, "step": 12690 }, { "grad_norm": 0.5326444506645203, "learning_rate": 9.680208792790901e-05, "loss": 0.0627, "step": 12700 }, { "grad_norm": 0.4822700023651123, "learning_rate": 9.679481095774529e-05, "loss": 0.056, "step": 12710 }, { "grad_norm": 0.43254977464675903, "learning_rate": 9.678752599162822e-05, "loss": 0.0569, "step": 12720 }, { "grad_norm": 0.5730451345443726, "learning_rate": 9.678023303080259e-05, "loss": 0.0517, "step": 12730 }, { "grad_norm": 0.43799230456352234, "learning_rate": 9.677293207651459e-05, "loss": 0.0569, "step": 12740 }, { "grad_norm": 0.3739047646522522, "learning_rate": 9.676562313001173e-05, "loss": 0.0474, "step": 12750 }, { "grad_norm": 0.5843848586082458, "learning_rate": 9.675830619254293e-05, "loss": 0.0659, "step": 12760 }, { "grad_norm": 0.6382627487182617, "learning_rate": 9.675098126535843e-05, "loss": 0.0658, "step": 12770 }, { "grad_norm": 0.5332679152488708, "learning_rate": 9.674364834970988e-05, "loss": 0.0645, "step": 12780 }, { "grad_norm": 0.516463041305542, "learning_rate": 9.673630744685028e-05, "loss": 0.059, "step": 12790 }, { "grad_norm": 0.40653061866760254, "learning_rate": 9.672895855803397e-05, "loss": 0.0565, "step": 12800 }, { "grad_norm": 0.317954957485199, "learning_rate": 9.672160168451667e-05, "loss": 0.0505, "step": 12810 }, { "grad_norm": 0.7046358585357666, "learning_rate": 9.671423682755549e-05, "loss": 0.0687, "step": 12820 }, { "grad_norm": 0.6180813312530518, "learning_rate": 9.670686398840888e-05, "loss": 0.0678, "step": 12830 }, { "grad_norm": 0.4528583288192749, "learning_rate": 9.669948316833664e-05, "loss": 0.0492, "step": 12840 }, { "grad_norm": 0.3629089891910553, "learning_rate": 9.669209436859997e-05, "loss": 0.0504, "step": 12850 }, { "grad_norm": 0.46608635783195496, "learning_rate": 9.66846975904614e-05, "loss": 0.0482, "step": 12860 }, { "grad_norm": 0.5228429436683655, "learning_rate": 9.667729283518483e-05, "loss": 0.0474, "step": 12870 }, { "grad_norm": 0.5946757197380066, "learning_rate": 9.666988010403557e-05, "loss": 0.05, "step": 12880 }, { "grad_norm": 0.35808444023132324, "learning_rate": 9.66624593982802e-05, "loss": 0.0495, "step": 12890 }, { "grad_norm": 0.394538938999176, "learning_rate": 9.665503071918675e-05, "loss": 0.0654, "step": 12900 }, { "grad_norm": 0.461523175239563, "learning_rate": 9.664759406802456e-05, "loss": 0.0479, "step": 12910 }, { "grad_norm": 0.4583331346511841, "learning_rate": 9.664014944606437e-05, "loss": 0.0411, "step": 12920 }, { "grad_norm": 0.396570086479187, "learning_rate": 9.663269685457822e-05, "loss": 0.0555, "step": 12930 }, { "grad_norm": 0.475657194852829, "learning_rate": 9.662523629483962e-05, "loss": 0.0533, "step": 12940 }, { "grad_norm": 0.4590015411376953, "learning_rate": 9.661776776812333e-05, "loss": 0.047, "step": 12950 }, { "grad_norm": 0.399452805519104, "learning_rate": 9.661029127570553e-05, "loss": 0.0535, "step": 12960 }, { "grad_norm": 0.5399861931800842, "learning_rate": 9.660280681886373e-05, "loss": 0.0573, "step": 12970 }, { "grad_norm": 0.42048659920692444, "learning_rate": 9.659531439887685e-05, "loss": 0.0437, "step": 12980 }, { "grad_norm": 0.6112729907035828, "learning_rate": 9.658781401702511e-05, "loss": 0.0642, "step": 12990 }, { "grad_norm": 0.6229283809661865, "learning_rate": 9.658030567459015e-05, "loss": 0.0516, "step": 13000 }, { "grad_norm": 0.4835776090621948, "learning_rate": 9.65727893728549e-05, "loss": 0.0683, "step": 13010 }, { "grad_norm": 0.5735816359519958, "learning_rate": 9.656526511310375e-05, "loss": 0.0657, "step": 13020 }, { "grad_norm": 0.36295846104621887, "learning_rate": 9.655773289662233e-05, "loss": 0.0627, "step": 13030 }, { "grad_norm": 0.49264830350875854, "learning_rate": 9.655019272469772e-05, "loss": 0.0515, "step": 13040 }, { "grad_norm": 0.3799680471420288, "learning_rate": 9.654264459861832e-05, "loss": 0.0382, "step": 13050 }, { "grad_norm": 0.3839673101902008, "learning_rate": 9.653508851967391e-05, "loss": 0.0498, "step": 13060 }, { "grad_norm": 0.6036388874053955, "learning_rate": 9.65275244891556e-05, "loss": 0.0598, "step": 13070 }, { "grad_norm": 0.452420711517334, "learning_rate": 9.651995250835591e-05, "loss": 0.0497, "step": 13080 }, { "grad_norm": 0.5009552836418152, "learning_rate": 9.651237257856862e-05, "loss": 0.0555, "step": 13090 }, { "grad_norm": 0.3853681981563568, "learning_rate": 9.6504784701089e-05, "loss": 0.0503, "step": 13100 }, { "grad_norm": 0.39679187536239624, "learning_rate": 9.649718887721357e-05, "loss": 0.047, "step": 13110 }, { "grad_norm": 0.5313712358474731, "learning_rate": 9.648958510824028e-05, "loss": 0.0504, "step": 13120 }, { "grad_norm": 0.40359553694725037, "learning_rate": 9.648197339546837e-05, "loss": 0.0652, "step": 13130 }, { "grad_norm": 0.5241568088531494, "learning_rate": 9.647435374019851e-05, "loss": 0.0566, "step": 13140 }, { "grad_norm": 0.7275563478469849, "learning_rate": 9.646672614373266e-05, "loss": 0.0559, "step": 13150 }, { "grad_norm": 0.8062583804130554, "learning_rate": 9.645909060737418e-05, "loss": 0.0564, "step": 13160 }, { "grad_norm": 0.37637144327163696, "learning_rate": 9.645144713242778e-05, "loss": 0.0526, "step": 13170 }, { "grad_norm": 0.43788889050483704, "learning_rate": 9.64437957201995e-05, "loss": 0.058, "step": 13180 }, { "grad_norm": 0.5193544030189514, "learning_rate": 9.643613637199678e-05, "loss": 0.0501, "step": 13190 }, { "grad_norm": 0.5266392827033997, "learning_rate": 9.642846908912839e-05, "loss": 0.0623, "step": 13200 }, { "grad_norm": 0.34718140959739685, "learning_rate": 9.642079387290444e-05, "loss": 0.0461, "step": 13210 }, { "grad_norm": 0.44600051641464233, "learning_rate": 9.641311072463644e-05, "loss": 0.0542, "step": 13220 }, { "grad_norm": 0.4068472981452942, "learning_rate": 9.640541964563722e-05, "loss": 0.0625, "step": 13230 }, { "grad_norm": 0.6845272779464722, "learning_rate": 9.639772063722096e-05, "loss": 0.0721, "step": 13240 }, { "grad_norm": 0.38633227348327637, "learning_rate": 9.639001370070324e-05, "loss": 0.0495, "step": 13250 }, { "grad_norm": 0.5158147215843201, "learning_rate": 9.638229883740095e-05, "loss": 0.0529, "step": 13260 }, { "grad_norm": 0.4466469883918762, "learning_rate": 9.637457604863233e-05, "loss": 0.0454, "step": 13270 }, { "grad_norm": 0.406982958316803, "learning_rate": 9.636684533571703e-05, "loss": 0.0473, "step": 13280 }, { "grad_norm": 0.39714792370796204, "learning_rate": 9.635910669997599e-05, "loss": 0.0547, "step": 13290 }, { "grad_norm": 0.8019206523895264, "learning_rate": 9.635136014273154e-05, "loss": 0.0574, "step": 13300 }, { "grad_norm": 0.5107654929161072, "learning_rate": 9.634360566530735e-05, "loss": 0.0616, "step": 13310 }, { "grad_norm": 0.5184831023216248, "learning_rate": 9.633584326902845e-05, "loss": 0.0728, "step": 13320 }, { "grad_norm": 0.49459850788116455, "learning_rate": 9.632807295522124e-05, "loss": 0.0537, "step": 13330 }, { "grad_norm": 0.5792140364646912, "learning_rate": 9.632029472521342e-05, "loss": 0.0555, "step": 13340 }, { "grad_norm": 0.42952093482017517, "learning_rate": 9.631250858033409e-05, "loss": 0.0521, "step": 13350 }, { "grad_norm": 0.4056546688079834, "learning_rate": 9.630471452191371e-05, "loss": 0.0566, "step": 13360 }, { "grad_norm": 0.3412325382232666, "learning_rate": 9.629691255128405e-05, "loss": 0.0618, "step": 13370 }, { "grad_norm": 0.5883224010467529, "learning_rate": 9.628910266977825e-05, "loss": 0.0556, "step": 13380 }, { "grad_norm": 0.4362053871154785, "learning_rate": 9.628128487873083e-05, "loss": 0.0586, "step": 13390 }, { "grad_norm": 0.3857019245624542, "learning_rate": 9.627345917947761e-05, "loss": 0.0646, "step": 13400 }, { "grad_norm": 0.4557597041130066, "learning_rate": 9.626562557335579e-05, "loss": 0.0542, "step": 13410 }, { "grad_norm": 0.5967693328857422, "learning_rate": 9.625778406170393e-05, "loss": 0.0546, "step": 13420 }, { "grad_norm": 0.6393765807151794, "learning_rate": 9.624993464586193e-05, "loss": 0.0555, "step": 13430 }, { "grad_norm": 0.9492204785346985, "learning_rate": 9.624207732717105e-05, "loss": 0.0674, "step": 13440 }, { "grad_norm": 0.5531773567199707, "learning_rate": 9.623421210697386e-05, "loss": 0.071, "step": 13450 }, { "grad_norm": 0.5705338716506958, "learning_rate": 9.622633898661434e-05, "loss": 0.0664, "step": 13460 }, { "grad_norm": 0.5228156447410583, "learning_rate": 9.621845796743778e-05, "loss": 0.0717, "step": 13470 }, { "grad_norm": 0.41122686862945557, "learning_rate": 9.621056905079082e-05, "loss": 0.0624, "step": 13480 }, { "grad_norm": 0.35819023847579956, "learning_rate": 9.620267223802149e-05, "loss": 0.0601, "step": 13490 }, { "grad_norm": 0.5768851041793823, "learning_rate": 9.619476753047911e-05, "loss": 0.0568, "step": 13500 }, { "grad_norm": 0.6150023341178894, "learning_rate": 9.618685492951438e-05, "loss": 0.0674, "step": 13510 }, { "grad_norm": 0.4555216431617737, "learning_rate": 9.617893443647938e-05, "loss": 0.0686, "step": 13520 }, { "grad_norm": 0.6599083542823792, "learning_rate": 9.617100605272746e-05, "loss": 0.0774, "step": 13530 }, { "grad_norm": 0.4650951027870178, "learning_rate": 9.616306977961338e-05, "loss": 0.0604, "step": 13540 }, { "grad_norm": 0.47509610652923584, "learning_rate": 9.615512561849326e-05, "loss": 0.0537, "step": 13550 }, { "grad_norm": 0.5183965563774109, "learning_rate": 9.61471735707245e-05, "loss": 0.0629, "step": 13560 }, { "grad_norm": 0.44399717450141907, "learning_rate": 9.613921363766592e-05, "loss": 0.0543, "step": 13570 }, { "grad_norm": 0.4387592673301697, "learning_rate": 9.613124582067763e-05, "loss": 0.0592, "step": 13580 }, { "grad_norm": 0.711341142654419, "learning_rate": 9.612327012112112e-05, "loss": 0.0711, "step": 13590 }, { "grad_norm": 0.46586835384368896, "learning_rate": 9.611528654035921e-05, "loss": 0.0829, "step": 13600 }, { "grad_norm": 0.4049167335033417, "learning_rate": 9.610729507975611e-05, "loss": 0.0519, "step": 13610 }, { "grad_norm": 0.3950308859348297, "learning_rate": 9.609929574067731e-05, "loss": 0.0693, "step": 13620 }, { "grad_norm": 0.40929606556892395, "learning_rate": 9.609128852448967e-05, "loss": 0.0487, "step": 13630 }, { "grad_norm": 0.3893642723560333, "learning_rate": 9.608327343256143e-05, "loss": 0.0544, "step": 13640 }, { "grad_norm": 0.38704434037208557, "learning_rate": 9.607525046626216e-05, "loss": 0.0536, "step": 13650 }, { "grad_norm": 0.550605833530426, "learning_rate": 9.606721962696272e-05, "loss": 0.0627, "step": 13660 }, { "grad_norm": 0.496146559715271, "learning_rate": 9.60591809160354e-05, "loss": 0.0591, "step": 13670 }, { "grad_norm": 0.585860550403595, "learning_rate": 9.605113433485378e-05, "loss": 0.0514, "step": 13680 }, { "grad_norm": 0.5782285332679749, "learning_rate": 9.604307988479279e-05, "loss": 0.0522, "step": 13690 }, { "grad_norm": 0.4286107122898102, "learning_rate": 9.603501756722876e-05, "loss": 0.0585, "step": 13700 }, { "grad_norm": 0.42148861289024353, "learning_rate": 9.602694738353927e-05, "loss": 0.0498, "step": 13710 }, { "grad_norm": 0.458650678396225, "learning_rate": 9.601886933510331e-05, "loss": 0.0452, "step": 13720 }, { "grad_norm": 0.4473418891429901, "learning_rate": 9.60107834233012e-05, "loss": 0.0483, "step": 13730 }, { "grad_norm": 0.40928730368614197, "learning_rate": 9.60026896495146e-05, "loss": 0.0427, "step": 13740 }, { "grad_norm": 0.3577019274234772, "learning_rate": 9.599458801512652e-05, "loss": 0.0491, "step": 13750 }, { "grad_norm": 0.5461050868034363, "learning_rate": 9.598647852152129e-05, "loss": 0.0718, "step": 13760 }, { "grad_norm": 0.415498286485672, "learning_rate": 9.597836117008462e-05, "loss": 0.0527, "step": 13770 }, { "grad_norm": 0.5185036063194275, "learning_rate": 9.597023596220356e-05, "loss": 0.0511, "step": 13780 }, { "grad_norm": 0.5062903165817261, "learning_rate": 9.596210289926643e-05, "loss": 0.0467, "step": 13790 }, { "grad_norm": 0.5436053276062012, "learning_rate": 9.5953961982663e-05, "loss": 0.0418, "step": 13800 }, { "grad_norm": 0.5463793873786926, "learning_rate": 9.594581321378431e-05, "loss": 0.0537, "step": 13810 }, { "grad_norm": 0.35424911975860596, "learning_rate": 9.593765659402276e-05, "loss": 0.0511, "step": 13820 }, { "grad_norm": 0.4462425112724304, "learning_rate": 9.59294921247721e-05, "loss": 0.0551, "step": 13830 }, { "grad_norm": 0.43719273805618286, "learning_rate": 9.59213198074274e-05, "loss": 0.059, "step": 13840 }, { "grad_norm": 0.4526122212409973, "learning_rate": 9.59131396433851e-05, "loss": 0.0466, "step": 13850 }, { "grad_norm": 0.46751677989959717, "learning_rate": 9.590495163404297e-05, "loss": 0.0585, "step": 13860 }, { "grad_norm": 0.47963759303092957, "learning_rate": 9.589675578080009e-05, "loss": 0.0606, "step": 13870 }, { "grad_norm": 0.36476194858551025, "learning_rate": 9.588855208505694e-05, "loss": 0.0458, "step": 13880 }, { "grad_norm": 0.6570359468460083, "learning_rate": 9.588034054821529e-05, "loss": 0.073, "step": 13890 }, { "grad_norm": 0.5787973403930664, "learning_rate": 9.587212117167826e-05, "loss": 0.046, "step": 13900 }, { "grad_norm": 0.5146920680999756, "learning_rate": 9.586389395685033e-05, "loss": 0.0536, "step": 13910 }, { "grad_norm": 0.5033076405525208, "learning_rate": 9.585565890513733e-05, "loss": 0.0634, "step": 13920 }, { "grad_norm": 0.49523618817329407, "learning_rate": 9.584741601794636e-05, "loss": 0.0554, "step": 13930 }, { "grad_norm": 0.5313734412193298, "learning_rate": 9.58391652966859e-05, "loss": 0.0604, "step": 13940 }, { "grad_norm": 0.5585854053497314, "learning_rate": 9.583090674276583e-05, "loss": 0.0533, "step": 13950 }, { "grad_norm": 0.3265303671360016, "learning_rate": 9.582264035759726e-05, "loss": 0.0499, "step": 13960 }, { "grad_norm": 0.596828281879425, "learning_rate": 9.58143661425927e-05, "loss": 0.0869, "step": 13970 }, { "grad_norm": 0.4451601207256317, "learning_rate": 9.580608409916601e-05, "loss": 0.0657, "step": 13980 }, { "grad_norm": 0.3955245912075043, "learning_rate": 9.579779422873233e-05, "loss": 0.0665, "step": 13990 }, { "grad_norm": 0.4898768961429596, "learning_rate": 9.578949653270819e-05, "loss": 0.0584, "step": 14000 }, { "grad_norm": 0.5250555276870728, "learning_rate": 9.578119101251144e-05, "loss": 0.0479, "step": 14010 }, { "grad_norm": 0.33549976348876953, "learning_rate": 9.577287766956127e-05, "loss": 0.0632, "step": 14020 }, { "grad_norm": 0.4389890432357788, "learning_rate": 9.57645565052782e-05, "loss": 0.0571, "step": 14030 }, { "grad_norm": 0.5942932963371277, "learning_rate": 9.575622752108407e-05, "loss": 0.0529, "step": 14040 }, { "grad_norm": 0.4221065640449524, "learning_rate": 9.57478907184021e-05, "loss": 0.0581, "step": 14050 }, { "grad_norm": 0.5964877605438232, "learning_rate": 9.573954609865681e-05, "loss": 0.0597, "step": 14060 }, { "grad_norm": 0.4764214754104614, "learning_rate": 9.573119366327408e-05, "loss": 0.0548, "step": 14070 }, { "grad_norm": 0.49264076352119446, "learning_rate": 9.57228334136811e-05, "loss": 0.0529, "step": 14080 }, { "grad_norm": 0.482690691947937, "learning_rate": 9.571446535130641e-05, "loss": 0.058, "step": 14090 }, { "grad_norm": 0.6076065301895142, "learning_rate": 9.570608947757988e-05, "loss": 0.0609, "step": 14100 }, { "grad_norm": 0.44847357273101807, "learning_rate": 9.569770579393274e-05, "loss": 0.0574, "step": 14110 }, { "grad_norm": 0.5404349565505981, "learning_rate": 9.56893143017975e-05, "loss": 0.0589, "step": 14120 }, { "grad_norm": 0.4248036742210388, "learning_rate": 9.568091500260806e-05, "loss": 0.0607, "step": 14130 }, { "grad_norm": 0.45067206025123596, "learning_rate": 9.567250789779961e-05, "loss": 0.0594, "step": 14140 }, { "grad_norm": 0.5536141395568848, "learning_rate": 9.566409298880872e-05, "loss": 0.046, "step": 14150 }, { "grad_norm": 0.5934009552001953, "learning_rate": 9.565567027707326e-05, "loss": 0.0724, "step": 14160 }, { "grad_norm": 0.47881245613098145, "learning_rate": 9.56472397640324e-05, "loss": 0.058, "step": 14170 }, { "grad_norm": 0.42804160714149475, "learning_rate": 9.563880145112675e-05, "loss": 0.0521, "step": 14180 }, { "grad_norm": 0.5155171155929565, "learning_rate": 9.563035533979814e-05, "loss": 0.0597, "step": 14190 }, { "grad_norm": 0.7945255637168884, "learning_rate": 9.562190143148981e-05, "loss": 0.0589, "step": 14200 }, { "grad_norm": 0.474673867225647, "learning_rate": 9.561343972764627e-05, "loss": 0.0564, "step": 14210 }, { "grad_norm": 0.38457190990448, "learning_rate": 9.560497022971343e-05, "loss": 0.0622, "step": 14220 }, { "grad_norm": 0.4293605089187622, "learning_rate": 9.559649293913847e-05, "loss": 0.0494, "step": 14230 }, { "grad_norm": 0.6011494398117065, "learning_rate": 9.558800785736993e-05, "loss": 0.0523, "step": 14240 }, { "grad_norm": 0.3547777831554413, "learning_rate": 9.557951498585767e-05, "loss": 0.0547, "step": 14250 }, { "grad_norm": 0.42474550008773804, "learning_rate": 9.557101432605293e-05, "loss": 0.0472, "step": 14260 }, { "grad_norm": 0.44859230518341064, "learning_rate": 9.556250587940818e-05, "loss": 0.0556, "step": 14270 }, { "grad_norm": 0.5457127690315247, "learning_rate": 9.555398964737734e-05, "loss": 0.0446, "step": 14280 }, { "grad_norm": 0.5887022018432617, "learning_rate": 9.554546563141555e-05, "loss": 0.0502, "step": 14290 }, { "grad_norm": 0.6038946509361267, "learning_rate": 9.553693383297937e-05, "loss": 0.064, "step": 14300 }, { "grad_norm": 0.46753740310668945, "learning_rate": 9.552839425352663e-05, "loss": 0.0529, "step": 14310 }, { "grad_norm": 0.4085116684436798, "learning_rate": 9.551984689451652e-05, "loss": 0.0582, "step": 14320 }, { "grad_norm": 0.4451644718647003, "learning_rate": 9.551129175740953e-05, "loss": 0.0486, "step": 14330 }, { "grad_norm": 0.4281066954135895, "learning_rate": 9.550272884366754e-05, "loss": 0.0557, "step": 14340 }, { "grad_norm": 0.5113348960876465, "learning_rate": 9.549415815475369e-05, "loss": 0.0502, "step": 14350 }, { "grad_norm": 0.333560585975647, "learning_rate": 9.548557969213247e-05, "loss": 0.0474, "step": 14360 }, { "grad_norm": 0.5316098928451538, "learning_rate": 9.547699345726972e-05, "loss": 0.0555, "step": 14370 }, { "grad_norm": 0.33038967847824097, "learning_rate": 9.546839945163257e-05, "loss": 0.0488, "step": 14380 }, { "grad_norm": 0.3589778542518616, "learning_rate": 9.545979767668953e-05, "loss": 0.0619, "step": 14390 }, { "grad_norm": 0.3842172622680664, "learning_rate": 9.54511881339104e-05, "loss": 0.0492, "step": 14400 }, { "grad_norm": 0.47122910618782043, "learning_rate": 9.54425708247663e-05, "loss": 0.0428, "step": 14410 }, { "grad_norm": 0.5029028654098511, "learning_rate": 9.543394575072972e-05, "loss": 0.0481, "step": 14420 }, { "grad_norm": 0.33108511567115784, "learning_rate": 9.542531291327441e-05, "loss": 0.051, "step": 14430 }, { "grad_norm": 0.41530895233154297, "learning_rate": 9.541667231387552e-05, "loss": 0.0508, "step": 14440 }, { "grad_norm": 0.4797438085079193, "learning_rate": 9.540802395400949e-05, "loss": 0.0533, "step": 14450 }, { "grad_norm": 0.47740858793258667, "learning_rate": 9.539936783515406e-05, "loss": 0.053, "step": 14460 }, { "grad_norm": 0.422791451215744, "learning_rate": 9.539070395878835e-05, "loss": 0.0585, "step": 14470 }, { "grad_norm": 0.46071386337280273, "learning_rate": 9.538203232639277e-05, "loss": 0.0453, "step": 14480 }, { "grad_norm": 0.46485233306884766, "learning_rate": 9.537335293944907e-05, "loss": 0.0559, "step": 14490 }, { "grad_norm": 0.3991223871707916, "learning_rate": 9.536466579944032e-05, "loss": 0.0576, "step": 14500 }, { "grad_norm": 0.39007583260536194, "learning_rate": 9.535597090785091e-05, "loss": 0.0591, "step": 14510 }, { "grad_norm": 0.46021685004234314, "learning_rate": 9.534726826616656e-05, "loss": 0.0491, "step": 14520 }, { "grad_norm": 0.4507134258747101, "learning_rate": 9.53385578758743e-05, "loss": 0.0727, "step": 14530 }, { "grad_norm": 0.4650369882583618, "learning_rate": 9.532983973846252e-05, "loss": 0.0546, "step": 14540 }, { "grad_norm": 0.3814445436000824, "learning_rate": 9.53211138554209e-05, "loss": 0.0543, "step": 14550 }, { "grad_norm": 0.4848548471927643, "learning_rate": 9.531238022824047e-05, "loss": 0.0526, "step": 14560 }, { "grad_norm": 0.4674452543258667, "learning_rate": 9.530363885841355e-05, "loss": 0.0446, "step": 14570 }, { "grad_norm": 0.6358458995819092, "learning_rate": 9.52948897474338e-05, "loss": 0.0617, "step": 14580 }, { "grad_norm": 0.44976210594177246, "learning_rate": 9.528613289679622e-05, "loss": 0.0546, "step": 14590 }, { "grad_norm": 0.29490765929222107, "learning_rate": 9.52773683079971e-05, "loss": 0.0434, "step": 14600 }, { "grad_norm": 0.5756850838661194, "learning_rate": 9.526859598253407e-05, "loss": 0.0502, "step": 14610 }, { "grad_norm": 0.328177809715271, "learning_rate": 9.525981592190609e-05, "loss": 0.0475, "step": 14620 }, { "grad_norm": 0.6405220627784729, "learning_rate": 9.525102812761342e-05, "loss": 0.0454, "step": 14630 }, { "grad_norm": 0.5689123272895813, "learning_rate": 9.524223260115768e-05, "loss": 0.0472, "step": 14640 }, { "grad_norm": 0.44282931089401245, "learning_rate": 9.523342934404175e-05, "loss": 0.0437, "step": 14650 }, { "grad_norm": 0.35986092686653137, "learning_rate": 9.522461835776989e-05, "loss": 0.0454, "step": 14660 }, { "grad_norm": 0.6142690181732178, "learning_rate": 9.521579964384764e-05, "loss": 0.0644, "step": 14670 }, { "grad_norm": 0.42789122462272644, "learning_rate": 9.52069732037819e-05, "loss": 0.0443, "step": 14680 }, { "grad_norm": 0.3970990478992462, "learning_rate": 9.519813903908083e-05, "loss": 0.0472, "step": 14690 }, { "grad_norm": 0.6197693347930908, "learning_rate": 9.5189297151254e-05, "loss": 0.0631, "step": 14700 }, { "grad_norm": 0.5977867841720581, "learning_rate": 9.518044754181218e-05, "loss": 0.067, "step": 14710 }, { "grad_norm": 0.6393871307373047, "learning_rate": 9.51715902122676e-05, "loss": 0.057, "step": 14720 }, { "grad_norm": 0.34001436829566956, "learning_rate": 9.516272516413368e-05, "loss": 0.0533, "step": 14730 }, { "grad_norm": 0.3365767002105713, "learning_rate": 9.515385239892525e-05, "loss": 0.063, "step": 14740 }, { "grad_norm": 0.4506396949291229, "learning_rate": 9.514497191815839e-05, "loss": 0.0612, "step": 14750 }, { "grad_norm": 0.5076624155044556, "learning_rate": 9.513608372335055e-05, "loss": 0.0547, "step": 14760 }, { "grad_norm": 0.5347692370414734, "learning_rate": 9.512718781602045e-05, "loss": 0.0511, "step": 14770 }, { "grad_norm": 0.47536465525627136, "learning_rate": 9.511828419768823e-05, "loss": 0.0512, "step": 14780 }, { "grad_norm": 0.3769194781780243, "learning_rate": 9.510937286987521e-05, "loss": 0.0437, "step": 14790 }, { "grad_norm": 0.6043297648429871, "learning_rate": 9.510045383410408e-05, "loss": 0.0588, "step": 14800 }, { "grad_norm": 0.47589781880378723, "learning_rate": 9.509152709189892e-05, "loss": 0.0528, "step": 14810 }, { "grad_norm": 0.3421868681907654, "learning_rate": 9.508259264478504e-05, "loss": 0.0522, "step": 14820 }, { "grad_norm": 0.4740440249443054, "learning_rate": 9.507365049428909e-05, "loss": 0.0643, "step": 14830 }, { "grad_norm": 0.4419344961643219, "learning_rate": 9.506470064193902e-05, "loss": 0.0684, "step": 14840 }, { "grad_norm": 0.445315957069397, "learning_rate": 9.505574308926414e-05, "loss": 0.0686, "step": 14850 }, { "grad_norm": 0.547569751739502, "learning_rate": 9.504677783779505e-05, "loss": 0.0519, "step": 14860 }, { "grad_norm": 0.6982003450393677, "learning_rate": 9.503780488906365e-05, "loss": 0.0517, "step": 14870 }, { "grad_norm": 0.4773062765598297, "learning_rate": 9.502882424460319e-05, "loss": 0.0538, "step": 14880 }, { "grad_norm": 0.3410710096359253, "learning_rate": 9.501983590594821e-05, "loss": 0.0439, "step": 14890 }, { "grad_norm": 0.4389824867248535, "learning_rate": 9.501083987463455e-05, "loss": 0.0461, "step": 14900 }, { "grad_norm": 0.5316569805145264, "learning_rate": 9.500183615219942e-05, "loss": 0.0511, "step": 14910 }, { "grad_norm": 0.6041095852851868, "learning_rate": 9.49928247401813e-05, "loss": 0.0515, "step": 14920 }, { "grad_norm": 0.5317081809043884, "learning_rate": 9.498380564011997e-05, "loss": 0.0488, "step": 14930 }, { "grad_norm": 0.34009402990341187, "learning_rate": 9.497477885355656e-05, "loss": 0.0507, "step": 14940 }, { "grad_norm": 0.4429684579372406, "learning_rate": 9.496574438203353e-05, "loss": 0.0505, "step": 14950 }, { "grad_norm": 0.41715937852859497, "learning_rate": 9.495670222709459e-05, "loss": 0.0497, "step": 14960 }, { "grad_norm": 0.48127809166908264, "learning_rate": 9.494765239028483e-05, "loss": 0.0458, "step": 14970 }, { "grad_norm": 0.5980187654495239, "learning_rate": 9.493859487315057e-05, "loss": 0.0503, "step": 14980 }, { "grad_norm": 0.37109777331352234, "learning_rate": 9.492952967723953e-05, "loss": 0.0535, "step": 14990 }, { "grad_norm": 0.5294158458709717, "learning_rate": 9.492045680410068e-05, "loss": 0.0579, "step": 15000 }, { "grad_norm": 0.4016527831554413, "learning_rate": 9.491137625528436e-05, "loss": 0.05, "step": 15010 }, { "grad_norm": 0.553155243396759, "learning_rate": 9.490228803234215e-05, "loss": 0.0486, "step": 15020 }, { "grad_norm": 0.5598495602607727, "learning_rate": 9.489319213682701e-05, "loss": 0.0505, "step": 15030 }, { "grad_norm": 0.3657766878604889, "learning_rate": 9.488408857029316e-05, "loss": 0.0639, "step": 15040 }, { "grad_norm": 0.3216487765312195, "learning_rate": 9.487497733429616e-05, "loss": 0.057, "step": 15050 }, { "grad_norm": 0.4488483667373657, "learning_rate": 9.486585843039286e-05, "loss": 0.0594, "step": 15060 }, { "grad_norm": 0.5099849700927734, "learning_rate": 9.485673186014143e-05, "loss": 0.0582, "step": 15070 }, { "grad_norm": 0.608860194683075, "learning_rate": 9.484759762510137e-05, "loss": 0.0671, "step": 15080 }, { "grad_norm": 0.4684206247329712, "learning_rate": 9.483845572683346e-05, "loss": 0.06, "step": 15090 }, { "grad_norm": 0.43800854682922363, "learning_rate": 9.48293061668998e-05, "loss": 0.0515, "step": 15100 }, { "grad_norm": 0.44029897451400757, "learning_rate": 9.48201489468638e-05, "loss": 0.0519, "step": 15110 }, { "grad_norm": 0.37394094467163086, "learning_rate": 9.481098406829016e-05, "loss": 0.05, "step": 15120 }, { "grad_norm": 0.8441257476806641, "learning_rate": 9.480181153274495e-05, "loss": 0.047, "step": 15130 }, { "grad_norm": 0.3270556628704071, "learning_rate": 9.479263134179548e-05, "loss": 0.0354, "step": 15140 }, { "grad_norm": 0.4632319211959839, "learning_rate": 9.478344349701039e-05, "loss": 0.0514, "step": 15150 }, { "grad_norm": 0.5470839738845825, "learning_rate": 9.477424799995964e-05, "loss": 0.0539, "step": 15160 }, { "grad_norm": 0.4870503842830658, "learning_rate": 9.476504485221448e-05, "loss": 0.0561, "step": 15170 }, { "grad_norm": 0.3518287241458893, "learning_rate": 9.475583405534748e-05, "loss": 0.0468, "step": 15180 }, { "grad_norm": 0.4037591814994812, "learning_rate": 9.474661561093251e-05, "loss": 0.0508, "step": 15190 }, { "grad_norm": 0.45932695269584656, "learning_rate": 9.473738952054478e-05, "loss": 0.0581, "step": 15200 }, { "grad_norm": 0.36843302845954895, "learning_rate": 9.472815578576073e-05, "loss": 0.0542, "step": 15210 }, { "grad_norm": 0.5673783421516418, "learning_rate": 9.471891440815817e-05, "loss": 0.0558, "step": 15220 }, { "grad_norm": 0.44230514764785767, "learning_rate": 9.470966538931621e-05, "loss": 0.0438, "step": 15230 }, { "grad_norm": 0.4144175052642822, "learning_rate": 9.470040873081525e-05, "loss": 0.0501, "step": 15240 }, { "grad_norm": 0.5201701521873474, "learning_rate": 9.469114443423698e-05, "loss": 0.0598, "step": 15250 }, { "grad_norm": 0.4014289975166321, "learning_rate": 9.468187250116445e-05, "loss": 0.0523, "step": 15260 }, { "grad_norm": 0.31348875164985657, "learning_rate": 9.467259293318197e-05, "loss": 0.0518, "step": 15270 }, { "grad_norm": 0.5063713192939758, "learning_rate": 9.466330573187514e-05, "loss": 0.0449, "step": 15280 }, { "grad_norm": 0.4225960373878479, "learning_rate": 9.46540108988309e-05, "loss": 0.069, "step": 15290 }, { "grad_norm": 0.5104843974113464, "learning_rate": 9.46447084356375e-05, "loss": 0.0654, "step": 15300 }, { "grad_norm": 0.3803739845752716, "learning_rate": 9.463539834388447e-05, "loss": 0.0564, "step": 15310 }, { "grad_norm": 0.5027730464935303, "learning_rate": 9.462608062516263e-05, "loss": 0.0419, "step": 15320 }, { "grad_norm": 0.4286441206932068, "learning_rate": 9.461675528106413e-05, "loss": 0.0501, "step": 15330 }, { "grad_norm": 0.4135034680366516, "learning_rate": 9.460742231318244e-05, "loss": 0.0591, "step": 15340 }, { "grad_norm": 0.7003588676452637, "learning_rate": 9.459808172311229e-05, "loss": 0.0679, "step": 15350 }, { "grad_norm": 0.36746546626091003, "learning_rate": 9.458873351244972e-05, "loss": 0.0461, "step": 15360 }, { "grad_norm": 0.5013552904129028, "learning_rate": 9.457937768279211e-05, "loss": 0.0543, "step": 15370 }, { "grad_norm": 0.3807920217514038, "learning_rate": 9.45700142357381e-05, "loss": 0.0494, "step": 15380 }, { "grad_norm": 0.4004082679748535, "learning_rate": 9.456064317288765e-05, "loss": 0.0517, "step": 15390 }, { "grad_norm": 0.3786054849624634, "learning_rate": 9.455126449584201e-05, "loss": 0.0483, "step": 15400 }, { "grad_norm": 0.4260689914226532, "learning_rate": 9.454187820620375e-05, "loss": 0.041, "step": 15410 }, { "grad_norm": 0.4134853482246399, "learning_rate": 9.453248430557673e-05, "loss": 0.0583, "step": 15420 }, { "grad_norm": 0.40948304533958435, "learning_rate": 9.452308279556611e-05, "loss": 0.0681, "step": 15430 }, { "grad_norm": 0.3926686942577362, "learning_rate": 9.451367367777835e-05, "loss": 0.0447, "step": 15440 }, { "grad_norm": 0.3569059669971466, "learning_rate": 9.450425695382122e-05, "loss": 0.055, "step": 15450 }, { "grad_norm": 0.5753059387207031, "learning_rate": 9.449483262530375e-05, "loss": 0.05, "step": 15460 }, { "grad_norm": 0.47253096103668213, "learning_rate": 9.448540069383633e-05, "loss": 0.0461, "step": 15470 }, { "grad_norm": 0.5226724743843079, "learning_rate": 9.447596116103061e-05, "loss": 0.0489, "step": 15480 }, { "grad_norm": 0.4580247402191162, "learning_rate": 9.446651402849955e-05, "loss": 0.0537, "step": 15490 }, { "grad_norm": 0.4213922321796417, "learning_rate": 9.44570592978574e-05, "loss": 0.0544, "step": 15500 }, { "grad_norm": 0.3888319730758667, "learning_rate": 9.444759697071972e-05, "loss": 0.0527, "step": 15510 }, { "grad_norm": 0.48531559109687805, "learning_rate": 9.443812704870336e-05, "loss": 0.0441, "step": 15520 }, { "grad_norm": 0.38240522146224976, "learning_rate": 9.442864953342649e-05, "loss": 0.0494, "step": 15530 }, { "grad_norm": 0.4002750813961029, "learning_rate": 9.441916442650852e-05, "loss": 0.0497, "step": 15540 }, { "grad_norm": 0.3275427520275116, "learning_rate": 9.440967172957023e-05, "loss": 0.0514, "step": 15550 }, { "grad_norm": 0.5512595772743225, "learning_rate": 9.440017144423364e-05, "loss": 0.0562, "step": 15560 }, { "grad_norm": 0.4538585841655731, "learning_rate": 9.439066357212209e-05, "loss": 0.0557, "step": 15570 }, { "grad_norm": 0.46685996651649475, "learning_rate": 9.438114811486022e-05, "loss": 0.0598, "step": 15580 }, { "grad_norm": 0.3928752541542053, "learning_rate": 9.4371625074074e-05, "loss": 0.045, "step": 15590 }, { "grad_norm": 0.3654257655143738, "learning_rate": 9.436209445139059e-05, "loss": 0.0452, "step": 15600 }, { "grad_norm": 0.30704551935195923, "learning_rate": 9.435255624843855e-05, "loss": 0.0518, "step": 15610 }, { "grad_norm": 0.3622589409351349, "learning_rate": 9.43430104668477e-05, "loss": 0.0428, "step": 15620 }, { "grad_norm": 0.48062804341316223, "learning_rate": 9.433345710824914e-05, "loss": 0.0508, "step": 15630 }, { "grad_norm": 0.5214651226997375, "learning_rate": 9.432389617427529e-05, "loss": 0.0473, "step": 15640 }, { "grad_norm": 0.5004831552505493, "learning_rate": 9.431432766655984e-05, "loss": 0.0511, "step": 15650 }, { "grad_norm": 0.582532525062561, "learning_rate": 9.430475158673778e-05, "loss": 0.059, "step": 15660 }, { "grad_norm": 0.5115883350372314, "learning_rate": 9.429516793644542e-05, "loss": 0.0508, "step": 15670 }, { "grad_norm": 0.3918122351169586, "learning_rate": 9.428557671732034e-05, "loss": 0.0434, "step": 15680 }, { "grad_norm": 0.36906370520591736, "learning_rate": 9.42759779310014e-05, "loss": 0.0491, "step": 15690 }, { "grad_norm": 0.4642312526702881, "learning_rate": 9.426637157912879e-05, "loss": 0.0536, "step": 15700 }, { "grad_norm": 0.5743995308876038, "learning_rate": 9.425675766334397e-05, "loss": 0.0516, "step": 15710 }, { "grad_norm": 0.6297668218612671, "learning_rate": 9.424713618528968e-05, "loss": 0.0474, "step": 15720 }, { "grad_norm": 0.5312448143959045, "learning_rate": 9.423750714661e-05, "loss": 0.0657, "step": 15730 }, { "grad_norm": 0.5792948007583618, "learning_rate": 9.422787054895022e-05, "loss": 0.0519, "step": 15740 }, { "grad_norm": 0.4381738007068634, "learning_rate": 9.4218226393957e-05, "loss": 0.06, "step": 15750 }, { "grad_norm": 0.5418424606323242, "learning_rate": 9.420857468327828e-05, "loss": 0.054, "step": 15760 }, { "grad_norm": 0.48037588596343994, "learning_rate": 9.419891541856323e-05, "loss": 0.0528, "step": 15770 }, { "grad_norm": 0.4324412941932678, "learning_rate": 9.41892486014624e-05, "loss": 0.0494, "step": 15780 }, { "grad_norm": 0.38642948865890503, "learning_rate": 9.417957423362756e-05, "loss": 0.0547, "step": 15790 }, { "grad_norm": 0.4662612974643707, "learning_rate": 9.416989231671178e-05, "loss": 0.0521, "step": 15800 }, { "grad_norm": 0.5276997685432434, "learning_rate": 9.416020285236946e-05, "loss": 0.0508, "step": 15810 }, { "grad_norm": 0.3553415536880493, "learning_rate": 9.415050584225626e-05, "loss": 0.0492, "step": 15820 }, { "grad_norm": 0.5122085213661194, "learning_rate": 9.414080128802914e-05, "loss": 0.0493, "step": 15830 }, { "grad_norm": 0.3632834255695343, "learning_rate": 9.413108919134632e-05, "loss": 0.0528, "step": 15840 }, { "grad_norm": 0.5919451117515564, "learning_rate": 9.412136955386734e-05, "loss": 0.0539, "step": 15850 }, { "grad_norm": 0.44512441754341125, "learning_rate": 9.411164237725303e-05, "loss": 0.0501, "step": 15860 }, { "grad_norm": 0.4767262935638428, "learning_rate": 9.41019076631655e-05, "loss": 0.0568, "step": 15870 }, { "grad_norm": 0.419369101524353, "learning_rate": 9.409216541326815e-05, "loss": 0.0549, "step": 15880 }, { "grad_norm": 0.5834709405899048, "learning_rate": 9.408241562922564e-05, "loss": 0.0559, "step": 15890 }, { "grad_norm": 0.7187374234199524, "learning_rate": 9.407265831270395e-05, "loss": 0.0508, "step": 15900 }, { "grad_norm": 0.5282018184661865, "learning_rate": 9.406289346537035e-05, "loss": 0.0478, "step": 15910 }, { "grad_norm": 0.49803364276885986, "learning_rate": 9.405312108889339e-05, "loss": 0.0477, "step": 15920 }, { "grad_norm": 0.5098147392272949, "learning_rate": 9.404334118494288e-05, "loss": 0.0528, "step": 15930 }, { "grad_norm": 0.4578053951263428, "learning_rate": 9.403355375518995e-05, "loss": 0.0467, "step": 15940 }, { "grad_norm": 0.44624924659729004, "learning_rate": 9.4023758801307e-05, "loss": 0.047, "step": 15950 }, { "grad_norm": 0.42414721846580505, "learning_rate": 9.401395632496774e-05, "loss": 0.0437, "step": 15960 }, { "grad_norm": 0.42368626594543457, "learning_rate": 9.400414632784711e-05, "loss": 0.0546, "step": 15970 }, { "grad_norm": 0.43017512559890747, "learning_rate": 9.39943288116214e-05, "loss": 0.0486, "step": 15980 }, { "grad_norm": 0.39341816306114197, "learning_rate": 9.398450377796815e-05, "loss": 0.0497, "step": 15990 }, { "grad_norm": 0.457612007856369, "learning_rate": 9.397467122856616e-05, "loss": 0.0454, "step": 16000 }, { "grad_norm": 0.43165910243988037, "learning_rate": 9.396483116509558e-05, "loss": 0.0419, "step": 16010 }, { "grad_norm": 0.3637091815471649, "learning_rate": 9.39549835892378e-05, "loss": 0.0521, "step": 16020 }, { "grad_norm": 0.3815147876739502, "learning_rate": 9.39451285026755e-05, "loss": 0.0488, "step": 16030 }, { "grad_norm": 0.5143526792526245, "learning_rate": 9.393526590709262e-05, "loss": 0.0467, "step": 16040 }, { "grad_norm": 0.44112923741340637, "learning_rate": 9.392539580417444e-05, "loss": 0.0428, "step": 16050 }, { "grad_norm": 0.4142274260520935, "learning_rate": 9.391551819560747e-05, "loss": 0.0488, "step": 16060 }, { "grad_norm": 0.4359363913536072, "learning_rate": 9.390563308307955e-05, "loss": 0.049, "step": 16070 }, { "grad_norm": 0.5290529131889343, "learning_rate": 9.389574046827974e-05, "loss": 0.0483, "step": 16080 }, { "grad_norm": 0.40042850375175476, "learning_rate": 9.388584035289845e-05, "loss": 0.0565, "step": 16090 }, { "grad_norm": 0.43609675765037537, "learning_rate": 9.387593273862732e-05, "loss": 0.0574, "step": 16100 }, { "grad_norm": 0.580829918384552, "learning_rate": 9.386601762715929e-05, "loss": 0.056, "step": 16110 }, { "grad_norm": 0.5191097259521484, "learning_rate": 9.38560950201886e-05, "loss": 0.0628, "step": 16120 }, { "grad_norm": 0.5721935033798218, "learning_rate": 9.384616491941071e-05, "loss": 0.0654, "step": 16130 }, { "grad_norm": 0.4550773501396179, "learning_rate": 9.383622732652245e-05, "loss": 0.054, "step": 16140 }, { "grad_norm": 0.40128618478775024, "learning_rate": 9.382628224322187e-05, "loss": 0.0606, "step": 16150 }, { "grad_norm": 0.2827898859977722, "learning_rate": 9.381632967120829e-05, "loss": 0.0546, "step": 16160 }, { "grad_norm": 0.4163252115249634, "learning_rate": 9.380636961218235e-05, "loss": 0.0419, "step": 16170 }, { "grad_norm": 0.3551839590072632, "learning_rate": 9.379640206784597e-05, "loss": 0.0404, "step": 16180 }, { "grad_norm": 0.37595707178115845, "learning_rate": 9.378642703990229e-05, "loss": 0.0412, "step": 16190 }, { "grad_norm": 0.4031367003917694, "learning_rate": 9.37764445300558e-05, "loss": 0.0491, "step": 16200 }, { "grad_norm": 0.3175992965698242, "learning_rate": 9.376645454001222e-05, "loss": 0.053, "step": 16210 }, { "grad_norm": 0.3658614754676819, "learning_rate": 9.375645707147858e-05, "loss": 0.0391, "step": 16220 }, { "grad_norm": 0.3974130153656006, "learning_rate": 9.374645212616316e-05, "loss": 0.0527, "step": 16230 }, { "grad_norm": 0.5041705369949341, "learning_rate": 9.373643970577555e-05, "loss": 0.0481, "step": 16240 }, { "grad_norm": 0.47137704491615295, "learning_rate": 9.372641981202659e-05, "loss": 0.0467, "step": 16250 }, { "grad_norm": 0.6911545395851135, "learning_rate": 9.37163924466284e-05, "loss": 0.0548, "step": 16260 }, { "grad_norm": 0.45630505681037903, "learning_rate": 9.370635761129438e-05, "loss": 0.0506, "step": 16270 }, { "grad_norm": 0.5334038734436035, "learning_rate": 9.36963153077392e-05, "loss": 0.0485, "step": 16280 }, { "grad_norm": 0.4360458254814148, "learning_rate": 9.368626553767888e-05, "loss": 0.049, "step": 16290 }, { "grad_norm": 0.4827522039413452, "learning_rate": 9.367620830283057e-05, "loss": 0.0418, "step": 16300 }, { "grad_norm": 0.4613913893699646, "learning_rate": 9.366614360491281e-05, "loss": 0.054, "step": 16310 }, { "grad_norm": 0.4600227475166321, "learning_rate": 9.365607144564539e-05, "loss": 0.0489, "step": 16320 }, { "grad_norm": 0.3913819491863251, "learning_rate": 9.364599182674934e-05, "loss": 0.0531, "step": 16330 }, { "grad_norm": 0.4206211268901825, "learning_rate": 9.3635904749947e-05, "loss": 0.0422, "step": 16340 }, { "grad_norm": 0.31845852732658386, "learning_rate": 9.362581021696202e-05, "loss": 0.0461, "step": 16350 }, { "grad_norm": 0.38868457078933716, "learning_rate": 9.361570822951921e-05, "loss": 0.0473, "step": 16360 }, { "grad_norm": 0.3791089355945587, "learning_rate": 9.360559878934476e-05, "loss": 0.0513, "step": 16370 }, { "grad_norm": 0.3545776903629303, "learning_rate": 9.359548189816611e-05, "loss": 0.0446, "step": 16380 }, { "grad_norm": 0.41993728280067444, "learning_rate": 9.358535755771193e-05, "loss": 0.0552, "step": 16390 }, { "grad_norm": 0.4316343367099762, "learning_rate": 9.357522576971221e-05, "loss": 0.0529, "step": 16400 }, { "grad_norm": 0.49926257133483887, "learning_rate": 9.356508653589819e-05, "loss": 0.0631, "step": 16410 }, { "grad_norm": 0.5017146468162537, "learning_rate": 9.355493985800237e-05, "loss": 0.05, "step": 16420 }, { "grad_norm": 0.489958256483078, "learning_rate": 9.354478573775857e-05, "loss": 0.0493, "step": 16430 }, { "grad_norm": 0.3483849763870239, "learning_rate": 9.353462417690186e-05, "loss": 0.0423, "step": 16440 }, { "grad_norm": 0.4985266923904419, "learning_rate": 9.352445517716853e-05, "loss": 0.0466, "step": 16450 }, { "grad_norm": 0.5540010333061218, "learning_rate": 9.351427874029621e-05, "loss": 0.0465, "step": 16460 }, { "grad_norm": 0.4627552628517151, "learning_rate": 9.350409486802379e-05, "loss": 0.0481, "step": 16470 }, { "grad_norm": 0.536679208278656, "learning_rate": 9.349390356209138e-05, "loss": 0.0565, "step": 16480 }, { "grad_norm": 0.49858084321022034, "learning_rate": 9.348370482424042e-05, "loss": 0.0479, "step": 16490 }, { "grad_norm": 0.5700745582580566, "learning_rate": 9.347349865621357e-05, "loss": 0.0508, "step": 16500 }, { "grad_norm": 0.3594331741333008, "learning_rate": 9.346328505975481e-05, "loss": 0.0545, "step": 16510 }, { "grad_norm": 0.3910166621208191, "learning_rate": 9.345306403660936e-05, "loss": 0.0543, "step": 16520 }, { "grad_norm": 0.5232212543487549, "learning_rate": 9.344283558852371e-05, "loss": 0.0531, "step": 16530 }, { "grad_norm": 0.46310052275657654, "learning_rate": 9.343259971724563e-05, "loss": 0.0475, "step": 16540 }, { "grad_norm": 0.5303177833557129, "learning_rate": 9.342235642452413e-05, "loss": 0.0553, "step": 16550 }, { "grad_norm": 0.47632792592048645, "learning_rate": 9.341210571210954e-05, "loss": 0.0516, "step": 16560 }, { "grad_norm": 0.44938087463378906, "learning_rate": 9.340184758175338e-05, "loss": 0.0447, "step": 16570 }, { "grad_norm": 0.3835536539554596, "learning_rate": 9.339158203520854e-05, "loss": 0.0555, "step": 16580 }, { "grad_norm": 0.5747255682945251, "learning_rate": 9.338130907422908e-05, "loss": 0.0437, "step": 16590 }, { "grad_norm": 0.43569090962409973, "learning_rate": 9.337102870057037e-05, "loss": 0.0413, "step": 16600 }, { "grad_norm": 0.36708030104637146, "learning_rate": 9.336074091598907e-05, "loss": 0.0448, "step": 16610 }, { "grad_norm": 0.3580847978591919, "learning_rate": 9.335044572224306e-05, "loss": 0.041, "step": 16620 }, { "grad_norm": 0.5249303579330444, "learning_rate": 9.334014312109151e-05, "loss": 0.039, "step": 16630 }, { "grad_norm": 0.5613766312599182, "learning_rate": 9.332983311429486e-05, "loss": 0.0568, "step": 16640 }, { "grad_norm": 0.38868847489356995, "learning_rate": 9.33195157036148e-05, "loss": 0.0505, "step": 16650 }, { "grad_norm": 0.42730751633644104, "learning_rate": 9.330919089081432e-05, "loss": 0.0459, "step": 16660 }, { "grad_norm": 0.4059789776802063, "learning_rate": 9.32988586776576e-05, "loss": 0.0614, "step": 16670 }, { "grad_norm": 0.32009679079055786, "learning_rate": 9.328851906591016e-05, "loss": 0.0497, "step": 16680 }, { "grad_norm": 0.344638854265213, "learning_rate": 9.327817205733875e-05, "loss": 0.0527, "step": 16690 }, { "grad_norm": 0.5018848776817322, "learning_rate": 9.326781765371142e-05, "loss": 0.0474, "step": 16700 }, { "grad_norm": 0.6263241171836853, "learning_rate": 9.325745585679741e-05, "loss": 0.0496, "step": 16710 }, { "grad_norm": 0.4510333240032196, "learning_rate": 9.32470866683673e-05, "loss": 0.0633, "step": 16720 }, { "grad_norm": 0.5669657588005066, "learning_rate": 9.323671009019288e-05, "loss": 0.0569, "step": 16730 }, { "grad_norm": 0.34390750527381897, "learning_rate": 9.322632612404725e-05, "loss": 0.0429, "step": 16740 }, { "grad_norm": 0.3042227625846863, "learning_rate": 9.321593477170471e-05, "loss": 0.0551, "step": 16750 }, { "grad_norm": 0.35519474744796753, "learning_rate": 9.320553603494088e-05, "loss": 0.0372, "step": 16760 }, { "grad_norm": 0.5425854921340942, "learning_rate": 9.319512991553261e-05, "loss": 0.0423, "step": 16770 }, { "grad_norm": 0.5034248232841492, "learning_rate": 9.318471641525803e-05, "loss": 0.0445, "step": 16780 }, { "grad_norm": 0.35036683082580566, "learning_rate": 9.317429553589652e-05, "loss": 0.039, "step": 16790 }, { "grad_norm": 0.47559377551078796, "learning_rate": 9.316386727922873e-05, "loss": 0.0491, "step": 16800 }, { "grad_norm": 0.5327713489532471, "learning_rate": 9.315343164703656e-05, "loss": 0.0528, "step": 16810 }, { "grad_norm": 0.4371911883354187, "learning_rate": 9.314298864110316e-05, "loss": 0.0422, "step": 16820 }, { "grad_norm": 0.4848827123641968, "learning_rate": 9.313253826321295e-05, "loss": 0.0492, "step": 16830 }, { "grad_norm": 0.39525601267814636, "learning_rate": 9.312208051515165e-05, "loss": 0.0424, "step": 16840 }, { "grad_norm": 0.4056767225265503, "learning_rate": 9.311161539870618e-05, "loss": 0.0456, "step": 16850 }, { "grad_norm": 0.46053358912467957, "learning_rate": 9.310114291566474e-05, "loss": 0.0505, "step": 16860 }, { "grad_norm": 0.37739455699920654, "learning_rate": 9.309066306781679e-05, "loss": 0.042, "step": 16870 }, { "grad_norm": 0.3760488033294678, "learning_rate": 9.308017585695306e-05, "loss": 0.0442, "step": 16880 }, { "grad_norm": 0.3582945466041565, "learning_rate": 9.306968128486552e-05, "loss": 0.0434, "step": 16890 }, { "grad_norm": 0.3815053701400757, "learning_rate": 9.30591793533474e-05, "loss": 0.0359, "step": 16900 }, { "grad_norm": 0.4708728790283203, "learning_rate": 9.304867006419321e-05, "loss": 0.0449, "step": 16910 }, { "grad_norm": 0.6036666631698608, "learning_rate": 9.303815341919868e-05, "loss": 0.046, "step": 16920 }, { "grad_norm": 0.506068766117096, "learning_rate": 9.302762942016084e-05, "loss": 0.056, "step": 16930 }, { "grad_norm": 0.37355631589889526, "learning_rate": 9.301709806887792e-05, "loss": 0.0451, "step": 16940 }, { "grad_norm": 0.49276798963546753, "learning_rate": 9.300655936714948e-05, "loss": 0.0729, "step": 16950 }, { "grad_norm": 0.49158814549446106, "learning_rate": 9.299601331677627e-05, "loss": 0.0503, "step": 16960 }, { "grad_norm": 0.3862527310848236, "learning_rate": 9.298545991956033e-05, "loss": 0.0547, "step": 16970 }, { "grad_norm": 0.36509954929351807, "learning_rate": 9.297489917730493e-05, "loss": 0.0503, "step": 16980 }, { "grad_norm": 0.459330677986145, "learning_rate": 9.296433109181464e-05, "loss": 0.0526, "step": 16990 }, { "grad_norm": 0.5387334823608398, "learning_rate": 9.295375566489523e-05, "loss": 0.0438, "step": 17000 }, { "grad_norm": 0.4099545180797577, "learning_rate": 9.294317289835379e-05, "loss": 0.0496, "step": 17010 }, { "grad_norm": 0.46257349848747253, "learning_rate": 9.293258279399859e-05, "loss": 0.0543, "step": 17020 }, { "grad_norm": 0.28469887375831604, "learning_rate": 9.292198535363919e-05, "loss": 0.0379, "step": 17030 }, { "grad_norm": 0.4613698720932007, "learning_rate": 9.291138057908641e-05, "loss": 0.0386, "step": 17040 }, { "grad_norm": 0.42319053411483765, "learning_rate": 9.290076847215234e-05, "loss": 0.0445, "step": 17050 }, { "grad_norm": 0.6631290316581726, "learning_rate": 9.289014903465025e-05, "loss": 0.0636, "step": 17060 }, { "grad_norm": 0.35391664505004883, "learning_rate": 9.287952226839475e-05, "loss": 0.0401, "step": 17070 }, { "grad_norm": 0.4701930284500122, "learning_rate": 9.286888817520164e-05, "loss": 0.0408, "step": 17080 }, { "grad_norm": 0.44349101185798645, "learning_rate": 9.285824675688803e-05, "loss": 0.0429, "step": 17090 }, { "grad_norm": 0.3665250539779663, "learning_rate": 9.28475980152722e-05, "loss": 0.0458, "step": 17100 }, { "grad_norm": 0.47186723351478577, "learning_rate": 9.283694195217379e-05, "loss": 0.0539, "step": 17110 }, { "grad_norm": 0.472590833902359, "learning_rate": 9.282627856941356e-05, "loss": 0.0488, "step": 17120 }, { "grad_norm": 0.4705214202404022, "learning_rate": 9.281560786881363e-05, "loss": 0.0492, "step": 17130 }, { "grad_norm": 0.40266379714012146, "learning_rate": 9.280492985219733e-05, "loss": 0.0377, "step": 17140 }, { "grad_norm": 0.6422222256660461, "learning_rate": 9.279424452138924e-05, "loss": 0.0537, "step": 17150 }, { "grad_norm": 0.49038657546043396, "learning_rate": 9.278355187821517e-05, "loss": 0.0463, "step": 17160 }, { "grad_norm": 0.3897664248943329, "learning_rate": 9.277285192450224e-05, "loss": 0.051, "step": 17170 }, { "grad_norm": 0.3768746554851532, "learning_rate": 9.276214466207875e-05, "loss": 0.0494, "step": 17180 }, { "grad_norm": 0.5339741110801697, "learning_rate": 9.275143009277427e-05, "loss": 0.0534, "step": 17190 }, { "grad_norm": 0.45128941535949707, "learning_rate": 9.274070821841964e-05, "loss": 0.0446, "step": 17200 }, { "grad_norm": 0.36912408471107483, "learning_rate": 9.272997904084696e-05, "loss": 0.048, "step": 17210 }, { "grad_norm": 0.4406306743621826, "learning_rate": 9.271924256188951e-05, "loss": 0.0601, "step": 17220 }, { "grad_norm": 0.4546271860599518, "learning_rate": 9.270849878338189e-05, "loss": 0.0501, "step": 17230 }, { "grad_norm": 0.5635523200035095, "learning_rate": 9.269774770715991e-05, "loss": 0.0485, "step": 17240 }, { "grad_norm": 0.46319109201431274, "learning_rate": 9.268698933506061e-05, "loss": 0.0573, "step": 17250 }, { "grad_norm": 0.42842063307762146, "learning_rate": 9.267622366892235e-05, "loss": 0.0469, "step": 17260 }, { "grad_norm": 0.2679559290409088, "learning_rate": 9.266545071058465e-05, "loss": 0.0421, "step": 17270 }, { "grad_norm": 0.2860298156738281, "learning_rate": 9.265467046188833e-05, "loss": 0.0441, "step": 17280 }, { "grad_norm": 0.3341018259525299, "learning_rate": 9.264388292467543e-05, "loss": 0.0498, "step": 17290 }, { "grad_norm": 0.44328436255455017, "learning_rate": 9.263308810078926e-05, "loss": 0.0455, "step": 17300 }, { "grad_norm": 0.47867193818092346, "learning_rate": 9.262228599207434e-05, "loss": 0.0407, "step": 17310 }, { "grad_norm": 0.3917229175567627, "learning_rate": 9.261147660037647e-05, "loss": 0.0476, "step": 17320 }, { "grad_norm": 0.38001030683517456, "learning_rate": 9.26006599275427e-05, "loss": 0.0488, "step": 17330 }, { "grad_norm": 0.2772998809814453, "learning_rate": 9.258983597542124e-05, "loss": 0.0408, "step": 17340 }, { "grad_norm": 0.7000154852867126, "learning_rate": 9.257900474586167e-05, "loss": 0.0405, "step": 17350 }, { "grad_norm": 0.4905734360218048, "learning_rate": 9.256816624071471e-05, "loss": 0.0543, "step": 17360 }, { "grad_norm": 0.4610691964626312, "learning_rate": 9.25573204618324e-05, "loss": 0.0506, "step": 17370 }, { "grad_norm": 0.44650334119796753, "learning_rate": 9.254646741106796e-05, "loss": 0.0529, "step": 17380 }, { "grad_norm": 0.3474525809288025, "learning_rate": 9.253560709027589e-05, "loss": 0.0476, "step": 17390 }, { "grad_norm": 0.5406584143638611, "learning_rate": 9.252473950131192e-05, "loss": 0.0541, "step": 17400 }, { "grad_norm": 0.45152685046195984, "learning_rate": 9.251386464603302e-05, "loss": 0.0415, "step": 17410 }, { "grad_norm": 0.40371400117874146, "learning_rate": 9.250298252629741e-05, "loss": 0.0495, "step": 17420 }, { "grad_norm": 0.4054633378982544, "learning_rate": 9.249209314396454e-05, "loss": 0.0427, "step": 17430 }, { "grad_norm": 0.47280940413475037, "learning_rate": 9.248119650089513e-05, "loss": 0.0511, "step": 17440 }, { "grad_norm": 0.4179045259952545, "learning_rate": 9.247029259895108e-05, "loss": 0.0466, "step": 17450 }, { "grad_norm": 0.37917208671569824, "learning_rate": 9.24593814399956e-05, "loss": 0.0468, "step": 17460 }, { "grad_norm": 0.33377552032470703, "learning_rate": 9.244846302589309e-05, "loss": 0.0431, "step": 17470 }, { "grad_norm": 0.3949427902698517, "learning_rate": 9.243753735850923e-05, "loss": 0.0495, "step": 17480 }, { "grad_norm": 0.3617684543132782, "learning_rate": 9.24266044397109e-05, "loss": 0.0419, "step": 17490 }, { "grad_norm": 0.47181564569473267, "learning_rate": 9.241566427136624e-05, "loss": 0.0397, "step": 17500 }, { "grad_norm": 0.4499017298221588, "learning_rate": 9.240471685534463e-05, "loss": 0.0511, "step": 17510 }, { "grad_norm": 0.5026893019676208, "learning_rate": 9.239376219351667e-05, "loss": 0.0513, "step": 17520 }, { "grad_norm": 0.38806048035621643, "learning_rate": 9.238280028775425e-05, "loss": 0.0487, "step": 17530 }, { "grad_norm": 0.44798797369003296, "learning_rate": 9.237183113993041e-05, "loss": 0.0515, "step": 17540 }, { "grad_norm": 0.3862147927284241, "learning_rate": 9.236085475191952e-05, "loss": 0.0434, "step": 17550 }, { "grad_norm": 0.4148387908935547, "learning_rate": 9.234987112559709e-05, "loss": 0.0589, "step": 17560 }, { "grad_norm": 0.5168870091438293, "learning_rate": 9.233888026283999e-05, "loss": 0.0516, "step": 17570 }, { "grad_norm": 0.4194619655609131, "learning_rate": 9.232788216552619e-05, "loss": 0.053, "step": 17580 }, { "grad_norm": 0.4486626386642456, "learning_rate": 9.231687683553502e-05, "loss": 0.0595, "step": 17590 }, { "grad_norm": 0.36330607533454895, "learning_rate": 9.230586427474698e-05, "loss": 0.0571, "step": 17600 }, { "grad_norm": 0.4590070843696594, "learning_rate": 9.229484448504379e-05, "loss": 0.0578, "step": 17610 }, { "grad_norm": 0.36059507727622986, "learning_rate": 9.228381746830843e-05, "loss": 0.0459, "step": 17620 }, { "grad_norm": 0.4515981674194336, "learning_rate": 9.227278322642514e-05, "loss": 0.0395, "step": 17630 }, { "grad_norm": 0.42196959257125854, "learning_rate": 9.226174176127937e-05, "loss": 0.0412, "step": 17640 }, { "grad_norm": 0.3268335461616516, "learning_rate": 9.22506930747578e-05, "loss": 0.0441, "step": 17650 }, { "grad_norm": 0.42404723167419434, "learning_rate": 9.223963716874831e-05, "loss": 0.0532, "step": 17660 }, { "grad_norm": 0.35288652777671814, "learning_rate": 9.222857404514012e-05, "loss": 0.0538, "step": 17670 }, { "grad_norm": 0.49639853835105896, "learning_rate": 9.221750370582355e-05, "loss": 0.0493, "step": 17680 }, { "grad_norm": 0.42676854133605957, "learning_rate": 9.220642615269028e-05, "loss": 0.0491, "step": 17690 }, { "grad_norm": 0.3317035138607025, "learning_rate": 9.219534138763311e-05, "loss": 0.0538, "step": 17700 }, { "grad_norm": 0.6410305500030518, "learning_rate": 9.218424941254613e-05, "loss": 0.0557, "step": 17710 }, { "grad_norm": 0.44963517785072327, "learning_rate": 9.217315022932468e-05, "loss": 0.0512, "step": 17720 }, { "grad_norm": 0.7361551523208618, "learning_rate": 9.216204383986528e-05, "loss": 0.0492, "step": 17730 }, { "grad_norm": 0.41928985714912415, "learning_rate": 9.215093024606574e-05, "loss": 0.051, "step": 17740 }, { "grad_norm": 0.5395050644874573, "learning_rate": 9.213980944982506e-05, "loss": 0.0669, "step": 17750 }, { "grad_norm": 0.5412027835845947, "learning_rate": 9.212868145304346e-05, "loss": 0.0498, "step": 17760 }, { "grad_norm": 0.36505594849586487, "learning_rate": 9.211754625762241e-05, "loss": 0.0449, "step": 17770 }, { "grad_norm": 0.4179619550704956, "learning_rate": 9.210640386546463e-05, "loss": 0.0562, "step": 17780 }, { "grad_norm": 0.3868648409843445, "learning_rate": 9.209525427847405e-05, "loss": 0.0368, "step": 17790 }, { "grad_norm": 0.3306910991668701, "learning_rate": 9.208409749855583e-05, "loss": 0.0365, "step": 17800 }, { "grad_norm": 0.44604751467704773, "learning_rate": 9.207293352761633e-05, "loss": 0.0476, "step": 17810 }, { "grad_norm": 0.4732693135738373, "learning_rate": 9.206176236756319e-05, "loss": 0.0416, "step": 17820 }, { "grad_norm": 0.37133416533470154, "learning_rate": 9.205058402030525e-05, "loss": 0.04, "step": 17830 }, { "grad_norm": 0.34696608781814575, "learning_rate": 9.203939848775259e-05, "loss": 0.0406, "step": 17840 }, { "grad_norm": 0.4050498306751251, "learning_rate": 9.202820577181652e-05, "loss": 0.0375, "step": 17850 }, { "grad_norm": 0.4067344069480896, "learning_rate": 9.201700587440953e-05, "loss": 0.0395, "step": 17860 }, { "grad_norm": 0.41745153069496155, "learning_rate": 9.200579879744544e-05, "loss": 0.0405, "step": 17870 }, { "grad_norm": 0.36967164278030396, "learning_rate": 9.199458454283918e-05, "loss": 0.0366, "step": 17880 }, { "grad_norm": 0.6266419291496277, "learning_rate": 9.198336311250697e-05, "loss": 0.0433, "step": 17890 }, { "grad_norm": 0.5724923014640808, "learning_rate": 9.197213450836626e-05, "loss": 0.0415, "step": 17900 }, { "grad_norm": 0.4881480634212494, "learning_rate": 9.19608987323357e-05, "loss": 0.0587, "step": 17910 }, { "grad_norm": 0.3583170473575592, "learning_rate": 9.194965578633517e-05, "loss": 0.0428, "step": 17920 }, { "grad_norm": 0.35071316361427307, "learning_rate": 9.193840567228582e-05, "loss": 0.0465, "step": 17930 }, { "grad_norm": 0.44520917534828186, "learning_rate": 9.192714839210994e-05, "loss": 0.037, "step": 17940 }, { "grad_norm": 0.44570401310920715, "learning_rate": 9.19158839477311e-05, "loss": 0.0548, "step": 17950 }, { "grad_norm": 0.4197930097579956, "learning_rate": 9.190461234107411e-05, "loss": 0.0487, "step": 17960 }, { "grad_norm": 0.42074257135391235, "learning_rate": 9.189333357406496e-05, "loss": 0.0367, "step": 17970 }, { "grad_norm": 0.434550017118454, "learning_rate": 9.188204764863089e-05, "loss": 0.0554, "step": 17980 }, { "grad_norm": 0.4996167719364166, "learning_rate": 9.187075456670033e-05, "loss": 0.0447, "step": 17990 }, { "grad_norm": 0.3965904712677002, "learning_rate": 9.1859454330203e-05, "loss": 0.045, "step": 18000 }, { "grad_norm": 0.3403480648994446, "learning_rate": 9.18481469410698e-05, "loss": 0.0539, "step": 18010 }, { "grad_norm": 0.3514561057090759, "learning_rate": 9.183683240123281e-05, "loss": 0.0489, "step": 18020 }, { "grad_norm": 0.3916683495044708, "learning_rate": 9.182551071262541e-05, "loss": 0.0492, "step": 18030 }, { "grad_norm": 0.35395514965057373, "learning_rate": 9.181418187718218e-05, "loss": 0.0501, "step": 18040 }, { "grad_norm": 0.4219619631767273, "learning_rate": 9.180284589683888e-05, "loss": 0.0484, "step": 18050 }, { "grad_norm": 0.6233333349227905, "learning_rate": 9.17915027735325e-05, "loss": 0.044, "step": 18060 }, { "grad_norm": 0.5004484057426453, "learning_rate": 9.178015250920133e-05, "loss": 0.0403, "step": 18070 }, { "grad_norm": 0.30236679315567017, "learning_rate": 9.176879510578477e-05, "loss": 0.0437, "step": 18080 }, { "grad_norm": 0.36591583490371704, "learning_rate": 9.17574305652235e-05, "loss": 0.0437, "step": 18090 }, { "grad_norm": 0.43933072686195374, "learning_rate": 9.174605888945942e-05, "loss": 0.0461, "step": 18100 }, { "grad_norm": 0.47427842020988464, "learning_rate": 9.173468008043564e-05, "loss": 0.0477, "step": 18110 }, { "grad_norm": 0.4422697424888611, "learning_rate": 9.172329414009648e-05, "loss": 0.0456, "step": 18120 }, { "grad_norm": 0.46529528498649597, "learning_rate": 9.171190107038747e-05, "loss": 0.0461, "step": 18130 }, { "grad_norm": 0.3292218744754791, "learning_rate": 9.170050087325541e-05, "loss": 0.0357, "step": 18140 }, { "grad_norm": 0.6226466298103333, "learning_rate": 9.168909355064824e-05, "loss": 0.0459, "step": 18150 }, { "grad_norm": 0.36209821701049805, "learning_rate": 9.167767910451519e-05, "loss": 0.0441, "step": 18160 }, { "grad_norm": 0.4004814922809601, "learning_rate": 9.166625753680669e-05, "loss": 0.0481, "step": 18170 }, { "grad_norm": 0.39458993077278137, "learning_rate": 9.165482884947431e-05, "loss": 0.036, "step": 18180 }, { "grad_norm": 0.33097222447395325, "learning_rate": 9.164339304447098e-05, "loss": 0.0492, "step": 18190 }, { "grad_norm": 0.32948964834213257, "learning_rate": 9.163195012375072e-05, "loss": 0.0358, "step": 18200 }, { "grad_norm": 0.35830894112586975, "learning_rate": 9.16205000892688e-05, "loss": 0.0449, "step": 18210 }, { "grad_norm": 0.49802830815315247, "learning_rate": 9.160904294298175e-05, "loss": 0.0371, "step": 18220 }, { "grad_norm": 0.3723101317882538, "learning_rate": 9.159757868684727e-05, "loss": 0.0521, "step": 18230 }, { "grad_norm": 0.5354562997817993, "learning_rate": 9.15861073228243e-05, "loss": 0.0471, "step": 18240 }, { "grad_norm": 0.49127790331840515, "learning_rate": 9.157462885287296e-05, "loss": 0.0541, "step": 18250 }, { "grad_norm": 0.39852583408355713, "learning_rate": 9.156314327895461e-05, "loss": 0.0412, "step": 18260 }, { "grad_norm": 0.45748981833457947, "learning_rate": 9.155165060303185e-05, "loss": 0.0488, "step": 18270 }, { "grad_norm": 0.39376896619796753, "learning_rate": 9.154015082706841e-05, "loss": 0.037, "step": 18280 }, { "grad_norm": 0.48368480801582336, "learning_rate": 9.152864395302936e-05, "loss": 0.0569, "step": 18290 }, { "grad_norm": 0.424626886844635, "learning_rate": 9.151712998288085e-05, "loss": 0.0445, "step": 18300 }, { "grad_norm": 0.32650840282440186, "learning_rate": 9.150560891859031e-05, "loss": 0.0503, "step": 18310 }, { "grad_norm": 0.5001503229141235, "learning_rate": 9.14940807621264e-05, "loss": 0.0473, "step": 18320 }, { "grad_norm": 0.37380316853523254, "learning_rate": 9.148254551545894e-05, "loss": 0.0457, "step": 18330 }, { "grad_norm": 0.5240084528923035, "learning_rate": 9.147100318055901e-05, "loss": 0.0458, "step": 18340 }, { "grad_norm": 0.5631433725357056, "learning_rate": 9.145945375939888e-05, "loss": 0.054, "step": 18350 }, { "grad_norm": 0.37836337089538574, "learning_rate": 9.144789725395203e-05, "loss": 0.0467, "step": 18360 }, { "grad_norm": 0.4375858008861542, "learning_rate": 9.14363336661931e-05, "loss": 0.0466, "step": 18370 }, { "grad_norm": 0.519420862197876, "learning_rate": 9.142476299809806e-05, "loss": 0.0517, "step": 18380 }, { "grad_norm": 0.32426100969314575, "learning_rate": 9.1413185251644e-05, "loss": 0.0423, "step": 18390 }, { "grad_norm": 0.47818976640701294, "learning_rate": 9.140160042880923e-05, "loss": 0.0489, "step": 18400 }, { "grad_norm": 0.3371392488479614, "learning_rate": 9.139000853157327e-05, "loss": 0.0538, "step": 18410 }, { "grad_norm": 0.42071834206581116, "learning_rate": 9.137840956191688e-05, "loss": 0.0464, "step": 18420 }, { "grad_norm": 0.32122400403022766, "learning_rate": 9.136680352182199e-05, "loss": 0.0411, "step": 18430 }, { "grad_norm": 0.693686306476593, "learning_rate": 9.135519041327177e-05, "loss": 0.0456, "step": 18440 }, { "grad_norm": 0.5701736807823181, "learning_rate": 9.134357023825058e-05, "loss": 0.0495, "step": 18450 }, { "grad_norm": 0.5690689086914062, "learning_rate": 9.133194299874398e-05, "loss": 0.0437, "step": 18460 }, { "grad_norm": 0.5482969284057617, "learning_rate": 9.132030869673876e-05, "loss": 0.047, "step": 18470 }, { "grad_norm": 0.3678447902202606, "learning_rate": 9.130866733422288e-05, "loss": 0.0472, "step": 18480 }, { "grad_norm": 0.49431657791137695, "learning_rate": 9.129701891318556e-05, "loss": 0.046, "step": 18490 }, { "grad_norm": 0.36410412192344666, "learning_rate": 9.128536343561718e-05, "loss": 0.0566, "step": 18500 }, { "grad_norm": 0.39433351159095764, "learning_rate": 9.127370090350934e-05, "loss": 0.0489, "step": 18510 }, { "grad_norm": 0.5512345433235168, "learning_rate": 9.126203131885487e-05, "loss": 0.0489, "step": 18520 }, { "grad_norm": 0.3685130476951599, "learning_rate": 9.125035468364775e-05, "loss": 0.0473, "step": 18530 }, { "grad_norm": 0.4313828945159912, "learning_rate": 9.123867099988322e-05, "loss": 0.0374, "step": 18540 }, { "grad_norm": 0.3716210424900055, "learning_rate": 9.122698026955769e-05, "loss": 0.0449, "step": 18550 }, { "grad_norm": 0.33366742730140686, "learning_rate": 9.12152824946688e-05, "loss": 0.0602, "step": 18560 }, { "grad_norm": 0.5035731792449951, "learning_rate": 9.120357767721538e-05, "loss": 0.0469, "step": 18570 }, { "grad_norm": 0.5691210031509399, "learning_rate": 9.119186581919745e-05, "loss": 0.0479, "step": 18580 }, { "grad_norm": 0.4657294452190399, "learning_rate": 9.118014692261624e-05, "loss": 0.0463, "step": 18590 }, { "grad_norm": 0.5154022574424744, "learning_rate": 9.116842098947422e-05, "loss": 0.0633, "step": 18600 }, { "grad_norm": 0.33172035217285156, "learning_rate": 9.115668802177499e-05, "loss": 0.1059, "step": 18610 }, { "grad_norm": 0.3645341694355011, "learning_rate": 9.114494802152342e-05, "loss": 0.0542, "step": 18620 }, { "grad_norm": 0.5142310261726379, "learning_rate": 9.113320099072555e-05, "loss": 0.0714, "step": 18630 }, { "grad_norm": 0.4616219103336334, "learning_rate": 9.112144693138864e-05, "loss": 0.0497, "step": 18640 }, { "grad_norm": 0.5424106121063232, "learning_rate": 9.110968584552111e-05, "loss": 0.049, "step": 18650 }, { "grad_norm": 0.4547431170940399, "learning_rate": 9.109791773513264e-05, "loss": 0.0489, "step": 18660 }, { "grad_norm": 0.3708053231239319, "learning_rate": 9.108614260223403e-05, "loss": 0.0486, "step": 18670 }, { "grad_norm": 0.37850937247276306, "learning_rate": 9.107436044883738e-05, "loss": 0.0444, "step": 18680 }, { "grad_norm": 0.35114747285842896, "learning_rate": 9.10625712769559e-05, "loss": 0.0449, "step": 18690 }, { "grad_norm": 0.3576127290725708, "learning_rate": 9.105077508860406e-05, "loss": 0.0446, "step": 18700 }, { "grad_norm": 0.26172545552253723, "learning_rate": 9.103897188579751e-05, "loss": 0.0447, "step": 18710 }, { "grad_norm": 0.2928027808666229, "learning_rate": 9.102716167055308e-05, "loss": 0.0327, "step": 18720 }, { "grad_norm": 0.428871214389801, "learning_rate": 9.10153444448888e-05, "loss": 0.0833, "step": 18730 }, { "grad_norm": 0.44842785596847534, "learning_rate": 9.100352021082393e-05, "loss": 0.0387, "step": 18740 }, { "grad_norm": 0.4595809578895569, "learning_rate": 9.099168897037891e-05, "loss": 0.0463, "step": 18750 }, { "grad_norm": 0.4775937795639038, "learning_rate": 9.097985072557538e-05, "loss": 0.0423, "step": 18760 }, { "grad_norm": 0.3902364671230316, "learning_rate": 9.096800547843615e-05, "loss": 0.0645, "step": 18770 }, { "grad_norm": 0.5328691005706787, "learning_rate": 9.095615323098526e-05, "loss": 0.058, "step": 18780 }, { "grad_norm": 0.4999523162841797, "learning_rate": 9.094429398524795e-05, "loss": 0.0434, "step": 18790 }, { "grad_norm": 0.4646527171134949, "learning_rate": 9.093242774325061e-05, "loss": 0.0502, "step": 18800 }, { "grad_norm": 0.38364896178245544, "learning_rate": 9.092055450702088e-05, "loss": 0.0464, "step": 18810 }, { "grad_norm": 0.4000265300273895, "learning_rate": 9.090867427858756e-05, "loss": 0.0454, "step": 18820 }, { "grad_norm": 0.4138435423374176, "learning_rate": 9.089678705998066e-05, "loss": 0.0475, "step": 18830 }, { "grad_norm": 0.36388272047042847, "learning_rate": 9.088489285323139e-05, "loss": 0.0395, "step": 18840 }, { "grad_norm": 0.6647864580154419, "learning_rate": 9.087299166037212e-05, "loss": 0.0505, "step": 18850 }, { "grad_norm": 0.40818339586257935, "learning_rate": 9.086108348343647e-05, "loss": 0.0501, "step": 18860 }, { "grad_norm": 0.7136252522468567, "learning_rate": 9.08491683244592e-05, "loss": 0.0567, "step": 18870 }, { "grad_norm": 0.3919753432273865, "learning_rate": 9.08372461854763e-05, "loss": 0.0427, "step": 18880 }, { "grad_norm": 0.5261215567588806, "learning_rate": 9.082531706852492e-05, "loss": 0.0489, "step": 18890 }, { "grad_norm": 0.3648856282234192, "learning_rate": 9.081338097564342e-05, "loss": 0.0555, "step": 18900 }, { "grad_norm": 0.29920148849487305, "learning_rate": 9.080143790887137e-05, "loss": 0.04, "step": 18910 }, { "grad_norm": 0.5591827630996704, "learning_rate": 9.07894878702495e-05, "loss": 0.0425, "step": 18920 }, { "grad_norm": 0.48662516474723816, "learning_rate": 9.077753086181974e-05, "loss": 0.0498, "step": 18930 }, { "grad_norm": 0.3461203873157501, "learning_rate": 9.076556688562524e-05, "loss": 0.037, "step": 18940 }, { "grad_norm": 0.35929614305496216, "learning_rate": 9.075359594371029e-05, "loss": 0.0382, "step": 18950 }, { "grad_norm": 0.35810086131095886, "learning_rate": 9.07416180381204e-05, "loss": 0.0459, "step": 18960 }, { "grad_norm": 0.3818668723106384, "learning_rate": 9.072963317090228e-05, "loss": 0.0483, "step": 18970 }, { "grad_norm": 0.42043977975845337, "learning_rate": 9.071764134410382e-05, "loss": 0.0382, "step": 18980 }, { "grad_norm": 0.48334845900535583, "learning_rate": 9.070564255977407e-05, "loss": 0.0396, "step": 18990 }, { "grad_norm": 0.4480196535587311, "learning_rate": 9.06936368199633e-05, "loss": 0.0459, "step": 19000 }, { "grad_norm": 0.5642924904823303, "learning_rate": 9.0681624126723e-05, "loss": 0.0502, "step": 19010 }, { "grad_norm": 0.635949432849884, "learning_rate": 9.066960448210576e-05, "loss": 0.0511, "step": 19020 }, { "grad_norm": 0.34117740392684937, "learning_rate": 9.065757788816543e-05, "loss": 0.043, "step": 19030 }, { "grad_norm": 0.4244130849838257, "learning_rate": 9.064554434695705e-05, "loss": 0.0506, "step": 19040 }, { "grad_norm": 0.34839537739753723, "learning_rate": 9.063350386053677e-05, "loss": 0.0434, "step": 19050 }, { "grad_norm": 0.3763282001018524, "learning_rate": 9.062145643096202e-05, "loss": 0.05, "step": 19060 }, { "grad_norm": 0.3772372603416443, "learning_rate": 9.060940206029136e-05, "loss": 0.0503, "step": 19070 }, { "grad_norm": 0.48522108793258667, "learning_rate": 9.059734075058457e-05, "loss": 0.04, "step": 19080 }, { "grad_norm": 0.4105361998081207, "learning_rate": 9.058527250390257e-05, "loss": 0.0531, "step": 19090 }, { "grad_norm": 0.3763461112976074, "learning_rate": 9.057319732230752e-05, "loss": 0.0513, "step": 19100 }, { "grad_norm": 0.2947256863117218, "learning_rate": 9.056111520786273e-05, "loss": 0.0417, "step": 19110 }, { "grad_norm": 0.48616328835487366, "learning_rate": 9.054902616263268e-05, "loss": 0.0476, "step": 19120 }, { "grad_norm": 0.30506548285484314, "learning_rate": 9.05369301886831e-05, "loss": 0.0562, "step": 19130 }, { "grad_norm": 0.3298284411430359, "learning_rate": 9.052482728808083e-05, "loss": 0.0412, "step": 19140 }, { "grad_norm": 0.32739725708961487, "learning_rate": 9.051271746289391e-05, "loss": 0.0384, "step": 19150 }, { "grad_norm": 0.43930599093437195, "learning_rate": 9.050060071519162e-05, "loss": 0.0504, "step": 19160 }, { "grad_norm": 0.32873937487602234, "learning_rate": 9.048847704704437e-05, "loss": 0.0433, "step": 19170 }, { "grad_norm": 0.49663659930229187, "learning_rate": 9.047634646052376e-05, "loss": 0.0496, "step": 19180 }, { "grad_norm": 0.4383392930030823, "learning_rate": 9.046420895770256e-05, "loss": 0.0563, "step": 19190 }, { "grad_norm": 0.507361888885498, "learning_rate": 9.045206454065473e-05, "loss": 0.0447, "step": 19200 }, { "grad_norm": 0.4453997015953064, "learning_rate": 9.043991321145546e-05, "loss": 0.043, "step": 19210 }, { "grad_norm": 0.9261266589164734, "learning_rate": 9.042775497218105e-05, "loss": 0.0427, "step": 19220 }, { "grad_norm": 0.3002825379371643, "learning_rate": 9.041558982490901e-05, "loss": 0.0484, "step": 19230 }, { "grad_norm": 0.3835168182849884, "learning_rate": 9.040341777171805e-05, "loss": 0.042, "step": 19240 }, { "grad_norm": 0.382592111825943, "learning_rate": 9.039123881468802e-05, "loss": 0.042, "step": 19250 }, { "grad_norm": 0.39278197288513184, "learning_rate": 9.037905295589998e-05, "loss": 0.0436, "step": 19260 }, { "grad_norm": 0.4779919981956482, "learning_rate": 9.036686019743617e-05, "loss": 0.037, "step": 19270 }, { "grad_norm": 0.4412473738193512, "learning_rate": 9.035466054137997e-05, "loss": 0.0408, "step": 19280 }, { "grad_norm": 0.4980818033218384, "learning_rate": 9.0342453989816e-05, "loss": 0.0601, "step": 19290 }, { "grad_norm": 0.44003260135650635, "learning_rate": 9.033024054483e-05, "loss": 0.054, "step": 19300 }, { "grad_norm": 0.49755072593688965, "learning_rate": 9.031802020850894e-05, "loss": 0.0426, "step": 19310 }, { "grad_norm": 0.47568920254707336, "learning_rate": 9.030579298294092e-05, "loss": 0.0618, "step": 19320 }, { "grad_norm": 0.47543632984161377, "learning_rate": 9.029355887021524e-05, "loss": 0.0564, "step": 19330 }, { "grad_norm": 0.5396119952201843, "learning_rate": 9.028131787242238e-05, "loss": 0.0706, "step": 19340 }, { "grad_norm": 0.3351048231124878, "learning_rate": 9.026906999165399e-05, "loss": 0.0582, "step": 19350 }, { "grad_norm": 0.4699519872665405, "learning_rate": 9.025681523000291e-05, "loss": 0.0455, "step": 19360 }, { "grad_norm": 0.5228760242462158, "learning_rate": 9.024455358956315e-05, "loss": 0.0541, "step": 19370 }, { "grad_norm": 0.4073556661605835, "learning_rate": 9.023228507242984e-05, "loss": 0.051, "step": 19380 }, { "grad_norm": 0.49003955721855164, "learning_rate": 9.022000968069937e-05, "loss": 0.0507, "step": 19390 }, { "grad_norm": 0.3538607954978943, "learning_rate": 9.020772741646928e-05, "loss": 0.0413, "step": 19400 }, { "grad_norm": 0.5258539915084839, "learning_rate": 9.019543828183826e-05, "loss": 0.0439, "step": 19410 }, { "grad_norm": 1.116852045059204, "learning_rate": 9.018314227890616e-05, "loss": 0.0456, "step": 19420 }, { "grad_norm": 0.6097331047058105, "learning_rate": 9.017083940977408e-05, "loss": 0.0669, "step": 19430 }, { "grad_norm": 0.503165066242218, "learning_rate": 9.015852967654422e-05, "loss": 0.0487, "step": 19440 }, { "grad_norm": 0.3846209943294525, "learning_rate": 9.014621308131996e-05, "loss": 0.0491, "step": 19450 }, { "grad_norm": 0.40789780020713806, "learning_rate": 9.01338896262059e-05, "loss": 0.0417, "step": 19460 }, { "grad_norm": 0.3583377003669739, "learning_rate": 9.012155931330777e-05, "loss": 0.0531, "step": 19470 }, { "grad_norm": 0.34001171588897705, "learning_rate": 9.010922214473246e-05, "loss": 0.0434, "step": 19480 }, { "grad_norm": 0.34443527460098267, "learning_rate": 9.009687812258808e-05, "loss": 0.0571, "step": 19490 }, { "grad_norm": 0.4295424520969391, "learning_rate": 9.00845272489839e-05, "loss": 0.0528, "step": 19500 }, { "grad_norm": 0.4039481580257416, "learning_rate": 9.007216952603031e-05, "loss": 0.056, "step": 19510 }, { "grad_norm": 0.55397629737854, "learning_rate": 9.005980495583894e-05, "loss": 0.0406, "step": 19520 }, { "grad_norm": 0.3778585195541382, "learning_rate": 9.004743354052252e-05, "loss": 0.0413, "step": 19530 }, { "grad_norm": 0.45316770672798157, "learning_rate": 9.003505528219503e-05, "loss": 0.0427, "step": 19540 }, { "grad_norm": 0.3983694612979889, "learning_rate": 9.002267018297154e-05, "loss": 0.0536, "step": 19550 }, { "grad_norm": 0.5432919263839722, "learning_rate": 9.001027824496834e-05, "loss": 0.0561, "step": 19560 }, { "grad_norm": 0.2900235056877136, "learning_rate": 8.999787947030287e-05, "loss": 0.0427, "step": 19570 }, { "grad_norm": 0.31725138425827026, "learning_rate": 8.998547386109376e-05, "loss": 0.0484, "step": 19580 }, { "grad_norm": 0.4389428496360779, "learning_rate": 8.997306141946073e-05, "loss": 0.0403, "step": 19590 }, { "grad_norm": 0.3854921758174896, "learning_rate": 8.996064214752481e-05, "loss": 0.0491, "step": 19600 }, { "grad_norm": 0.31811681389808655, "learning_rate": 8.994821604740806e-05, "loss": 0.0378, "step": 19610 }, { "grad_norm": 0.4165861904621124, "learning_rate": 8.993578312123377e-05, "loss": 0.0448, "step": 19620 }, { "grad_norm": 0.40162673592567444, "learning_rate": 8.992334337112639e-05, "loss": 0.0458, "step": 19630 }, { "grad_norm": 0.36505210399627686, "learning_rate": 8.991089679921154e-05, "loss": 0.0361, "step": 19640 }, { "grad_norm": 0.4555925726890564, "learning_rate": 8.989844340761599e-05, "loss": 0.0408, "step": 19650 }, { "grad_norm": 0.460748553276062, "learning_rate": 8.988598319846768e-05, "loss": 0.041, "step": 19660 }, { "grad_norm": 0.44555312395095825, "learning_rate": 8.987351617389574e-05, "loss": 0.0458, "step": 19670 }, { "grad_norm": 0.3562496304512024, "learning_rate": 8.98610423360304e-05, "loss": 0.0424, "step": 19680 }, { "grad_norm": 0.28259193897247314, "learning_rate": 8.984856168700317e-05, "loss": 0.0477, "step": 19690 }, { "grad_norm": 0.3451822102069855, "learning_rate": 8.983607422894658e-05, "loss": 0.0483, "step": 19700 }, { "grad_norm": 0.28118690848350525, "learning_rate": 8.982357996399442e-05, "loss": 0.0515, "step": 19710 }, { "grad_norm": 0.4314395785331726, "learning_rate": 8.981107889428164e-05, "loss": 0.043, "step": 19720 }, { "grad_norm": 0.4636106491088867, "learning_rate": 8.979857102194428e-05, "loss": 0.0372, "step": 19730 }, { "grad_norm": 0.4978257715702057, "learning_rate": 8.978605634911968e-05, "loss": 0.0414, "step": 19740 }, { "grad_norm": 0.44759857654571533, "learning_rate": 8.977353487794616e-05, "loss": 0.0398, "step": 19750 }, { "grad_norm": 0.4646267592906952, "learning_rate": 8.976100661056334e-05, "loss": 0.0486, "step": 19760 }, { "grad_norm": 0.4285925626754761, "learning_rate": 8.974847154911197e-05, "loss": 0.0351, "step": 19770 }, { "grad_norm": 0.42476218938827515, "learning_rate": 8.973592969573393e-05, "loss": 0.0481, "step": 19780 }, { "grad_norm": 0.41438186168670654, "learning_rate": 8.972338105257228e-05, "loss": 0.0495, "step": 19790 }, { "grad_norm": 0.36223769187927246, "learning_rate": 8.971082562177125e-05, "loss": 0.0344, "step": 19800 }, { "grad_norm": 0.4340059459209442, "learning_rate": 8.96982634054762e-05, "loss": 0.0414, "step": 19810 }, { "grad_norm": 0.41979625821113586, "learning_rate": 8.96856944058337e-05, "loss": 0.0422, "step": 19820 }, { "grad_norm": 0.3582156002521515, "learning_rate": 8.967311862499144e-05, "loss": 0.0424, "step": 19830 }, { "grad_norm": 0.3659866154193878, "learning_rate": 8.966053606509825e-05, "loss": 0.0353, "step": 19840 }, { "grad_norm": 0.5421175956726074, "learning_rate": 8.964794672830417e-05, "loss": 0.0574, "step": 19850 }, { "grad_norm": 0.36318498849868774, "learning_rate": 8.963535061676038e-05, "loss": 0.0491, "step": 19860 }, { "grad_norm": 0.4301612973213196, "learning_rate": 8.962274773261918e-05, "loss": 0.0421, "step": 19870 }, { "grad_norm": 0.568656325340271, "learning_rate": 8.961013807803409e-05, "loss": 0.049, "step": 19880 }, { "grad_norm": 0.557157039642334, "learning_rate": 8.959752165515973e-05, "loss": 0.0441, "step": 19890 }, { "grad_norm": 0.7076267600059509, "learning_rate": 8.958489846615193e-05, "loss": 0.0701, "step": 19900 }, { "grad_norm": 0.4608009457588196, "learning_rate": 8.957226851316762e-05, "loss": 0.0525, "step": 19910 }, { "grad_norm": 0.3467980921268463, "learning_rate": 8.955963179836493e-05, "loss": 0.0416, "step": 19920 }, { "grad_norm": 0.4394841194152832, "learning_rate": 8.954698832390312e-05, "loss": 0.0466, "step": 19930 }, { "grad_norm": 0.3771930932998657, "learning_rate": 8.953433809194263e-05, "loss": 0.0432, "step": 19940 }, { "grad_norm": 0.4022397994995117, "learning_rate": 8.9521681104645e-05, "loss": 0.0408, "step": 19950 }, { "grad_norm": 0.4619176685810089, "learning_rate": 8.9509017364173e-05, "loss": 0.0452, "step": 19960 }, { "grad_norm": 0.41055190563201904, "learning_rate": 8.949634687269052e-05, "loss": 0.0407, "step": 19970 }, { "grad_norm": 0.4100129008293152, "learning_rate": 8.948366963236259e-05, "loss": 0.0405, "step": 19980 }, { "grad_norm": 0.49012860655784607, "learning_rate": 8.947098564535538e-05, "loss": 0.0467, "step": 19990 }, { "grad_norm": 0.5261788368225098, "learning_rate": 8.945829491383627e-05, "loss": 0.0442, "step": 20000 }, { "grad_norm": 0.34056660532951355, "learning_rate": 8.944559743997374e-05, "loss": 0.0409, "step": 20010 }, { "grad_norm": 0.6259142160415649, "learning_rate": 8.943289322593746e-05, "loss": 0.0472, "step": 20020 }, { "grad_norm": 0.6227961182594299, "learning_rate": 8.942018227389821e-05, "loss": 0.0412, "step": 20030 }, { "grad_norm": 0.41418296098709106, "learning_rate": 8.940746458602795e-05, "loss": 0.0597, "step": 20040 }, { "grad_norm": 0.4840579926967621, "learning_rate": 8.939474016449979e-05, "loss": 0.0456, "step": 20050 }, { "grad_norm": 0.37089499831199646, "learning_rate": 8.938200901148799e-05, "loss": 0.0376, "step": 20060 }, { "grad_norm": 0.493495911359787, "learning_rate": 8.936927112916795e-05, "loss": 0.0454, "step": 20070 }, { "grad_norm": 0.42521724104881287, "learning_rate": 8.935652651971622e-05, "loss": 0.0421, "step": 20080 }, { "grad_norm": 0.3533003628253937, "learning_rate": 8.934377518531052e-05, "loss": 0.0461, "step": 20090 }, { "grad_norm": 0.5164453387260437, "learning_rate": 8.933101712812967e-05, "loss": 0.0445, "step": 20100 }, { "grad_norm": 0.29500865936279297, "learning_rate": 8.931825235035374e-05, "loss": 0.0393, "step": 20110 }, { "grad_norm": 0.5055369138717651, "learning_rate": 8.930548085416382e-05, "loss": 0.0424, "step": 20120 }, { "grad_norm": 0.38209953904151917, "learning_rate": 8.92927026417422e-05, "loss": 0.0447, "step": 20130 }, { "grad_norm": 0.6393257975578308, "learning_rate": 8.92799177152724e-05, "loss": 0.0432, "step": 20140 }, { "grad_norm": 0.32734814286231995, "learning_rate": 8.926712607693895e-05, "loss": 0.0404, "step": 20150 }, { "grad_norm": 0.32850947976112366, "learning_rate": 8.925432772892762e-05, "loss": 0.0478, "step": 20160 }, { "grad_norm": 0.4696460962295532, "learning_rate": 8.924152267342529e-05, "loss": 0.0486, "step": 20170 }, { "grad_norm": 0.4675148129463196, "learning_rate": 8.922871091261998e-05, "loss": 0.0584, "step": 20180 }, { "grad_norm": 0.34978073835372925, "learning_rate": 8.92158924487009e-05, "loss": 0.0392, "step": 20190 }, { "grad_norm": 0.344547301530838, "learning_rate": 8.920306728385834e-05, "loss": 0.042, "step": 20200 }, { "grad_norm": 0.35800445079803467, "learning_rate": 8.919023542028379e-05, "loss": 0.044, "step": 20210 }, { "grad_norm": 0.32415926456451416, "learning_rate": 8.917739686016988e-05, "loss": 0.0421, "step": 20220 }, { "grad_norm": 0.4666482210159302, "learning_rate": 8.916455160571033e-05, "loss": 0.0544, "step": 20230 }, { "grad_norm": 0.3847824037075043, "learning_rate": 8.915169965910008e-05, "loss": 0.0372, "step": 20240 }, { "grad_norm": 0.3575577139854431, "learning_rate": 8.913884102253514e-05, "loss": 0.0429, "step": 20250 }, { "grad_norm": 0.4929393231868744, "learning_rate": 8.912597569821273e-05, "loss": 0.034, "step": 20260 }, { "grad_norm": 0.43627870082855225, "learning_rate": 8.911310368833118e-05, "loss": 0.0434, "step": 20270 }, { "grad_norm": 0.390506774187088, "learning_rate": 8.910022499508994e-05, "loss": 0.0362, "step": 20280 }, { "grad_norm": 0.3874220848083496, "learning_rate": 8.908733962068965e-05, "loss": 0.0489, "step": 20290 }, { "grad_norm": 0.3840945363044739, "learning_rate": 8.907444756733207e-05, "loss": 0.0401, "step": 20300 }, { "grad_norm": 0.37096846103668213, "learning_rate": 8.906154883722006e-05, "loss": 0.0498, "step": 20310 }, { "grad_norm": 0.3993034362792969, "learning_rate": 8.904864343255773e-05, "loss": 0.0421, "step": 20320 }, { "grad_norm": 0.4773644804954529, "learning_rate": 8.90357313555502e-05, "loss": 0.0448, "step": 20330 }, { "grad_norm": 0.36526206135749817, "learning_rate": 8.90228126084038e-05, "loss": 0.0448, "step": 20340 }, { "grad_norm": 0.40554988384246826, "learning_rate": 8.900988719332601e-05, "loss": 0.0521, "step": 20350 }, { "grad_norm": 0.3722306489944458, "learning_rate": 8.899695511252542e-05, "loss": 0.0437, "step": 20360 }, { "grad_norm": 0.3651525676250458, "learning_rate": 8.898401636821176e-05, "loss": 0.0505, "step": 20370 }, { "grad_norm": 0.4575544595718384, "learning_rate": 8.897107096259593e-05, "loss": 0.0538, "step": 20380 }, { "grad_norm": 0.37889692187309265, "learning_rate": 8.895811889788994e-05, "loss": 0.0443, "step": 20390 }, { "grad_norm": 0.3387317955493927, "learning_rate": 8.894516017630692e-05, "loss": 0.0377, "step": 20400 }, { "grad_norm": 0.3838305175304413, "learning_rate": 8.893219480006118e-05, "loss": 0.0403, "step": 20410 }, { "grad_norm": 0.5641466379165649, "learning_rate": 8.891922277136817e-05, "loss": 0.0408, "step": 20420 }, { "grad_norm": 0.34733667969703674, "learning_rate": 8.890624409244441e-05, "loss": 0.0341, "step": 20430 }, { "grad_norm": 0.473761647939682, "learning_rate": 8.889325876550763e-05, "loss": 0.0447, "step": 20440 }, { "grad_norm": 0.37968748807907104, "learning_rate": 8.888026679277666e-05, "loss": 0.0472, "step": 20450 }, { "grad_norm": 0.3087767958641052, "learning_rate": 8.886726817647147e-05, "loss": 0.0642, "step": 20460 }, { "grad_norm": 0.3626560568809509, "learning_rate": 8.885426291881319e-05, "loss": 0.0539, "step": 20470 }, { "grad_norm": 0.34914323687553406, "learning_rate": 8.884125102202401e-05, "loss": 0.0421, "step": 20480 }, { "grad_norm": 0.4341055452823639, "learning_rate": 8.882823248832736e-05, "loss": 0.041, "step": 20490 }, { "grad_norm": 0.32842472195625305, "learning_rate": 8.881520731994772e-05, "loss": 0.0409, "step": 20500 }, { "grad_norm": 0.363120973110199, "learning_rate": 8.880217551911077e-05, "loss": 0.042, "step": 20510 }, { "grad_norm": 0.40225934982299805, "learning_rate": 8.878913708804323e-05, "loss": 0.0397, "step": 20520 }, { "grad_norm": 0.37127479910850525, "learning_rate": 8.877609202897308e-05, "loss": 0.0433, "step": 20530 }, { "grad_norm": 0.41352513432502747, "learning_rate": 8.876304034412933e-05, "loss": 0.0383, "step": 20540 }, { "grad_norm": 0.3991379737854004, "learning_rate": 8.874998203574214e-05, "loss": 0.0437, "step": 20550 }, { "grad_norm": 0.463224858045578, "learning_rate": 8.873691710604284e-05, "loss": 0.0464, "step": 20560 }, { "grad_norm": 0.46907588839530945, "learning_rate": 8.872384555726387e-05, "loss": 0.0383, "step": 20570 }, { "grad_norm": 0.3631048798561096, "learning_rate": 8.871076739163878e-05, "loss": 0.0432, "step": 20580 }, { "grad_norm": 0.40032246708869934, "learning_rate": 8.86976826114023e-05, "loss": 0.0491, "step": 20590 }, { "grad_norm": 0.3725372850894928, "learning_rate": 8.868459121879023e-05, "loss": 0.0448, "step": 20600 }, { "grad_norm": 0.435505747795105, "learning_rate": 8.867149321603956e-05, "loss": 0.0492, "step": 20610 }, { "grad_norm": 0.392659991979599, "learning_rate": 8.865838860538835e-05, "loss": 0.0395, "step": 20620 }, { "grad_norm": 0.36557379364967346, "learning_rate": 8.864527738907585e-05, "loss": 0.0437, "step": 20630 }, { "grad_norm": 0.5805622935295105, "learning_rate": 8.863215956934239e-05, "loss": 0.0478, "step": 20640 }, { "grad_norm": 0.35094425082206726, "learning_rate": 8.861903514842947e-05, "loss": 0.0319, "step": 20650 }, { "grad_norm": 0.33844679594039917, "learning_rate": 8.860590412857966e-05, "loss": 0.0361, "step": 20660 }, { "grad_norm": 0.29113200306892395, "learning_rate": 8.85927665120367e-05, "loss": 0.035, "step": 20670 }, { "grad_norm": 0.3472925126552582, "learning_rate": 8.857962230104546e-05, "loss": 0.038, "step": 20680 }, { "grad_norm": 0.41030701994895935, "learning_rate": 8.856647149785193e-05, "loss": 0.0416, "step": 20690 }, { "grad_norm": 0.4137555956840515, "learning_rate": 8.855331410470322e-05, "loss": 0.0397, "step": 20700 }, { "grad_norm": 0.40451371669769287, "learning_rate": 8.854015012384756e-05, "loss": 0.0372, "step": 20710 }, { "grad_norm": 0.3563341796398163, "learning_rate": 8.852697955753433e-05, "loss": 0.0411, "step": 20720 }, { "grad_norm": 0.3757877051830292, "learning_rate": 8.851380240801399e-05, "loss": 0.0461, "step": 20730 }, { "grad_norm": 0.32278335094451904, "learning_rate": 8.850061867753818e-05, "loss": 0.0317, "step": 20740 }, { "grad_norm": 0.5435505509376526, "learning_rate": 8.848742836835963e-05, "loss": 0.0477, "step": 20750 }, { "grad_norm": 0.3184448480606079, "learning_rate": 8.847423148273221e-05, "loss": 0.0479, "step": 20760 }, { "grad_norm": 0.4435928761959076, "learning_rate": 8.846102802291092e-05, "loss": 0.0549, "step": 20770 }, { "grad_norm": 0.39589375257492065, "learning_rate": 8.844781799115183e-05, "loss": 0.0485, "step": 20780 }, { "grad_norm": 0.655390202999115, "learning_rate": 8.84346013897122e-05, "loss": 0.0405, "step": 20790 }, { "grad_norm": 0.38348016142845154, "learning_rate": 8.842137822085038e-05, "loss": 0.0505, "step": 20800 }, { "grad_norm": 0.4404093623161316, "learning_rate": 8.840814848682585e-05, "loss": 0.0418, "step": 20810 }, { "grad_norm": 0.3382313549518585, "learning_rate": 8.83949121898992e-05, "loss": 0.0504, "step": 20820 }, { "grad_norm": 0.4104587137699127, "learning_rate": 8.838166933233217e-05, "loss": 0.0347, "step": 20830 }, { "grad_norm": 0.3382103145122528, "learning_rate": 8.83684199163876e-05, "loss": 0.0394, "step": 20840 }, { "grad_norm": 0.34913259744644165, "learning_rate": 8.835516394432943e-05, "loss": 0.0398, "step": 20850 }, { "grad_norm": 0.2628422677516937, "learning_rate": 8.834190141842276e-05, "loss": 0.0471, "step": 20860 }, { "grad_norm": 0.5147480368614197, "learning_rate": 8.83286323409338e-05, "loss": 0.0403, "step": 20870 }, { "grad_norm": 0.4742644727230072, "learning_rate": 8.831535671412986e-05, "loss": 0.0423, "step": 20880 }, { "grad_norm": 0.3087865114212036, "learning_rate": 8.830207454027938e-05, "loss": 0.0457, "step": 20890 }, { "grad_norm": 0.48189082741737366, "learning_rate": 8.828878582165192e-05, "loss": 0.0427, "step": 20900 }, { "grad_norm": 0.3942968547344208, "learning_rate": 8.827549056051818e-05, "loss": 0.0361, "step": 20910 }, { "grad_norm": 0.2737170457839966, "learning_rate": 8.826218875914993e-05, "loss": 0.0399, "step": 20920 }, { "grad_norm": 0.3259723484516144, "learning_rate": 8.82488804198201e-05, "loss": 0.0409, "step": 20930 }, { "grad_norm": 0.5073270797729492, "learning_rate": 8.82355655448027e-05, "loss": 0.0406, "step": 20940 }, { "grad_norm": 0.34852296113967896, "learning_rate": 8.822224413637293e-05, "loss": 0.0432, "step": 20950 }, { "grad_norm": 0.430035799741745, "learning_rate": 8.820891619680697e-05, "loss": 0.0414, "step": 20960 }, { "grad_norm": 0.5764379501342773, "learning_rate": 8.819558172838227e-05, "loss": 0.0386, "step": 20970 }, { "grad_norm": 0.4043080508708954, "learning_rate": 8.818224073337731e-05, "loss": 0.0343, "step": 20980 }, { "grad_norm": 0.4764874577522278, "learning_rate": 8.816889321407169e-05, "loss": 0.037, "step": 20990 }, { "grad_norm": 0.35909584164619446, "learning_rate": 8.815553917274615e-05, "loss": 0.0447, "step": 21000 }, { "grad_norm": 0.7614809274673462, "learning_rate": 8.81421786116825e-05, "loss": 0.0556, "step": 21010 }, { "grad_norm": 0.39125871658325195, "learning_rate": 8.812881153316373e-05, "loss": 0.0408, "step": 21020 }, { "grad_norm": 0.4638764560222626, "learning_rate": 8.81154379394739e-05, "loss": 0.0494, "step": 21030 }, { "grad_norm": 0.3617638647556305, "learning_rate": 8.810205783289818e-05, "loss": 0.0388, "step": 21040 }, { "grad_norm": 0.39255574345588684, "learning_rate": 8.808867121572286e-05, "loss": 0.0445, "step": 21050 }, { "grad_norm": 0.3589771091938019, "learning_rate": 8.807527809023537e-05, "loss": 0.045, "step": 21060 }, { "grad_norm": 0.37758609652519226, "learning_rate": 8.80618784587242e-05, "loss": 0.0464, "step": 21070 }, { "grad_norm": 0.5484545826911926, "learning_rate": 8.804847232347902e-05, "loss": 0.0431, "step": 21080 }, { "grad_norm": 0.3175560534000397, "learning_rate": 8.803505968679054e-05, "loss": 0.0465, "step": 21090 }, { "grad_norm": 0.41936415433883667, "learning_rate": 8.802164055095061e-05, "loss": 0.0501, "step": 21100 }, { "grad_norm": 0.39680540561676025, "learning_rate": 8.80082149182522e-05, "loss": 0.0381, "step": 21110 }, { "grad_norm": 0.4469011127948761, "learning_rate": 8.79947827909894e-05, "loss": 0.0385, "step": 21120 }, { "grad_norm": 0.3001357913017273, "learning_rate": 8.798134417145738e-05, "loss": 0.0419, "step": 21130 }, { "grad_norm": 0.4556581377983093, "learning_rate": 8.796789906195243e-05, "loss": 0.0462, "step": 21140 }, { "grad_norm": 0.3852024972438812, "learning_rate": 8.795444746477195e-05, "loss": 0.0374, "step": 21150 }, { "grad_norm": 0.3980770707130432, "learning_rate": 8.794098938221446e-05, "loss": 0.0453, "step": 21160 }, { "grad_norm": 0.3353688418865204, "learning_rate": 8.792752481657957e-05, "loss": 0.0446, "step": 21170 }, { "grad_norm": 0.39930129051208496, "learning_rate": 8.791405377016802e-05, "loss": 0.045, "step": 21180 }, { "grad_norm": 0.3793131113052368, "learning_rate": 8.790057624528163e-05, "loss": 0.0462, "step": 21190 }, { "grad_norm": 0.47550761699676514, "learning_rate": 8.788709224422333e-05, "loss": 0.0449, "step": 21200 }, { "grad_norm": 0.4641960859298706, "learning_rate": 8.787360176929717e-05, "loss": 0.0422, "step": 21210 }, { "grad_norm": 0.3076764941215515, "learning_rate": 8.786010482280834e-05, "loss": 0.0423, "step": 21220 }, { "grad_norm": 0.4022735059261322, "learning_rate": 8.784660140706306e-05, "loss": 0.0453, "step": 21230 }, { "grad_norm": 0.3644639253616333, "learning_rate": 8.783309152436872e-05, "loss": 0.0397, "step": 21240 }, { "grad_norm": 0.5628812909126282, "learning_rate": 8.781957517703375e-05, "loss": 0.0412, "step": 21250 }, { "grad_norm": 0.3481931686401367, "learning_rate": 8.780605236736776e-05, "loss": 0.0461, "step": 21260 }, { "grad_norm": 0.31135889887809753, "learning_rate": 8.779252309768142e-05, "loss": 0.0392, "step": 21270 }, { "grad_norm": 0.299515962600708, "learning_rate": 8.777898737028652e-05, "loss": 0.0419, "step": 21280 }, { "grad_norm": 0.32377904653549194, "learning_rate": 8.776544518749591e-05, "loss": 0.0394, "step": 21290 }, { "grad_norm": 0.874402642250061, "learning_rate": 8.775189655162364e-05, "loss": 0.0414, "step": 21300 }, { "grad_norm": 0.40388965606689453, "learning_rate": 8.773834146498474e-05, "loss": 0.0416, "step": 21310 }, { "grad_norm": 0.3142060935497284, "learning_rate": 8.772477992989545e-05, "loss": 0.0457, "step": 21320 }, { "grad_norm": 0.42489445209503174, "learning_rate": 8.771121194867304e-05, "loss": 0.0531, "step": 21330 }, { "grad_norm": 0.4715721309185028, "learning_rate": 8.769763752363589e-05, "loss": 0.0422, "step": 21340 }, { "grad_norm": 0.4326429069042206, "learning_rate": 8.768405665710352e-05, "loss": 0.0468, "step": 21350 }, { "grad_norm": 0.6190152168273926, "learning_rate": 8.767046935139655e-05, "loss": 0.047, "step": 21360 }, { "grad_norm": 0.3633706569671631, "learning_rate": 8.765687560883666e-05, "loss": 0.0415, "step": 21370 }, { "grad_norm": 0.47155970335006714, "learning_rate": 8.764327543174664e-05, "loss": 0.044, "step": 21380 }, { "grad_norm": 0.4212954044342041, "learning_rate": 8.762966882245038e-05, "loss": 0.0513, "step": 21390 }, { "grad_norm": 0.7498831748962402, "learning_rate": 8.761605578327291e-05, "loss": 0.0587, "step": 21400 }, { "grad_norm": 0.30157774686813354, "learning_rate": 8.76024363165403e-05, "loss": 0.0524, "step": 21410 }, { "grad_norm": 0.32946234941482544, "learning_rate": 8.758881042457976e-05, "loss": 0.0531, "step": 21420 }, { "grad_norm": 0.46972477436065674, "learning_rate": 8.757517810971957e-05, "loss": 0.0455, "step": 21430 }, { "grad_norm": 0.42094454169273376, "learning_rate": 8.756153937428913e-05, "loss": 0.0456, "step": 21440 }, { "grad_norm": 0.3978346884250641, "learning_rate": 8.754789422061889e-05, "loss": 0.0513, "step": 21450 }, { "grad_norm": 0.35469961166381836, "learning_rate": 8.753424265104052e-05, "loss": 0.0466, "step": 21460 }, { "grad_norm": 0.4539138972759247, "learning_rate": 8.752058466788659e-05, "loss": 0.0545, "step": 21470 }, { "grad_norm": 0.3853146433830261, "learning_rate": 8.750692027349097e-05, "loss": 0.0421, "step": 21480 }, { "grad_norm": 0.431584894657135, "learning_rate": 8.749324947018847e-05, "loss": 0.0356, "step": 21490 }, { "grad_norm": 0.42241019010543823, "learning_rate": 8.747957226031507e-05, "loss": 0.0406, "step": 21500 }, { "grad_norm": 0.3506660759449005, "learning_rate": 8.746588864620787e-05, "loss": 0.0434, "step": 21510 }, { "grad_norm": 0.6929968595504761, "learning_rate": 8.745219863020498e-05, "loss": 0.0354, "step": 21520 }, { "grad_norm": 0.38616034388542175, "learning_rate": 8.743850221464564e-05, "loss": 0.0365, "step": 21530 }, { "grad_norm": 0.36973562836647034, "learning_rate": 8.742479940187026e-05, "loss": 0.0357, "step": 21540 }, { "grad_norm": 0.5105233192443848, "learning_rate": 8.74110901942202e-05, "loss": 0.0379, "step": 21550 }, { "grad_norm": 0.37451353669166565, "learning_rate": 8.739737459403803e-05, "loss": 0.0422, "step": 21560 }, { "grad_norm": 0.3581109046936035, "learning_rate": 8.738365260366737e-05, "loss": 0.0385, "step": 21570 }, { "grad_norm": 0.42213204503059387, "learning_rate": 8.736992422545292e-05, "loss": 0.0396, "step": 21580 }, { "grad_norm": 0.47907277941703796, "learning_rate": 8.73561894617405e-05, "loss": 0.0543, "step": 21590 }, { "grad_norm": 0.3688228130340576, "learning_rate": 8.734244831487697e-05, "loss": 0.0399, "step": 21600 }, { "grad_norm": 0.3034004867076874, "learning_rate": 8.732870078721035e-05, "loss": 0.0488, "step": 21610 }, { "grad_norm": 0.31764698028564453, "learning_rate": 8.731494688108972e-05, "loss": 0.0385, "step": 21620 }, { "grad_norm": 0.36431312561035156, "learning_rate": 8.730118659886523e-05, "loss": 0.0457, "step": 21630 }, { "grad_norm": 0.4362398684024811, "learning_rate": 8.728741994288814e-05, "loss": 0.0382, "step": 21640 }, { "grad_norm": 0.4134388566017151, "learning_rate": 8.727364691551079e-05, "loss": 0.0422, "step": 21650 }, { "grad_norm": 0.5594828724861145, "learning_rate": 8.725986751908661e-05, "loss": 0.0463, "step": 21660 }, { "grad_norm": 0.4521106481552124, "learning_rate": 8.724608175597016e-05, "loss": 0.041, "step": 21670 }, { "grad_norm": 0.5079529285430908, "learning_rate": 8.723228962851699e-05, "loss": 0.035, "step": 21680 }, { "grad_norm": 0.48114845156669617, "learning_rate": 8.721849113908385e-05, "loss": 0.0418, "step": 21690 }, { "grad_norm": 0.3887217044830322, "learning_rate": 8.720468629002848e-05, "loss": 0.0437, "step": 21700 }, { "grad_norm": 0.39454615116119385, "learning_rate": 8.719087508370978e-05, "loss": 0.0393, "step": 21710 }, { "grad_norm": 0.5725314617156982, "learning_rate": 8.717705752248772e-05, "loss": 0.05, "step": 21720 }, { "grad_norm": 0.5996996760368347, "learning_rate": 8.71632336087233e-05, "loss": 0.0374, "step": 21730 }, { "grad_norm": 0.3783387839794159, "learning_rate": 8.71494033447787e-05, "loss": 0.0402, "step": 21740 }, { "grad_norm": 0.5916247963905334, "learning_rate": 8.713556673301708e-05, "loss": 0.0405, "step": 21750 }, { "grad_norm": 0.5462086796760559, "learning_rate": 8.712172377580278e-05, "loss": 0.0625, "step": 21760 }, { "grad_norm": 0.48514604568481445, "learning_rate": 8.710787447550114e-05, "loss": 0.0491, "step": 21770 }, { "grad_norm": 0.39239534735679626, "learning_rate": 8.70940188344787e-05, "loss": 0.053, "step": 21780 }, { "grad_norm": 0.40888530015945435, "learning_rate": 8.708015685510293e-05, "loss": 0.0407, "step": 21790 }, { "grad_norm": 0.453909307718277, "learning_rate": 8.706628853974252e-05, "loss": 0.0414, "step": 21800 }, { "grad_norm": 0.3720516860485077, "learning_rate": 8.705241389076715e-05, "loss": 0.0371, "step": 21810 }, { "grad_norm": 0.38460931181907654, "learning_rate": 8.703853291054764e-05, "loss": 0.0418, "step": 21820 }, { "grad_norm": 0.4893437325954437, "learning_rate": 8.702464560145587e-05, "loss": 0.0462, "step": 21830 }, { "grad_norm": 0.40328094363212585, "learning_rate": 8.701075196586476e-05, "loss": 0.0454, "step": 21840 }, { "grad_norm": 0.33448535203933716, "learning_rate": 8.699685200614842e-05, "loss": 0.0409, "step": 21850 }, { "grad_norm": 0.4403231143951416, "learning_rate": 8.698294572468193e-05, "loss": 0.0477, "step": 21860 }, { "grad_norm": 0.31779584288597107, "learning_rate": 8.696903312384148e-05, "loss": 0.0463, "step": 21870 }, { "grad_norm": 0.43012017011642456, "learning_rate": 8.695511420600439e-05, "loss": 0.0467, "step": 21880 }, { "grad_norm": 0.3480916917324066, "learning_rate": 8.694118897354901e-05, "loss": 0.039, "step": 21890 }, { "grad_norm": 0.39864760637283325, "learning_rate": 8.692725742885478e-05, "loss": 0.0337, "step": 21900 }, { "grad_norm": 0.5442846417427063, "learning_rate": 8.691331957430221e-05, "loss": 0.0424, "step": 21910 }, { "grad_norm": 0.506366491317749, "learning_rate": 8.68993754122729e-05, "loss": 0.0445, "step": 21920 }, { "grad_norm": 0.45883849263191223, "learning_rate": 8.688542494514955e-05, "loss": 0.0418, "step": 21930 }, { "grad_norm": 0.4696846008300781, "learning_rate": 8.68714681753159e-05, "loss": 0.0455, "step": 21940 }, { "grad_norm": 0.3725524842739105, "learning_rate": 8.685750510515676e-05, "loss": 0.0404, "step": 21950 }, { "grad_norm": 0.45401695370674133, "learning_rate": 8.684353573705805e-05, "loss": 0.038, "step": 21960 }, { "grad_norm": 0.3969828188419342, "learning_rate": 8.682956007340677e-05, "loss": 0.0364, "step": 21970 }, { "grad_norm": 0.38764965534210205, "learning_rate": 8.681557811659095e-05, "loss": 0.0405, "step": 21980 }, { "grad_norm": 0.400790274143219, "learning_rate": 8.680158986899974e-05, "loss": 0.0378, "step": 21990 }, { "grad_norm": 0.34655919671058655, "learning_rate": 8.678759533302335e-05, "loss": 0.0382, "step": 22000 }, { "grad_norm": 0.36535099148750305, "learning_rate": 8.677359451105308e-05, "loss": 0.0373, "step": 22010 }, { "grad_norm": 0.3923311233520508, "learning_rate": 8.675958740548123e-05, "loss": 0.0453, "step": 22020 }, { "grad_norm": 0.3901534676551819, "learning_rate": 8.674557401870129e-05, "loss": 0.0455, "step": 22030 }, { "grad_norm": 0.36467358469963074, "learning_rate": 8.673155435310775e-05, "loss": 0.0413, "step": 22040 }, { "grad_norm": 0.5329539775848389, "learning_rate": 8.671752841109617e-05, "loss": 0.0355, "step": 22050 }, { "grad_norm": 0.5176284313201904, "learning_rate": 8.670349619506321e-05, "loss": 0.0412, "step": 22060 }, { "grad_norm": 0.36251839995384216, "learning_rate": 8.66894577074066e-05, "loss": 0.035, "step": 22070 }, { "grad_norm": 0.3848806917667389, "learning_rate": 8.667541295052513e-05, "loss": 0.0385, "step": 22080 }, { "grad_norm": 0.43465474247932434, "learning_rate": 8.666136192681865e-05, "loss": 0.0489, "step": 22090 }, { "grad_norm": 0.4341127872467041, "learning_rate": 8.664730463868811e-05, "loss": 0.0418, "step": 22100 }, { "grad_norm": 0.4515470862388611, "learning_rate": 8.663324108853552e-05, "loss": 0.0513, "step": 22110 }, { "grad_norm": 0.4270012080669403, "learning_rate": 8.661917127876395e-05, "loss": 0.0549, "step": 22120 }, { "grad_norm": 0.34365114569664, "learning_rate": 8.660509521177754e-05, "loss": 0.0414, "step": 22130 }, { "grad_norm": 0.4088355004787445, "learning_rate": 8.65910128899815e-05, "loss": 0.0395, "step": 22140 }, { "grad_norm": 0.46432197093963623, "learning_rate": 8.657692431578214e-05, "loss": 0.0417, "step": 22150 }, { "grad_norm": 0.5048816204071045, "learning_rate": 8.656282949158679e-05, "loss": 0.0413, "step": 22160 }, { "grad_norm": 0.3291794955730438, "learning_rate": 8.654872841980388e-05, "loss": 0.0309, "step": 22170 }, { "grad_norm": 0.3446251153945923, "learning_rate": 8.653462110284289e-05, "loss": 0.0305, "step": 22180 }, { "grad_norm": 0.23954853415489197, "learning_rate": 8.652050754311437e-05, "loss": 0.0361, "step": 22190 }, { "grad_norm": 0.5025196671485901, "learning_rate": 8.650638774302995e-05, "loss": 0.0398, "step": 22200 }, { "grad_norm": 0.6535457968711853, "learning_rate": 8.649226170500233e-05, "loss": 0.0438, "step": 22210 }, { "grad_norm": 0.5955875515937805, "learning_rate": 8.647812943144524e-05, "loss": 0.0353, "step": 22220 }, { "grad_norm": 0.3150794506072998, "learning_rate": 8.646399092477351e-05, "loss": 0.0323, "step": 22230 }, { "grad_norm": 0.4300084710121155, "learning_rate": 8.644984618740301e-05, "loss": 0.0429, "step": 22240 }, { "grad_norm": 0.4038391411304474, "learning_rate": 8.643569522175073e-05, "loss": 0.0303, "step": 22250 }, { "grad_norm": 0.38777387142181396, "learning_rate": 8.642153803023463e-05, "loss": 0.041, "step": 22260 }, { "grad_norm": 0.7417004108428955, "learning_rate": 8.640737461527383e-05, "loss": 0.0418, "step": 22270 }, { "grad_norm": 0.48775428533554077, "learning_rate": 8.639320497928845e-05, "loss": 0.0396, "step": 22280 }, { "grad_norm": 0.3929864168167114, "learning_rate": 8.637902912469969e-05, "loss": 0.0347, "step": 22290 }, { "grad_norm": 0.44274893403053284, "learning_rate": 8.636484705392982e-05, "loss": 0.0353, "step": 22300 }, { "grad_norm": 0.34268316626548767, "learning_rate": 8.635065876940216e-05, "loss": 0.0383, "step": 22310 }, { "grad_norm": 0.48235443234443665, "learning_rate": 8.633646427354112e-05, "loss": 0.0429, "step": 22320 }, { "grad_norm": 0.3592666983604431, "learning_rate": 8.632226356877213e-05, "loss": 0.0398, "step": 22330 }, { "grad_norm": 0.5772674679756165, "learning_rate": 8.630805665752173e-05, "loss": 0.0487, "step": 22340 }, { "grad_norm": 0.46362823247909546, "learning_rate": 8.629384354221748e-05, "loss": 0.0413, "step": 22350 }, { "grad_norm": 0.2985246777534485, "learning_rate": 8.627962422528797e-05, "loss": 0.0401, "step": 22360 }, { "grad_norm": 1.1086804866790771, "learning_rate": 8.626539870916296e-05, "loss": 0.0556, "step": 22370 }, { "grad_norm": 0.4194813072681427, "learning_rate": 8.625116699627317e-05, "loss": 0.0414, "step": 22380 }, { "grad_norm": 0.9819856882095337, "learning_rate": 8.623692908905041e-05, "loss": 0.0398, "step": 22390 }, { "grad_norm": 0.3723289966583252, "learning_rate": 8.622268498992755e-05, "loss": 0.045, "step": 22400 }, { "grad_norm": 0.3456726372241974, "learning_rate": 8.620843470133851e-05, "loss": 0.0526, "step": 22410 }, { "grad_norm": 0.4120030701160431, "learning_rate": 8.619417822571829e-05, "loss": 0.0489, "step": 22420 }, { "grad_norm": 0.40380439162254333, "learning_rate": 8.617991556550292e-05, "loss": 0.0345, "step": 22430 }, { "grad_norm": 0.46556222438812256, "learning_rate": 8.616564672312952e-05, "loss": 0.0496, "step": 22440 }, { "grad_norm": 0.5931329131126404, "learning_rate": 8.61513717010362e-05, "loss": 0.0452, "step": 22450 }, { "grad_norm": 0.5241634249687195, "learning_rate": 8.613709050166221e-05, "loss": 0.0451, "step": 22460 }, { "grad_norm": 0.3020169138908386, "learning_rate": 8.61228031274478e-05, "loss": 0.0545, "step": 22470 }, { "grad_norm": 0.5521996021270752, "learning_rate": 8.610850958083431e-05, "loss": 0.0458, "step": 22480 }, { "grad_norm": 0.4868680536746979, "learning_rate": 8.609420986426409e-05, "loss": 0.0431, "step": 22490 }, { "grad_norm": 0.3750680685043335, "learning_rate": 8.60799039801806e-05, "loss": 0.0382, "step": 22500 }, { "grad_norm": 0.2853897511959076, "learning_rate": 8.606559193102828e-05, "loss": 0.0305, "step": 22510 }, { "grad_norm": 0.8846461772918701, "learning_rate": 8.605127371925273e-05, "loss": 0.0489, "step": 22520 }, { "grad_norm": 0.4279473125934601, "learning_rate": 8.603694934730047e-05, "loss": 0.0446, "step": 22530 }, { "grad_norm": 0.4349218010902405, "learning_rate": 8.602261881761919e-05, "loss": 0.0307, "step": 22540 }, { "grad_norm": 0.24069653451442719, "learning_rate": 8.600828213265759e-05, "loss": 0.0487, "step": 22550 }, { "grad_norm": 0.4672398269176483, "learning_rate": 8.599393929486539e-05, "loss": 0.0542, "step": 22560 }, { "grad_norm": 0.36541080474853516, "learning_rate": 8.59795903066934e-05, "loss": 0.0417, "step": 22570 }, { "grad_norm": 0.3069100081920624, "learning_rate": 8.596523517059347e-05, "loss": 0.0396, "step": 22580 }, { "grad_norm": 0.3009398281574249, "learning_rate": 8.59508738890185e-05, "loss": 0.0368, "step": 22590 }, { "grad_norm": 0.30294889211654663, "learning_rate": 8.593650646442246e-05, "loss": 0.036, "step": 22600 }, { "grad_norm": 0.38313016295433044, "learning_rate": 8.59221328992603e-05, "loss": 0.0326, "step": 22610 }, { "grad_norm": 0.2673264443874359, "learning_rate": 8.590775319598813e-05, "loss": 0.0331, "step": 22620 }, { "grad_norm": 0.4027816355228424, "learning_rate": 8.589336735706301e-05, "loss": 0.0326, "step": 22630 }, { "grad_norm": 0.4285391867160797, "learning_rate": 8.587897538494307e-05, "loss": 0.0538, "step": 22640 }, { "grad_norm": 0.3853621780872345, "learning_rate": 8.586457728208756e-05, "loss": 0.0368, "step": 22650 }, { "grad_norm": 0.382097989320755, "learning_rate": 8.585017305095667e-05, "loss": 0.0406, "step": 22660 }, { "grad_norm": 0.4392511546611786, "learning_rate": 8.583576269401173e-05, "loss": 0.0376, "step": 22670 }, { "grad_norm": 0.3278123736381531, "learning_rate": 8.582134621371504e-05, "loss": 0.0352, "step": 22680 }, { "grad_norm": 0.4459514021873474, "learning_rate": 8.580692361253e-05, "loss": 0.037, "step": 22690 }, { "grad_norm": 0.4914637804031372, "learning_rate": 8.579249489292104e-05, "loss": 0.0378, "step": 22700 }, { "grad_norm": 0.3897912800312042, "learning_rate": 8.577806005735363e-05, "loss": 0.0431, "step": 22710 }, { "grad_norm": 0.3359130918979645, "learning_rate": 8.576361910829429e-05, "loss": 0.0293, "step": 22720 }, { "grad_norm": 0.49091604351997375, "learning_rate": 8.574917204821057e-05, "loss": 0.0435, "step": 22730 }, { "grad_norm": 0.3243132531642914, "learning_rate": 8.57347188795711e-05, "loss": 0.0424, "step": 22740 }, { "grad_norm": 0.45343461632728577, "learning_rate": 8.572025960484551e-05, "loss": 0.0409, "step": 22750 }, { "grad_norm": 0.40628015995025635, "learning_rate": 8.57057942265045e-05, "loss": 0.0489, "step": 22760 }, { "grad_norm": 0.47544315457344055, "learning_rate": 8.569132274701984e-05, "loss": 0.0375, "step": 22770 }, { "grad_norm": 0.3640468418598175, "learning_rate": 8.567684516886427e-05, "loss": 0.0396, "step": 22780 }, { "grad_norm": 0.4115819036960602, "learning_rate": 8.56623614945116e-05, "loss": 0.0342, "step": 22790 }, { "grad_norm": 0.5039069652557373, "learning_rate": 8.564787172643675e-05, "loss": 0.0394, "step": 22800 }, { "grad_norm": 0.8119823336601257, "learning_rate": 8.563337586711559e-05, "loss": 0.0451, "step": 22810 }, { "grad_norm": 0.40232759714126587, "learning_rate": 8.561887391902506e-05, "loss": 0.0485, "step": 22820 }, { "grad_norm": 0.4161657989025116, "learning_rate": 8.560436588464316e-05, "loss": 0.0365, "step": 22830 }, { "grad_norm": 0.358784943819046, "learning_rate": 8.55898517664489e-05, "loss": 0.0487, "step": 22840 }, { "grad_norm": 0.5189841389656067, "learning_rate": 8.557533156692236e-05, "loss": 0.0535, "step": 22850 }, { "grad_norm": 0.324388712644577, "learning_rate": 8.556080528854467e-05, "loss": 0.0344, "step": 22860 }, { "grad_norm": 0.44110700488090515, "learning_rate": 8.554627293379791e-05, "loss": 0.0353, "step": 22870 }, { "grad_norm": 0.44962891936302185, "learning_rate": 8.553173450516531e-05, "loss": 0.0501, "step": 22880 }, { "grad_norm": 0.3192385137081146, "learning_rate": 8.551719000513108e-05, "loss": 0.0389, "step": 22890 }, { "grad_norm": 0.4592524766921997, "learning_rate": 8.550263943618049e-05, "loss": 0.0555, "step": 22900 }, { "grad_norm": 0.37780946493148804, "learning_rate": 8.54880828007998e-05, "loss": 0.0467, "step": 22910 }, { "grad_norm": 0.330398827791214, "learning_rate": 8.547352010147637e-05, "loss": 0.0442, "step": 22920 }, { "grad_norm": 0.299932062625885, "learning_rate": 8.545895134069855e-05, "loss": 0.0339, "step": 22930 }, { "grad_norm": 0.4147069454193115, "learning_rate": 8.544437652095576e-05, "loss": 0.0376, "step": 22940 }, { "grad_norm": 0.4152790307998657, "learning_rate": 8.542979564473843e-05, "loss": 0.0497, "step": 22950 }, { "grad_norm": 0.29710039496421814, "learning_rate": 8.541520871453802e-05, "loss": 0.0352, "step": 22960 }, { "grad_norm": 0.40615028142929077, "learning_rate": 8.540061573284705e-05, "loss": 0.0321, "step": 22970 }, { "grad_norm": 0.3377879559993744, "learning_rate": 8.538601670215906e-05, "loss": 0.0351, "step": 22980 }, { "grad_norm": 0.38129720091819763, "learning_rate": 8.537141162496864e-05, "loss": 0.0374, "step": 22990 }, { "grad_norm": 0.4798239469528198, "learning_rate": 8.535680050377137e-05, "loss": 0.0364, "step": 23000 }, { "grad_norm": 0.24400694668293, "learning_rate": 8.534218334106391e-05, "loss": 0.0358, "step": 23010 }, { "grad_norm": 0.39705294370651245, "learning_rate": 8.532756013934393e-05, "loss": 0.0359, "step": 23020 }, { "grad_norm": 0.3640195429325104, "learning_rate": 8.531293090111012e-05, "loss": 0.046, "step": 23030 }, { "grad_norm": 0.4358002543449402, "learning_rate": 8.529829562886225e-05, "loss": 0.0388, "step": 23040 }, { "grad_norm": 0.4252126216888428, "learning_rate": 8.528365432510105e-05, "loss": 0.0444, "step": 23050 }, { "grad_norm": 0.29355260729789734, "learning_rate": 8.526900699232833e-05, "loss": 0.0364, "step": 23060 }, { "grad_norm": 0.40762612223625183, "learning_rate": 8.525435363304695e-05, "loss": 0.0374, "step": 23070 }, { "grad_norm": 0.4168974459171295, "learning_rate": 8.523969424976072e-05, "loss": 0.0334, "step": 23080 }, { "grad_norm": 0.3290300965309143, "learning_rate": 8.522502884497457e-05, "loss": 0.0319, "step": 23090 }, { "grad_norm": 0.3824249505996704, "learning_rate": 8.521035742119437e-05, "loss": 0.037, "step": 23100 }, { "grad_norm": 0.3058430850505829, "learning_rate": 8.519567998092712e-05, "loss": 0.0496, "step": 23110 }, { "grad_norm": 0.3682359457015991, "learning_rate": 8.518099652668075e-05, "loss": 0.0374, "step": 23120 }, { "grad_norm": 0.29861530661582947, "learning_rate": 8.516630706096429e-05, "loss": 0.0331, "step": 23130 }, { "grad_norm": 0.3337567150592804, "learning_rate": 8.515161158628773e-05, "loss": 0.0306, "step": 23140 }, { "grad_norm": 0.31694602966308594, "learning_rate": 8.513691010516216e-05, "loss": 0.0535, "step": 23150 }, { "grad_norm": 0.3358549475669861, "learning_rate": 8.512220262009966e-05, "loss": 0.0352, "step": 23160 }, { "grad_norm": 0.7338257431983948, "learning_rate": 8.510748913361332e-05, "loss": 0.0403, "step": 23170 }, { "grad_norm": 0.4874925911426544, "learning_rate": 8.509276964821726e-05, "loss": 0.0374, "step": 23180 }, { "grad_norm": 0.3619789779186249, "learning_rate": 8.507804416642669e-05, "loss": 0.0362, "step": 23190 }, { "grad_norm": 0.3608023524284363, "learning_rate": 8.506331269075774e-05, "loss": 0.0335, "step": 23200 }, { "grad_norm": 0.4574397802352905, "learning_rate": 8.504857522372765e-05, "loss": 0.0325, "step": 23210 }, { "grad_norm": 0.3227071762084961, "learning_rate": 8.503383176785461e-05, "loss": 0.034, "step": 23220 }, { "grad_norm": 0.42295271158218384, "learning_rate": 8.501908232565792e-05, "loss": 0.0391, "step": 23230 }, { "grad_norm": 0.46902188658714294, "learning_rate": 8.50043268996578e-05, "loss": 0.0302, "step": 23240 }, { "grad_norm": 0.3032638728618622, "learning_rate": 8.498956549237562e-05, "loss": 0.0401, "step": 23250 }, { "grad_norm": 0.5223937034606934, "learning_rate": 8.497479810633366e-05, "loss": 0.0407, "step": 23260 }, { "grad_norm": 0.582065761089325, "learning_rate": 8.496002474405525e-05, "loss": 0.0402, "step": 23270 }, { "grad_norm": 0.4455065131187439, "learning_rate": 8.494524540806478e-05, "loss": 0.0417, "step": 23280 }, { "grad_norm": 0.39639726281166077, "learning_rate": 8.493046010088761e-05, "loss": 0.0527, "step": 23290 }, { "grad_norm": 0.5570604205131531, "learning_rate": 8.491566882505018e-05, "loss": 0.0474, "step": 23300 }, { "grad_norm": 0.4875354468822479, "learning_rate": 8.490087158307988e-05, "loss": 0.0432, "step": 23310 }, { "grad_norm": 0.5812391638755798, "learning_rate": 8.488606837750518e-05, "loss": 0.0397, "step": 23320 }, { "grad_norm": 0.3770366311073303, "learning_rate": 8.487125921085552e-05, "loss": 0.0377, "step": 23330 }, { "grad_norm": 0.2996135950088501, "learning_rate": 8.485644408566141e-05, "loss": 0.0412, "step": 23340 }, { "grad_norm": 0.3694923520088196, "learning_rate": 8.484162300445431e-05, "loss": 0.0395, "step": 23350 }, { "grad_norm": 0.3599699139595032, "learning_rate": 8.482679596976676e-05, "loss": 0.035, "step": 23360 }, { "grad_norm": 0.28566041588783264, "learning_rate": 8.48119629841323e-05, "loss": 0.0413, "step": 23370 }, { "grad_norm": 0.45353105664253235, "learning_rate": 8.479712405008547e-05, "loss": 0.0479, "step": 23380 }, { "grad_norm": 0.3331073820590973, "learning_rate": 8.478227917016184e-05, "loss": 0.0504, "step": 23390 }, { "grad_norm": 0.24208317697048187, "learning_rate": 8.476742834689801e-05, "loss": 0.0381, "step": 23400 }, { "grad_norm": 0.3945857882499695, "learning_rate": 8.475257158283157e-05, "loss": 0.046, "step": 23410 }, { "grad_norm": 0.3446183204650879, "learning_rate": 8.473770888050112e-05, "loss": 0.0287, "step": 23420 }, { "grad_norm": 0.6569340229034424, "learning_rate": 8.47228402424463e-05, "loss": 0.0445, "step": 23430 }, { "grad_norm": 0.4032449722290039, "learning_rate": 8.470796567120775e-05, "loss": 0.0336, "step": 23440 }, { "grad_norm": 0.37666791677474976, "learning_rate": 8.469308516932714e-05, "loss": 0.0418, "step": 23450 }, { "grad_norm": 0.47443887591362, "learning_rate": 8.467819873934714e-05, "loss": 0.0375, "step": 23460 }, { "grad_norm": 0.4007268249988556, "learning_rate": 8.466330638381143e-05, "loss": 0.0386, "step": 23470 }, { "grad_norm": 0.36449190974235535, "learning_rate": 8.464840810526469e-05, "loss": 0.0353, "step": 23480 }, { "grad_norm": 0.4549974799156189, "learning_rate": 8.463350390625264e-05, "loss": 0.0473, "step": 23490 }, { "grad_norm": 0.36054182052612305, "learning_rate": 8.4618593789322e-05, "loss": 0.0497, "step": 23500 }, { "grad_norm": 0.4478822350502014, "learning_rate": 8.46036777570205e-05, "loss": 0.0442, "step": 23510 }, { "grad_norm": 0.38880088925361633, "learning_rate": 8.458875581189688e-05, "loss": 0.0568, "step": 23520 }, { "grad_norm": 0.37238454818725586, "learning_rate": 8.457382795650092e-05, "loss": 0.035, "step": 23530 }, { "grad_norm": 0.2783520817756653, "learning_rate": 8.455889419338335e-05, "loss": 0.0392, "step": 23540 }, { "grad_norm": 0.2925814688205719, "learning_rate": 8.454395452509593e-05, "loss": 0.0317, "step": 23550 }, { "grad_norm": 0.27866965532302856, "learning_rate": 8.452900895419146e-05, "loss": 0.0377, "step": 23560 }, { "grad_norm": 0.28780004382133484, "learning_rate": 8.451405748322376e-05, "loss": 0.0383, "step": 23570 }, { "grad_norm": 0.3760055899620056, "learning_rate": 8.449910011474759e-05, "loss": 0.0308, "step": 23580 }, { "grad_norm": 0.3658260405063629, "learning_rate": 8.448413685131876e-05, "loss": 0.0476, "step": 23590 }, { "grad_norm": 0.4192010462284088, "learning_rate": 8.446916769549407e-05, "loss": 0.04, "step": 23600 }, { "grad_norm": 0.368377149105072, "learning_rate": 8.445419264983136e-05, "loss": 0.0446, "step": 23610 }, { "grad_norm": 0.3298661410808563, "learning_rate": 8.443921171688947e-05, "loss": 0.0361, "step": 23620 }, { "grad_norm": 0.4046891927719116, "learning_rate": 8.442422489922819e-05, "loss": 0.0386, "step": 23630 }, { "grad_norm": 0.5129214525222778, "learning_rate": 8.440923219940838e-05, "loss": 0.0475, "step": 23640 }, { "grad_norm": 0.7371879816055298, "learning_rate": 8.439423361999189e-05, "loss": 0.0408, "step": 23650 }, { "grad_norm": 0.39359143376350403, "learning_rate": 8.437922916354155e-05, "loss": 0.0432, "step": 23660 }, { "grad_norm": 0.3055610954761505, "learning_rate": 8.436421883262123e-05, "loss": 0.0351, "step": 23670 }, { "grad_norm": 0.35022038221359253, "learning_rate": 8.434920262979577e-05, "loss": 0.0456, "step": 23680 }, { "grad_norm": 0.3188483715057373, "learning_rate": 8.433418055763104e-05, "loss": 0.0328, "step": 23690 }, { "grad_norm": 0.5356181859970093, "learning_rate": 8.431915261869389e-05, "loss": 0.0397, "step": 23700 }, { "grad_norm": 0.39337071776390076, "learning_rate": 8.43041188155522e-05, "loss": 0.0369, "step": 23710 }, { "grad_norm": 0.2718241512775421, "learning_rate": 8.428907915077481e-05, "loss": 0.0384, "step": 23720 }, { "grad_norm": 0.4364806115627289, "learning_rate": 8.42740336269316e-05, "loss": 0.0409, "step": 23730 }, { "grad_norm": 0.5290950536727905, "learning_rate": 8.425898224659345e-05, "loss": 0.0386, "step": 23740 }, { "grad_norm": 0.3382754623889923, "learning_rate": 8.42439250123322e-05, "loss": 0.038, "step": 23750 }, { "grad_norm": 0.406888872385025, "learning_rate": 8.422886192672076e-05, "loss": 0.0466, "step": 23760 }, { "grad_norm": 0.3560609221458435, "learning_rate": 8.421379299233297e-05, "loss": 0.0388, "step": 23770 }, { "grad_norm": 0.8309618234634399, "learning_rate": 8.419871821174371e-05, "loss": 0.0396, "step": 23780 }, { "grad_norm": 0.3358793556690216, "learning_rate": 8.418363758752884e-05, "loss": 0.0398, "step": 23790 }, { "grad_norm": 0.4100521504878998, "learning_rate": 8.416855112226523e-05, "loss": 0.0416, "step": 23800 }, { "grad_norm": 0.3876206874847412, "learning_rate": 8.415345881853075e-05, "loss": 0.0386, "step": 23810 }, { "grad_norm": 0.4263685941696167, "learning_rate": 8.413836067890426e-05, "loss": 0.0432, "step": 23820 }, { "grad_norm": 0.8099292516708374, "learning_rate": 8.41232567059656e-05, "loss": 0.0498, "step": 23830 }, { "grad_norm": 0.3867720067501068, "learning_rate": 8.410814690229565e-05, "loss": 0.0473, "step": 23840 }, { "grad_norm": 0.6509806513786316, "learning_rate": 8.409303127047626e-05, "loss": 0.0434, "step": 23850 }, { "grad_norm": 0.37724778056144714, "learning_rate": 8.407790981309028e-05, "loss": 0.0419, "step": 23860 }, { "grad_norm": 0.4541305899620056, "learning_rate": 8.406278253272153e-05, "loss": 0.0519, "step": 23870 }, { "grad_norm": 0.35653069615364075, "learning_rate": 8.404764943195487e-05, "loss": 0.0374, "step": 23880 }, { "grad_norm": 0.30719688534736633, "learning_rate": 8.403251051337613e-05, "loss": 0.0483, "step": 23890 }, { "grad_norm": 0.32501938939094543, "learning_rate": 8.401736577957214e-05, "loss": 0.0331, "step": 23900 }, { "grad_norm": 0.34461814165115356, "learning_rate": 8.40022152331307e-05, "loss": 0.037, "step": 23910 }, { "grad_norm": 0.6013424396514893, "learning_rate": 8.398705887664064e-05, "loss": 0.0474, "step": 23920 }, { "grad_norm": 0.2740260064601898, "learning_rate": 8.397189671269177e-05, "loss": 0.0336, "step": 23930 }, { "grad_norm": 0.42596375942230225, "learning_rate": 8.395672874387488e-05, "loss": 0.0304, "step": 23940 }, { "grad_norm": 0.36422139406204224, "learning_rate": 8.394155497278177e-05, "loss": 0.0324, "step": 23950 }, { "grad_norm": 0.6251114010810852, "learning_rate": 8.392637540200523e-05, "loss": 0.0457, "step": 23960 }, { "grad_norm": 0.2784256041049957, "learning_rate": 8.391119003413902e-05, "loss": 0.0352, "step": 23970 }, { "grad_norm": 0.37316030263900757, "learning_rate": 8.38959988717779e-05, "loss": 0.0382, "step": 23980 }, { "grad_norm": 0.3670729696750641, "learning_rate": 8.388080191751764e-05, "loss": 0.0447, "step": 23990 }, { "grad_norm": 0.34520119428634644, "learning_rate": 8.386559917395496e-05, "loss": 0.0359, "step": 24000 }, { "grad_norm": 0.3215217590332031, "learning_rate": 8.385039064368761e-05, "loss": 0.0295, "step": 24010 }, { "grad_norm": 0.26023098826408386, "learning_rate": 8.383517632931431e-05, "loss": 0.0318, "step": 24020 }, { "grad_norm": 0.42411109805107117, "learning_rate": 8.381995623343477e-05, "loss": 0.0322, "step": 24030 }, { "grad_norm": 0.5321361422538757, "learning_rate": 8.380473035864968e-05, "loss": 0.0449, "step": 24040 }, { "grad_norm": 0.34493544697761536, "learning_rate": 8.378949870756076e-05, "loss": 0.0454, "step": 24050 }, { "grad_norm": 0.23108354210853577, "learning_rate": 8.377426128277063e-05, "loss": 0.0392, "step": 24060 }, { "grad_norm": 0.3097778856754303, "learning_rate": 8.375901808688298e-05, "loss": 0.0284, "step": 24070 }, { "grad_norm": 0.46681785583496094, "learning_rate": 8.374376912250246e-05, "loss": 0.0477, "step": 24080 }, { "grad_norm": 0.515021026134491, "learning_rate": 8.372851439223468e-05, "loss": 0.0343, "step": 24090 }, { "grad_norm": 0.3505977988243103, "learning_rate": 8.371325389868627e-05, "loss": 0.0415, "step": 24100 }, { "grad_norm": 0.45480239391326904, "learning_rate": 8.369798764446482e-05, "loss": 0.0328, "step": 24110 }, { "grad_norm": 0.41604146361351013, "learning_rate": 8.368271563217893e-05, "loss": 0.0364, "step": 24120 }, { "grad_norm": 0.3466338515281677, "learning_rate": 8.366743786443817e-05, "loss": 0.0518, "step": 24130 }, { "grad_norm": 0.3841904103755951, "learning_rate": 8.365215434385309e-05, "loss": 0.0413, "step": 24140 }, { "grad_norm": 0.3154439330101013, "learning_rate": 8.36368650730352e-05, "loss": 0.0347, "step": 24150 }, { "grad_norm": 0.5770677328109741, "learning_rate": 8.362157005459705e-05, "loss": 0.0444, "step": 24160 }, { "grad_norm": 0.31989914178848267, "learning_rate": 8.360626929115213e-05, "loss": 0.0358, "step": 24170 }, { "grad_norm": 0.422523558139801, "learning_rate": 8.359096278531492e-05, "loss": 0.0436, "step": 24180 }, { "grad_norm": 0.28160297870635986, "learning_rate": 8.357565053970088e-05, "loss": 0.0362, "step": 24190 }, { "grad_norm": 0.29187697172164917, "learning_rate": 8.356033255692647e-05, "loss": 0.0369, "step": 24200 }, { "grad_norm": 0.34438085556030273, "learning_rate": 8.354500883960911e-05, "loss": 0.0336, "step": 24210 }, { "grad_norm": 0.2920222878456116, "learning_rate": 8.352967939036717e-05, "loss": 0.0422, "step": 24220 }, { "grad_norm": 0.3912653923034668, "learning_rate": 8.35143442118201e-05, "loss": 0.0426, "step": 24230 }, { "grad_norm": 0.27101826667785645, "learning_rate": 8.349900330658819e-05, "loss": 0.0343, "step": 24240 }, { "grad_norm": 0.3934495151042938, "learning_rate": 8.348365667729284e-05, "loss": 0.0369, "step": 24250 }, { "grad_norm": 0.43457818031311035, "learning_rate": 8.346830432655633e-05, "loss": 0.0271, "step": 24260 }, { "grad_norm": 0.4297574758529663, "learning_rate": 8.345294625700195e-05, "loss": 0.0409, "step": 24270 }, { "grad_norm": 0.38288432359695435, "learning_rate": 8.343758247125402e-05, "loss": 0.0345, "step": 24280 }, { "grad_norm": 0.46307799220085144, "learning_rate": 8.342221297193776e-05, "loss": 0.048, "step": 24290 }, { "grad_norm": 0.37432411313056946, "learning_rate": 8.34068377616794e-05, "loss": 0.0417, "step": 24300 }, { "grad_norm": 0.508449375629425, "learning_rate": 8.339145684310615e-05, "loss": 0.0409, "step": 24310 }, { "grad_norm": 0.4251790940761566, "learning_rate": 8.337607021884618e-05, "loss": 0.0404, "step": 24320 }, { "grad_norm": 0.33005964756011963, "learning_rate": 8.336067789152867e-05, "loss": 0.0306, "step": 24330 }, { "grad_norm": 0.5185114145278931, "learning_rate": 8.334527986378369e-05, "loss": 0.0462, "step": 24340 }, { "grad_norm": 0.3126010000705719, "learning_rate": 8.332987613824239e-05, "loss": 0.0456, "step": 24350 }, { "grad_norm": 0.34460335969924927, "learning_rate": 8.331446671753685e-05, "loss": 0.0353, "step": 24360 }, { "grad_norm": 0.3727891445159912, "learning_rate": 8.329905160430007e-05, "loss": 0.0326, "step": 24370 }, { "grad_norm": 0.4107999801635742, "learning_rate": 8.328363080116611e-05, "loss": 0.0385, "step": 24380 }, { "grad_norm": 0.2804005444049835, "learning_rate": 8.326820431076997e-05, "loss": 0.0322, "step": 24390 }, { "grad_norm": 0.48790302872657776, "learning_rate": 8.325277213574759e-05, "loss": 0.0371, "step": 24400 }, { "grad_norm": 0.5145897269248962, "learning_rate": 8.32373342787359e-05, "loss": 0.0384, "step": 24410 }, { "grad_norm": 0.31482991576194763, "learning_rate": 8.322189074237285e-05, "loss": 0.0359, "step": 24420 }, { "grad_norm": 0.30580681562423706, "learning_rate": 8.32064415292973e-05, "loss": 0.0381, "step": 24430 }, { "grad_norm": 0.32883456349372864, "learning_rate": 8.319098664214907e-05, "loss": 0.0308, "step": 24440 }, { "grad_norm": 0.2938387095928192, "learning_rate": 8.3175526083569e-05, "loss": 0.04, "step": 24450 }, { "grad_norm": 0.415071576833725, "learning_rate": 8.316005985619889e-05, "loss": 0.0431, "step": 24460 }, { "grad_norm": 0.3825474977493286, "learning_rate": 8.314458796268147e-05, "loss": 0.0433, "step": 24470 }, { "grad_norm": 0.23583078384399414, "learning_rate": 8.312911040566047e-05, "loss": 0.0335, "step": 24480 }, { "grad_norm": 0.34945279359817505, "learning_rate": 8.31136271877806e-05, "loss": 0.0399, "step": 24490 }, { "grad_norm": 0.49892279505729675, "learning_rate": 8.309813831168748e-05, "loss": 0.0389, "step": 24500 }, { "grad_norm": 0.3003208041191101, "learning_rate": 8.308264378002777e-05, "loss": 0.0272, "step": 24510 }, { "grad_norm": 0.4372708201408386, "learning_rate": 8.306714359544906e-05, "loss": 0.0449, "step": 24520 }, { "grad_norm": 0.3919720947742462, "learning_rate": 8.30516377605999e-05, "loss": 0.0338, "step": 24530 }, { "grad_norm": 0.37409889698028564, "learning_rate": 8.30361262781298e-05, "loss": 0.0295, "step": 24540 }, { "grad_norm": 0.5010517835617065, "learning_rate": 8.302060915068924e-05, "loss": 0.0429, "step": 24550 }, { "grad_norm": 0.43346574902534485, "learning_rate": 8.300508638092972e-05, "loss": 0.0272, "step": 24560 }, { "grad_norm": 0.4381730258464813, "learning_rate": 8.298955797150361e-05, "loss": 0.0346, "step": 24570 }, { "grad_norm": 0.35832634568214417, "learning_rate": 8.297402392506433e-05, "loss": 0.0339, "step": 24580 }, { "grad_norm": 0.3643679916858673, "learning_rate": 8.295848424426617e-05, "loss": 0.0306, "step": 24590 }, { "grad_norm": 0.3634908199310303, "learning_rate": 8.29429389317645e-05, "loss": 0.0298, "step": 24600 }, { "grad_norm": 0.3762493133544922, "learning_rate": 8.292738799021556e-05, "loss": 0.0377, "step": 24610 }, { "grad_norm": 0.3841758370399475, "learning_rate": 8.291183142227656e-05, "loss": 0.0313, "step": 24620 }, { "grad_norm": 0.30083927512168884, "learning_rate": 8.289626923060572e-05, "loss": 0.0503, "step": 24630 }, { "grad_norm": 0.5198475122451782, "learning_rate": 8.288070141786218e-05, "loss": 0.0416, "step": 24640 }, { "grad_norm": 0.32850995659828186, "learning_rate": 8.286512798670605e-05, "loss": 0.0425, "step": 24650 }, { "grad_norm": 0.4832904636859894, "learning_rate": 8.284954893979842e-05, "loss": 0.0424, "step": 24660 }, { "grad_norm": 0.2543092370033264, "learning_rate": 8.283396427980131e-05, "loss": 0.0451, "step": 24670 }, { "grad_norm": 0.3743324875831604, "learning_rate": 8.281837400937771e-05, "loss": 0.0426, "step": 24680 }, { "grad_norm": 0.4454415440559387, "learning_rate": 8.28027781311916e-05, "loss": 0.0368, "step": 24690 }, { "grad_norm": 0.33765774965286255, "learning_rate": 8.278717664790785e-05, "loss": 0.0374, "step": 24700 }, { "grad_norm": 0.40240564942359924, "learning_rate": 8.277156956219234e-05, "loss": 0.0428, "step": 24710 }, { "grad_norm": 0.4891471266746521, "learning_rate": 8.275595687671189e-05, "loss": 0.0309, "step": 24720 }, { "grad_norm": 0.48558205366134644, "learning_rate": 8.27403385941343e-05, "loss": 0.036, "step": 24730 }, { "grad_norm": 0.41785165667533875, "learning_rate": 8.272471471712828e-05, "loss": 0.0417, "step": 24740 }, { "grad_norm": 0.25755226612091064, "learning_rate": 8.270908524836355e-05, "loss": 0.0392, "step": 24750 }, { "grad_norm": 0.4035930037498474, "learning_rate": 8.269345019051074e-05, "loss": 0.0491, "step": 24760 }, { "grad_norm": 0.3428150713443756, "learning_rate": 8.267780954624147e-05, "loss": 0.0403, "step": 24770 }, { "grad_norm": 0.29328015446662903, "learning_rate": 8.266216331822827e-05, "loss": 0.0386, "step": 24780 }, { "grad_norm": 0.45701369643211365, "learning_rate": 8.264651150914469e-05, "loss": 0.0398, "step": 24790 }, { "grad_norm": 0.3055519163608551, "learning_rate": 8.263085412166517e-05, "loss": 0.0298, "step": 24800 }, { "grad_norm": 0.33656013011932373, "learning_rate": 8.261519115846514e-05, "loss": 0.0314, "step": 24810 }, { "grad_norm": 0.3619993031024933, "learning_rate": 8.259952262222096e-05, "loss": 0.0321, "step": 24820 }, { "grad_norm": 0.33390721678733826, "learning_rate": 8.258384851560997e-05, "loss": 0.0385, "step": 24830 }, { "grad_norm": 0.40049368143081665, "learning_rate": 8.256816884131044e-05, "loss": 0.041, "step": 24840 }, { "grad_norm": 0.46868622303009033, "learning_rate": 8.255248360200159e-05, "loss": 0.0378, "step": 24850 }, { "grad_norm": 0.3489411771297455, "learning_rate": 8.253679280036359e-05, "loss": 0.0323, "step": 24860 }, { "grad_norm": 0.6569962501525879, "learning_rate": 8.252109643907762e-05, "loss": 0.052, "step": 24870 }, { "grad_norm": 0.3263883590698242, "learning_rate": 8.250539452082569e-05, "loss": 0.0342, "step": 24880 }, { "grad_norm": 0.5736626386642456, "learning_rate": 8.248968704829087e-05, "loss": 0.0365, "step": 24890 }, { "grad_norm": 0.41664713621139526, "learning_rate": 8.247397402415714e-05, "loss": 0.0383, "step": 24900 }, { "grad_norm": 0.3453262448310852, "learning_rate": 8.24582554511094e-05, "loss": 0.0383, "step": 24910 }, { "grad_norm": 0.33468160033226013, "learning_rate": 8.244253133183355e-05, "loss": 0.0325, "step": 24920 }, { "grad_norm": 0.3169734477996826, "learning_rate": 8.24268016690164e-05, "loss": 0.0405, "step": 24930 }, { "grad_norm": 0.5753021240234375, "learning_rate": 8.241106646534571e-05, "loss": 0.0402, "step": 24940 }, { "grad_norm": 0.5624960660934448, "learning_rate": 8.23953257235102e-05, "loss": 0.0393, "step": 24950 }, { "grad_norm": 0.35657691955566406, "learning_rate": 8.237957944619956e-05, "loss": 0.0392, "step": 24960 }, { "grad_norm": 0.37869027256965637, "learning_rate": 8.236382763610437e-05, "loss": 0.0383, "step": 24970 }, { "grad_norm": 0.43884631991386414, "learning_rate": 8.234807029591619e-05, "loss": 0.0379, "step": 24980 }, { "grad_norm": 0.5654921531677246, "learning_rate": 8.233230742832752e-05, "loss": 0.051, "step": 24990 }, { "grad_norm": 0.5136704444885254, "learning_rate": 8.231653903603178e-05, "loss": 0.0441, "step": 25000 }, { "grad_norm": 0.4359566569328308, "learning_rate": 8.23007651217234e-05, "loss": 0.0383, "step": 25010 }, { "grad_norm": 0.42622384428977966, "learning_rate": 8.228498568809769e-05, "loss": 0.0397, "step": 25020 }, { "grad_norm": 0.6241799592971802, "learning_rate": 8.22692007378509e-05, "loss": 0.0473, "step": 25030 }, { "grad_norm": 0.3639211356639862, "learning_rate": 8.225341027368028e-05, "loss": 0.0348, "step": 25040 }, { "grad_norm": 0.30905213952064514, "learning_rate": 8.223761429828399e-05, "loss": 0.0374, "step": 25050 }, { "grad_norm": 0.32510414719581604, "learning_rate": 8.22218128143611e-05, "loss": 0.037, "step": 25060 }, { "grad_norm": 0.4039130210876465, "learning_rate": 8.220600582461166e-05, "loss": 0.0369, "step": 25070 }, { "grad_norm": 0.2976377308368683, "learning_rate": 8.219019333173668e-05, "loss": 0.0409, "step": 25080 }, { "grad_norm": 0.5296673774719238, "learning_rate": 8.217437533843805e-05, "loss": 0.0459, "step": 25090 }, { "grad_norm": 0.3041204512119293, "learning_rate": 8.215855184741867e-05, "loss": 0.0363, "step": 25100 }, { "grad_norm": 0.3187488615512848, "learning_rate": 8.21427228613823e-05, "loss": 0.0415, "step": 25110 }, { "grad_norm": 0.3907845914363861, "learning_rate": 8.21268883830337e-05, "loss": 0.0443, "step": 25120 }, { "grad_norm": 0.503295361995697, "learning_rate": 8.211104841507855e-05, "loss": 0.0472, "step": 25130 }, { "grad_norm": 0.28870919346809387, "learning_rate": 8.209520296022346e-05, "loss": 0.038, "step": 25140 }, { "grad_norm": 0.4237841069698334, "learning_rate": 8.207935202117599e-05, "loss": 0.0335, "step": 25150 }, { "grad_norm": 0.2978396713733673, "learning_rate": 8.206349560064463e-05, "loss": 0.036, "step": 25160 }, { "grad_norm": 0.4225586950778961, "learning_rate": 8.204763370133881e-05, "loss": 0.0299, "step": 25170 }, { "grad_norm": 0.5689405202865601, "learning_rate": 8.203176632596892e-05, "loss": 0.0378, "step": 25180 }, { "grad_norm": 0.4877204895019531, "learning_rate": 8.20158934772462e-05, "loss": 0.0369, "step": 25190 }, { "grad_norm": 0.30801910161972046, "learning_rate": 8.200001515788294e-05, "loss": 0.0368, "step": 25200 }, { "grad_norm": 0.41173872351646423, "learning_rate": 8.198413137059228e-05, "loss": 0.042, "step": 25210 }, { "grad_norm": 0.27922579646110535, "learning_rate": 8.196824211808835e-05, "loss": 0.0312, "step": 25220 }, { "grad_norm": 0.36899274587631226, "learning_rate": 8.195234740308617e-05, "loss": 0.0417, "step": 25230 }, { "grad_norm": 0.3122774362564087, "learning_rate": 8.193644722830171e-05, "loss": 0.0287, "step": 25240 }, { "grad_norm": 0.39229249954223633, "learning_rate": 8.19205415964519e-05, "loss": 0.0332, "step": 25250 }, { "grad_norm": 0.3475547730922699, "learning_rate": 8.190463051025456e-05, "loss": 0.0362, "step": 25260 }, { "grad_norm": 0.4692322313785553, "learning_rate": 8.188871397242843e-05, "loss": 0.0426, "step": 25270 }, { "grad_norm": 0.30388185381889343, "learning_rate": 8.187279198569326e-05, "loss": 0.0347, "step": 25280 }, { "grad_norm": 0.3290528357028961, "learning_rate": 8.185686455276966e-05, "loss": 0.0435, "step": 25290 }, { "grad_norm": 0.2765262722969055, "learning_rate": 8.184093167637921e-05, "loss": 0.0446, "step": 25300 }, { "grad_norm": 0.3158276677131653, "learning_rate": 8.182499335924437e-05, "loss": 0.0464, "step": 25310 }, { "grad_norm": 0.42659783363342285, "learning_rate": 8.18090496040886e-05, "loss": 0.0336, "step": 25320 }, { "grad_norm": 0.3188697397708893, "learning_rate": 8.179310041363621e-05, "loss": 0.0407, "step": 25330 }, { "grad_norm": 0.4821057617664337, "learning_rate": 8.17771457906125e-05, "loss": 0.0305, "step": 25340 }, { "grad_norm": 0.4027799367904663, "learning_rate": 8.176118573774371e-05, "loss": 0.0365, "step": 25350 }, { "grad_norm": 0.5404442548751831, "learning_rate": 8.174522025775692e-05, "loss": 0.0474, "step": 25360 }, { "grad_norm": 0.5216759443283081, "learning_rate": 8.172924935338022e-05, "loss": 0.0403, "step": 25370 }, { "grad_norm": 0.8483458757400513, "learning_rate": 8.171327302734262e-05, "loss": 0.0438, "step": 25380 }, { "grad_norm": 0.3685353398323059, "learning_rate": 8.169729128237401e-05, "loss": 0.0314, "step": 25390 }, { "grad_norm": 0.32151415944099426, "learning_rate": 8.168130412120525e-05, "loss": 0.0327, "step": 25400 }, { "grad_norm": 0.3560349941253662, "learning_rate": 8.16653115465681e-05, "loss": 0.0395, "step": 25410 }, { "grad_norm": 0.4066050052642822, "learning_rate": 8.164931356119526e-05, "loss": 0.033, "step": 25420 }, { "grad_norm": 0.46920129656791687, "learning_rate": 8.163331016782032e-05, "loss": 0.0462, "step": 25430 }, { "grad_norm": 0.4150952398777008, "learning_rate": 8.161730136917785e-05, "loss": 0.0269, "step": 25440 }, { "grad_norm": 0.5608559250831604, "learning_rate": 8.160128716800333e-05, "loss": 0.0447, "step": 25450 }, { "grad_norm": 0.38095369935035706, "learning_rate": 8.158526756703313e-05, "loss": 0.0337, "step": 25460 }, { "grad_norm": 0.41329896450042725, "learning_rate": 8.156924256900455e-05, "loss": 0.0412, "step": 25470 }, { "grad_norm": 0.3910927176475525, "learning_rate": 8.155321217665584e-05, "loss": 0.0325, "step": 25480 }, { "grad_norm": 0.427280068397522, "learning_rate": 8.153717639272614e-05, "loss": 0.0312, "step": 25490 }, { "grad_norm": 0.4155002534389496, "learning_rate": 8.152113521995555e-05, "loss": 0.0421, "step": 25500 }, { "grad_norm": 0.34978511929512024, "learning_rate": 8.150508866108505e-05, "loss": 0.036, "step": 25510 }, { "grad_norm": 0.4637085795402527, "learning_rate": 8.148903671885657e-05, "loss": 0.0306, "step": 25520 }, { "grad_norm": 0.42415398359298706, "learning_rate": 8.147297939601292e-05, "loss": 0.0388, "step": 25530 }, { "grad_norm": 0.47402602434158325, "learning_rate": 8.145691669529792e-05, "loss": 0.0372, "step": 25540 }, { "grad_norm": 0.43341711163520813, "learning_rate": 8.144084861945618e-05, "loss": 0.0357, "step": 25550 }, { "grad_norm": 0.4487268924713135, "learning_rate": 8.142477517123333e-05, "loss": 0.0391, "step": 25560 }, { "grad_norm": 0.44505932927131653, "learning_rate": 8.140869635337586e-05, "loss": 0.039, "step": 25570 }, { "grad_norm": 0.39345622062683105, "learning_rate": 8.139261216863123e-05, "loss": 0.0428, "step": 25580 }, { "grad_norm": 0.3580279052257538, "learning_rate": 8.137652261974776e-05, "loss": 0.0338, "step": 25590 }, { "grad_norm": 0.46042633056640625, "learning_rate": 8.136042770947472e-05, "loss": 0.0389, "step": 25600 }, { "grad_norm": 0.3524976372718811, "learning_rate": 8.134432744056228e-05, "loss": 0.0402, "step": 25610 }, { "grad_norm": 0.4357067942619324, "learning_rate": 8.132822181576158e-05, "loss": 0.0414, "step": 25620 }, { "grad_norm": 0.33910831809043884, "learning_rate": 8.131211083782459e-05, "loss": 0.0432, "step": 25630 }, { "grad_norm": 0.35045161843299866, "learning_rate": 8.129599450950424e-05, "loss": 0.0359, "step": 25640 }, { "grad_norm": 0.4278032183647156, "learning_rate": 8.127987283355438e-05, "loss": 0.0436, "step": 25650 }, { "grad_norm": 0.3205428123474121, "learning_rate": 8.126374581272976e-05, "loss": 0.0274, "step": 25660 }, { "grad_norm": 0.5149857997894287, "learning_rate": 8.124761344978605e-05, "loss": 0.0506, "step": 25670 }, { "grad_norm": 0.4391191899776459, "learning_rate": 8.12314757474798e-05, "loss": 0.0317, "step": 25680 }, { "grad_norm": 0.40977999567985535, "learning_rate": 8.121533270856856e-05, "loss": 0.041, "step": 25690 }, { "grad_norm": 0.3419661819934845, "learning_rate": 8.119918433581069e-05, "loss": 0.0323, "step": 25700 }, { "grad_norm": 0.5694565176963806, "learning_rate": 8.118303063196551e-05, "loss": 0.0333, "step": 25710 }, { "grad_norm": 0.343578577041626, "learning_rate": 8.116687159979326e-05, "loss": 0.0302, "step": 25720 }, { "grad_norm": 0.38532090187072754, "learning_rate": 8.115070724205508e-05, "loss": 0.0404, "step": 25730 }, { "grad_norm": 0.4677465558052063, "learning_rate": 8.113453756151296e-05, "loss": 0.0354, "step": 25740 }, { "grad_norm": 0.8903543949127197, "learning_rate": 8.111836256092995e-05, "loss": 0.0529, "step": 25750 }, { "grad_norm": 0.42492789030075073, "learning_rate": 8.110218224306985e-05, "loss": 0.0354, "step": 25760 }, { "grad_norm": 0.3811917304992676, "learning_rate": 8.108599661069745e-05, "loss": 0.0372, "step": 25770 }, { "grad_norm": 0.4262034296989441, "learning_rate": 8.106980566657845e-05, "loss": 0.0319, "step": 25780 }, { "grad_norm": 0.29677459597587585, "learning_rate": 8.10536094134794e-05, "loss": 0.0352, "step": 25790 }, { "grad_norm": 0.26246002316474915, "learning_rate": 8.103740785416783e-05, "loss": 0.0335, "step": 25800 }, { "grad_norm": 0.3740750551223755, "learning_rate": 8.102120099141212e-05, "loss": 0.0311, "step": 25810 }, { "grad_norm": 0.32604023814201355, "learning_rate": 8.100498882798163e-05, "loss": 0.0347, "step": 25820 }, { "grad_norm": 0.39554983377456665, "learning_rate": 8.09887713666465e-05, "loss": 0.0342, "step": 25830 }, { "grad_norm": 0.3602917194366455, "learning_rate": 8.09725486101779e-05, "loss": 0.0363, "step": 25840 }, { "grad_norm": 0.33782511949539185, "learning_rate": 8.095632056134784e-05, "loss": 0.0373, "step": 25850 }, { "grad_norm": 0.2845567762851715, "learning_rate": 8.094008722292925e-05, "loss": 0.0352, "step": 25860 }, { "grad_norm": 0.34059929847717285, "learning_rate": 8.092384859769598e-05, "loss": 0.0344, "step": 25870 }, { "grad_norm": 0.4458770453929901, "learning_rate": 8.090760468842275e-05, "loss": 0.0445, "step": 25880 }, { "grad_norm": 0.22779889404773712, "learning_rate": 8.089135549788521e-05, "loss": 0.0368, "step": 25890 }, { "grad_norm": 0.26275190711021423, "learning_rate": 8.087510102885987e-05, "loss": 0.0317, "step": 25900 }, { "grad_norm": 0.29245853424072266, "learning_rate": 8.085884128412422e-05, "loss": 0.0347, "step": 25910 }, { "grad_norm": 0.4870556890964508, "learning_rate": 8.084257626645659e-05, "loss": 0.0331, "step": 25920 }, { "grad_norm": 0.3626016676425934, "learning_rate": 8.08263059786362e-05, "loss": 0.0376, "step": 25930 }, { "grad_norm": 0.4751908779144287, "learning_rate": 8.081003042344325e-05, "loss": 0.0347, "step": 25940 }, { "grad_norm": 0.3667985498905182, "learning_rate": 8.079374960365872e-05, "loss": 0.0389, "step": 25950 }, { "grad_norm": 0.33148738741874695, "learning_rate": 8.077746352206463e-05, "loss": 0.0315, "step": 25960 }, { "grad_norm": 0.5627084374427795, "learning_rate": 8.076117218144377e-05, "loss": 0.0407, "step": 25970 }, { "grad_norm": 0.4650718867778778, "learning_rate": 8.074487558457991e-05, "loss": 0.0374, "step": 25980 }, { "grad_norm": 0.39752474427223206, "learning_rate": 8.072857373425768e-05, "loss": 0.0381, "step": 25990 }, { "grad_norm": 0.45586085319519043, "learning_rate": 8.071226663326264e-05, "loss": 0.033, "step": 26000 }, { "grad_norm": 0.30841758847236633, "learning_rate": 8.069595428438121e-05, "loss": 0.0385, "step": 26010 }, { "grad_norm": 0.44155073165893555, "learning_rate": 8.067963669040072e-05, "loss": 0.0387, "step": 26020 }, { "grad_norm": 0.47612860798835754, "learning_rate": 8.066331385410942e-05, "loss": 0.0319, "step": 26030 }, { "grad_norm": 0.43348461389541626, "learning_rate": 8.064698577829641e-05, "loss": 0.0445, "step": 26040 }, { "grad_norm": 0.3530580401420593, "learning_rate": 8.063065246575175e-05, "loss": 0.0405, "step": 26050 }, { "grad_norm": 0.36171677708625793, "learning_rate": 8.061431391926631e-05, "loss": 0.033, "step": 26060 }, { "grad_norm": 0.25701168179512024, "learning_rate": 8.059797014163195e-05, "loss": 0.0394, "step": 26070 }, { "grad_norm": 0.4206072986125946, "learning_rate": 8.058162113564133e-05, "loss": 0.0498, "step": 26080 }, { "grad_norm": 0.3802116811275482, "learning_rate": 8.056526690408806e-05, "loss": 0.0408, "step": 26090 }, { "grad_norm": 0.31791800260543823, "learning_rate": 8.054890744976666e-05, "loss": 0.0352, "step": 26100 }, { "grad_norm": 0.28830811381340027, "learning_rate": 8.053254277547248e-05, "loss": 0.0297, "step": 26110 }, { "grad_norm": 0.34305715560913086, "learning_rate": 8.051617288400182e-05, "loss": 0.047, "step": 26120 }, { "grad_norm": 0.3132619857788086, "learning_rate": 8.049979777815182e-05, "loss": 0.0346, "step": 26130 }, { "grad_norm": 0.40323781967163086, "learning_rate": 8.048341746072054e-05, "loss": 0.0369, "step": 26140 }, { "grad_norm": 0.3730570077896118, "learning_rate": 8.046703193450696e-05, "loss": 0.0443, "step": 26150 }, { "grad_norm": 0.4121675193309784, "learning_rate": 8.04506412023109e-05, "loss": 0.0334, "step": 26160 }, { "grad_norm": 0.3242807984352112, "learning_rate": 8.043424526693306e-05, "loss": 0.0328, "step": 26170 }, { "grad_norm": 0.35221269726753235, "learning_rate": 8.04178441311751e-05, "loss": 0.0321, "step": 26180 }, { "grad_norm": 0.374254435300827, "learning_rate": 8.04014377978395e-05, "loss": 0.0305, "step": 26190 }, { "grad_norm": 0.4948943555355072, "learning_rate": 8.038502626972967e-05, "loss": 0.0434, "step": 26200 }, { "grad_norm": 0.3365286588668823, "learning_rate": 8.036860954964989e-05, "loss": 0.0383, "step": 26210 }, { "grad_norm": 0.38988253474235535, "learning_rate": 8.035218764040531e-05, "loss": 0.037, "step": 26220 }, { "grad_norm": 0.5695765018463135, "learning_rate": 8.033576054480199e-05, "loss": 0.0413, "step": 26230 }, { "grad_norm": 0.34214234352111816, "learning_rate": 8.031932826564688e-05, "loss": 0.0316, "step": 26240 }, { "grad_norm": 0.44237592816352844, "learning_rate": 8.030289080574782e-05, "loss": 0.0417, "step": 26250 }, { "grad_norm": 0.3610183298587799, "learning_rate": 8.028644816791349e-05, "loss": 0.0409, "step": 26260 }, { "grad_norm": 0.5066478848457336, "learning_rate": 8.027000035495351e-05, "loss": 0.0297, "step": 26270 }, { "grad_norm": 0.5121681690216064, "learning_rate": 8.025354736967836e-05, "loss": 0.046, "step": 26280 }, { "grad_norm": 0.3422638773918152, "learning_rate": 8.023708921489941e-05, "loss": 0.0463, "step": 26290 }, { "grad_norm": 0.44006478786468506, "learning_rate": 8.022062589342887e-05, "loss": 0.0458, "step": 26300 }, { "grad_norm": 0.2945168912410736, "learning_rate": 8.020415740807993e-05, "loss": 0.0459, "step": 26310 }, { "grad_norm": 0.3791096806526184, "learning_rate": 8.018768376166656e-05, "loss": 0.0367, "step": 26320 }, { "grad_norm": 0.4146389365196228, "learning_rate": 8.017120495700368e-05, "loss": 0.0377, "step": 26330 }, { "grad_norm": 0.3904920518398285, "learning_rate": 8.015472099690704e-05, "loss": 0.0314, "step": 26340 }, { "grad_norm": 0.275314599275589, "learning_rate": 8.013823188419332e-05, "loss": 0.03, "step": 26350 }, { "grad_norm": 0.3311101794242859, "learning_rate": 8.012173762168006e-05, "loss": 0.0357, "step": 26360 }, { "grad_norm": 0.33885350823402405, "learning_rate": 8.010523821218567e-05, "loss": 0.0318, "step": 26370 }, { "grad_norm": 0.37810903787612915, "learning_rate": 8.008873365852945e-05, "loss": 0.0399, "step": 26380 }, { "grad_norm": 0.38300254940986633, "learning_rate": 8.007222396353157e-05, "loss": 0.0426, "step": 26390 }, { "grad_norm": 0.4294455051422119, "learning_rate": 8.00557091300131e-05, "loss": 0.0339, "step": 26400 }, { "grad_norm": 0.38035863637924194, "learning_rate": 8.003918916079597e-05, "loss": 0.0368, "step": 26410 }, { "grad_norm": 0.29583820700645447, "learning_rate": 8.002266405870298e-05, "loss": 0.0286, "step": 26420 }, { "grad_norm": 0.28300175070762634, "learning_rate": 8.000613382655782e-05, "loss": 0.0398, "step": 26430 }, { "grad_norm": 0.31404662132263184, "learning_rate": 7.998959846718505e-05, "loss": 0.0366, "step": 26440 }, { "grad_norm": 0.36267757415771484, "learning_rate": 7.997305798341012e-05, "loss": 0.0413, "step": 26450 }, { "grad_norm": 0.3565422594547272, "learning_rate": 7.995651237805937e-05, "loss": 0.0403, "step": 26460 }, { "grad_norm": 0.6282119750976562, "learning_rate": 7.993996165395996e-05, "loss": 0.0301, "step": 26470 }, { "grad_norm": 0.4390592575073242, "learning_rate": 7.992340581393996e-05, "loss": 0.0364, "step": 26480 }, { "grad_norm": 0.37623170018196106, "learning_rate": 7.990684486082831e-05, "loss": 0.0362, "step": 26490 }, { "grad_norm": 0.5275106430053711, "learning_rate": 7.989027879745482e-05, "loss": 0.0412, "step": 26500 }, { "grad_norm": 0.4501206576824188, "learning_rate": 7.98737076266502e-05, "loss": 0.0419, "step": 26510 }, { "grad_norm": 0.26376646757125854, "learning_rate": 7.985713135124598e-05, "loss": 0.0371, "step": 26520 }, { "grad_norm": 0.43551644682884216, "learning_rate": 7.98405499740746e-05, "loss": 0.0362, "step": 26530 }, { "grad_norm": 0.37836751341819763, "learning_rate": 7.98239634979694e-05, "loss": 0.0387, "step": 26540 }, { "grad_norm": 0.39166536927223206, "learning_rate": 7.98073719257645e-05, "loss": 0.0363, "step": 26550 }, { "grad_norm": 0.3377719223499298, "learning_rate": 7.979077526029499e-05, "loss": 0.0373, "step": 26560 }, { "grad_norm": 0.31327372789382935, "learning_rate": 7.977417350439675e-05, "loss": 0.0342, "step": 26570 }, { "grad_norm": 0.434072881937027, "learning_rate": 7.97575666609066e-05, "loss": 0.035, "step": 26580 }, { "grad_norm": 0.48381832242012024, "learning_rate": 7.974095473266216e-05, "loss": 0.0353, "step": 26590 }, { "grad_norm": 0.6166089177131653, "learning_rate": 7.972433772250198e-05, "loss": 0.0412, "step": 26600 }, { "grad_norm": 0.4325288236141205, "learning_rate": 7.970771563326544e-05, "loss": 0.0369, "step": 26610 }, { "grad_norm": 0.4636719524860382, "learning_rate": 7.96910884677928e-05, "loss": 0.0442, "step": 26620 }, { "grad_norm": 0.4158759117126465, "learning_rate": 7.967445622892523e-05, "loss": 0.0402, "step": 26630 }, { "grad_norm": 0.3010777533054352, "learning_rate": 7.965781891950465e-05, "loss": 0.0338, "step": 26640 }, { "grad_norm": 0.3316950798034668, "learning_rate": 7.964117654237397e-05, "loss": 0.0292, "step": 26650 }, { "grad_norm": 0.4341413378715515, "learning_rate": 7.962452910037692e-05, "loss": 0.0371, "step": 26660 }, { "grad_norm": 0.45837464928627014, "learning_rate": 7.96078765963581e-05, "loss": 0.0341, "step": 26670 }, { "grad_norm": 0.4746483266353607, "learning_rate": 7.95912190331629e-05, "loss": 0.0376, "step": 26680 }, { "grad_norm": 0.44059813022613525, "learning_rate": 7.957455641363772e-05, "loss": 0.0337, "step": 26690 }, { "grad_norm": 0.4280632734298706, "learning_rate": 7.955788874062968e-05, "loss": 0.0474, "step": 26700 }, { "grad_norm": 0.34973105788230896, "learning_rate": 7.95412160169869e-05, "loss": 0.0388, "step": 26710 }, { "grad_norm": 0.3681664764881134, "learning_rate": 7.952453824555824e-05, "loss": 0.034, "step": 26720 }, { "grad_norm": 0.3301907181739807, "learning_rate": 7.95078554291935e-05, "loss": 0.0269, "step": 26730 }, { "grad_norm": 0.3936743438243866, "learning_rate": 7.94911675707433e-05, "loss": 0.0287, "step": 26740 }, { "grad_norm": 0.40615114569664, "learning_rate": 7.947447467305915e-05, "loss": 0.0422, "step": 26750 }, { "grad_norm": 0.3026663661003113, "learning_rate": 7.94577767389934e-05, "loss": 0.0342, "step": 26760 }, { "grad_norm": 0.4933186173439026, "learning_rate": 7.944107377139928e-05, "loss": 0.0391, "step": 26770 }, { "grad_norm": 0.38944900035858154, "learning_rate": 7.942436577313088e-05, "loss": 0.0445, "step": 26780 }, { "grad_norm": 0.3180173933506012, "learning_rate": 7.940765274704312e-05, "loss": 0.0424, "step": 26790 }, { "grad_norm": 0.3308948278427124, "learning_rate": 7.939093469599181e-05, "loss": 0.0394, "step": 26800 }, { "grad_norm": 0.3333880603313446, "learning_rate": 7.93742116228336e-05, "loss": 0.0381, "step": 26810 }, { "grad_norm": 0.35067257285118103, "learning_rate": 7.935748353042602e-05, "loss": 0.042, "step": 26820 }, { "grad_norm": 0.4005313813686371, "learning_rate": 7.934075042162744e-05, "loss": 0.0351, "step": 26830 }, { "grad_norm": 0.5148031115531921, "learning_rate": 7.932401229929705e-05, "loss": 0.0381, "step": 26840 }, { "grad_norm": 0.4606221616268158, "learning_rate": 7.9307269166295e-05, "loss": 0.0391, "step": 26850 }, { "grad_norm": 0.30706986784935, "learning_rate": 7.92905210254822e-05, "loss": 0.0417, "step": 26860 }, { "grad_norm": 0.39857640862464905, "learning_rate": 7.927376787972045e-05, "loss": 0.0355, "step": 26870 }, { "grad_norm": 0.37111756205558777, "learning_rate": 7.92570097318724e-05, "loss": 0.0431, "step": 26880 }, { "grad_norm": 0.32073602080345154, "learning_rate": 7.924024658480158e-05, "loss": 0.0318, "step": 26890 }, { "grad_norm": 0.4689770042896271, "learning_rate": 7.922347844137233e-05, "loss": 0.0283, "step": 26900 }, { "grad_norm": 0.2342590093612671, "learning_rate": 7.92067053044499e-05, "loss": 0.0335, "step": 26910 }, { "grad_norm": 0.45231279730796814, "learning_rate": 7.918992717690031e-05, "loss": 0.0391, "step": 26920 }, { "grad_norm": 0.38893750309944153, "learning_rate": 7.917314406159053e-05, "loss": 0.0381, "step": 26930 }, { "grad_norm": 0.504094660282135, "learning_rate": 7.915635596138832e-05, "loss": 0.0477, "step": 26940 }, { "grad_norm": 0.30635425448417664, "learning_rate": 7.913956287916228e-05, "loss": 0.0386, "step": 26950 }, { "grad_norm": 0.27683597803115845, "learning_rate": 7.912276481778193e-05, "loss": 0.0282, "step": 26960 }, { "grad_norm": 0.27924180030822754, "learning_rate": 7.910596178011759e-05, "loss": 0.034, "step": 26970 }, { "grad_norm": 0.26654231548309326, "learning_rate": 7.908915376904043e-05, "loss": 0.0304, "step": 26980 }, { "grad_norm": 0.40861332416534424, "learning_rate": 7.907234078742247e-05, "loss": 0.0315, "step": 26990 }, { "grad_norm": 0.6609748601913452, "learning_rate": 7.90555228381366e-05, "loss": 0.0282, "step": 27000 }, { "grad_norm": 0.357246071100235, "learning_rate": 7.903869992405656e-05, "loss": 0.0306, "step": 27010 }, { "grad_norm": 0.33751389384269714, "learning_rate": 7.902187204805691e-05, "loss": 0.0381, "step": 27020 }, { "grad_norm": 0.3731074631214142, "learning_rate": 7.900503921301308e-05, "loss": 0.035, "step": 27030 }, { "grad_norm": 0.5328174829483032, "learning_rate": 7.898820142180133e-05, "loss": 0.0524, "step": 27040 }, { "grad_norm": 0.3950950503349304, "learning_rate": 7.897135867729879e-05, "loss": 0.0346, "step": 27050 }, { "grad_norm": 0.23663921654224396, "learning_rate": 7.89545109823834e-05, "loss": 0.0291, "step": 27060 }, { "grad_norm": 0.2925245761871338, "learning_rate": 7.8937658339934e-05, "loss": 0.033, "step": 27070 }, { "grad_norm": 0.307487428188324, "learning_rate": 7.892080075283026e-05, "loss": 0.0282, "step": 27080 }, { "grad_norm": 0.368296355009079, "learning_rate": 7.890393822395263e-05, "loss": 0.0343, "step": 27090 }, { "grad_norm": 0.46705546975135803, "learning_rate": 7.88870707561825e-05, "loss": 0.0375, "step": 27100 }, { "grad_norm": 0.49634918570518494, "learning_rate": 7.887019835240203e-05, "loss": 0.0369, "step": 27110 }, { "grad_norm": 0.4288693964481354, "learning_rate": 7.885332101549427e-05, "loss": 0.0344, "step": 27120 }, { "grad_norm": 0.3951151371002197, "learning_rate": 7.883643874834308e-05, "loss": 0.0357, "step": 27130 }, { "grad_norm": 0.35374823212623596, "learning_rate": 7.881955155383321e-05, "loss": 0.0422, "step": 27140 }, { "grad_norm": 0.4123232960700989, "learning_rate": 7.880265943485017e-05, "loss": 0.0389, "step": 27150 }, { "grad_norm": 0.2684786319732666, "learning_rate": 7.878576239428038e-05, "loss": 0.0345, "step": 27160 }, { "grad_norm": 0.5255995392799377, "learning_rate": 7.87688604350111e-05, "loss": 0.0531, "step": 27170 }, { "grad_norm": 0.5111618638038635, "learning_rate": 7.875195355993042e-05, "loss": 0.0368, "step": 27180 }, { "grad_norm": 0.37665197253227234, "learning_rate": 7.873504177192724e-05, "loss": 0.0411, "step": 27190 }, { "grad_norm": 0.34959208965301514, "learning_rate": 7.87181250738913e-05, "loss": 0.0403, "step": 27200 }, { "grad_norm": 0.5266463160514832, "learning_rate": 7.870120346871324e-05, "loss": 0.0405, "step": 27210 }, { "grad_norm": 0.3885047137737274, "learning_rate": 7.86842769592845e-05, "loss": 0.0342, "step": 27220 }, { "grad_norm": 0.4307662546634674, "learning_rate": 7.866734554849732e-05, "loss": 0.0327, "step": 27230 }, { "grad_norm": 0.3997863233089447, "learning_rate": 7.865040923924486e-05, "loss": 0.0339, "step": 27240 }, { "grad_norm": 0.4465568959712982, "learning_rate": 7.863346803442104e-05, "loss": 0.0401, "step": 27250 }, { "grad_norm": 0.4706557095050812, "learning_rate": 7.861652193692067e-05, "loss": 0.0347, "step": 27260 }, { "grad_norm": 0.3494054675102234, "learning_rate": 7.859957094963937e-05, "loss": 0.0269, "step": 27270 }, { "grad_norm": 0.302289217710495, "learning_rate": 7.858261507547357e-05, "loss": 0.0315, "step": 27280 }, { "grad_norm": 0.2960748076438904, "learning_rate": 7.856565431732061e-05, "loss": 0.0286, "step": 27290 }, { "grad_norm": 0.3771667182445526, "learning_rate": 7.854868867807859e-05, "loss": 0.028, "step": 27300 }, { "grad_norm": 0.4640548825263977, "learning_rate": 7.85317181606465e-05, "loss": 0.0386, "step": 27310 }, { "grad_norm": 0.33612626791000366, "learning_rate": 7.85147427679241e-05, "loss": 0.029, "step": 27320 }, { "grad_norm": 0.2911970615386963, "learning_rate": 7.849776250281205e-05, "loss": 0.0256, "step": 27330 }, { "grad_norm": 0.5392832159996033, "learning_rate": 7.84807773682118e-05, "loss": 0.0341, "step": 27340 }, { "grad_norm": 0.23192404210567474, "learning_rate": 7.846378736702565e-05, "loss": 0.0318, "step": 27350 }, { "grad_norm": 0.8128169775009155, "learning_rate": 7.844679250215671e-05, "loss": 0.0467, "step": 27360 }, { "grad_norm": 0.21435977518558502, "learning_rate": 7.842979277650898e-05, "loss": 0.0349, "step": 27370 }, { "grad_norm": 0.41452282667160034, "learning_rate": 7.84127881929872e-05, "loss": 0.0333, "step": 27380 }, { "grad_norm": 0.3328341543674469, "learning_rate": 7.839577875449704e-05, "loss": 0.0389, "step": 27390 }, { "grad_norm": 0.4174051284790039, "learning_rate": 7.837876446394489e-05, "loss": 0.0315, "step": 27400 }, { "grad_norm": 0.3233720660209656, "learning_rate": 7.836174532423805e-05, "loss": 0.0454, "step": 27410 }, { "grad_norm": 0.4265144467353821, "learning_rate": 7.834472133828466e-05, "loss": 0.0321, "step": 27420 }, { "grad_norm": 0.47449350357055664, "learning_rate": 7.832769250899359e-05, "loss": 0.0415, "step": 27430 }, { "grad_norm": 0.47426357865333557, "learning_rate": 7.831065883927464e-05, "loss": 0.0443, "step": 27440 }, { "grad_norm": 0.46108075976371765, "learning_rate": 7.829362033203841e-05, "loss": 0.0384, "step": 27450 }, { "grad_norm": 0.47269177436828613, "learning_rate": 7.827657699019628e-05, "loss": 0.041, "step": 27460 }, { "grad_norm": 0.4484047293663025, "learning_rate": 7.825952881666052e-05, "loss": 0.0535, "step": 27470 }, { "grad_norm": 0.39067837595939636, "learning_rate": 7.824247581434418e-05, "loss": 0.035, "step": 27480 }, { "grad_norm": 0.3793664574623108, "learning_rate": 7.822541798616116e-05, "loss": 0.0358, "step": 27490 }, { "grad_norm": 0.45740455389022827, "learning_rate": 7.820835533502617e-05, "loss": 0.0436, "step": 27500 }, { "grad_norm": 0.42526957392692566, "learning_rate": 7.819128786385475e-05, "loss": 0.0345, "step": 27510 }, { "grad_norm": 0.28437837958335876, "learning_rate": 7.817421557556329e-05, "loss": 0.0413, "step": 27520 }, { "grad_norm": 0.49365681409835815, "learning_rate": 7.815713847306893e-05, "loss": 0.0444, "step": 27530 }, { "grad_norm": 0.3506441116333008, "learning_rate": 7.81400565592897e-05, "loss": 0.0339, "step": 27540 }, { "grad_norm": 0.37809503078460693, "learning_rate": 7.812296983714444e-05, "loss": 0.0432, "step": 27550 }, { "grad_norm": 0.2446260005235672, "learning_rate": 7.810587830955281e-05, "loss": 0.034, "step": 27560 }, { "grad_norm": 0.4154547154903412, "learning_rate": 7.808878197943528e-05, "loss": 0.0408, "step": 27570 }, { "grad_norm": 0.30511343479156494, "learning_rate": 7.807168084971312e-05, "loss": 0.0298, "step": 27580 }, { "grad_norm": 0.40037068724632263, "learning_rate": 7.805457492330849e-05, "loss": 0.0333, "step": 27590 }, { "grad_norm": 0.2935405969619751, "learning_rate": 7.803746420314428e-05, "loss": 0.0352, "step": 27600 }, { "grad_norm": 0.5615595579147339, "learning_rate": 7.802034869214428e-05, "loss": 0.0414, "step": 27610 }, { "grad_norm": 0.3793192207813263, "learning_rate": 7.800322839323303e-05, "loss": 0.0317, "step": 27620 }, { "grad_norm": 0.32634010910987854, "learning_rate": 7.798610330933593e-05, "loss": 0.0338, "step": 27630 }, { "grad_norm": 0.2923886477947235, "learning_rate": 7.796897344337922e-05, "loss": 0.0374, "step": 27640 }, { "grad_norm": 0.4231252670288086, "learning_rate": 7.795183879828989e-05, "loss": 0.0372, "step": 27650 }, { "grad_norm": 0.3485090136528015, "learning_rate": 7.793469937699579e-05, "loss": 0.0457, "step": 27660 }, { "grad_norm": 0.3870183527469635, "learning_rate": 7.791755518242558e-05, "loss": 0.0437, "step": 27670 }, { "grad_norm": 0.5108955502510071, "learning_rate": 7.790040621750876e-05, "loss": 0.0404, "step": 27680 }, { "grad_norm": 0.36856546998023987, "learning_rate": 7.788325248517558e-05, "loss": 0.0338, "step": 27690 }, { "grad_norm": 0.3102523386478424, "learning_rate": 7.786609398835715e-05, "loss": 0.0374, "step": 27700 }, { "grad_norm": 0.40978410840034485, "learning_rate": 7.784893072998541e-05, "loss": 0.0342, "step": 27710 }, { "grad_norm": 0.36289820075035095, "learning_rate": 7.783176271299306e-05, "loss": 0.034, "step": 27720 }, { "grad_norm": 0.3209196925163269, "learning_rate": 7.781458994031368e-05, "loss": 0.0271, "step": 27730 }, { "grad_norm": 0.48333272337913513, "learning_rate": 7.779741241488161e-05, "loss": 0.0357, "step": 27740 }, { "grad_norm": 0.5137066841125488, "learning_rate": 7.7780230139632e-05, "loss": 0.0351, "step": 27750 }, { "grad_norm": 0.3065577745437622, "learning_rate": 7.776304311750087e-05, "loss": 0.0269, "step": 27760 }, { "grad_norm": 0.538612425327301, "learning_rate": 7.7745851351425e-05, "loss": 0.0449, "step": 27770 }, { "grad_norm": 0.32117417454719543, "learning_rate": 7.772865484434197e-05, "loss": 0.0345, "step": 27780 }, { "grad_norm": 0.46304357051849365, "learning_rate": 7.77114535991902e-05, "loss": 0.0465, "step": 27790 }, { "grad_norm": 0.44618532061576843, "learning_rate": 7.769424761890893e-05, "loss": 0.0374, "step": 27800 }, { "grad_norm": 0.42586687207221985, "learning_rate": 7.767703690643817e-05, "loss": 0.0368, "step": 27810 }, { "grad_norm": 0.3043377101421356, "learning_rate": 7.76598214647188e-05, "loss": 0.0346, "step": 27820 }, { "grad_norm": 0.3963184654712677, "learning_rate": 7.764260129669241e-05, "loss": 0.0346, "step": 27830 }, { "grad_norm": 0.3672337532043457, "learning_rate": 7.76253764053015e-05, "loss": 0.0487, "step": 27840 }, { "grad_norm": 0.42359739542007446, "learning_rate": 7.760814679348932e-05, "loss": 0.0345, "step": 27850 }, { "grad_norm": 0.31714633107185364, "learning_rate": 7.759091246419992e-05, "loss": 0.0322, "step": 27860 }, { "grad_norm": 0.8422105312347412, "learning_rate": 7.75736734203782e-05, "loss": 0.0373, "step": 27870 }, { "grad_norm": 0.5081814527511597, "learning_rate": 7.755642966496985e-05, "loss": 0.0432, "step": 27880 }, { "grad_norm": 0.3416920006275177, "learning_rate": 7.753918120092132e-05, "loss": 0.0401, "step": 27890 }, { "grad_norm": 0.24751979112625122, "learning_rate": 7.752192803117993e-05, "loss": 0.034, "step": 27900 }, { "grad_norm": 0.42375892400741577, "learning_rate": 7.750467015869377e-05, "loss": 0.0386, "step": 27910 }, { "grad_norm": 0.3303353190422058, "learning_rate": 7.748740758641174e-05, "loss": 0.0372, "step": 27920 }, { "grad_norm": 0.3891981542110443, "learning_rate": 7.74701403172835e-05, "loss": 0.04, "step": 27930 }, { "grad_norm": 0.31414374709129333, "learning_rate": 7.745286835425962e-05, "loss": 0.0404, "step": 27940 }, { "grad_norm": 0.4916878640651703, "learning_rate": 7.743559170029138e-05, "loss": 0.0395, "step": 27950 }, { "grad_norm": 0.33915799856185913, "learning_rate": 7.741831035833087e-05, "loss": 0.0434, "step": 27960 }, { "grad_norm": 0.3251585364341736, "learning_rate": 7.740102433133102e-05, "loss": 0.0313, "step": 27970 }, { "grad_norm": 0.2702239453792572, "learning_rate": 7.738373362224553e-05, "loss": 0.0278, "step": 27980 }, { "grad_norm": 0.490888386964798, "learning_rate": 7.73664382340289e-05, "loss": 0.032, "step": 27990 }, { "grad_norm": 0.4249108135700226, "learning_rate": 7.734913816963647e-05, "loss": 0.0376, "step": 28000 }, { "grad_norm": 0.2909999489784241, "learning_rate": 7.73318334320243e-05, "loss": 0.0306, "step": 28010 }, { "grad_norm": 0.444385826587677, "learning_rate": 7.731452402414934e-05, "loss": 0.0431, "step": 28020 }, { "grad_norm": 0.4683043658733368, "learning_rate": 7.729720994896928e-05, "loss": 0.0449, "step": 28030 }, { "grad_norm": 0.3238729238510132, "learning_rate": 7.727989120944262e-05, "loss": 0.0347, "step": 28040 }, { "grad_norm": 0.5567827820777893, "learning_rate": 7.726256780852865e-05, "loss": 0.0313, "step": 28050 }, { "grad_norm": 0.40400755405426025, "learning_rate": 7.724523974918749e-05, "loss": 0.0357, "step": 28060 }, { "grad_norm": 0.41690337657928467, "learning_rate": 7.722790703438002e-05, "loss": 0.0278, "step": 28070 }, { "grad_norm": 0.22722041606903076, "learning_rate": 7.72105696670679e-05, "loss": 0.024, "step": 28080 }, { "grad_norm": 0.45020100474357605, "learning_rate": 7.719322765021364e-05, "loss": 0.035, "step": 28090 }, { "grad_norm": 0.3093051314353943, "learning_rate": 7.717588098678051e-05, "loss": 0.0327, "step": 28100 }, { "grad_norm": 0.382124662399292, "learning_rate": 7.715852967973258e-05, "loss": 0.0296, "step": 28110 }, { "grad_norm": 0.3715169131755829, "learning_rate": 7.714117373203474e-05, "loss": 0.0329, "step": 28120 }, { "grad_norm": 0.3263669013977051, "learning_rate": 7.712381314665259e-05, "loss": 0.0297, "step": 28130 }, { "grad_norm": 0.4044187366962433, "learning_rate": 7.710644792655261e-05, "loss": 0.0327, "step": 28140 }, { "grad_norm": 0.5003530383110046, "learning_rate": 7.708907807470207e-05, "loss": 0.0364, "step": 28150 }, { "grad_norm": 0.3954615294933319, "learning_rate": 7.707170359406896e-05, "loss": 0.034, "step": 28160 }, { "grad_norm": 0.2724061906337738, "learning_rate": 7.705432448762213e-05, "loss": 0.0286, "step": 28170 }, { "grad_norm": 0.444261372089386, "learning_rate": 7.703694075833117e-05, "loss": 0.0434, "step": 28180 }, { "grad_norm": 0.48918840289115906, "learning_rate": 7.70195524091665e-05, "loss": 0.0537, "step": 28190 }, { "grad_norm": 0.44910985231399536, "learning_rate": 7.70021594430993e-05, "loss": 0.0424, "step": 28200 }, { "grad_norm": 0.3878355324268341, "learning_rate": 7.698476186310157e-05, "loss": 0.0341, "step": 28210 }, { "grad_norm": 0.3489014208316803, "learning_rate": 7.696735967214608e-05, "loss": 0.03, "step": 28220 }, { "grad_norm": 0.408863365650177, "learning_rate": 7.694995287320636e-05, "loss": 0.0317, "step": 28230 }, { "grad_norm": 0.3345980644226074, "learning_rate": 7.693254146925679e-05, "loss": 0.0325, "step": 28240 }, { "grad_norm": 0.46886226534843445, "learning_rate": 7.691512546327251e-05, "loss": 0.0407, "step": 28250 }, { "grad_norm": 0.37863636016845703, "learning_rate": 7.689770485822939e-05, "loss": 0.0365, "step": 28260 }, { "grad_norm": 0.36998993158340454, "learning_rate": 7.688027965710416e-05, "loss": 0.0277, "step": 28270 }, { "grad_norm": 0.38643479347229004, "learning_rate": 7.686284986287433e-05, "loss": 0.0346, "step": 28280 }, { "grad_norm": 0.44082117080688477, "learning_rate": 7.684541547851817e-05, "loss": 0.0289, "step": 28290 }, { "grad_norm": 0.34034016728401184, "learning_rate": 7.68279765070147e-05, "loss": 0.0403, "step": 28300 }, { "grad_norm": 0.44719091057777405, "learning_rate": 7.68105329513438e-05, "loss": 0.0358, "step": 28310 }, { "grad_norm": 0.35605743527412415, "learning_rate": 7.67930848144861e-05, "loss": 0.0338, "step": 28320 }, { "grad_norm": 0.5891214609146118, "learning_rate": 7.6775632099423e-05, "loss": 0.0427, "step": 28330 }, { "grad_norm": 0.34897172451019287, "learning_rate": 7.675817480913667e-05, "loss": 0.0373, "step": 28340 }, { "grad_norm": 0.5718961358070374, "learning_rate": 7.674071294661011e-05, "loss": 0.0329, "step": 28350 }, { "grad_norm": 0.4072992205619812, "learning_rate": 7.672324651482707e-05, "loss": 0.0325, "step": 28360 }, { "grad_norm": 0.5499745011329651, "learning_rate": 7.670577551677209e-05, "loss": 0.0375, "step": 28370 }, { "grad_norm": 0.3897631764411926, "learning_rate": 7.668829995543047e-05, "loss": 0.0308, "step": 28380 }, { "grad_norm": 0.3066364824771881, "learning_rate": 7.667081983378832e-05, "loss": 0.0367, "step": 28390 }, { "grad_norm": 0.3081328570842743, "learning_rate": 7.66533351548325e-05, "loss": 0.0361, "step": 28400 }, { "grad_norm": 0.47952261567115784, "learning_rate": 7.663584592155069e-05, "loss": 0.034, "step": 28410 }, { "grad_norm": 0.4081026613712311, "learning_rate": 7.661835213693129e-05, "loss": 0.0356, "step": 28420 }, { "grad_norm": 0.32421737909317017, "learning_rate": 7.660085380396353e-05, "loss": 0.0336, "step": 28430 }, { "grad_norm": 0.42934074997901917, "learning_rate": 7.658335092563738e-05, "loss": 0.0325, "step": 28440 }, { "grad_norm": 0.3879725933074951, "learning_rate": 7.656584350494362e-05, "loss": 0.0287, "step": 28450 }, { "grad_norm": 0.4192294776439667, "learning_rate": 7.654833154487378e-05, "loss": 0.0352, "step": 28460 }, { "grad_norm": 0.34820684790611267, "learning_rate": 7.653081504842017e-05, "loss": 0.0324, "step": 28470 }, { "grad_norm": 0.40906476974487305, "learning_rate": 7.65132940185759e-05, "loss": 0.0405, "step": 28480 }, { "grad_norm": 0.4090190529823303, "learning_rate": 7.649576845833481e-05, "loss": 0.0294, "step": 28490 }, { "grad_norm": 0.22446785867214203, "learning_rate": 7.647823837069156e-05, "loss": 0.0385, "step": 28500 }, { "grad_norm": 0.3073008358478546, "learning_rate": 7.646070375864156e-05, "loss": 0.0314, "step": 28510 }, { "grad_norm": 0.5725428462028503, "learning_rate": 7.644316462518099e-05, "loss": 0.0403, "step": 28520 }, { "grad_norm": 0.4440491497516632, "learning_rate": 7.642562097330679e-05, "loss": 0.0402, "step": 28530 }, { "grad_norm": 0.25246480107307434, "learning_rate": 7.640807280601671e-05, "loss": 0.0376, "step": 28540 }, { "grad_norm": 0.34144386649131775, "learning_rate": 7.639052012630927e-05, "loss": 0.0412, "step": 28550 }, { "grad_norm": 0.3020366430282593, "learning_rate": 7.63729629371837e-05, "loss": 0.0473, "step": 28560 }, { "grad_norm": 0.4836753308773041, "learning_rate": 7.635540124164009e-05, "loss": 0.0475, "step": 28570 }, { "grad_norm": 0.24459588527679443, "learning_rate": 7.633783504267922e-05, "loss": 0.0364, "step": 28580 }, { "grad_norm": 0.3385756015777588, "learning_rate": 7.632026434330269e-05, "loss": 0.0322, "step": 28590 }, { "grad_norm": 0.5360713005065918, "learning_rate": 7.630268914651282e-05, "loss": 0.0316, "step": 28600 }, { "grad_norm": 0.38646525144577026, "learning_rate": 7.628510945531278e-05, "loss": 0.0403, "step": 28610 }, { "grad_norm": 0.3043692111968994, "learning_rate": 7.626752527270641e-05, "loss": 0.0398, "step": 28620 }, { "grad_norm": 0.27635666728019714, "learning_rate": 7.62499366016984e-05, "loss": 0.0362, "step": 28630 }, { "grad_norm": 0.24269770085811615, "learning_rate": 7.623234344529416e-05, "loss": 0.0358, "step": 28640 }, { "grad_norm": 0.3764733076095581, "learning_rate": 7.62147458064999e-05, "loss": 0.0362, "step": 28650 }, { "grad_norm": 0.46975961327552795, "learning_rate": 7.619714368832254e-05, "loss": 0.0388, "step": 28660 }, { "grad_norm": 0.41823285818099976, "learning_rate": 7.61795370937698e-05, "loss": 0.0378, "step": 28670 }, { "grad_norm": 0.3335524797439575, "learning_rate": 7.61619260258502e-05, "loss": 0.0403, "step": 28680 }, { "grad_norm": 0.4268164038658142, "learning_rate": 7.614431048757298e-05, "loss": 0.0448, "step": 28690 }, { "grad_norm": 0.3073824644088745, "learning_rate": 7.612669048194814e-05, "loss": 0.0391, "step": 28700 }, { "grad_norm": 0.3667285740375519, "learning_rate": 7.610906601198646e-05, "loss": 0.0315, "step": 28710 }, { "grad_norm": 0.41563764214515686, "learning_rate": 7.60914370806995e-05, "loss": 0.039, "step": 28720 }, { "grad_norm": 0.8199501037597656, "learning_rate": 7.607380369109953e-05, "loss": 0.0544, "step": 28730 }, { "grad_norm": 0.3118637502193451, "learning_rate": 7.605616584619961e-05, "loss": 0.0278, "step": 28740 }, { "grad_norm": 0.3594553768634796, "learning_rate": 7.603852354901362e-05, "loss": 0.0319, "step": 28750 }, { "grad_norm": 0.4480181336402893, "learning_rate": 7.602087680255609e-05, "loss": 0.0492, "step": 28760 }, { "grad_norm": 0.44751212000846863, "learning_rate": 7.600322560984238e-05, "loss": 0.0331, "step": 28770 }, { "grad_norm": 0.48085594177246094, "learning_rate": 7.598556997388863e-05, "loss": 0.0363, "step": 28780 }, { "grad_norm": 0.3298587501049042, "learning_rate": 7.596790989771166e-05, "loss": 0.0357, "step": 28790 }, { "grad_norm": 0.5641162395477295, "learning_rate": 7.595024538432914e-05, "loss": 0.0343, "step": 28800 }, { "grad_norm": 0.319990873336792, "learning_rate": 7.59325764367594e-05, "loss": 0.0319, "step": 28810 }, { "grad_norm": 0.2536564767360687, "learning_rate": 7.59149030580216e-05, "loss": 0.0339, "step": 28820 }, { "grad_norm": 0.27711769938468933, "learning_rate": 7.589722525113562e-05, "loss": 0.0316, "step": 28830 }, { "grad_norm": 0.4575051963329315, "learning_rate": 7.587954301912216e-05, "loss": 0.0295, "step": 28840 }, { "grad_norm": 0.42146602272987366, "learning_rate": 7.586185636500263e-05, "loss": 0.0357, "step": 28850 }, { "grad_norm": 0.44585829973220825, "learning_rate": 7.584416529179914e-05, "loss": 0.0333, "step": 28860 }, { "grad_norm": 0.39381706714630127, "learning_rate": 7.582646980253465e-05, "loss": 0.0313, "step": 28870 }, { "grad_norm": 0.36865338683128357, "learning_rate": 7.580876990023282e-05, "loss": 0.0462, "step": 28880 }, { "grad_norm": 0.39628973603248596, "learning_rate": 7.579106558791809e-05, "loss": 0.0327, "step": 28890 }, { "grad_norm": 0.4474451243877411, "learning_rate": 7.577335686861565e-05, "loss": 0.0311, "step": 28900 }, { "grad_norm": 0.34012797474861145, "learning_rate": 7.575564374535141e-05, "loss": 0.0355, "step": 28910 }, { "grad_norm": 0.2929285764694214, "learning_rate": 7.573792622115207e-05, "loss": 0.0312, "step": 28920 }, { "grad_norm": 0.28437352180480957, "learning_rate": 7.572020429904507e-05, "loss": 0.0303, "step": 28930 }, { "grad_norm": 0.36123016476631165, "learning_rate": 7.570247798205861e-05, "loss": 0.0395, "step": 28940 }, { "grad_norm": 0.20064552128314972, "learning_rate": 7.568474727322164e-05, "loss": 0.0369, "step": 28950 }, { "grad_norm": 0.31772273778915405, "learning_rate": 7.566701217556384e-05, "loss": 0.03, "step": 28960 }, { "grad_norm": 0.39282557368278503, "learning_rate": 7.564927269211564e-05, "loss": 0.0347, "step": 28970 }, { "grad_norm": 0.37230241298675537, "learning_rate": 7.563152882590824e-05, "loss": 0.0311, "step": 28980 }, { "grad_norm": 0.3970652222633362, "learning_rate": 7.56137805799736e-05, "loss": 0.0372, "step": 28990 }, { "grad_norm": 0.3008716106414795, "learning_rate": 7.559602795734439e-05, "loss": 0.0323, "step": 29000 }, { "grad_norm": 0.2705875635147095, "learning_rate": 7.557827096105408e-05, "loss": 0.0297, "step": 29010 }, { "grad_norm": 0.28527072072029114, "learning_rate": 7.55605095941368e-05, "loss": 0.0314, "step": 29020 }, { "grad_norm": 0.33838608860969543, "learning_rate": 7.554274385962752e-05, "loss": 0.0365, "step": 29030 }, { "grad_norm": 0.4075585603713989, "learning_rate": 7.552497376056191e-05, "loss": 0.0397, "step": 29040 }, { "grad_norm": 0.3762553930282593, "learning_rate": 7.550719929997639e-05, "loss": 0.0386, "step": 29050 }, { "grad_norm": 0.35836440324783325, "learning_rate": 7.548942048090813e-05, "loss": 0.0312, "step": 29060 }, { "grad_norm": 0.46880918741226196, "learning_rate": 7.547163730639506e-05, "loss": 0.0378, "step": 29070 }, { "grad_norm": 0.34491166472435, "learning_rate": 7.545384977947583e-05, "loss": 0.0352, "step": 29080 }, { "grad_norm": 0.3197081387042999, "learning_rate": 7.543605790318981e-05, "loss": 0.0309, "step": 29090 }, { "grad_norm": 0.6486555337905884, "learning_rate": 7.54182616805772e-05, "loss": 0.0461, "step": 29100 }, { "grad_norm": 0.27044689655303955, "learning_rate": 7.540046111467885e-05, "loss": 0.0293, "step": 29110 }, { "grad_norm": 0.4119844436645508, "learning_rate": 7.53826562085364e-05, "loss": 0.0335, "step": 29120 }, { "grad_norm": 0.4685700833797455, "learning_rate": 7.536484696519221e-05, "loss": 0.0274, "step": 29130 }, { "grad_norm": 0.31213757395744324, "learning_rate": 7.534703338768942e-05, "loss": 0.0277, "step": 29140 }, { "grad_norm": 0.4247969090938568, "learning_rate": 7.532921547907185e-05, "loss": 0.0358, "step": 29150 }, { "grad_norm": 1.0766090154647827, "learning_rate": 7.531139324238412e-05, "loss": 0.0376, "step": 29160 }, { "grad_norm": 0.37593981623649597, "learning_rate": 7.529356668067157e-05, "loss": 0.0512, "step": 29170 }, { "grad_norm": 0.3407297730445862, "learning_rate": 7.527573579698023e-05, "loss": 0.04, "step": 29180 }, { "grad_norm": 0.3758319914340973, "learning_rate": 7.525790059435693e-05, "loss": 0.0353, "step": 29190 }, { "grad_norm": 0.32174721360206604, "learning_rate": 7.524006107584926e-05, "loss": 0.0341, "step": 29200 }, { "grad_norm": 0.3073754906654358, "learning_rate": 7.522221724450544e-05, "loss": 0.0313, "step": 29210 }, { "grad_norm": 0.35094407200813293, "learning_rate": 7.520436910337451e-05, "loss": 0.0424, "step": 29220 }, { "grad_norm": 0.5353171825408936, "learning_rate": 7.518651665550627e-05, "loss": 0.0354, "step": 29230 }, { "grad_norm": 0.4117603898048401, "learning_rate": 7.516865990395117e-05, "loss": 0.0426, "step": 29240 }, { "grad_norm": 0.3419453203678131, "learning_rate": 7.515079885176047e-05, "loss": 0.03, "step": 29250 }, { "grad_norm": 0.28843623399734497, "learning_rate": 7.513293350198612e-05, "loss": 0.0346, "step": 29260 }, { "grad_norm": 0.398569792509079, "learning_rate": 7.511506385768081e-05, "loss": 0.0327, "step": 29270 }, { "grad_norm": 0.3600170612335205, "learning_rate": 7.509718992189801e-05, "loss": 0.0337, "step": 29280 }, { "grad_norm": 0.457074910402298, "learning_rate": 7.507931169769182e-05, "loss": 0.0367, "step": 29290 }, { "grad_norm": 0.5916290879249573, "learning_rate": 7.506142918811722e-05, "loss": 0.0364, "step": 29300 }, { "grad_norm": 0.25289711356163025, "learning_rate": 7.504354239622978e-05, "loss": 0.0419, "step": 29310 }, { "grad_norm": 0.36078643798828125, "learning_rate": 7.50256513250859e-05, "loss": 0.0373, "step": 29320 }, { "grad_norm": 0.41583141684532166, "learning_rate": 7.500775597774265e-05, "loss": 0.0337, "step": 29330 }, { "grad_norm": 0.4691835343837738, "learning_rate": 7.498985635725788e-05, "loss": 0.0301, "step": 29340 }, { "grad_norm": 0.355111300945282, "learning_rate": 7.497195246669012e-05, "loss": 0.0351, "step": 29350 }, { "grad_norm": 0.28597167134284973, "learning_rate": 7.495404430909868e-05, "loss": 0.0331, "step": 29360 }, { "grad_norm": 0.38601964712142944, "learning_rate": 7.493613188754356e-05, "loss": 0.0329, "step": 29370 }, { "grad_norm": 0.30612584948539734, "learning_rate": 7.49182152050855e-05, "loss": 0.0335, "step": 29380 }, { "grad_norm": 0.3471492528915405, "learning_rate": 7.490029426478598e-05, "loss": 0.0302, "step": 29390 }, { "grad_norm": 0.30306559801101685, "learning_rate": 7.488236906970719e-05, "loss": 0.0381, "step": 29400 }, { "grad_norm": 0.3360282778739929, "learning_rate": 7.486443962291207e-05, "loss": 0.0358, "step": 29410 }, { "grad_norm": 0.3777059316635132, "learning_rate": 7.484650592746424e-05, "loss": 0.0305, "step": 29420 }, { "grad_norm": 0.29559412598609924, "learning_rate": 7.482856798642811e-05, "loss": 0.0276, "step": 29430 }, { "grad_norm": 0.3154853582382202, "learning_rate": 7.481062580286878e-05, "loss": 0.0392, "step": 29440 }, { "grad_norm": 0.2616778016090393, "learning_rate": 7.479267937985208e-05, "loss": 0.0305, "step": 29450 }, { "grad_norm": 0.4091392159461975, "learning_rate": 7.477472872044456e-05, "loss": 0.0288, "step": 29460 }, { "grad_norm": 0.39530664682388306, "learning_rate": 7.475677382771347e-05, "loss": 0.0345, "step": 29470 }, { "grad_norm": 0.23156608641147614, "learning_rate": 7.473881470472683e-05, "loss": 0.0302, "step": 29480 }, { "grad_norm": 0.3807160258293152, "learning_rate": 7.472085135455337e-05, "loss": 0.0353, "step": 29490 }, { "grad_norm": 0.327819287776947, "learning_rate": 7.470288378026256e-05, "loss": 0.0435, "step": 29500 }, { "grad_norm": 0.3527953028678894, "learning_rate": 7.468491198492451e-05, "loss": 0.0305, "step": 29510 }, { "grad_norm": 0.278313010931015, "learning_rate": 7.466693597161013e-05, "loss": 0.0374, "step": 29520 }, { "grad_norm": 0.28316792845726013, "learning_rate": 7.464895574339104e-05, "loss": 0.0317, "step": 29530 }, { "grad_norm": 0.2516832947731018, "learning_rate": 7.463097130333958e-05, "loss": 0.0383, "step": 29540 }, { "grad_norm": 0.24631403386592865, "learning_rate": 7.461298265452876e-05, "loss": 0.036, "step": 29550 }, { "grad_norm": 0.2968909442424774, "learning_rate": 7.459498980003239e-05, "loss": 0.0256, "step": 29560 }, { "grad_norm": 0.34332743287086487, "learning_rate": 7.457699274292493e-05, "loss": 0.0398, "step": 29570 }, { "grad_norm": 0.30124431848526, "learning_rate": 7.455899148628159e-05, "loss": 0.0374, "step": 29580 }, { "grad_norm": 0.4362775683403015, "learning_rate": 7.45409860331783e-05, "loss": 0.0312, "step": 29590 }, { "grad_norm": 0.4020267724990845, "learning_rate": 7.452297638669169e-05, "loss": 0.036, "step": 29600 }, { "grad_norm": 0.23975960910320282, "learning_rate": 7.450496254989911e-05, "loss": 0.0252, "step": 29610 }, { "grad_norm": 0.2620667815208435, "learning_rate": 7.448694452587866e-05, "loss": 0.0322, "step": 29620 }, { "grad_norm": 0.3057984411716461, "learning_rate": 7.44689223177091e-05, "loss": 0.0363, "step": 29630 }, { "grad_norm": 0.3542177081108093, "learning_rate": 7.445089592846994e-05, "loss": 0.0336, "step": 29640 }, { "grad_norm": 0.3528105616569519, "learning_rate": 7.443286536124141e-05, "loss": 0.0412, "step": 29650 }, { "grad_norm": 0.3681844472885132, "learning_rate": 7.441483061910443e-05, "loss": 0.0436, "step": 29660 }, { "grad_norm": 0.45315465331077576, "learning_rate": 7.439679170514064e-05, "loss": 0.036, "step": 29670 }, { "grad_norm": 0.3088579475879669, "learning_rate": 7.43787486224324e-05, "loss": 0.0326, "step": 29680 }, { "grad_norm": 0.4603652060031891, "learning_rate": 7.436070137406276e-05, "loss": 0.0317, "step": 29690 }, { "grad_norm": 0.4352196455001831, "learning_rate": 7.434264996311556e-05, "loss": 0.0398, "step": 29700 }, { "grad_norm": 0.42331552505493164, "learning_rate": 7.432459439267525e-05, "loss": 0.0401, "step": 29710 }, { "grad_norm": 0.3554043471813202, "learning_rate": 7.430653466582701e-05, "loss": 0.0356, "step": 29720 }, { "grad_norm": 0.2896016836166382, "learning_rate": 7.42884707856568e-05, "loss": 0.0457, "step": 29730 }, { "grad_norm": 0.5419121384620667, "learning_rate": 7.427040275525122e-05, "loss": 0.0356, "step": 29740 }, { "grad_norm": 0.33346256613731384, "learning_rate": 7.42523305776976e-05, "loss": 0.0343, "step": 29750 }, { "grad_norm": 0.3314988911151886, "learning_rate": 7.4234254256084e-05, "loss": 0.0319, "step": 29760 }, { "grad_norm": 0.22452707588672638, "learning_rate": 7.421617379349915e-05, "loss": 0.0368, "step": 29770 }, { "grad_norm": 0.35627442598342896, "learning_rate": 7.41980891930325e-05, "loss": 0.0307, "step": 29780 }, { "grad_norm": 0.24213828146457672, "learning_rate": 7.418000045777425e-05, "loss": 0.0303, "step": 29790 }, { "grad_norm": 0.27445781230926514, "learning_rate": 7.416190759081523e-05, "loss": 0.0307, "step": 29800 }, { "grad_norm": 0.40803059935569763, "learning_rate": 7.414381059524704e-05, "loss": 0.03, "step": 29810 }, { "grad_norm": 0.2523200511932373, "learning_rate": 7.412570947416195e-05, "loss": 0.025, "step": 29820 }, { "grad_norm": 0.6366965174674988, "learning_rate": 7.410760423065295e-05, "loss": 0.0507, "step": 29830 }, { "grad_norm": 0.450918972492218, "learning_rate": 7.408949486781372e-05, "loss": 0.0441, "step": 29840 }, { "grad_norm": 0.4072817265987396, "learning_rate": 7.407138138873868e-05, "loss": 0.0403, "step": 29850 }, { "grad_norm": 0.34891217947006226, "learning_rate": 7.405326379652292e-05, "loss": 0.0378, "step": 29860 }, { "grad_norm": 0.3033551573753357, "learning_rate": 7.403514209426222e-05, "loss": 0.0303, "step": 29870 }, { "grad_norm": 0.3665034770965576, "learning_rate": 7.40170162850531e-05, "loss": 0.0383, "step": 29880 }, { "grad_norm": 0.6112878918647766, "learning_rate": 7.399888637199278e-05, "loss": 0.0297, "step": 29890 }, { "grad_norm": 0.3934504985809326, "learning_rate": 7.398075235817914e-05, "loss": 0.0442, "step": 29900 }, { "grad_norm": 0.4452930688858032, "learning_rate": 7.39626142467108e-05, "loss": 0.0391, "step": 29910 }, { "grad_norm": 0.251437246799469, "learning_rate": 7.394447204068706e-05, "loss": 0.027, "step": 29920 }, { "grad_norm": 0.4139256477355957, "learning_rate": 7.392632574320793e-05, "loss": 0.0453, "step": 29930 }, { "grad_norm": 0.351233571767807, "learning_rate": 7.390817535737411e-05, "loss": 0.0375, "step": 29940 }, { "grad_norm": 0.28412654995918274, "learning_rate": 7.389002088628703e-05, "loss": 0.0346, "step": 29950 }, { "grad_norm": 0.3975529372692108, "learning_rate": 7.387186233304877e-05, "loss": 0.0392, "step": 29960 }, { "grad_norm": 0.45378655195236206, "learning_rate": 7.385369970076212e-05, "loss": 0.0463, "step": 29970 }, { "grad_norm": 0.4003828167915344, "learning_rate": 7.38355329925306e-05, "loss": 0.0371, "step": 29980 }, { "grad_norm": 0.30434650182724, "learning_rate": 7.381736221145838e-05, "loss": 0.0389, "step": 29990 }, { "grad_norm": 0.6399891376495361, "learning_rate": 7.37991873606504e-05, "loss": 0.0326, "step": 30000 }, { "grad_norm": 0.3068901300430298, "learning_rate": 7.378100844321218e-05, "loss": 0.032, "step": 30010 }, { "grad_norm": 0.2631552815437317, "learning_rate": 7.376282546225004e-05, "loss": 0.0275, "step": 30020 }, { "grad_norm": 0.2567211985588074, "learning_rate": 7.374463842087094e-05, "loss": 0.0235, "step": 30030 }, { "grad_norm": 0.4202497899532318, "learning_rate": 7.372644732218254e-05, "loss": 0.0333, "step": 30040 }, { "grad_norm": 0.5195497274398804, "learning_rate": 7.370825216929322e-05, "loss": 0.0469, "step": 30050 }, { "grad_norm": 0.4189993143081665, "learning_rate": 7.369005296531205e-05, "loss": 0.0382, "step": 30060 }, { "grad_norm": 0.4470600485801697, "learning_rate": 7.367184971334873e-05, "loss": 0.0422, "step": 30070 }, { "grad_norm": 0.24168804287910461, "learning_rate": 7.365364241651371e-05, "loss": 0.038, "step": 30080 }, { "grad_norm": 0.29192444682121277, "learning_rate": 7.363543107791815e-05, "loss": 0.0288, "step": 30090 }, { "grad_norm": 0.31938159465789795, "learning_rate": 7.361721570067384e-05, "loss": 0.0282, "step": 30100 }, { "grad_norm": 0.36875128746032715, "learning_rate": 7.359899628789331e-05, "loss": 0.0341, "step": 30110 }, { "grad_norm": 0.3302942216396332, "learning_rate": 7.358077284268974e-05, "loss": 0.0387, "step": 30120 }, { "grad_norm": 0.5883016586303711, "learning_rate": 7.356254536817702e-05, "loss": 0.0306, "step": 30130 }, { "grad_norm": 0.3840203285217285, "learning_rate": 7.354431386746973e-05, "loss": 0.0354, "step": 30140 }, { "grad_norm": 0.5377202033996582, "learning_rate": 7.352607834368316e-05, "loss": 0.0336, "step": 30150 }, { "grad_norm": 0.32420414686203003, "learning_rate": 7.350783879993324e-05, "loss": 0.0263, "step": 30160 }, { "grad_norm": 0.4091842472553253, "learning_rate": 7.348959523933658e-05, "loss": 0.034, "step": 30170 }, { "grad_norm": 0.2445857971906662, "learning_rate": 7.347134766501057e-05, "loss": 0.0248, "step": 30180 }, { "grad_norm": 0.3480307459831238, "learning_rate": 7.345309608007315e-05, "loss": 0.0275, "step": 30190 }, { "grad_norm": 0.39476892352104187, "learning_rate": 7.343484048764308e-05, "loss": 0.0325, "step": 30200 }, { "grad_norm": 0.29322800040245056, "learning_rate": 7.341658089083972e-05, "loss": 0.0322, "step": 30210 }, { "grad_norm": 0.33075669407844543, "learning_rate": 7.339831729278313e-05, "loss": 0.0246, "step": 30220 }, { "grad_norm": 0.3349643051624298, "learning_rate": 7.338004969659404e-05, "loss": 0.0345, "step": 30230 }, { "grad_norm": 0.28702104091644287, "learning_rate": 7.336177810539391e-05, "loss": 0.0382, "step": 30240 }, { "grad_norm": 0.33878058195114136, "learning_rate": 7.334350252230485e-05, "loss": 0.0237, "step": 30250 }, { "grad_norm": 0.3076372742652893, "learning_rate": 7.332522295044965e-05, "loss": 0.0308, "step": 30260 }, { "grad_norm": 0.41351938247680664, "learning_rate": 7.33069393929518e-05, "loss": 0.0345, "step": 30270 }, { "grad_norm": 0.3959885835647583, "learning_rate": 7.328865185293545e-05, "loss": 0.0443, "step": 30280 }, { "grad_norm": 0.3259527385234833, "learning_rate": 7.327036033352546e-05, "loss": 0.0337, "step": 30290 }, { "grad_norm": 0.4535384774208069, "learning_rate": 7.325206483784733e-05, "loss": 0.0302, "step": 30300 }, { "grad_norm": 0.3391687273979187, "learning_rate": 7.323376536902724e-05, "loss": 0.0438, "step": 30310 }, { "grad_norm": 0.36955273151397705, "learning_rate": 7.321546193019213e-05, "loss": 0.0332, "step": 30320 }, { "grad_norm": 0.4643072485923767, "learning_rate": 7.31971545244695e-05, "loss": 0.0367, "step": 30330 }, { "grad_norm": 0.35690638422966003, "learning_rate": 7.31788431549876e-05, "loss": 0.0318, "step": 30340 }, { "grad_norm": 0.2288520187139511, "learning_rate": 7.316052782487534e-05, "loss": 0.0268, "step": 30350 }, { "grad_norm": 0.40817567706108093, "learning_rate": 7.314220853726234e-05, "loss": 0.0355, "step": 30360 }, { "grad_norm": 0.38411152362823486, "learning_rate": 7.312388529527884e-05, "loss": 0.0374, "step": 30370 }, { "grad_norm": 0.2664336860179901, "learning_rate": 7.310555810205577e-05, "loss": 0.0289, "step": 30380 }, { "grad_norm": 0.47210824489593506, "learning_rate": 7.308722696072476e-05, "loss": 0.0402, "step": 30390 }, { "grad_norm": 0.35274600982666016, "learning_rate": 7.306889187441811e-05, "loss": 0.0307, "step": 30400 }, { "grad_norm": 0.26878660917282104, "learning_rate": 7.305055284626876e-05, "loss": 0.0304, "step": 30410 }, { "grad_norm": 0.3164759874343872, "learning_rate": 7.303220987941037e-05, "loss": 0.0318, "step": 30420 }, { "grad_norm": 0.3457998037338257, "learning_rate": 7.301386297697726e-05, "loss": 0.036, "step": 30430 }, { "grad_norm": 0.438925564289093, "learning_rate": 7.299551214210438e-05, "loss": 0.0305, "step": 30440 }, { "grad_norm": 0.3031505346298218, "learning_rate": 7.297715737792738e-05, "loss": 0.0314, "step": 30450 }, { "grad_norm": 0.2803176939487457, "learning_rate": 7.295879868758265e-05, "loss": 0.0396, "step": 30460 }, { "grad_norm": 0.34160083532333374, "learning_rate": 7.294043607420713e-05, "loss": 0.027, "step": 30470 }, { "grad_norm": 0.39339643716812134, "learning_rate": 7.292206954093852e-05, "loss": 0.0369, "step": 30480 }, { "grad_norm": 0.30415448546409607, "learning_rate": 7.290369909091515e-05, "loss": 0.0323, "step": 30490 }, { "grad_norm": 0.3021294176578522, "learning_rate": 7.2885324727276e-05, "loss": 0.0374, "step": 30500 }, { "grad_norm": 0.4294411242008209, "learning_rate": 7.286694645316076e-05, "loss": 0.0324, "step": 30510 }, { "grad_norm": 0.4110672175884247, "learning_rate": 7.284856427170982e-05, "loss": 0.0325, "step": 30520 }, { "grad_norm": 0.3099907636642456, "learning_rate": 7.283017818606414e-05, "loss": 0.0331, "step": 30530 }, { "grad_norm": 0.4607732594013214, "learning_rate": 7.28117881993654e-05, "loss": 0.0367, "step": 30540 }, { "grad_norm": 0.277525931596756, "learning_rate": 7.279339431475598e-05, "loss": 0.0331, "step": 30550 }, { "grad_norm": 0.2808089256286621, "learning_rate": 7.277499653537887e-05, "loss": 0.027, "step": 30560 }, { "grad_norm": 0.4373674988746643, "learning_rate": 7.275659486437776e-05, "loss": 0.0382, "step": 30570 }, { "grad_norm": 0.4027968943119049, "learning_rate": 7.273818930489695e-05, "loss": 0.0423, "step": 30580 }, { "grad_norm": 0.35085511207580566, "learning_rate": 7.271977986008151e-05, "loss": 0.0304, "step": 30590 }, { "grad_norm": 0.3970487415790558, "learning_rate": 7.270136653307705e-05, "loss": 0.0328, "step": 30600 }, { "grad_norm": 0.3994583487510681, "learning_rate": 7.268294932702994e-05, "loss": 0.0321, "step": 30610 }, { "grad_norm": 0.3120206892490387, "learning_rate": 7.266452824508719e-05, "loss": 0.0369, "step": 30620 }, { "grad_norm": 0.3681611716747284, "learning_rate": 7.264610329039643e-05, "loss": 0.0263, "step": 30630 }, { "grad_norm": 0.35373324155807495, "learning_rate": 7.262767446610599e-05, "loss": 0.031, "step": 30640 }, { "grad_norm": 0.5602478384971619, "learning_rate": 7.260924177536485e-05, "loss": 0.0369, "step": 30650 }, { "grad_norm": 0.25740230083465576, "learning_rate": 7.259080522132265e-05, "loss": 0.0296, "step": 30660 }, { "grad_norm": 0.28046971559524536, "learning_rate": 7.257236480712972e-05, "loss": 0.03, "step": 30670 }, { "grad_norm": 0.2618052363395691, "learning_rate": 7.255392053593697e-05, "loss": 0.0299, "step": 30680 }, { "grad_norm": 0.5209044218063354, "learning_rate": 7.253547241089607e-05, "loss": 0.0273, "step": 30690 }, { "grad_norm": 0.39042043685913086, "learning_rate": 7.251702043515927e-05, "loss": 0.0357, "step": 30700 }, { "grad_norm": 0.40438544750213623, "learning_rate": 7.249856461187952e-05, "loss": 0.0296, "step": 30710 }, { "grad_norm": 0.41114360094070435, "learning_rate": 7.248010494421042e-05, "loss": 0.0326, "step": 30720 }, { "grad_norm": 0.2467285841703415, "learning_rate": 7.246164143530622e-05, "loss": 0.0418, "step": 30730 }, { "grad_norm": 0.31095704436302185, "learning_rate": 7.244317408832181e-05, "loss": 0.0338, "step": 30740 }, { "grad_norm": 0.29416534304618835, "learning_rate": 7.242470290641279e-05, "loss": 0.0294, "step": 30750 }, { "grad_norm": 0.42298877239227295, "learning_rate": 7.240622789273536e-05, "loss": 0.0314, "step": 30760 }, { "grad_norm": 0.512024462223053, "learning_rate": 7.238774905044638e-05, "loss": 0.0253, "step": 30770 }, { "grad_norm": 0.45601996779441833, "learning_rate": 7.236926638270341e-05, "loss": 0.0305, "step": 30780 }, { "grad_norm": 0.35060250759124756, "learning_rate": 7.23507798926646e-05, "loss": 0.0322, "step": 30790 }, { "grad_norm": 0.3276536464691162, "learning_rate": 7.23322895834888e-05, "loss": 0.0331, "step": 30800 }, { "grad_norm": 0.3079427182674408, "learning_rate": 7.231379545833552e-05, "loss": 0.0302, "step": 30810 }, { "grad_norm": 0.2962753474712372, "learning_rate": 7.229529752036487e-05, "loss": 0.0365, "step": 30820 }, { "grad_norm": 0.4158231317996979, "learning_rate": 7.227679577273765e-05, "loss": 0.0361, "step": 30830 }, { "grad_norm": 0.31738361716270447, "learning_rate": 7.225829021861529e-05, "loss": 0.03, "step": 30840 }, { "grad_norm": 0.2381373643875122, "learning_rate": 7.223978086115992e-05, "loss": 0.0273, "step": 30850 }, { "grad_norm": 0.3151799738407135, "learning_rate": 7.222126770353425e-05, "loss": 0.0368, "step": 30860 }, { "grad_norm": 0.3273445665836334, "learning_rate": 7.22027507489017e-05, "loss": 0.0286, "step": 30870 }, { "grad_norm": 0.31746116280555725, "learning_rate": 7.218423000042627e-05, "loss": 0.0268, "step": 30880 }, { "grad_norm": 0.4490748643875122, "learning_rate": 7.216570546127268e-05, "loss": 0.0386, "step": 30890 }, { "grad_norm": 0.40363577008247375, "learning_rate": 7.214717713460626e-05, "loss": 0.0388, "step": 30900 }, { "grad_norm": 0.3746781647205353, "learning_rate": 7.2128645023593e-05, "loss": 0.0369, "step": 30910 }, { "grad_norm": 0.20983922481536865, "learning_rate": 7.211010913139951e-05, "loss": 0.0346, "step": 30920 }, { "grad_norm": 0.3256675899028778, "learning_rate": 7.209156946119308e-05, "loss": 0.0315, "step": 30930 }, { "grad_norm": 0.41340598464012146, "learning_rate": 7.207302601614166e-05, "loss": 0.0321, "step": 30940 }, { "grad_norm": 0.4803236722946167, "learning_rate": 7.205447879941378e-05, "loss": 0.0384, "step": 30950 }, { "grad_norm": 0.3393690884113312, "learning_rate": 7.203592781417866e-05, "loss": 0.0304, "step": 30960 }, { "grad_norm": 0.3086491525173187, "learning_rate": 7.201737306360617e-05, "loss": 0.0311, "step": 30970 }, { "grad_norm": 0.40828320384025574, "learning_rate": 7.19988145508668e-05, "loss": 0.0294, "step": 30980 }, { "grad_norm": 0.34513500332832336, "learning_rate": 7.198025227913168e-05, "loss": 0.0343, "step": 30990 }, { "grad_norm": 0.4084954261779785, "learning_rate": 7.196168625157261e-05, "loss": 0.0383, "step": 31000 }, { "grad_norm": 0.3670057952404022, "learning_rate": 7.194311647136201e-05, "loss": 0.0354, "step": 31010 }, { "grad_norm": 0.3613690435886383, "learning_rate": 7.192454294167297e-05, "loss": 0.0308, "step": 31020 }, { "grad_norm": 0.4000214636325836, "learning_rate": 7.190596566567917e-05, "loss": 0.0341, "step": 31030 }, { "grad_norm": 0.5058186650276184, "learning_rate": 7.188738464655496e-05, "loss": 0.0374, "step": 31040 }, { "grad_norm": 0.272420197725296, "learning_rate": 7.186879988747533e-05, "loss": 0.0262, "step": 31050 }, { "grad_norm": 0.27825817465782166, "learning_rate": 7.185021139161592e-05, "loss": 0.0262, "step": 31060 }, { "grad_norm": 0.2731463313102722, "learning_rate": 7.1831619162153e-05, "loss": 0.028, "step": 31070 }, { "grad_norm": 0.3675355613231659, "learning_rate": 7.181302320226345e-05, "loss": 0.026, "step": 31080 }, { "grad_norm": 0.23827959597110748, "learning_rate": 7.179442351512482e-05, "loss": 0.0337, "step": 31090 }, { "grad_norm": 0.3320253789424896, "learning_rate": 7.177582010391528e-05, "loss": 0.0288, "step": 31100 }, { "grad_norm": 0.4686873257160187, "learning_rate": 7.175721297181366e-05, "loss": 0.0301, "step": 31110 }, { "grad_norm": 0.501180112361908, "learning_rate": 7.173860212199942e-05, "loss": 0.0309, "step": 31120 }, { "grad_norm": 0.3117202818393707, "learning_rate": 7.171998755765263e-05, "loss": 0.0306, "step": 31130 }, { "grad_norm": 0.32011619210243225, "learning_rate": 7.170136928195398e-05, "loss": 0.0289, "step": 31140 }, { "grad_norm": 0.32855692505836487, "learning_rate": 7.168274729808489e-05, "loss": 0.0347, "step": 31150 }, { "grad_norm": 0.3385167419910431, "learning_rate": 7.166412160922728e-05, "loss": 0.0373, "step": 31160 }, { "grad_norm": 0.4015747606754303, "learning_rate": 7.164549221856382e-05, "loss": 0.0395, "step": 31170 }, { "grad_norm": 0.3401370644569397, "learning_rate": 7.162685912927775e-05, "loss": 0.0461, "step": 31180 }, { "grad_norm": 0.3368269205093384, "learning_rate": 7.160822234455294e-05, "loss": 0.032, "step": 31190 }, { "grad_norm": 0.31871774792671204, "learning_rate": 7.158958186757391e-05, "loss": 0.0306, "step": 31200 }, { "grad_norm": 0.3709195554256439, "learning_rate": 7.157093770152582e-05, "loss": 0.0326, "step": 31210 }, { "grad_norm": 0.2559715211391449, "learning_rate": 7.155228984959446e-05, "loss": 0.0282, "step": 31220 }, { "grad_norm": 0.3945593535900116, "learning_rate": 7.153363831496621e-05, "loss": 0.0399, "step": 31230 }, { "grad_norm": 0.38594192266464233, "learning_rate": 7.151498310082811e-05, "loss": 0.0396, "step": 31240 }, { "grad_norm": 0.3617468774318695, "learning_rate": 7.149632421036784e-05, "loss": 0.0271, "step": 31250 }, { "grad_norm": 0.4971126317977905, "learning_rate": 7.147766164677369e-05, "loss": 0.0576, "step": 31260 }, { "grad_norm": 0.32229432463645935, "learning_rate": 7.145899541323459e-05, "loss": 0.0245, "step": 31270 }, { "grad_norm": 0.4253738522529602, "learning_rate": 7.144032551294007e-05, "loss": 0.0387, "step": 31280 }, { "grad_norm": 0.28549322485923767, "learning_rate": 7.14216519490803e-05, "loss": 0.0356, "step": 31290 }, { "grad_norm": 0.49367934465408325, "learning_rate": 7.140297472484609e-05, "loss": 0.0435, "step": 31300 }, { "grad_norm": 0.21613709628582, "learning_rate": 7.138429384342891e-05, "loss": 0.0472, "step": 31310 }, { "grad_norm": 0.5237776637077332, "learning_rate": 7.136560930802074e-05, "loss": 0.0288, "step": 31320 }, { "grad_norm": 0.3707351088523865, "learning_rate": 7.134692112181431e-05, "loss": 0.0268, "step": 31330 }, { "grad_norm": 0.4788525104522705, "learning_rate": 7.13282292880029e-05, "loss": 0.0333, "step": 31340 }, { "grad_norm": 0.6229890584945679, "learning_rate": 7.130953380978043e-05, "loss": 0.033, "step": 31350 }, { "grad_norm": 0.3230333924293518, "learning_rate": 7.129083469034144e-05, "loss": 0.0402, "step": 31360 }, { "grad_norm": 0.49671900272369385, "learning_rate": 7.127213193288112e-05, "loss": 0.0416, "step": 31370 }, { "grad_norm": 0.2707148492336273, "learning_rate": 7.125342554059522e-05, "loss": 0.0297, "step": 31380 }, { "grad_norm": 0.41704061627388, "learning_rate": 7.12347155166802e-05, "loss": 0.0363, "step": 31390 }, { "grad_norm": 0.3650479316711426, "learning_rate": 7.121600186433306e-05, "loss": 0.0296, "step": 31400 }, { "grad_norm": 0.46613138914108276, "learning_rate": 7.119728458675148e-05, "loss": 0.0288, "step": 31410 }, { "grad_norm": 0.5837252140045166, "learning_rate": 7.117856368713369e-05, "loss": 0.0299, "step": 31420 }, { "grad_norm": 0.47062695026397705, "learning_rate": 7.115983916867861e-05, "loss": 0.0343, "step": 31430 }, { "grad_norm": 0.2972720265388489, "learning_rate": 7.114111103458574e-05, "loss": 0.027, "step": 31440 }, { "grad_norm": 0.23856887221336365, "learning_rate": 7.11223792880552e-05, "loss": 0.0246, "step": 31450 }, { "grad_norm": 0.3228229880332947, "learning_rate": 7.110364393228773e-05, "loss": 0.0298, "step": 31460 }, { "grad_norm": 0.390104204416275, "learning_rate": 7.108490497048471e-05, "loss": 0.0252, "step": 31470 }, { "grad_norm": 0.36247164011001587, "learning_rate": 7.10661624058481e-05, "loss": 0.0322, "step": 31480 }, { "grad_norm": 0.3186359107494354, "learning_rate": 7.10474162415805e-05, "loss": 0.0316, "step": 31490 }, { "grad_norm": 0.23858828842639923, "learning_rate": 7.102866648088511e-05, "loss": 0.0328, "step": 31500 }, { "grad_norm": 0.3127123713493347, "learning_rate": 7.100991312696576e-05, "loss": 0.0285, "step": 31510 }, { "grad_norm": 0.4598511755466461, "learning_rate": 7.099115618302686e-05, "loss": 0.0316, "step": 31520 }, { "grad_norm": 0.3591444194316864, "learning_rate": 7.097239565227349e-05, "loss": 0.0356, "step": 31530 }, { "grad_norm": 0.3779938519001007, "learning_rate": 7.09536315379113e-05, "loss": 0.0302, "step": 31540 }, { "grad_norm": 0.32064786553382874, "learning_rate": 7.093486384314656e-05, "loss": 0.0277, "step": 31550 }, { "grad_norm": 0.29639026522636414, "learning_rate": 7.091609257118616e-05, "loss": 0.0293, "step": 31560 }, { "grad_norm": 0.38813069462776184, "learning_rate": 7.08973177252376e-05, "loss": 0.0322, "step": 31570 }, { "grad_norm": 0.35184019804000854, "learning_rate": 7.087853930850898e-05, "loss": 0.03, "step": 31580 }, { "grad_norm": 0.3431978225708008, "learning_rate": 7.085975732420903e-05, "loss": 0.0259, "step": 31590 }, { "grad_norm": 1.226312518119812, "learning_rate": 7.084097177554706e-05, "loss": 0.0267, "step": 31600 }, { "grad_norm": 0.39809510111808777, "learning_rate": 7.082218266573301e-05, "loss": 0.0361, "step": 31610 }, { "grad_norm": 0.4310806691646576, "learning_rate": 7.080338999797743e-05, "loss": 0.0356, "step": 31620 }, { "grad_norm": 0.3725602626800537, "learning_rate": 7.07845937754915e-05, "loss": 0.0337, "step": 31630 }, { "grad_norm": 0.2617644965648651, "learning_rate": 7.076579400148693e-05, "loss": 0.0359, "step": 31640 }, { "grad_norm": 0.2604229152202606, "learning_rate": 7.074699067917611e-05, "loss": 0.0297, "step": 31650 }, { "grad_norm": 0.49256813526153564, "learning_rate": 7.072818381177201e-05, "loss": 0.0354, "step": 31660 }, { "grad_norm": 0.5522903800010681, "learning_rate": 7.070937340248823e-05, "loss": 0.0321, "step": 31670 }, { "grad_norm": 0.24639567732810974, "learning_rate": 7.069055945453893e-05, "loss": 0.033, "step": 31680 }, { "grad_norm": 0.3024982810020447, "learning_rate": 7.067174197113892e-05, "loss": 0.0259, "step": 31690 }, { "grad_norm": 0.29247602820396423, "learning_rate": 7.065292095550355e-05, "loss": 0.0324, "step": 31700 }, { "grad_norm": 0.32333922386169434, "learning_rate": 7.063409641084887e-05, "loss": 0.0287, "step": 31710 }, { "grad_norm": 0.3882564902305603, "learning_rate": 7.061526834039145e-05, "loss": 0.0304, "step": 31720 }, { "grad_norm": 0.3106219172477722, "learning_rate": 7.05964367473485e-05, "loss": 0.0264, "step": 31730 }, { "grad_norm": 0.43275880813598633, "learning_rate": 7.057760163493783e-05, "loss": 0.0327, "step": 31740 }, { "grad_norm": 0.34832170605659485, "learning_rate": 7.055876300637783e-05, "loss": 0.0254, "step": 31750 }, { "grad_norm": 0.35641300678253174, "learning_rate": 7.053992086488753e-05, "loss": 0.0428, "step": 31760 }, { "grad_norm": 0.2562656104564667, "learning_rate": 7.052107521368651e-05, "loss": 0.0315, "step": 31770 }, { "grad_norm": 0.4385853409767151, "learning_rate": 7.0502226055995e-05, "loss": 0.0351, "step": 31780 }, { "grad_norm": 0.3043154180049896, "learning_rate": 7.048337339503379e-05, "loss": 0.0372, "step": 31790 }, { "grad_norm": 0.3903402090072632, "learning_rate": 7.046451723402427e-05, "loss": 0.0298, "step": 31800 }, { "grad_norm": 0.35385504364967346, "learning_rate": 7.044565757618848e-05, "loss": 0.0418, "step": 31810 }, { "grad_norm": 0.31844744086265564, "learning_rate": 7.042679442474899e-05, "loss": 0.0335, "step": 31820 }, { "grad_norm": 0.307681679725647, "learning_rate": 7.040792778292902e-05, "loss": 0.0404, "step": 31830 }, { "grad_norm": 0.5240086317062378, "learning_rate": 7.038905765395234e-05, "loss": 0.0428, "step": 31840 }, { "grad_norm": 0.4752126932144165, "learning_rate": 7.037018404104334e-05, "loss": 0.0342, "step": 31850 }, { "grad_norm": 0.31220388412475586, "learning_rate": 7.035130694742702e-05, "loss": 0.0325, "step": 31860 }, { "grad_norm": 0.4850665032863617, "learning_rate": 7.033242637632897e-05, "loss": 0.037, "step": 31870 }, { "grad_norm": 0.41466087102890015, "learning_rate": 7.031354233097534e-05, "loss": 0.0325, "step": 31880 }, { "grad_norm": 0.3817399740219116, "learning_rate": 7.029465481459289e-05, "loss": 0.044, "step": 31890 }, { "grad_norm": 0.35764992237091064, "learning_rate": 7.027576383040898e-05, "loss": 0.0278, "step": 31900 }, { "grad_norm": 0.27647772431373596, "learning_rate": 7.025686938165159e-05, "loss": 0.0328, "step": 31910 }, { "grad_norm": 0.3535732626914978, "learning_rate": 7.023797147154924e-05, "loss": 0.0467, "step": 31920 }, { "grad_norm": 0.24478209018707275, "learning_rate": 7.021907010333111e-05, "loss": 0.027, "step": 31930 }, { "grad_norm": 0.25189757347106934, "learning_rate": 7.020016528022685e-05, "loss": 0.0272, "step": 31940 }, { "grad_norm": 0.4092661440372467, "learning_rate": 7.018125700546683e-05, "loss": 0.0439, "step": 31950 }, { "grad_norm": 0.3288266956806183, "learning_rate": 7.016234528228196e-05, "loss": 0.033, "step": 31960 }, { "grad_norm": 0.250161349773407, "learning_rate": 7.014343011390372e-05, "loss": 0.03, "step": 31970 }, { "grad_norm": 0.29977747797966003, "learning_rate": 7.01245115035642e-05, "loss": 0.0245, "step": 31980 }, { "grad_norm": 0.3889031410217285, "learning_rate": 7.010558945449606e-05, "loss": 0.0453, "step": 31990 }, { "grad_norm": 0.2996220886707306, "learning_rate": 7.008666396993258e-05, "loss": 0.0242, "step": 32000 }, { "grad_norm": 0.3821980059146881, "learning_rate": 7.006773505310759e-05, "loss": 0.0373, "step": 32010 }, { "grad_norm": 0.458947092294693, "learning_rate": 7.004880270725553e-05, "loss": 0.033, "step": 32020 }, { "grad_norm": 0.35426172614097595, "learning_rate": 7.002986693561144e-05, "loss": 0.0405, "step": 32030 }, { "grad_norm": 0.30721572041511536, "learning_rate": 7.001092774141089e-05, "loss": 0.0369, "step": 32040 }, { "grad_norm": 1.072271704673767, "learning_rate": 6.999198512789009e-05, "loss": 0.0349, "step": 32050 }, { "grad_norm": 0.46487560868263245, "learning_rate": 6.997303909828584e-05, "loss": 0.0377, "step": 32060 }, { "grad_norm": 0.29376593232154846, "learning_rate": 6.995408965583544e-05, "loss": 0.0358, "step": 32070 }, { "grad_norm": 0.33349549770355225, "learning_rate": 6.993513680377688e-05, "loss": 0.0214, "step": 32080 }, { "grad_norm": 0.33836233615875244, "learning_rate": 6.991618054534868e-05, "loss": 0.0436, "step": 32090 }, { "grad_norm": 0.4400595724582672, "learning_rate": 6.989722088378991e-05, "loss": 0.0333, "step": 32100 }, { "grad_norm": 0.41378480195999146, "learning_rate": 6.987825782234027e-05, "loss": 0.0363, "step": 32110 }, { "grad_norm": 0.3535673916339874, "learning_rate": 6.985929136424006e-05, "loss": 0.0264, "step": 32120 }, { "grad_norm": 0.5069364905357361, "learning_rate": 6.984032151273012e-05, "loss": 0.0346, "step": 32130 }, { "grad_norm": 0.3539620339870453, "learning_rate": 6.982134827105186e-05, "loss": 0.0326, "step": 32140 }, { "grad_norm": 0.3967602849006653, "learning_rate": 6.980237164244729e-05, "loss": 0.0366, "step": 32150 }, { "grad_norm": 0.42862123250961304, "learning_rate": 6.9783391630159e-05, "loss": 0.0284, "step": 32160 }, { "grad_norm": 0.30076560378074646, "learning_rate": 6.976440823743015e-05, "loss": 0.0305, "step": 32170 }, { "grad_norm": 0.4073678255081177, "learning_rate": 6.974542146750451e-05, "loss": 0.0272, "step": 32180 }, { "grad_norm": 0.43827420473098755, "learning_rate": 6.972643132362637e-05, "loss": 0.0274, "step": 32190 }, { "grad_norm": 0.33427777886390686, "learning_rate": 6.970743780904064e-05, "loss": 0.033, "step": 32200 }, { "grad_norm": 0.3937208354473114, "learning_rate": 6.968844092699277e-05, "loss": 0.0284, "step": 32210 }, { "grad_norm": 0.3122343420982361, "learning_rate": 6.966944068072883e-05, "loss": 0.0289, "step": 32220 }, { "grad_norm": 0.375379353761673, "learning_rate": 6.965043707349545e-05, "loss": 0.0243, "step": 32230 }, { "grad_norm": 0.3268110156059265, "learning_rate": 6.963143010853982e-05, "loss": 0.0338, "step": 32240 }, { "grad_norm": 0.44712352752685547, "learning_rate": 6.961241978910971e-05, "loss": 0.0376, "step": 32250 }, { "grad_norm": 0.40118247270584106, "learning_rate": 6.959340611845344e-05, "loss": 0.0342, "step": 32260 }, { "grad_norm": 0.49085208773612976, "learning_rate": 6.957438909981995e-05, "loss": 0.0332, "step": 32270 }, { "grad_norm": 0.32285377383232117, "learning_rate": 6.955536873645872e-05, "loss": 0.0247, "step": 32280 }, { "grad_norm": 0.3372119069099426, "learning_rate": 6.953634503161982e-05, "loss": 0.0287, "step": 32290 }, { "grad_norm": 0.29233264923095703, "learning_rate": 6.951731798855387e-05, "loss": 0.0268, "step": 32300 }, { "grad_norm": 0.2718869149684906, "learning_rate": 6.949828761051208e-05, "loss": 0.035, "step": 32310 }, { "grad_norm": 0.24175652861595154, "learning_rate": 6.947925390074622e-05, "loss": 0.0221, "step": 32320 }, { "grad_norm": 0.31841838359832764, "learning_rate": 6.946021686250863e-05, "loss": 0.0247, "step": 32330 }, { "grad_norm": 0.33482062816619873, "learning_rate": 6.94411764990522e-05, "loss": 0.0283, "step": 32340 }, { "grad_norm": 0.3745651841163635, "learning_rate": 6.942213281363044e-05, "loss": 0.0299, "step": 32350 }, { "grad_norm": 0.35116466879844666, "learning_rate": 6.940308580949737e-05, "loss": 0.0375, "step": 32360 }, { "grad_norm": 0.25292709469795227, "learning_rate": 6.93840354899076e-05, "loss": 0.0278, "step": 32370 }, { "grad_norm": 0.41174980998039246, "learning_rate": 6.936498185811633e-05, "loss": 0.0313, "step": 32380 }, { "grad_norm": 0.2574644386768341, "learning_rate": 6.93459249173793e-05, "loss": 0.0247, "step": 32390 }, { "grad_norm": 0.38711968064308167, "learning_rate": 6.932686467095279e-05, "loss": 0.0305, "step": 32400 }, { "grad_norm": 0.40622642636299133, "learning_rate": 6.930780112209373e-05, "loss": 0.0352, "step": 32410 }, { "grad_norm": 0.5030367970466614, "learning_rate": 6.92887342740595e-05, "loss": 0.0309, "step": 32420 }, { "grad_norm": 0.3116573393344879, "learning_rate": 6.926966413010816e-05, "loss": 0.0358, "step": 32430 }, { "grad_norm": 0.25325462222099304, "learning_rate": 6.925059069349824e-05, "loss": 0.0259, "step": 32440 }, { "grad_norm": 0.25537410378456116, "learning_rate": 6.923151396748886e-05, "loss": 0.0279, "step": 32450 }, { "grad_norm": 0.28017398715019226, "learning_rate": 6.921243395533974e-05, "loss": 0.032, "step": 32460 }, { "grad_norm": 0.283731609582901, "learning_rate": 6.919335066031109e-05, "loss": 0.03, "step": 32470 }, { "grad_norm": 0.34479716420173645, "learning_rate": 6.917426408566379e-05, "loss": 0.0336, "step": 32480 }, { "grad_norm": 0.4030351936817169, "learning_rate": 6.915517423465916e-05, "loss": 0.0379, "step": 32490 }, { "grad_norm": 0.36167749762535095, "learning_rate": 6.913608111055914e-05, "loss": 0.0335, "step": 32500 }, { "grad_norm": 0.3521696925163269, "learning_rate": 6.911698471662623e-05, "loss": 0.0246, "step": 32510 }, { "grad_norm": 0.2320660650730133, "learning_rate": 6.90978850561235e-05, "loss": 0.0254, "step": 32520 }, { "grad_norm": 0.4238888621330261, "learning_rate": 6.907878213231454e-05, "loss": 0.026, "step": 32530 }, { "grad_norm": 0.4413042366504669, "learning_rate": 6.90596759484635e-05, "loss": 0.032, "step": 32540 }, { "grad_norm": 0.415234237909317, "learning_rate": 6.904056650783514e-05, "loss": 0.0318, "step": 32550 }, { "grad_norm": 0.3619465231895447, "learning_rate": 6.902145381369471e-05, "loss": 0.0348, "step": 32560 }, { "grad_norm": 0.4993058443069458, "learning_rate": 6.900233786930808e-05, "loss": 0.0336, "step": 32570 }, { "grad_norm": 0.39047712087631226, "learning_rate": 6.898321867794161e-05, "loss": 0.0322, "step": 32580 }, { "grad_norm": 0.2872781455516815, "learning_rate": 6.896409624286226e-05, "loss": 0.0311, "step": 32590 }, { "grad_norm": 0.4034043252468109, "learning_rate": 6.894497056733754e-05, "loss": 0.0284, "step": 32600 }, { "grad_norm": 0.25919443368911743, "learning_rate": 6.89258416546355e-05, "loss": 0.032, "step": 32610 }, { "grad_norm": 0.2737017571926117, "learning_rate": 6.890670950802474e-05, "loss": 0.0289, "step": 32620 }, { "grad_norm": 0.34809526801109314, "learning_rate": 6.88875741307744e-05, "loss": 0.0314, "step": 32630 }, { "grad_norm": 0.4481312036514282, "learning_rate": 6.886843552615425e-05, "loss": 0.0315, "step": 32640 }, { "grad_norm": 0.2903803884983063, "learning_rate": 6.884929369743451e-05, "loss": 0.0232, "step": 32650 }, { "grad_norm": 0.29124119877815247, "learning_rate": 6.8830148647886e-05, "loss": 0.0288, "step": 32660 }, { "grad_norm": 0.3275162875652313, "learning_rate": 6.88110003807801e-05, "loss": 0.0394, "step": 32670 }, { "grad_norm": 0.40914303064346313, "learning_rate": 6.87918488993887e-05, "loss": 0.0271, "step": 32680 }, { "grad_norm": 0.48555055260658264, "learning_rate": 6.877269420698431e-05, "loss": 0.0342, "step": 32690 }, { "grad_norm": 0.3582095801830292, "learning_rate": 6.875353630683989e-05, "loss": 0.031, "step": 32700 }, { "grad_norm": 0.44469669461250305, "learning_rate": 6.873437520222905e-05, "loss": 0.0371, "step": 32710 }, { "grad_norm": 0.21860839426517487, "learning_rate": 6.871521089642585e-05, "loss": 0.0258, "step": 32720 }, { "grad_norm": 0.24931751191616058, "learning_rate": 6.869604339270498e-05, "loss": 0.0277, "step": 32730 }, { "grad_norm": 0.3982490301132202, "learning_rate": 6.867687269434164e-05, "loss": 0.0346, "step": 32740 }, { "grad_norm": 0.34489819407463074, "learning_rate": 6.865769880461156e-05, "loss": 0.0274, "step": 32750 }, { "grad_norm": 0.5629722476005554, "learning_rate": 6.863852172679104e-05, "loss": 0.0328, "step": 32760 }, { "grad_norm": 0.32520076632499695, "learning_rate": 6.861934146415693e-05, "loss": 0.0332, "step": 32770 }, { "grad_norm": 0.5504567623138428, "learning_rate": 6.86001580199866e-05, "loss": 0.0372, "step": 32780 }, { "grad_norm": 0.33443760871887207, "learning_rate": 6.858097139755798e-05, "loss": 0.0261, "step": 32790 }, { "grad_norm": 0.36429494619369507, "learning_rate": 6.856178160014955e-05, "loss": 0.0263, "step": 32800 }, { "grad_norm": 0.3554668128490448, "learning_rate": 6.85425886310403e-05, "loss": 0.027, "step": 32810 }, { "grad_norm": 0.3631960451602936, "learning_rate": 6.852339249350979e-05, "loss": 0.0255, "step": 32820 }, { "grad_norm": 0.2530467212200165, "learning_rate": 6.850419319083812e-05, "loss": 0.0372, "step": 32830 }, { "grad_norm": 0.3793100416660309, "learning_rate": 6.848499072630592e-05, "loss": 0.0257, "step": 32840 }, { "grad_norm": 0.3673008978366852, "learning_rate": 6.846578510319439e-05, "loss": 0.0327, "step": 32850 }, { "grad_norm": 0.31233975291252136, "learning_rate": 6.844657632478519e-05, "loss": 0.0315, "step": 32860 }, { "grad_norm": 0.2211541086435318, "learning_rate": 6.842736439436063e-05, "loss": 0.0237, "step": 32870 }, { "grad_norm": 0.2737797498703003, "learning_rate": 6.84081493152035e-05, "loss": 0.0252, "step": 32880 }, { "grad_norm": 0.35388678312301636, "learning_rate": 6.83889310905971e-05, "loss": 0.0255, "step": 32890 }, { "grad_norm": 0.3622930347919464, "learning_rate": 6.836970972382533e-05, "loss": 0.0384, "step": 32900 }, { "grad_norm": 0.2983044683933258, "learning_rate": 6.835048521817257e-05, "loss": 0.0314, "step": 32910 }, { "grad_norm": 0.3077954053878784, "learning_rate": 6.833125757692379e-05, "loss": 0.0334, "step": 32920 }, { "grad_norm": 0.27287381887435913, "learning_rate": 6.831202680336441e-05, "loss": 0.0249, "step": 32930 }, { "grad_norm": 0.28633761405944824, "learning_rate": 6.829279290078052e-05, "loss": 0.0257, "step": 32940 }, { "grad_norm": 0.3208579123020172, "learning_rate": 6.827355587245863e-05, "loss": 0.0354, "step": 32950 }, { "grad_norm": 0.20442593097686768, "learning_rate": 6.82543157216858e-05, "loss": 0.0281, "step": 32960 }, { "grad_norm": 0.3295811414718628, "learning_rate": 6.823507245174969e-05, "loss": 0.034, "step": 32970 }, { "grad_norm": 0.43798473477363586, "learning_rate": 6.821582606593841e-05, "loss": 0.0248, "step": 32980 }, { "grad_norm": 0.3221345543861389, "learning_rate": 6.81965765675407e-05, "loss": 0.0237, "step": 32990 }, { "grad_norm": 0.3647124171257019, "learning_rate": 6.81773239598457e-05, "loss": 0.0295, "step": 33000 }, { "grad_norm": 0.36118704080581665, "learning_rate": 6.815806824614319e-05, "loss": 0.0327, "step": 33010 }, { "grad_norm": 0.49232640862464905, "learning_rate": 6.813880942972343e-05, "loss": 0.0308, "step": 33020 }, { "grad_norm": 0.30354443192481995, "learning_rate": 6.811954751387726e-05, "loss": 0.0315, "step": 33030 }, { "grad_norm": 0.31832894682884216, "learning_rate": 6.810028250189598e-05, "loss": 0.0297, "step": 33040 }, { "grad_norm": 0.45183128118515015, "learning_rate": 6.808101439707147e-05, "loss": 0.0407, "step": 33050 }, { "grad_norm": 0.4521848261356354, "learning_rate": 6.806174320269609e-05, "loss": 0.0323, "step": 33060 }, { "grad_norm": 0.33665481209754944, "learning_rate": 6.804246892206281e-05, "loss": 0.0325, "step": 33070 }, { "grad_norm": 0.38651278614997864, "learning_rate": 6.802319155846506e-05, "loss": 0.0456, "step": 33080 }, { "grad_norm": 0.48339205980300903, "learning_rate": 6.800391111519679e-05, "loss": 0.0298, "step": 33090 }, { "grad_norm": 0.4063251316547394, "learning_rate": 6.798462759555253e-05, "loss": 0.0333, "step": 33100 }, { "grad_norm": 0.25249600410461426, "learning_rate": 6.79653410028273e-05, "loss": 0.0271, "step": 33110 }, { "grad_norm": 0.24571900069713593, "learning_rate": 6.794605134031663e-05, "loss": 0.0296, "step": 33120 }, { "grad_norm": 0.4306679964065552, "learning_rate": 6.792675861131661e-05, "loss": 0.0362, "step": 33130 }, { "grad_norm": 0.47876155376434326, "learning_rate": 6.790746281912386e-05, "loss": 0.0395, "step": 33140 }, { "grad_norm": 0.2803647220134735, "learning_rate": 6.788816396703546e-05, "loss": 0.028, "step": 33150 }, { "grad_norm": 0.5615425705909729, "learning_rate": 6.78688620583491e-05, "loss": 0.0308, "step": 33160 }, { "grad_norm": 0.44414642453193665, "learning_rate": 6.784955709636292e-05, "loss": 0.0389, "step": 33170 }, { "grad_norm": 0.44658175110816956, "learning_rate": 6.783024908437564e-05, "loss": 0.0274, "step": 33180 }, { "grad_norm": 0.6297395825386047, "learning_rate": 6.781093802568641e-05, "loss": 0.0283, "step": 33190 }, { "grad_norm": 0.29046717286109924, "learning_rate": 6.779162392359504e-05, "loss": 0.0269, "step": 33200 }, { "grad_norm": 0.4219120740890503, "learning_rate": 6.777230678140172e-05, "loss": 0.0324, "step": 33210 }, { "grad_norm": 0.3332587480545044, "learning_rate": 6.775298660240726e-05, "loss": 0.0351, "step": 33220 }, { "grad_norm": 0.5743197202682495, "learning_rate": 6.773366338991292e-05, "loss": 0.0414, "step": 33230 }, { "grad_norm": 0.3184424042701721, "learning_rate": 6.771433714722052e-05, "loss": 0.0299, "step": 33240 }, { "grad_norm": 0.3675006926059723, "learning_rate": 6.769500787763239e-05, "loss": 0.0369, "step": 33250 }, { "grad_norm": 0.22951245307922363, "learning_rate": 6.76756755844514e-05, "loss": 0.0377, "step": 33260 }, { "grad_norm": 0.4525536894798279, "learning_rate": 6.765634027098087e-05, "loss": 0.0316, "step": 33270 }, { "grad_norm": 0.36894747614860535, "learning_rate": 6.763700194052468e-05, "loss": 0.0284, "step": 33280 }, { "grad_norm": 0.30686187744140625, "learning_rate": 6.761766059638723e-05, "loss": 0.0279, "step": 33290 }, { "grad_norm": 0.4646385908126831, "learning_rate": 6.759831624187345e-05, "loss": 0.0326, "step": 33300 }, { "grad_norm": 0.6342246532440186, "learning_rate": 6.757896888028871e-05, "loss": 0.0339, "step": 33310 }, { "grad_norm": 0.43516233563423157, "learning_rate": 6.7559618514939e-05, "loss": 0.032, "step": 33320 }, { "grad_norm": 0.38899049162864685, "learning_rate": 6.754026514913073e-05, "loss": 0.0299, "step": 33330 }, { "grad_norm": 0.32041868567466736, "learning_rate": 6.752090878617087e-05, "loss": 0.0347, "step": 33340 }, { "grad_norm": 0.37139183282852173, "learning_rate": 6.75015494293669e-05, "loss": 0.0283, "step": 33350 }, { "grad_norm": 0.7739192843437195, "learning_rate": 6.74821870820268e-05, "loss": 0.025, "step": 33360 }, { "grad_norm": 0.25476422905921936, "learning_rate": 6.746282174745907e-05, "loss": 0.0372, "step": 33370 }, { "grad_norm": 0.443288654088974, "learning_rate": 6.744345342897271e-05, "loss": 0.0377, "step": 33380 }, { "grad_norm": 0.3006734251976013, "learning_rate": 6.742408212987724e-05, "loss": 0.0387, "step": 33390 }, { "grad_norm": 0.30069607496261597, "learning_rate": 6.740470785348269e-05, "loss": 0.0241, "step": 33400 }, { "grad_norm": 0.28314638137817383, "learning_rate": 6.738533060309958e-05, "loss": 0.0312, "step": 33410 }, { "grad_norm": 0.2892697751522064, "learning_rate": 6.736595038203894e-05, "loss": 0.0299, "step": 33420 }, { "grad_norm": 0.21550296247005463, "learning_rate": 6.734656719361236e-05, "loss": 0.0342, "step": 33430 }, { "grad_norm": 0.35473495721817017, "learning_rate": 6.732718104113189e-05, "loss": 0.0335, "step": 33440 }, { "grad_norm": 0.38772645592689514, "learning_rate": 6.730779192791006e-05, "loss": 0.031, "step": 33450 }, { "grad_norm": 0.32372719049453735, "learning_rate": 6.728839985725997e-05, "loss": 0.0317, "step": 33460 }, { "grad_norm": 0.38697880506515503, "learning_rate": 6.726900483249517e-05, "loss": 0.0288, "step": 33470 }, { "grad_norm": 0.3732318878173828, "learning_rate": 6.724960685692976e-05, "loss": 0.0343, "step": 33480 }, { "grad_norm": 0.29046764969825745, "learning_rate": 6.723020593387833e-05, "loss": 0.0296, "step": 33490 }, { "grad_norm": 0.3502050042152405, "learning_rate": 6.721080206665593e-05, "loss": 0.0326, "step": 33500 }, { "grad_norm": 0.33806851506233215, "learning_rate": 6.719139525857819e-05, "loss": 0.041, "step": 33510 }, { "grad_norm": 0.21053887903690338, "learning_rate": 6.717198551296117e-05, "loss": 0.0279, "step": 33520 }, { "grad_norm": 0.3387147784233093, "learning_rate": 6.715257283312148e-05, "loss": 0.0315, "step": 33530 }, { "grad_norm": 0.3240709900856018, "learning_rate": 6.713315722237623e-05, "loss": 0.03, "step": 33540 }, { "grad_norm": 0.3116764426231384, "learning_rate": 6.7113738684043e-05, "loss": 0.0323, "step": 33550 }, { "grad_norm": 0.42014363408088684, "learning_rate": 6.709431722143989e-05, "loss": 0.0305, "step": 33560 }, { "grad_norm": 0.338529109954834, "learning_rate": 6.70748928378855e-05, "loss": 0.0346, "step": 33570 }, { "grad_norm": 0.21631725132465363, "learning_rate": 6.705546553669891e-05, "loss": 0.0212, "step": 33580 }, { "grad_norm": 0.1952035278081894, "learning_rate": 6.703603532119974e-05, "loss": 0.0265, "step": 33590 }, { "grad_norm": 0.45518359541893005, "learning_rate": 6.701660219470808e-05, "loss": 0.0278, "step": 33600 }, { "grad_norm": 0.5720212459564209, "learning_rate": 6.69971661605445e-05, "loss": 0.0365, "step": 33610 }, { "grad_norm": 0.434165894985199, "learning_rate": 6.697772722203008e-05, "loss": 0.0295, "step": 33620 }, { "grad_norm": 0.4072114825248718, "learning_rate": 6.695828538248643e-05, "loss": 0.0342, "step": 33630 }, { "grad_norm": 0.28008338809013367, "learning_rate": 6.693884064523563e-05, "loss": 0.0325, "step": 33640 }, { "grad_norm": 0.278062105178833, "learning_rate": 6.691939301360023e-05, "loss": 0.0254, "step": 33650 }, { "grad_norm": 0.3730780780315399, "learning_rate": 6.689994249090333e-05, "loss": 0.03, "step": 33660 }, { "grad_norm": 0.5782896280288696, "learning_rate": 6.688048908046845e-05, "loss": 0.036, "step": 33670 }, { "grad_norm": 0.2736591398715973, "learning_rate": 6.686103278561969e-05, "loss": 0.0266, "step": 33680 }, { "grad_norm": 0.2448476254940033, "learning_rate": 6.684157360968156e-05, "loss": 0.0309, "step": 33690 }, { "grad_norm": 0.2709334194660187, "learning_rate": 6.682211155597911e-05, "loss": 0.0243, "step": 33700 }, { "grad_norm": 0.25607073307037354, "learning_rate": 6.680264662783789e-05, "loss": 0.03, "step": 33710 }, { "grad_norm": 0.3480489253997803, "learning_rate": 6.678317882858391e-05, "loss": 0.0305, "step": 33720 }, { "grad_norm": 1.2706648111343384, "learning_rate": 6.67637081615437e-05, "loss": 0.0321, "step": 33730 }, { "grad_norm": 0.37600985169410706, "learning_rate": 6.674423463004427e-05, "loss": 0.0282, "step": 33740 }, { "grad_norm": 0.4419335424900055, "learning_rate": 6.672475823741308e-05, "loss": 0.0288, "step": 33750 }, { "grad_norm": 0.5417532324790955, "learning_rate": 6.670527898697811e-05, "loss": 0.0356, "step": 33760 }, { "grad_norm": 0.28889596462249756, "learning_rate": 6.668579688206788e-05, "loss": 0.036, "step": 33770 }, { "grad_norm": 0.3452002704143524, "learning_rate": 6.666631192601131e-05, "loss": 0.0291, "step": 33780 }, { "grad_norm": 0.39953044056892395, "learning_rate": 6.664682412213785e-05, "loss": 0.0291, "step": 33790 }, { "grad_norm": 0.4309249520301819, "learning_rate": 6.662733347377745e-05, "loss": 0.026, "step": 33800 }, { "grad_norm": 0.2121022641658783, "learning_rate": 6.660783998426051e-05, "loss": 0.0303, "step": 33810 }, { "grad_norm": 0.36701926589012146, "learning_rate": 6.658834365691794e-05, "loss": 0.0253, "step": 33820 }, { "grad_norm": 0.2789294421672821, "learning_rate": 6.656884449508115e-05, "loss": 0.0248, "step": 33830 }, { "grad_norm": 0.2626398205757141, "learning_rate": 6.654934250208198e-05, "loss": 0.0236, "step": 33840 }, { "grad_norm": 0.27121472358703613, "learning_rate": 6.65298376812528e-05, "loss": 0.0297, "step": 33850 }, { "grad_norm": 0.2501174211502075, "learning_rate": 6.651033003592646e-05, "loss": 0.0254, "step": 33860 }, { "grad_norm": 0.3505308926105499, "learning_rate": 6.649081956943626e-05, "loss": 0.0218, "step": 33870 }, { "grad_norm": 0.5404722690582275, "learning_rate": 6.647130628511604e-05, "loss": 0.028, "step": 33880 }, { "grad_norm": 0.31089094281196594, "learning_rate": 6.645179018630005e-05, "loss": 0.0247, "step": 33890 }, { "grad_norm": 0.5932881832122803, "learning_rate": 6.643227127632309e-05, "loss": 0.0292, "step": 33900 }, { "grad_norm": 0.38012000918388367, "learning_rate": 6.641274955852038e-05, "loss": 0.0338, "step": 33910 }, { "grad_norm": 0.526625394821167, "learning_rate": 6.639322503622768e-05, "loss": 0.0514, "step": 33920 }, { "grad_norm": 0.6367372274398804, "learning_rate": 6.637369771278116e-05, "loss": 0.0417, "step": 33930 }, { "grad_norm": 0.2652965784072876, "learning_rate": 6.635416759151751e-05, "loss": 0.0333, "step": 33940 }, { "grad_norm": 0.33642831444740295, "learning_rate": 6.633463467577394e-05, "loss": 0.0482, "step": 33950 }, { "grad_norm": 0.3794395327568054, "learning_rate": 6.631509896888803e-05, "loss": 0.0384, "step": 33960 }, { "grad_norm": 0.32070618867874146, "learning_rate": 6.629556047419794e-05, "loss": 0.0312, "step": 33970 }, { "grad_norm": 0.44042065739631653, "learning_rate": 6.627601919504223e-05, "loss": 0.0282, "step": 33980 }, { "grad_norm": 0.3883326053619385, "learning_rate": 6.625647513476001e-05, "loss": 0.029, "step": 33990 }, { "grad_norm": 0.5166831612586975, "learning_rate": 6.62369282966908e-05, "loss": 0.0432, "step": 34000 }, { "grad_norm": 0.3936937749385834, "learning_rate": 6.621737868417464e-05, "loss": 0.0365, "step": 34010 }, { "grad_norm": 0.30831989645957947, "learning_rate": 6.619782630055198e-05, "loss": 0.0311, "step": 34020 }, { "grad_norm": 0.32267555594444275, "learning_rate": 6.617827114916382e-05, "loss": 0.0303, "step": 34030 }, { "grad_norm": 0.2108050286769867, "learning_rate": 6.615871323335161e-05, "loss": 0.0263, "step": 34040 }, { "grad_norm": 0.28836384415626526, "learning_rate": 6.613915255645725e-05, "loss": 0.0304, "step": 34050 }, { "grad_norm": 0.406087189912796, "learning_rate": 6.611958912182312e-05, "loss": 0.0254, "step": 34060 }, { "grad_norm": 0.3164714276790619, "learning_rate": 6.610002293279207e-05, "loss": 0.0294, "step": 34070 }, { "grad_norm": 0.25576916337013245, "learning_rate": 6.608045399270746e-05, "loss": 0.0229, "step": 34080 }, { "grad_norm": 0.4919949769973755, "learning_rate": 6.606088230491304e-05, "loss": 0.0264, "step": 34090 }, { "grad_norm": 0.26354050636291504, "learning_rate": 6.604130787275312e-05, "loss": 0.0257, "step": 34100 }, { "grad_norm": 0.2549012303352356, "learning_rate": 6.602173069957242e-05, "loss": 0.0269, "step": 34110 }, { "grad_norm": 0.2814289331436157, "learning_rate": 6.600215078871612e-05, "loss": 0.0278, "step": 34120 }, { "grad_norm": 0.3021193742752075, "learning_rate": 6.598256814352992e-05, "loss": 0.0242, "step": 34130 }, { "grad_norm": 0.36073246598243713, "learning_rate": 6.596298276735995e-05, "loss": 0.0255, "step": 34140 }, { "grad_norm": 0.3309273421764374, "learning_rate": 6.594339466355282e-05, "loss": 0.0277, "step": 34150 }, { "grad_norm": 0.34328532218933105, "learning_rate": 6.592380383545558e-05, "loss": 0.0369, "step": 34160 }, { "grad_norm": 0.38292962312698364, "learning_rate": 6.590421028641577e-05, "loss": 0.0338, "step": 34170 }, { "grad_norm": 0.34086790680885315, "learning_rate": 6.588461401978143e-05, "loss": 0.0315, "step": 34180 }, { "grad_norm": 0.4680657982826233, "learning_rate": 6.586501503890099e-05, "loss": 0.0298, "step": 34190 }, { "grad_norm": 0.3460874855518341, "learning_rate": 6.584541334712338e-05, "loss": 0.0272, "step": 34200 }, { "grad_norm": 0.3589758276939392, "learning_rate": 6.582580894779802e-05, "loss": 0.0289, "step": 34210 }, { "grad_norm": 0.3933168649673462, "learning_rate": 6.580620184427473e-05, "loss": 0.0249, "step": 34220 }, { "grad_norm": 0.2529889643192291, "learning_rate": 6.578659203990385e-05, "loss": 0.0278, "step": 34230 }, { "grad_norm": 0.3648965060710907, "learning_rate": 6.576697953803615e-05, "loss": 0.028, "step": 34240 }, { "grad_norm": 0.23162388801574707, "learning_rate": 6.57473643420229e-05, "loss": 0.0294, "step": 34250 }, { "grad_norm": 0.26089125871658325, "learning_rate": 6.572774645521574e-05, "loss": 0.0299, "step": 34260 }, { "grad_norm": 0.48914211988449097, "learning_rate": 6.570812588096688e-05, "loss": 0.0373, "step": 34270 }, { "grad_norm": 0.30789315700531006, "learning_rate": 6.568850262262893e-05, "loss": 0.0267, "step": 34280 }, { "grad_norm": 0.3613687753677368, "learning_rate": 6.566887668355497e-05, "loss": 0.0254, "step": 34290 }, { "grad_norm": 0.4339894652366638, "learning_rate": 6.564924806709851e-05, "loss": 0.0229, "step": 34300 }, { "grad_norm": 0.3307172656059265, "learning_rate": 6.562961677661359e-05, "loss": 0.0344, "step": 34310 }, { "grad_norm": 0.4365094006061554, "learning_rate": 6.56099828154546e-05, "loss": 0.0312, "step": 34320 }, { "grad_norm": 0.32364487648010254, "learning_rate": 6.55903461869765e-05, "loss": 0.0252, "step": 34330 }, { "grad_norm": 0.21418093144893646, "learning_rate": 6.557070689453465e-05, "loss": 0.0249, "step": 34340 }, { "grad_norm": 0.18915054202079773, "learning_rate": 6.555106494148482e-05, "loss": 0.0351, "step": 34350 }, { "grad_norm": 0.3527035117149353, "learning_rate": 6.553142033118333e-05, "loss": 0.0273, "step": 34360 }, { "grad_norm": 0.3650072515010834, "learning_rate": 6.551177306698688e-05, "loss": 0.0407, "step": 34370 }, { "grad_norm": 0.42612743377685547, "learning_rate": 6.549212315225267e-05, "loss": 0.0281, "step": 34380 }, { "grad_norm": 0.33662018179893494, "learning_rate": 6.547247059033833e-05, "loss": 0.0309, "step": 34390 }, { "grad_norm": 0.3717235326766968, "learning_rate": 6.545281538460193e-05, "loss": 0.0294, "step": 34400 }, { "grad_norm": 0.2672235071659088, "learning_rate": 6.543315753840202e-05, "loss": 0.0273, "step": 34410 }, { "grad_norm": 0.251592218875885, "learning_rate": 6.541349705509758e-05, "loss": 0.0256, "step": 34420 }, { "grad_norm": 0.32917073369026184, "learning_rate": 6.539383393804805e-05, "loss": 0.0281, "step": 34430 }, { "grad_norm": 0.5394722819328308, "learning_rate": 6.537416819061333e-05, "loss": 0.0337, "step": 34440 }, { "grad_norm": 0.31090039014816284, "learning_rate": 6.535449981615375e-05, "loss": 0.0314, "step": 34450 }, { "grad_norm": 0.32147806882858276, "learning_rate": 6.53348288180301e-05, "loss": 0.0277, "step": 34460 }, { "grad_norm": 0.39478376507759094, "learning_rate": 6.531515519960361e-05, "loss": 0.0347, "step": 34470 }, { "grad_norm": 0.27607014775276184, "learning_rate": 6.529547896423597e-05, "loss": 0.0263, "step": 34480 }, { "grad_norm": 0.8129873871803284, "learning_rate": 6.52758001152893e-05, "loss": 0.0352, "step": 34490 }, { "grad_norm": 0.26332440972328186, "learning_rate": 6.525611865612618e-05, "loss": 0.0258, "step": 34500 }, { "grad_norm": 0.41834697127342224, "learning_rate": 6.523643459010966e-05, "loss": 0.0328, "step": 34510 }, { "grad_norm": 0.31826257705688477, "learning_rate": 6.521674792060317e-05, "loss": 0.0314, "step": 34520 }, { "grad_norm": 0.3717917203903198, "learning_rate": 6.519705865097063e-05, "loss": 0.0284, "step": 34530 }, { "grad_norm": 0.4179988503456116, "learning_rate": 6.517736678457641e-05, "loss": 0.0305, "step": 34540 }, { "grad_norm": 0.4135383665561676, "learning_rate": 6.515767232478534e-05, "loss": 0.031, "step": 34550 }, { "grad_norm": 0.2649118900299072, "learning_rate": 6.51379752749626e-05, "loss": 0.0231, "step": 34560 }, { "grad_norm": 0.33390623331069946, "learning_rate": 6.511827563847393e-05, "loss": 0.0315, "step": 34570 }, { "grad_norm": 0.34948021173477173, "learning_rate": 6.509857341868542e-05, "loss": 0.0313, "step": 34580 }, { "grad_norm": 0.39213940501213074, "learning_rate": 6.507886861896367e-05, "loss": 0.03, "step": 34590 }, { "grad_norm": 0.360895574092865, "learning_rate": 6.505916124267567e-05, "loss": 0.0371, "step": 34600 }, { "grad_norm": 0.39404627680778503, "learning_rate": 6.503945129318891e-05, "loss": 0.031, "step": 34610 }, { "grad_norm": 0.5413391590118408, "learning_rate": 6.501973877387122e-05, "loss": 0.0261, "step": 34620 }, { "grad_norm": 0.5369406342506409, "learning_rate": 6.500002368809098e-05, "loss": 0.0303, "step": 34630 }, { "grad_norm": 0.43894821405410767, "learning_rate": 6.498030603921694e-05, "loss": 0.0331, "step": 34640 }, { "grad_norm": 0.3754022717475891, "learning_rate": 6.496058583061832e-05, "loss": 0.0308, "step": 34650 }, { "grad_norm": 0.40075042843818665, "learning_rate": 6.494086306566475e-05, "loss": 0.025, "step": 34660 }, { "grad_norm": 0.3033559024333954, "learning_rate": 6.492113774772632e-05, "loss": 0.0293, "step": 34670 }, { "grad_norm": 0.5776040554046631, "learning_rate": 6.490140988017354e-05, "loss": 0.0316, "step": 34680 }, { "grad_norm": 0.6258729696273804, "learning_rate": 6.488167946637736e-05, "loss": 0.0364, "step": 34690 }, { "grad_norm": 0.3188856840133667, "learning_rate": 6.486194650970915e-05, "loss": 0.0313, "step": 34700 }, { "grad_norm": 0.5411329865455627, "learning_rate": 6.48422110135408e-05, "loss": 0.0367, "step": 34710 }, { "grad_norm": 0.2379775196313858, "learning_rate": 6.482247298124451e-05, "loss": 0.0333, "step": 34720 }, { "grad_norm": 0.20005373656749725, "learning_rate": 6.480273241619297e-05, "loss": 0.0279, "step": 34730 }, { "grad_norm": 0.4022268056869507, "learning_rate": 6.478298932175933e-05, "loss": 0.0274, "step": 34740 }, { "grad_norm": 0.3655270040035248, "learning_rate": 6.476324370131712e-05, "loss": 0.0326, "step": 34750 }, { "grad_norm": 0.3323074281215668, "learning_rate": 6.474349555824036e-05, "loss": 0.0381, "step": 34760 }, { "grad_norm": 0.3191341161727905, "learning_rate": 6.472374489590342e-05, "loss": 0.0262, "step": 34770 }, { "grad_norm": 0.18315407633781433, "learning_rate": 6.470399171768118e-05, "loss": 0.0439, "step": 34780 }, { "grad_norm": 0.28710103034973145, "learning_rate": 6.468423602694891e-05, "loss": 0.0251, "step": 34790 }, { "grad_norm": 0.25281625986099243, "learning_rate": 6.466447782708232e-05, "loss": 0.0274, "step": 34800 }, { "grad_norm": 0.23383338749408722, "learning_rate": 6.464471712145754e-05, "loss": 0.0243, "step": 34810 }, { "grad_norm": 0.24666596949100494, "learning_rate": 6.462495391345114e-05, "loss": 0.0257, "step": 34820 }, { "grad_norm": 0.5583727955818176, "learning_rate": 6.46051882064401e-05, "loss": 0.0256, "step": 34830 }, { "grad_norm": 0.37385448813438416, "learning_rate": 6.458542000380186e-05, "loss": 0.0271, "step": 34840 }, { "grad_norm": 0.3545275032520294, "learning_rate": 6.456564930891424e-05, "loss": 0.0265, "step": 34850 }, { "grad_norm": 0.39508482813835144, "learning_rate": 6.454587612515555e-05, "loss": 0.035, "step": 34860 }, { "grad_norm": 0.21243983507156372, "learning_rate": 6.452610045590444e-05, "loss": 0.0295, "step": 34870 }, { "grad_norm": 0.47432592511177063, "learning_rate": 6.450632230454005e-05, "loss": 0.0302, "step": 34880 }, { "grad_norm": 0.3192518651485443, "learning_rate": 6.448654167444195e-05, "loss": 0.0369, "step": 34890 }, { "grad_norm": 0.33597394824028015, "learning_rate": 6.446675856899005e-05, "loss": 0.0235, "step": 34900 }, { "grad_norm": 0.22078284621238708, "learning_rate": 6.444697299156481e-05, "loss": 0.0267, "step": 34910 }, { "grad_norm": 0.3301418423652649, "learning_rate": 6.442718494554701e-05, "loss": 0.034, "step": 34920 }, { "grad_norm": 0.36321988701820374, "learning_rate": 6.440739443431787e-05, "loss": 0.0234, "step": 34930 }, { "grad_norm": 0.2055717408657074, "learning_rate": 6.438760146125906e-05, "loss": 0.0201, "step": 34940 }, { "grad_norm": 0.36506107449531555, "learning_rate": 6.436780602975267e-05, "loss": 0.0323, "step": 34950 }, { "grad_norm": 0.3523676097393036, "learning_rate": 6.43480081431812e-05, "loss": 0.0313, "step": 34960 }, { "grad_norm": 0.3094734251499176, "learning_rate": 6.432820780492756e-05, "loss": 0.0327, "step": 34970 }, { "grad_norm": 0.32352083921432495, "learning_rate": 6.430840501837506e-05, "loss": 0.0321, "step": 34980 }, { "grad_norm": 0.32122668623924255, "learning_rate": 6.428859978690748e-05, "loss": 0.0317, "step": 34990 }, { "grad_norm": 0.3777931332588196, "learning_rate": 6.426879211390901e-05, "loss": 0.0238, "step": 35000 }, { "grad_norm": 1.0015151500701904, "learning_rate": 6.424898200276422e-05, "loss": 0.0315, "step": 35010 }, { "grad_norm": 0.4598880708217621, "learning_rate": 6.42291694568581e-05, "loss": 0.0278, "step": 35020 }, { "grad_norm": 0.44275394082069397, "learning_rate": 6.42093544795761e-05, "loss": 0.0386, "step": 35030 }, { "grad_norm": 0.6386398077011108, "learning_rate": 6.418953707430403e-05, "loss": 0.0343, "step": 35040 }, { "grad_norm": 0.8441705703735352, "learning_rate": 6.416971724442819e-05, "loss": 0.0301, "step": 35050 }, { "grad_norm": 0.25269055366516113, "learning_rate": 6.414989499333519e-05, "loss": 0.024, "step": 35060 }, { "grad_norm": 0.3050679564476013, "learning_rate": 6.413007032441214e-05, "loss": 0.0275, "step": 35070 }, { "grad_norm": 0.6259297132492065, "learning_rate": 6.411024324104653e-05, "loss": 0.0341, "step": 35080 }, { "grad_norm": 0.2958106994628906, "learning_rate": 6.409041374662628e-05, "loss": 0.0252, "step": 35090 }, { "grad_norm": 0.3536859452724457, "learning_rate": 6.407058184453967e-05, "loss": 0.0357, "step": 35100 }, { "grad_norm": 0.44957393407821655, "learning_rate": 6.405074753817548e-05, "loss": 0.0277, "step": 35110 }, { "grad_norm": 0.40564459562301636, "learning_rate": 6.40309108309228e-05, "loss": 0.0322, "step": 35120 }, { "grad_norm": 0.33156895637512207, "learning_rate": 6.401107172617122e-05, "loss": 0.0267, "step": 35130 }, { "grad_norm": 0.3430612087249756, "learning_rate": 6.399123022731068e-05, "loss": 0.0448, "step": 35140 }, { "grad_norm": 0.4556727111339569, "learning_rate": 6.397138633773157e-05, "loss": 0.0268, "step": 35150 }, { "grad_norm": 0.39984583854675293, "learning_rate": 6.395154006082463e-05, "loss": 0.025, "step": 35160 }, { "grad_norm": 0.3775705099105835, "learning_rate": 6.393169139998109e-05, "loss": 0.0274, "step": 35170 }, { "grad_norm": 0.3247458338737488, "learning_rate": 6.39118403585925e-05, "loss": 0.0257, "step": 35180 }, { "grad_norm": 0.308506041765213, "learning_rate": 6.38919869400509e-05, "loss": 0.0272, "step": 35190 }, { "grad_norm": 0.3246496021747589, "learning_rate": 6.387213114774865e-05, "loss": 0.0303, "step": 35200 }, { "grad_norm": 0.32023775577545166, "learning_rate": 6.385227298507863e-05, "loss": 0.052, "step": 35210 }, { "grad_norm": 0.35166487097740173, "learning_rate": 6.3832412455434e-05, "loss": 0.0217, "step": 35220 }, { "grad_norm": 0.4007161855697632, "learning_rate": 6.381254956220841e-05, "loss": 0.0262, "step": 35230 }, { "grad_norm": 0.28429678082466125, "learning_rate": 6.379268430879586e-05, "loss": 0.0296, "step": 35240 }, { "grad_norm": 0.3055553436279297, "learning_rate": 6.37728166985908e-05, "loss": 0.0227, "step": 35250 }, { "grad_norm": 0.39492088556289673, "learning_rate": 6.375294673498804e-05, "loss": 0.0237, "step": 35260 }, { "grad_norm": 0.30364134907722473, "learning_rate": 6.373307442138284e-05, "loss": 0.0286, "step": 35270 }, { "grad_norm": 0.6093922853469849, "learning_rate": 6.371319976117081e-05, "loss": 0.0393, "step": 35280 }, { "grad_norm": 0.3311797082424164, "learning_rate": 6.3693322757748e-05, "loss": 0.0287, "step": 35290 }, { "grad_norm": 0.6272584199905396, "learning_rate": 6.367344341451086e-05, "loss": 0.0331, "step": 35300 }, { "grad_norm": 0.304344117641449, "learning_rate": 6.36535617348562e-05, "loss": 0.0267, "step": 35310 }, { "grad_norm": 0.32795190811157227, "learning_rate": 6.363367772218128e-05, "loss": 0.0324, "step": 35320 }, { "grad_norm": 0.3764305114746094, "learning_rate": 6.36137913798837e-05, "loss": 0.0286, "step": 35330 }, { "grad_norm": 0.30125507712364197, "learning_rate": 6.359390271136151e-05, "loss": 0.0268, "step": 35340 }, { "grad_norm": 0.3433741331100464, "learning_rate": 6.357401172001314e-05, "loss": 0.0285, "step": 35350 }, { "grad_norm": 0.3597528636455536, "learning_rate": 6.355411840923742e-05, "loss": 0.0309, "step": 35360 }, { "grad_norm": 0.37608128786087036, "learning_rate": 6.353422278243358e-05, "loss": 0.0245, "step": 35370 }, { "grad_norm": 0.41262736916542053, "learning_rate": 6.351432484300121e-05, "loss": 0.0331, "step": 35380 }, { "grad_norm": 0.39298152923583984, "learning_rate": 6.349442459434036e-05, "loss": 0.0336, "step": 35390 }, { "grad_norm": 0.3674567937850952, "learning_rate": 6.34745220398514e-05, "loss": 0.0332, "step": 35400 }, { "grad_norm": 0.3484545350074768, "learning_rate": 6.345461718293518e-05, "loss": 0.0244, "step": 35410 }, { "grad_norm": 0.3528917133808136, "learning_rate": 6.343471002699286e-05, "loss": 0.0359, "step": 35420 }, { "grad_norm": 0.3855854868888855, "learning_rate": 6.341480057542602e-05, "loss": 0.0351, "step": 35430 }, { "grad_norm": 0.3041292726993561, "learning_rate": 6.339488883163667e-05, "loss": 0.0263, "step": 35440 }, { "grad_norm": 0.2915996313095093, "learning_rate": 6.337497479902716e-05, "loss": 0.0252, "step": 35450 }, { "grad_norm": 0.33996954560279846, "learning_rate": 6.335505848100027e-05, "loss": 0.0279, "step": 35460 }, { "grad_norm": 0.33962303400039673, "learning_rate": 6.333513988095915e-05, "loss": 0.0234, "step": 35470 }, { "grad_norm": 0.3920212388038635, "learning_rate": 6.331521900230735e-05, "loss": 0.0256, "step": 35480 }, { "grad_norm": 0.37314435839653015, "learning_rate": 6.329529584844878e-05, "loss": 0.0241, "step": 35490 }, { "grad_norm": 0.3725551962852478, "learning_rate": 6.327537042278777e-05, "loss": 0.0418, "step": 35500 }, { "grad_norm": 0.4710610806941986, "learning_rate": 6.325544272872905e-05, "loss": 0.0289, "step": 35510 }, { "grad_norm": 0.27936238050460815, "learning_rate": 6.323551276967771e-05, "loss": 0.0323, "step": 35520 }, { "grad_norm": 0.736513614654541, "learning_rate": 6.321558054903922e-05, "loss": 0.0274, "step": 35530 }, { "grad_norm": 0.23184457421302795, "learning_rate": 6.319564607021947e-05, "loss": 0.0221, "step": 35540 }, { "grad_norm": 0.6529484391212463, "learning_rate": 6.31757093366247e-05, "loss": 0.0309, "step": 35550 }, { "grad_norm": 0.6541072726249695, "learning_rate": 6.315577035166154e-05, "loss": 0.0327, "step": 35560 }, { "grad_norm": 0.28793975710868835, "learning_rate": 6.313582911873708e-05, "loss": 0.0251, "step": 35570 }, { "grad_norm": 0.30274397134780884, "learning_rate": 6.311588564125865e-05, "loss": 0.024, "step": 35580 }, { "grad_norm": 0.4245603084564209, "learning_rate": 6.30959399226341e-05, "loss": 0.0324, "step": 35590 }, { "grad_norm": 0.3152540326118469, "learning_rate": 6.30759919662716e-05, "loss": 0.0391, "step": 35600 }, { "grad_norm": 0.384600967168808, "learning_rate": 6.30560417755797e-05, "loss": 0.0273, "step": 35610 }, { "grad_norm": 0.2826300859451294, "learning_rate": 6.303608935396735e-05, "loss": 0.0224, "step": 35620 }, { "grad_norm": 0.28739216923713684, "learning_rate": 6.301613470484386e-05, "loss": 0.0261, "step": 35630 }, { "grad_norm": 0.3723272681236267, "learning_rate": 6.299617783161893e-05, "loss": 0.0302, "step": 35640 }, { "grad_norm": 0.21705707907676697, "learning_rate": 6.297621873770266e-05, "loss": 0.0247, "step": 35650 }, { "grad_norm": 0.2830166816711426, "learning_rate": 6.29562574265055e-05, "loss": 0.0263, "step": 35660 }, { "grad_norm": 0.3021983802318573, "learning_rate": 6.293629390143834e-05, "loss": 0.0281, "step": 35670 }, { "grad_norm": 0.2356579750776291, "learning_rate": 6.291632816591232e-05, "loss": 0.0186, "step": 35680 }, { "grad_norm": 0.39875808358192444, "learning_rate": 6.28963602233391e-05, "loss": 0.0277, "step": 35690 }, { "grad_norm": 0.3780891001224518, "learning_rate": 6.287639007713062e-05, "loss": 0.0318, "step": 35700 }, { "grad_norm": 0.5565645694732666, "learning_rate": 6.285641773069926e-05, "loss": 0.0331, "step": 35710 }, { "grad_norm": 0.36851346492767334, "learning_rate": 6.283644318745773e-05, "loss": 0.0279, "step": 35720 }, { "grad_norm": 0.31936565041542053, "learning_rate": 6.281646645081912e-05, "loss": 0.0306, "step": 35730 }, { "grad_norm": 0.4774877429008484, "learning_rate": 6.279648752419693e-05, "loss": 0.0263, "step": 35740 }, { "grad_norm": 0.27433261275291443, "learning_rate": 6.2776506411005e-05, "loss": 0.0254, "step": 35750 }, { "grad_norm": 0.24734443426132202, "learning_rate": 6.275652311465758e-05, "loss": 0.0289, "step": 35760 }, { "grad_norm": 0.2761479318141937, "learning_rate": 6.273653763856926e-05, "loss": 0.0264, "step": 35770 }, { "grad_norm": 0.3988555371761322, "learning_rate": 6.271654998615501e-05, "loss": 0.0287, "step": 35780 }, { "grad_norm": 0.3195246756076813, "learning_rate": 6.269656016083013e-05, "loss": 0.0314, "step": 35790 }, { "grad_norm": 0.25714555382728577, "learning_rate": 6.267656816601038e-05, "loss": 0.0296, "step": 35800 }, { "grad_norm": 0.20410709083080292, "learning_rate": 6.265657400511185e-05, "loss": 0.0288, "step": 35810 }, { "grad_norm": 0.3095223605632782, "learning_rate": 6.263657768155098e-05, "loss": 0.0305, "step": 35820 }, { "grad_norm": 0.317588210105896, "learning_rate": 6.261657919874457e-05, "loss": 0.0252, "step": 35830 }, { "grad_norm": 0.3086235225200653, "learning_rate": 6.259657856010986e-05, "loss": 0.0404, "step": 35840 }, { "grad_norm": 0.42404982447624207, "learning_rate": 6.257657576906439e-05, "loss": 0.0296, "step": 35850 }, { "grad_norm": 0.343925803899765, "learning_rate": 6.255657082902609e-05, "loss": 0.026, "step": 35860 }, { "grad_norm": 0.2600584626197815, "learning_rate": 6.253656374341325e-05, "loss": 0.022, "step": 35870 }, { "grad_norm": 0.34865128993988037, "learning_rate": 6.251655451564457e-05, "loss": 0.0279, "step": 35880 }, { "grad_norm": 0.3276077210903168, "learning_rate": 6.249654314913902e-05, "loss": 0.024, "step": 35890 }, { "grad_norm": 0.29312264919281006, "learning_rate": 6.247652964731604e-05, "loss": 0.0271, "step": 35900 }, { "grad_norm": 0.25538551807403564, "learning_rate": 6.245651401359537e-05, "loss": 0.0404, "step": 35910 }, { "grad_norm": 0.30363187193870544, "learning_rate": 6.243649625139715e-05, "loss": 0.0277, "step": 35920 }, { "grad_norm": 0.3196888864040375, "learning_rate": 6.241647636414185e-05, "loss": 0.03, "step": 35930 }, { "grad_norm": 0.37457844614982605, "learning_rate": 6.239645435525034e-05, "loss": 0.0362, "step": 35940 }, { "grad_norm": 0.2807179391384125, "learning_rate": 6.237643022814381e-05, "loss": 0.0311, "step": 35950 }, { "grad_norm": 0.42192748188972473, "learning_rate": 6.235640398624386e-05, "loss": 0.0291, "step": 35960 }, { "grad_norm": 0.33954378962516785, "learning_rate": 6.233637563297243e-05, "loss": 0.0344, "step": 35970 }, { "grad_norm": 0.40478596091270447, "learning_rate": 6.23163451717518e-05, "loss": 0.0326, "step": 35980 }, { "grad_norm": 0.4446772038936615, "learning_rate": 6.229631260600463e-05, "loss": 0.0273, "step": 35990 }, { "grad_norm": 0.27307072281837463, "learning_rate": 6.227627793915392e-05, "loss": 0.0321, "step": 36000 }, { "grad_norm": 0.25563713908195496, "learning_rate": 6.225624117462309e-05, "loss": 0.0277, "step": 36010 }, { "grad_norm": 0.3940165340900421, "learning_rate": 6.223620231583586e-05, "loss": 0.031, "step": 36020 }, { "grad_norm": 0.5681398510932922, "learning_rate": 6.221616136621629e-05, "loss": 0.0273, "step": 36030 }, { "grad_norm": 0.3130367696285248, "learning_rate": 6.219611832918887e-05, "loss": 0.0305, "step": 36040 }, { "grad_norm": 0.31107231974601746, "learning_rate": 6.217607320817838e-05, "loss": 0.0274, "step": 36050 }, { "grad_norm": 0.3797638416290283, "learning_rate": 6.215602600661001e-05, "loss": 0.0254, "step": 36060 }, { "grad_norm": 0.29374226927757263, "learning_rate": 6.213597672790925e-05, "loss": 0.0265, "step": 36070 }, { "grad_norm": 0.2761935591697693, "learning_rate": 6.2115925375502e-05, "loss": 0.024, "step": 36080 }, { "grad_norm": 0.37053295969963074, "learning_rate": 6.209587195281447e-05, "loss": 0.0287, "step": 36090 }, { "grad_norm": 0.347003310918808, "learning_rate": 6.207581646327324e-05, "loss": 0.0343, "step": 36100 }, { "grad_norm": 0.27851471304893494, "learning_rate": 6.205575891030526e-05, "loss": 0.0215, "step": 36110 }, { "grad_norm": 0.2956666648387909, "learning_rate": 6.203569929733781e-05, "loss": 0.0224, "step": 36120 }, { "grad_norm": 0.35374608635902405, "learning_rate": 6.201563762779852e-05, "loss": 0.0242, "step": 36130 }, { "grad_norm": 0.3538194000720978, "learning_rate": 6.199557390511538e-05, "loss": 0.0211, "step": 36140 }, { "grad_norm": 0.3325132429599762, "learning_rate": 6.197550813271675e-05, "loss": 0.0257, "step": 36150 }, { "grad_norm": 0.2723730504512787, "learning_rate": 6.195544031403131e-05, "loss": 0.025, "step": 36160 }, { "grad_norm": 0.44124576449394226, "learning_rate": 6.19353704524881e-05, "loss": 0.0266, "step": 36170 }, { "grad_norm": 0.3006921708583832, "learning_rate": 6.191529855151652e-05, "loss": 0.024, "step": 36180 }, { "grad_norm": 0.27245843410491943, "learning_rate": 6.189522461454629e-05, "loss": 0.03, "step": 36190 }, { "grad_norm": 0.3314477503299713, "learning_rate": 6.187514864500752e-05, "loss": 0.03, "step": 36200 }, { "grad_norm": 0.4611988663673401, "learning_rate": 6.185507064633062e-05, "loss": 0.0254, "step": 36210 }, { "grad_norm": 0.3525920808315277, "learning_rate": 6.18349906219464e-05, "loss": 0.0293, "step": 36220 }, { "grad_norm": 0.32538625597953796, "learning_rate": 6.181490857528596e-05, "loss": 0.0228, "step": 36230 }, { "grad_norm": 0.2562819719314575, "learning_rate": 6.179482450978077e-05, "loss": 0.0175, "step": 36240 }, { "grad_norm": 0.26437726616859436, "learning_rate": 6.177473842886269e-05, "loss": 0.0244, "step": 36250 }, { "grad_norm": 0.45517265796661377, "learning_rate": 6.175465033596382e-05, "loss": 0.0325, "step": 36260 }, { "grad_norm": 0.26307418942451477, "learning_rate": 6.173456023451671e-05, "loss": 0.0259, "step": 36270 }, { "grad_norm": 0.26284968852996826, "learning_rate": 6.171446812795422e-05, "loss": 0.0293, "step": 36280 }, { "grad_norm": 0.4588093161582947, "learning_rate": 6.169437401970949e-05, "loss": 0.0358, "step": 36290 }, { "grad_norm": 0.42043909430503845, "learning_rate": 6.16742779132161e-05, "loss": 0.0271, "step": 36300 }, { "grad_norm": 0.3484369218349457, "learning_rate": 6.165417981190789e-05, "loss": 0.0361, "step": 36310 }, { "grad_norm": 0.2738793194293976, "learning_rate": 6.16340797192191e-05, "loss": 0.0293, "step": 36320 }, { "grad_norm": 0.34755992889404297, "learning_rate": 6.161397763858427e-05, "loss": 0.0413, "step": 36330 }, { "grad_norm": 0.3036898672580719, "learning_rate": 6.159387357343834e-05, "loss": 0.028, "step": 36340 }, { "grad_norm": 0.2890717387199402, "learning_rate": 6.157376752721648e-05, "loss": 0.0384, "step": 36350 }, { "grad_norm": 0.3858796954154968, "learning_rate": 6.155365950335428e-05, "loss": 0.0304, "step": 36360 }, { "grad_norm": 0.46949851512908936, "learning_rate": 6.153354950528768e-05, "loss": 0.0336, "step": 36370 }, { "grad_norm": 0.4041706323623657, "learning_rate": 6.151343753645293e-05, "loss": 0.0344, "step": 36380 }, { "grad_norm": 0.3021923005580902, "learning_rate": 6.149332360028657e-05, "loss": 0.0307, "step": 36390 }, { "grad_norm": 0.3250735104084015, "learning_rate": 6.147320770022555e-05, "loss": 0.0329, "step": 36400 }, { "grad_norm": 0.2312266081571579, "learning_rate": 6.145308983970715e-05, "loss": 0.029, "step": 36410 }, { "grad_norm": 0.3421735167503357, "learning_rate": 6.143297002216892e-05, "loss": 0.0295, "step": 36420 }, { "grad_norm": 0.44054052233695984, "learning_rate": 6.141284825104882e-05, "loss": 0.0297, "step": 36430 }, { "grad_norm": 0.234154611825943, "learning_rate": 6.13927245297851e-05, "loss": 0.0301, "step": 36440 }, { "grad_norm": 0.3197711408138275, "learning_rate": 6.137259886181633e-05, "loss": 0.0306, "step": 36450 }, { "grad_norm": 0.3597415089607239, "learning_rate": 6.135247125058145e-05, "loss": 0.031, "step": 36460 }, { "grad_norm": 0.21740759909152985, "learning_rate": 6.133234169951974e-05, "loss": 0.0234, "step": 36470 }, { "grad_norm": 0.31094199419021606, "learning_rate": 6.131221021207078e-05, "loss": 0.028, "step": 36480 }, { "grad_norm": 0.39844977855682373, "learning_rate": 6.129207679167448e-05, "loss": 0.0239, "step": 36490 }, { "grad_norm": 0.2790948748588562, "learning_rate": 6.127194144177109e-05, "loss": 0.0207, "step": 36500 }, { "grad_norm": 0.3330177664756775, "learning_rate": 6.125180416580118e-05, "loss": 0.0274, "step": 36510 }, { "grad_norm": 0.2987173795700073, "learning_rate": 6.123166496720571e-05, "loss": 0.0292, "step": 36520 }, { "grad_norm": 1.075379490852356, "learning_rate": 6.121152384942588e-05, "loss": 0.0269, "step": 36530 }, { "grad_norm": 0.38238391280174255, "learning_rate": 6.119138081590324e-05, "loss": 0.0276, "step": 36540 }, { "grad_norm": 0.2280697375535965, "learning_rate": 6.117123587007971e-05, "loss": 0.0212, "step": 36550 }, { "grad_norm": 0.34480538964271545, "learning_rate": 6.11510890153975e-05, "loss": 0.0297, "step": 36560 }, { "grad_norm": 0.44447842240333557, "learning_rate": 6.113094025529916e-05, "loss": 0.0249, "step": 36570 }, { "grad_norm": 0.297944575548172, "learning_rate": 6.111078959322757e-05, "loss": 0.0398, "step": 36580 }, { "grad_norm": 0.4161616563796997, "learning_rate": 6.109063703262592e-05, "loss": 0.0399, "step": 36590 }, { "grad_norm": 0.3671628534793854, "learning_rate": 6.107048257693772e-05, "loss": 0.028, "step": 36600 }, { "grad_norm": 0.3117436170578003, "learning_rate": 6.105032622960683e-05, "loss": 0.0315, "step": 36610 }, { "grad_norm": 0.3344254791736603, "learning_rate": 6.103016799407743e-05, "loss": 0.0264, "step": 36620 }, { "grad_norm": 0.35748305916786194, "learning_rate": 6.1010007873793984e-05, "loss": 0.0358, "step": 36630 }, { "grad_norm": 0.297756165266037, "learning_rate": 6.098984587220131e-05, "loss": 0.0266, "step": 36640 }, { "grad_norm": 0.3047441244125366, "learning_rate": 6.096968199274456e-05, "loss": 0.0282, "step": 36650 }, { "grad_norm": 0.3685123324394226, "learning_rate": 6.0949516238869166e-05, "loss": 0.0292, "step": 36660 }, { "grad_norm": 0.35102909803390503, "learning_rate": 6.092934861402092e-05, "loss": 0.0348, "step": 36670 }, { "grad_norm": 0.3924168348312378, "learning_rate": 6.0909179121645924e-05, "loss": 0.0364, "step": 36680 }, { "grad_norm": 0.3743508458137512, "learning_rate": 6.0889007765190576e-05, "loss": 0.0351, "step": 36690 }, { "grad_norm": 0.24095191061496735, "learning_rate": 6.086883454810162e-05, "loss": 0.0274, "step": 36700 }, { "grad_norm": 0.3410160541534424, "learning_rate": 6.0848659473826084e-05, "loss": 0.0273, "step": 36710 }, { "grad_norm": 0.3996411859989166, "learning_rate": 6.082848254581138e-05, "loss": 0.0296, "step": 36720 }, { "grad_norm": 0.3289145231246948, "learning_rate": 6.080830376750517e-05, "loss": 0.0277, "step": 36730 }, { "grad_norm": 0.3272501528263092, "learning_rate": 6.0788123142355445e-05, "loss": 0.0271, "step": 36740 }, { "grad_norm": 0.4352776110172272, "learning_rate": 6.076794067381052e-05, "loss": 0.034, "step": 36750 }, { "grad_norm": 0.5093581676483154, "learning_rate": 6.074775636531905e-05, "loss": 0.0295, "step": 36760 }, { "grad_norm": 0.4912567436695099, "learning_rate": 6.072757022032997e-05, "loss": 0.0344, "step": 36770 }, { "grad_norm": 0.24647963047027588, "learning_rate": 6.070738224229253e-05, "loss": 0.0277, "step": 36780 }, { "grad_norm": 0.2705988883972168, "learning_rate": 6.0687192434656314e-05, "loss": 0.0337, "step": 36790 }, { "grad_norm": 0.28804391622543335, "learning_rate": 6.066700080087121e-05, "loss": 0.0293, "step": 36800 }, { "grad_norm": 0.2585350573062897, "learning_rate": 6.0646807344387424e-05, "loss": 0.0255, "step": 36810 }, { "grad_norm": 0.357103556394577, "learning_rate": 6.062661206865543e-05, "loss": 0.0319, "step": 36820 }, { "grad_norm": 0.2783879041671753, "learning_rate": 6.06064149771261e-05, "loss": 0.0263, "step": 36830 }, { "grad_norm": 0.31365782022476196, "learning_rate": 6.058621607325051e-05, "loss": 0.0264, "step": 36840 }, { "grad_norm": 0.345684677362442, "learning_rate": 6.056601536048014e-05, "loss": 0.0287, "step": 36850 }, { "grad_norm": 0.45079073309898376, "learning_rate": 6.0545812842266725e-05, "loss": 0.0357, "step": 36860 }, { "grad_norm": 0.22109773755073547, "learning_rate": 6.052560852206232e-05, "loss": 0.0274, "step": 36870 }, { "grad_norm": 0.33317309617996216, "learning_rate": 6.05054024033193e-05, "loss": 0.0372, "step": 36880 }, { "grad_norm": 0.32438990473747253, "learning_rate": 6.048519448949032e-05, "loss": 0.0297, "step": 36890 }, { "grad_norm": 0.25281691551208496, "learning_rate": 6.046498478402839e-05, "loss": 0.0268, "step": 36900 }, { "grad_norm": 0.23806814849376678, "learning_rate": 6.044477329038677e-05, "loss": 0.0252, "step": 36910 }, { "grad_norm": 0.3636144697666168, "learning_rate": 6.042456001201906e-05, "loss": 0.0263, "step": 36920 }, { "grad_norm": 0.5067775249481201, "learning_rate": 6.040434495237917e-05, "loss": 0.0274, "step": 36930 }, { "grad_norm": 0.4283745288848877, "learning_rate": 6.0384128114921256e-05, "loss": 0.0256, "step": 36940 }, { "grad_norm": 0.3287353813648224, "learning_rate": 6.036390950309987e-05, "loss": 0.0241, "step": 36950 }, { "grad_norm": 0.36132463812828064, "learning_rate": 6.0343689120369805e-05, "loss": 0.0285, "step": 36960 }, { "grad_norm": 0.37168946862220764, "learning_rate": 6.032346697018616e-05, "loss": 0.0223, "step": 36970 }, { "grad_norm": 0.3629243075847626, "learning_rate": 6.0303243056004375e-05, "loss": 0.0466, "step": 36980 }, { "grad_norm": 0.26764169335365295, "learning_rate": 6.0283017381280136e-05, "loss": 0.0254, "step": 36990 }, { "grad_norm": 0.3767290413379669, "learning_rate": 6.026278994946948e-05, "loss": 0.0234, "step": 37000 }, { "grad_norm": 0.34418022632598877, "learning_rate": 6.02425607640287e-05, "loss": 0.0282, "step": 37010 }, { "grad_norm": 0.6145243644714355, "learning_rate": 6.022232982841441e-05, "loss": 0.0256, "step": 37020 }, { "grad_norm": 0.3110731542110443, "learning_rate": 6.020209714608355e-05, "loss": 0.027, "step": 37030 }, { "grad_norm": 0.3943530023097992, "learning_rate": 6.018186272049331e-05, "loss": 0.0303, "step": 37040 }, { "grad_norm": 0.24464945495128632, "learning_rate": 6.01616265551012e-05, "loss": 0.0272, "step": 37050 }, { "grad_norm": 0.2639109790325165, "learning_rate": 6.014138865336503e-05, "loss": 0.0274, "step": 37060 }, { "grad_norm": 0.2998410761356354, "learning_rate": 6.0121149018742905e-05, "loss": 0.0284, "step": 37070 }, { "grad_norm": 0.39238110184669495, "learning_rate": 6.010090765469325e-05, "loss": 0.0245, "step": 37080 }, { "grad_norm": 0.3123111128807068, "learning_rate": 6.008066456467473e-05, "loss": 0.0255, "step": 37090 }, { "grad_norm": 0.2657341957092285, "learning_rate": 6.0060419752146335e-05, "loss": 0.0275, "step": 37100 }, { "grad_norm": 0.3924385607242584, "learning_rate": 6.0040173220567353e-05, "loss": 0.0208, "step": 37110 }, { "grad_norm": 0.20438314974308014, "learning_rate": 6.001992497339737e-05, "loss": 0.0217, "step": 37120 }, { "grad_norm": 0.28439706563949585, "learning_rate": 5.999967501409626e-05, "loss": 0.0331, "step": 37130 }, { "grad_norm": 0.2261466085910797, "learning_rate": 5.997942334612418e-05, "loss": 0.0224, "step": 37140 }, { "grad_norm": 0.22395998239517212, "learning_rate": 5.995916997294158e-05, "loss": 0.0188, "step": 37150 }, { "grad_norm": 0.26914095878601074, "learning_rate": 5.9938914898009214e-05, "loss": 0.0334, "step": 37160 }, { "grad_norm": 0.3079448938369751, "learning_rate": 5.991865812478813e-05, "loss": 0.023, "step": 37170 }, { "grad_norm": 0.3738947808742523, "learning_rate": 5.989839965673964e-05, "loss": 0.0392, "step": 37180 }, { "grad_norm": 0.2698102593421936, "learning_rate": 5.987813949732539e-05, "loss": 0.0298, "step": 37190 }, { "grad_norm": 0.28227749466896057, "learning_rate": 5.9857877650007255e-05, "loss": 0.0245, "step": 37200 }, { "grad_norm": 0.24742615222930908, "learning_rate": 5.983761411824744e-05, "loss": 0.0292, "step": 37210 }, { "grad_norm": 0.2901344895362854, "learning_rate": 5.981734890550844e-05, "loss": 0.0307, "step": 37220 }, { "grad_norm": 0.4271516501903534, "learning_rate": 5.979708201525301e-05, "loss": 0.0369, "step": 37230 }, { "grad_norm": 0.4472871422767639, "learning_rate": 5.977681345094422e-05, "loss": 0.0319, "step": 37240 }, { "grad_norm": 0.3653191328048706, "learning_rate": 5.97565432160454e-05, "loss": 0.0284, "step": 37250 }, { "grad_norm": 0.27873387932777405, "learning_rate": 5.9736271314020186e-05, "loss": 0.0295, "step": 37260 }, { "grad_norm": 0.3475779891014099, "learning_rate": 5.971599774833251e-05, "loss": 0.0286, "step": 37270 }, { "grad_norm": 0.3276061415672302, "learning_rate": 5.9695722522446525e-05, "loss": 0.0261, "step": 37280 }, { "grad_norm": 0.3584728240966797, "learning_rate": 5.9675445639826765e-05, "loss": 0.0285, "step": 37290 }, { "grad_norm": 0.2439466267824173, "learning_rate": 5.965516710393796e-05, "loss": 0.028, "step": 37300 }, { "grad_norm": 0.28032055497169495, "learning_rate": 5.963488691824516e-05, "loss": 0.0308, "step": 37310 }, { "grad_norm": 0.35033735632896423, "learning_rate": 5.96146050862137e-05, "loss": 0.0376, "step": 37320 }, { "grad_norm": 0.3528103530406952, "learning_rate": 5.959432161130919e-05, "loss": 0.0254, "step": 37330 }, { "grad_norm": 0.29247674345970154, "learning_rate": 5.9574036496997545e-05, "loss": 0.0334, "step": 37340 }, { "grad_norm": 0.3151431977748871, "learning_rate": 5.955374974674488e-05, "loss": 0.0286, "step": 37350 }, { "grad_norm": 0.19523680210113525, "learning_rate": 5.9533461364017696e-05, "loss": 0.0263, "step": 37360 }, { "grad_norm": 0.2776966989040375, "learning_rate": 5.9513171352282716e-05, "loss": 0.024, "step": 37370 }, { "grad_norm": 0.24614644050598145, "learning_rate": 5.949287971500692e-05, "loss": 0.0247, "step": 37380 }, { "grad_norm": 0.2193557769060135, "learning_rate": 5.947258645565762e-05, "loss": 0.0243, "step": 37390 }, { "grad_norm": 0.23369817435741425, "learning_rate": 5.945229157770237e-05, "loss": 0.0251, "step": 37400 }, { "grad_norm": 0.2847188711166382, "learning_rate": 5.9431995084609006e-05, "loss": 0.0277, "step": 37410 }, { "grad_norm": 0.3491547703742981, "learning_rate": 5.941169697984564e-05, "loss": 0.0283, "step": 37420 }, { "grad_norm": 0.4530866742134094, "learning_rate": 5.9391397266880675e-05, "loss": 0.026, "step": 37430 }, { "grad_norm": 0.47578996419906616, "learning_rate": 5.937109594918279e-05, "loss": 0.046, "step": 37440 }, { "grad_norm": 0.3723621368408203, "learning_rate": 5.9350793030220884e-05, "loss": 0.0457, "step": 37450 }, { "grad_norm": 0.3677609860897064, "learning_rate": 5.933048851346421e-05, "loss": 0.0277, "step": 37460 }, { "grad_norm": 0.4042243957519531, "learning_rate": 5.931018240238224e-05, "loss": 0.0335, "step": 37470 }, { "grad_norm": 0.3775276243686676, "learning_rate": 5.928987470044471e-05, "loss": 0.0272, "step": 37480 }, { "grad_norm": 0.4808596074581146, "learning_rate": 5.9269565411121695e-05, "loss": 0.0275, "step": 37490 }, { "grad_norm": 0.25925081968307495, "learning_rate": 5.924925453788347e-05, "loss": 0.0324, "step": 37500 }, { "grad_norm": 0.38154736161231995, "learning_rate": 5.92289420842006e-05, "loss": 0.0288, "step": 37510 }, { "grad_norm": 0.40007147192955017, "learning_rate": 5.9208628053543945e-05, "loss": 0.0325, "step": 37520 }, { "grad_norm": 0.41970130801200867, "learning_rate": 5.918831244938462e-05, "loss": 0.0304, "step": 37530 }, { "grad_norm": 0.3018203675746918, "learning_rate": 5.9167995275194e-05, "loss": 0.0268, "step": 37540 }, { "grad_norm": 0.43105143308639526, "learning_rate": 5.914767653444373e-05, "loss": 0.0287, "step": 37550 }, { "grad_norm": 0.6257772445678711, "learning_rate": 5.912735623060572e-05, "loss": 0.0402, "step": 37560 }, { "grad_norm": 0.5064408779144287, "learning_rate": 5.910703436715217e-05, "loss": 0.029, "step": 37570 }, { "grad_norm": 0.27311816811561584, "learning_rate": 5.908671094755552e-05, "loss": 0.0217, "step": 37580 }, { "grad_norm": 0.4067152142524719, "learning_rate": 5.906638597528851e-05, "loss": 0.024, "step": 37590 }, { "grad_norm": 0.3213956356048584, "learning_rate": 5.9046059453824076e-05, "loss": 0.024, "step": 37600 }, { "grad_norm": 0.3358546495437622, "learning_rate": 5.9025731386635505e-05, "loss": 0.0249, "step": 37610 }, { "grad_norm": 0.29961711168289185, "learning_rate": 5.900540177719629e-05, "loss": 0.0224, "step": 37620 }, { "grad_norm": 0.38197705149650574, "learning_rate": 5.898507062898021e-05, "loss": 0.0323, "step": 37630 }, { "grad_norm": 0.47785481810569763, "learning_rate": 5.8964737945461316e-05, "loss": 0.0231, "step": 37640 }, { "grad_norm": 0.2434448003768921, "learning_rate": 5.8944403730113885e-05, "loss": 0.0253, "step": 37650 }, { "grad_norm": 0.4115777015686035, "learning_rate": 5.892406798641248e-05, "loss": 0.0269, "step": 37660 }, { "grad_norm": 0.6108325123786926, "learning_rate": 5.890373071783193e-05, "loss": 0.0241, "step": 37670 }, { "grad_norm": 0.8097881078720093, "learning_rate": 5.888339192784732e-05, "loss": 0.0381, "step": 37680 }, { "grad_norm": 0.3023432791233063, "learning_rate": 5.8863051619934003e-05, "loss": 0.0253, "step": 37690 }, { "grad_norm": 0.3201882243156433, "learning_rate": 5.8842709797567554e-05, "loss": 0.0281, "step": 37700 }, { "grad_norm": 0.34596407413482666, "learning_rate": 5.8822366464223855e-05, "loss": 0.0312, "step": 37710 }, { "grad_norm": 0.2813735902309418, "learning_rate": 5.880202162337901e-05, "loss": 0.03, "step": 37720 }, { "grad_norm": 0.6467520594596863, "learning_rate": 5.8781675278509405e-05, "loss": 0.0267, "step": 37730 }, { "grad_norm": 0.25843849778175354, "learning_rate": 5.8761327433091696e-05, "loss": 0.0273, "step": 37740 }, { "grad_norm": 0.46915119886398315, "learning_rate": 5.874097809060275e-05, "loss": 0.0363, "step": 37750 }, { "grad_norm": 0.381571501493454, "learning_rate": 5.87206272545197e-05, "loss": 0.0473, "step": 37760 }, { "grad_norm": 0.32803359627723694, "learning_rate": 5.8700274928319955e-05, "loss": 0.0291, "step": 37770 }, { "grad_norm": 0.3716691732406616, "learning_rate": 5.867992111548118e-05, "loss": 0.0306, "step": 37780 }, { "grad_norm": 0.2999662458896637, "learning_rate": 5.865956581948131e-05, "loss": 0.0241, "step": 37790 }, { "grad_norm": 0.28095120191574097, "learning_rate": 5.863920904379845e-05, "loss": 0.0264, "step": 37800 }, { "grad_norm": 0.6155972480773926, "learning_rate": 5.8618850791911064e-05, "loss": 0.0404, "step": 37810 }, { "grad_norm": 0.28913816809654236, "learning_rate": 5.859849106729779e-05, "loss": 0.0288, "step": 37820 }, { "grad_norm": 0.340237557888031, "learning_rate": 5.857812987343758e-05, "loss": 0.0271, "step": 37830 }, { "grad_norm": 0.37168315052986145, "learning_rate": 5.855776721380957e-05, "loss": 0.0278, "step": 37840 }, { "grad_norm": 0.3489270806312561, "learning_rate": 5.8537403091893217e-05, "loss": 0.0276, "step": 37850 }, { "grad_norm": 0.47265079617500305, "learning_rate": 5.851703751116816e-05, "loss": 0.0295, "step": 37860 }, { "grad_norm": 0.2967027723789215, "learning_rate": 5.8496670475114336e-05, "loss": 0.0255, "step": 37870 }, { "grad_norm": 0.3621142506599426, "learning_rate": 5.84763019872119e-05, "loss": 0.0357, "step": 37880 }, { "grad_norm": 0.3219316303730011, "learning_rate": 5.845593205094131e-05, "loss": 0.0268, "step": 37890 }, { "grad_norm": 0.2162763625383377, "learning_rate": 5.843556066978318e-05, "loss": 0.0224, "step": 37900 }, { "grad_norm": 0.21616673469543457, "learning_rate": 5.8415187847218455e-05, "loss": 0.0274, "step": 37910 }, { "grad_norm": 0.3611259162425995, "learning_rate": 5.839481358672827e-05, "loss": 0.0257, "step": 37920 }, { "grad_norm": 0.2642183303833008, "learning_rate": 5.837443789179407e-05, "loss": 0.0268, "step": 37930 }, { "grad_norm": 0.5305295586585999, "learning_rate": 5.8354060765897445e-05, "loss": 0.0267, "step": 37940 }, { "grad_norm": 0.45830726623535156, "learning_rate": 5.8333682212520334e-05, "loss": 0.0416, "step": 37950 }, { "grad_norm": 0.37561625242233276, "learning_rate": 5.831330223514486e-05, "loss": 0.0241, "step": 37960 }, { "grad_norm": 0.32657262682914734, "learning_rate": 5.8292920837253396e-05, "loss": 0.0268, "step": 37970 }, { "grad_norm": 0.3136153817176819, "learning_rate": 5.827253802232857e-05, "loss": 0.0259, "step": 37980 }, { "grad_norm": 0.5455533862113953, "learning_rate": 5.825215379385327e-05, "loss": 0.0245, "step": 37990 }, { "grad_norm": 0.28168970346450806, "learning_rate": 5.823176815531057e-05, "loss": 0.0385, "step": 38000 }, { "grad_norm": 0.23056815564632416, "learning_rate": 5.8211381110183826e-05, "loss": 0.0356, "step": 38010 }, { "grad_norm": 0.2703823745250702, "learning_rate": 5.8190992661956645e-05, "loss": 0.0233, "step": 38020 }, { "grad_norm": 0.290072500705719, "learning_rate": 5.817060281411284e-05, "loss": 0.0294, "step": 38030 }, { "grad_norm": 0.35650429129600525, "learning_rate": 5.815021157013647e-05, "loss": 0.0225, "step": 38040 }, { "grad_norm": 0.3785342276096344, "learning_rate": 5.8129818933511856e-05, "loss": 0.0346, "step": 38050 }, { "grad_norm": 0.5913202166557312, "learning_rate": 5.8109424907723544e-05, "loss": 0.0347, "step": 38060 }, { "grad_norm": 0.2783624231815338, "learning_rate": 5.80890294962563e-05, "loss": 0.0221, "step": 38070 }, { "grad_norm": 0.3490135669708252, "learning_rate": 5.806863270259515e-05, "loss": 0.0289, "step": 38080 }, { "grad_norm": 0.37719935178756714, "learning_rate": 5.804823453022536e-05, "loss": 0.0299, "step": 38090 }, { "grad_norm": 0.3434222638607025, "learning_rate": 5.80278349826324e-05, "loss": 0.0241, "step": 38100 }, { "grad_norm": 0.2972836196422577, "learning_rate": 5.8007434063302014e-05, "loss": 0.0263, "step": 38110 }, { "grad_norm": 0.33082208037376404, "learning_rate": 5.7987031775720136e-05, "loss": 0.031, "step": 38120 }, { "grad_norm": 0.23119769990444183, "learning_rate": 5.7966628123372976e-05, "loss": 0.0381, "step": 38130 }, { "grad_norm": 0.4687909781932831, "learning_rate": 5.7946223109746956e-05, "loss": 0.0356, "step": 38140 }, { "grad_norm": 0.2977171242237091, "learning_rate": 5.7925816738328754e-05, "loss": 0.021, "step": 38150 }, { "grad_norm": 0.3271225690841675, "learning_rate": 5.790540901260521e-05, "loss": 0.027, "step": 38160 }, { "grad_norm": 0.4942810535430908, "learning_rate": 5.788499993606351e-05, "loss": 0.0258, "step": 38170 }, { "grad_norm": 0.46041634678840637, "learning_rate": 5.786458951219096e-05, "loss": 0.0311, "step": 38180 }, { "grad_norm": 0.3077097535133362, "learning_rate": 5.784417774447517e-05, "loss": 0.029, "step": 38190 }, { "grad_norm": 0.332022100687027, "learning_rate": 5.782376463640393e-05, "loss": 0.0258, "step": 38200 }, { "grad_norm": 0.3616830110549927, "learning_rate": 5.780335019146531e-05, "loss": 0.0223, "step": 38210 }, { "grad_norm": 0.3291468918323517, "learning_rate": 5.778293441314755e-05, "loss": 0.0268, "step": 38220 }, { "grad_norm": 0.32451239228248596, "learning_rate": 5.776251730493917e-05, "loss": 0.0348, "step": 38230 }, { "grad_norm": 0.294555127620697, "learning_rate": 5.774209887032887e-05, "loss": 0.0268, "step": 38240 }, { "grad_norm": 0.3415849804878235, "learning_rate": 5.772167911280565e-05, "loss": 0.028, "step": 38250 }, { "grad_norm": 0.2784949839115143, "learning_rate": 5.770125803585864e-05, "loss": 0.0235, "step": 38260 }, { "grad_norm": 0.3099631369113922, "learning_rate": 5.768083564297726e-05, "loss": 0.0285, "step": 38270 }, { "grad_norm": 0.3571709394454956, "learning_rate": 5.766041193765114e-05, "loss": 0.0274, "step": 38280 }, { "grad_norm": 0.6557638049125671, "learning_rate": 5.763998692337015e-05, "loss": 0.0231, "step": 38290 }, { "grad_norm": 0.2922033667564392, "learning_rate": 5.761956060362433e-05, "loss": 0.0298, "step": 38300 }, { "grad_norm": 0.27418211102485657, "learning_rate": 5.7599132981904e-05, "loss": 0.0284, "step": 38310 }, { "grad_norm": 0.28736501932144165, "learning_rate": 5.75787040616997e-05, "loss": 0.0326, "step": 38320 }, { "grad_norm": 0.3322197198867798, "learning_rate": 5.755827384650212e-05, "loss": 0.028, "step": 38330 }, { "grad_norm": 1.3013644218444824, "learning_rate": 5.753784233980228e-05, "loss": 0.0245, "step": 38340 }, { "grad_norm": 0.3221716284751892, "learning_rate": 5.751740954509135e-05, "loss": 0.0235, "step": 38350 }, { "grad_norm": 0.3265976905822754, "learning_rate": 5.7496975465860715e-05, "loss": 0.0215, "step": 38360 }, { "grad_norm": 0.2702547013759613, "learning_rate": 5.747654010560202e-05, "loss": 0.0225, "step": 38370 }, { "grad_norm": 0.4733632504940033, "learning_rate": 5.7456103467807097e-05, "loss": 0.0299, "step": 38380 }, { "grad_norm": 0.42820650339126587, "learning_rate": 5.7435665555968046e-05, "loss": 0.0246, "step": 38390 }, { "grad_norm": 0.27149251103401184, "learning_rate": 5.74152263735771e-05, "loss": 0.0312, "step": 38400 }, { "grad_norm": 0.3666890859603882, "learning_rate": 5.739478592412677e-05, "loss": 0.0279, "step": 38410 }, { "grad_norm": 0.4868658185005188, "learning_rate": 5.7374344211109766e-05, "loss": 0.0301, "step": 38420 }, { "grad_norm": 0.28668978810310364, "learning_rate": 5.735390123801904e-05, "loss": 0.0331, "step": 38430 }, { "grad_norm": 0.3715515434741974, "learning_rate": 5.7333457008347704e-05, "loss": 0.0352, "step": 38440 }, { "grad_norm": 0.3894983232021332, "learning_rate": 5.7313011525589156e-05, "loss": 0.0328, "step": 38450 }, { "grad_norm": 0.25489509105682373, "learning_rate": 5.729256479323694e-05, "loss": 0.0268, "step": 38460 }, { "grad_norm": 0.47269976139068604, "learning_rate": 5.727211681478485e-05, "loss": 0.0257, "step": 38470 }, { "grad_norm": 0.27503105998039246, "learning_rate": 5.7251667593726886e-05, "loss": 0.0356, "step": 38480 }, { "grad_norm": 0.29069551825523376, "learning_rate": 5.723121713355728e-05, "loss": 0.0385, "step": 38490 }, { "grad_norm": 0.30764392018318176, "learning_rate": 5.721076543777044e-05, "loss": 0.0252, "step": 38500 }, { "grad_norm": 0.31786635518074036, "learning_rate": 5.7190312509860986e-05, "loss": 0.0251, "step": 38510 }, { "grad_norm": 1.1391470432281494, "learning_rate": 5.716985835332379e-05, "loss": 0.0317, "step": 38520 }, { "grad_norm": 0.3149075508117676, "learning_rate": 5.714940297165389e-05, "loss": 0.0295, "step": 38530 }, { "grad_norm": 0.3632690906524658, "learning_rate": 5.712894636834656e-05, "loss": 0.0287, "step": 38540 }, { "grad_norm": 0.39163738489151, "learning_rate": 5.7108488546897276e-05, "loss": 0.0209, "step": 38550 }, { "grad_norm": 0.3222462832927704, "learning_rate": 5.708802951080172e-05, "loss": 0.0222, "step": 38560 }, { "grad_norm": 0.35612180829048157, "learning_rate": 5.706756926355576e-05, "loss": 0.0344, "step": 38570 }, { "grad_norm": 0.3529009521007538, "learning_rate": 5.704710780865554e-05, "loss": 0.0235, "step": 38580 }, { "grad_norm": 0.48259028792381287, "learning_rate": 5.7026645149597325e-05, "loss": 0.0295, "step": 38590 }, { "grad_norm": 0.31342825293540955, "learning_rate": 5.700618128987764e-05, "loss": 0.026, "step": 38600 }, { "grad_norm": 0.21055535972118378, "learning_rate": 5.698571623299317e-05, "loss": 0.023, "step": 38610 }, { "grad_norm": 0.35260364413261414, "learning_rate": 5.696524998244086e-05, "loss": 0.0232, "step": 38620 }, { "grad_norm": 0.331932008266449, "learning_rate": 5.6944782541717836e-05, "loss": 0.032, "step": 38630 }, { "grad_norm": 0.24550296366214752, "learning_rate": 5.69243139143214e-05, "loss": 0.0281, "step": 38640 }, { "grad_norm": 0.32363390922546387, "learning_rate": 5.6903844103749125e-05, "loss": 0.0285, "step": 38650 }, { "grad_norm": 0.2078377902507782, "learning_rate": 5.688337311349869e-05, "loss": 0.0215, "step": 38660 }, { "grad_norm": 0.2831094264984131, "learning_rate": 5.6862900947068074e-05, "loss": 0.0256, "step": 38670 }, { "grad_norm": 0.32080262899398804, "learning_rate": 5.6842427607955374e-05, "loss": 0.0308, "step": 38680 }, { "grad_norm": 0.3876562714576721, "learning_rate": 5.682195309965893e-05, "loss": 0.0238, "step": 38690 }, { "grad_norm": 0.3240658938884735, "learning_rate": 5.6801477425677294e-05, "loss": 0.0345, "step": 38700 }, { "grad_norm": 0.43183043599128723, "learning_rate": 5.678100058950917e-05, "loss": 0.032, "step": 38710 }, { "grad_norm": 0.38412463665008545, "learning_rate": 5.676052259465352e-05, "loss": 0.0316, "step": 38720 }, { "grad_norm": 0.4250090718269348, "learning_rate": 5.674004344460945e-05, "loss": 0.0249, "step": 38730 }, { "grad_norm": 0.4301498234272003, "learning_rate": 5.6719563142876295e-05, "loss": 0.0287, "step": 38740 }, { "grad_norm": 0.3163048326969147, "learning_rate": 5.669908169295359e-05, "loss": 0.023, "step": 38750 }, { "grad_norm": 0.34854045510292053, "learning_rate": 5.667859909834105e-05, "loss": 0.0271, "step": 38760 }, { "grad_norm": 0.28415676951408386, "learning_rate": 5.6658115362538565e-05, "loss": 0.0272, "step": 38770 }, { "grad_norm": 0.30436286330223083, "learning_rate": 5.663763048904628e-05, "loss": 0.0262, "step": 38780 }, { "grad_norm": 0.45192423462867737, "learning_rate": 5.661714448136447e-05, "loss": 0.0213, "step": 38790 }, { "grad_norm": 0.404619425535202, "learning_rate": 5.659665734299366e-05, "loss": 0.0199, "step": 38800 }, { "grad_norm": 0.2952730655670166, "learning_rate": 5.6576169077434516e-05, "loss": 0.0325, "step": 38810 }, { "grad_norm": 0.307073712348938, "learning_rate": 5.6555679688187944e-05, "loss": 0.024, "step": 38820 }, { "grad_norm": 0.36495697498321533, "learning_rate": 5.6535189178755e-05, "loss": 0.0223, "step": 38830 }, { "grad_norm": 0.3561285138130188, "learning_rate": 5.651469755263698e-05, "loss": 0.0243, "step": 38840 }, { "grad_norm": 0.18201002478599548, "learning_rate": 5.6494204813335316e-05, "loss": 0.0387, "step": 38850 }, { "grad_norm": 0.17252270877361298, "learning_rate": 5.647371096435168e-05, "loss": 0.0245, "step": 38860 }, { "grad_norm": 0.2707853317260742, "learning_rate": 5.645321600918788e-05, "loss": 0.0228, "step": 38870 }, { "grad_norm": 0.37433695793151855, "learning_rate": 5.643271995134597e-05, "loss": 0.0288, "step": 38880 }, { "grad_norm": 0.2767641246318817, "learning_rate": 5.641222279432814e-05, "loss": 0.0281, "step": 38890 }, { "grad_norm": 0.6128922700881958, "learning_rate": 5.6391724541636834e-05, "loss": 0.0242, "step": 38900 }, { "grad_norm": 0.47372302412986755, "learning_rate": 5.6371225196774605e-05, "loss": 0.0299, "step": 38910 }, { "grad_norm": 0.35953935980796814, "learning_rate": 5.635072476324423e-05, "loss": 0.0319, "step": 38920 }, { "grad_norm": 0.4270249009132385, "learning_rate": 5.63302232445487e-05, "loss": 0.0236, "step": 38930 }, { "grad_norm": 0.20910906791687012, "learning_rate": 5.6309720644191144e-05, "loss": 0.0324, "step": 38940 }, { "grad_norm": 0.41955938935279846, "learning_rate": 5.628921696567491e-05, "loss": 0.0242, "step": 38950 }, { "grad_norm": 0.48231202363967896, "learning_rate": 5.62687122125035e-05, "loss": 0.0285, "step": 38960 }, { "grad_norm": 0.28849560022354126, "learning_rate": 5.624820638818062e-05, "loss": 0.024, "step": 38970 }, { "grad_norm": 0.4555183947086334, "learning_rate": 5.6227699496210164e-05, "loss": 0.0398, "step": 38980 }, { "grad_norm": 0.21429570019245148, "learning_rate": 5.6207191540096195e-05, "loss": 0.0247, "step": 38990 }, { "grad_norm": 0.2774544358253479, "learning_rate": 5.618668252334296e-05, "loss": 0.0195, "step": 39000 }, { "grad_norm": 0.2934142053127289, "learning_rate": 5.616617244945488e-05, "loss": 0.0224, "step": 39010 }, { "grad_norm": 0.35490360856056213, "learning_rate": 5.614566132193656e-05, "loss": 0.0232, "step": 39020 }, { "grad_norm": 0.27421408891677856, "learning_rate": 5.612514914429282e-05, "loss": 0.0229, "step": 39030 }, { "grad_norm": 0.2678780257701874, "learning_rate": 5.610463592002863e-05, "loss": 0.0233, "step": 39040 }, { "grad_norm": 0.3583839237689972, "learning_rate": 5.608412165264909e-05, "loss": 0.0255, "step": 39050 }, { "grad_norm": 0.44065478444099426, "learning_rate": 5.606360634565959e-05, "loss": 0.0229, "step": 39060 }, { "grad_norm": 0.20832931995391846, "learning_rate": 5.604309000256559e-05, "loss": 0.0245, "step": 39070 }, { "grad_norm": 0.27751031517982483, "learning_rate": 5.6022572626872785e-05, "loss": 0.021, "step": 39080 }, { "grad_norm": 0.30843842029571533, "learning_rate": 5.600205422208704e-05, "loss": 0.0296, "step": 39090 }, { "grad_norm": 0.23911675810813904, "learning_rate": 5.59815347917144e-05, "loss": 0.0255, "step": 39100 }, { "grad_norm": 0.2673017382621765, "learning_rate": 5.596101433926103e-05, "loss": 0.0297, "step": 39110 }, { "grad_norm": 0.34042027592658997, "learning_rate": 5.5940492868233364e-05, "loss": 0.0224, "step": 39120 }, { "grad_norm": 0.39713266491889954, "learning_rate": 5.591997038213793e-05, "loss": 0.0321, "step": 39130 }, { "grad_norm": 0.2568087577819824, "learning_rate": 5.5899446884481475e-05, "loss": 0.0213, "step": 39140 }, { "grad_norm": 0.27950358390808105, "learning_rate": 5.5878922378770906e-05, "loss": 0.034, "step": 39150 }, { "grad_norm": 0.24934382736682892, "learning_rate": 5.5858396868513285e-05, "loss": 0.0247, "step": 39160 }, { "grad_norm": 0.2717784345149994, "learning_rate": 5.583787035721586e-05, "loss": 0.0264, "step": 39170 }, { "grad_norm": 0.31638017296791077, "learning_rate": 5.581734284838606e-05, "loss": 0.0242, "step": 39180 }, { "grad_norm": 0.30145251750946045, "learning_rate": 5.579681434553147e-05, "loss": 0.0247, "step": 39190 }, { "grad_norm": 0.37824177742004395, "learning_rate": 5.5776284852159854e-05, "loss": 0.0229, "step": 39200 }, { "grad_norm": 0.3012697994709015, "learning_rate": 5.575575437177913e-05, "loss": 0.0341, "step": 39210 }, { "grad_norm": 0.3636459708213806, "learning_rate": 5.573522290789742e-05, "loss": 0.0224, "step": 39220 }, { "grad_norm": 0.26792973279953003, "learning_rate": 5.571469046402298e-05, "loss": 0.0271, "step": 39230 }, { "grad_norm": 0.31917980313301086, "learning_rate": 5.5694157043664205e-05, "loss": 0.0317, "step": 39240 }, { "grad_norm": 0.3369191884994507, "learning_rate": 5.567362265032975e-05, "loss": 0.0218, "step": 39250 }, { "grad_norm": 0.3291851580142975, "learning_rate": 5.565308728752836e-05, "loss": 0.0267, "step": 39260 }, { "grad_norm": 0.3047884702682495, "learning_rate": 5.5632550958768956e-05, "loss": 0.0204, "step": 39270 }, { "grad_norm": 0.3584924638271332, "learning_rate": 5.5612013667560636e-05, "loss": 0.0253, "step": 39280 }, { "grad_norm": 0.3075823485851288, "learning_rate": 5.5591475417412676e-05, "loss": 0.0401, "step": 39290 }, { "grad_norm": 0.32869216799736023, "learning_rate": 5.557093621183451e-05, "loss": 0.0289, "step": 39300 }, { "grad_norm": 0.4965987801551819, "learning_rate": 5.55503960543357e-05, "loss": 0.0223, "step": 39310 }, { "grad_norm": 0.37289148569107056, "learning_rate": 5.552985494842601e-05, "loss": 0.024, "step": 39320 }, { "grad_norm": 0.16263540089130402, "learning_rate": 5.550931289761534e-05, "loss": 0.0209, "step": 39330 }, { "grad_norm": 0.26826152205467224, "learning_rate": 5.548876990541378e-05, "loss": 0.0256, "step": 39340 }, { "grad_norm": 0.4861297905445099, "learning_rate": 5.5468225975331564e-05, "loss": 0.0339, "step": 39350 }, { "grad_norm": 0.32048553228378296, "learning_rate": 5.544768111087909e-05, "loss": 0.0272, "step": 39360 }, { "grad_norm": 0.33354467153549194, "learning_rate": 5.54271353155669e-05, "loss": 0.028, "step": 39370 }, { "grad_norm": 0.7976072430610657, "learning_rate": 5.5406588592905715e-05, "loss": 0.0311, "step": 39380 }, { "grad_norm": 0.28816723823547363, "learning_rate": 5.5386040946406416e-05, "loss": 0.0319, "step": 39390 }, { "grad_norm": 0.629117488861084, "learning_rate": 5.536549237958004e-05, "loss": 0.035, "step": 39400 }, { "grad_norm": 0.3906579613685608, "learning_rate": 5.5344942895937744e-05, "loss": 0.0394, "step": 39410 }, { "grad_norm": 0.269514262676239, "learning_rate": 5.5324392498990904e-05, "loss": 0.0297, "step": 39420 }, { "grad_norm": 0.3886796236038208, "learning_rate": 5.5303841192251e-05, "loss": 0.0291, "step": 39430 }, { "grad_norm": 0.37107908725738525, "learning_rate": 5.52832889792297e-05, "loss": 0.0301, "step": 39440 }, { "grad_norm": 0.3491849899291992, "learning_rate": 5.526273586343881e-05, "loss": 0.0214, "step": 39450 }, { "grad_norm": 0.22949689626693726, "learning_rate": 5.5242181848390306e-05, "loss": 0.021, "step": 39460 }, { "grad_norm": 0.3300704061985016, "learning_rate": 5.5221626937596285e-05, "loss": 0.0221, "step": 39470 }, { "grad_norm": 0.2779483497142792, "learning_rate": 5.520107113456903e-05, "loss": 0.0349, "step": 39480 }, { "grad_norm": 0.5424917936325073, "learning_rate": 5.5180514442820974e-05, "loss": 0.0347, "step": 39490 }, { "grad_norm": 0.2299557626247406, "learning_rate": 5.515995686586469e-05, "loss": 0.0232, "step": 39500 }, { "grad_norm": 0.31462958455085754, "learning_rate": 5.5139398407212916e-05, "loss": 0.0259, "step": 39510 }, { "grad_norm": 0.378361314535141, "learning_rate": 5.511883907037849e-05, "loss": 0.0276, "step": 39520 }, { "grad_norm": 0.4574878513813019, "learning_rate": 5.509827885887449e-05, "loss": 0.0268, "step": 39530 }, { "grad_norm": 0.3020332455635071, "learning_rate": 5.507771777621406e-05, "loss": 0.0283, "step": 39540 }, { "grad_norm": 0.28653815388679504, "learning_rate": 5.505715582591052e-05, "loss": 0.0386, "step": 39550 }, { "grad_norm": 0.36801859736442566, "learning_rate": 5.50365930114774e-05, "loss": 0.0316, "step": 39560 }, { "grad_norm": 0.25381970405578613, "learning_rate": 5.5016029336428255e-05, "loss": 0.0232, "step": 39570 }, { "grad_norm": 0.6184704303741455, "learning_rate": 5.49954648042769e-05, "loss": 0.0278, "step": 39580 }, { "grad_norm": 0.2678416669368744, "learning_rate": 5.4974899418537226e-05, "loss": 0.0319, "step": 39590 }, { "grad_norm": 0.20917938649654388, "learning_rate": 5.4954333182723316e-05, "loss": 0.0216, "step": 39600 }, { "grad_norm": 0.39898422360420227, "learning_rate": 5.493376610034937e-05, "loss": 0.0233, "step": 39610 }, { "grad_norm": 0.2639634609222412, "learning_rate": 5.4913198174929735e-05, "loss": 0.0236, "step": 39620 }, { "grad_norm": 0.1613355576992035, "learning_rate": 5.489262940997889e-05, "loss": 0.0266, "step": 39630 }, { "grad_norm": 0.4242955148220062, "learning_rate": 5.487205980901151e-05, "loss": 0.031, "step": 39640 }, { "grad_norm": 0.35910332202911377, "learning_rate": 5.485148937554234e-05, "loss": 0.0463, "step": 39650 }, { "grad_norm": 0.2520313858985901, "learning_rate": 5.483091811308635e-05, "loss": 0.0265, "step": 39660 }, { "grad_norm": 0.3687128722667694, "learning_rate": 5.4810346025158564e-05, "loss": 0.032, "step": 39670 }, { "grad_norm": 0.29917198419570923, "learning_rate": 5.478977311527421e-05, "loss": 0.0287, "step": 39680 }, { "grad_norm": 0.2860550284385681, "learning_rate": 5.476919938694863e-05, "loss": 0.0272, "step": 39690 }, { "grad_norm": 0.3543678820133209, "learning_rate": 5.474862484369733e-05, "loss": 0.02, "step": 39700 }, { "grad_norm": 0.28605207800865173, "learning_rate": 5.472804948903589e-05, "loss": 0.0266, "step": 39710 }, { "grad_norm": 0.5792427659034729, "learning_rate": 5.470747332648013e-05, "loss": 0.0207, "step": 39720 }, { "grad_norm": 0.48855090141296387, "learning_rate": 5.468689635954591e-05, "loss": 0.0391, "step": 39730 }, { "grad_norm": 0.19447828829288483, "learning_rate": 5.46663185917493e-05, "loss": 0.024, "step": 39740 }, { "grad_norm": 0.424593985080719, "learning_rate": 5.464574002660645e-05, "loss": 0.0278, "step": 39750 }, { "grad_norm": 0.28703927993774414, "learning_rate": 5.4625160667633724e-05, "loss": 0.0232, "step": 39760 }, { "grad_norm": 0.2759090065956116, "learning_rate": 5.4604580518347505e-05, "loss": 0.0309, "step": 39770 }, { "grad_norm": 0.30199068784713745, "learning_rate": 5.458399958226442e-05, "loss": 0.0297, "step": 39780 }, { "grad_norm": 0.5409104824066162, "learning_rate": 5.456341786290119e-05, "loss": 0.0259, "step": 39790 }, { "grad_norm": 0.4606603980064392, "learning_rate": 5.454283536377465e-05, "loss": 0.0288, "step": 39800 }, { "grad_norm": 0.34239134192466736, "learning_rate": 5.452225208840179e-05, "loss": 0.0245, "step": 39810 }, { "grad_norm": 0.42981526255607605, "learning_rate": 5.450166804029975e-05, "loss": 0.0314, "step": 39820 }, { "grad_norm": 0.3173738121986389, "learning_rate": 5.448108322298574e-05, "loss": 0.0194, "step": 39830 }, { "grad_norm": 0.40687933564186096, "learning_rate": 5.446049763997717e-05, "loss": 0.0247, "step": 39840 }, { "grad_norm": 0.23927247524261475, "learning_rate": 5.4439911294791546e-05, "loss": 0.0205, "step": 39850 }, { "grad_norm": 0.28967568278312683, "learning_rate": 5.441932419094652e-05, "loss": 0.0203, "step": 39860 }, { "grad_norm": 0.32976263761520386, "learning_rate": 5.439873633195985e-05, "loss": 0.0276, "step": 39870 }, { "grad_norm": 0.30030685663223267, "learning_rate": 5.437814772134947e-05, "loss": 0.0297, "step": 39880 }, { "grad_norm": 0.30558279156684875, "learning_rate": 5.4357558362633366e-05, "loss": 0.0231, "step": 39890 }, { "grad_norm": 0.25781288743019104, "learning_rate": 5.4336968259329726e-05, "loss": 0.0189, "step": 39900 }, { "grad_norm": 0.3034931421279907, "learning_rate": 5.431637741495681e-05, "loss": 0.0234, "step": 39910 }, { "grad_norm": 0.3586087226867676, "learning_rate": 5.429578583303307e-05, "loss": 0.0289, "step": 39920 }, { "grad_norm": 0.39797329902648926, "learning_rate": 5.427519351707701e-05, "loss": 0.0306, "step": 39930 }, { "grad_norm": 0.32489803433418274, "learning_rate": 5.42546004706073e-05, "loss": 0.0263, "step": 39940 }, { "grad_norm": 0.3786819875240326, "learning_rate": 5.4234006697142735e-05, "loss": 0.0233, "step": 39950 }, { "grad_norm": 0.32725203037261963, "learning_rate": 5.421341220020224e-05, "loss": 0.0275, "step": 39960 }, { "grad_norm": 0.23452982306480408, "learning_rate": 5.419281698330482e-05, "loss": 0.0277, "step": 39970 }, { "grad_norm": 0.27342134714126587, "learning_rate": 5.4172221049969665e-05, "loss": 0.0261, "step": 39980 }, { "grad_norm": 0.19449402391910553, "learning_rate": 5.415162440371604e-05, "loss": 0.0221, "step": 39990 }, { "grad_norm": 0.310523122549057, "learning_rate": 5.413102704806334e-05, "loss": 0.0244, "step": 40000 }, { "grad_norm": 0.48924899101257324, "learning_rate": 5.41104289865311e-05, "loss": 0.0259, "step": 40010 }, { "grad_norm": 0.3209068775177002, "learning_rate": 5.408983022263898e-05, "loss": 0.0248, "step": 40020 }, { "grad_norm": 0.6494277715682983, "learning_rate": 5.406923075990673e-05, "loss": 0.0349, "step": 40030 }, { "grad_norm": 1.3773406744003296, "learning_rate": 5.404863060185423e-05, "loss": 0.0269, "step": 40040 }, { "grad_norm": 0.3650689125061035, "learning_rate": 5.40280297520015e-05, "loss": 0.0305, "step": 40050 }, { "grad_norm": 0.3414086401462555, "learning_rate": 5.400742821386865e-05, "loss": 0.025, "step": 40060 }, { "grad_norm": 0.29671934247016907, "learning_rate": 5.398682599097592e-05, "loss": 0.0241, "step": 40070 }, { "grad_norm": 0.21858668327331543, "learning_rate": 5.396622308684367e-05, "loss": 0.018, "step": 40080 }, { "grad_norm": 0.4737870693206787, "learning_rate": 5.394561950499236e-05, "loss": 0.0273, "step": 40090 }, { "grad_norm": 0.25017061829566956, "learning_rate": 5.39250152489426e-05, "loss": 0.0254, "step": 40100 }, { "grad_norm": 0.31921079754829407, "learning_rate": 5.390441032221507e-05, "loss": 0.0245, "step": 40110 }, { "grad_norm": 0.32766440510749817, "learning_rate": 5.388380472833062e-05, "loss": 0.0257, "step": 40120 }, { "grad_norm": 0.22860310971736908, "learning_rate": 5.386319847081016e-05, "loss": 0.0292, "step": 40130 }, { "grad_norm": 0.41620418429374695, "learning_rate": 5.384259155317473e-05, "loss": 0.0264, "step": 40140 }, { "grad_norm": 0.5103992223739624, "learning_rate": 5.38219839789455e-05, "loss": 0.0277, "step": 40150 }, { "grad_norm": 0.2230062633752823, "learning_rate": 5.380137575164376e-05, "loss": 0.0341, "step": 40160 }, { "grad_norm": 0.2150057852268219, "learning_rate": 5.378076687479085e-05, "loss": 0.0181, "step": 40170 }, { "grad_norm": 0.22537332773208618, "learning_rate": 5.3760157351908305e-05, "loss": 0.0285, "step": 40180 }, { "grad_norm": 0.1888190656900406, "learning_rate": 5.373954718651768e-05, "loss": 0.0235, "step": 40190 }, { "grad_norm": 0.2383362501859665, "learning_rate": 5.371893638214074e-05, "loss": 0.022, "step": 40200 }, { "grad_norm": 0.34196868538856506, "learning_rate": 5.369832494229927e-05, "loss": 0.0238, "step": 40210 }, { "grad_norm": 0.3673079013824463, "learning_rate": 5.367771287051524e-05, "loss": 0.0289, "step": 40220 }, { "grad_norm": 0.2596879005432129, "learning_rate": 5.365710017031065e-05, "loss": 0.0272, "step": 40230 }, { "grad_norm": 0.2516954243183136, "learning_rate": 5.3636486845207654e-05, "loss": 0.0232, "step": 40240 }, { "grad_norm": 0.2828417718410492, "learning_rate": 5.3615872898728526e-05, "loss": 0.0195, "step": 40250 }, { "grad_norm": 0.45088881254196167, "learning_rate": 5.3595258334395614e-05, "loss": 0.0305, "step": 40260 }, { "grad_norm": 0.30935364961624146, "learning_rate": 5.3574643155731396e-05, "loss": 0.0285, "step": 40270 }, { "grad_norm": 0.3878979980945587, "learning_rate": 5.3554027366258405e-05, "loss": 0.0254, "step": 40280 }, { "grad_norm": 0.32826581597328186, "learning_rate": 5.3533410969499355e-05, "loss": 0.0252, "step": 40290 }, { "grad_norm": 0.31853893399238586, "learning_rate": 5.3512793968977e-05, "loss": 0.0227, "step": 40300 }, { "grad_norm": 0.4491213262081146, "learning_rate": 5.349217636821423e-05, "loss": 0.0252, "step": 40310 }, { "grad_norm": 0.27011895179748535, "learning_rate": 5.347155817073405e-05, "loss": 0.0264, "step": 40320 }, { "grad_norm": 0.29823195934295654, "learning_rate": 5.34509393800595e-05, "loss": 0.03, "step": 40330 }, { "grad_norm": 0.31370675563812256, "learning_rate": 5.343031999971381e-05, "loss": 0.0283, "step": 40340 }, { "grad_norm": 0.8456327319145203, "learning_rate": 5.340970003322026e-05, "loss": 0.0287, "step": 40350 }, { "grad_norm": 0.4399412274360657, "learning_rate": 5.33890794841022e-05, "loss": 0.0249, "step": 40360 }, { "grad_norm": 0.2775859236717224, "learning_rate": 5.336845835588318e-05, "loss": 0.0247, "step": 40370 }, { "grad_norm": 0.2754952013492584, "learning_rate": 5.334783665208674e-05, "loss": 0.021, "step": 40380 }, { "grad_norm": 0.33436334133148193, "learning_rate": 5.332721437623657e-05, "loss": 0.0204, "step": 40390 }, { "grad_norm": 0.555381178855896, "learning_rate": 5.3306591531856464e-05, "loss": 0.029, "step": 40400 }, { "grad_norm": 0.25555065274238586, "learning_rate": 5.3285968122470295e-05, "loss": 0.0237, "step": 40410 }, { "grad_norm": 0.33965539932250977, "learning_rate": 5.326534415160207e-05, "loss": 0.0319, "step": 40420 }, { "grad_norm": 0.3242732882499695, "learning_rate": 5.324471962277582e-05, "loss": 0.0352, "step": 40430 }, { "grad_norm": 0.2873469889163971, "learning_rate": 5.3224094539515746e-05, "loss": 0.0218, "step": 40440 }, { "grad_norm": 0.29707232117652893, "learning_rate": 5.3203468905346076e-05, "loss": 0.0256, "step": 40450 }, { "grad_norm": 0.47249487042427063, "learning_rate": 5.3182842723791195e-05, "loss": 0.0318, "step": 40460 }, { "grad_norm": 0.22213448584079742, "learning_rate": 5.316221599837554e-05, "loss": 0.0219, "step": 40470 }, { "grad_norm": 0.3374476134777069, "learning_rate": 5.314158873262366e-05, "loss": 0.0307, "step": 40480 }, { "grad_norm": 0.25418156385421753, "learning_rate": 5.312096093006018e-05, "loss": 0.0188, "step": 40490 }, { "grad_norm": 0.5687118172645569, "learning_rate": 5.3100332594209846e-05, "loss": 0.0272, "step": 40500 }, { "grad_norm": 0.35538962483406067, "learning_rate": 5.3079703728597454e-05, "loss": 0.0294, "step": 40510 }, { "grad_norm": 0.23792147636413574, "learning_rate": 5.305907433674794e-05, "loss": 0.0264, "step": 40520 }, { "grad_norm": 0.33205467462539673, "learning_rate": 5.3038444422186284e-05, "loss": 0.0267, "step": 40530 }, { "grad_norm": 0.409940242767334, "learning_rate": 5.301781398843757e-05, "loss": 0.0274, "step": 40540 }, { "grad_norm": 0.2607572078704834, "learning_rate": 5.299718303902699e-05, "loss": 0.0245, "step": 40550 }, { "grad_norm": 0.27934500575065613, "learning_rate": 5.297655157747979e-05, "loss": 0.0236, "step": 40560 }, { "grad_norm": 0.4119957387447357, "learning_rate": 5.295591960732136e-05, "loss": 0.0237, "step": 40570 }, { "grad_norm": 0.3719383776187897, "learning_rate": 5.293528713207708e-05, "loss": 0.0276, "step": 40580 }, { "grad_norm": 0.18651098012924194, "learning_rate": 5.291465415527253e-05, "loss": 0.0236, "step": 40590 }, { "grad_norm": 0.23414383828639984, "learning_rate": 5.2894020680433296e-05, "loss": 0.0225, "step": 40600 }, { "grad_norm": 0.2671038508415222, "learning_rate": 5.287338671108507e-05, "loss": 0.0279, "step": 40610 }, { "grad_norm": 0.3848218023777008, "learning_rate": 5.285275225075367e-05, "loss": 0.0249, "step": 40620 }, { "grad_norm": 0.45514678955078125, "learning_rate": 5.283211730296492e-05, "loss": 0.0217, "step": 40630 }, { "grad_norm": 0.2851453423500061, "learning_rate": 5.281148187124477e-05, "loss": 0.0254, "step": 40640 }, { "grad_norm": 0.18834765255451202, "learning_rate": 5.279084595911927e-05, "loss": 0.0257, "step": 40650 }, { "grad_norm": 0.6125437021255493, "learning_rate": 5.277020957011453e-05, "loss": 0.0358, "step": 40660 }, { "grad_norm": 0.4350906312465668, "learning_rate": 5.274957270775673e-05, "loss": 0.0341, "step": 40670 }, { "grad_norm": 0.2526761293411255, "learning_rate": 5.2728935375572164e-05, "loss": 0.0231, "step": 40680 }, { "grad_norm": 0.22757887840270996, "learning_rate": 5.2708297577087176e-05, "loss": 0.022, "step": 40690 }, { "grad_norm": 0.37568822503089905, "learning_rate": 5.2687659315828196e-05, "loss": 0.0229, "step": 40700 }, { "grad_norm": 0.2823302149772644, "learning_rate": 5.266702059532175e-05, "loss": 0.0236, "step": 40710 }, { "grad_norm": 0.3739866316318512, "learning_rate": 5.264638141909444e-05, "loss": 0.0302, "step": 40720 }, { "grad_norm": 0.6417311429977417, "learning_rate": 5.2625741790672925e-05, "loss": 0.0252, "step": 40730 }, { "grad_norm": 0.25197362899780273, "learning_rate": 5.2605101713583935e-05, "loss": 0.0206, "step": 40740 }, { "grad_norm": 0.3927062153816223, "learning_rate": 5.2584461191354315e-05, "loss": 0.0304, "step": 40750 }, { "grad_norm": 0.2726221978664398, "learning_rate": 5.2563820227510964e-05, "loss": 0.0229, "step": 40760 }, { "grad_norm": 0.24223878979682922, "learning_rate": 5.2543178825580864e-05, "loss": 0.0253, "step": 40770 }, { "grad_norm": 0.3894861340522766, "learning_rate": 5.2522536989091054e-05, "loss": 0.0264, "step": 40780 }, { "grad_norm": 0.3308049738407135, "learning_rate": 5.2501894721568655e-05, "loss": 0.0223, "step": 40790 }, { "grad_norm": 0.3379824459552765, "learning_rate": 5.248125202654089e-05, "loss": 0.0254, "step": 40800 }, { "grad_norm": 0.4943683445453644, "learning_rate": 5.246060890753501e-05, "loss": 0.0329, "step": 40810 }, { "grad_norm": 0.19040505588054657, "learning_rate": 5.243996536807837e-05, "loss": 0.026, "step": 40820 }, { "grad_norm": 0.3078456223011017, "learning_rate": 5.2419321411698384e-05, "loss": 0.0194, "step": 40830 }, { "grad_norm": 0.2907037138938904, "learning_rate": 5.239867704192253e-05, "loss": 0.0272, "step": 40840 }, { "grad_norm": 0.18507781624794006, "learning_rate": 5.237803226227838e-05, "loss": 0.02, "step": 40850 }, { "grad_norm": 0.26395806670188904, "learning_rate": 5.235738707629354e-05, "loss": 0.0267, "step": 40860 }, { "grad_norm": 0.28781694173812866, "learning_rate": 5.233674148749575e-05, "loss": 0.026, "step": 40870 }, { "grad_norm": 0.6352828741073608, "learning_rate": 5.231609549941272e-05, "loss": 0.0311, "step": 40880 }, { "grad_norm": 0.3713562786579132, "learning_rate": 5.229544911557233e-05, "loss": 0.0287, "step": 40890 }, { "grad_norm": 0.20182783901691437, "learning_rate": 5.227480233950246e-05, "loss": 0.0265, "step": 40900 }, { "grad_norm": 0.34075728058815, "learning_rate": 5.22541551747311e-05, "loss": 0.027, "step": 40910 }, { "grad_norm": 1.276253581047058, "learning_rate": 5.223350762478626e-05, "loss": 0.0237, "step": 40920 }, { "grad_norm": 0.22761783003807068, "learning_rate": 5.221285969319606e-05, "loss": 0.023, "step": 40930 }, { "grad_norm": 0.24741335213184357, "learning_rate": 5.219221138348865e-05, "loss": 0.0282, "step": 40940 }, { "grad_norm": 0.290703684091568, "learning_rate": 5.217156269919228e-05, "loss": 0.0281, "step": 40950 }, { "grad_norm": 0.44768965244293213, "learning_rate": 5.215091364383523e-05, "loss": 0.023, "step": 40960 }, { "grad_norm": 0.4211200177669525, "learning_rate": 5.213026422094588e-05, "loss": 0.0229, "step": 40970 }, { "grad_norm": 0.23269349336624146, "learning_rate": 5.210961443405262e-05, "loss": 0.0229, "step": 40980 }, { "grad_norm": 0.2821583151817322, "learning_rate": 5.208896428668396e-05, "loss": 0.0191, "step": 40990 }, { "grad_norm": 0.3002674877643585, "learning_rate": 5.206831378236845e-05, "loss": 0.0266, "step": 41000 }, { "grad_norm": 0.35183286666870117, "learning_rate": 5.2047662924634666e-05, "loss": 0.0208, "step": 41010 }, { "grad_norm": 0.33861273527145386, "learning_rate": 5.20270117170113e-05, "loss": 0.0207, "step": 41020 }, { "grad_norm": 0.34742629528045654, "learning_rate": 5.200636016302707e-05, "loss": 0.0316, "step": 41030 }, { "grad_norm": 0.3260868191719055, "learning_rate": 5.1985708266210754e-05, "loss": 0.031, "step": 41040 }, { "grad_norm": 0.2834918797016144, "learning_rate": 5.1965056030091206e-05, "loss": 0.0277, "step": 41050 }, { "grad_norm": 0.3019079566001892, "learning_rate": 5.194440345819732e-05, "loss": 0.0244, "step": 41060 }, { "grad_norm": 0.3250303566455841, "learning_rate": 5.1923750554058084e-05, "loss": 0.0271, "step": 41070 }, { "grad_norm": 0.2018468677997589, "learning_rate": 5.1903097321202476e-05, "loss": 0.022, "step": 41080 }, { "grad_norm": 0.3848929703235626, "learning_rate": 5.18824437631596e-05, "loss": 0.0398, "step": 41090 }, { "grad_norm": 0.2216048389673233, "learning_rate": 5.186178988345856e-05, "loss": 0.0278, "step": 41100 }, { "grad_norm": 0.4551433026790619, "learning_rate": 5.184113568562855e-05, "loss": 0.0257, "step": 41110 }, { "grad_norm": 0.2835536301136017, "learning_rate": 5.18204811731988e-05, "loss": 0.029, "step": 41120 }, { "grad_norm": 0.3598698377609253, "learning_rate": 5.179982634969862e-05, "loss": 0.0224, "step": 41130 }, { "grad_norm": 0.4651077687740326, "learning_rate": 5.177917121865734e-05, "loss": 0.0331, "step": 41140 }, { "grad_norm": 0.4619622230529785, "learning_rate": 5.1758515783604346e-05, "loss": 0.0235, "step": 41150 }, { "grad_norm": 0.5031925439834595, "learning_rate": 5.173786004806911e-05, "loss": 0.033, "step": 41160 }, { "grad_norm": 0.6232220530509949, "learning_rate": 5.1717204015581135e-05, "loss": 0.0284, "step": 41170 }, { "grad_norm": 0.2349044531583786, "learning_rate": 5.169654768966994e-05, "loss": 0.0262, "step": 41180 }, { "grad_norm": 0.4106866717338562, "learning_rate": 5.1675891073865156e-05, "loss": 0.0266, "step": 41190 }, { "grad_norm": 0.2600201368331909, "learning_rate": 5.1655234171696424e-05, "loss": 0.0211, "step": 41200 }, { "grad_norm": 0.24791346490383148, "learning_rate": 5.163457698669343e-05, "loss": 0.0196, "step": 41210 }, { "grad_norm": 0.30232709646224976, "learning_rate": 5.1613919522385936e-05, "loss": 0.021, "step": 41220 }, { "grad_norm": 0.517259418964386, "learning_rate": 5.1593261782303746e-05, "loss": 0.0305, "step": 41230 }, { "grad_norm": 0.4003540277481079, "learning_rate": 5.157260376997669e-05, "loss": 0.0245, "step": 41240 }, { "grad_norm": 0.35502785444259644, "learning_rate": 5.155194548893466e-05, "loss": 0.0266, "step": 41250 }, { "grad_norm": 0.2783435881137848, "learning_rate": 5.153128694270759e-05, "loss": 0.0339, "step": 41260 }, { "grad_norm": 0.32346582412719727, "learning_rate": 5.151062813482548e-05, "loss": 0.035, "step": 41270 }, { "grad_norm": 0.3449438810348511, "learning_rate": 5.148996906881832e-05, "loss": 0.0212, "step": 41280 }, { "grad_norm": 0.380586177110672, "learning_rate": 5.1469309748216196e-05, "loss": 0.0247, "step": 41290 }, { "grad_norm": 0.2872578203678131, "learning_rate": 5.144865017654923e-05, "loss": 0.0187, "step": 41300 }, { "grad_norm": 0.31888023018836975, "learning_rate": 5.1427990357347556e-05, "loss": 0.0249, "step": 41310 }, { "grad_norm": 0.18896174430847168, "learning_rate": 5.140733029414139e-05, "loss": 0.0249, "step": 41320 }, { "grad_norm": 0.30910223722457886, "learning_rate": 5.138666999046098e-05, "loss": 0.0258, "step": 41330 }, { "grad_norm": 0.6432636976242065, "learning_rate": 5.136600944983658e-05, "loss": 0.0259, "step": 41340 }, { "grad_norm": 0.27641046047210693, "learning_rate": 5.134534867579853e-05, "loss": 0.0204, "step": 41350 }, { "grad_norm": 0.47773438692092896, "learning_rate": 5.1324687671877184e-05, "loss": 0.0222, "step": 41360 }, { "grad_norm": 0.35120663046836853, "learning_rate": 5.130402644160296e-05, "loss": 0.0293, "step": 41370 }, { "grad_norm": 0.23572134971618652, "learning_rate": 5.128336498850628e-05, "loss": 0.0187, "step": 41380 }, { "grad_norm": 0.3592251241207123, "learning_rate": 5.126270331611761e-05, "loss": 0.0222, "step": 41390 }, { "grad_norm": 0.38190409541130066, "learning_rate": 5.124204142796748e-05, "loss": 0.0219, "step": 41400 }, { "grad_norm": 0.40829506516456604, "learning_rate": 5.122137932758644e-05, "loss": 0.0222, "step": 41410 }, { "grad_norm": 0.3491629362106323, "learning_rate": 5.1200717018505074e-05, "loss": 0.0221, "step": 41420 }, { "grad_norm": 0.26555532217025757, "learning_rate": 5.118005450425403e-05, "loss": 0.0237, "step": 41430 }, { "grad_norm": 1.1667821407318115, "learning_rate": 5.115939178836392e-05, "loss": 0.0227, "step": 41440 }, { "grad_norm": 0.4157620668411255, "learning_rate": 5.113872887436547e-05, "loss": 0.0221, "step": 41450 }, { "grad_norm": 1.175241470336914, "learning_rate": 5.11180657657894e-05, "loss": 0.0264, "step": 41460 }, { "grad_norm": 0.35363873839378357, "learning_rate": 5.109740246616649e-05, "loss": 0.0223, "step": 41470 }, { "grad_norm": 0.4436236321926117, "learning_rate": 5.10767389790275e-05, "loss": 0.035, "step": 41480 }, { "grad_norm": 0.3856426179409027, "learning_rate": 5.1056075307903265e-05, "loss": 0.024, "step": 41490 }, { "grad_norm": 0.2612409293651581, "learning_rate": 5.1035411456324634e-05, "loss": 0.0262, "step": 41500 }, { "grad_norm": 0.29485851526260376, "learning_rate": 5.1014747427822504e-05, "loss": 0.0281, "step": 41510 }, { "grad_norm": 0.36586734652519226, "learning_rate": 5.0994083225927804e-05, "loss": 0.0353, "step": 41520 }, { "grad_norm": 0.23821957409381866, "learning_rate": 5.0973418854171475e-05, "loss": 0.0275, "step": 41530 }, { "grad_norm": 0.25382864475250244, "learning_rate": 5.095275431608447e-05, "loss": 0.0271, "step": 41540 }, { "grad_norm": 0.27072590589523315, "learning_rate": 5.093208961519782e-05, "loss": 0.0253, "step": 41550 }, { "grad_norm": 0.2655574679374695, "learning_rate": 5.091142475504255e-05, "loss": 0.0248, "step": 41560 }, { "grad_norm": 0.4507152736186981, "learning_rate": 5.089075973914971e-05, "loss": 0.0265, "step": 41570 }, { "grad_norm": 0.3901401162147522, "learning_rate": 5.08700945710504e-05, "loss": 0.0293, "step": 41580 }, { "grad_norm": 0.293711394071579, "learning_rate": 5.0849429254275714e-05, "loss": 0.0282, "step": 41590 }, { "grad_norm": 0.22900810837745667, "learning_rate": 5.0828763792356804e-05, "loss": 0.0197, "step": 41600 }, { "grad_norm": 0.4849144220352173, "learning_rate": 5.080809818882483e-05, "loss": 0.0317, "step": 41610 }, { "grad_norm": 0.31044459342956543, "learning_rate": 5.078743244721097e-05, "loss": 0.0256, "step": 41620 }, { "grad_norm": 0.30142080783843994, "learning_rate": 5.0766766571046455e-05, "loss": 0.0234, "step": 41630 }, { "grad_norm": 0.5218217372894287, "learning_rate": 5.07461005638625e-05, "loss": 0.0278, "step": 41640 }, { "grad_norm": 0.3248406648635864, "learning_rate": 5.072543442919037e-05, "loss": 0.0227, "step": 41650 }, { "grad_norm": 0.2520720958709717, "learning_rate": 5.070476817056132e-05, "loss": 0.0301, "step": 41660 }, { "grad_norm": 0.3026282787322998, "learning_rate": 5.068410179150668e-05, "loss": 0.0162, "step": 41670 }, { "grad_norm": 0.2960517704486847, "learning_rate": 5.066343529555775e-05, "loss": 0.0281, "step": 41680 }, { "grad_norm": 0.38390448689460754, "learning_rate": 5.064276868624587e-05, "loss": 0.022, "step": 41690 }, { "grad_norm": 0.5879650115966797, "learning_rate": 5.0622101967102396e-05, "loss": 0.018, "step": 41700 }, { "grad_norm": 0.26320454478263855, "learning_rate": 5.0601435141658714e-05, "loss": 0.0255, "step": 41710 }, { "grad_norm": 0.37715762853622437, "learning_rate": 5.058076821344622e-05, "loss": 0.0244, "step": 41720 }, { "grad_norm": 0.38033580780029297, "learning_rate": 5.056010118599632e-05, "loss": 0.0315, "step": 41730 }, { "grad_norm": 0.27935367822647095, "learning_rate": 5.053943406284044e-05, "loss": 0.0289, "step": 41740 }, { "grad_norm": 0.3950049579143524, "learning_rate": 5.051876684751006e-05, "loss": 0.0241, "step": 41750 }, { "grad_norm": 0.3094196021556854, "learning_rate": 5.0498099543536584e-05, "loss": 0.0236, "step": 41760 }, { "grad_norm": 0.48806387186050415, "learning_rate": 5.047743215445152e-05, "loss": 0.0251, "step": 41770 }, { "grad_norm": 0.3630558252334595, "learning_rate": 5.045676468378637e-05, "loss": 0.028, "step": 41780 }, { "grad_norm": 0.42751866579055786, "learning_rate": 5.0436097135072626e-05, "loss": 0.0306, "step": 41790 }, { "grad_norm": 0.280884325504303, "learning_rate": 5.041542951184181e-05, "loss": 0.0226, "step": 41800 }, { "grad_norm": 0.21131418645381927, "learning_rate": 5.039476181762545e-05, "loss": 0.0203, "step": 41810 }, { "grad_norm": 0.21608012914657593, "learning_rate": 5.037409405595508e-05, "loss": 0.0177, "step": 41820 }, { "grad_norm": 0.3166695535182953, "learning_rate": 5.035342623036229e-05, "loss": 0.0251, "step": 41830 }, { "grad_norm": 0.3216032385826111, "learning_rate": 5.033275834437862e-05, "loss": 0.0232, "step": 41840 }, { "grad_norm": 0.36173248291015625, "learning_rate": 5.031209040153564e-05, "loss": 0.0198, "step": 41850 }, { "grad_norm": 0.30426836013793945, "learning_rate": 5.0291422405364955e-05, "loss": 0.0347, "step": 41860 }, { "grad_norm": 0.30773958563804626, "learning_rate": 5.0270754359398133e-05, "loss": 0.0222, "step": 41870 }, { "grad_norm": 0.33674299716949463, "learning_rate": 5.025008626716682e-05, "loss": 0.0237, "step": 41880 }, { "grad_norm": 0.23055046796798706, "learning_rate": 5.0229418132202585e-05, "loss": 0.0214, "step": 41890 }, { "grad_norm": 0.44883090257644653, "learning_rate": 5.020874995803707e-05, "loss": 0.0288, "step": 41900 }, { "grad_norm": 0.3116038143634796, "learning_rate": 5.01880817482019e-05, "loss": 0.0323, "step": 41910 }, { "grad_norm": 0.37180909514427185, "learning_rate": 5.01674135062287e-05, "loss": 0.0236, "step": 41920 }, { "grad_norm": 0.4254174530506134, "learning_rate": 5.0146745235649115e-05, "loss": 0.0278, "step": 41930 }, { "grad_norm": 0.36492663621902466, "learning_rate": 5.0126076939994795e-05, "loss": 0.0244, "step": 41940 }, { "grad_norm": 0.21235215663909912, "learning_rate": 5.010540862279736e-05, "loss": 0.0295, "step": 41950 }, { "grad_norm": 0.20631837844848633, "learning_rate": 5.008474028758846e-05, "loss": 0.0312, "step": 41960 }, { "grad_norm": 0.30661541223526, "learning_rate": 5.0064071937899784e-05, "loss": 0.0255, "step": 41970 }, { "grad_norm": 0.3329806625843048, "learning_rate": 5.004340357726296e-05, "loss": 0.0237, "step": 41980 }, { "grad_norm": 0.5583317875862122, "learning_rate": 5.002273520920965e-05, "loss": 0.0238, "step": 41990 }, { "grad_norm": 0.23491273820400238, "learning_rate": 5.000206683727151e-05, "loss": 0.025, "step": 42000 }, { "grad_norm": 0.2290634661912918, "learning_rate": 4.998139846498021e-05, "loss": 0.02, "step": 42010 }, { "grad_norm": 0.3303406834602356, "learning_rate": 4.9960730095867405e-05, "loss": 0.0214, "step": 42020 }, { "grad_norm": 0.3151872456073761, "learning_rate": 4.9940061733464755e-05, "loss": 0.0248, "step": 42030 }, { "grad_norm": 0.22450311481952667, "learning_rate": 4.991939338130392e-05, "loss": 0.0299, "step": 42040 }, { "grad_norm": 0.46647146344184875, "learning_rate": 4.989872504291653e-05, "loss": 0.0356, "step": 42050 }, { "grad_norm": 0.3902831971645355, "learning_rate": 4.9878056721834273e-05, "loss": 0.0309, "step": 42060 }, { "grad_norm": 0.25451964139938354, "learning_rate": 4.98573884215888e-05, "loss": 0.021, "step": 42070 }, { "grad_norm": 0.32104530930519104, "learning_rate": 4.9836720145711715e-05, "loss": 0.0276, "step": 42080 }, { "grad_norm": 0.34044787287712097, "learning_rate": 4.9816051897734725e-05, "loss": 0.0245, "step": 42090 }, { "grad_norm": 0.3695928752422333, "learning_rate": 4.979538368118941e-05, "loss": 0.0254, "step": 42100 }, { "grad_norm": 0.20622527599334717, "learning_rate": 4.9774715499607446e-05, "loss": 0.019, "step": 42110 }, { "grad_norm": 0.3655107319355011, "learning_rate": 4.9754047356520416e-05, "loss": 0.0329, "step": 42120 }, { "grad_norm": 0.3273775577545166, "learning_rate": 4.973337925545999e-05, "loss": 0.0244, "step": 42130 }, { "grad_norm": 0.24830132722854614, "learning_rate": 4.9712711199957746e-05, "loss": 0.0167, "step": 42140 }, { "grad_norm": 0.40409207344055176, "learning_rate": 4.969204319354531e-05, "loss": 0.0267, "step": 42150 }, { "grad_norm": 0.2985953688621521, "learning_rate": 4.9671375239754267e-05, "loss": 0.023, "step": 42160 }, { "grad_norm": 0.2684936225414276, "learning_rate": 4.9650707342116195e-05, "loss": 0.0182, "step": 42170 }, { "grad_norm": 0.2860059142112732, "learning_rate": 4.96300395041627e-05, "loss": 0.0223, "step": 42180 }, { "grad_norm": 0.5556554198265076, "learning_rate": 4.960937172942532e-05, "loss": 0.0247, "step": 42190 }, { "grad_norm": 0.22816020250320435, "learning_rate": 4.958870402143565e-05, "loss": 0.0239, "step": 42200 }, { "grad_norm": 0.30458882451057434, "learning_rate": 4.9568036383725186e-05, "loss": 0.0263, "step": 42210 }, { "grad_norm": 0.25393471121788025, "learning_rate": 4.9547368819825516e-05, "loss": 0.0193, "step": 42220 }, { "grad_norm": 0.3461678922176361, "learning_rate": 4.952670133326812e-05, "loss": 0.024, "step": 42230 }, { "grad_norm": 0.29579564929008484, "learning_rate": 4.950603392758453e-05, "loss": 0.0304, "step": 42240 }, { "grad_norm": 0.2745462656021118, "learning_rate": 4.948536660630621e-05, "loss": 0.0219, "step": 42250 }, { "grad_norm": 0.28683561086654663, "learning_rate": 4.9464699372964685e-05, "loss": 0.0242, "step": 42260 }, { "grad_norm": 0.2587750256061554, "learning_rate": 4.9444032231091395e-05, "loss": 0.0231, "step": 42270 }, { "grad_norm": 0.38458406925201416, "learning_rate": 4.9423365184217776e-05, "loss": 0.023, "step": 42280 }, { "grad_norm": 0.3386947810649872, "learning_rate": 4.940269823587529e-05, "loss": 0.0247, "step": 42290 }, { "grad_norm": 0.2587997317314148, "learning_rate": 4.938203138959533e-05, "loss": 0.0164, "step": 42300 }, { "grad_norm": 0.349765807390213, "learning_rate": 4.9361364648909325e-05, "loss": 0.022, "step": 42310 }, { "grad_norm": 0.22287891805171967, "learning_rate": 4.9340698017348605e-05, "loss": 0.0175, "step": 42320 }, { "grad_norm": 0.339438259601593, "learning_rate": 4.932003149844458e-05, "loss": 0.0174, "step": 42330 }, { "grad_norm": 0.3359427750110626, "learning_rate": 4.929936509572857e-05, "loss": 0.0327, "step": 42340 }, { "grad_norm": 0.46537983417510986, "learning_rate": 4.927869881273191e-05, "loss": 0.0266, "step": 42350 }, { "grad_norm": 0.31228500604629517, "learning_rate": 4.9258032652985894e-05, "loss": 0.0315, "step": 42360 }, { "grad_norm": 0.343412309885025, "learning_rate": 4.9237366620021786e-05, "loss": 0.0292, "step": 42370 }, { "grad_norm": 0.21263961493968964, "learning_rate": 4.921670071737089e-05, "loss": 0.0244, "step": 42380 }, { "grad_norm": 0.3086661100387573, "learning_rate": 4.91960349485644e-05, "loss": 0.0264, "step": 42390 }, { "grad_norm": 0.2766701281070709, "learning_rate": 4.9175369317133566e-05, "loss": 0.0251, "step": 42400 }, { "grad_norm": 0.20736996829509735, "learning_rate": 4.915470382660953e-05, "loss": 0.0211, "step": 42410 }, { "grad_norm": 0.29219523072242737, "learning_rate": 4.9134038480523524e-05, "loss": 0.0217, "step": 42420 }, { "grad_norm": 0.280089795589447, "learning_rate": 4.911337328240664e-05, "loss": 0.0256, "step": 42430 }, { "grad_norm": 0.2094540297985077, "learning_rate": 4.909270823579003e-05, "loss": 0.0252, "step": 42440 }, { "grad_norm": 0.2967809736728668, "learning_rate": 4.907204334420476e-05, "loss": 0.0227, "step": 42450 }, { "grad_norm": 0.25294241309165955, "learning_rate": 4.9051378611181884e-05, "loss": 0.0265, "step": 42460 }, { "grad_norm": 0.2744944989681244, "learning_rate": 4.9030714040252486e-05, "loss": 0.0232, "step": 42470 }, { "grad_norm": 0.26721158623695374, "learning_rate": 4.901004963494752e-05, "loss": 0.0218, "step": 42480 }, { "grad_norm": 0.5265726447105408, "learning_rate": 4.898938539879802e-05, "loss": 0.0288, "step": 42490 }, { "grad_norm": 0.39078620076179504, "learning_rate": 4.8968721335334904e-05, "loss": 0.0264, "step": 42500 }, { "grad_norm": 0.4022480249404907, "learning_rate": 4.894805744808912e-05, "loss": 0.0257, "step": 42510 }, { "grad_norm": 0.28166094422340393, "learning_rate": 4.8927393740591524e-05, "loss": 0.0235, "step": 42520 }, { "grad_norm": 0.32682719826698303, "learning_rate": 4.890673021637302e-05, "loss": 0.0195, "step": 42530 }, { "grad_norm": 0.4102979600429535, "learning_rate": 4.888606687896442e-05, "loss": 0.023, "step": 42540 }, { "grad_norm": 0.2728889584541321, "learning_rate": 4.886540373189652e-05, "loss": 0.0221, "step": 42550 }, { "grad_norm": 0.3102894425392151, "learning_rate": 4.88447407787001e-05, "loss": 0.0235, "step": 42560 }, { "grad_norm": 0.31551384925842285, "learning_rate": 4.8824078022905864e-05, "loss": 0.0222, "step": 42570 }, { "grad_norm": 0.21829085052013397, "learning_rate": 4.880341546804455e-05, "loss": 0.0176, "step": 42580 }, { "grad_norm": 0.39295101165771484, "learning_rate": 4.878275311764679e-05, "loss": 0.0247, "step": 42590 }, { "grad_norm": 0.3516084551811218, "learning_rate": 4.8762090975243254e-05, "loss": 0.0227, "step": 42600 }, { "grad_norm": 0.2116176038980484, "learning_rate": 4.874142904436448e-05, "loss": 0.0203, "step": 42610 }, { "grad_norm": 0.41983720660209656, "learning_rate": 4.8720767328541094e-05, "loss": 0.0297, "step": 42620 }, { "grad_norm": 0.38538479804992676, "learning_rate": 4.870010583130358e-05, "loss": 0.0211, "step": 42630 }, { "grad_norm": 0.23346708714962006, "learning_rate": 4.867944455618241e-05, "loss": 0.0245, "step": 42640 }, { "grad_norm": 0.325692355632782, "learning_rate": 4.865878350670807e-05, "loss": 0.0233, "step": 42650 }, { "grad_norm": 0.35114774107933044, "learning_rate": 4.8638122686410914e-05, "loss": 0.0246, "step": 42660 }, { "grad_norm": 0.3819946050643921, "learning_rate": 4.861746209882137e-05, "loss": 0.0165, "step": 42670 }, { "grad_norm": 0.24126221239566803, "learning_rate": 4.859680174746972e-05, "loss": 0.0243, "step": 42680 }, { "grad_norm": 0.28215157985687256, "learning_rate": 4.857614163588629e-05, "loss": 0.0311, "step": 42690 }, { "grad_norm": 0.29792532324790955, "learning_rate": 4.855548176760131e-05, "loss": 0.0216, "step": 42700 }, { "grad_norm": 0.2108968198299408, "learning_rate": 4.8534822146144986e-05, "loss": 0.0239, "step": 42710 }, { "grad_norm": 0.33044856786727905, "learning_rate": 4.851416277504749e-05, "loss": 0.0333, "step": 42720 }, { "grad_norm": 0.3099982738494873, "learning_rate": 4.8493503657838923e-05, "loss": 0.0212, "step": 42730 }, { "grad_norm": 0.23720934987068176, "learning_rate": 4.84728447980494e-05, "loss": 0.0189, "step": 42740 }, { "grad_norm": 0.41695061326026917, "learning_rate": 4.8452186199208914e-05, "loss": 0.0246, "step": 42750 }, { "grad_norm": 0.2502864599227905, "learning_rate": 4.843152786484749e-05, "loss": 0.0241, "step": 42760 }, { "grad_norm": 0.1971157044172287, "learning_rate": 4.8410869798495035e-05, "loss": 0.0198, "step": 42770 }, { "grad_norm": 0.3103731870651245, "learning_rate": 4.8390212003681486e-05, "loss": 0.0232, "step": 42780 }, { "grad_norm": 0.5167508125305176, "learning_rate": 4.836955448393667e-05, "loss": 0.0306, "step": 42790 }, { "grad_norm": 0.23662056028842926, "learning_rate": 4.8348897242790406e-05, "loss": 0.0204, "step": 42800 }, { "grad_norm": 0.248050719499588, "learning_rate": 4.832824028377243e-05, "loss": 0.0207, "step": 42810 }, { "grad_norm": 0.4032907485961914, "learning_rate": 4.830758361041249e-05, "loss": 0.0203, "step": 42820 }, { "grad_norm": 0.3195030391216278, "learning_rate": 4.828692722624022e-05, "loss": 0.0226, "step": 42830 }, { "grad_norm": 0.15472061932086945, "learning_rate": 4.826627113478522e-05, "loss": 0.0367, "step": 42840 }, { "grad_norm": 0.20904456079006195, "learning_rate": 4.824561533957708e-05, "loss": 0.0173, "step": 42850 }, { "grad_norm": 0.33123695850372314, "learning_rate": 4.822495984414527e-05, "loss": 0.0224, "step": 42860 }, { "grad_norm": 0.2961006164550781, "learning_rate": 4.8204304652019304e-05, "loss": 0.0223, "step": 42870 }, { "grad_norm": 0.539409339427948, "learning_rate": 4.8183649766728537e-05, "loss": 0.0242, "step": 42880 }, { "grad_norm": 0.4054296612739563, "learning_rate": 4.816299519180237e-05, "loss": 0.0262, "step": 42890 }, { "grad_norm": 0.23256489634513855, "learning_rate": 4.8142340930770066e-05, "loss": 0.0217, "step": 42900 }, { "grad_norm": 0.3654249310493469, "learning_rate": 4.81216869871609e-05, "loss": 0.0186, "step": 42910 }, { "grad_norm": 0.2970329821109772, "learning_rate": 4.8101033364504064e-05, "loss": 0.0216, "step": 42920 }, { "grad_norm": 0.44361215829849243, "learning_rate": 4.808038006632866e-05, "loss": 0.0335, "step": 42930 }, { "grad_norm": 0.42554157972335815, "learning_rate": 4.805972709616383e-05, "loss": 0.0217, "step": 42940 }, { "grad_norm": 0.26803168654441833, "learning_rate": 4.8039074457538556e-05, "loss": 0.0179, "step": 42950 }, { "grad_norm": 0.26838794350624084, "learning_rate": 4.801842215398184e-05, "loss": 0.0247, "step": 42960 }, { "grad_norm": 0.35772427916526794, "learning_rate": 4.799777018902256e-05, "loss": 0.024, "step": 42970 }, { "grad_norm": 0.2938004732131958, "learning_rate": 4.797711856618961e-05, "loss": 0.0173, "step": 42980 }, { "grad_norm": 0.3027322590351105, "learning_rate": 4.7956467289011765e-05, "loss": 0.0188, "step": 42990 }, { "grad_norm": 0.41065850853919983, "learning_rate": 4.793581636101778e-05, "loss": 0.0214, "step": 43000 }, { "grad_norm": 0.25777584314346313, "learning_rate": 4.7915165785736326e-05, "loss": 0.0205, "step": 43010 }, { "grad_norm": 0.31216731667518616, "learning_rate": 4.789451556669599e-05, "loss": 0.0219, "step": 43020 }, { "grad_norm": 0.27758368849754333, "learning_rate": 4.7873865707425384e-05, "loss": 0.0241, "step": 43030 }, { "grad_norm": 0.22295346856117249, "learning_rate": 4.7853216211452975e-05, "loss": 0.0229, "step": 43040 }, { "grad_norm": 0.3690032362937927, "learning_rate": 4.78325670823072e-05, "loss": 0.0249, "step": 43050 }, { "grad_norm": 0.2719225287437439, "learning_rate": 4.781191832351641e-05, "loss": 0.0249, "step": 43060 }, { "grad_norm": 0.39544469118118286, "learning_rate": 4.7791269938608965e-05, "loss": 0.0231, "step": 43070 }, { "grad_norm": 0.3365795612335205, "learning_rate": 4.777062193111305e-05, "loss": 0.0242, "step": 43080 }, { "grad_norm": 0.23099003732204437, "learning_rate": 4.77499743045569e-05, "loss": 0.0224, "step": 43090 }, { "grad_norm": 0.274625688791275, "learning_rate": 4.7729327062468596e-05, "loss": 0.0246, "step": 43100 }, { "grad_norm": 0.4730696380138397, "learning_rate": 4.770868020837619e-05, "loss": 0.0269, "step": 43110 }, { "grad_norm": 0.22432583570480347, "learning_rate": 4.768803374580768e-05, "loss": 0.0182, "step": 43120 }, { "grad_norm": 0.37681201100349426, "learning_rate": 4.766738767829094e-05, "loss": 0.0182, "step": 43130 }, { "grad_norm": 0.2329859435558319, "learning_rate": 4.764674200935388e-05, "loss": 0.0242, "step": 43140 }, { "grad_norm": 0.2919180989265442, "learning_rate": 4.762609674252424e-05, "loss": 0.0218, "step": 43150 }, { "grad_norm": 0.3081183135509491, "learning_rate": 4.760545188132974e-05, "loss": 0.0383, "step": 43160 }, { "grad_norm": 0.2451677769422531, "learning_rate": 4.758480742929801e-05, "loss": 0.0219, "step": 43170 }, { "grad_norm": 0.3655115067958832, "learning_rate": 4.756416338995664e-05, "loss": 0.0301, "step": 43180 }, { "grad_norm": 0.4070141017436981, "learning_rate": 4.7543519766833145e-05, "loss": 0.0201, "step": 43190 }, { "grad_norm": 0.26547476649284363, "learning_rate": 4.752287656345492e-05, "loss": 0.0244, "step": 43200 }, { "grad_norm": 0.2462300956249237, "learning_rate": 4.7502233783349355e-05, "loss": 0.0299, "step": 43210 }, { "grad_norm": 0.34432950615882874, "learning_rate": 4.7481591430043694e-05, "loss": 0.0196, "step": 43220 }, { "grad_norm": 0.4856715798377991, "learning_rate": 4.746094950706521e-05, "loss": 0.0291, "step": 43230 }, { "grad_norm": 0.2962019741535187, "learning_rate": 4.744030801794099e-05, "loss": 0.0266, "step": 43240 }, { "grad_norm": 0.41413816809654236, "learning_rate": 4.741966696619813e-05, "loss": 0.0265, "step": 43250 }, { "grad_norm": 0.2628474533557892, "learning_rate": 4.739902635536361e-05, "loss": 0.0298, "step": 43260 }, { "grad_norm": 0.3044228255748749, "learning_rate": 4.737838618896436e-05, "loss": 0.0217, "step": 43270 }, { "grad_norm": 0.27203062176704407, "learning_rate": 4.7357746470527203e-05, "loss": 0.0252, "step": 43280 }, { "grad_norm": 0.3679685890674591, "learning_rate": 4.73371072035789e-05, "loss": 0.0291, "step": 43290 }, { "grad_norm": 0.5741770267486572, "learning_rate": 4.731646839164616e-05, "loss": 0.0231, "step": 43300 }, { "grad_norm": 0.359243243932724, "learning_rate": 4.729583003825556e-05, "loss": 0.024, "step": 43310 }, { "grad_norm": 0.29874691367149353, "learning_rate": 4.727519214693367e-05, "loss": 0.0243, "step": 43320 }, { "grad_norm": 0.45209750533103943, "learning_rate": 4.725455472120689e-05, "loss": 0.0253, "step": 43330 }, { "grad_norm": 0.1882946491241455, "learning_rate": 4.723391776460164e-05, "loss": 0.0234, "step": 43340 }, { "grad_norm": 0.44653406739234924, "learning_rate": 4.7213281280644186e-05, "loss": 0.0257, "step": 43350 }, { "grad_norm": 0.29949885606765747, "learning_rate": 4.719264527286075e-05, "loss": 0.0289, "step": 43360 }, { "grad_norm": 0.3239981234073639, "learning_rate": 4.717200974477744e-05, "loss": 0.0262, "step": 43370 }, { "grad_norm": 0.2533457577228546, "learning_rate": 4.715137469992034e-05, "loss": 0.0323, "step": 43380 }, { "grad_norm": 0.37959450483322144, "learning_rate": 4.713074014181539e-05, "loss": 0.024, "step": 43390 }, { "grad_norm": 0.26439809799194336, "learning_rate": 4.711010607398846e-05, "loss": 0.0203, "step": 43400 }, { "grad_norm": 0.3772830069065094, "learning_rate": 4.708947249996538e-05, "loss": 0.0245, "step": 43410 }, { "grad_norm": 0.25459256768226624, "learning_rate": 4.706883942327183e-05, "loss": 0.019, "step": 43420 }, { "grad_norm": 0.4836859703063965, "learning_rate": 4.704820684743347e-05, "loss": 0.0283, "step": 43430 }, { "grad_norm": 0.34502413868904114, "learning_rate": 4.702757477597581e-05, "loss": 0.0241, "step": 43440 }, { "grad_norm": 0.32767415046691895, "learning_rate": 4.700694321242435e-05, "loss": 0.0239, "step": 43450 }, { "grad_norm": 0.3979661464691162, "learning_rate": 4.698631216030441e-05, "loss": 0.0257, "step": 43460 }, { "grad_norm": 0.18834353983402252, "learning_rate": 4.6965681623141314e-05, "loss": 0.0196, "step": 43470 }, { "grad_norm": 0.31678593158721924, "learning_rate": 4.694505160446024e-05, "loss": 0.0167, "step": 43480 }, { "grad_norm": 0.2317478060722351, "learning_rate": 4.692442210778628e-05, "loss": 0.022, "step": 43490 }, { "grad_norm": 0.37033113837242126, "learning_rate": 4.690379313664448e-05, "loss": 0.0228, "step": 43500 }, { "grad_norm": 0.2781582772731781, "learning_rate": 4.688316469455973e-05, "loss": 0.0225, "step": 43510 }, { "grad_norm": 0.25107091665267944, "learning_rate": 4.6862536785056914e-05, "loss": 0.0209, "step": 43520 }, { "grad_norm": 0.35202479362487793, "learning_rate": 4.684190941166073e-05, "loss": 0.0271, "step": 43530 }, { "grad_norm": 0.2824023365974426, "learning_rate": 4.682128257789587e-05, "loss": 0.0231, "step": 43540 }, { "grad_norm": 0.5535755753517151, "learning_rate": 4.680065628728687e-05, "loss": 0.0238, "step": 43550 }, { "grad_norm": 0.28154826164245605, "learning_rate": 4.678003054335822e-05, "loss": 0.0264, "step": 43560 }, { "grad_norm": 0.33945077657699585, "learning_rate": 4.675940534963428e-05, "loss": 0.0286, "step": 43570 }, { "grad_norm": 0.2369990199804306, "learning_rate": 4.673878070963931e-05, "loss": 0.0251, "step": 43580 }, { "grad_norm": 0.2805902659893036, "learning_rate": 4.671815662689756e-05, "loss": 0.0163, "step": 43590 }, { "grad_norm": 0.3383795917034149, "learning_rate": 4.669753310493306e-05, "loss": 0.0234, "step": 43600 }, { "grad_norm": 0.22585324943065643, "learning_rate": 4.6676910147269845e-05, "loss": 0.0271, "step": 43610 }, { "grad_norm": 0.3920303285121918, "learning_rate": 4.665628775743177e-05, "loss": 0.0205, "step": 43620 }, { "grad_norm": 0.23255369067192078, "learning_rate": 4.66356659389427e-05, "loss": 0.0211, "step": 43630 }, { "grad_norm": 0.3061549663543701, "learning_rate": 4.661504469532629e-05, "loss": 0.02, "step": 43640 }, { "grad_norm": 0.21329647302627563, "learning_rate": 4.659442403010617e-05, "loss": 0.0178, "step": 43650 }, { "grad_norm": 0.31669121980667114, "learning_rate": 4.6573803946805845e-05, "loss": 0.0224, "step": 43660 }, { "grad_norm": 0.19245286285877228, "learning_rate": 4.6553184448948695e-05, "loss": 0.0249, "step": 43670 }, { "grad_norm": 0.2677464485168457, "learning_rate": 4.653256554005807e-05, "loss": 0.0182, "step": 43680 }, { "grad_norm": 0.18719252943992615, "learning_rate": 4.6511947223657145e-05, "loss": 0.0271, "step": 43690 }, { "grad_norm": 0.44645798206329346, "learning_rate": 4.649132950326906e-05, "loss": 0.0298, "step": 43700 }, { "grad_norm": 0.4605008661746979, "learning_rate": 4.647071238241679e-05, "loss": 0.0271, "step": 43710 }, { "grad_norm": 0.2522938847541809, "learning_rate": 4.6450095864623264e-05, "loss": 0.02, "step": 43720 }, { "grad_norm": 0.3580136001110077, "learning_rate": 4.642947995341123e-05, "loss": 0.0213, "step": 43730 }, { "grad_norm": 0.22574040293693542, "learning_rate": 4.640886465230345e-05, "loss": 0.0251, "step": 43740 }, { "grad_norm": 0.2827286124229431, "learning_rate": 4.6388249964822485e-05, "loss": 0.0262, "step": 43750 }, { "grad_norm": 0.2626369595527649, "learning_rate": 4.63676358944908e-05, "loss": 0.0212, "step": 43760 }, { "grad_norm": 0.4506363570690155, "learning_rate": 4.634702244483081e-05, "loss": 0.0295, "step": 43770 }, { "grad_norm": 0.24914787709712982, "learning_rate": 4.632640961936475e-05, "loss": 0.0209, "step": 43780 }, { "grad_norm": 0.2651832401752472, "learning_rate": 4.6305797421614835e-05, "loss": 0.0218, "step": 43790 }, { "grad_norm": 0.3414176106452942, "learning_rate": 4.6285185855103105e-05, "loss": 0.0246, "step": 43800 }, { "grad_norm": 0.39593765139579773, "learning_rate": 4.626457492335151e-05, "loss": 0.0231, "step": 43810 }, { "grad_norm": 0.2185574322938919, "learning_rate": 4.624396462988188e-05, "loss": 0.0186, "step": 43820 }, { "grad_norm": 0.2570001780986786, "learning_rate": 4.6223354978215985e-05, "loss": 0.0236, "step": 43830 }, { "grad_norm": 0.3100121319293976, "learning_rate": 4.620274597187544e-05, "loss": 0.0237, "step": 43840 }, { "grad_norm": 0.24741730093955994, "learning_rate": 4.6182137614381736e-05, "loss": 0.0218, "step": 43850 }, { "grad_norm": 0.26047566533088684, "learning_rate": 4.616152990925631e-05, "loss": 0.0219, "step": 43860 }, { "grad_norm": 0.19997331500053406, "learning_rate": 4.614092286002041e-05, "loss": 0.0245, "step": 43870 }, { "grad_norm": 0.28040722012519836, "learning_rate": 4.612031647019528e-05, "loss": 0.0284, "step": 43880 }, { "grad_norm": 0.5144198536872864, "learning_rate": 4.609971074330193e-05, "loss": 0.0285, "step": 43890 }, { "grad_norm": 0.41157329082489014, "learning_rate": 4.607910568286136e-05, "loss": 0.0242, "step": 43900 }, { "grad_norm": 0.37073108553886414, "learning_rate": 4.605850129239438e-05, "loss": 0.0257, "step": 43910 }, { "grad_norm": 0.30591052770614624, "learning_rate": 4.603789757542174e-05, "loss": 0.0227, "step": 43920 }, { "grad_norm": 0.453330397605896, "learning_rate": 4.601729453546404e-05, "loss": 0.0247, "step": 43930 }, { "grad_norm": 0.19472691416740417, "learning_rate": 4.599669217604177e-05, "loss": 0.0244, "step": 43940 }, { "grad_norm": 0.6104552149772644, "learning_rate": 4.597609050067532e-05, "loss": 0.0316, "step": 43950 }, { "grad_norm": 0.3135366439819336, "learning_rate": 4.595548951288495e-05, "loss": 0.0306, "step": 43960 }, { "grad_norm": 0.2640969455242157, "learning_rate": 4.593488921619081e-05, "loss": 0.0258, "step": 43970 }, { "grad_norm": 0.39146551489830017, "learning_rate": 4.591428961411289e-05, "loss": 0.0297, "step": 43980 }, { "grad_norm": 0.3184562623500824, "learning_rate": 4.589369071017117e-05, "loss": 0.0269, "step": 43990 }, { "grad_norm": 0.20836219191551208, "learning_rate": 4.587309250788538e-05, "loss": 0.0266, "step": 44000 }, { "grad_norm": 0.2598210573196411, "learning_rate": 4.585249501077522e-05, "loss": 0.0253, "step": 44010 }, { "grad_norm": 0.29013609886169434, "learning_rate": 4.583189822236021e-05, "loss": 0.0276, "step": 44020 }, { "grad_norm": 0.5352450609207153, "learning_rate": 4.5811302146159816e-05, "loss": 0.0252, "step": 44030 }, { "grad_norm": 0.2755991816520691, "learning_rate": 4.579070678569332e-05, "loss": 0.0208, "step": 44040 }, { "grad_norm": 0.29908618330955505, "learning_rate": 4.5770112144479904e-05, "loss": 0.0272, "step": 44050 }, { "grad_norm": 0.4539014399051666, "learning_rate": 4.5749518226038645e-05, "loss": 0.0252, "step": 44060 }, { "grad_norm": 0.3581398129463196, "learning_rate": 4.572892503388845e-05, "loss": 0.0218, "step": 44070 }, { "grad_norm": 0.30121397972106934, "learning_rate": 4.570833257154817e-05, "loss": 0.0285, "step": 44080 }, { "grad_norm": 0.2913811206817627, "learning_rate": 4.568774084253646e-05, "loss": 0.0271, "step": 44090 }, { "grad_norm": 0.2737225890159607, "learning_rate": 4.566714985037191e-05, "loss": 0.0298, "step": 44100 }, { "grad_norm": 0.3489956855773926, "learning_rate": 4.564655959857295e-05, "loss": 0.0231, "step": 44110 }, { "grad_norm": 0.2699337601661682, "learning_rate": 4.5625970090657893e-05, "loss": 0.024, "step": 44120 }, { "grad_norm": 0.34006592631340027, "learning_rate": 4.560538133014491e-05, "loss": 0.0205, "step": 44130 }, { "grad_norm": 0.3616899251937866, "learning_rate": 4.5584793320552055e-05, "loss": 0.0221, "step": 44140 }, { "grad_norm": 0.26292115449905396, "learning_rate": 4.556420606539728e-05, "loss": 0.0222, "step": 44150 }, { "grad_norm": 0.18623977899551392, "learning_rate": 4.554361956819836e-05, "loss": 0.0189, "step": 44160 }, { "grad_norm": 0.26222100853919983, "learning_rate": 4.552303383247299e-05, "loss": 0.0186, "step": 44170 }, { "grad_norm": 0.2450571209192276, "learning_rate": 4.5502448861738664e-05, "loss": 0.0221, "step": 44180 }, { "grad_norm": 0.293192595243454, "learning_rate": 4.548186465951284e-05, "loss": 0.0317, "step": 44190 }, { "grad_norm": 0.3349779546260834, "learning_rate": 4.546128122931277e-05, "loss": 0.0299, "step": 44200 }, { "grad_norm": 0.3038754165172577, "learning_rate": 4.544069857465561e-05, "loss": 0.0282, "step": 44210 }, { "grad_norm": 0.8868038058280945, "learning_rate": 4.542011669905837e-05, "loss": 0.0203, "step": 44220 }, { "grad_norm": 0.41738077998161316, "learning_rate": 4.53995356060379e-05, "loss": 0.0207, "step": 44230 }, { "grad_norm": 0.41858944296836853, "learning_rate": 4.5378955299110994e-05, "loss": 0.0201, "step": 44240 }, { "grad_norm": 0.6354717016220093, "learning_rate": 4.5358375781794233e-05, "loss": 0.0311, "step": 44250 }, { "grad_norm": 0.40340128540992737, "learning_rate": 4.533779705760411e-05, "loss": 0.0273, "step": 44260 }, { "grad_norm": 0.3899994194507599, "learning_rate": 4.5317219130056934e-05, "loss": 0.0216, "step": 44270 }, { "grad_norm": 0.2322281301021576, "learning_rate": 4.5296642002668964e-05, "loss": 0.0228, "step": 44280 }, { "grad_norm": 0.4357717037200928, "learning_rate": 4.5276065678956216e-05, "loss": 0.0278, "step": 44290 }, { "grad_norm": 0.30473190546035767, "learning_rate": 4.525549016243466e-05, "loss": 0.0276, "step": 44300 }, { "grad_norm": 0.26613256335258484, "learning_rate": 4.523491545662008e-05, "loss": 0.0233, "step": 44310 }, { "grad_norm": 0.28522858023643494, "learning_rate": 4.5214341565028104e-05, "loss": 0.0254, "step": 44320 }, { "grad_norm": 0.352899968624115, "learning_rate": 4.519376849117428e-05, "loss": 0.0233, "step": 44330 }, { "grad_norm": 0.17496585845947266, "learning_rate": 4.517319623857395e-05, "loss": 0.0309, "step": 44340 }, { "grad_norm": 0.3477826416492462, "learning_rate": 4.5152624810742394e-05, "loss": 0.0253, "step": 44350 }, { "grad_norm": 0.2533878684043884, "learning_rate": 4.5132054211194664e-05, "loss": 0.0209, "step": 44360 }, { "grad_norm": 0.34917959570884705, "learning_rate": 4.511148444344574e-05, "loss": 0.0251, "step": 44370 }, { "grad_norm": 0.4125940203666687, "learning_rate": 4.509091551101041e-05, "loss": 0.028, "step": 44380 }, { "grad_norm": 0.268017053604126, "learning_rate": 4.507034741740338e-05, "loss": 0.0224, "step": 44390 }, { "grad_norm": 0.31648024916648865, "learning_rate": 4.5049780166139145e-05, "loss": 0.0258, "step": 44400 }, { "grad_norm": 0.3926212191581726, "learning_rate": 4.5029213760732075e-05, "loss": 0.0325, "step": 44410 }, { "grad_norm": 0.273398220539093, "learning_rate": 4.5008648204696434e-05, "loss": 0.0279, "step": 44420 }, { "grad_norm": 0.3193771243095398, "learning_rate": 4.4988083501546284e-05, "loss": 0.019, "step": 44430 }, { "grad_norm": 0.2683701515197754, "learning_rate": 4.4967519654795606e-05, "loss": 0.0177, "step": 44440 }, { "grad_norm": 0.2887819707393646, "learning_rate": 4.494695666795816e-05, "loss": 0.0175, "step": 44450 }, { "grad_norm": 0.2791743576526642, "learning_rate": 4.4926394544547644e-05, "loss": 0.0216, "step": 44460 }, { "grad_norm": 0.25151628255844116, "learning_rate": 4.490583328807752e-05, "loss": 0.0259, "step": 44470 }, { "grad_norm": 0.3738817274570465, "learning_rate": 4.488527290206117e-05, "loss": 0.0227, "step": 44480 }, { "grad_norm": 0.34459954500198364, "learning_rate": 4.48647133900118e-05, "loss": 0.0199, "step": 44490 }, { "grad_norm": 0.24721698462963104, "learning_rate": 4.484415475544244e-05, "loss": 0.0269, "step": 44500 }, { "grad_norm": 0.26402825117111206, "learning_rate": 4.4823597001866035e-05, "loss": 0.019, "step": 44510 }, { "grad_norm": 0.43825584650039673, "learning_rate": 4.480304013279532e-05, "loss": 0.0222, "step": 44520 }, { "grad_norm": 0.37908947467803955, "learning_rate": 4.478248415174292e-05, "loss": 0.0194, "step": 44530 }, { "grad_norm": 0.24652674794197083, "learning_rate": 4.476192906222126e-05, "loss": 0.0267, "step": 44540 }, { "grad_norm": 0.314908891916275, "learning_rate": 4.474137486774268e-05, "loss": 0.0345, "step": 44550 }, { "grad_norm": 0.253103643655777, "learning_rate": 4.4720821571819296e-05, "loss": 0.0225, "step": 44560 }, { "grad_norm": 0.25280770659446716, "learning_rate": 4.470026917796314e-05, "loss": 0.0304, "step": 44570 }, { "grad_norm": 0.35723939538002014, "learning_rate": 4.4679717689686005e-05, "loss": 0.0401, "step": 44580 }, { "grad_norm": 0.26310548186302185, "learning_rate": 4.465916711049963e-05, "loss": 0.0203, "step": 44590 }, { "grad_norm": 0.2979086637496948, "learning_rate": 4.4638617443915524e-05, "loss": 0.0242, "step": 44600 }, { "grad_norm": 0.38932716846466064, "learning_rate": 4.4618068693445055e-05, "loss": 0.0297, "step": 44610 }, { "grad_norm": 0.2069370299577713, "learning_rate": 4.459752086259946e-05, "loss": 0.0185, "step": 44620 }, { "grad_norm": 0.40888267755508423, "learning_rate": 4.457697395488977e-05, "loss": 0.0355, "step": 44630 }, { "grad_norm": 0.254597544670105, "learning_rate": 4.455642797382693e-05, "loss": 0.0281, "step": 44640 }, { "grad_norm": 0.23420000076293945, "learning_rate": 4.4535882922921643e-05, "loss": 0.0298, "step": 44650 }, { "grad_norm": 0.31002503633499146, "learning_rate": 4.451533880568455e-05, "loss": 0.0224, "step": 44660 }, { "grad_norm": 0.26056164503097534, "learning_rate": 4.449479562562603e-05, "loss": 0.03, "step": 44670 }, { "grad_norm": 0.3124409317970276, "learning_rate": 4.447425338625639e-05, "loss": 0.0281, "step": 44680 }, { "grad_norm": 0.286763995885849, "learning_rate": 4.4453712091085705e-05, "loss": 0.0239, "step": 44690 }, { "grad_norm": 0.7260671257972717, "learning_rate": 4.443317174362392e-05, "loss": 0.0276, "step": 44700 }, { "grad_norm": 0.32927218079566956, "learning_rate": 4.441263234738085e-05, "loss": 0.0249, "step": 44710 }, { "grad_norm": 0.30218589305877686, "learning_rate": 4.4392093905866086e-05, "loss": 0.0176, "step": 44720 }, { "grad_norm": 0.37762171030044556, "learning_rate": 4.437155642258911e-05, "loss": 0.0205, "step": 44730 }, { "grad_norm": 0.22340388596057892, "learning_rate": 4.4351019901059177e-05, "loss": 0.0293, "step": 44740 }, { "grad_norm": 0.3091162443161011, "learning_rate": 4.433048434478547e-05, "loss": 0.0315, "step": 44750 }, { "grad_norm": 0.3813924789428711, "learning_rate": 4.4309949757276906e-05, "loss": 0.0265, "step": 44760 }, { "grad_norm": 0.26104244589805603, "learning_rate": 4.428941614204233e-05, "loss": 0.0286, "step": 44770 }, { "grad_norm": 0.33348652720451355, "learning_rate": 4.426888350259034e-05, "loss": 0.0251, "step": 44780 }, { "grad_norm": 0.30058908462524414, "learning_rate": 4.424835184242939e-05, "loss": 0.0365, "step": 44790 }, { "grad_norm": 0.24350547790527344, "learning_rate": 4.422782116506784e-05, "loss": 0.02, "step": 44800 }, { "grad_norm": 0.38216397166252136, "learning_rate": 4.4207291474013754e-05, "loss": 0.0276, "step": 44810 }, { "grad_norm": 0.2638923227787018, "learning_rate": 4.418676277277514e-05, "loss": 0.0309, "step": 44820 }, { "grad_norm": 0.2963992953300476, "learning_rate": 4.416623506485976e-05, "loss": 0.0228, "step": 44830 }, { "grad_norm": 0.26209428906440735, "learning_rate": 4.414570835377527e-05, "loss": 0.0258, "step": 44840 }, { "grad_norm": 0.351034015417099, "learning_rate": 4.412518264302909e-05, "loss": 0.0226, "step": 44850 }, { "grad_norm": 0.35863086581230164, "learning_rate": 4.4104657936128535e-05, "loss": 0.0249, "step": 44860 }, { "grad_norm": 0.2459232360124588, "learning_rate": 4.408413423658071e-05, "loss": 0.023, "step": 44870 }, { "grad_norm": 0.26672035455703735, "learning_rate": 4.406361154789253e-05, "loss": 0.0231, "step": 44880 }, { "grad_norm": 0.4404798448085785, "learning_rate": 4.4043089873570776e-05, "loss": 0.0231, "step": 44890 }, { "grad_norm": 0.3597796559333801, "learning_rate": 4.4022569217122025e-05, "loss": 0.0201, "step": 44900 }, { "grad_norm": 0.23784908652305603, "learning_rate": 4.400204958205274e-05, "loss": 0.0223, "step": 44910 }, { "grad_norm": 0.33541440963745117, "learning_rate": 4.3981530971869125e-05, "loss": 0.0285, "step": 44920 }, { "grad_norm": 0.31617262959480286, "learning_rate": 4.396101339007727e-05, "loss": 0.0177, "step": 44930 }, { "grad_norm": 0.5519271492958069, "learning_rate": 4.394049684018304e-05, "loss": 0.0265, "step": 44940 }, { "grad_norm": 0.4665592610836029, "learning_rate": 4.3919981325692186e-05, "loss": 0.0253, "step": 44950 }, { "grad_norm": 0.1739855259656906, "learning_rate": 4.389946685011024e-05, "loss": 0.0199, "step": 44960 }, { "grad_norm": 0.23022471368312836, "learning_rate": 4.387895341694255e-05, "loss": 0.028, "step": 44970 }, { "grad_norm": 0.19609498977661133, "learning_rate": 4.3858441029694324e-05, "loss": 0.0299, "step": 44980 }, { "grad_norm": 0.2812497615814209, "learning_rate": 4.3837929691870527e-05, "loss": 0.0204, "step": 44990 }, { "grad_norm": 0.2905833125114441, "learning_rate": 4.381741940697604e-05, "loss": 0.0239, "step": 45000 }, { "grad_norm": 0.27830374240875244, "learning_rate": 4.379691017851547e-05, "loss": 0.0255, "step": 45010 }, { "grad_norm": 0.28739598393440247, "learning_rate": 4.3776402009993304e-05, "loss": 0.0232, "step": 45020 }, { "grad_norm": 0.2667198181152344, "learning_rate": 4.3755894904913794e-05, "loss": 0.0185, "step": 45030 }, { "grad_norm": 0.4177500903606415, "learning_rate": 4.373538886678109e-05, "loss": 0.0239, "step": 45040 }, { "grad_norm": 0.3392273485660553, "learning_rate": 4.371488389909909e-05, "loss": 0.0219, "step": 45050 }, { "grad_norm": 0.337597131729126, "learning_rate": 4.3694380005371515e-05, "loss": 0.0276, "step": 45060 }, { "grad_norm": 0.2403828650712967, "learning_rate": 4.367387718910196e-05, "loss": 0.0211, "step": 45070 }, { "grad_norm": 0.3041883707046509, "learning_rate": 4.3653375453793764e-05, "loss": 0.0225, "step": 45080 }, { "grad_norm": 0.3099502623081207, "learning_rate": 4.3632874802950136e-05, "loss": 0.022, "step": 45090 }, { "grad_norm": 0.2682211101055145, "learning_rate": 4.3612375240074034e-05, "loss": 0.0237, "step": 45100 }, { "grad_norm": 0.22702445089817047, "learning_rate": 4.3591876768668325e-05, "loss": 0.0242, "step": 45110 }, { "grad_norm": 0.2978948950767517, "learning_rate": 4.3571379392235605e-05, "loss": 0.0262, "step": 45120 }, { "grad_norm": 0.35909149050712585, "learning_rate": 4.3550883114278335e-05, "loss": 0.0217, "step": 45130 }, { "grad_norm": 0.2854733467102051, "learning_rate": 4.353038793829876e-05, "loss": 0.0233, "step": 45140 }, { "grad_norm": 0.2798745036125183, "learning_rate": 4.350989386779891e-05, "loss": 0.0172, "step": 45150 }, { "grad_norm": 0.22547006607055664, "learning_rate": 4.3489400906280724e-05, "loss": 0.0354, "step": 45160 }, { "grad_norm": 0.28866538405418396, "learning_rate": 4.3468909057245845e-05, "loss": 0.0295, "step": 45170 }, { "grad_norm": 0.18799112737178802, "learning_rate": 4.3448418324195794e-05, "loss": 0.0271, "step": 45180 }, { "grad_norm": 0.43867921829223633, "learning_rate": 4.342792871063184e-05, "loss": 0.023, "step": 45190 }, { "grad_norm": 0.3794839382171631, "learning_rate": 4.3407440220055145e-05, "loss": 0.0254, "step": 45200 }, { "grad_norm": 0.3065919280052185, "learning_rate": 4.33869528559666e-05, "loss": 0.0197, "step": 45210 }, { "grad_norm": 0.2829049229621887, "learning_rate": 4.336646662186696e-05, "loss": 0.017, "step": 45220 }, { "grad_norm": 0.24466393887996674, "learning_rate": 4.334598152125672e-05, "loss": 0.0248, "step": 45230 }, { "grad_norm": 0.24082711338996887, "learning_rate": 4.3325497557636276e-05, "loss": 0.0253, "step": 45240 }, { "grad_norm": 0.1953597217798233, "learning_rate": 4.330501473450574e-05, "loss": 0.0257, "step": 45250 }, { "grad_norm": 0.3458867073059082, "learning_rate": 4.328453305536507e-05, "loss": 0.0202, "step": 45260 }, { "grad_norm": 0.41885310411453247, "learning_rate": 4.326405252371404e-05, "loss": 0.0276, "step": 45270 }, { "grad_norm": 0.2856435775756836, "learning_rate": 4.324357314305221e-05, "loss": 0.0199, "step": 45280 }, { "grad_norm": 0.3129379451274872, "learning_rate": 4.3223094916878945e-05, "loss": 0.0221, "step": 45290 }, { "grad_norm": 0.2356489598751068, "learning_rate": 4.320261784869338e-05, "loss": 0.0301, "step": 45300 }, { "grad_norm": 0.1754058301448822, "learning_rate": 4.318214194199455e-05, "loss": 0.0224, "step": 45310 }, { "grad_norm": 0.19883251190185547, "learning_rate": 4.316166720028118e-05, "loss": 0.0165, "step": 45320 }, { "grad_norm": 0.33961743116378784, "learning_rate": 4.3141193627051864e-05, "loss": 0.0204, "step": 45330 }, { "grad_norm": 0.32468152046203613, "learning_rate": 4.312072122580496e-05, "loss": 0.0221, "step": 45340 }, { "grad_norm": 0.24622508883476257, "learning_rate": 4.3100250000038646e-05, "loss": 0.0232, "step": 45350 }, { "grad_norm": 0.4563528001308441, "learning_rate": 4.307977995325091e-05, "loss": 0.0396, "step": 45360 }, { "grad_norm": 0.26715245842933655, "learning_rate": 4.30593110889395e-05, "loss": 0.0237, "step": 45370 }, { "grad_norm": 0.49499934911727905, "learning_rate": 4.3038843410602016e-05, "loss": 0.0185, "step": 45380 }, { "grad_norm": 0.21281728148460388, "learning_rate": 4.3018376921735774e-05, "loss": 0.0218, "step": 45390 }, { "grad_norm": 0.163165882229805, "learning_rate": 4.299791162583799e-05, "loss": 0.0207, "step": 45400 }, { "grad_norm": 0.1812545359134674, "learning_rate": 4.29774475264056e-05, "loss": 0.0284, "step": 45410 }, { "grad_norm": 0.2679615914821625, "learning_rate": 4.2956984626935365e-05, "loss": 0.0209, "step": 45420 }, { "grad_norm": 0.30200904607772827, "learning_rate": 4.293652293092383e-05, "loss": 0.0285, "step": 45430 }, { "grad_norm": 0.2617185413837433, "learning_rate": 4.2916062441867324e-05, "loss": 0.0226, "step": 45440 }, { "grad_norm": 0.2961723804473877, "learning_rate": 4.289560316326201e-05, "loss": 0.0272, "step": 45450 }, { "grad_norm": 0.40602511167526245, "learning_rate": 4.28751450986038e-05, "loss": 0.0244, "step": 45460 }, { "grad_norm": 0.4984895586967468, "learning_rate": 4.2854688251388444e-05, "loss": 0.0277, "step": 45470 }, { "grad_norm": 0.2599349021911621, "learning_rate": 4.2834232625111425e-05, "loss": 0.0249, "step": 45480 }, { "grad_norm": 0.8156977891921997, "learning_rate": 4.2813778223268086e-05, "loss": 0.0243, "step": 45490 }, { "grad_norm": 0.38195496797561646, "learning_rate": 4.2793325049353477e-05, "loss": 0.019, "step": 45500 }, { "grad_norm": 0.33453625440597534, "learning_rate": 4.2772873106862535e-05, "loss": 0.023, "step": 45510 }, { "grad_norm": 0.210636168718338, "learning_rate": 4.275242239928993e-05, "loss": 0.0199, "step": 45520 }, { "grad_norm": 0.387219101190567, "learning_rate": 4.273197293013009e-05, "loss": 0.0172, "step": 45530 }, { "grad_norm": 0.21513108909130096, "learning_rate": 4.271152470287731e-05, "loss": 0.0155, "step": 45540 }, { "grad_norm": 0.19702675938606262, "learning_rate": 4.2691077721025594e-05, "loss": 0.0239, "step": 45550 }, { "grad_norm": 0.4145348072052002, "learning_rate": 4.267063198806883e-05, "loss": 0.0232, "step": 45560 }, { "grad_norm": 0.24765659868717194, "learning_rate": 4.2650187507500574e-05, "loss": 0.0224, "step": 45570 }, { "grad_norm": 0.2645424008369446, "learning_rate": 4.2629744282814275e-05, "loss": 0.0164, "step": 45580 }, { "grad_norm": 0.2177380919456482, "learning_rate": 4.2609302317503074e-05, "loss": 0.0214, "step": 45590 }, { "grad_norm": 0.26281753182411194, "learning_rate": 4.258886161505999e-05, "loss": 0.0207, "step": 45600 }, { "grad_norm": 0.19410304725170135, "learning_rate": 4.2568422178977775e-05, "loss": 0.0209, "step": 45610 }, { "grad_norm": 0.2743338346481323, "learning_rate": 4.254798401274894e-05, "loss": 0.0274, "step": 45620 }, { "grad_norm": 0.3922981917858124, "learning_rate": 4.252754711986583e-05, "loss": 0.0235, "step": 45630 }, { "grad_norm": 0.1966322809457779, "learning_rate": 4.250711150382052e-05, "loss": 0.0202, "step": 45640 }, { "grad_norm": 0.18206988275051117, "learning_rate": 4.248667716810495e-05, "loss": 0.0213, "step": 45650 }, { "grad_norm": 0.27461275458335876, "learning_rate": 4.246624411621074e-05, "loss": 0.0176, "step": 45660 }, { "grad_norm": 0.36132124066352844, "learning_rate": 4.244581235162938e-05, "loss": 0.0246, "step": 45670 }, { "grad_norm": 0.19472365081310272, "learning_rate": 4.2425381877852075e-05, "loss": 0.0257, "step": 45680 }, { "grad_norm": 0.45029401779174805, "learning_rate": 4.2404952698369856e-05, "loss": 0.0215, "step": 45690 }, { "grad_norm": 0.23983755707740784, "learning_rate": 4.238452481667349e-05, "loss": 0.0258, "step": 45700 }, { "grad_norm": 0.3092312812805176, "learning_rate": 4.2364098236253526e-05, "loss": 0.0282, "step": 45710 }, { "grad_norm": 0.2804923355579376, "learning_rate": 4.2343672960600356e-05, "loss": 0.0195, "step": 45720 }, { "grad_norm": 0.25810888409614563, "learning_rate": 4.232324899320406e-05, "loss": 0.0154, "step": 45730 }, { "grad_norm": 0.24992436170578003, "learning_rate": 4.230282633755457e-05, "loss": 0.0209, "step": 45740 }, { "grad_norm": 0.26225098967552185, "learning_rate": 4.2282404997141515e-05, "loss": 0.0167, "step": 45750 }, { "grad_norm": 0.24653545022010803, "learning_rate": 4.2261984975454397e-05, "loss": 0.0246, "step": 45760 }, { "grad_norm": 0.31391602754592896, "learning_rate": 4.224156627598239e-05, "loss": 0.0202, "step": 45770 }, { "grad_norm": 0.2738106846809387, "learning_rate": 4.222114890221453e-05, "loss": 0.0236, "step": 45780 }, { "grad_norm": 0.28247758746147156, "learning_rate": 4.2200732857639546e-05, "loss": 0.0246, "step": 45790 }, { "grad_norm": 0.21374155580997467, "learning_rate": 4.2180318145746035e-05, "loss": 0.0226, "step": 45800 }, { "grad_norm": 0.3602764308452606, "learning_rate": 4.215990477002227e-05, "loss": 0.0269, "step": 45810 }, { "grad_norm": 0.2705678343772888, "learning_rate": 4.2139492733956356e-05, "loss": 0.02, "step": 45820 }, { "grad_norm": 0.4573066830635071, "learning_rate": 4.211908204103615e-05, "loss": 0.0329, "step": 45830 }, { "grad_norm": 0.27110546827316284, "learning_rate": 4.2098672694749265e-05, "loss": 0.0366, "step": 45840 }, { "grad_norm": 0.42951279878616333, "learning_rate": 4.2078264698583133e-05, "loss": 0.0258, "step": 45850 }, { "grad_norm": 0.2435847669839859, "learning_rate": 4.205785805602488e-05, "loss": 0.0245, "step": 45860 }, { "grad_norm": 0.3753831386566162, "learning_rate": 4.203745277056149e-05, "loss": 0.0261, "step": 45870 }, { "grad_norm": 0.26295679807662964, "learning_rate": 4.201704884567964e-05, "loss": 0.0261, "step": 45880 }, { "grad_norm": 0.2699902355670929, "learning_rate": 4.1996646284865816e-05, "loss": 0.0318, "step": 45890 }, { "grad_norm": 0.3082434833049774, "learning_rate": 4.197624509160625e-05, "loss": 0.0203, "step": 45900 }, { "grad_norm": 0.22860968112945557, "learning_rate": 4.195584526938692e-05, "loss": 0.0187, "step": 45910 }, { "grad_norm": 0.34295451641082764, "learning_rate": 4.193544682169365e-05, "loss": 0.0237, "step": 45920 }, { "grad_norm": 0.29904863238334656, "learning_rate": 4.1915049752011946e-05, "loss": 0.0182, "step": 45930 }, { "grad_norm": 0.23629669845104218, "learning_rate": 4.189465406382712e-05, "loss": 0.0191, "step": 45940 }, { "grad_norm": 0.233235165476799, "learning_rate": 4.187425976062422e-05, "loss": 0.0251, "step": 45950 }, { "grad_norm": 0.3998337686061859, "learning_rate": 4.18538668458881e-05, "loss": 0.0241, "step": 45960 }, { "grad_norm": 0.22840018570423126, "learning_rate": 4.183347532310333e-05, "loss": 0.0199, "step": 45970 }, { "grad_norm": 0.20949353277683258, "learning_rate": 4.181308519575429e-05, "loss": 0.0191, "step": 45980 }, { "grad_norm": 0.3344404101371765, "learning_rate": 4.179269646732507e-05, "loss": 0.0333, "step": 45990 }, { "grad_norm": 0.4747387766838074, "learning_rate": 4.177230914129954e-05, "loss": 0.0213, "step": 46000 }, { "grad_norm": 0.2691192328929901, "learning_rate": 4.175192322116136e-05, "loss": 0.019, "step": 46010 }, { "grad_norm": 0.2940899729728699, "learning_rate": 4.173153871039391e-05, "loss": 0.0165, "step": 46020 }, { "grad_norm": 0.39128538966178894, "learning_rate": 4.171115561248036e-05, "loss": 0.0257, "step": 46030 }, { "grad_norm": 0.18819044530391693, "learning_rate": 4.16907739309036e-05, "loss": 0.0165, "step": 46040 }, { "grad_norm": 0.21055687963962555, "learning_rate": 4.167039366914633e-05, "loss": 0.0213, "step": 46050 }, { "grad_norm": 0.257660835981369, "learning_rate": 4.165001483069096e-05, "loss": 0.0258, "step": 46060 }, { "grad_norm": 0.32601240277290344, "learning_rate": 4.16296374190197e-05, "loss": 0.0308, "step": 46070 }, { "grad_norm": 0.2708638608455658, "learning_rate": 4.1609261437614464e-05, "loss": 0.0206, "step": 46080 }, { "grad_norm": 0.40416258573532104, "learning_rate": 4.158888688995696e-05, "loss": 0.0248, "step": 46090 }, { "grad_norm": 0.37466153502464294, "learning_rate": 4.1568513779528645e-05, "loss": 0.0191, "step": 46100 }, { "grad_norm": 0.263290137052536, "learning_rate": 4.1548142109810704e-05, "loss": 0.0254, "step": 46110 }, { "grad_norm": 0.2035481482744217, "learning_rate": 4.152777188428414e-05, "loss": 0.0191, "step": 46120 }, { "grad_norm": 0.290831059217453, "learning_rate": 4.1507403106429646e-05, "loss": 0.0178, "step": 46130 }, { "grad_norm": 0.2959691286087036, "learning_rate": 4.148703577972768e-05, "loss": 0.0188, "step": 46140 }, { "grad_norm": 0.27888748049736023, "learning_rate": 4.146666990765846e-05, "loss": 0.0222, "step": 46150 }, { "grad_norm": 0.26387831568717957, "learning_rate": 4.1446305493701986e-05, "loss": 0.0216, "step": 46160 }, { "grad_norm": 0.34223783016204834, "learning_rate": 4.142594254133796e-05, "loss": 0.0204, "step": 46170 }, { "grad_norm": 0.3791511654853821, "learning_rate": 4.1405581054045835e-05, "loss": 0.023, "step": 46180 }, { "grad_norm": 0.2920071482658386, "learning_rate": 4.1385221035304864e-05, "loss": 0.0188, "step": 46190 }, { "grad_norm": 0.2884232997894287, "learning_rate": 4.136486248859398e-05, "loss": 0.0226, "step": 46200 }, { "grad_norm": 1.0253044366836548, "learning_rate": 4.1344505417391955e-05, "loss": 0.0214, "step": 46210 }, { "grad_norm": 0.3058967888355255, "learning_rate": 4.132414982517721e-05, "loss": 0.0218, "step": 46220 }, { "grad_norm": 0.240207239985466, "learning_rate": 4.130379571542798e-05, "loss": 0.0215, "step": 46230 }, { "grad_norm": 0.1959173083305359, "learning_rate": 4.12834430916222e-05, "loss": 0.0196, "step": 46240 }, { "grad_norm": 0.30711987614631653, "learning_rate": 4.126309195723763e-05, "loss": 0.0202, "step": 46250 }, { "grad_norm": 0.2626277208328247, "learning_rate": 4.124274231575168e-05, "loss": 0.025, "step": 46260 }, { "grad_norm": 0.22512541711330414, "learning_rate": 4.122239417064154e-05, "loss": 0.0154, "step": 46270 }, { "grad_norm": 0.17290763556957245, "learning_rate": 4.1202047525384184e-05, "loss": 0.018, "step": 46280 }, { "grad_norm": 0.3080567717552185, "learning_rate": 4.118170238345627e-05, "loss": 0.0219, "step": 46290 }, { "grad_norm": 0.2621755003929138, "learning_rate": 4.1161358748334256e-05, "loss": 0.0221, "step": 46300 }, { "grad_norm": 0.29317399859428406, "learning_rate": 4.1141016623494266e-05, "loss": 0.0241, "step": 46310 }, { "grad_norm": 0.3581804037094116, "learning_rate": 4.112067601241227e-05, "loss": 0.0224, "step": 46320 }, { "grad_norm": 0.25597015023231506, "learning_rate": 4.110033691856387e-05, "loss": 0.0249, "step": 46330 }, { "grad_norm": 0.48061084747314453, "learning_rate": 4.107999934542451e-05, "loss": 0.0353, "step": 46340 }, { "grad_norm": 0.2542838454246521, "learning_rate": 4.105966329646928e-05, "loss": 0.0282, "step": 46350 }, { "grad_norm": 0.30718058347702026, "learning_rate": 4.103932877517308e-05, "loss": 0.0286, "step": 46360 }, { "grad_norm": 0.2774847149848938, "learning_rate": 4.101899578501052e-05, "loss": 0.0227, "step": 46370 }, { "grad_norm": 0.21400684118270874, "learning_rate": 4.099866432945595e-05, "loss": 0.0216, "step": 46380 }, { "grad_norm": 0.374214231967926, "learning_rate": 4.097833441198346e-05, "loss": 0.0215, "step": 46390 }, { "grad_norm": 0.17433379590511322, "learning_rate": 4.095800603606685e-05, "loss": 0.0211, "step": 46400 }, { "grad_norm": 0.291245698928833, "learning_rate": 4.093767920517975e-05, "loss": 0.0228, "step": 46410 }, { "grad_norm": 0.2865527272224426, "learning_rate": 4.091735392279539e-05, "loss": 0.0219, "step": 46420 }, { "grad_norm": 0.26013466715812683, "learning_rate": 4.089703019238685e-05, "loss": 0.0213, "step": 46430 }, { "grad_norm": 0.2886298596858978, "learning_rate": 4.0876708017426866e-05, "loss": 0.0181, "step": 46440 }, { "grad_norm": 0.26228782534599304, "learning_rate": 4.085638740138798e-05, "loss": 0.0207, "step": 46450 }, { "grad_norm": 0.28807663917541504, "learning_rate": 4.08360683477424e-05, "loss": 0.0197, "step": 46460 }, { "grad_norm": 0.3155190348625183, "learning_rate": 4.0815750859962085e-05, "loss": 0.0217, "step": 46470 }, { "grad_norm": 0.20381832122802734, "learning_rate": 4.079543494151879e-05, "loss": 0.0249, "step": 46480 }, { "grad_norm": 0.3433380722999573, "learning_rate": 4.07751205958839e-05, "loss": 0.0205, "step": 46490 }, { "grad_norm": 0.3398283123970032, "learning_rate": 4.0754807826528615e-05, "loss": 0.0206, "step": 46500 }, { "grad_norm": 0.3311077058315277, "learning_rate": 4.073449663692379e-05, "loss": 0.0244, "step": 46510 }, { "grad_norm": 0.2416556030511856, "learning_rate": 4.07141870305401e-05, "loss": 0.0166, "step": 46520 }, { "grad_norm": 0.40184125304222107, "learning_rate": 4.0693879010847866e-05, "loss": 0.031, "step": 46530 }, { "grad_norm": 0.28593531250953674, "learning_rate": 4.06735725813172e-05, "loss": 0.0219, "step": 46540 }, { "grad_norm": 0.7159688472747803, "learning_rate": 4.0653267745417903e-05, "loss": 0.0178, "step": 46550 }, { "grad_norm": 0.1959301382303238, "learning_rate": 4.063296450661949e-05, "loss": 0.0183, "step": 46560 }, { "grad_norm": 0.1493169069290161, "learning_rate": 4.061266286839128e-05, "loss": 0.0216, "step": 46570 }, { "grad_norm": 0.4716782867908478, "learning_rate": 4.0592362834202225e-05, "loss": 0.0254, "step": 46580 }, { "grad_norm": 0.23716358840465546, "learning_rate": 4.057206440752107e-05, "loss": 0.0164, "step": 46590 }, { "grad_norm": 0.2384217530488968, "learning_rate": 4.0551767591816245e-05, "loss": 0.0188, "step": 46600 }, { "grad_norm": 0.27479469776153564, "learning_rate": 4.0531472390555935e-05, "loss": 0.0249, "step": 46610 }, { "grad_norm": 0.3941549062728882, "learning_rate": 4.051117880720802e-05, "loss": 0.0207, "step": 46620 }, { "grad_norm": 0.275534987449646, "learning_rate": 4.049088684524015e-05, "loss": 0.0196, "step": 46630 }, { "grad_norm": 0.2811813950538635, "learning_rate": 4.0470596508119636e-05, "loss": 0.0189, "step": 46640 }, { "grad_norm": 0.22386130690574646, "learning_rate": 4.0450307799313524e-05, "loss": 0.0234, "step": 46650 }, { "grad_norm": 0.30840569734573364, "learning_rate": 4.0430020722288656e-05, "loss": 0.0234, "step": 46660 }, { "grad_norm": 0.23564140498638153, "learning_rate": 4.040973528051148e-05, "loss": 0.0273, "step": 46670 }, { "grad_norm": 0.2796745300292969, "learning_rate": 4.038945147744827e-05, "loss": 0.0238, "step": 46680 }, { "grad_norm": 0.3296020030975342, "learning_rate": 4.0369169316564945e-05, "loss": 0.0206, "step": 46690 }, { "grad_norm": 0.4436933100223541, "learning_rate": 4.034888880132718e-05, "loss": 0.0263, "step": 46700 }, { "grad_norm": 0.42387762665748596, "learning_rate": 4.032860993520035e-05, "loss": 0.0185, "step": 46710 }, { "grad_norm": 0.22343593835830688, "learning_rate": 4.030833272164959e-05, "loss": 0.0212, "step": 46720 }, { "grad_norm": 0.22608819603919983, "learning_rate": 4.0288057164139705e-05, "loss": 0.0265, "step": 46730 }, { "grad_norm": 0.21316176652908325, "learning_rate": 4.0267783266135205e-05, "loss": 0.0149, "step": 46740 }, { "grad_norm": 0.3212033212184906, "learning_rate": 4.024751103110039e-05, "loss": 0.0223, "step": 46750 }, { "grad_norm": 0.25837114453315735, "learning_rate": 4.0227240462499176e-05, "loss": 0.0215, "step": 46760 }, { "grad_norm": 0.24706792831420898, "learning_rate": 4.020697156379531e-05, "loss": 0.022, "step": 46770 }, { "grad_norm": 0.2833370864391327, "learning_rate": 4.018670433845215e-05, "loss": 0.0268, "step": 46780 }, { "grad_norm": 0.314502477645874, "learning_rate": 4.016643878993284e-05, "loss": 0.0212, "step": 46790 }, { "grad_norm": 0.36874374747276306, "learning_rate": 4.014617492170017e-05, "loss": 0.0272, "step": 46800 }, { "grad_norm": 0.26984551548957825, "learning_rate": 4.0125912737216726e-05, "loss": 0.0283, "step": 46810 }, { "grad_norm": 0.3399408459663391, "learning_rate": 4.0105652239944735e-05, "loss": 0.0229, "step": 46820 }, { "grad_norm": 0.20631246268749237, "learning_rate": 4.0085393433346144e-05, "loss": 0.0256, "step": 46830 }, { "grad_norm": 0.22259952127933502, "learning_rate": 4.006513632088268e-05, "loss": 0.0203, "step": 46840 }, { "grad_norm": 0.26626256108283997, "learning_rate": 4.004488090601567e-05, "loss": 0.0163, "step": 46850 }, { "grad_norm": 0.25426721572875977, "learning_rate": 4.002462719220626e-05, "loss": 0.0208, "step": 46860 }, { "grad_norm": 0.2287476807832718, "learning_rate": 4.000437518291522e-05, "loss": 0.0265, "step": 46870 }, { "grad_norm": 0.19591738283634186, "learning_rate": 3.9984124881603094e-05, "loss": 0.0225, "step": 46880 }, { "grad_norm": 0.4700518250465393, "learning_rate": 3.9963876291730086e-05, "loss": 0.0272, "step": 46890 }, { "grad_norm": 0.24520374834537506, "learning_rate": 3.994362941675614e-05, "loss": 0.0206, "step": 46900 }, { "grad_norm": 0.29301947355270386, "learning_rate": 3.992338426014088e-05, "loss": 0.0263, "step": 46910 }, { "grad_norm": 0.4052371680736542, "learning_rate": 3.9903140825343636e-05, "loss": 0.0225, "step": 46920 }, { "grad_norm": 0.4991939663887024, "learning_rate": 3.98828991158235e-05, "loss": 0.0229, "step": 46930 }, { "grad_norm": 0.30233800411224365, "learning_rate": 3.9862659135039185e-05, "loss": 0.0237, "step": 46940 }, { "grad_norm": 0.32090330123901367, "learning_rate": 3.984242088644918e-05, "loss": 0.0222, "step": 46950 }, { "grad_norm": 0.29423442482948303, "learning_rate": 3.9822184373511615e-05, "loss": 0.0231, "step": 46960 }, { "grad_norm": 0.21111392974853516, "learning_rate": 3.98019495996844e-05, "loss": 0.0212, "step": 46970 }, { "grad_norm": 0.2615043818950653, "learning_rate": 3.978171656842507e-05, "loss": 0.0246, "step": 46980 }, { "grad_norm": 0.3028312027454376, "learning_rate": 3.976148528319091e-05, "loss": 0.0208, "step": 46990 }, { "grad_norm": 0.26239484548568726, "learning_rate": 3.974125574743888e-05, "loss": 0.0224, "step": 47000 }, { "grad_norm": 0.41740089654922485, "learning_rate": 3.9721027964625686e-05, "loss": 0.0233, "step": 47010 }, { "grad_norm": 0.2897869944572449, "learning_rate": 3.9700801938207676e-05, "loss": 0.021, "step": 47020 }, { "grad_norm": 0.4512873888015747, "learning_rate": 3.9680577671640916e-05, "loss": 0.0303, "step": 47030 }, { "grad_norm": 0.2957436442375183, "learning_rate": 3.966035516838121e-05, "loss": 0.0294, "step": 47040 }, { "grad_norm": 0.43853574991226196, "learning_rate": 3.9640134431884014e-05, "loss": 0.0229, "step": 47050 }, { "grad_norm": 0.2248227894306183, "learning_rate": 3.961991546560451e-05, "loss": 0.0188, "step": 47060 }, { "grad_norm": 0.18302951753139496, "learning_rate": 3.959969827299753e-05, "loss": 0.0235, "step": 47070 }, { "grad_norm": 0.2673962712287903, "learning_rate": 3.9579482857517684e-05, "loss": 0.0206, "step": 47080 }, { "grad_norm": 0.3161705434322357, "learning_rate": 3.955926922261921e-05, "loss": 0.0212, "step": 47090 }, { "grad_norm": 0.6883968710899353, "learning_rate": 3.9539057371756084e-05, "loss": 0.0253, "step": 47100 }, { "grad_norm": 0.27063703536987305, "learning_rate": 3.951884730838195e-05, "loss": 0.0148, "step": 47110 }, { "grad_norm": 0.24277262389659882, "learning_rate": 3.949863903595012e-05, "loss": 0.017, "step": 47120 }, { "grad_norm": 0.26366087794303894, "learning_rate": 3.947843255791369e-05, "loss": 0.0196, "step": 47130 }, { "grad_norm": 0.2946869730949402, "learning_rate": 3.9458227877725364e-05, "loss": 0.0258, "step": 47140 }, { "grad_norm": 0.44200608134269714, "learning_rate": 3.943802499883758e-05, "loss": 0.0176, "step": 47150 }, { "grad_norm": 0.3405042588710785, "learning_rate": 3.9417823924702437e-05, "loss": 0.0251, "step": 47160 }, { "grad_norm": 0.3681444227695465, "learning_rate": 3.939762465877178e-05, "loss": 0.0241, "step": 47170 }, { "grad_norm": 0.4092778265476227, "learning_rate": 3.937742720449708e-05, "loss": 0.0222, "step": 47180 }, { "grad_norm": 0.36242806911468506, "learning_rate": 3.9357231565329563e-05, "loss": 0.0229, "step": 47190 }, { "grad_norm": 0.29753926396369934, "learning_rate": 3.933703774472008e-05, "loss": 0.0202, "step": 47200 }, { "grad_norm": 0.2792198061943054, "learning_rate": 3.93168457461192e-05, "loss": 0.0182, "step": 47210 }, { "grad_norm": 0.351582407951355, "learning_rate": 3.9296655572977216e-05, "loss": 0.0217, "step": 47220 }, { "grad_norm": 0.3128179907798767, "learning_rate": 3.927646722874404e-05, "loss": 0.0233, "step": 47230 }, { "grad_norm": 0.1895594298839569, "learning_rate": 3.925628071686934e-05, "loss": 0.0198, "step": 47240 }, { "grad_norm": 0.22179891169071198, "learning_rate": 3.9236096040802415e-05, "loss": 0.0232, "step": 47250 }, { "grad_norm": 0.46257302165031433, "learning_rate": 3.9215913203992294e-05, "loss": 0.0311, "step": 47260 }, { "grad_norm": 0.2549460232257843, "learning_rate": 3.9195732209887645e-05, "loss": 0.0206, "step": 47270 }, { "grad_norm": 0.22048938274383545, "learning_rate": 3.9175553061936875e-05, "loss": 0.0196, "step": 47280 }, { "grad_norm": 0.27553117275238037, "learning_rate": 3.9155375763588045e-05, "loss": 0.0244, "step": 47290 }, { "grad_norm": 0.1992097795009613, "learning_rate": 3.913520031828889e-05, "loss": 0.019, "step": 47300 }, { "grad_norm": 0.3480793833732605, "learning_rate": 3.911502672948685e-05, "loss": 0.021, "step": 47310 }, { "grad_norm": 0.3129396438598633, "learning_rate": 3.9094855000629014e-05, "loss": 0.0225, "step": 47320 }, { "grad_norm": 0.22573333978652954, "learning_rate": 3.907468513516223e-05, "loss": 0.0209, "step": 47330 }, { "grad_norm": 0.22899959981441498, "learning_rate": 3.905451713653294e-05, "loss": 0.0177, "step": 47340 }, { "grad_norm": 0.23787952959537506, "learning_rate": 3.903435100818731e-05, "loss": 0.0212, "step": 47350 }, { "grad_norm": 0.8567137718200684, "learning_rate": 3.901418675357117e-05, "loss": 0.0249, "step": 47360 }, { "grad_norm": 0.28526973724365234, "learning_rate": 3.8994024376130075e-05, "loss": 0.0255, "step": 47370 }, { "grad_norm": 0.2351524382829666, "learning_rate": 3.8973863879309194e-05, "loss": 0.0232, "step": 47380 }, { "grad_norm": 0.34063920378685, "learning_rate": 3.8953705266553394e-05, "loss": 0.0226, "step": 47390 }, { "grad_norm": 0.310844749212265, "learning_rate": 3.893354854130727e-05, "loss": 0.0257, "step": 47400 }, { "grad_norm": 0.26060572266578674, "learning_rate": 3.8913393707015006e-05, "loss": 0.0215, "step": 47410 }, { "grad_norm": 0.2549753785133362, "learning_rate": 3.889324076712056e-05, "loss": 0.0256, "step": 47420 }, { "grad_norm": 0.25115588307380676, "learning_rate": 3.8873089725067476e-05, "loss": 0.019, "step": 47430 }, { "grad_norm": 0.3301404118537903, "learning_rate": 3.885294058429905e-05, "loss": 0.0289, "step": 47440 }, { "grad_norm": 0.1772613674402237, "learning_rate": 3.8832793348258206e-05, "loss": 0.0216, "step": 47450 }, { "grad_norm": 0.2785581946372986, "learning_rate": 3.881264802038756e-05, "loss": 0.0335, "step": 47460 }, { "grad_norm": 0.32159826159477234, "learning_rate": 3.879250460412939e-05, "loss": 0.0235, "step": 47470 }, { "grad_norm": 0.26982420682907104, "learning_rate": 3.8772363102925644e-05, "loss": 0.02, "step": 47480 }, { "grad_norm": 0.3960076570510864, "learning_rate": 3.875222352021798e-05, "loss": 0.0177, "step": 47490 }, { "grad_norm": 0.21723386645317078, "learning_rate": 3.8732085859447686e-05, "loss": 0.0218, "step": 47500 }, { "grad_norm": 0.27316024899482727, "learning_rate": 3.871195012405575e-05, "loss": 0.0206, "step": 47510 }, { "grad_norm": 0.30605438351631165, "learning_rate": 3.869181631748278e-05, "loss": 0.0203, "step": 47520 }, { "grad_norm": 0.3926416039466858, "learning_rate": 3.867168444316915e-05, "loss": 0.0197, "step": 47530 }, { "grad_norm": 0.1426890790462494, "learning_rate": 3.865155450455481e-05, "loss": 0.0152, "step": 47540 }, { "grad_norm": 0.19208014011383057, "learning_rate": 3.8631426505079426e-05, "loss": 0.0264, "step": 47550 }, { "grad_norm": 0.4119161069393158, "learning_rate": 3.8611300448182304e-05, "loss": 0.0221, "step": 47560 }, { "grad_norm": 0.27205556631088257, "learning_rate": 3.859117633730248e-05, "loss": 0.0186, "step": 47570 }, { "grad_norm": 0.3643154799938202, "learning_rate": 3.857105417587858e-05, "loss": 0.0209, "step": 47580 }, { "grad_norm": 0.31910645961761475, "learning_rate": 3.855093396734894e-05, "loss": 0.0175, "step": 47590 }, { "grad_norm": 0.26477593183517456, "learning_rate": 3.8530815715151545e-05, "loss": 0.0324, "step": 47600 }, { "grad_norm": 0.299571692943573, "learning_rate": 3.851069942272405e-05, "loss": 0.0162, "step": 47610 }, { "grad_norm": 0.3097379207611084, "learning_rate": 3.849058509350382e-05, "loss": 0.0202, "step": 47620 }, { "grad_norm": 0.25163599848747253, "learning_rate": 3.8470472730927783e-05, "loss": 0.0279, "step": 47630 }, { "grad_norm": 0.4119352102279663, "learning_rate": 3.845036233843264e-05, "loss": 0.0208, "step": 47640 }, { "grad_norm": 0.22191111743450165, "learning_rate": 3.843025391945469e-05, "loss": 0.0215, "step": 47650 }, { "grad_norm": 0.30575495958328247, "learning_rate": 3.841014747742992e-05, "loss": 0.0192, "step": 47660 }, { "grad_norm": 0.3584335148334503, "learning_rate": 3.839004301579397e-05, "loss": 0.0162, "step": 47670 }, { "grad_norm": 0.32937419414520264, "learning_rate": 3.8369940537982097e-05, "loss": 0.0219, "step": 47680 }, { "grad_norm": 0.6620471477508545, "learning_rate": 3.834984004742933e-05, "loss": 0.0182, "step": 47690 }, { "grad_norm": 0.3629049062728882, "learning_rate": 3.832974154757026e-05, "loss": 0.0162, "step": 47700 }, { "grad_norm": 0.3273513615131378, "learning_rate": 3.830964504183919e-05, "loss": 0.022, "step": 47710 }, { "grad_norm": 0.3594081699848175, "learning_rate": 3.828955053367003e-05, "loss": 0.0208, "step": 47720 }, { "grad_norm": 0.5019171833992004, "learning_rate": 3.826945802649642e-05, "loss": 0.0249, "step": 47730 }, { "grad_norm": 0.24926556646823883, "learning_rate": 3.824936752375159e-05, "loss": 0.0272, "step": 47740 }, { "grad_norm": 0.3270248472690582, "learning_rate": 3.822927902886848e-05, "loss": 0.0199, "step": 47750 }, { "grad_norm": 0.2955041825771332, "learning_rate": 3.8209192545279653e-05, "loss": 0.0234, "step": 47760 }, { "grad_norm": 0.23975816369056702, "learning_rate": 3.8189108076417326e-05, "loss": 0.0198, "step": 47770 }, { "grad_norm": 0.26993227005004883, "learning_rate": 3.816902562571342e-05, "loss": 0.0353, "step": 47780 }, { "grad_norm": 0.31148630380630493, "learning_rate": 3.814894519659944e-05, "loss": 0.0233, "step": 47790 }, { "grad_norm": 0.22016139328479767, "learning_rate": 3.812886679250661e-05, "loss": 0.0203, "step": 47800 }, { "grad_norm": 0.20305708050727844, "learning_rate": 3.810879041686575e-05, "loss": 0.0188, "step": 47810 }, { "grad_norm": 0.4096823036670685, "learning_rate": 3.808871607310741e-05, "loss": 0.0265, "step": 47820 }, { "grad_norm": 0.2823618948459625, "learning_rate": 3.806864376466169e-05, "loss": 0.0238, "step": 47830 }, { "grad_norm": 0.18104933202266693, "learning_rate": 3.804857349495845e-05, "loss": 0.0142, "step": 47840 }, { "grad_norm": 0.4298628568649292, "learning_rate": 3.802850526742713e-05, "loss": 0.0416, "step": 47850 }, { "grad_norm": 0.2891171872615814, "learning_rate": 3.800843908549683e-05, "loss": 0.0156, "step": 47860 }, { "grad_norm": 0.24473567306995392, "learning_rate": 3.7988374952596325e-05, "loss": 0.0165, "step": 47870 }, { "grad_norm": 0.3063776195049286, "learning_rate": 3.7968312872154e-05, "loss": 0.022, "step": 47880 }, { "grad_norm": 0.2877628207206726, "learning_rate": 3.7948252847597965e-05, "loss": 0.0231, "step": 47890 }, { "grad_norm": 0.322896271944046, "learning_rate": 3.7928194882355885e-05, "loss": 0.0222, "step": 47900 }, { "grad_norm": 0.21593362092971802, "learning_rate": 3.790813897985515e-05, "loss": 0.014, "step": 47910 }, { "grad_norm": 0.37554919719696045, "learning_rate": 3.7888085143522726e-05, "loss": 0.0268, "step": 47920 }, { "grad_norm": 0.28665149211883545, "learning_rate": 3.7868033376785314e-05, "loss": 0.0206, "step": 47930 }, { "grad_norm": 0.546358585357666, "learning_rate": 3.784798368306919e-05, "loss": 0.0211, "step": 47940 }, { "grad_norm": 0.5362433791160583, "learning_rate": 3.782793606580029e-05, "loss": 0.0209, "step": 47950 }, { "grad_norm": 0.2210458666086197, "learning_rate": 3.7807890528404205e-05, "loss": 0.0224, "step": 47960 }, { "grad_norm": 0.32326602935791016, "learning_rate": 3.778784707430616e-05, "loss": 0.0217, "step": 47970 }, { "grad_norm": 0.2312762588262558, "learning_rate": 3.776780570693107e-05, "loss": 0.0155, "step": 47980 }, { "grad_norm": 0.24367637932300568, "learning_rate": 3.774776642970342e-05, "loss": 0.026, "step": 47990 }, { "grad_norm": 0.23348462581634521, "learning_rate": 3.77277292460474e-05, "loss": 0.0178, "step": 48000 }, { "grad_norm": 0.24277465045452118, "learning_rate": 3.770769415938678e-05, "loss": 0.0208, "step": 48010 }, { "grad_norm": 0.3935532569885254, "learning_rate": 3.768766117314506e-05, "loss": 0.0279, "step": 48020 }, { "grad_norm": 1.1191222667694092, "learning_rate": 3.76676302907453e-05, "loss": 0.0281, "step": 48030 }, { "grad_norm": 0.5580732822418213, "learning_rate": 3.764760151561021e-05, "loss": 0.0162, "step": 48040 }, { "grad_norm": 0.24657364189624786, "learning_rate": 3.76275748511622e-05, "loss": 0.0194, "step": 48050 }, { "grad_norm": 0.3555978536605835, "learning_rate": 3.7607550300823255e-05, "loss": 0.0198, "step": 48060 }, { "grad_norm": 0.2994900345802307, "learning_rate": 3.7587527868015044e-05, "loss": 0.0312, "step": 48070 }, { "grad_norm": 0.2780686318874359, "learning_rate": 3.756750755615881e-05, "loss": 0.023, "step": 48080 }, { "grad_norm": 0.1773141771554947, "learning_rate": 3.754748936867553e-05, "loss": 0.0203, "step": 48090 }, { "grad_norm": 0.20435650646686554, "learning_rate": 3.752747330898573e-05, "loss": 0.0186, "step": 48100 }, { "grad_norm": 0.2012830376625061, "learning_rate": 3.750745938050962e-05, "loss": 0.0213, "step": 48110 }, { "grad_norm": 0.28698021173477173, "learning_rate": 3.7487447586667025e-05, "loss": 0.0225, "step": 48120 }, { "grad_norm": 0.21359151601791382, "learning_rate": 3.7467437930877426e-05, "loss": 0.0164, "step": 48130 }, { "grad_norm": 0.23218509554862976, "learning_rate": 3.744743041655992e-05, "loss": 0.0188, "step": 48140 }, { "grad_norm": 0.2680237591266632, "learning_rate": 3.742742504713324e-05, "loss": 0.0235, "step": 48150 }, { "grad_norm": 0.290046364068985, "learning_rate": 3.740742182601576e-05, "loss": 0.0221, "step": 48160 }, { "grad_norm": 0.19361627101898193, "learning_rate": 3.7387420756625464e-05, "loss": 0.02, "step": 48170 }, { "grad_norm": 0.28760746121406555, "learning_rate": 3.736742184238002e-05, "loss": 0.0217, "step": 48180 }, { "grad_norm": 0.17452573776245117, "learning_rate": 3.7347425086696684e-05, "loss": 0.0202, "step": 48190 }, { "grad_norm": 0.3068847954273224, "learning_rate": 3.732743049299235e-05, "loss": 0.0214, "step": 48200 }, { "grad_norm": 0.29352694749832153, "learning_rate": 3.730743806468354e-05, "loss": 0.0191, "step": 48210 }, { "grad_norm": 0.34745579957962036, "learning_rate": 3.7287447805186436e-05, "loss": 0.0188, "step": 48220 }, { "grad_norm": 0.20815140008926392, "learning_rate": 3.726745971791682e-05, "loss": 0.0232, "step": 48230 }, { "grad_norm": 0.3384186029434204, "learning_rate": 3.724747380629008e-05, "loss": 0.0286, "step": 48240 }, { "grad_norm": 0.19950661063194275, "learning_rate": 3.72274900737213e-05, "loss": 0.0231, "step": 48250 }, { "grad_norm": 0.22786392271518707, "learning_rate": 3.7207508523625123e-05, "loss": 0.0179, "step": 48260 }, { "grad_norm": 0.2070414125919342, "learning_rate": 3.718752915941588e-05, "loss": 0.0198, "step": 48270 }, { "grad_norm": 0.36358869075775146, "learning_rate": 3.7167551984507464e-05, "loss": 0.0197, "step": 48280 }, { "grad_norm": 0.30272218585014343, "learning_rate": 3.714757700231346e-05, "loss": 0.0193, "step": 48290 }, { "grad_norm": 0.2586989998817444, "learning_rate": 3.712760421624703e-05, "loss": 0.0183, "step": 48300 }, { "grad_norm": 0.21727623045444489, "learning_rate": 3.710763362972099e-05, "loss": 0.0194, "step": 48310 }, { "grad_norm": 0.3307825028896332, "learning_rate": 3.708766524614774e-05, "loss": 0.0242, "step": 48320 }, { "grad_norm": 0.308202862739563, "learning_rate": 3.7067699068939335e-05, "loss": 0.019, "step": 48330 }, { "grad_norm": 0.2250959277153015, "learning_rate": 3.704773510150748e-05, "loss": 0.02, "step": 48340 }, { "grad_norm": 0.29452207684516907, "learning_rate": 3.702777334726344e-05, "loss": 0.0233, "step": 48350 }, { "grad_norm": 0.289384663105011, "learning_rate": 3.7007813809618164e-05, "loss": 0.0166, "step": 48360 }, { "grad_norm": 0.39862769842147827, "learning_rate": 3.6987856491982145e-05, "loss": 0.0223, "step": 48370 }, { "grad_norm": 0.2507348656654358, "learning_rate": 3.696790139776558e-05, "loss": 0.0201, "step": 48380 }, { "grad_norm": 0.2702997028827667, "learning_rate": 3.6947948530378235e-05, "loss": 0.0253, "step": 48390 }, { "grad_norm": 0.36221379041671753, "learning_rate": 3.6927997893229516e-05, "loss": 0.0203, "step": 48400 }, { "grad_norm": 0.17968665063381195, "learning_rate": 3.690804948972845e-05, "loss": 0.017, "step": 48410 }, { "grad_norm": 0.16770346462726593, "learning_rate": 3.688810332328363e-05, "loss": 0.0195, "step": 48420 }, { "grad_norm": 0.25601640343666077, "learning_rate": 3.686815939730336e-05, "loss": 0.0243, "step": 48430 }, { "grad_norm": 0.30895161628723145, "learning_rate": 3.684821771519548e-05, "loss": 0.0193, "step": 48440 }, { "grad_norm": 0.25557082891464233, "learning_rate": 3.68282782803675e-05, "loss": 0.0254, "step": 48450 }, { "grad_norm": 0.38224002718925476, "learning_rate": 3.6808341096226504e-05, "loss": 0.0291, "step": 48460 }, { "grad_norm": 0.3258429169654846, "learning_rate": 3.678840616617924e-05, "loss": 0.0232, "step": 48470 }, { "grad_norm": 0.5057445764541626, "learning_rate": 3.6768473493632e-05, "loss": 0.0265, "step": 48480 }, { "grad_norm": 0.3202480971813202, "learning_rate": 3.6748543081990783e-05, "loss": 0.0246, "step": 48490 }, { "grad_norm": 0.2798607051372528, "learning_rate": 3.672861493466112e-05, "loss": 0.0158, "step": 48500 }, { "grad_norm": 0.4215480387210846, "learning_rate": 3.670868905504818e-05, "loss": 0.0231, "step": 48510 }, { "grad_norm": 0.2938987910747528, "learning_rate": 3.6688765446556784e-05, "loss": 0.019, "step": 48520 }, { "grad_norm": 0.3669556975364685, "learning_rate": 3.6668844112591276e-05, "loss": 0.0221, "step": 48530 }, { "grad_norm": 0.3286013901233673, "learning_rate": 3.664892505655573e-05, "loss": 0.021, "step": 48540 }, { "grad_norm": 0.3445904850959778, "learning_rate": 3.662900828185373e-05, "loss": 0.0279, "step": 48550 }, { "grad_norm": 0.2410888969898224, "learning_rate": 3.6609093791888516e-05, "loss": 0.0202, "step": 48560 }, { "grad_norm": 0.32876846194267273, "learning_rate": 3.658918159006292e-05, "loss": 0.0288, "step": 48570 }, { "grad_norm": 0.37196648120880127, "learning_rate": 3.656927167977942e-05, "loss": 0.0305, "step": 48580 }, { "grad_norm": 0.3043328523635864, "learning_rate": 3.654936406444006e-05, "loss": 0.0198, "step": 48590 }, { "grad_norm": 0.2990049719810486, "learning_rate": 3.65294587474465e-05, "loss": 0.0183, "step": 48600 }, { "grad_norm": 0.2427339106798172, "learning_rate": 3.650955573220002e-05, "loss": 0.05, "step": 48610 }, { "grad_norm": 0.3359922766685486, "learning_rate": 3.648965502210149e-05, "loss": 0.0187, "step": 48620 }, { "grad_norm": 0.3072715997695923, "learning_rate": 3.646975662055142e-05, "loss": 0.0225, "step": 48630 }, { "grad_norm": 0.2757081985473633, "learning_rate": 3.644986053094987e-05, "loss": 0.0212, "step": 48640 }, { "grad_norm": 0.2989623248577118, "learning_rate": 3.642996675669659e-05, "loss": 0.0227, "step": 48650 }, { "grad_norm": 0.2485203742980957, "learning_rate": 3.641007530119083e-05, "loss": 0.0163, "step": 48660 }, { "grad_norm": 0.2227446287870407, "learning_rate": 3.639018616783153e-05, "loss": 0.0217, "step": 48670 }, { "grad_norm": 0.2353764921426773, "learning_rate": 3.637029936001719e-05, "loss": 0.0188, "step": 48680 }, { "grad_norm": 0.24051877856254578, "learning_rate": 3.6350414881145886e-05, "loss": 0.0247, "step": 48690 }, { "grad_norm": 0.2116619199514389, "learning_rate": 3.6330532734615386e-05, "loss": 0.0233, "step": 48700 }, { "grad_norm": 0.2626500427722931, "learning_rate": 3.6310652923822975e-05, "loss": 0.0216, "step": 48710 }, { "grad_norm": 0.3220328390598297, "learning_rate": 3.629077545216558e-05, "loss": 0.0204, "step": 48720 }, { "grad_norm": 0.3927364647388458, "learning_rate": 3.627090032303969e-05, "loss": 0.0222, "step": 48730 }, { "grad_norm": 0.20623965561389923, "learning_rate": 3.625102753984146e-05, "loss": 0.0206, "step": 48740 }, { "grad_norm": 0.2227417528629303, "learning_rate": 3.623115710596659e-05, "loss": 0.017, "step": 48750 }, { "grad_norm": 0.34075412154197693, "learning_rate": 3.6211289024810395e-05, "loss": 0.0237, "step": 48760 }, { "grad_norm": 0.21662111580371857, "learning_rate": 3.619142329976777e-05, "loss": 0.0178, "step": 48770 }, { "grad_norm": 0.22664712369441986, "learning_rate": 3.6171559934233247e-05, "loss": 0.0198, "step": 48780 }, { "grad_norm": 0.2296510636806488, "learning_rate": 3.615169893160093e-05, "loss": 0.0158, "step": 48790 }, { "grad_norm": 0.38901129364967346, "learning_rate": 3.61318402952645e-05, "loss": 0.0223, "step": 48800 }, { "grad_norm": 0.34660398960113525, "learning_rate": 3.6111984028617285e-05, "loss": 0.0247, "step": 48810 }, { "grad_norm": 0.25214824080467224, "learning_rate": 3.6092130135052134e-05, "loss": 0.0258, "step": 48820 }, { "grad_norm": 0.22912411391735077, "learning_rate": 3.6072278617961584e-05, "loss": 0.0267, "step": 48830 }, { "grad_norm": 0.33427125215530396, "learning_rate": 3.605242948073767e-05, "loss": 0.0233, "step": 48840 }, { "grad_norm": 0.3357487618923187, "learning_rate": 3.603258272677212e-05, "loss": 0.0226, "step": 48850 }, { "grad_norm": 0.2439907044172287, "learning_rate": 3.601273835945616e-05, "loss": 0.0207, "step": 48860 }, { "grad_norm": 0.25029051303863525, "learning_rate": 3.5992896382180664e-05, "loss": 0.0186, "step": 48870 }, { "grad_norm": 0.2541619539260864, "learning_rate": 3.597305679833609e-05, "loss": 0.0222, "step": 48880 }, { "grad_norm": 0.34553077816963196, "learning_rate": 3.595321961131245e-05, "loss": 0.0247, "step": 48890 }, { "grad_norm": 0.23845119774341583, "learning_rate": 3.593338482449942e-05, "loss": 0.0261, "step": 48900 }, { "grad_norm": 0.3579633831977844, "learning_rate": 3.591355244128618e-05, "loss": 0.0316, "step": 48910 }, { "grad_norm": 0.23721390962600708, "learning_rate": 3.589372246506158e-05, "loss": 0.0247, "step": 48920 }, { "grad_norm": 0.2783891558647156, "learning_rate": 3.5873894899213984e-05, "loss": 0.0202, "step": 48930 }, { "grad_norm": 0.3280848264694214, "learning_rate": 3.5854069747131416e-05, "loss": 0.019, "step": 48940 }, { "grad_norm": 0.3743188679218292, "learning_rate": 3.583424701220143e-05, "loss": 0.0317, "step": 48950 }, { "grad_norm": 0.37965142726898193, "learning_rate": 3.581442669781121e-05, "loss": 0.0237, "step": 48960 }, { "grad_norm": 0.4120353162288666, "learning_rate": 3.579460880734749e-05, "loss": 0.021, "step": 48970 }, { "grad_norm": 0.2165631353855133, "learning_rate": 3.577479334419657e-05, "loss": 0.0213, "step": 48980 }, { "grad_norm": 0.24072757363319397, "learning_rate": 3.575498031174444e-05, "loss": 0.018, "step": 48990 }, { "grad_norm": 0.37210214138031006, "learning_rate": 3.573516971337657e-05, "loss": 0.0229, "step": 49000 }, { "grad_norm": 0.4564206302165985, "learning_rate": 3.5715361552478046e-05, "loss": 0.0241, "step": 49010 }, { "grad_norm": 0.2751970887184143, "learning_rate": 3.5695555832433536e-05, "loss": 0.0181, "step": 49020 }, { "grad_norm": 0.4030759930610657, "learning_rate": 3.5675752556627325e-05, "loss": 0.0216, "step": 49030 }, { "grad_norm": 0.44374629855155945, "learning_rate": 3.565595172844322e-05, "loss": 0.019, "step": 49040 }, { "grad_norm": 0.2556053698062897, "learning_rate": 3.5636153351264666e-05, "loss": 0.0227, "step": 49050 }, { "grad_norm": 0.24423924088478088, "learning_rate": 3.5616357428474655e-05, "loss": 0.024, "step": 49060 }, { "grad_norm": 0.2832287847995758, "learning_rate": 3.559656396345575e-05, "loss": 0.0258, "step": 49070 }, { "grad_norm": 0.20005980134010315, "learning_rate": 3.5576772959590146e-05, "loss": 0.0256, "step": 49080 }, { "grad_norm": 0.2208806872367859, "learning_rate": 3.5556984420259545e-05, "loss": 0.0207, "step": 49090 }, { "grad_norm": 0.27779215574264526, "learning_rate": 3.5537198348845305e-05, "loss": 0.0255, "step": 49100 }, { "grad_norm": 0.2648930251598358, "learning_rate": 3.551741474872831e-05, "loss": 0.0245, "step": 49110 }, { "grad_norm": 0.30563053488731384, "learning_rate": 3.549763362328903e-05, "loss": 0.0306, "step": 49120 }, { "grad_norm": 0.23250453174114227, "learning_rate": 3.5477854975907515e-05, "loss": 0.019, "step": 49130 }, { "grad_norm": 0.23119397461414337, "learning_rate": 3.5458078809963416e-05, "loss": 0.0197, "step": 49140 }, { "grad_norm": 0.2905368506908417, "learning_rate": 3.543830512883594e-05, "loss": 0.0172, "step": 49150 }, { "grad_norm": 0.3763059675693512, "learning_rate": 3.5418533935903824e-05, "loss": 0.0248, "step": 49160 }, { "grad_norm": 0.4049615263938904, "learning_rate": 3.539876523454547e-05, "loss": 0.0225, "step": 49170 }, { "grad_norm": 0.268621027469635, "learning_rate": 3.537899902813878e-05, "loss": 0.023, "step": 49180 }, { "grad_norm": 0.17299257218837738, "learning_rate": 3.5359235320061293e-05, "loss": 0.0166, "step": 49190 }, { "grad_norm": 0.2838208079338074, "learning_rate": 3.533947411369003e-05, "loss": 0.0165, "step": 49200 }, { "grad_norm": 0.22976362705230713, "learning_rate": 3.53197154124017e-05, "loss": 0.0192, "step": 49210 }, { "grad_norm": 0.36546456813812256, "learning_rate": 3.52999592195725e-05, "loss": 0.0221, "step": 49220 }, { "grad_norm": 0.2115529328584671, "learning_rate": 3.5280205538578224e-05, "loss": 0.0171, "step": 49230 }, { "grad_norm": 0.293863445520401, "learning_rate": 3.5260454372794236e-05, "loss": 0.017, "step": 49240 }, { "grad_norm": 0.2746432423591614, "learning_rate": 3.524070572559545e-05, "loss": 0.024, "step": 49250 }, { "grad_norm": 0.2659452557563782, "learning_rate": 3.5220959600356395e-05, "loss": 0.017, "step": 49260 }, { "grad_norm": 0.2562454640865326, "learning_rate": 3.5201216000451145e-05, "loss": 0.0169, "step": 49270 }, { "grad_norm": 0.2937743663787842, "learning_rate": 3.5181474929253335e-05, "loss": 0.0152, "step": 49280 }, { "grad_norm": 0.22176647186279297, "learning_rate": 3.516173639013615e-05, "loss": 0.0243, "step": 49290 }, { "grad_norm": 0.17955005168914795, "learning_rate": 3.5142000386472406e-05, "loss": 0.0172, "step": 49300 }, { "grad_norm": 0.23841896653175354, "learning_rate": 3.5122266921634427e-05, "loss": 0.0169, "step": 49310 }, { "grad_norm": 0.29202380776405334, "learning_rate": 3.510253599899413e-05, "loss": 0.0205, "step": 49320 }, { "grad_norm": 0.19337740540504456, "learning_rate": 3.5082807621922965e-05, "loss": 0.0153, "step": 49330 }, { "grad_norm": 0.2750174105167389, "learning_rate": 3.506308179379201e-05, "loss": 0.0195, "step": 49340 }, { "grad_norm": 0.18681548535823822, "learning_rate": 3.5043358517971844e-05, "loss": 0.0218, "step": 49350 }, { "grad_norm": 0.2377706617116928, "learning_rate": 3.502363779783264e-05, "loss": 0.026, "step": 49360 }, { "grad_norm": 0.22199030220508575, "learning_rate": 3.500391963674415e-05, "loss": 0.021, "step": 49370 }, { "grad_norm": 0.266298770904541, "learning_rate": 3.4984204038075615e-05, "loss": 0.0269, "step": 49380 }, { "grad_norm": 0.22277554869651794, "learning_rate": 3.496449100519595e-05, "loss": 0.0269, "step": 49390 }, { "grad_norm": 0.2607913613319397, "learning_rate": 3.494478054147354e-05, "loss": 0.0187, "step": 49400 }, { "grad_norm": 0.16510289907455444, "learning_rate": 3.4925072650276395e-05, "loss": 0.0161, "step": 49410 }, { "grad_norm": 0.28680655360221863, "learning_rate": 3.4905367334972016e-05, "loss": 0.0172, "step": 49420 }, { "grad_norm": 0.37450262904167175, "learning_rate": 3.488566459892752e-05, "loss": 0.0194, "step": 49430 }, { "grad_norm": 0.3918096423149109, "learning_rate": 3.4865964445509585e-05, "loss": 0.0196, "step": 49440 }, { "grad_norm": 0.2587636709213257, "learning_rate": 3.484626687808438e-05, "loss": 0.016, "step": 49450 }, { "grad_norm": 0.24217627942562103, "learning_rate": 3.4826571900017735e-05, "loss": 0.0136, "step": 49460 }, { "grad_norm": 0.30396541953086853, "learning_rate": 3.480687951467495e-05, "loss": 0.0177, "step": 49470 }, { "grad_norm": 0.23936109244823456, "learning_rate": 3.4787189725420925e-05, "loss": 0.0173, "step": 49480 }, { "grad_norm": 0.22842447459697723, "learning_rate": 3.4767502535620086e-05, "loss": 0.018, "step": 49490 }, { "grad_norm": 0.3063983917236328, "learning_rate": 3.474781794863648e-05, "loss": 0.0227, "step": 49500 }, { "grad_norm": 0.31447938084602356, "learning_rate": 3.472813596783363e-05, "loss": 0.0269, "step": 49510 }, { "grad_norm": 0.32065829634666443, "learning_rate": 3.470845659657466e-05, "loss": 0.0218, "step": 49520 }, { "grad_norm": 0.21583539247512817, "learning_rate": 3.468877983822223e-05, "loss": 0.0173, "step": 49530 }, { "grad_norm": 0.2191396951675415, "learning_rate": 3.466910569613855e-05, "loss": 0.0172, "step": 49540 }, { "grad_norm": 0.4218301475048065, "learning_rate": 3.464943417368542e-05, "loss": 0.0214, "step": 49550 }, { "grad_norm": 0.45102164149284363, "learning_rate": 3.462976527422415e-05, "loss": 0.0194, "step": 49560 }, { "grad_norm": 0.2537619173526764, "learning_rate": 3.461009900111562e-05, "loss": 0.0212, "step": 49570 }, { "grad_norm": 0.3468013405799866, "learning_rate": 3.459043535772023e-05, "loss": 0.0213, "step": 49580 }, { "grad_norm": 0.3275843560695648, "learning_rate": 3.4570774347398014e-05, "loss": 0.0212, "step": 49590 }, { "grad_norm": 0.41110190749168396, "learning_rate": 3.4551115973508454e-05, "loss": 0.024, "step": 49600 }, { "grad_norm": 0.3910842537879944, "learning_rate": 3.453146023941066e-05, "loss": 0.0319, "step": 49610 }, { "grad_norm": 0.27966660261154175, "learning_rate": 3.451180714846325e-05, "loss": 0.0222, "step": 49620 }, { "grad_norm": 1.0960825681686401, "learning_rate": 3.449215670402438e-05, "loss": 0.0244, "step": 49630 }, { "grad_norm": 0.5945114493370056, "learning_rate": 3.447250890945181e-05, "loss": 0.0231, "step": 49640 }, { "grad_norm": 0.3456118106842041, "learning_rate": 3.4452863768102754e-05, "loss": 0.0233, "step": 49650 }, { "grad_norm": 0.2121484875679016, "learning_rate": 3.443322128333409e-05, "loss": 0.0188, "step": 49660 }, { "grad_norm": 0.5123241543769836, "learning_rate": 3.441358145850215e-05, "loss": 0.0234, "step": 49670 }, { "grad_norm": 0.3453861176967621, "learning_rate": 3.439394429696286e-05, "loss": 0.0252, "step": 49680 }, { "grad_norm": 0.29443520307540894, "learning_rate": 3.4374309802071644e-05, "loss": 0.0212, "step": 49690 }, { "grad_norm": 0.3619208037853241, "learning_rate": 3.435467797718353e-05, "loss": 0.0174, "step": 49700 }, { "grad_norm": 0.2783958315849304, "learning_rate": 3.433504882565306e-05, "loss": 0.0189, "step": 49710 }, { "grad_norm": 0.25867533683776855, "learning_rate": 3.43154223508343e-05, "loss": 0.0236, "step": 49720 }, { "grad_norm": 0.15523535013198853, "learning_rate": 3.429579855608089e-05, "loss": 0.0186, "step": 49730 }, { "grad_norm": 0.33106422424316406, "learning_rate": 3.427617744474597e-05, "loss": 0.019, "step": 49740 }, { "grad_norm": 0.24964554607868195, "learning_rate": 3.425655902018231e-05, "loss": 0.0221, "step": 49750 }, { "grad_norm": 0.2217738777399063, "learning_rate": 3.423694328574211e-05, "loss": 0.0188, "step": 49760 }, { "grad_norm": 0.26501455903053284, "learning_rate": 3.42173302447772e-05, "loss": 0.0188, "step": 49770 }, { "grad_norm": 0.178275927901268, "learning_rate": 3.419771990063886e-05, "loss": 0.0165, "step": 49780 }, { "grad_norm": 0.4930424988269806, "learning_rate": 3.417811225667803e-05, "loss": 0.016, "step": 49790 }, { "grad_norm": 0.22566348314285278, "learning_rate": 3.415850731624508e-05, "loss": 0.0241, "step": 49800 }, { "grad_norm": 0.18364085257053375, "learning_rate": 3.4138905082689945e-05, "loss": 0.0259, "step": 49810 }, { "grad_norm": 0.22801938652992249, "learning_rate": 3.4119305559362145e-05, "loss": 0.0226, "step": 49820 }, { "grad_norm": 0.2403319627046585, "learning_rate": 3.4099708749610684e-05, "loss": 0.0163, "step": 49830 }, { "grad_norm": 0.28958868980407715, "learning_rate": 3.408011465678413e-05, "loss": 0.0271, "step": 49840 }, { "grad_norm": 0.22855240106582642, "learning_rate": 3.406052328423055e-05, "loss": 0.0212, "step": 49850 }, { "grad_norm": 0.20068302750587463, "learning_rate": 3.4040934635297615e-05, "loss": 0.0179, "step": 49860 }, { "grad_norm": 0.3313540816307068, "learning_rate": 3.4021348713332466e-05, "loss": 0.0166, "step": 49870 }, { "grad_norm": 0.35728347301483154, "learning_rate": 3.4001765521681807e-05, "loss": 0.0232, "step": 49880 }, { "grad_norm": 0.21516819298267365, "learning_rate": 3.398218506369188e-05, "loss": 0.0326, "step": 49890 }, { "grad_norm": 0.2207334339618683, "learning_rate": 3.3962607342708404e-05, "loss": 0.0158, "step": 49900 }, { "grad_norm": 0.27231940627098083, "learning_rate": 3.394303236207673e-05, "loss": 0.0158, "step": 49910 }, { "grad_norm": 0.2812823951244354, "learning_rate": 3.392346012514166e-05, "loss": 0.021, "step": 49920 }, { "grad_norm": 0.3490131199359894, "learning_rate": 3.390389063524757e-05, "loss": 0.0185, "step": 49930 }, { "grad_norm": 0.2852686047554016, "learning_rate": 3.3884323895738324e-05, "loss": 0.0163, "step": 49940 }, { "grad_norm": 0.2885226607322693, "learning_rate": 3.386475990995738e-05, "loss": 0.0219, "step": 49950 }, { "grad_norm": 0.28332948684692383, "learning_rate": 3.384519868124765e-05, "loss": 0.0158, "step": 49960 }, { "grad_norm": 0.1905858814716339, "learning_rate": 3.3825640212951645e-05, "loss": 0.0195, "step": 49970 }, { "grad_norm": 0.42218729853630066, "learning_rate": 3.380608450841134e-05, "loss": 0.0198, "step": 49980 }, { "grad_norm": 0.2990136444568634, "learning_rate": 3.3786531570968305e-05, "loss": 0.0214, "step": 49990 }, { "grad_norm": 0.2459457367658615, "learning_rate": 3.3766981403963584e-05, "loss": 0.0231, "step": 50000 }, { "grad_norm": 0.2324296534061432, "learning_rate": 3.374743401073775e-05, "loss": 0.0191, "step": 50010 }, { "grad_norm": 0.1992630958557129, "learning_rate": 3.372788939463095e-05, "loss": 0.0179, "step": 50020 }, { "grad_norm": 0.3623528480529785, "learning_rate": 3.370834755898281e-05, "loss": 0.0155, "step": 50030 }, { "grad_norm": 0.3259011209011078, "learning_rate": 3.3688808507132493e-05, "loss": 0.0228, "step": 50040 }, { "grad_norm": 0.305829256772995, "learning_rate": 3.3669272242418685e-05, "loss": 0.0261, "step": 50050 }, { "grad_norm": 0.28418752551078796, "learning_rate": 3.364973876817961e-05, "loss": 0.0199, "step": 50060 }, { "grad_norm": 0.26752349734306335, "learning_rate": 3.363020808775299e-05, "loss": 0.0153, "step": 50070 }, { "grad_norm": 0.18694934248924255, "learning_rate": 3.361068020447611e-05, "loss": 0.0174, "step": 50080 }, { "grad_norm": 0.18081976473331451, "learning_rate": 3.3591155121685724e-05, "loss": 0.0148, "step": 50090 }, { "grad_norm": 0.31422609090805054, "learning_rate": 3.357163284271814e-05, "loss": 0.0185, "step": 50100 }, { "grad_norm": 0.4065990149974823, "learning_rate": 3.355211337090919e-05, "loss": 0.0154, "step": 50110 }, { "grad_norm": 0.36646774411201477, "learning_rate": 3.353259670959421e-05, "loss": 0.023, "step": 50120 }, { "grad_norm": 0.28020161390304565, "learning_rate": 3.351308286210808e-05, "loss": 0.0221, "step": 50130 }, { "grad_norm": 0.20873872935771942, "learning_rate": 3.3493571831785156e-05, "loss": 0.0214, "step": 50140 }, { "grad_norm": 0.30584144592285156, "learning_rate": 3.347406362195936e-05, "loss": 0.0164, "step": 50150 }, { "grad_norm": 0.20935069024562836, "learning_rate": 3.345455823596411e-05, "loss": 0.0175, "step": 50160 }, { "grad_norm": 0.2399461716413498, "learning_rate": 3.3435055677132346e-05, "loss": 0.0205, "step": 50170 }, { "grad_norm": 0.31805506348609924, "learning_rate": 3.3415555948796505e-05, "loss": 0.0206, "step": 50180 }, { "grad_norm": 0.3415137231349945, "learning_rate": 3.3396059054288556e-05, "loss": 0.0206, "step": 50190 }, { "grad_norm": 0.35026249289512634, "learning_rate": 3.3376564996940015e-05, "loss": 0.0275, "step": 50200 }, { "grad_norm": 0.29979395866394043, "learning_rate": 3.3357073780081836e-05, "loss": 0.0179, "step": 50210 }, { "grad_norm": 0.19171489775180817, "learning_rate": 3.333758540704459e-05, "loss": 0.0159, "step": 50220 }, { "grad_norm": 0.18860748410224915, "learning_rate": 3.331809988115827e-05, "loss": 0.0157, "step": 50230 }, { "grad_norm": 0.32346421480178833, "learning_rate": 3.329861720575244e-05, "loss": 0.02, "step": 50240 }, { "grad_norm": 0.22401884198188782, "learning_rate": 3.3279137384156126e-05, "loss": 0.0175, "step": 50250 }, { "grad_norm": 0.2849726974964142, "learning_rate": 3.3259660419697934e-05, "loss": 0.0244, "step": 50260 }, { "grad_norm": 0.1897188425064087, "learning_rate": 3.3240186315705926e-05, "loss": 0.0261, "step": 50270 }, { "grad_norm": 0.5915935635566711, "learning_rate": 3.322071507550769e-05, "loss": 0.0271, "step": 50280 }, { "grad_norm": 0.33587124943733215, "learning_rate": 3.320124670243033e-05, "loss": 0.018, "step": 50290 }, { "grad_norm": 0.18287266790866852, "learning_rate": 3.318178119980045e-05, "loss": 0.0171, "step": 50300 }, { "grad_norm": 0.2607528269290924, "learning_rate": 3.316231857094421e-05, "loss": 0.0185, "step": 50310 }, { "grad_norm": 0.24417082965373993, "learning_rate": 3.31428588191872e-05, "loss": 0.0231, "step": 50320 }, { "grad_norm": 0.31145814061164856, "learning_rate": 3.312340194785458e-05, "loss": 0.0201, "step": 50330 }, { "grad_norm": 0.27609020471572876, "learning_rate": 3.3103947960270975e-05, "loss": 0.0131, "step": 50340 }, { "grad_norm": 0.23687812685966492, "learning_rate": 3.308449685976058e-05, "loss": 0.0147, "step": 50350 }, { "grad_norm": 0.290272057056427, "learning_rate": 3.3065048649647024e-05, "loss": 0.0183, "step": 50360 }, { "grad_norm": 0.2571449279785156, "learning_rate": 3.304560333325348e-05, "loss": 0.0162, "step": 50370 }, { "grad_norm": 0.3436010181903839, "learning_rate": 3.3026160913902624e-05, "loss": 0.0198, "step": 50380 }, { "grad_norm": 0.3737870156764984, "learning_rate": 3.300672139491662e-05, "loss": 0.0271, "step": 50390 }, { "grad_norm": 0.21858975291252136, "learning_rate": 3.298728477961717e-05, "loss": 0.0195, "step": 50400 }, { "grad_norm": 0.22674131393432617, "learning_rate": 3.296785107132544e-05, "loss": 0.02, "step": 50410 }, { "grad_norm": 0.3552747070789337, "learning_rate": 3.2948420273362156e-05, "loss": 0.0168, "step": 50420 }, { "grad_norm": 0.1935671865940094, "learning_rate": 3.292899238904747e-05, "loss": 0.02, "step": 50430 }, { "grad_norm": 0.2917889356613159, "learning_rate": 3.29095674217011e-05, "loss": 0.0203, "step": 50440 }, { "grad_norm": 0.2763173282146454, "learning_rate": 3.289014537464224e-05, "loss": 0.0213, "step": 50450 }, { "grad_norm": 0.22048839926719666, "learning_rate": 3.287072625118955e-05, "loss": 0.019, "step": 50460 }, { "grad_norm": 0.272861510515213, "learning_rate": 3.285131005466129e-05, "loss": 0.0208, "step": 50470 }, { "grad_norm": 0.3061118423938751, "learning_rate": 3.2831896788375105e-05, "loss": 0.02, "step": 50480 }, { "grad_norm": 0.5248686075210571, "learning_rate": 3.281248645564822e-05, "loss": 0.0214, "step": 50490 }, { "grad_norm": 0.293817400932312, "learning_rate": 3.2793079059797306e-05, "loss": 0.0282, "step": 50500 }, { "grad_norm": 0.21757063269615173, "learning_rate": 3.277367460413859e-05, "loss": 0.0196, "step": 50510 }, { "grad_norm": 0.17719870805740356, "learning_rate": 3.275427309198773e-05, "loss": 0.0228, "step": 50520 }, { "grad_norm": 0.3385154604911804, "learning_rate": 3.273487452665993e-05, "loss": 0.0169, "step": 50530 }, { "grad_norm": 0.2499026358127594, "learning_rate": 3.271547891146986e-05, "loss": 0.015, "step": 50540 }, { "grad_norm": 0.324492871761322, "learning_rate": 3.269608624973173e-05, "loss": 0.0167, "step": 50550 }, { "grad_norm": 0.36575281620025635, "learning_rate": 3.26766965447592e-05, "loss": 0.025, "step": 50560 }, { "grad_norm": 0.31690362095832825, "learning_rate": 3.2657309799865424e-05, "loss": 0.027, "step": 50570 }, { "grad_norm": 0.16647368669509888, "learning_rate": 3.2637926018363084e-05, "loss": 0.0171, "step": 50580 }, { "grad_norm": 0.37424588203430176, "learning_rate": 3.261854520356432e-05, "loss": 0.0252, "step": 50590 }, { "grad_norm": 0.23960953950881958, "learning_rate": 3.2599167358780816e-05, "loss": 0.0235, "step": 50600 }, { "grad_norm": 0.32777082920074463, "learning_rate": 3.2579792487323676e-05, "loss": 0.0165, "step": 50610 }, { "grad_norm": 0.24848443269729614, "learning_rate": 3.256042059250358e-05, "loss": 0.0265, "step": 50620 }, { "grad_norm": 0.5046510100364685, "learning_rate": 3.254105167763062e-05, "loss": 0.0239, "step": 50630 }, { "grad_norm": 0.3184038996696472, "learning_rate": 3.252168574601443e-05, "loss": 0.0186, "step": 50640 }, { "grad_norm": 0.5006924867630005, "learning_rate": 3.2502322800964115e-05, "loss": 0.0209, "step": 50650 }, { "grad_norm": 0.28500863909721375, "learning_rate": 3.248296284578825e-05, "loss": 0.0177, "step": 50660 }, { "grad_norm": 0.19283734261989594, "learning_rate": 3.246360588379497e-05, "loss": 0.0238, "step": 50670 }, { "grad_norm": 0.22062237560749054, "learning_rate": 3.24442519182918e-05, "loss": 0.0306, "step": 50680 }, { "grad_norm": 0.19669704139232635, "learning_rate": 3.2424900952585845e-05, "loss": 0.0175, "step": 50690 }, { "grad_norm": 0.3003772795200348, "learning_rate": 3.240555298998362e-05, "loss": 0.0199, "step": 50700 }, { "grad_norm": 0.5264759063720703, "learning_rate": 3.2386208033791194e-05, "loss": 0.0216, "step": 50710 }, { "grad_norm": 0.34599265456199646, "learning_rate": 3.2366866087314064e-05, "loss": 0.0253, "step": 50720 }, { "grad_norm": 0.32896503806114197, "learning_rate": 3.234752715385727e-05, "loss": 0.0234, "step": 50730 }, { "grad_norm": 0.24686986207962036, "learning_rate": 3.232819123672529e-05, "loss": 0.0249, "step": 50740 }, { "grad_norm": 0.4353852868080139, "learning_rate": 3.2308858339222076e-05, "loss": 0.0271, "step": 50750 }, { "grad_norm": 0.3313135504722595, "learning_rate": 3.2289528464651144e-05, "loss": 0.0234, "step": 50760 }, { "grad_norm": 0.2670099437236786, "learning_rate": 3.22702016163154e-05, "loss": 0.0182, "step": 50770 }, { "grad_norm": 0.34820428490638733, "learning_rate": 3.2250877797517295e-05, "loss": 0.0286, "step": 50780 }, { "grad_norm": 0.27585569024086, "learning_rate": 3.223155701155872e-05, "loss": 0.0192, "step": 50790 }, { "grad_norm": 0.21593689918518066, "learning_rate": 3.2212239261741105e-05, "loss": 0.0221, "step": 50800 }, { "grad_norm": 0.1739347130060196, "learning_rate": 3.219292455136528e-05, "loss": 0.016, "step": 50810 }, { "grad_norm": 0.1703529953956604, "learning_rate": 3.217361288373165e-05, "loss": 0.0158, "step": 50820 }, { "grad_norm": 0.17482443153858185, "learning_rate": 3.215430426214002e-05, "loss": 0.0207, "step": 50830 }, { "grad_norm": 0.33995795249938965, "learning_rate": 3.2134998689889705e-05, "loss": 0.0198, "step": 50840 }, { "grad_norm": 0.3183232247829437, "learning_rate": 3.211569617027952e-05, "loss": 0.0253, "step": 50850 }, { "grad_norm": 0.292227566242218, "learning_rate": 3.209639670660769e-05, "loss": 0.0234, "step": 50860 }, { "grad_norm": 0.5176140666007996, "learning_rate": 3.207710030217202e-05, "loss": 0.0195, "step": 50870 }, { "grad_norm": 0.47419434785842896, "learning_rate": 3.205780696026972e-05, "loss": 0.0206, "step": 50880 }, { "grad_norm": 0.17063696682453156, "learning_rate": 3.203851668419749e-05, "loss": 0.0152, "step": 50890 }, { "grad_norm": 0.22875958681106567, "learning_rate": 3.201922947725149e-05, "loss": 0.0198, "step": 50900 }, { "grad_norm": 0.26643338799476624, "learning_rate": 3.199994534272742e-05, "loss": 0.0209, "step": 50910 }, { "grad_norm": 0.2823846936225891, "learning_rate": 3.1980664283920394e-05, "loss": 0.0239, "step": 50920 }, { "grad_norm": 0.2471957951784134, "learning_rate": 3.196138630412499e-05, "loss": 0.0148, "step": 50930 }, { "grad_norm": 0.2834266722202301, "learning_rate": 3.194211140663532e-05, "loss": 0.0194, "step": 50940 }, { "grad_norm": 0.40675088763237, "learning_rate": 3.1922839594744914e-05, "loss": 0.0262, "step": 50950 }, { "grad_norm": 0.3383333683013916, "learning_rate": 3.190357087174683e-05, "loss": 0.0218, "step": 50960 }, { "grad_norm": 0.10962238162755966, "learning_rate": 3.188430524093353e-05, "loss": 0.016, "step": 50970 }, { "grad_norm": 0.5422779321670532, "learning_rate": 3.1865042705597014e-05, "loss": 0.0235, "step": 50980 }, { "grad_norm": 0.3102491497993469, "learning_rate": 3.184578326902868e-05, "loss": 0.0191, "step": 50990 }, { "grad_norm": 0.3130839467048645, "learning_rate": 3.182652693451949e-05, "loss": 0.0199, "step": 51000 }, { "grad_norm": 0.3838513195514679, "learning_rate": 3.180727370535981e-05, "loss": 0.0189, "step": 51010 }, { "grad_norm": 0.21112710237503052, "learning_rate": 3.178802358483945e-05, "loss": 0.0184, "step": 51020 }, { "grad_norm": 0.2368534952402115, "learning_rate": 3.17687765762478e-05, "loss": 0.0173, "step": 51030 }, { "grad_norm": 0.2362048178911209, "learning_rate": 3.174953268287357e-05, "loss": 0.0216, "step": 51040 }, { "grad_norm": 0.44086310267448425, "learning_rate": 3.173029190800508e-05, "loss": 0.0162, "step": 51050 }, { "grad_norm": 0.277325302362442, "learning_rate": 3.171105425493e-05, "loss": 0.0202, "step": 51060 }, { "grad_norm": 0.19752100110054016, "learning_rate": 3.169181972693557e-05, "loss": 0.014, "step": 51070 }, { "grad_norm": 0.30142053961753845, "learning_rate": 3.1672588327308404e-05, "loss": 0.0165, "step": 51080 }, { "grad_norm": 0.34408849477767944, "learning_rate": 3.1653360059334646e-05, "loss": 0.0185, "step": 51090 }, { "grad_norm": 0.4044937491416931, "learning_rate": 3.163413492629985e-05, "loss": 0.0331, "step": 51100 }, { "grad_norm": 0.43229782581329346, "learning_rate": 3.161491293148912e-05, "loss": 0.0174, "step": 51110 }, { "grad_norm": 0.25741690397262573, "learning_rate": 3.1595694078186925e-05, "loss": 0.0187, "step": 51120 }, { "grad_norm": 0.2678927779197693, "learning_rate": 3.157647836967724e-05, "loss": 0.0197, "step": 51130 }, { "grad_norm": 0.24391710758209229, "learning_rate": 3.155726580924354e-05, "loss": 0.0206, "step": 51140 }, { "grad_norm": 0.251609206199646, "learning_rate": 3.1538056400168676e-05, "loss": 0.0273, "step": 51150 }, { "grad_norm": 0.23327980935573578, "learning_rate": 3.151885014573506e-05, "loss": 0.0223, "step": 51160 }, { "grad_norm": 0.26772233843803406, "learning_rate": 3.149964704922448e-05, "loss": 0.0268, "step": 51170 }, { "grad_norm": 0.25382474064826965, "learning_rate": 3.148044711391825e-05, "loss": 0.0173, "step": 51180 }, { "grad_norm": 0.3840453326702118, "learning_rate": 3.146125034309707e-05, "loss": 0.0174, "step": 51190 }, { "grad_norm": 0.2076437771320343, "learning_rate": 3.1442056740041195e-05, "loss": 0.024, "step": 51200 }, { "grad_norm": 0.1842775046825409, "learning_rate": 3.142286630803026e-05, "loss": 0.0323, "step": 51210 }, { "grad_norm": 0.29862913489341736, "learning_rate": 3.140367905034337e-05, "loss": 0.0181, "step": 51220 }, { "grad_norm": 0.23371370136737823, "learning_rate": 3.138449497025914e-05, "loss": 0.0207, "step": 51230 }, { "grad_norm": 0.3223589062690735, "learning_rate": 3.136531407105557e-05, "loss": 0.022, "step": 51240 }, { "grad_norm": 0.2785145938396454, "learning_rate": 3.1346136356010184e-05, "loss": 0.0249, "step": 51250 }, { "grad_norm": 0.17698627710342407, "learning_rate": 3.132696182839988e-05, "loss": 0.0167, "step": 51260 }, { "grad_norm": 0.5848827362060547, "learning_rate": 3.130779049150111e-05, "loss": 0.0226, "step": 51270 }, { "grad_norm": 0.616098940372467, "learning_rate": 3.128862234858971e-05, "loss": 0.0195, "step": 51280 }, { "grad_norm": 0.25141751766204834, "learning_rate": 3.1269457402941005e-05, "loss": 0.0154, "step": 51290 }, { "grad_norm": 0.2965333163738251, "learning_rate": 3.125029565782974e-05, "loss": 0.0182, "step": 51300 }, { "grad_norm": 0.36899784207344055, "learning_rate": 3.123113711653013e-05, "loss": 0.0232, "step": 51310 }, { "grad_norm": 0.22631289064884186, "learning_rate": 3.121198178231587e-05, "loss": 0.0241, "step": 51320 }, { "grad_norm": 0.1825050562620163, "learning_rate": 3.119282965846006e-05, "loss": 0.0186, "step": 51330 }, { "grad_norm": 0.27842870354652405, "learning_rate": 3.1173680748235303e-05, "loss": 0.0217, "step": 51340 }, { "grad_norm": 0.3249817192554474, "learning_rate": 3.115453505491358e-05, "loss": 0.0243, "step": 51350 }, { "grad_norm": 0.271170049905777, "learning_rate": 3.1135392581766404e-05, "loss": 0.0193, "step": 51360 }, { "grad_norm": 0.23120303452014923, "learning_rate": 3.111625333206467e-05, "loss": 0.0167, "step": 51370 }, { "grad_norm": 0.20992740988731384, "learning_rate": 3.109711730907881e-05, "loss": 0.0174, "step": 51380 }, { "grad_norm": 0.41702309250831604, "learning_rate": 3.107798451607859e-05, "loss": 0.0203, "step": 51390 }, { "grad_norm": 0.29202234745025635, "learning_rate": 3.105885495633328e-05, "loss": 0.0196, "step": 51400 }, { "grad_norm": 0.10640490800142288, "learning_rate": 3.103972863311163e-05, "loss": 0.0146, "step": 51410 }, { "grad_norm": 0.33141669631004333, "learning_rate": 3.102060554968178e-05, "loss": 0.0205, "step": 51420 }, { "grad_norm": 0.24973899126052856, "learning_rate": 3.100148570931137e-05, "loss": 0.0148, "step": 51430 }, { "grad_norm": 0.3609800636768341, "learning_rate": 3.098236911526744e-05, "loss": 0.0219, "step": 51440 }, { "grad_norm": 0.17138268053531647, "learning_rate": 3.0963255770816495e-05, "loss": 0.014, "step": 51450 }, { "grad_norm": 0.6084128618240356, "learning_rate": 3.094414567922447e-05, "loss": 0.0178, "step": 51460 }, { "grad_norm": 0.1882048398256302, "learning_rate": 3.092503884375678e-05, "loss": 0.0189, "step": 51470 }, { "grad_norm": 0.4004550278186798, "learning_rate": 3.0905935267678255e-05, "loss": 0.0206, "step": 51480 }, { "grad_norm": 0.31189924478530884, "learning_rate": 3.0886834954253166e-05, "loss": 0.0173, "step": 51490 }, { "grad_norm": 0.2810860872268677, "learning_rate": 3.086773790674524e-05, "loss": 0.0182, "step": 51500 }, { "grad_norm": 0.4177873432636261, "learning_rate": 3.0848644128417617e-05, "loss": 0.0266, "step": 51510 }, { "grad_norm": 0.314042329788208, "learning_rate": 3.082955362253294e-05, "loss": 0.019, "step": 51520 }, { "grad_norm": 0.3074578046798706, "learning_rate": 3.081046639235323e-05, "loss": 0.0194, "step": 51530 }, { "grad_norm": 0.25961998105049133, "learning_rate": 3.0791382441139986e-05, "loss": 0.0158, "step": 51540 }, { "grad_norm": 0.19150960445404053, "learning_rate": 3.077230177215411e-05, "loss": 0.0232, "step": 51550 }, { "grad_norm": 0.1338832676410675, "learning_rate": 3.0753224388656e-05, "loss": 0.0181, "step": 51560 }, { "grad_norm": 0.36228859424591064, "learning_rate": 3.073415029390544e-05, "loss": 0.0241, "step": 51570 }, { "grad_norm": 0.3984237611293793, "learning_rate": 3.071507949116166e-05, "loss": 0.0174, "step": 51580 }, { "grad_norm": 0.21837913990020752, "learning_rate": 3.069601198368337e-05, "loss": 0.0173, "step": 51590 }, { "grad_norm": 0.17959202826023102, "learning_rate": 3.067694777472864e-05, "loss": 0.0202, "step": 51600 }, { "grad_norm": 0.23751257359981537, "learning_rate": 3.065788686755508e-05, "loss": 0.0201, "step": 51610 }, { "grad_norm": 0.2755337655544281, "learning_rate": 3.063882926541961e-05, "loss": 0.022, "step": 51620 }, { "grad_norm": 0.3895431160926819, "learning_rate": 3.061977497157872e-05, "loss": 0.0207, "step": 51630 }, { "grad_norm": 0.27540427446365356, "learning_rate": 3.0600723989288224e-05, "loss": 0.0189, "step": 51640 }, { "grad_norm": 0.46541938185691833, "learning_rate": 3.058167632180343e-05, "loss": 0.0265, "step": 51650 }, { "grad_norm": 0.31544259190559387, "learning_rate": 3.0562631972379074e-05, "loss": 0.0238, "step": 51660 }, { "grad_norm": 0.24184396862983704, "learning_rate": 3.0543590944269276e-05, "loss": 0.0192, "step": 51670 }, { "grad_norm": 0.306397408246994, "learning_rate": 3.052455324072766e-05, "loss": 0.0223, "step": 51680 }, { "grad_norm": 0.42582398653030396, "learning_rate": 3.0505518865007244e-05, "loss": 0.0214, "step": 51690 }, { "grad_norm": 0.23783983290195465, "learning_rate": 3.048648782036048e-05, "loss": 0.0168, "step": 51700 }, { "grad_norm": 0.47000643610954285, "learning_rate": 3.0467460110039236e-05, "loss": 0.0182, "step": 51710 }, { "grad_norm": 0.21906070411205292, "learning_rate": 3.0448435737294866e-05, "loss": 0.0191, "step": 51720 }, { "grad_norm": 0.16391202807426453, "learning_rate": 3.042941470537808e-05, "loss": 0.0169, "step": 51730 }, { "grad_norm": 0.21697351336479187, "learning_rate": 3.0410397017539072e-05, "loss": 0.0142, "step": 51740 }, { "grad_norm": 0.4946189820766449, "learning_rate": 3.039138267702742e-05, "loss": 0.019, "step": 51750 }, { "grad_norm": 0.3112259805202484, "learning_rate": 3.0372371687092193e-05, "loss": 0.0138, "step": 51760 }, { "grad_norm": 0.30603036284446716, "learning_rate": 3.0353364050981823e-05, "loss": 0.0186, "step": 51770 }, { "grad_norm": 0.2026030272245407, "learning_rate": 3.033435977194418e-05, "loss": 0.0202, "step": 51780 }, { "grad_norm": 0.28104570508003235, "learning_rate": 3.0315358853226618e-05, "loss": 0.022, "step": 51790 }, { "grad_norm": 0.19940334558486938, "learning_rate": 3.0296361298075844e-05, "loss": 0.0168, "step": 51800 }, { "grad_norm": 0.3847292363643646, "learning_rate": 3.027736710973803e-05, "loss": 0.0211, "step": 51810 }, { "grad_norm": 0.15380388498306274, "learning_rate": 3.025837629145875e-05, "loss": 0.0175, "step": 51820 }, { "grad_norm": 0.31755295395851135, "learning_rate": 3.0239388846483048e-05, "loss": 0.0212, "step": 51830 }, { "grad_norm": 0.3061847388744354, "learning_rate": 3.022040477805532e-05, "loss": 0.0236, "step": 51840 }, { "grad_norm": 0.31042805314064026, "learning_rate": 3.020142408941946e-05, "loss": 0.0193, "step": 51850 }, { "grad_norm": 0.23268979787826538, "learning_rate": 3.018244678381873e-05, "loss": 0.0226, "step": 51860 }, { "grad_norm": 0.406034916639328, "learning_rate": 3.0163472864495812e-05, "loss": 0.028, "step": 51870 }, { "grad_norm": 0.2951216399669647, "learning_rate": 3.014450233469287e-05, "loss": 0.0226, "step": 51880 }, { "grad_norm": 0.36146578192710876, "learning_rate": 3.0125535197651422e-05, "loss": 0.0262, "step": 51890 }, { "grad_norm": 0.2558216154575348, "learning_rate": 3.0106571456612448e-05, "loss": 0.018, "step": 51900 }, { "grad_norm": 0.2510012686252594, "learning_rate": 3.0087611114816305e-05, "loss": 0.0235, "step": 51910 }, { "grad_norm": 0.1930430382490158, "learning_rate": 3.006865417550284e-05, "loss": 0.0183, "step": 51920 }, { "grad_norm": 0.2347886711359024, "learning_rate": 3.0049700641911242e-05, "loss": 0.0228, "step": 51930 }, { "grad_norm": 0.20442256331443787, "learning_rate": 3.0030750517280175e-05, "loss": 0.0189, "step": 51940 }, { "grad_norm": 0.2173745036125183, "learning_rate": 3.0011803804847682e-05, "loss": 0.0223, "step": 51950 }, { "grad_norm": 0.7404335737228394, "learning_rate": 2.9992860507851224e-05, "loss": 0.0209, "step": 51960 }, { "grad_norm": 0.3184950351715088, "learning_rate": 2.9973920629527723e-05, "loss": 0.019, "step": 51970 }, { "grad_norm": 0.14322903752326965, "learning_rate": 2.9954984173113453e-05, "loss": 0.0156, "step": 51980 }, { "grad_norm": 0.26694899797439575, "learning_rate": 2.993605114184418e-05, "loss": 0.0187, "step": 51990 }, { "grad_norm": 0.38959801197052, "learning_rate": 2.9917121538955005e-05, "loss": 0.0246, "step": 52000 }, { "grad_norm": 0.44346243143081665, "learning_rate": 2.98981953676805e-05, "loss": 0.0265, "step": 52010 }, { "grad_norm": 0.25860926508903503, "learning_rate": 2.9879272631254594e-05, "loss": 0.0202, "step": 52020 }, { "grad_norm": 0.2151639610528946, "learning_rate": 2.986035333291073e-05, "loss": 0.0194, "step": 52030 }, { "grad_norm": 0.24347154796123505, "learning_rate": 2.9841437475881646e-05, "loss": 0.0243, "step": 52040 }, { "grad_norm": 0.3369540274143219, "learning_rate": 2.9822525063399555e-05, "loss": 0.0212, "step": 52050 }, { "grad_norm": 0.20042508840560913, "learning_rate": 2.9803616098696087e-05, "loss": 0.0249, "step": 52060 }, { "grad_norm": 0.27641767263412476, "learning_rate": 2.9784710585002228e-05, "loss": 0.0192, "step": 52070 }, { "grad_norm": 0.33357900381088257, "learning_rate": 2.9765808525548467e-05, "loss": 0.0202, "step": 52080 }, { "grad_norm": 0.2244933843612671, "learning_rate": 2.974690992356461e-05, "loss": 0.0196, "step": 52090 }, { "grad_norm": 0.20535902678966522, "learning_rate": 2.9728014782279934e-05, "loss": 0.022, "step": 52100 }, { "grad_norm": 0.31989315152168274, "learning_rate": 2.970912310492307e-05, "loss": 0.0196, "step": 52110 }, { "grad_norm": 0.2809840142726898, "learning_rate": 2.9690234894722134e-05, "loss": 0.0162, "step": 52120 }, { "grad_norm": 0.1327032446861267, "learning_rate": 2.9671350154904577e-05, "loss": 0.0233, "step": 52130 }, { "grad_norm": 0.30204230546951294, "learning_rate": 2.9652468888697282e-05, "loss": 0.0183, "step": 52140 }, { "grad_norm": 1.2628376483917236, "learning_rate": 2.9633591099326562e-05, "loss": 0.0203, "step": 52150 }, { "grad_norm": 0.2550373375415802, "learning_rate": 2.9614716790018072e-05, "loss": 0.0142, "step": 52160 }, { "grad_norm": 0.22175092995166779, "learning_rate": 2.959584596399697e-05, "loss": 0.016, "step": 52170 }, { "grad_norm": 0.308428555727005, "learning_rate": 2.9576978624487717e-05, "loss": 0.0224, "step": 52180 }, { "grad_norm": 0.2842237651348114, "learning_rate": 2.9558114774714263e-05, "loss": 0.0189, "step": 52190 }, { "grad_norm": 0.2852942645549774, "learning_rate": 2.9539254417899897e-05, "loss": 0.0223, "step": 52200 }, { "grad_norm": 0.21562710404396057, "learning_rate": 2.9520397557267365e-05, "loss": 0.0217, "step": 52210 }, { "grad_norm": 0.2458152323961258, "learning_rate": 2.9501544196038765e-05, "loss": 0.0228, "step": 52220 }, { "grad_norm": 0.32046905159950256, "learning_rate": 2.9482694337435614e-05, "loss": 0.026, "step": 52230 }, { "grad_norm": 0.2645340859889984, "learning_rate": 2.946384798467887e-05, "loss": 0.0168, "step": 52240 }, { "grad_norm": 0.32369503378868103, "learning_rate": 2.9445005140988824e-05, "loss": 0.0117, "step": 52250 }, { "grad_norm": 0.28550076484680176, "learning_rate": 2.942616580958524e-05, "loss": 0.0224, "step": 52260 }, { "grad_norm": 0.7635741829872131, "learning_rate": 2.9407329993687193e-05, "loss": 0.0174, "step": 52270 }, { "grad_norm": 0.3615005314350128, "learning_rate": 2.938849769651326e-05, "loss": 0.0158, "step": 52280 }, { "grad_norm": 0.3256984055042267, "learning_rate": 2.9369668921281345e-05, "loss": 0.0187, "step": 52290 }, { "grad_norm": 0.16185109317302704, "learning_rate": 2.9350843671208773e-05, "loss": 0.0149, "step": 52300 }, { "grad_norm": 0.28727903962135315, "learning_rate": 2.9332021949512245e-05, "loss": 0.0153, "step": 52310 }, { "grad_norm": 0.3324373960494995, "learning_rate": 2.9313203759407908e-05, "loss": 0.0164, "step": 52320 }, { "grad_norm": 0.2280465066432953, "learning_rate": 2.929438910411127e-05, "loss": 0.0274, "step": 52330 }, { "grad_norm": 0.37326446175575256, "learning_rate": 2.927557798683722e-05, "loss": 0.0202, "step": 52340 }, { "grad_norm": 0.43520838022232056, "learning_rate": 2.9256770410800095e-05, "loss": 0.0183, "step": 52350 }, { "grad_norm": 0.30908775329589844, "learning_rate": 2.9237966379213554e-05, "loss": 0.0206, "step": 52360 }, { "grad_norm": 0.359474778175354, "learning_rate": 2.9219165895290736e-05, "loss": 0.0194, "step": 52370 }, { "grad_norm": 0.2661236524581909, "learning_rate": 2.9200368962244083e-05, "loss": 0.021, "step": 52380 }, { "grad_norm": 0.21658101677894592, "learning_rate": 2.9181575583285536e-05, "loss": 0.0174, "step": 52390 }, { "grad_norm": 0.2540735602378845, "learning_rate": 2.916278576162632e-05, "loss": 0.0232, "step": 52400 }, { "grad_norm": 0.2207968831062317, "learning_rate": 2.9143999500477116e-05, "loss": 0.0175, "step": 52410 }, { "grad_norm": 0.2997523248195648, "learning_rate": 2.9125216803048004e-05, "loss": 0.0251, "step": 52420 }, { "grad_norm": 0.2012910693883896, "learning_rate": 2.9106437672548403e-05, "loss": 0.0174, "step": 52430 }, { "grad_norm": 0.35410645604133606, "learning_rate": 2.9087662112187154e-05, "loss": 0.0127, "step": 52440 }, { "grad_norm": 0.27082034945487976, "learning_rate": 2.9068890125172498e-05, "loss": 0.0165, "step": 52450 }, { "grad_norm": 0.5381771922111511, "learning_rate": 2.9050121714712058e-05, "loss": 0.0235, "step": 52460 }, { "grad_norm": 0.39531025290489197, "learning_rate": 2.9031356884012835e-05, "loss": 0.0226, "step": 52470 }, { "grad_norm": 0.198277547955513, "learning_rate": 2.9012595636281247e-05, "loss": 0.0239, "step": 52480 }, { "grad_norm": 0.18087469041347504, "learning_rate": 2.8993837974723016e-05, "loss": 0.0262, "step": 52490 }, { "grad_norm": 0.1803194284439087, "learning_rate": 2.8975083902543394e-05, "loss": 0.0159, "step": 52500 }, { "grad_norm": 0.29494771361351013, "learning_rate": 2.895633342294688e-05, "loss": 0.0223, "step": 52510 }, { "grad_norm": 0.4002939760684967, "learning_rate": 2.893758653913744e-05, "loss": 0.0172, "step": 52520 }, { "grad_norm": 0.22272798418998718, "learning_rate": 2.8918843254318423e-05, "loss": 0.0188, "step": 52530 }, { "grad_norm": 0.37561410665512085, "learning_rate": 2.8900103571692483e-05, "loss": 0.0191, "step": 52540 }, { "grad_norm": 0.25596100091934204, "learning_rate": 2.8881367494461796e-05, "loss": 0.0164, "step": 52550 }, { "grad_norm": 0.3179128170013428, "learning_rate": 2.886263502582779e-05, "loss": 0.0198, "step": 52560 }, { "grad_norm": 0.34158775210380554, "learning_rate": 2.8843906168991353e-05, "loss": 0.0181, "step": 52570 }, { "grad_norm": 0.27954521775245667, "learning_rate": 2.8825180927152728e-05, "loss": 0.0203, "step": 52580 }, { "grad_norm": 0.30973970890045166, "learning_rate": 2.8806459303511547e-05, "loss": 0.0167, "step": 52590 }, { "grad_norm": 0.33614835143089294, "learning_rate": 2.8787741301266852e-05, "loss": 0.0193, "step": 52600 }, { "grad_norm": 0.2706913650035858, "learning_rate": 2.876902692361697e-05, "loss": 0.0179, "step": 52610 }, { "grad_norm": 0.3248947262763977, "learning_rate": 2.875031617375976e-05, "loss": 0.0198, "step": 52620 }, { "grad_norm": 0.2376244217157364, "learning_rate": 2.8731609054892318e-05, "loss": 0.0207, "step": 52630 }, { "grad_norm": 0.20251841843128204, "learning_rate": 2.871290557021119e-05, "loss": 0.0149, "step": 52640 }, { "grad_norm": 0.22563520073890686, "learning_rate": 2.8694205722912298e-05, "loss": 0.0181, "step": 52650 }, { "grad_norm": 0.45702317357063293, "learning_rate": 2.8675509516190936e-05, "loss": 0.0211, "step": 52660 }, { "grad_norm": 0.31113407015800476, "learning_rate": 2.8656816953241772e-05, "loss": 0.0268, "step": 52670 }, { "grad_norm": 0.2689199149608612, "learning_rate": 2.863812803725887e-05, "loss": 0.0181, "step": 52680 }, { "grad_norm": 0.2555934190750122, "learning_rate": 2.8619442771435623e-05, "loss": 0.0173, "step": 52690 }, { "grad_norm": 0.2249334305524826, "learning_rate": 2.860076115896484e-05, "loss": 0.0283, "step": 52700 }, { "grad_norm": 0.3048219382762909, "learning_rate": 2.8582083203038712e-05, "loss": 0.0194, "step": 52710 }, { "grad_norm": 0.27551668882369995, "learning_rate": 2.8563408906848778e-05, "loss": 0.0149, "step": 52720 }, { "grad_norm": 0.29226619005203247, "learning_rate": 2.8544738273585993e-05, "loss": 0.0187, "step": 52730 }, { "grad_norm": 0.22496218979358673, "learning_rate": 2.8526071306440595e-05, "loss": 0.0139, "step": 52740 }, { "grad_norm": 0.3092915713787079, "learning_rate": 2.850740800860233e-05, "loss": 0.0173, "step": 52750 }, { "grad_norm": 0.251780241727829, "learning_rate": 2.8488748383260178e-05, "loss": 0.0169, "step": 52760 }, { "grad_norm": 0.3166722059249878, "learning_rate": 2.8470092433602635e-05, "loss": 0.0203, "step": 52770 }, { "grad_norm": 0.2873973548412323, "learning_rate": 2.845144016281745e-05, "loss": 0.0187, "step": 52780 }, { "grad_norm": 0.2580845057964325, "learning_rate": 2.8432791574091744e-05, "loss": 0.0258, "step": 52790 }, { "grad_norm": 0.2917977273464203, "learning_rate": 2.8414146670612134e-05, "loss": 0.0186, "step": 52800 }, { "grad_norm": 0.20068040490150452, "learning_rate": 2.8395505455564446e-05, "loss": 0.0224, "step": 52810 }, { "grad_norm": 0.20269012451171875, "learning_rate": 2.837686793213403e-05, "loss": 0.0188, "step": 52820 }, { "grad_norm": 0.27398741245269775, "learning_rate": 2.8358234103505477e-05, "loss": 0.0179, "step": 52830 }, { "grad_norm": 0.27628612518310547, "learning_rate": 2.8339603972862806e-05, "loss": 0.0196, "step": 52840 }, { "grad_norm": 0.4635702669620514, "learning_rate": 2.83209775433894e-05, "loss": 0.0202, "step": 52850 }, { "grad_norm": 0.2139737904071808, "learning_rate": 2.8302354818268013e-05, "loss": 0.0159, "step": 52860 }, { "grad_norm": 0.31546711921691895, "learning_rate": 2.8283735800680754e-05, "loss": 0.0257, "step": 52870 }, { "grad_norm": 0.18692024052143097, "learning_rate": 2.826512049380913e-05, "loss": 0.0146, "step": 52880 }, { "grad_norm": 0.27178072929382324, "learning_rate": 2.8246508900833935e-05, "loss": 0.0156, "step": 52890 }, { "grad_norm": 0.30225667357444763, "learning_rate": 2.822790102493541e-05, "loss": 0.0215, "step": 52900 }, { "grad_norm": 0.21788227558135986, "learning_rate": 2.820929686929314e-05, "loss": 0.0205, "step": 52910 }, { "grad_norm": 0.177808940410614, "learning_rate": 2.819069643708605e-05, "loss": 0.0145, "step": 52920 }, { "grad_norm": 0.32294997572898865, "learning_rate": 2.8172099731492484e-05, "loss": 0.0181, "step": 52930 }, { "grad_norm": 0.21213479340076447, "learning_rate": 2.815350675569003e-05, "loss": 0.0259, "step": 52940 }, { "grad_norm": 0.2809617817401886, "learning_rate": 2.8134917512855807e-05, "loss": 0.0253, "step": 52950 }, { "grad_norm": 0.18389198184013367, "learning_rate": 2.8116332006166134e-05, "loss": 0.0136, "step": 52960 }, { "grad_norm": 0.33310985565185547, "learning_rate": 2.809775023879685e-05, "loss": 0.0274, "step": 52970 }, { "grad_norm": 0.31723445653915405, "learning_rate": 2.807917221392299e-05, "loss": 0.0207, "step": 52980 }, { "grad_norm": 0.4436568021774292, "learning_rate": 2.8060597934719067e-05, "loss": 0.0176, "step": 52990 }, { "grad_norm": 0.16756297647953033, "learning_rate": 2.8042027404358927e-05, "loss": 0.0209, "step": 53000 }, { "grad_norm": 0.2919595241546631, "learning_rate": 2.802346062601572e-05, "loss": 0.021, "step": 53010 }, { "grad_norm": 0.450551301240921, "learning_rate": 2.800489760286207e-05, "loss": 0.0325, "step": 53020 }, { "grad_norm": 0.24416986107826233, "learning_rate": 2.7986338338069834e-05, "loss": 0.018, "step": 53030 }, { "grad_norm": 0.7579241991043091, "learning_rate": 2.7967782834810297e-05, "loss": 0.0255, "step": 53040 }, { "grad_norm": 0.2707695960998535, "learning_rate": 2.794923109625409e-05, "loss": 0.0185, "step": 53050 }, { "grad_norm": 0.24180185794830322, "learning_rate": 2.7930683125571205e-05, "loss": 0.0188, "step": 53060 }, { "grad_norm": 0.3835463523864746, "learning_rate": 2.7912138925930997e-05, "loss": 0.0236, "step": 53070 }, { "grad_norm": 0.2627931535243988, "learning_rate": 2.7893598500502117e-05, "loss": 0.0302, "step": 53080 }, { "grad_norm": 0.309983491897583, "learning_rate": 2.7875061852452644e-05, "loss": 0.0174, "step": 53090 }, { "grad_norm": 0.3279978632926941, "learning_rate": 2.7856528984949982e-05, "loss": 0.0224, "step": 53100 }, { "grad_norm": 0.18522949516773224, "learning_rate": 2.7837999901160888e-05, "loss": 0.0175, "step": 53110 }, { "grad_norm": 0.3371407389640808, "learning_rate": 2.7819474604251484e-05, "loss": 0.0225, "step": 53120 }, { "grad_norm": 0.2618868350982666, "learning_rate": 2.780095309738725e-05, "loss": 0.0205, "step": 53130 }, { "grad_norm": 0.3118233382701874, "learning_rate": 2.778243538373294e-05, "loss": 0.0188, "step": 53140 }, { "grad_norm": 0.15132135152816772, "learning_rate": 2.7763921466452826e-05, "loss": 0.0178, "step": 53150 }, { "grad_norm": 0.22214490175247192, "learning_rate": 2.7745411348710336e-05, "loss": 0.0243, "step": 53160 }, { "grad_norm": 0.30622607469558716, "learning_rate": 2.7726905033668395e-05, "loss": 0.0257, "step": 53170 }, { "grad_norm": 0.3118976056575775, "learning_rate": 2.7708402524489214e-05, "loss": 0.022, "step": 53180 }, { "grad_norm": 0.2861427366733551, "learning_rate": 2.7689903824334364e-05, "loss": 0.0228, "step": 53190 }, { "grad_norm": 0.2314165085554123, "learning_rate": 2.7671408936364785e-05, "loss": 0.0161, "step": 53200 }, { "grad_norm": 0.21894440054893494, "learning_rate": 2.765291786374069e-05, "loss": 0.02, "step": 53210 }, { "grad_norm": 0.3772246539592743, "learning_rate": 2.7634430609621775e-05, "loss": 0.0184, "step": 53220 }, { "grad_norm": 0.4748242497444153, "learning_rate": 2.7615947177166956e-05, "loss": 0.02, "step": 53230 }, { "grad_norm": 0.37066155672073364, "learning_rate": 2.7597467569534553e-05, "loss": 0.0144, "step": 53240 }, { "grad_norm": 0.2736389636993408, "learning_rate": 2.757899178988226e-05, "loss": 0.018, "step": 53250 }, { "grad_norm": 0.24169984459877014, "learning_rate": 2.7560519841367005e-05, "loss": 0.0207, "step": 53260 }, { "grad_norm": 0.18771901726722717, "learning_rate": 2.7542051727145237e-05, "loss": 0.0207, "step": 53270 }, { "grad_norm": 0.30466389656066895, "learning_rate": 2.7523587450372578e-05, "loss": 0.0259, "step": 53280 }, { "grad_norm": 0.5242581963539124, "learning_rate": 2.750512701420409e-05, "loss": 0.0176, "step": 53290 }, { "grad_norm": 0.1982596069574356, "learning_rate": 2.7486670421794158e-05, "loss": 0.0161, "step": 53300 }, { "grad_norm": 0.2930646538734436, "learning_rate": 2.7468217676296515e-05, "loss": 0.0269, "step": 53310 }, { "grad_norm": 0.30906179547309875, "learning_rate": 2.7449768780864226e-05, "loss": 0.0251, "step": 53320 }, { "grad_norm": 0.32294735312461853, "learning_rate": 2.7431323738649724e-05, "loss": 0.0169, "step": 53330 }, { "grad_norm": 0.42241165041923523, "learning_rate": 2.7412882552804713e-05, "loss": 0.0193, "step": 53340 }, { "grad_norm": 0.20548124611377716, "learning_rate": 2.739444522648032e-05, "loss": 0.0128, "step": 53350 }, { "grad_norm": 0.21504251658916473, "learning_rate": 2.737601176282697e-05, "loss": 0.0184, "step": 53360 }, { "grad_norm": 0.2973916232585907, "learning_rate": 2.735758216499445e-05, "loss": 0.0169, "step": 53370 }, { "grad_norm": 0.28913354873657227, "learning_rate": 2.7339156436131864e-05, "loss": 0.0123, "step": 53380 }, { "grad_norm": 0.7425004243850708, "learning_rate": 2.7320734579387663e-05, "loss": 0.0182, "step": 53390 }, { "grad_norm": 0.35859063267707825, "learning_rate": 2.730231659790966e-05, "loss": 0.0157, "step": 53400 }, { "grad_norm": 0.3220578730106354, "learning_rate": 2.728390249484494e-05, "loss": 0.0205, "step": 53410 }, { "grad_norm": 0.20711244642734528, "learning_rate": 2.7265492273340032e-05, "loss": 0.0173, "step": 53420 }, { "grad_norm": 0.25847774744033813, "learning_rate": 2.7247085936540678e-05, "loss": 0.0135, "step": 53430 }, { "grad_norm": 0.2140737771987915, "learning_rate": 2.722868348759205e-05, "loss": 0.0203, "step": 53440 }, { "grad_norm": 0.2240860015153885, "learning_rate": 2.7210284929638635e-05, "loss": 0.0208, "step": 53450 }, { "grad_norm": 0.18880537152290344, "learning_rate": 2.7191890265824183e-05, "loss": 0.0183, "step": 53460 }, { "grad_norm": 0.2015102505683899, "learning_rate": 2.7173499499291926e-05, "loss": 0.0165, "step": 53470 }, { "grad_norm": 0.3769358694553375, "learning_rate": 2.7155112633184277e-05, "loss": 0.0171, "step": 53480 }, { "grad_norm": 0.26402077078819275, "learning_rate": 2.713672967064307e-05, "loss": 0.0228, "step": 53490 }, { "grad_norm": 0.16819609701633453, "learning_rate": 2.711835061480945e-05, "loss": 0.0106, "step": 53500 }, { "grad_norm": 0.3732723891735077, "learning_rate": 2.7099975468823896e-05, "loss": 0.0229, "step": 53510 }, { "grad_norm": 0.23583608865737915, "learning_rate": 2.708160423582622e-05, "loss": 0.0189, "step": 53520 }, { "grad_norm": 0.5452398657798767, "learning_rate": 2.706323691895557e-05, "loss": 0.0223, "step": 53530 }, { "grad_norm": 0.4007023572921753, "learning_rate": 2.70448735213504e-05, "loss": 0.0146, "step": 53540 }, { "grad_norm": 0.3252926468849182, "learning_rate": 2.702651404614852e-05, "loss": 0.0214, "step": 53550 }, { "grad_norm": 0.2593262791633606, "learning_rate": 2.7008158496487056e-05, "loss": 0.02, "step": 53560 }, { "grad_norm": 0.20999284088611603, "learning_rate": 2.6989806875502487e-05, "loss": 0.0175, "step": 53570 }, { "grad_norm": 0.21662470698356628, "learning_rate": 2.6971459186330584e-05, "loss": 0.0139, "step": 53580 }, { "grad_norm": 0.3743145167827606, "learning_rate": 2.695311543210648e-05, "loss": 0.0188, "step": 53590 }, { "grad_norm": 0.30980631709098816, "learning_rate": 2.693477561596463e-05, "loss": 0.015, "step": 53600 }, { "grad_norm": 0.29672789573669434, "learning_rate": 2.6916439741038756e-05, "loss": 0.0174, "step": 53610 }, { "grad_norm": 0.309842586517334, "learning_rate": 2.689810781046203e-05, "loss": 0.0181, "step": 53620 }, { "grad_norm": 0.17554591596126556, "learning_rate": 2.6879779827366823e-05, "loss": 0.0184, "step": 53630 }, { "grad_norm": 0.18716394901275635, "learning_rate": 2.6861455794884904e-05, "loss": 0.0186, "step": 53640 }, { "grad_norm": 0.24309277534484863, "learning_rate": 2.6843135716147373e-05, "loss": 0.0175, "step": 53650 }, { "grad_norm": 0.26026451587677, "learning_rate": 2.6824819594284556e-05, "loss": 0.0202, "step": 53660 }, { "grad_norm": 0.23357023298740387, "learning_rate": 2.6806507432426275e-05, "loss": 0.0165, "step": 53670 }, { "grad_norm": 0.805025577545166, "learning_rate": 2.6788199233701512e-05, "loss": 0.0183, "step": 53680 }, { "grad_norm": 0.31934747099876404, "learning_rate": 2.6769895001238652e-05, "loss": 0.0212, "step": 53690 }, { "grad_norm": 0.3674349784851074, "learning_rate": 2.67515947381654e-05, "loss": 0.0201, "step": 53700 }, { "grad_norm": 0.21064361929893494, "learning_rate": 2.6733298447608745e-05, "loss": 0.0149, "step": 53710 }, { "grad_norm": 0.5651523470878601, "learning_rate": 2.6715006132695074e-05, "loss": 0.027, "step": 53720 }, { "grad_norm": 0.24205535650253296, "learning_rate": 2.6696717796549985e-05, "loss": 0.0255, "step": 53730 }, { "grad_norm": 0.4045044779777527, "learning_rate": 2.667843344229848e-05, "loss": 0.0192, "step": 53740 }, { "grad_norm": 0.3879241943359375, "learning_rate": 2.6660153073064846e-05, "loss": 0.0206, "step": 53750 }, { "grad_norm": 0.35307776927948, "learning_rate": 2.664187669197271e-05, "loss": 0.0229, "step": 53760 }, { "grad_norm": 0.40535998344421387, "learning_rate": 2.6623604302145005e-05, "loss": 0.0166, "step": 53770 }, { "grad_norm": 0.2743804454803467, "learning_rate": 2.6605335906703975e-05, "loss": 0.0175, "step": 53780 }, { "grad_norm": 0.25931763648986816, "learning_rate": 2.65870715087712e-05, "loss": 0.0204, "step": 53790 }, { "grad_norm": 0.24160197377204895, "learning_rate": 2.6568811111467573e-05, "loss": 0.0155, "step": 53800 }, { "grad_norm": 0.2535805404186249, "learning_rate": 2.6550554717913258e-05, "loss": 0.0203, "step": 53810 }, { "grad_norm": 0.2606932520866394, "learning_rate": 2.6532302331227805e-05, "loss": 0.0178, "step": 53820 }, { "grad_norm": 0.29456111788749695, "learning_rate": 2.651405395453004e-05, "loss": 0.019, "step": 53830 }, { "grad_norm": 0.22237969934940338, "learning_rate": 2.6495809590938115e-05, "loss": 0.0218, "step": 53840 }, { "grad_norm": 0.1582537591457367, "learning_rate": 2.64775692435695e-05, "loss": 0.0197, "step": 53850 }, { "grad_norm": 0.3046765923500061, "learning_rate": 2.6459332915540928e-05, "loss": 0.0182, "step": 53860 }, { "grad_norm": 0.25614699721336365, "learning_rate": 2.644110060996856e-05, "loss": 0.0238, "step": 53870 }, { "grad_norm": 0.30202025175094604, "learning_rate": 2.642287232996774e-05, "loss": 0.0178, "step": 53880 }, { "grad_norm": 0.239533469080925, "learning_rate": 2.6404648078653205e-05, "loss": 0.0139, "step": 53890 }, { "grad_norm": 0.47622427344322205, "learning_rate": 2.6386427859139002e-05, "loss": 0.0173, "step": 53900 }, { "grad_norm": 0.31651467084884644, "learning_rate": 2.63682116745384e-05, "loss": 0.0197, "step": 53910 }, { "grad_norm": 0.3309735357761383, "learning_rate": 2.6349999527964138e-05, "loss": 0.0169, "step": 53920 }, { "grad_norm": 0.21222303807735443, "learning_rate": 2.63317914225281e-05, "loss": 0.02, "step": 53930 }, { "grad_norm": 0.1909845471382141, "learning_rate": 2.6313587361341585e-05, "loss": 0.0185, "step": 53940 }, { "grad_norm": 0.509610116481781, "learning_rate": 2.6295387347515165e-05, "loss": 0.0248, "step": 53950 }, { "grad_norm": 0.12409067898988724, "learning_rate": 2.6277191384158727e-05, "loss": 0.0256, "step": 53960 }, { "grad_norm": 0.26313871145248413, "learning_rate": 2.6258999474381457e-05, "loss": 0.0222, "step": 53970 }, { "grad_norm": 0.2095792442560196, "learning_rate": 2.624081162129186e-05, "loss": 0.02, "step": 53980 }, { "grad_norm": 0.25024527311325073, "learning_rate": 2.6222627827997765e-05, "loss": 0.0312, "step": 53990 }, { "grad_norm": 0.25555261969566345, "learning_rate": 2.6204448097606236e-05, "loss": 0.0203, "step": 54000 }, { "grad_norm": 0.22058996558189392, "learning_rate": 2.6186272433223726e-05, "loss": 0.0195, "step": 54010 }, { "grad_norm": 0.18774235248565674, "learning_rate": 2.6168100837955943e-05, "loss": 0.0211, "step": 54020 }, { "grad_norm": 0.2359543740749359, "learning_rate": 2.6149933314907926e-05, "loss": 0.0239, "step": 54030 }, { "grad_norm": 0.38049590587615967, "learning_rate": 2.6131769867184e-05, "loss": 0.0256, "step": 54040 }, { "grad_norm": 0.3022622764110565, "learning_rate": 2.611361049788783e-05, "loss": 0.0233, "step": 54050 }, { "grad_norm": 0.34043532609939575, "learning_rate": 2.6095455210122292e-05, "loss": 0.0211, "step": 54060 }, { "grad_norm": 0.21919700503349304, "learning_rate": 2.6077304006989712e-05, "loss": 0.0123, "step": 54070 }, { "grad_norm": 0.2475307136774063, "learning_rate": 2.6059156891591562e-05, "loss": 0.0169, "step": 54080 }, { "grad_norm": 0.2745704650878906, "learning_rate": 2.6041013867028718e-05, "loss": 0.0198, "step": 54090 }, { "grad_norm": 0.31221187114715576, "learning_rate": 2.6022874936401347e-05, "loss": 0.0138, "step": 54100 }, { "grad_norm": 0.20047491788864136, "learning_rate": 2.6004740102808832e-05, "loss": 0.0205, "step": 54110 }, { "grad_norm": 0.30825623869895935, "learning_rate": 2.598660936935e-05, "loss": 0.0128, "step": 54120 }, { "grad_norm": 0.3920084238052368, "learning_rate": 2.5968482739122845e-05, "loss": 0.0194, "step": 54130 }, { "grad_norm": 0.28216516971588135, "learning_rate": 2.595036021522472e-05, "loss": 0.0156, "step": 54140 }, { "grad_norm": 0.738369882106781, "learning_rate": 2.5932241800752278e-05, "loss": 0.0228, "step": 54150 }, { "grad_norm": 0.45232370495796204, "learning_rate": 2.5914127498801453e-05, "loss": 0.0346, "step": 54160 }, { "grad_norm": 0.21819080412387848, "learning_rate": 2.5896017312467497e-05, "loss": 0.0177, "step": 54170 }, { "grad_norm": 0.33746370673179626, "learning_rate": 2.587791124484493e-05, "loss": 0.0156, "step": 54180 }, { "grad_norm": 0.24270521104335785, "learning_rate": 2.5859809299027615e-05, "loss": 0.0179, "step": 54190 }, { "grad_norm": 0.263785719871521, "learning_rate": 2.5841711478108632e-05, "loss": 0.0301, "step": 54200 }, { "grad_norm": 0.29352208971977234, "learning_rate": 2.582361778518043e-05, "loss": 0.0178, "step": 54210 }, { "grad_norm": 0.23424115777015686, "learning_rate": 2.580552822333472e-05, "loss": 0.0196, "step": 54220 }, { "grad_norm": 0.35259589552879333, "learning_rate": 2.578744279566252e-05, "loss": 0.0196, "step": 54230 }, { "grad_norm": 0.1770133674144745, "learning_rate": 2.576936150525413e-05, "loss": 0.0188, "step": 54240 }, { "grad_norm": 0.3197226822376251, "learning_rate": 2.5751284355199168e-05, "loss": 0.0158, "step": 54250 }, { "grad_norm": 0.14033478498458862, "learning_rate": 2.573321134858646e-05, "loss": 0.0116, "step": 54260 }, { "grad_norm": 0.1699572652578354, "learning_rate": 2.5715142488504286e-05, "loss": 0.0138, "step": 54270 }, { "grad_norm": 0.1559111326932907, "learning_rate": 2.5697077778040042e-05, "loss": 0.0193, "step": 54280 }, { "grad_norm": 0.300785630941391, "learning_rate": 2.5679017220280522e-05, "loss": 0.0324, "step": 54290 }, { "grad_norm": 0.17827177047729492, "learning_rate": 2.5660960818311796e-05, "loss": 0.0175, "step": 54300 }, { "grad_norm": 0.2961977422237396, "learning_rate": 2.564290857521915e-05, "loss": 0.0179, "step": 54310 }, { "grad_norm": 0.40404608845710754, "learning_rate": 2.5624860494087298e-05, "loss": 0.0187, "step": 54320 }, { "grad_norm": 0.16520805656909943, "learning_rate": 2.5606816578000115e-05, "loss": 0.0216, "step": 54330 }, { "grad_norm": 0.26897940039634705, "learning_rate": 2.558877683004082e-05, "loss": 0.017, "step": 54340 }, { "grad_norm": 0.1715862900018692, "learning_rate": 2.557074125329192e-05, "loss": 0.0154, "step": 54350 }, { "grad_norm": 0.2666095197200775, "learning_rate": 2.5552709850835195e-05, "loss": 0.0157, "step": 54360 }, { "grad_norm": 0.2311740517616272, "learning_rate": 2.5534682625751738e-05, "loss": 0.012, "step": 54370 }, { "grad_norm": 0.35824671387672424, "learning_rate": 2.551665958112186e-05, "loss": 0.0254, "step": 54380 }, { "grad_norm": 0.2097979485988617, "learning_rate": 2.549864072002527e-05, "loss": 0.0145, "step": 54390 }, { "grad_norm": 0.2478088140487671, "learning_rate": 2.5480626045540858e-05, "loss": 0.0244, "step": 54400 }, { "grad_norm": 0.26264873147010803, "learning_rate": 2.546261556074684e-05, "loss": 0.017, "step": 54410 }, { "grad_norm": 0.28049999475479126, "learning_rate": 2.5444609268720726e-05, "loss": 0.0184, "step": 54420 }, { "grad_norm": 0.19454945623874664, "learning_rate": 2.5426607172539297e-05, "loss": 0.0167, "step": 54430 }, { "grad_norm": 0.2751863896846771, "learning_rate": 2.5408609275278617e-05, "loss": 0.0223, "step": 54440 }, { "grad_norm": 0.19523076713085175, "learning_rate": 2.5390615580014055e-05, "loss": 0.0184, "step": 54450 }, { "grad_norm": 0.28772255778312683, "learning_rate": 2.5372626089820207e-05, "loss": 0.0223, "step": 54460 }, { "grad_norm": 0.337815523147583, "learning_rate": 2.5354640807770997e-05, "loss": 0.0158, "step": 54470 }, { "grad_norm": 0.3796999454498291, "learning_rate": 2.5336659736939622e-05, "loss": 0.0173, "step": 54480 }, { "grad_norm": 0.17898495495319366, "learning_rate": 2.5318682880398554e-05, "loss": 0.0215, "step": 54490 }, { "grad_norm": 0.21020716428756714, "learning_rate": 2.530071024121956e-05, "loss": 0.0195, "step": 54500 }, { "grad_norm": 0.24967901408672333, "learning_rate": 2.5282741822473627e-05, "loss": 0.0157, "step": 54510 }, { "grad_norm": 0.3170825242996216, "learning_rate": 2.526477762723114e-05, "loss": 0.0188, "step": 54520 }, { "grad_norm": 0.27806687355041504, "learning_rate": 2.5246817658561618e-05, "loss": 0.0216, "step": 54530 }, { "grad_norm": 0.16654032468795776, "learning_rate": 2.5228861919533965e-05, "loss": 0.0181, "step": 54540 }, { "grad_norm": 0.15851110219955444, "learning_rate": 2.5210910413216326e-05, "loss": 0.0272, "step": 54550 }, { "grad_norm": 0.26046353578567505, "learning_rate": 2.5192963142676086e-05, "loss": 0.0138, "step": 54560 }, { "grad_norm": 0.2363273948431015, "learning_rate": 2.517502011098001e-05, "loss": 0.0147, "step": 54570 }, { "grad_norm": 0.17628051340579987, "learning_rate": 2.5157081321193987e-05, "loss": 0.0175, "step": 54580 }, { "grad_norm": 0.24825744330883026, "learning_rate": 2.5139146776383356e-05, "loss": 0.0175, "step": 54590 }, { "grad_norm": 0.21106670796871185, "learning_rate": 2.5121216479612575e-05, "loss": 0.0198, "step": 54600 }, { "grad_norm": 0.27342697978019714, "learning_rate": 2.510329043394546e-05, "loss": 0.0278, "step": 54610 }, { "grad_norm": 0.355867862701416, "learning_rate": 2.508536864244508e-05, "loss": 0.0246, "step": 54620 }, { "grad_norm": 0.24933560192584991, "learning_rate": 2.5067451108173778e-05, "loss": 0.0173, "step": 54630 }, { "grad_norm": 0.34819871187210083, "learning_rate": 2.5049537834193204e-05, "loss": 0.0205, "step": 54640 }, { "grad_norm": 0.9184367656707764, "learning_rate": 2.5031628823564194e-05, "loss": 0.019, "step": 54650 }, { "grad_norm": 0.29152870178222656, "learning_rate": 2.5013724079346933e-05, "loss": 0.0197, "step": 54660 }, { "grad_norm": 0.22519122064113617, "learning_rate": 2.4995823604600854e-05, "loss": 0.0156, "step": 54670 }, { "grad_norm": 0.2419939935207367, "learning_rate": 2.497792740238465e-05, "loss": 0.017, "step": 54680 }, { "grad_norm": 0.2844580113887787, "learning_rate": 2.49600354757563e-05, "loss": 0.0199, "step": 54690 }, { "grad_norm": 0.21845518052577972, "learning_rate": 2.494214782777306e-05, "loss": 0.0181, "step": 54700 }, { "grad_norm": 0.30952373147010803, "learning_rate": 2.4924264461491386e-05, "loss": 0.0237, "step": 54710 }, { "grad_norm": 0.32077309489250183, "learning_rate": 2.4906385379967133e-05, "loss": 0.0194, "step": 54720 }, { "grad_norm": 0.24015994369983673, "learning_rate": 2.4888510586255285e-05, "loss": 0.0138, "step": 54730 }, { "grad_norm": 0.1884249597787857, "learning_rate": 2.487064008341018e-05, "loss": 0.0168, "step": 54740 }, { "grad_norm": 0.3021093010902405, "learning_rate": 2.4852773874485407e-05, "loss": 0.0163, "step": 54750 }, { "grad_norm": 0.30699414014816284, "learning_rate": 2.483491196253377e-05, "loss": 0.0203, "step": 54760 }, { "grad_norm": 0.26252424716949463, "learning_rate": 2.4817054350607443e-05, "loss": 0.0204, "step": 54770 }, { "grad_norm": 0.5376703143119812, "learning_rate": 2.4799201041757742e-05, "loss": 0.0236, "step": 54780 }, { "grad_norm": 0.33234667778015137, "learning_rate": 2.4781352039035373e-05, "loss": 0.0179, "step": 54790 }, { "grad_norm": 0.16792263090610504, "learning_rate": 2.4763507345490194e-05, "loss": 0.0132, "step": 54800 }, { "grad_norm": 0.22077719867229462, "learning_rate": 2.4745666964171386e-05, "loss": 0.0178, "step": 54810 }, { "grad_norm": 0.8380488157272339, "learning_rate": 2.4727830898127387e-05, "loss": 0.0226, "step": 54820 }, { "grad_norm": 0.18641406297683716, "learning_rate": 2.4709999150405895e-05, "loss": 0.0251, "step": 54830 }, { "grad_norm": 0.29865866899490356, "learning_rate": 2.469217172405388e-05, "loss": 0.0165, "step": 54840 }, { "grad_norm": 0.20319689810276031, "learning_rate": 2.4674348622117527e-05, "loss": 0.0135, "step": 54850 }, { "grad_norm": 0.815064549446106, "learning_rate": 2.465652984764234e-05, "loss": 0.0211, "step": 54860 }, { "grad_norm": 0.19120261073112488, "learning_rate": 2.4638715403673056e-05, "loss": 0.0145, "step": 54870 }, { "grad_norm": 0.4014303982257843, "learning_rate": 2.4620905293253676e-05, "loss": 0.0181, "step": 54880 }, { "grad_norm": 0.284505695104599, "learning_rate": 2.4603099519427463e-05, "loss": 0.0193, "step": 54890 }, { "grad_norm": 0.7445740103721619, "learning_rate": 2.458529808523695e-05, "loss": 0.0218, "step": 54900 }, { "grad_norm": 0.27958041429519653, "learning_rate": 2.456750099372387e-05, "loss": 0.0151, "step": 54910 }, { "grad_norm": 0.23638303577899933, "learning_rate": 2.4549708247929327e-05, "loss": 0.0212, "step": 54920 }, { "grad_norm": 0.22693811357021332, "learning_rate": 2.4531919850893554e-05, "loss": 0.0147, "step": 54930 }, { "grad_norm": 0.1611347794532776, "learning_rate": 2.4514135805656125e-05, "loss": 0.0139, "step": 54940 }, { "grad_norm": 0.3033471703529358, "learning_rate": 2.449635611525587e-05, "loss": 0.0193, "step": 54950 }, { "grad_norm": 0.2321266382932663, "learning_rate": 2.447858078273079e-05, "loss": 0.0181, "step": 54960 }, { "grad_norm": 0.2431270182132721, "learning_rate": 2.4460809811118275e-05, "loss": 0.0248, "step": 54970 }, { "grad_norm": 0.20199745893478394, "learning_rate": 2.444304320345483e-05, "loss": 0.0142, "step": 54980 }, { "grad_norm": 0.2612610161304474, "learning_rate": 2.4425280962776354e-05, "loss": 0.0206, "step": 54990 }, { "grad_norm": 0.2890211045742035, "learning_rate": 2.4407523092117875e-05, "loss": 0.0198, "step": 55000 }, { "grad_norm": 0.4136224687099457, "learning_rate": 2.438976959451374e-05, "loss": 0.0156, "step": 55010 }, { "grad_norm": 0.46007248759269714, "learning_rate": 2.4372020472997565e-05, "loss": 0.0193, "step": 55020 }, { "grad_norm": 0.25223636627197266, "learning_rate": 2.4354275730602122e-05, "loss": 0.0169, "step": 55030 }, { "grad_norm": 0.3520875573158264, "learning_rate": 2.433653537035958e-05, "loss": 0.0194, "step": 55040 }, { "grad_norm": 0.21048316359519958, "learning_rate": 2.431879939530123e-05, "loss": 0.0207, "step": 55050 }, { "grad_norm": 0.21275828778743744, "learning_rate": 2.4301067808457684e-05, "loss": 0.0136, "step": 55060 }, { "grad_norm": 0.2578234374523163, "learning_rate": 2.428334061285878e-05, "loss": 0.0156, "step": 55070 }, { "grad_norm": 0.2742640972137451, "learning_rate": 2.426561781153361e-05, "loss": 0.0183, "step": 55080 }, { "grad_norm": 0.17962221801280975, "learning_rate": 2.424789940751052e-05, "loss": 0.0181, "step": 55090 }, { "grad_norm": 0.32425186038017273, "learning_rate": 2.423018540381712e-05, "loss": 0.0201, "step": 55100 }, { "grad_norm": 0.16522659361362457, "learning_rate": 2.421247580348021e-05, "loss": 0.0237, "step": 55110 }, { "grad_norm": 0.2830313444137573, "learning_rate": 2.4194770609525897e-05, "loss": 0.0145, "step": 55120 }, { "grad_norm": 0.2450627088546753, "learning_rate": 2.417706982497951e-05, "loss": 0.0175, "step": 55130 }, { "grad_norm": 0.15749724209308624, "learning_rate": 2.415937345286563e-05, "loss": 0.0206, "step": 55140 }, { "grad_norm": 0.2784959375858307, "learning_rate": 2.4141681496208087e-05, "loss": 0.0174, "step": 55150 }, { "grad_norm": 0.22557775676250458, "learning_rate": 2.4123993958029946e-05, "loss": 0.0149, "step": 55160 }, { "grad_norm": 0.4260348677635193, "learning_rate": 2.4106310841353548e-05, "loss": 0.0199, "step": 55170 }, { "grad_norm": 0.1471235454082489, "learning_rate": 2.4088632149200398e-05, "loss": 0.0213, "step": 55180 }, { "grad_norm": 0.21999552845954895, "learning_rate": 2.4070957884591367e-05, "loss": 0.0165, "step": 55190 }, { "grad_norm": 0.5528849363327026, "learning_rate": 2.4053288050546464e-05, "loss": 0.0241, "step": 55200 }, { "grad_norm": 0.21723927557468414, "learning_rate": 2.403562265008498e-05, "loss": 0.0213, "step": 55210 }, { "grad_norm": 0.24623173475265503, "learning_rate": 2.4017961686225483e-05, "loss": 0.0146, "step": 55220 }, { "grad_norm": 0.22460472583770752, "learning_rate": 2.400030516198568e-05, "loss": 0.0252, "step": 55230 }, { "grad_norm": 0.21610280871391296, "learning_rate": 2.3982653080382673e-05, "loss": 0.0198, "step": 55240 }, { "grad_norm": 0.36687248945236206, "learning_rate": 2.396500544443266e-05, "loss": 0.0169, "step": 55250 }, { "grad_norm": 0.24860763549804688, "learning_rate": 2.3947362257151156e-05, "loss": 0.0152, "step": 55260 }, { "grad_norm": 0.3863976001739502, "learning_rate": 2.39297235215529e-05, "loss": 0.0232, "step": 55270 }, { "grad_norm": 0.2271682620048523, "learning_rate": 2.3912089240651873e-05, "loss": 0.0212, "step": 55280 }, { "grad_norm": 0.1619114875793457, "learning_rate": 2.389445941746129e-05, "loss": 0.025, "step": 55290 }, { "grad_norm": 0.2803623378276825, "learning_rate": 2.3876834054993625e-05, "loss": 0.0205, "step": 55300 }, { "grad_norm": 0.22054794430732727, "learning_rate": 2.3859213156260522e-05, "loss": 0.0154, "step": 55310 }, { "grad_norm": 0.2313164472579956, "learning_rate": 2.3841596724272948e-05, "loss": 0.0174, "step": 55320 }, { "grad_norm": 0.24633127450942993, "learning_rate": 2.382398476204106e-05, "loss": 0.0157, "step": 55330 }, { "grad_norm": 0.2411113679409027, "learning_rate": 2.3806377272574254e-05, "loss": 0.0159, "step": 55340 }, { "grad_norm": 0.3472982943058014, "learning_rate": 2.3788774258881174e-05, "loss": 0.0165, "step": 55350 }, { "grad_norm": 0.5761951804161072, "learning_rate": 2.37711757239697e-05, "loss": 0.0203, "step": 55360 }, { "grad_norm": 0.30983883142471313, "learning_rate": 2.3753581670846954e-05, "loss": 0.0109, "step": 55370 }, { "grad_norm": 0.15850421786308289, "learning_rate": 2.3735992102519216e-05, "loss": 0.02, "step": 55380 }, { "grad_norm": 0.24884703755378723, "learning_rate": 2.371840702199215e-05, "loss": 0.015, "step": 55390 }, { "grad_norm": 0.20190560817718506, "learning_rate": 2.3700826432270494e-05, "loss": 0.0118, "step": 55400 }, { "grad_norm": 0.2525915801525116, "learning_rate": 2.3683250336358326e-05, "loss": 0.0204, "step": 55410 }, { "grad_norm": 0.19755741953849792, "learning_rate": 2.3665678737258923e-05, "loss": 0.0252, "step": 55420 }, { "grad_norm": 0.1951468288898468, "learning_rate": 2.3648111637974745e-05, "loss": 0.0178, "step": 55430 }, { "grad_norm": 0.2264404296875, "learning_rate": 2.36305490415076e-05, "loss": 0.0168, "step": 55440 }, { "grad_norm": 0.36240994930267334, "learning_rate": 2.3612990950858392e-05, "loss": 0.021, "step": 55450 }, { "grad_norm": 0.23126450181007385, "learning_rate": 2.359543736902735e-05, "loss": 0.0152, "step": 55460 }, { "grad_norm": 0.2032727152109146, "learning_rate": 2.3577888299013896e-05, "loss": 0.0142, "step": 55470 }, { "grad_norm": 0.24952588975429535, "learning_rate": 2.3560343743816683e-05, "loss": 0.0162, "step": 55480 }, { "grad_norm": 0.22970476746559143, "learning_rate": 2.354280370643362e-05, "loss": 0.0213, "step": 55490 }, { "grad_norm": 0.4424349367618561, "learning_rate": 2.3525268189861777e-05, "loss": 0.0168, "step": 55500 }, { "grad_norm": 0.38994887471199036, "learning_rate": 2.3507737197097513e-05, "loss": 0.0153, "step": 55510 }, { "grad_norm": 0.25409257411956787, "learning_rate": 2.34902107311364e-05, "loss": 0.0178, "step": 55520 }, { "grad_norm": 0.2253188043832779, "learning_rate": 2.3472688794973225e-05, "loss": 0.0191, "step": 55530 }, { "grad_norm": 0.2203374058008194, "learning_rate": 2.3455171391602016e-05, "loss": 0.0208, "step": 55540 }, { "grad_norm": 0.2737092971801758, "learning_rate": 2.343765852401601e-05, "loss": 0.0153, "step": 55550 }, { "grad_norm": 0.2935727536678314, "learning_rate": 2.342015019520768e-05, "loss": 0.0203, "step": 55560 }, { "grad_norm": 0.2857016623020172, "learning_rate": 2.3402646408168742e-05, "loss": 0.0177, "step": 55570 }, { "grad_norm": 0.40787309408187866, "learning_rate": 2.3385147165890074e-05, "loss": 0.0244, "step": 55580 }, { "grad_norm": 0.17664478719234467, "learning_rate": 2.336765247136184e-05, "loss": 0.0168, "step": 55590 }, { "grad_norm": 0.4135848581790924, "learning_rate": 2.33501623275734e-05, "loss": 0.0274, "step": 55600 }, { "grad_norm": 0.3828704059123993, "learning_rate": 2.333267673751334e-05, "loss": 0.0167, "step": 55610 }, { "grad_norm": 0.27844780683517456, "learning_rate": 2.33151957041695e-05, "loss": 0.015, "step": 55620 }, { "grad_norm": 0.27033936977386475, "learning_rate": 2.329771923052884e-05, "loss": 0.0185, "step": 55630 }, { "grad_norm": 0.25462716817855835, "learning_rate": 2.3280247319577697e-05, "loss": 0.021, "step": 55640 }, { "grad_norm": 0.38302868604660034, "learning_rate": 2.3262779974301473e-05, "loss": 0.0218, "step": 55650 }, { "grad_norm": 0.36136502027511597, "learning_rate": 2.3245317197684895e-05, "loss": 0.0172, "step": 55660 }, { "grad_norm": 0.22279569506645203, "learning_rate": 2.322785899271188e-05, "loss": 0.0168, "step": 55670 }, { "grad_norm": 0.29251930117607117, "learning_rate": 2.3210405362365507e-05, "loss": 0.024, "step": 55680 }, { "grad_norm": 0.35701894760131836, "learning_rate": 2.31929563096282e-05, "loss": 0.0226, "step": 55690 }, { "grad_norm": 0.23635730147361755, "learning_rate": 2.317551183748146e-05, "loss": 0.0169, "step": 55700 }, { "grad_norm": 0.20452119410037994, "learning_rate": 2.3158071948906103e-05, "loss": 0.0186, "step": 55710 }, { "grad_norm": 0.22261933982372284, "learning_rate": 2.314063664688212e-05, "loss": 0.0166, "step": 55720 }, { "grad_norm": 0.3046751022338867, "learning_rate": 2.3123205934388725e-05, "loss": 0.0135, "step": 55730 }, { "grad_norm": 0.16972710192203522, "learning_rate": 2.310577981440436e-05, "loss": 0.021, "step": 55740 }, { "grad_norm": 0.20773077011108398, "learning_rate": 2.3088358289906665e-05, "loss": 0.0167, "step": 55750 }, { "grad_norm": 0.29881608486175537, "learning_rate": 2.307094136387252e-05, "loss": 0.0177, "step": 55760 }, { "grad_norm": 0.2760872542858124, "learning_rate": 2.305352903927796e-05, "loss": 0.0232, "step": 55770 }, { "grad_norm": 0.3227195739746094, "learning_rate": 2.303612131909831e-05, "loss": 0.0163, "step": 55780 }, { "grad_norm": 0.390119343996048, "learning_rate": 2.3018718206308054e-05, "loss": 0.0229, "step": 55790 }, { "grad_norm": 0.3636767864227295, "learning_rate": 2.3001319703880925e-05, "loss": 0.0196, "step": 55800 }, { "grad_norm": 0.3014564514160156, "learning_rate": 2.2983925814789835e-05, "loss": 0.0177, "step": 55810 }, { "grad_norm": 0.23712551593780518, "learning_rate": 2.2966536542006957e-05, "loss": 0.0187, "step": 55820 }, { "grad_norm": 0.21897616982460022, "learning_rate": 2.294915188850358e-05, "loss": 0.0157, "step": 55830 }, { "grad_norm": 0.18744970858097076, "learning_rate": 2.2931771857250333e-05, "loss": 0.0186, "step": 55840 }, { "grad_norm": 0.19459235668182373, "learning_rate": 2.2914396451216946e-05, "loss": 0.0141, "step": 55850 }, { "grad_norm": 0.28795671463012695, "learning_rate": 2.2897025673372412e-05, "loss": 0.0181, "step": 55860 }, { "grad_norm": 0.20188304781913757, "learning_rate": 2.287965952668494e-05, "loss": 0.0159, "step": 55870 }, { "grad_norm": 0.26813966035842896, "learning_rate": 2.2862298014121873e-05, "loss": 0.0175, "step": 55880 }, { "grad_norm": 0.19681519269943237, "learning_rate": 2.28449411386499e-05, "loss": 0.0155, "step": 55890 }, { "grad_norm": 0.20480601489543915, "learning_rate": 2.282758890323477e-05, "loss": 0.0186, "step": 55900 }, { "grad_norm": 0.1959836781024933, "learning_rate": 2.2810241310841528e-05, "loss": 0.0166, "step": 55910 }, { "grad_norm": 0.24325965344905853, "learning_rate": 2.2792898364434412e-05, "loss": 0.0136, "step": 55920 }, { "grad_norm": 0.3170790672302246, "learning_rate": 2.2775560066976843e-05, "loss": 0.0175, "step": 55930 }, { "grad_norm": 0.17708395421504974, "learning_rate": 2.275822642143147e-05, "loss": 0.0156, "step": 55940 }, { "grad_norm": 0.28037774562835693, "learning_rate": 2.274089743076014e-05, "loss": 0.0167, "step": 55950 }, { "grad_norm": 0.32807794213294983, "learning_rate": 2.2723573097923917e-05, "loss": 0.0171, "step": 55960 }, { "grad_norm": 0.19645196199417114, "learning_rate": 2.2706253425883024e-05, "loss": 0.0147, "step": 55970 }, { "grad_norm": 0.2848632335662842, "learning_rate": 2.2688938417596933e-05, "loss": 0.0182, "step": 55980 }, { "grad_norm": 0.16972248256206512, "learning_rate": 2.2671628076024305e-05, "loss": 0.0136, "step": 55990 }, { "grad_norm": 0.22955498099327087, "learning_rate": 2.2654322404123008e-05, "loss": 0.0127, "step": 56000 }, { "grad_norm": 0.15905477106571198, "learning_rate": 2.2637021404850105e-05, "loss": 0.017, "step": 56010 }, { "grad_norm": 0.3404504358768463, "learning_rate": 2.2619725081161885e-05, "loss": 0.0197, "step": 56020 }, { "grad_norm": 0.22417759895324707, "learning_rate": 2.260243343601376e-05, "loss": 0.017, "step": 56030 }, { "grad_norm": 0.2058088630437851, "learning_rate": 2.2585146472360473e-05, "loss": 0.0183, "step": 56040 }, { "grad_norm": 0.2726115584373474, "learning_rate": 2.2567864193155834e-05, "loss": 0.0171, "step": 56050 }, { "grad_norm": 0.21631361544132233, "learning_rate": 2.255058660135294e-05, "loss": 0.0189, "step": 56060 }, { "grad_norm": 0.42579385638237, "learning_rate": 2.253331369990407e-05, "loss": 0.0221, "step": 56070 }, { "grad_norm": 0.19692395627498627, "learning_rate": 2.251604549176063e-05, "loss": 0.0204, "step": 56080 }, { "grad_norm": 0.25266003608703613, "learning_rate": 2.249878197987337e-05, "loss": 0.0176, "step": 56090 }, { "grad_norm": 0.34713178873062134, "learning_rate": 2.2481523167192087e-05, "loss": 0.0171, "step": 56100 }, { "grad_norm": 0.22565533220767975, "learning_rate": 2.2464269056665867e-05, "loss": 0.0166, "step": 56110 }, { "grad_norm": 0.23061086237430573, "learning_rate": 2.2447019651242958e-05, "loss": 0.0155, "step": 56120 }, { "grad_norm": 0.3427419662475586, "learning_rate": 2.2429774953870818e-05, "loss": 0.0169, "step": 56130 }, { "grad_norm": 0.3945004940032959, "learning_rate": 2.241253496749611e-05, "loss": 0.0204, "step": 56140 }, { "grad_norm": 0.23959575593471527, "learning_rate": 2.2395299695064614e-05, "loss": 0.0159, "step": 56150 }, { "grad_norm": 0.26877740025520325, "learning_rate": 2.237806913952145e-05, "loss": 0.0177, "step": 56160 }, { "grad_norm": 0.25064945220947266, "learning_rate": 2.2360843303810798e-05, "loss": 0.0159, "step": 56170 }, { "grad_norm": 0.27789542078971863, "learning_rate": 2.2343622190876084e-05, "loss": 0.0185, "step": 56180 }, { "grad_norm": 0.24795277416706085, "learning_rate": 2.2326405803659935e-05, "loss": 0.016, "step": 56190 }, { "grad_norm": 0.32376477122306824, "learning_rate": 2.230919414510416e-05, "loss": 0.0151, "step": 56200 }, { "grad_norm": 0.3530971109867096, "learning_rate": 2.229198721814976e-05, "loss": 0.0191, "step": 56210 }, { "grad_norm": 0.2955930233001709, "learning_rate": 2.2274785025736948e-05, "loss": 0.0167, "step": 56220 }, { "grad_norm": 0.31166622042655945, "learning_rate": 2.225758757080507e-05, "loss": 0.0174, "step": 56230 }, { "grad_norm": 0.35196539759635925, "learning_rate": 2.2240394856292723e-05, "loss": 0.0164, "step": 56240 }, { "grad_norm": 0.6196457147598267, "learning_rate": 2.2223206885137664e-05, "loss": 0.0158, "step": 56250 }, { "grad_norm": 0.4230599105358124, "learning_rate": 2.2206023660276853e-05, "loss": 0.0172, "step": 56260 }, { "grad_norm": 0.33522430062294006, "learning_rate": 2.218884518464645e-05, "loss": 0.0176, "step": 56270 }, { "grad_norm": 0.3881331980228424, "learning_rate": 2.2171671461181732e-05, "loss": 0.0173, "step": 56280 }, { "grad_norm": 0.3978334367275238, "learning_rate": 2.2154502492817292e-05, "loss": 0.0193, "step": 56290 }, { "grad_norm": 0.18028947710990906, "learning_rate": 2.2137338282486782e-05, "loss": 0.0119, "step": 56300 }, { "grad_norm": 0.10058177262544632, "learning_rate": 2.2120178833123113e-05, "loss": 0.0154, "step": 56310 }, { "grad_norm": 0.32663092017173767, "learning_rate": 2.210302414765838e-05, "loss": 0.0208, "step": 56320 }, { "grad_norm": 0.22217102348804474, "learning_rate": 2.2085874229023808e-05, "loss": 0.0141, "step": 56330 }, { "grad_norm": 0.18792472779750824, "learning_rate": 2.2068729080149907e-05, "loss": 0.0162, "step": 56340 }, { "grad_norm": 0.30913224816322327, "learning_rate": 2.205158870396625e-05, "loss": 0.0165, "step": 56350 }, { "grad_norm": 0.36736953258514404, "learning_rate": 2.2034453103401732e-05, "loss": 0.0203, "step": 56360 }, { "grad_norm": 0.11283062398433685, "learning_rate": 2.20173222813843e-05, "loss": 0.0117, "step": 56370 }, { "grad_norm": 0.22328037023544312, "learning_rate": 2.200019624084116e-05, "loss": 0.0181, "step": 56380 }, { "grad_norm": 0.28876546025276184, "learning_rate": 2.1983074984698687e-05, "loss": 0.0223, "step": 56390 }, { "grad_norm": 0.3615325689315796, "learning_rate": 2.1965958515882433e-05, "loss": 0.0188, "step": 56400 }, { "grad_norm": 0.32995760440826416, "learning_rate": 2.1948846837317162e-05, "loss": 0.0171, "step": 56410 }, { "grad_norm": 0.2160121500492096, "learning_rate": 2.1931739951926738e-05, "loss": 0.0137, "step": 56420 }, { "grad_norm": 0.21220232546329498, "learning_rate": 2.191463786263429e-05, "loss": 0.0127, "step": 56430 }, { "grad_norm": 0.28182855248451233, "learning_rate": 2.1897540572362095e-05, "loss": 0.0169, "step": 56440 }, { "grad_norm": 0.17516039311885834, "learning_rate": 2.1880448084031614e-05, "loss": 0.0123, "step": 56450 }, { "grad_norm": 0.24202796816825867, "learning_rate": 2.1863360400563482e-05, "loss": 0.016, "step": 56460 }, { "grad_norm": 0.4460197389125824, "learning_rate": 2.184627752487754e-05, "loss": 0.0172, "step": 56470 }, { "grad_norm": 0.23316819965839386, "learning_rate": 2.1829199459892725e-05, "loss": 0.0187, "step": 56480 }, { "grad_norm": 0.21641652286052704, "learning_rate": 2.1812126208527282e-05, "loss": 0.0164, "step": 56490 }, { "grad_norm": 0.22647075355052948, "learning_rate": 2.179505777369852e-05, "loss": 0.0214, "step": 56500 }, { "grad_norm": 0.30763521790504456, "learning_rate": 2.1777994158322974e-05, "loss": 0.017, "step": 56510 }, { "grad_norm": 0.27849650382995605, "learning_rate": 2.176093536531637e-05, "loss": 0.0185, "step": 56520 }, { "grad_norm": 0.2665133774280548, "learning_rate": 2.1743881397593537e-05, "loss": 0.0183, "step": 56530 }, { "grad_norm": 0.4281524121761322, "learning_rate": 2.1726832258068595e-05, "loss": 0.0249, "step": 56540 }, { "grad_norm": 0.41377365589141846, "learning_rate": 2.170978794965472e-05, "loss": 0.0163, "step": 56550 }, { "grad_norm": 0.5776898860931396, "learning_rate": 2.169274847526438e-05, "loss": 0.0176, "step": 56560 }, { "grad_norm": 0.2956886291503906, "learning_rate": 2.1675713837809103e-05, "loss": 0.0163, "step": 56570 }, { "grad_norm": 0.1717115044593811, "learning_rate": 2.1658684040199655e-05, "loss": 0.0126, "step": 56580 }, { "grad_norm": 0.24309265613555908, "learning_rate": 2.1641659085345974e-05, "loss": 0.0142, "step": 56590 }, { "grad_norm": 0.1629326343536377, "learning_rate": 2.1624638976157154e-05, "loss": 0.0211, "step": 56600 }, { "grad_norm": 0.2798488140106201, "learning_rate": 2.1607623715541476e-05, "loss": 0.0235, "step": 56610 }, { "grad_norm": 0.2672504782676697, "learning_rate": 2.159061330640636e-05, "loss": 0.0137, "step": 56620 }, { "grad_norm": 0.24703548848628998, "learning_rate": 2.1573607751658425e-05, "loss": 0.0122, "step": 56630 }, { "grad_norm": 0.20884168148040771, "learning_rate": 2.155660705420347e-05, "loss": 0.0184, "step": 56640 }, { "grad_norm": 0.5991812348365784, "learning_rate": 2.153961121694644e-05, "loss": 0.018, "step": 56650 }, { "grad_norm": 0.2774963080883026, "learning_rate": 2.152262024279145e-05, "loss": 0.0268, "step": 56660 }, { "grad_norm": 0.4423239827156067, "learning_rate": 2.150563413464183e-05, "loss": 0.0187, "step": 56670 }, { "grad_norm": 0.17366114258766174, "learning_rate": 2.1488652895399974e-05, "loss": 0.0136, "step": 56680 }, { "grad_norm": 0.3221118450164795, "learning_rate": 2.1471676527967587e-05, "loss": 0.018, "step": 56690 }, { "grad_norm": 0.19062072038650513, "learning_rate": 2.1454705035245414e-05, "loss": 0.0227, "step": 56700 }, { "grad_norm": 0.26436999440193176, "learning_rate": 2.143773842013343e-05, "loss": 0.0185, "step": 56710 }, { "grad_norm": 0.2716779112815857, "learning_rate": 2.1420776685530796e-05, "loss": 0.0225, "step": 56720 }, { "grad_norm": 0.28094619512557983, "learning_rate": 2.1403819834335742e-05, "loss": 0.0107, "step": 56730 }, { "grad_norm": 0.14972180128097534, "learning_rate": 2.1386867869445808e-05, "loss": 0.0159, "step": 56740 }, { "grad_norm": 0.2170572280883789, "learning_rate": 2.1369920793757548e-05, "loss": 0.0196, "step": 56750 }, { "grad_norm": 0.23303109407424927, "learning_rate": 2.1352978610166828e-05, "loss": 0.0139, "step": 56760 }, { "grad_norm": 0.2475552260875702, "learning_rate": 2.1336041321568546e-05, "loss": 0.0166, "step": 56770 }, { "grad_norm": 0.23128755390644073, "learning_rate": 2.131910893085684e-05, "loss": 0.0192, "step": 56780 }, { "grad_norm": 0.21550141274929047, "learning_rate": 2.1302181440925012e-05, "loss": 0.0167, "step": 56790 }, { "grad_norm": 0.21178105473518372, "learning_rate": 2.128525885466546e-05, "loss": 0.0162, "step": 56800 }, { "grad_norm": 0.24212712049484253, "learning_rate": 2.1268341174969847e-05, "loss": 0.0193, "step": 56810 }, { "grad_norm": 0.2788558006286621, "learning_rate": 2.1251428404728894e-05, "loss": 0.0172, "step": 56820 }, { "grad_norm": 0.29281085729599, "learning_rate": 2.123452054683256e-05, "loss": 0.0156, "step": 56830 }, { "grad_norm": 0.3343334496021271, "learning_rate": 2.1217617604169927e-05, "loss": 0.0184, "step": 56840 }, { "grad_norm": 0.33304333686828613, "learning_rate": 2.120071957962924e-05, "loss": 0.0189, "step": 56850 }, { "grad_norm": 0.24952127039432526, "learning_rate": 2.1183826476097917e-05, "loss": 0.0231, "step": 56860 }, { "grad_norm": 0.1981409341096878, "learning_rate": 2.116693829646254e-05, "loss": 0.02, "step": 56870 }, { "grad_norm": 0.34824109077453613, "learning_rate": 2.1150055043608806e-05, "loss": 0.0156, "step": 56880 }, { "grad_norm": 0.37005650997161865, "learning_rate": 2.113317672042162e-05, "loss": 0.0227, "step": 56890 }, { "grad_norm": 0.23290090262889862, "learning_rate": 2.1116303329785025e-05, "loss": 0.0125, "step": 56900 }, { "grad_norm": 0.2381458431482315, "learning_rate": 2.109943487458222e-05, "loss": 0.0186, "step": 56910 }, { "grad_norm": 0.22828292846679688, "learning_rate": 2.1082571357695574e-05, "loss": 0.0138, "step": 56920 }, { "grad_norm": 0.24127592146396637, "learning_rate": 2.1065712782006557e-05, "loss": 0.0169, "step": 56930 }, { "grad_norm": 0.15259574353694916, "learning_rate": 2.104885915039591e-05, "loss": 0.0145, "step": 56940 }, { "grad_norm": 0.24836714565753937, "learning_rate": 2.103201046574338e-05, "loss": 0.015, "step": 56950 }, { "grad_norm": 0.30737245082855225, "learning_rate": 2.1015166730928022e-05, "loss": 0.0143, "step": 56960 }, { "grad_norm": 0.28776225447654724, "learning_rate": 2.0998327948827912e-05, "loss": 0.0177, "step": 56970 }, { "grad_norm": 0.3061771094799042, "learning_rate": 2.0981494122320363e-05, "loss": 0.0194, "step": 56980 }, { "grad_norm": 0.3671952784061432, "learning_rate": 2.0964665254281822e-05, "loss": 0.0201, "step": 56990 }, { "grad_norm": 0.25964242219924927, "learning_rate": 2.094784134758784e-05, "loss": 0.0165, "step": 57000 }, { "grad_norm": 0.22240625321865082, "learning_rate": 2.0931022405113226e-05, "loss": 0.0253, "step": 57010 }, { "grad_norm": 0.2264934480190277, "learning_rate": 2.091420842973183e-05, "loss": 0.0145, "step": 57020 }, { "grad_norm": 0.23042215406894684, "learning_rate": 2.0897399424316715e-05, "loss": 0.0115, "step": 57030 }, { "grad_norm": 0.31564849615097046, "learning_rate": 2.0880595391740078e-05, "loss": 0.0189, "step": 57040 }, { "grad_norm": 0.21616677939891815, "learning_rate": 2.0863796334873277e-05, "loss": 0.0176, "step": 57050 }, { "grad_norm": 0.5232923030853271, "learning_rate": 2.08470022565868e-05, "loss": 0.0209, "step": 57060 }, { "grad_norm": 0.24772989749908447, "learning_rate": 2.0830213159750317e-05, "loss": 0.0138, "step": 57070 }, { "grad_norm": 0.1670861840248108, "learning_rate": 2.0813429047232596e-05, "loss": 0.0173, "step": 57080 }, { "grad_norm": 0.18225200474262238, "learning_rate": 2.0796649921901594e-05, "loss": 0.0173, "step": 57090 }, { "grad_norm": 0.20298832654953003, "learning_rate": 2.077987578662441e-05, "loss": 0.015, "step": 57100 }, { "grad_norm": 0.2607171833515167, "learning_rate": 2.0763106644267277e-05, "loss": 0.0169, "step": 57110 }, { "grad_norm": 0.21380355954170227, "learning_rate": 2.0746342497695607e-05, "loss": 0.009, "step": 57120 }, { "grad_norm": 0.2865377962589264, "learning_rate": 2.0729583349773886e-05, "loss": 0.0135, "step": 57130 }, { "grad_norm": 0.36169055104255676, "learning_rate": 2.0712829203365853e-05, "loss": 0.0151, "step": 57140 }, { "grad_norm": 0.1873268038034439, "learning_rate": 2.0696080061334267e-05, "loss": 0.0175, "step": 57150 }, { "grad_norm": 0.2130965143442154, "learning_rate": 2.067933592654117e-05, "loss": 0.0117, "step": 57160 }, { "grad_norm": 0.24528449773788452, "learning_rate": 2.066259680184763e-05, "loss": 0.0154, "step": 57170 }, { "grad_norm": 0.1398811936378479, "learning_rate": 2.0645862690113908e-05, "loss": 0.0114, "step": 57180 }, { "grad_norm": 0.24199837446212769, "learning_rate": 2.0629133594199436e-05, "loss": 0.0219, "step": 57190 }, { "grad_norm": 0.2527417838573456, "learning_rate": 2.0612409516962704e-05, "loss": 0.0142, "step": 57200 }, { "grad_norm": 0.30910274386405945, "learning_rate": 2.0595690461261467e-05, "loss": 0.0203, "step": 57210 }, { "grad_norm": 0.29830870032310486, "learning_rate": 2.0578976429952503e-05, "loss": 0.0158, "step": 57220 }, { "grad_norm": 0.3225257098674774, "learning_rate": 2.0562267425891802e-05, "loss": 0.0165, "step": 57230 }, { "grad_norm": 0.29740214347839355, "learning_rate": 2.0545563451934467e-05, "loss": 0.0158, "step": 57240 }, { "grad_norm": 0.27437418699264526, "learning_rate": 2.0528864510934764e-05, "loss": 0.0152, "step": 57250 }, { "grad_norm": 0.2416953146457672, "learning_rate": 2.0512170605746096e-05, "loss": 0.0244, "step": 57260 }, { "grad_norm": 0.21800106763839722, "learning_rate": 2.0495481739220963e-05, "loss": 0.019, "step": 57270 }, { "grad_norm": 0.2498297095298767, "learning_rate": 2.0478797914211045e-05, "loss": 0.0202, "step": 57280 }, { "grad_norm": 0.24289928376674652, "learning_rate": 2.046211913356716e-05, "loss": 0.0167, "step": 57290 }, { "grad_norm": 0.25550711154937744, "learning_rate": 2.0445445400139247e-05, "loss": 0.02, "step": 57300 }, { "grad_norm": 0.323922723531723, "learning_rate": 2.0428776716776405e-05, "loss": 0.0166, "step": 57310 }, { "grad_norm": 0.2055048793554306, "learning_rate": 2.0412113086326856e-05, "loss": 0.0168, "step": 57320 }, { "grad_norm": 0.19973884522914886, "learning_rate": 2.0395454511637918e-05, "loss": 0.0148, "step": 57330 }, { "grad_norm": 0.3118641674518585, "learning_rate": 2.037880099555616e-05, "loss": 0.0158, "step": 57340 }, { "grad_norm": 0.4833236038684845, "learning_rate": 2.0362152540927144e-05, "loss": 0.0166, "step": 57350 }, { "grad_norm": 0.22767958045005798, "learning_rate": 2.0345509150595666e-05, "loss": 0.0279, "step": 57360 }, { "grad_norm": 0.27692341804504395, "learning_rate": 2.0328870827405617e-05, "loss": 0.0181, "step": 57370 }, { "grad_norm": 0.21803385019302368, "learning_rate": 2.0312237574200043e-05, "loss": 0.0179, "step": 57380 }, { "grad_norm": 0.30399906635284424, "learning_rate": 2.029560939382112e-05, "loss": 0.0149, "step": 57390 }, { "grad_norm": 0.35433152318000793, "learning_rate": 2.0278986289110097e-05, "loss": 0.0152, "step": 57400 }, { "grad_norm": 0.20112034678459167, "learning_rate": 2.0262368262907484e-05, "loss": 0.0143, "step": 57410 }, { "grad_norm": 0.289887011051178, "learning_rate": 2.024575531805279e-05, "loss": 0.0184, "step": 57420 }, { "grad_norm": 1.0651376247406006, "learning_rate": 2.0229147457384735e-05, "loss": 0.0143, "step": 57430 }, { "grad_norm": 0.2985445559024811, "learning_rate": 2.0212544683741157e-05, "loss": 0.026, "step": 57440 }, { "grad_norm": 0.397720605134964, "learning_rate": 2.0195946999958976e-05, "loss": 0.0296, "step": 57450 }, { "grad_norm": 0.26934924721717834, "learning_rate": 2.017935440887434e-05, "loss": 0.0217, "step": 57460 }, { "grad_norm": 0.19041474163532257, "learning_rate": 2.0162766913322423e-05, "loss": 0.0144, "step": 57470 }, { "grad_norm": 0.1851624995470047, "learning_rate": 2.0146184516137588e-05, "loss": 0.0193, "step": 57480 }, { "grad_norm": 0.36564508080482483, "learning_rate": 2.012960722015332e-05, "loss": 0.0178, "step": 57490 }, { "grad_norm": 0.30575212836265564, "learning_rate": 2.0113035028202214e-05, "loss": 0.0223, "step": 57500 }, { "grad_norm": 0.43559253215789795, "learning_rate": 2.009646794311602e-05, "loss": 0.0198, "step": 57510 }, { "grad_norm": 0.17154620587825775, "learning_rate": 2.007990596772559e-05, "loss": 0.0173, "step": 57520 }, { "grad_norm": 0.21874107420444489, "learning_rate": 2.0063349104860923e-05, "loss": 0.0169, "step": 57530 }, { "grad_norm": 0.3384631276130676, "learning_rate": 2.0046797357351116e-05, "loss": 0.025, "step": 57540 }, { "grad_norm": 0.16292719542980194, "learning_rate": 2.0030250728024412e-05, "loss": 0.0116, "step": 57550 }, { "grad_norm": 0.3925233781337738, "learning_rate": 2.001370921970819e-05, "loss": 0.0155, "step": 57560 }, { "grad_norm": 0.20181214809417725, "learning_rate": 1.9997172835228932e-05, "loss": 0.0147, "step": 57570 }, { "grad_norm": 0.32804471254348755, "learning_rate": 1.9980641577412262e-05, "loss": 0.0172, "step": 57580 }, { "grad_norm": 0.3193826675415039, "learning_rate": 1.9964115449082925e-05, "loss": 0.0174, "step": 57590 }, { "grad_norm": 0.20984016358852386, "learning_rate": 1.9947594453064742e-05, "loss": 0.0176, "step": 57600 }, { "grad_norm": 0.5913039445877075, "learning_rate": 1.9931078592180774e-05, "loss": 0.0185, "step": 57610 }, { "grad_norm": 0.3023962378501892, "learning_rate": 1.9914567869253065e-05, "loss": 0.0184, "step": 57620 }, { "grad_norm": 0.17871196568012238, "learning_rate": 1.989806228710287e-05, "loss": 0.0228, "step": 57630 }, { "grad_norm": 0.22154535353183746, "learning_rate": 1.9881561848550555e-05, "loss": 0.0115, "step": 57640 }, { "grad_norm": 0.20993739366531372, "learning_rate": 1.9865066556415544e-05, "loss": 0.0217, "step": 57650 }, { "grad_norm": 0.20611782371997833, "learning_rate": 1.98485764135165e-05, "loss": 0.0149, "step": 57660 }, { "grad_norm": 0.19559279084205627, "learning_rate": 1.983209142267109e-05, "loss": 0.0151, "step": 57670 }, { "grad_norm": 0.195679172873497, "learning_rate": 1.9815611586696165e-05, "loss": 0.0147, "step": 57680 }, { "grad_norm": 0.33267834782600403, "learning_rate": 1.9799136908407667e-05, "loss": 0.0197, "step": 57690 }, { "grad_norm": 0.2768668234348297, "learning_rate": 1.9782667390620678e-05, "loss": 0.021, "step": 57700 }, { "grad_norm": 0.34974560141563416, "learning_rate": 1.976620303614939e-05, "loss": 0.014, "step": 57710 }, { "grad_norm": 0.2678980529308319, "learning_rate": 1.9749743847807108e-05, "loss": 0.0164, "step": 57720 }, { "grad_norm": 0.31036683917045593, "learning_rate": 1.9733289828406272e-05, "loss": 0.0133, "step": 57730 }, { "grad_norm": 0.30848342180252075, "learning_rate": 1.9716840980758382e-05, "loss": 0.0184, "step": 57740 }, { "grad_norm": 0.260006308555603, "learning_rate": 1.9700397307674134e-05, "loss": 0.0182, "step": 57750 }, { "grad_norm": 0.24014607071876526, "learning_rate": 1.968395881196328e-05, "loss": 0.0137, "step": 57760 }, { "grad_norm": 0.3501168489456177, "learning_rate": 1.966752549643473e-05, "loss": 0.016, "step": 57770 }, { "grad_norm": 0.1600036323070526, "learning_rate": 1.965109736389647e-05, "loss": 0.0165, "step": 57780 }, { "grad_norm": 0.26078370213508606, "learning_rate": 1.9634674417155645e-05, "loss": 0.0177, "step": 57790 }, { "grad_norm": 0.28530609607696533, "learning_rate": 1.9618256659018434e-05, "loss": 0.0355, "step": 57800 }, { "grad_norm": 0.3655938506126404, "learning_rate": 1.9601844092290257e-05, "loss": 0.0135, "step": 57810 }, { "grad_norm": 0.3677513897418976, "learning_rate": 1.9585436719775512e-05, "loss": 0.0157, "step": 57820 }, { "grad_norm": 0.2266538441181183, "learning_rate": 1.9569034544277793e-05, "loss": 0.0156, "step": 57830 }, { "grad_norm": 0.27410000562667847, "learning_rate": 1.9552637568599798e-05, "loss": 0.0179, "step": 57840 }, { "grad_norm": 0.2308240383863449, "learning_rate": 1.953624579554327e-05, "loss": 0.0152, "step": 57850 }, { "grad_norm": 0.13947442173957825, "learning_rate": 1.951985922790918e-05, "loss": 0.0196, "step": 57860 }, { "grad_norm": 0.2191080003976822, "learning_rate": 1.9503477868497505e-05, "loss": 0.0104, "step": 57870 }, { "grad_norm": 0.38450220227241516, "learning_rate": 1.9487101720107375e-05, "loss": 0.0206, "step": 57880 }, { "grad_norm": 0.2576761841773987, "learning_rate": 1.9470730785537032e-05, "loss": 0.0166, "step": 57890 }, { "grad_norm": 0.17699944972991943, "learning_rate": 1.9454365067583823e-05, "loss": 0.0177, "step": 57900 }, { "grad_norm": 0.16784298419952393, "learning_rate": 1.9438004569044215e-05, "loss": 0.0134, "step": 57910 }, { "grad_norm": 0.2770475447177887, "learning_rate": 1.9421649292713724e-05, "loss": 0.029, "step": 57920 }, { "grad_norm": 0.21684975922107697, "learning_rate": 1.9405299241387076e-05, "loss": 0.0164, "step": 57930 }, { "grad_norm": 0.20433910191059113, "learning_rate": 1.9388954417858007e-05, "loss": 0.0115, "step": 57940 }, { "grad_norm": 0.2993503510951996, "learning_rate": 1.9372614824919417e-05, "loss": 0.0156, "step": 57950 }, { "grad_norm": 0.2646505832672119, "learning_rate": 1.9356280465363284e-05, "loss": 0.0178, "step": 57960 }, { "grad_norm": 0.20357368886470795, "learning_rate": 1.9339951341980723e-05, "loss": 0.0168, "step": 57970 }, { "grad_norm": 0.25544196367263794, "learning_rate": 1.9323627457561916e-05, "loss": 0.017, "step": 57980 }, { "grad_norm": 0.30326780676841736, "learning_rate": 1.9307308814896198e-05, "loss": 0.0125, "step": 57990 }, { "grad_norm": 0.23972472548484802, "learning_rate": 1.9290995416771935e-05, "loss": 0.0151, "step": 58000 }, { "grad_norm": 0.1913190633058548, "learning_rate": 1.9274687265976665e-05, "loss": 0.0141, "step": 58010 }, { "grad_norm": 0.28519874811172485, "learning_rate": 1.9258384365297e-05, "loss": 0.0159, "step": 58020 }, { "grad_norm": 0.30508124828338623, "learning_rate": 1.924208671751866e-05, "loss": 0.014, "step": 58030 }, { "grad_norm": 0.28482353687286377, "learning_rate": 1.9225794325426492e-05, "loss": 0.0133, "step": 58040 }, { "grad_norm": 0.26557061076164246, "learning_rate": 1.920950719180436e-05, "loss": 0.011, "step": 58050 }, { "grad_norm": 0.32075726985931396, "learning_rate": 1.919322531943536e-05, "loss": 0.0135, "step": 58060 }, { "grad_norm": 0.3042997121810913, "learning_rate": 1.917694871110157e-05, "loss": 0.0134, "step": 58070 }, { "grad_norm": 0.28494793176651, "learning_rate": 1.9160677369584234e-05, "loss": 0.016, "step": 58080 }, { "grad_norm": 0.3982415795326233, "learning_rate": 1.9144411297663694e-05, "loss": 0.0148, "step": 58090 }, { "grad_norm": 0.3195555508136749, "learning_rate": 1.9128150498119328e-05, "loss": 0.0176, "step": 58100 }, { "grad_norm": 0.2763403356075287, "learning_rate": 1.9111894973729726e-05, "loss": 0.0145, "step": 58110 }, { "grad_norm": 0.3973529636859894, "learning_rate": 1.9095644727272454e-05, "loss": 0.0208, "step": 58120 }, { "grad_norm": 0.38909539580345154, "learning_rate": 1.907939976152429e-05, "loss": 0.0179, "step": 58130 }, { "grad_norm": 0.2997547686100006, "learning_rate": 1.906316007926101e-05, "loss": 0.0227, "step": 58140 }, { "grad_norm": 0.1642552614212036, "learning_rate": 1.904692568325755e-05, "loss": 0.0184, "step": 58150 }, { "grad_norm": 0.37501809000968933, "learning_rate": 1.9030696576287925e-05, "loss": 0.0175, "step": 58160 }, { "grad_norm": 0.35830503702163696, "learning_rate": 1.9014472761125242e-05, "loss": 0.0193, "step": 58170 }, { "grad_norm": 0.21333996951580048, "learning_rate": 1.899825424054172e-05, "loss": 0.018, "step": 58180 }, { "grad_norm": 0.20275703072547913, "learning_rate": 1.898204101730863e-05, "loss": 0.0149, "step": 58190 }, { "grad_norm": 0.24378545582294464, "learning_rate": 1.8965833094196394e-05, "loss": 0.0285, "step": 58200 }, { "grad_norm": 0.2462945282459259, "learning_rate": 1.8949630473974495e-05, "loss": 0.0146, "step": 58210 }, { "grad_norm": 0.20501121878623962, "learning_rate": 1.8933433159411517e-05, "loss": 0.0176, "step": 58220 }, { "grad_norm": 0.14606095850467682, "learning_rate": 1.891724115327514e-05, "loss": 0.0162, "step": 58230 }, { "grad_norm": 0.2443154752254486, "learning_rate": 1.8901054458332152e-05, "loss": 0.0198, "step": 58240 }, { "grad_norm": 0.22937169671058655, "learning_rate": 1.8884873077348364e-05, "loss": 0.0158, "step": 58250 }, { "grad_norm": 0.2635493576526642, "learning_rate": 1.88686970130888e-05, "loss": 0.019, "step": 58260 }, { "grad_norm": 0.1828002631664276, "learning_rate": 1.8852526268317456e-05, "loss": 0.019, "step": 58270 }, { "grad_norm": 0.4951300024986267, "learning_rate": 1.883636084579749e-05, "loss": 0.0163, "step": 58280 }, { "grad_norm": 0.2839019298553467, "learning_rate": 1.8820200748291145e-05, "loss": 0.0182, "step": 58290 }, { "grad_norm": 0.21863234043121338, "learning_rate": 1.8804045978559686e-05, "loss": 0.017, "step": 58300 }, { "grad_norm": 0.26116809248924255, "learning_rate": 1.8787896539363594e-05, "loss": 0.0168, "step": 58310 }, { "grad_norm": 0.2654030919075012, "learning_rate": 1.8771752433462298e-05, "loss": 0.0113, "step": 58320 }, { "grad_norm": 0.268850713968277, "learning_rate": 1.8755613663614447e-05, "loss": 0.019, "step": 58330 }, { "grad_norm": 0.28933581709861755, "learning_rate": 1.873948023257767e-05, "loss": 0.0144, "step": 58340 }, { "grad_norm": 0.24567866325378418, "learning_rate": 1.8723352143108747e-05, "loss": 0.0117, "step": 58350 }, { "grad_norm": 0.2822689712047577, "learning_rate": 1.870722939796352e-05, "loss": 0.0167, "step": 58360 }, { "grad_norm": 0.23362602293491364, "learning_rate": 1.869111199989693e-05, "loss": 0.0171, "step": 58370 }, { "grad_norm": 0.2221624255180359, "learning_rate": 1.8674999951663018e-05, "loss": 0.014, "step": 58380 }, { "grad_norm": 0.20625494420528412, "learning_rate": 1.8658893256014857e-05, "loss": 0.0178, "step": 58390 }, { "grad_norm": 0.2229475975036621, "learning_rate": 1.8642791915704655e-05, "loss": 0.0156, "step": 58400 }, { "grad_norm": 0.748292863368988, "learning_rate": 1.8626695933483697e-05, "loss": 0.0144, "step": 58410 }, { "grad_norm": 0.280849814414978, "learning_rate": 1.8610605312102335e-05, "loss": 0.0169, "step": 58420 }, { "grad_norm": 0.30609941482543945, "learning_rate": 1.8594520054310032e-05, "loss": 0.0212, "step": 58430 }, { "grad_norm": 0.21837757527828217, "learning_rate": 1.8578440162855326e-05, "loss": 0.0184, "step": 58440 }, { "grad_norm": 0.20024384558200836, "learning_rate": 1.8562365640485774e-05, "loss": 0.018, "step": 58450 }, { "grad_norm": 0.25274890661239624, "learning_rate": 1.854629648994815e-05, "loss": 0.0152, "step": 58460 }, { "grad_norm": 0.23111753165721893, "learning_rate": 1.8530232713988183e-05, "loss": 0.0162, "step": 58470 }, { "grad_norm": 0.41084808111190796, "learning_rate": 1.851417431535074e-05, "loss": 0.022, "step": 58480 }, { "grad_norm": 0.17114399373531342, "learning_rate": 1.8498121296779785e-05, "loss": 0.024, "step": 58490 }, { "grad_norm": 0.22915396094322205, "learning_rate": 1.848207366101829e-05, "loss": 0.0184, "step": 58500 }, { "grad_norm": 0.18062806129455566, "learning_rate": 1.8466031410808422e-05, "loss": 0.0174, "step": 58510 }, { "grad_norm": 0.314363956451416, "learning_rate": 1.844999454889129e-05, "loss": 0.0169, "step": 58520 }, { "grad_norm": 0.26531982421875, "learning_rate": 1.8433963078007237e-05, "loss": 0.0172, "step": 58530 }, { "grad_norm": 0.2229633778333664, "learning_rate": 1.841793700089554e-05, "loss": 0.0174, "step": 58540 }, { "grad_norm": 0.259864866733551, "learning_rate": 1.8401916320294644e-05, "loss": 0.0207, "step": 58550 }, { "grad_norm": 0.2969053089618683, "learning_rate": 1.838590103894205e-05, "loss": 0.0169, "step": 58560 }, { "grad_norm": 0.12308678776025772, "learning_rate": 1.8369891159574292e-05, "loss": 0.0165, "step": 58570 }, { "grad_norm": 0.26938843727111816, "learning_rate": 1.835388668492708e-05, "loss": 0.0173, "step": 58580 }, { "grad_norm": 0.3151252865791321, "learning_rate": 1.8337887617735095e-05, "loss": 0.0161, "step": 58590 }, { "grad_norm": 0.17619425058364868, "learning_rate": 1.8321893960732157e-05, "loss": 0.0126, "step": 58600 }, { "grad_norm": 0.2831664979457855, "learning_rate": 1.8305905716651138e-05, "loss": 0.017, "step": 58610 }, { "grad_norm": 0.14253973960876465, "learning_rate": 1.8289922888224e-05, "loss": 0.0155, "step": 58620 }, { "grad_norm": 0.15710392594337463, "learning_rate": 1.8273945478181765e-05, "loss": 0.0171, "step": 58630 }, { "grad_norm": 0.337625116109848, "learning_rate": 1.8257973489254558e-05, "loss": 0.0184, "step": 58640 }, { "grad_norm": 0.21297302842140198, "learning_rate": 1.824200692417152e-05, "loss": 0.0145, "step": 58650 }, { "grad_norm": 0.1754864603281021, "learning_rate": 1.8226045785660912e-05, "loss": 0.013, "step": 58660 }, { "grad_norm": 0.31101682782173157, "learning_rate": 1.821009007645006e-05, "loss": 0.0192, "step": 58670 }, { "grad_norm": 0.20883913338184357, "learning_rate": 1.8194139799265357e-05, "loss": 0.0142, "step": 58680 }, { "grad_norm": 0.3275913894176483, "learning_rate": 1.8178194956832295e-05, "loss": 0.0179, "step": 58690 }, { "grad_norm": 0.17426754534244537, "learning_rate": 1.8162255551875346e-05, "loss": 0.0161, "step": 58700 }, { "grad_norm": 0.2512127459049225, "learning_rate": 1.8146321587118194e-05, "loss": 0.0214, "step": 58710 }, { "grad_norm": 0.25505074858665466, "learning_rate": 1.8130393065283448e-05, "loss": 0.0145, "step": 58720 }, { "grad_norm": 0.1297810971736908, "learning_rate": 1.8114469989092925e-05, "loss": 0.014, "step": 58730 }, { "grad_norm": 0.29019859433174133, "learning_rate": 1.8098552361267397e-05, "loss": 0.0151, "step": 58740 }, { "grad_norm": 0.20805194973945618, "learning_rate": 1.8082640184526763e-05, "loss": 0.018, "step": 58750 }, { "grad_norm": 0.21841692924499512, "learning_rate": 1.806673346158999e-05, "loss": 0.0224, "step": 58760 }, { "grad_norm": 0.3033614456653595, "learning_rate": 1.8050832195175067e-05, "loss": 0.0185, "step": 58770 }, { "grad_norm": 0.47701868414878845, "learning_rate": 1.8034936387999136e-05, "loss": 0.0197, "step": 58780 }, { "grad_norm": 0.14640885591506958, "learning_rate": 1.8019046042778315e-05, "loss": 0.0148, "step": 58790 }, { "grad_norm": 0.268555611371994, "learning_rate": 1.800316116222785e-05, "loss": 0.0144, "step": 58800 }, { "grad_norm": 0.1869468241930008, "learning_rate": 1.7987281749062018e-05, "loss": 0.0341, "step": 58810 }, { "grad_norm": 0.2698192894458771, "learning_rate": 1.7971407805994195e-05, "loss": 0.0292, "step": 58820 }, { "grad_norm": 0.3749011158943176, "learning_rate": 1.7955539335736787e-05, "loss": 0.0189, "step": 58830 }, { "grad_norm": 0.17818321287631989, "learning_rate": 1.7939676341001304e-05, "loss": 0.0258, "step": 58840 }, { "grad_norm": 0.2565600275993347, "learning_rate": 1.7923818824498275e-05, "loss": 0.0153, "step": 58850 }, { "grad_norm": 0.23383577167987823, "learning_rate": 1.7907966788937315e-05, "loss": 0.0171, "step": 58860 }, { "grad_norm": 0.25243183970451355, "learning_rate": 1.7892120237027116e-05, "loss": 0.0228, "step": 58870 }, { "grad_norm": 0.21835891902446747, "learning_rate": 1.7876279171475413e-05, "loss": 0.0157, "step": 58880 }, { "grad_norm": 0.22495262324810028, "learning_rate": 1.7860443594989028e-05, "loss": 0.0193, "step": 58890 }, { "grad_norm": 0.13889695703983307, "learning_rate": 1.784461351027379e-05, "loss": 0.0114, "step": 58900 }, { "grad_norm": 0.3116607367992401, "learning_rate": 1.7828788920034677e-05, "loss": 0.0217, "step": 58910 }, { "grad_norm": 0.3043479919433594, "learning_rate": 1.7812969826975623e-05, "loss": 0.017, "step": 58920 }, { "grad_norm": 0.28138864040374756, "learning_rate": 1.7797156233799738e-05, "loss": 0.0156, "step": 58930 }, { "grad_norm": 0.19722431898117065, "learning_rate": 1.7781348143209094e-05, "loss": 0.0159, "step": 58940 }, { "grad_norm": 0.23955650627613068, "learning_rate": 1.7765545557904862e-05, "loss": 0.0249, "step": 58950 }, { "grad_norm": 0.2544398307800293, "learning_rate": 1.7749748480587302e-05, "loss": 0.0198, "step": 58960 }, { "grad_norm": 0.14313536882400513, "learning_rate": 1.773395691395564e-05, "loss": 0.0139, "step": 58970 }, { "grad_norm": 0.19760248064994812, "learning_rate": 1.7718170860708305e-05, "loss": 0.0127, "step": 58980 }, { "grad_norm": 0.22801950573921204, "learning_rate": 1.770239032354264e-05, "loss": 0.0126, "step": 58990 }, { "grad_norm": 0.2534565329551697, "learning_rate": 1.7686615305155124e-05, "loss": 0.0269, "step": 59000 }, { "grad_norm": 0.3016597330570221, "learning_rate": 1.767084580824128e-05, "loss": 0.0134, "step": 59010 }, { "grad_norm": 0.4355064630508423, "learning_rate": 1.765508183549569e-05, "loss": 0.0197, "step": 59020 }, { "grad_norm": 0.29299432039260864, "learning_rate": 1.763932338961199e-05, "loss": 0.0196, "step": 59030 }, { "grad_norm": 0.35205549001693726, "learning_rate": 1.762357047328284e-05, "loss": 0.0116, "step": 59040 }, { "grad_norm": 0.18841010332107544, "learning_rate": 1.7607823089199997e-05, "loss": 0.0175, "step": 59050 }, { "grad_norm": 0.19504325091838837, "learning_rate": 1.7592081240054265e-05, "loss": 0.0195, "step": 59060 }, { "grad_norm": 0.28658515214920044, "learning_rate": 1.7576344928535488e-05, "loss": 0.0177, "step": 59070 }, { "grad_norm": 0.35209906101226807, "learning_rate": 1.7560614157332573e-05, "loss": 0.0201, "step": 59080 }, { "grad_norm": 0.3098900616168976, "learning_rate": 1.7544888929133495e-05, "loss": 0.0207, "step": 59090 }, { "grad_norm": 0.19015143811702728, "learning_rate": 1.752916924662522e-05, "loss": 0.0186, "step": 59100 }, { "grad_norm": 0.17418210208415985, "learning_rate": 1.751345511249387e-05, "loss": 0.0172, "step": 59110 }, { "grad_norm": 0.17535440623760223, "learning_rate": 1.749774652942452e-05, "loss": 0.0161, "step": 59120 }, { "grad_norm": 0.300467848777771, "learning_rate": 1.748204350010135e-05, "loss": 0.0175, "step": 59130 }, { "grad_norm": 0.19680030643939972, "learning_rate": 1.7466346027207574e-05, "loss": 0.0139, "step": 59140 }, { "grad_norm": 0.24702398478984833, "learning_rate": 1.745065411342547e-05, "loss": 0.0169, "step": 59150 }, { "grad_norm": 0.2075502574443817, "learning_rate": 1.7434967761436366e-05, "loss": 0.0204, "step": 59160 }, { "grad_norm": 0.25622713565826416, "learning_rate": 1.741928697392058e-05, "loss": 0.0163, "step": 59170 }, { "grad_norm": 0.22580836713314056, "learning_rate": 1.7403611753557597e-05, "loss": 0.0139, "step": 59180 }, { "grad_norm": 0.2130891978740692, "learning_rate": 1.738794210302584e-05, "loss": 0.0112, "step": 59190 }, { "grad_norm": 0.17832854390144348, "learning_rate": 1.7372278025002837e-05, "loss": 0.014, "step": 59200 }, { "grad_norm": 0.20493900775909424, "learning_rate": 1.7356619522165164e-05, "loss": 0.0166, "step": 59210 }, { "grad_norm": 0.21596717834472656, "learning_rate": 1.7340966597188378e-05, "loss": 0.0114, "step": 59220 }, { "grad_norm": 0.2937048673629761, "learning_rate": 1.732531925274722e-05, "loss": 0.0154, "step": 59230 }, { "grad_norm": 0.31500738859176636, "learning_rate": 1.7309677491515318e-05, "loss": 0.0145, "step": 59240 }, { "grad_norm": 0.22381585836410522, "learning_rate": 1.7294041316165455e-05, "loss": 0.0122, "step": 59250 }, { "grad_norm": 0.3187341094017029, "learning_rate": 1.727841072936942e-05, "loss": 0.0188, "step": 59260 }, { "grad_norm": 0.30186328291893005, "learning_rate": 1.7262785733798058e-05, "loss": 0.0137, "step": 59270 }, { "grad_norm": 0.3049086332321167, "learning_rate": 1.7247166332121246e-05, "loss": 0.0159, "step": 59280 }, { "grad_norm": 0.20752117037773132, "learning_rate": 1.7231552527007933e-05, "loss": 0.0139, "step": 59290 }, { "grad_norm": 0.2435063272714615, "learning_rate": 1.721594432112606e-05, "loss": 0.0165, "step": 59300 }, { "grad_norm": 0.25321492552757263, "learning_rate": 1.7200341717142655e-05, "loss": 0.0195, "step": 59310 }, { "grad_norm": 0.3365323841571808, "learning_rate": 1.718474471772377e-05, "loss": 0.016, "step": 59320 }, { "grad_norm": 0.18219523131847382, "learning_rate": 1.7169153325534528e-05, "loss": 0.0161, "step": 59330 }, { "grad_norm": 0.21119263768196106, "learning_rate": 1.7153567543239045e-05, "loss": 0.0163, "step": 59340 }, { "grad_norm": 0.26628732681274414, "learning_rate": 1.7137987373500525e-05, "loss": 0.0137, "step": 59350 }, { "grad_norm": 0.13772080838680267, "learning_rate": 1.7122412818981198e-05, "loss": 0.0154, "step": 59360 }, { "grad_norm": 0.3021615743637085, "learning_rate": 1.7106843882342283e-05, "loss": 0.0195, "step": 59370 }, { "grad_norm": 0.24713531136512756, "learning_rate": 1.709128056624415e-05, "loss": 0.0169, "step": 59380 }, { "grad_norm": 0.30673500895500183, "learning_rate": 1.70757228733461e-05, "loss": 0.013, "step": 59390 }, { "grad_norm": 0.22415925562381744, "learning_rate": 1.706017080630653e-05, "loss": 0.0163, "step": 59400 }, { "grad_norm": 0.21769684553146362, "learning_rate": 1.7044624367782873e-05, "loss": 0.0119, "step": 59410 }, { "grad_norm": 0.21629270911216736, "learning_rate": 1.7029083560431553e-05, "loss": 0.0176, "step": 59420 }, { "grad_norm": 0.3243848979473114, "learning_rate": 1.701354838690813e-05, "loss": 0.0209, "step": 59430 }, { "grad_norm": 0.31618911027908325, "learning_rate": 1.6998018849867087e-05, "loss": 0.0137, "step": 59440 }, { "grad_norm": 0.20184385776519775, "learning_rate": 1.698249495196202e-05, "loss": 0.0197, "step": 59450 }, { "grad_norm": 0.24298584461212158, "learning_rate": 1.6966976695845528e-05, "loss": 0.0156, "step": 59460 }, { "grad_norm": 0.17595715820789337, "learning_rate": 1.6951464084169268e-05, "loss": 0.0133, "step": 59470 }, { "grad_norm": 0.3101097047328949, "learning_rate": 1.6935957119583916e-05, "loss": 0.0212, "step": 59480 }, { "grad_norm": 0.16983376443386078, "learning_rate": 1.6920455804739205e-05, "loss": 0.0138, "step": 59490 }, { "grad_norm": 0.21987870335578918, "learning_rate": 1.6904960142283855e-05, "loss": 0.0183, "step": 59500 }, { "grad_norm": 0.22211961448192596, "learning_rate": 1.6889470134865666e-05, "loss": 0.0143, "step": 59510 }, { "grad_norm": 0.1996040642261505, "learning_rate": 1.687398578513145e-05, "loss": 0.0162, "step": 59520 }, { "grad_norm": 0.2708384692668915, "learning_rate": 1.6858507095727066e-05, "loss": 0.019, "step": 59530 }, { "grad_norm": 0.17354217171669006, "learning_rate": 1.6843034069297403e-05, "loss": 0.0128, "step": 59540 }, { "grad_norm": 0.2832501232624054, "learning_rate": 1.682756670848637e-05, "loss": 0.018, "step": 59550 }, { "grad_norm": 0.23348546028137207, "learning_rate": 1.6812105015936937e-05, "loss": 0.0152, "step": 59560 }, { "grad_norm": 0.6770646572113037, "learning_rate": 1.6796648994291027e-05, "loss": 0.0209, "step": 59570 }, { "grad_norm": 0.24111652374267578, "learning_rate": 1.678119864618973e-05, "loss": 0.0131, "step": 59580 }, { "grad_norm": 0.2585538625717163, "learning_rate": 1.6765753974273023e-05, "loss": 0.0183, "step": 59590 }, { "grad_norm": 0.2234894335269928, "learning_rate": 1.675031498118001e-05, "loss": 0.0146, "step": 59600 }, { "grad_norm": 0.2742810845375061, "learning_rate": 1.6734881669548804e-05, "loss": 0.0129, "step": 59610 }, { "grad_norm": 0.21296089887619019, "learning_rate": 1.671945404201648e-05, "loss": 0.0194, "step": 59620 }, { "grad_norm": 0.24855419993400574, "learning_rate": 1.670403210121927e-05, "loss": 0.015, "step": 59630 }, { "grad_norm": 0.2483106553554535, "learning_rate": 1.6688615849792312e-05, "loss": 0.0115, "step": 59640 }, { "grad_norm": 0.35488566756248474, "learning_rate": 1.6673205290369832e-05, "loss": 0.0176, "step": 59650 }, { "grad_norm": 0.31423190236091614, "learning_rate": 1.6657800425585086e-05, "loss": 0.019, "step": 59660 }, { "grad_norm": 0.24618464708328247, "learning_rate": 1.6642401258070327e-05, "loss": 0.023, "step": 59670 }, { "grad_norm": 0.39205458760261536, "learning_rate": 1.6627007790456884e-05, "loss": 0.0182, "step": 59680 }, { "grad_norm": 0.09699148684740067, "learning_rate": 1.6611620025375037e-05, "loss": 0.0094, "step": 59690 }, { "grad_norm": 0.23085428774356842, "learning_rate": 1.6596237965454154e-05, "loss": 0.018, "step": 59700 }, { "grad_norm": 0.3741036355495453, "learning_rate": 1.6580861613322607e-05, "loss": 0.0168, "step": 59710 }, { "grad_norm": 0.24834458529949188, "learning_rate": 1.6565490971607796e-05, "loss": 0.011, "step": 59720 }, { "grad_norm": 0.18985946476459503, "learning_rate": 1.655012604293614e-05, "loss": 0.0159, "step": 59730 }, { "grad_norm": 0.19348037242889404, "learning_rate": 1.653476682993309e-05, "loss": 0.0198, "step": 59740 }, { "grad_norm": 0.2751774787902832, "learning_rate": 1.651941333522311e-05, "loss": 0.0163, "step": 59750 }, { "grad_norm": 0.279585599899292, "learning_rate": 1.6504065561429715e-05, "loss": 0.0148, "step": 59760 }, { "grad_norm": 0.20160315930843353, "learning_rate": 1.6488723511175385e-05, "loss": 0.0232, "step": 59770 }, { "grad_norm": 0.27908191084861755, "learning_rate": 1.6473387187081668e-05, "loss": 0.0176, "step": 59780 }, { "grad_norm": 0.2253027856349945, "learning_rate": 1.6458056591769123e-05, "loss": 0.0155, "step": 59790 }, { "grad_norm": 0.46317797899246216, "learning_rate": 1.6442731727857335e-05, "loss": 0.0188, "step": 59800 }, { "grad_norm": 0.24160878360271454, "learning_rate": 1.642741259796492e-05, "loss": 0.0132, "step": 59810 }, { "grad_norm": 0.17468327283859253, "learning_rate": 1.641209920470944e-05, "loss": 0.0126, "step": 59820 }, { "grad_norm": 0.7100775241851807, "learning_rate": 1.639679155070762e-05, "loss": 0.0165, "step": 59830 }, { "grad_norm": 0.31595298647880554, "learning_rate": 1.6381489638575048e-05, "loss": 0.0168, "step": 59840 }, { "grad_norm": 0.23547549545764923, "learning_rate": 1.636619347092643e-05, "loss": 0.0125, "step": 59850 }, { "grad_norm": 0.339608371257782, "learning_rate": 1.6350903050375476e-05, "loss": 0.0177, "step": 59860 }, { "grad_norm": 0.3159312605857849, "learning_rate": 1.6335618379534856e-05, "loss": 0.015, "step": 59870 }, { "grad_norm": 0.2813291847705841, "learning_rate": 1.6320339461016364e-05, "loss": 0.0144, "step": 59880 }, { "grad_norm": 0.30950239300727844, "learning_rate": 1.6305066297430687e-05, "loss": 0.0185, "step": 59890 }, { "grad_norm": 0.2841225266456604, "learning_rate": 1.6289798891387654e-05, "loss": 0.0204, "step": 59900 }, { "grad_norm": 0.2522485554218292, "learning_rate": 1.6274537245495995e-05, "loss": 0.012, "step": 59910 }, { "grad_norm": 0.2115926742553711, "learning_rate": 1.6259281362363527e-05, "loss": 0.0136, "step": 59920 }, { "grad_norm": 0.49458035826683044, "learning_rate": 1.6244031244597068e-05, "loss": 0.0233, "step": 59930 }, { "grad_norm": 0.3310390114784241, "learning_rate": 1.622878689480244e-05, "loss": 0.0139, "step": 59940 }, { "grad_norm": 0.22959209978580475, "learning_rate": 1.6213548315584498e-05, "loss": 0.0218, "step": 59950 }, { "grad_norm": 0.203314408659935, "learning_rate": 1.6198315509547074e-05, "loss": 0.0171, "step": 59960 }, { "grad_norm": 0.1689014434814453, "learning_rate": 1.6183088479293056e-05, "loss": 0.0133, "step": 59970 }, { "grad_norm": 0.2050919085741043, "learning_rate": 1.6167867227424316e-05, "loss": 0.0129, "step": 59980 }, { "grad_norm": 0.16156724095344543, "learning_rate": 1.615265175654176e-05, "loss": 0.0165, "step": 59990 }, { "grad_norm": 0.3368280529975891, "learning_rate": 1.613744206924529e-05, "loss": 0.0177, "step": 60000 }, { "grad_norm": 0.19286562502384186, "learning_rate": 1.6122238168133845e-05, "loss": 0.0154, "step": 60010 }, { "grad_norm": 0.19661235809326172, "learning_rate": 1.6107040055805305e-05, "loss": 0.0146, "step": 60020 }, { "grad_norm": 0.23125062882900238, "learning_rate": 1.6091847734856673e-05, "loss": 0.0156, "step": 60030 }, { "grad_norm": 0.2608181834220886, "learning_rate": 1.6076661207883865e-05, "loss": 0.0159, "step": 60040 }, { "grad_norm": 0.11890207231044769, "learning_rate": 1.6061480477481848e-05, "loss": 0.0136, "step": 60050 }, { "grad_norm": 0.33189696073532104, "learning_rate": 1.604630554624461e-05, "loss": 0.0203, "step": 60060 }, { "grad_norm": 0.3131222128868103, "learning_rate": 1.603113641676509e-05, "loss": 0.0154, "step": 60070 }, { "grad_norm": 0.1465412676334381, "learning_rate": 1.6015973091635338e-05, "loss": 0.0191, "step": 60080 }, { "grad_norm": 0.6534298658370972, "learning_rate": 1.6000815573446288e-05, "loss": 0.0168, "step": 60090 }, { "grad_norm": 0.3307473063468933, "learning_rate": 1.5985663864788002e-05, "loss": 0.0223, "step": 60100 }, { "grad_norm": 0.2744797468185425, "learning_rate": 1.597051796824946e-05, "loss": 0.0196, "step": 60110 }, { "grad_norm": 0.20329129695892334, "learning_rate": 1.5955377886418682e-05, "loss": 0.0163, "step": 60120 }, { "grad_norm": 0.20849740505218506, "learning_rate": 1.5940243621882704e-05, "loss": 0.0148, "step": 60130 }, { "grad_norm": 0.30454114079475403, "learning_rate": 1.5925115177227555e-05, "loss": 0.02, "step": 60140 }, { "grad_norm": 0.278398334980011, "learning_rate": 1.5909992555038288e-05, "loss": 0.0181, "step": 60150 }, { "grad_norm": 0.29220619797706604, "learning_rate": 1.5894875757898912e-05, "loss": 0.0234, "step": 60160 }, { "grad_norm": 0.1613129824399948, "learning_rate": 1.58797647883925e-05, "loss": 0.0165, "step": 60170 }, { "grad_norm": 0.35150405764579773, "learning_rate": 1.586465964910109e-05, "loss": 0.0167, "step": 60180 }, { "grad_norm": 0.2636636197566986, "learning_rate": 1.5849560342605734e-05, "loss": 0.0168, "step": 60190 }, { "grad_norm": 0.28724175691604614, "learning_rate": 1.583446687148651e-05, "loss": 0.0148, "step": 60200 }, { "grad_norm": 0.2526566982269287, "learning_rate": 1.581937923832248e-05, "loss": 0.0132, "step": 60210 }, { "grad_norm": 0.21869510412216187, "learning_rate": 1.5804297445691663e-05, "loss": 0.0149, "step": 60220 }, { "grad_norm": 0.2615985870361328, "learning_rate": 1.578922149617119e-05, "loss": 0.0169, "step": 60230 }, { "grad_norm": 0.17333625257015228, "learning_rate": 1.5774151392337084e-05, "loss": 0.0126, "step": 60240 }, { "grad_norm": 0.38618868589401245, "learning_rate": 1.575908713676442e-05, "loss": 0.0163, "step": 60250 }, { "grad_norm": 0.3655758500099182, "learning_rate": 1.5744028732027293e-05, "loss": 0.0183, "step": 60260 }, { "grad_norm": 0.3325079381465912, "learning_rate": 1.5728976180698723e-05, "loss": 0.0173, "step": 60270 }, { "grad_norm": 0.15750741958618164, "learning_rate": 1.5713929485350836e-05, "loss": 0.0171, "step": 60280 }, { "grad_norm": 0.17332158982753754, "learning_rate": 1.569888864855465e-05, "loss": 0.0161, "step": 60290 }, { "grad_norm": 0.1569541096687317, "learning_rate": 1.568385367288028e-05, "loss": 0.0125, "step": 60300 }, { "grad_norm": 0.29948484897613525, "learning_rate": 1.5668824560896755e-05, "loss": 0.0173, "step": 60310 }, { "grad_norm": 0.18566875159740448, "learning_rate": 1.5653801315172156e-05, "loss": 0.0132, "step": 60320 }, { "grad_norm": 0.2693571150302887, "learning_rate": 1.5638783938273554e-05, "loss": 0.0155, "step": 60330 }, { "grad_norm": 0.23561815917491913, "learning_rate": 1.5623772432766966e-05, "loss": 0.0222, "step": 60340 }, { "grad_norm": 0.23492568731307983, "learning_rate": 1.5608766801217507e-05, "loss": 0.0147, "step": 60350 }, { "grad_norm": 0.139827698469162, "learning_rate": 1.5593767046189184e-05, "loss": 0.0118, "step": 60360 }, { "grad_norm": 0.24842700362205505, "learning_rate": 1.5578773170245064e-05, "loss": 0.0126, "step": 60370 }, { "grad_norm": 0.3826378881931305, "learning_rate": 1.5563785175947182e-05, "loss": 0.0186, "step": 60380 }, { "grad_norm": 0.22889088094234467, "learning_rate": 1.5548803065856582e-05, "loss": 0.0139, "step": 60390 }, { "grad_norm": 0.47961699962615967, "learning_rate": 1.55338268425333e-05, "loss": 0.0152, "step": 60400 }, { "grad_norm": 0.24755647778511047, "learning_rate": 1.5518856508536373e-05, "loss": 0.0147, "step": 60410 }, { "grad_norm": 0.3115716874599457, "learning_rate": 1.5503892066423786e-05, "loss": 0.0161, "step": 60420 }, { "grad_norm": 0.42139533162117004, "learning_rate": 1.548893351875258e-05, "loss": 0.0244, "step": 60430 }, { "grad_norm": 0.23882359266281128, "learning_rate": 1.547398086807875e-05, "loss": 0.0134, "step": 60440 }, { "grad_norm": 0.3126414120197296, "learning_rate": 1.5459034116957305e-05, "loss": 0.0231, "step": 60450 }, { "grad_norm": 0.21557781100273132, "learning_rate": 1.544409326794225e-05, "loss": 0.0263, "step": 60460 }, { "grad_norm": 0.2659528851509094, "learning_rate": 1.542915832358651e-05, "loss": 0.0164, "step": 60470 }, { "grad_norm": 0.1581699252128601, "learning_rate": 1.541422928644214e-05, "loss": 0.0118, "step": 60480 }, { "grad_norm": 0.30739203095436096, "learning_rate": 1.5399306159060024e-05, "loss": 0.0223, "step": 60490 }, { "grad_norm": 0.20260033011436462, "learning_rate": 1.5384388943990185e-05, "loss": 0.0148, "step": 60500 }, { "grad_norm": 0.28721755743026733, "learning_rate": 1.5369477643781526e-05, "loss": 0.0135, "step": 60510 }, { "grad_norm": 0.24153581261634827, "learning_rate": 1.5354572260981985e-05, "loss": 0.0187, "step": 60520 }, { "grad_norm": 0.24488621950149536, "learning_rate": 1.533967279813851e-05, "loss": 0.0137, "step": 60530 }, { "grad_norm": 0.30823853611946106, "learning_rate": 1.5324779257796956e-05, "loss": 0.0157, "step": 60540 }, { "grad_norm": 0.27428171038627625, "learning_rate": 1.5309891642502293e-05, "loss": 0.0146, "step": 60550 }, { "grad_norm": 0.22473584115505219, "learning_rate": 1.5295009954798357e-05, "loss": 0.0126, "step": 60560 }, { "grad_norm": 0.3519964814186096, "learning_rate": 1.5280134197228036e-05, "loss": 0.0178, "step": 60570 }, { "grad_norm": 0.15554387867450714, "learning_rate": 1.526526437233319e-05, "loss": 0.0169, "step": 60580 }, { "grad_norm": 0.13066887855529785, "learning_rate": 1.525040048265466e-05, "loss": 0.0109, "step": 60590 }, { "grad_norm": 0.2105410099029541, "learning_rate": 1.5235542530732288e-05, "loss": 0.0132, "step": 60600 }, { "grad_norm": 0.1842048168182373, "learning_rate": 1.5220690519104901e-05, "loss": 0.018, "step": 60610 }, { "grad_norm": 0.20073704421520233, "learning_rate": 1.5205844450310275e-05, "loss": 0.0143, "step": 60620 }, { "grad_norm": 0.3095932900905609, "learning_rate": 1.519100432688521e-05, "loss": 0.0154, "step": 60630 }, { "grad_norm": 0.2331896424293518, "learning_rate": 1.5176170151365476e-05, "loss": 0.0157, "step": 60640 }, { "grad_norm": 0.11383923143148422, "learning_rate": 1.5161341926285833e-05, "loss": 0.0105, "step": 60650 }, { "grad_norm": 0.25481706857681274, "learning_rate": 1.5146519654180025e-05, "loss": 0.0146, "step": 60660 }, { "grad_norm": 0.23718172311782837, "learning_rate": 1.5131703337580739e-05, "loss": 0.0214, "step": 60670 }, { "grad_norm": 0.18432581424713135, "learning_rate": 1.5116892979019731e-05, "loss": 0.0198, "step": 60680 }, { "grad_norm": 0.22962726652622223, "learning_rate": 1.5102088581027623e-05, "loss": 0.0137, "step": 60690 }, { "grad_norm": 0.28625622391700745, "learning_rate": 1.5087290146134154e-05, "loss": 0.013, "step": 60700 }, { "grad_norm": 0.2187798172235489, "learning_rate": 1.5072497676867915e-05, "loss": 0.0145, "step": 60710 }, { "grad_norm": 0.17883244156837463, "learning_rate": 1.505771117575655e-05, "loss": 0.018, "step": 60720 }, { "grad_norm": 0.15419946610927582, "learning_rate": 1.5042930645326691e-05, "loss": 0.0149, "step": 60730 }, { "grad_norm": 0.2844361364841461, "learning_rate": 1.5028156088103879e-05, "loss": 0.0184, "step": 60740 }, { "grad_norm": 0.3760194182395935, "learning_rate": 1.5013387506612735e-05, "loss": 0.0202, "step": 60750 }, { "grad_norm": 0.25846704840660095, "learning_rate": 1.4998624903376767e-05, "loss": 0.0172, "step": 60760 }, { "grad_norm": 0.8210546970367432, "learning_rate": 1.4983868280918517e-05, "loss": 0.0167, "step": 60770 }, { "grad_norm": 0.32177865505218506, "learning_rate": 1.4969117641759478e-05, "loss": 0.0206, "step": 60780 }, { "grad_norm": 0.21544399857521057, "learning_rate": 1.495437298842014e-05, "loss": 0.017, "step": 60790 }, { "grad_norm": 0.24070513248443604, "learning_rate": 1.4939634323419976e-05, "loss": 0.0184, "step": 60800 }, { "grad_norm": 0.3551013767719269, "learning_rate": 1.4924901649277384e-05, "loss": 0.0207, "step": 60810 }, { "grad_norm": 0.5054852962493896, "learning_rate": 1.4910174968509793e-05, "loss": 0.0133, "step": 60820 }, { "grad_norm": 0.2193395048379898, "learning_rate": 1.4895454283633598e-05, "loss": 0.014, "step": 60830 }, { "grad_norm": 0.5687366724014282, "learning_rate": 1.488073959716415e-05, "loss": 0.0169, "step": 60840 }, { "grad_norm": 0.24904939532279968, "learning_rate": 1.4866030911615791e-05, "loss": 0.0171, "step": 60850 }, { "grad_norm": 0.17867602407932281, "learning_rate": 1.4851328229501849e-05, "loss": 0.0136, "step": 60860 }, { "grad_norm": 0.21215641498565674, "learning_rate": 1.4836631553334562e-05, "loss": 0.0118, "step": 60870 }, { "grad_norm": 0.16745232045650482, "learning_rate": 1.4821940885625251e-05, "loss": 0.013, "step": 60880 }, { "grad_norm": 0.32069405913352966, "learning_rate": 1.4807256228884109e-05, "loss": 0.0147, "step": 60890 }, { "grad_norm": 0.19197364151477814, "learning_rate": 1.4792577585620353e-05, "loss": 0.0157, "step": 60900 }, { "grad_norm": 0.2285783886909485, "learning_rate": 1.4777904958342164e-05, "loss": 0.015, "step": 60910 }, { "grad_norm": 0.214666485786438, "learning_rate": 1.4763238349556691e-05, "loss": 0.0109, "step": 60920 }, { "grad_norm": 0.23270903527736664, "learning_rate": 1.4748577761770072e-05, "loss": 0.018, "step": 60930 }, { "grad_norm": 0.17910635471343994, "learning_rate": 1.4733923197487354e-05, "loss": 0.0126, "step": 60940 }, { "grad_norm": 0.23362784087657928, "learning_rate": 1.4719274659212662e-05, "loss": 0.0173, "step": 60950 }, { "grad_norm": 0.16895435750484467, "learning_rate": 1.4704632149448983e-05, "loss": 0.0165, "step": 60960 }, { "grad_norm": 0.1562226265668869, "learning_rate": 1.4689995670698343e-05, "loss": 0.0153, "step": 60970 }, { "grad_norm": 0.24590140581130981, "learning_rate": 1.4675365225461728e-05, "loss": 0.0173, "step": 60980 }, { "grad_norm": 0.2882270812988281, "learning_rate": 1.4660740816239032e-05, "loss": 0.0124, "step": 60990 }, { "grad_norm": 0.29224473237991333, "learning_rate": 1.464612244552923e-05, "loss": 0.014, "step": 61000 }, { "grad_norm": 0.19099706411361694, "learning_rate": 1.4631510115830161e-05, "loss": 0.0128, "step": 61010 }, { "grad_norm": 0.24866212904453278, "learning_rate": 1.4616903829638679e-05, "loss": 0.0145, "step": 61020 }, { "grad_norm": 0.18667161464691162, "learning_rate": 1.46023035894506e-05, "loss": 0.0113, "step": 61030 }, { "grad_norm": 0.3576802909374237, "learning_rate": 1.4587709397760713e-05, "loss": 0.0174, "step": 61040 }, { "grad_norm": 0.22127686440944672, "learning_rate": 1.4573121257062755e-05, "loss": 0.0154, "step": 61050 }, { "grad_norm": 0.48633527755737305, "learning_rate": 1.4558539169849472e-05, "loss": 0.025, "step": 61060 }, { "grad_norm": 0.32600465416908264, "learning_rate": 1.4543963138612499e-05, "loss": 0.0147, "step": 61070 }, { "grad_norm": 0.20058076083660126, "learning_rate": 1.4529393165842498e-05, "loss": 0.0124, "step": 61080 }, { "grad_norm": 0.2640115022659302, "learning_rate": 1.4514829254029084e-05, "loss": 0.014, "step": 61090 }, { "grad_norm": 0.16662465035915375, "learning_rate": 1.450027140566083e-05, "loss": 0.012, "step": 61100 }, { "grad_norm": 0.2595897614955902, "learning_rate": 1.4485719623225268e-05, "loss": 0.0159, "step": 61110 }, { "grad_norm": 0.1617364138364792, "learning_rate": 1.4471173909208912e-05, "loss": 0.0161, "step": 61120 }, { "grad_norm": 0.2668445110321045, "learning_rate": 1.4456634266097236e-05, "loss": 0.0201, "step": 61130 }, { "grad_norm": 0.2362057864665985, "learning_rate": 1.444210069637461e-05, "loss": 0.0195, "step": 61140 }, { "grad_norm": 0.4168497622013092, "learning_rate": 1.4427573202524502e-05, "loss": 0.0133, "step": 61150 }, { "grad_norm": 0.18463683128356934, "learning_rate": 1.4413051787029208e-05, "loss": 0.0212, "step": 61160 }, { "grad_norm": 0.21803659200668335, "learning_rate": 1.439853645237006e-05, "loss": 0.0195, "step": 61170 }, { "grad_norm": 0.1866220384836197, "learning_rate": 1.4384027201027344e-05, "loss": 0.0144, "step": 61180 }, { "grad_norm": 0.15156573057174683, "learning_rate": 1.4369524035480253e-05, "loss": 0.0139, "step": 61190 }, { "grad_norm": 0.4128952920436859, "learning_rate": 1.4355026958207035e-05, "loss": 0.0146, "step": 61200 }, { "grad_norm": 0.25445353984832764, "learning_rate": 1.434053597168481e-05, "loss": 0.0153, "step": 61210 }, { "grad_norm": 0.43243175745010376, "learning_rate": 1.432605107838969e-05, "loss": 0.0202, "step": 61220 }, { "grad_norm": 0.2198060005903244, "learning_rate": 1.4311572280796759e-05, "loss": 0.0135, "step": 61230 }, { "grad_norm": 0.38899630308151245, "learning_rate": 1.4297099581380047e-05, "loss": 0.0225, "step": 61240 }, { "grad_norm": 0.4022720456123352, "learning_rate": 1.4282632982612538e-05, "loss": 0.0203, "step": 61250 }, { "grad_norm": 0.1858191192150116, "learning_rate": 1.4268172486966197e-05, "loss": 0.0191, "step": 61260 }, { "grad_norm": 0.2504400610923767, "learning_rate": 1.4253718096911894e-05, "loss": 0.0195, "step": 61270 }, { "grad_norm": 0.294328898191452, "learning_rate": 1.4239269814919504e-05, "loss": 0.0242, "step": 61280 }, { "grad_norm": 0.22045914828777313, "learning_rate": 1.4224827643457843e-05, "loss": 0.0202, "step": 61290 }, { "grad_norm": 0.1947035938501358, "learning_rate": 1.4210391584994686e-05, "loss": 0.014, "step": 61300 }, { "grad_norm": 0.19626092910766602, "learning_rate": 1.4195961641996763e-05, "loss": 0.0136, "step": 61310 }, { "grad_norm": 0.23063217103481293, "learning_rate": 1.4181537816929751e-05, "loss": 0.0159, "step": 61320 }, { "grad_norm": 0.2755553126335144, "learning_rate": 1.4167120112258303e-05, "loss": 0.0199, "step": 61330 }, { "grad_norm": 0.17889048159122467, "learning_rate": 1.4152708530445963e-05, "loss": 0.0134, "step": 61340 }, { "grad_norm": 0.16246236860752106, "learning_rate": 1.4138303073955344e-05, "loss": 0.0165, "step": 61350 }, { "grad_norm": 0.22505106031894684, "learning_rate": 1.4123903745247897e-05, "loss": 0.0156, "step": 61360 }, { "grad_norm": 0.12761914730072021, "learning_rate": 1.4109510546784082e-05, "loss": 0.011, "step": 61370 }, { "grad_norm": 0.32800930738449097, "learning_rate": 1.4095123481023325e-05, "loss": 0.0212, "step": 61380 }, { "grad_norm": 0.1679249256849289, "learning_rate": 1.4080742550423936e-05, "loss": 0.0127, "step": 61390 }, { "grad_norm": 0.13777247071266174, "learning_rate": 1.4066367757443278e-05, "loss": 0.0182, "step": 61400 }, { "grad_norm": 0.25293275713920593, "learning_rate": 1.4051999104537572e-05, "loss": 0.0116, "step": 61410 }, { "grad_norm": 0.3258253335952759, "learning_rate": 1.4037636594162034e-05, "loss": 0.0161, "step": 61420 }, { "grad_norm": 0.2447868436574936, "learning_rate": 1.402328022877083e-05, "loss": 0.0097, "step": 61430 }, { "grad_norm": 0.18679961562156677, "learning_rate": 1.4008930010817073e-05, "loss": 0.0206, "step": 61440 }, { "grad_norm": 0.2898402512073517, "learning_rate": 1.3994585942752831e-05, "loss": 0.0114, "step": 61450 }, { "grad_norm": 0.45495426654815674, "learning_rate": 1.398024802702909e-05, "loss": 0.0153, "step": 61460 }, { "grad_norm": 0.3734300136566162, "learning_rate": 1.3965916266095819e-05, "loss": 0.0178, "step": 61470 }, { "grad_norm": 0.3183586597442627, "learning_rate": 1.395159066240192e-05, "loss": 0.014, "step": 61480 }, { "grad_norm": 0.24954423308372498, "learning_rate": 1.3937271218395248e-05, "loss": 0.01, "step": 61490 }, { "grad_norm": 0.2380307912826538, "learning_rate": 1.3922957936522613e-05, "loss": 0.02, "step": 61500 }, { "grad_norm": 0.3125859498977661, "learning_rate": 1.3908650819229758e-05, "loss": 0.0158, "step": 61510 }, { "grad_norm": 0.285072386264801, "learning_rate": 1.389434986896137e-05, "loss": 0.0131, "step": 61520 }, { "grad_norm": 0.19163237512111664, "learning_rate": 1.388005508816112e-05, "loss": 0.0109, "step": 61530 }, { "grad_norm": 0.23854447901248932, "learning_rate": 1.3865766479271557e-05, "loss": 0.0182, "step": 61540 }, { "grad_norm": 0.18760012090206146, "learning_rate": 1.3851484044734225e-05, "loss": 0.0179, "step": 61550 }, { "grad_norm": 0.20184451341629028, "learning_rate": 1.3837207786989608e-05, "loss": 0.026, "step": 61560 }, { "grad_norm": 0.30570340156555176, "learning_rate": 1.382293770847713e-05, "loss": 0.0158, "step": 61570 }, { "grad_norm": 0.27238261699676514, "learning_rate": 1.3808673811635159e-05, "loss": 0.0138, "step": 61580 }, { "grad_norm": 0.25777381658554077, "learning_rate": 1.3794416098900975e-05, "loss": 0.0185, "step": 61590 }, { "grad_norm": 0.3582482933998108, "learning_rate": 1.3780164572710886e-05, "loss": 0.0242, "step": 61600 }, { "grad_norm": 0.23365141451358795, "learning_rate": 1.3765919235500035e-05, "loss": 0.0181, "step": 61610 }, { "grad_norm": 0.15020571649074554, "learning_rate": 1.375168008970259e-05, "loss": 0.0151, "step": 61620 }, { "grad_norm": 0.2459876984357834, "learning_rate": 1.3737447137751635e-05, "loss": 0.0152, "step": 61630 }, { "grad_norm": 0.24595576524734497, "learning_rate": 1.372322038207915e-05, "loss": 0.011, "step": 61640 }, { "grad_norm": 0.2895495593547821, "learning_rate": 1.3708999825116159e-05, "loss": 0.0209, "step": 61650 }, { "grad_norm": 0.3204560875892639, "learning_rate": 1.3694785469292526e-05, "loss": 0.0227, "step": 61660 }, { "grad_norm": 0.2354067862033844, "learning_rate": 1.3680577317037101e-05, "loss": 0.0168, "step": 61670 }, { "grad_norm": 0.2656663954257965, "learning_rate": 1.3666375370777678e-05, "loss": 0.0143, "step": 61680 }, { "grad_norm": 0.27080217003822327, "learning_rate": 1.3652179632940981e-05, "loss": 0.016, "step": 61690 }, { "grad_norm": 0.19394448399543762, "learning_rate": 1.3637990105952664e-05, "loss": 0.0172, "step": 61700 }, { "grad_norm": 0.2655796408653259, "learning_rate": 1.3623806792237337e-05, "loss": 0.0149, "step": 61710 }, { "grad_norm": 0.3465355336666107, "learning_rate": 1.3609629694218551e-05, "loss": 0.0136, "step": 61720 }, { "grad_norm": 0.27636846899986267, "learning_rate": 1.3595458814318762e-05, "loss": 0.0122, "step": 61730 }, { "grad_norm": 0.23081353306770325, "learning_rate": 1.3581294154959389e-05, "loss": 0.0171, "step": 61740 }, { "grad_norm": 0.3244650959968567, "learning_rate": 1.3567135718560792e-05, "loss": 0.0159, "step": 61750 }, { "grad_norm": 0.17501060664653778, "learning_rate": 1.3552983507542261e-05, "loss": 0.0138, "step": 61760 }, { "grad_norm": 0.4855952560901642, "learning_rate": 1.3538837524322023e-05, "loss": 0.0209, "step": 61770 }, { "grad_norm": 0.10728301107883453, "learning_rate": 1.3524697771317251e-05, "loss": 0.016, "step": 61780 }, { "grad_norm": 0.10721270740032196, "learning_rate": 1.3510564250943997e-05, "loss": 0.0157, "step": 61790 }, { "grad_norm": 0.19868266582489014, "learning_rate": 1.3496436965617353e-05, "loss": 0.0159, "step": 61800 }, { "grad_norm": 0.3578619062900543, "learning_rate": 1.3482315917751243e-05, "loss": 0.0189, "step": 61810 }, { "grad_norm": 0.25460314750671387, "learning_rate": 1.3468201109758582e-05, "loss": 0.0181, "step": 61820 }, { "grad_norm": 0.44027936458587646, "learning_rate": 1.3454092544051222e-05, "loss": 0.0143, "step": 61830 }, { "grad_norm": 0.3292066156864166, "learning_rate": 1.3439990223039878e-05, "loss": 0.0134, "step": 61840 }, { "grad_norm": 0.19455184042453766, "learning_rate": 1.3425894149134321e-05, "loss": 0.012, "step": 61850 }, { "grad_norm": 0.2747335433959961, "learning_rate": 1.3411804324743132e-05, "loss": 0.0203, "step": 61860 }, { "grad_norm": 0.20481398701667786, "learning_rate": 1.3397720752273896e-05, "loss": 0.0146, "step": 61870 }, { "grad_norm": 0.730950117111206, "learning_rate": 1.3383643434133108e-05, "loss": 0.0138, "step": 61880 }, { "grad_norm": 0.4618913233280182, "learning_rate": 1.3369572372726196e-05, "loss": 0.0158, "step": 61890 }, { "grad_norm": 0.1985127031803131, "learning_rate": 1.3355507570457525e-05, "loss": 0.0138, "step": 61900 }, { "grad_norm": 0.20694342255592346, "learning_rate": 1.334144902973038e-05, "loss": 0.0125, "step": 61910 }, { "grad_norm": 0.2557481527328491, "learning_rate": 1.3327396752946997e-05, "loss": 0.0139, "step": 61920 }, { "grad_norm": 0.2066998928785324, "learning_rate": 1.3313350742508496e-05, "loss": 0.0176, "step": 61930 }, { "grad_norm": 0.22457745671272278, "learning_rate": 1.3299311000814973e-05, "loss": 0.0219, "step": 61940 }, { "grad_norm": 0.11392556130886078, "learning_rate": 1.3285277530265432e-05, "loss": 0.0175, "step": 61950 }, { "grad_norm": 0.26429471373558044, "learning_rate": 1.3271250333257813e-05, "loss": 0.0132, "step": 61960 }, { "grad_norm": 0.42522281408309937, "learning_rate": 1.3257229412188977e-05, "loss": 0.0103, "step": 61970 }, { "grad_norm": 0.20122182369232178, "learning_rate": 1.3243214769454726e-05, "loss": 0.0174, "step": 61980 }, { "grad_norm": 0.13854749500751495, "learning_rate": 1.3229206407449751e-05, "loss": 0.011, "step": 61990 }, { "grad_norm": 0.2914721965789795, "learning_rate": 1.3215204328567742e-05, "loss": 0.012, "step": 62000 }, { "grad_norm": 0.34181883931159973, "learning_rate": 1.3201208535201232e-05, "loss": 0.0131, "step": 62010 }, { "grad_norm": 0.142320916056633, "learning_rate": 1.3187219029741737e-05, "loss": 0.0153, "step": 62020 }, { "grad_norm": 0.29394733905792236, "learning_rate": 1.3173235814579693e-05, "loss": 0.0116, "step": 62030 }, { "grad_norm": 0.22753556072711945, "learning_rate": 1.3159258892104398e-05, "loss": 0.0166, "step": 62040 }, { "grad_norm": 0.20011398196220398, "learning_rate": 1.3145288264704198e-05, "loss": 0.0179, "step": 62050 }, { "grad_norm": 0.7009416222572327, "learning_rate": 1.3131323934766237e-05, "loss": 0.0162, "step": 62060 }, { "grad_norm": 0.1748904436826706, "learning_rate": 1.311736590467666e-05, "loss": 0.0136, "step": 62070 }, { "grad_norm": 0.16377995908260345, "learning_rate": 1.3103414176820505e-05, "loss": 0.0174, "step": 62080 }, { "grad_norm": 0.26088178157806396, "learning_rate": 1.308946875358174e-05, "loss": 0.0264, "step": 62090 }, { "grad_norm": 0.26207074522972107, "learning_rate": 1.3075529637343276e-05, "loss": 0.0202, "step": 62100 }, { "grad_norm": 0.2598508298397064, "learning_rate": 1.3061596830486883e-05, "loss": 0.0178, "step": 62110 }, { "grad_norm": 0.2782786190509796, "learning_rate": 1.3047670335393353e-05, "loss": 0.0128, "step": 62120 }, { "grad_norm": 0.6918319463729858, "learning_rate": 1.3033750154442298e-05, "loss": 0.0173, "step": 62130 }, { "grad_norm": 0.16742223501205444, "learning_rate": 1.3019836290012316e-05, "loss": 0.0136, "step": 62140 }, { "grad_norm": 0.18271291255950928, "learning_rate": 1.30059287444809e-05, "loss": 0.0157, "step": 62150 }, { "grad_norm": 0.2860631048679352, "learning_rate": 1.299202752022447e-05, "loss": 0.0257, "step": 62160 }, { "grad_norm": 0.20554567873477936, "learning_rate": 1.2978132619618371e-05, "loss": 0.0133, "step": 62170 }, { "grad_norm": 0.25064119696617126, "learning_rate": 1.2964244045036866e-05, "loss": 0.0216, "step": 62180 }, { "grad_norm": 0.1295599341392517, "learning_rate": 1.295036179885311e-05, "loss": 0.0124, "step": 62190 }, { "grad_norm": 0.1399931013584137, "learning_rate": 1.293648588343922e-05, "loss": 0.0157, "step": 62200 }, { "grad_norm": 0.2826724946498871, "learning_rate": 1.2922616301166196e-05, "loss": 0.0157, "step": 62210 }, { "grad_norm": 0.2725164592266083, "learning_rate": 1.2908753054403976e-05, "loss": 0.0121, "step": 62220 }, { "grad_norm": 0.22884616255760193, "learning_rate": 1.2894896145521429e-05, "loss": 0.013, "step": 62230 }, { "grad_norm": 0.2522304654121399, "learning_rate": 1.2881045576886275e-05, "loss": 0.0161, "step": 62240 }, { "grad_norm": 0.1762387454509735, "learning_rate": 1.2867201350865254e-05, "loss": 0.0153, "step": 62250 }, { "grad_norm": 0.21833737194538116, "learning_rate": 1.2853363469823914e-05, "loss": 0.0151, "step": 62260 }, { "grad_norm": 0.24771668016910553, "learning_rate": 1.283953193612682e-05, "loss": 0.0152, "step": 62270 }, { "grad_norm": 0.2301015406847, "learning_rate": 1.2825706752137372e-05, "loss": 0.0154, "step": 62280 }, { "grad_norm": 0.23860320448875427, "learning_rate": 1.2811887920217896e-05, "loss": 0.018, "step": 62290 }, { "grad_norm": 0.16365276277065277, "learning_rate": 1.2798075442729707e-05, "loss": 0.0149, "step": 62300 }, { "grad_norm": 0.23618267476558685, "learning_rate": 1.2784269322032922e-05, "loss": 0.0144, "step": 62310 }, { "grad_norm": 0.12585705518722534, "learning_rate": 1.2770469560486686e-05, "loss": 0.0163, "step": 62320 }, { "grad_norm": 0.15619462728500366, "learning_rate": 1.2756676160448956e-05, "loss": 0.0129, "step": 62330 }, { "grad_norm": 0.2220771312713623, "learning_rate": 1.2742889124276663e-05, "loss": 0.0154, "step": 62340 }, { "grad_norm": 0.33755478262901306, "learning_rate": 1.2729108454325639e-05, "loss": 0.0185, "step": 62350 }, { "grad_norm": 0.207758367061615, "learning_rate": 1.2715334152950614e-05, "loss": 0.0179, "step": 62360 }, { "grad_norm": 0.21526935696601868, "learning_rate": 1.2701566222505246e-05, "loss": 0.0122, "step": 62370 }, { "grad_norm": 0.3360205590724945, "learning_rate": 1.2687804665342107e-05, "loss": 0.0175, "step": 62380 }, { "grad_norm": 0.1800515055656433, "learning_rate": 1.267404948381265e-05, "loss": 0.0141, "step": 62390 }, { "grad_norm": 0.18628381192684174, "learning_rate": 1.2660300680267267e-05, "loss": 0.0116, "step": 62400 }, { "grad_norm": 0.38468897342681885, "learning_rate": 1.2646558257055257e-05, "loss": 0.0156, "step": 62410 }, { "grad_norm": 0.17042715847492218, "learning_rate": 1.263282221652482e-05, "loss": 0.0094, "step": 62420 }, { "grad_norm": 0.15895873308181763, "learning_rate": 1.2619092561023088e-05, "loss": 0.0139, "step": 62430 }, { "grad_norm": 0.15760751068592072, "learning_rate": 1.2605369292896036e-05, "loss": 0.0144, "step": 62440 }, { "grad_norm": 0.1210014820098877, "learning_rate": 1.2591652414488658e-05, "loss": 0.0154, "step": 62450 }, { "grad_norm": 0.2123420685529709, "learning_rate": 1.2577941928144732e-05, "loss": 0.0151, "step": 62460 }, { "grad_norm": 0.3992132842540741, "learning_rate": 1.256423783620706e-05, "loss": 0.0136, "step": 62470 }, { "grad_norm": 0.23640652000904083, "learning_rate": 1.2550540141017264e-05, "loss": 0.0161, "step": 62480 }, { "grad_norm": 0.17729713022708893, "learning_rate": 1.253684884491591e-05, "loss": 0.0116, "step": 62490 }, { "grad_norm": 0.29996374249458313, "learning_rate": 1.2523163950242483e-05, "loss": 0.0121, "step": 62500 }, { "grad_norm": 0.3395889699459076, "learning_rate": 1.2509485459335313e-05, "loss": 0.0145, "step": 62510 }, { "grad_norm": 0.19647471606731415, "learning_rate": 1.2495813374531739e-05, "loss": 0.0175, "step": 62520 }, { "grad_norm": 0.39869633316993713, "learning_rate": 1.2482147698167907e-05, "loss": 0.0125, "step": 62530 }, { "grad_norm": 0.19468064606189728, "learning_rate": 1.2468488432578911e-05, "loss": 0.0121, "step": 62540 }, { "grad_norm": 0.2873178720474243, "learning_rate": 1.2454835580098761e-05, "loss": 0.0168, "step": 62550 }, { "grad_norm": 0.17939789593219757, "learning_rate": 1.2441189143060338e-05, "loss": 0.0146, "step": 62560 }, { "grad_norm": 0.24182292819023132, "learning_rate": 1.242754912379548e-05, "loss": 0.0141, "step": 62570 }, { "grad_norm": 0.2128472626209259, "learning_rate": 1.2413915524634844e-05, "loss": 0.0278, "step": 62580 }, { "grad_norm": 0.22058913111686707, "learning_rate": 1.2400288347908073e-05, "loss": 0.0178, "step": 62590 }, { "grad_norm": 0.20009161531925201, "learning_rate": 1.2386667595943663e-05, "loss": 0.0167, "step": 62600 }, { "grad_norm": 0.24288976192474365, "learning_rate": 1.2373053271069035e-05, "loss": 0.0128, "step": 62610 }, { "grad_norm": 0.20397019386291504, "learning_rate": 1.2359445375610501e-05, "loss": 0.0126, "step": 62620 }, { "grad_norm": 0.20960265398025513, "learning_rate": 1.2345843911893301e-05, "loss": 0.013, "step": 62630 }, { "grad_norm": 0.2841585874557495, "learning_rate": 1.2332248882241498e-05, "loss": 0.026, "step": 62640 }, { "grad_norm": 0.26101118326187134, "learning_rate": 1.2318660288978178e-05, "loss": 0.0153, "step": 62650 }, { "grad_norm": 0.19161681830883026, "learning_rate": 1.2305078134425213e-05, "loss": 0.0147, "step": 62660 }, { "grad_norm": 0.13511259853839874, "learning_rate": 1.2291502420903434e-05, "loss": 0.0162, "step": 62670 }, { "grad_norm": 0.2647046744823456, "learning_rate": 1.2277933150732567e-05, "loss": 0.0142, "step": 62680 }, { "grad_norm": 0.16351951658725739, "learning_rate": 1.2264370326231216e-05, "loss": 0.0154, "step": 62690 }, { "grad_norm": 0.1673157662153244, "learning_rate": 1.2250813949716927e-05, "loss": 0.0169, "step": 62700 }, { "grad_norm": 0.17648717761039734, "learning_rate": 1.2237264023506063e-05, "loss": 0.014, "step": 62710 }, { "grad_norm": 0.14495864510536194, "learning_rate": 1.2223720549913987e-05, "loss": 0.0139, "step": 62720 }, { "grad_norm": 0.4774649739265442, "learning_rate": 1.221018353125487e-05, "loss": 0.0152, "step": 62730 }, { "grad_norm": 0.16417081654071808, "learning_rate": 1.2196652969841837e-05, "loss": 0.0169, "step": 62740 }, { "grad_norm": 0.2996702790260315, "learning_rate": 1.2183128867986904e-05, "loss": 0.0161, "step": 62750 }, { "grad_norm": 0.20158003270626068, "learning_rate": 1.2169611228000927e-05, "loss": 0.0121, "step": 62760 }, { "grad_norm": 0.21077953279018402, "learning_rate": 1.2156100052193752e-05, "loss": 0.0151, "step": 62770 }, { "grad_norm": 0.24642314016819, "learning_rate": 1.214259534287403e-05, "loss": 0.0115, "step": 62780 }, { "grad_norm": 0.12450749427080154, "learning_rate": 1.2129097102349363e-05, "loss": 0.0154, "step": 62790 }, { "grad_norm": 0.17385296523571014, "learning_rate": 1.2115605332926227e-05, "loss": 0.0137, "step": 62800 }, { "grad_norm": 0.1629948616027832, "learning_rate": 1.2102120036910003e-05, "loss": 0.0228, "step": 62810 }, { "grad_norm": 0.2174239307641983, "learning_rate": 1.2088641216604956e-05, "loss": 0.0166, "step": 62820 }, { "grad_norm": 0.14715011417865753, "learning_rate": 1.2075168874314264e-05, "loss": 0.0159, "step": 62830 }, { "grad_norm": 0.16116595268249512, "learning_rate": 1.2061703012339942e-05, "loss": 0.0133, "step": 62840 }, { "grad_norm": 0.2764643728733063, "learning_rate": 1.204824363298297e-05, "loss": 0.0158, "step": 62850 }, { "grad_norm": 0.22772563993930817, "learning_rate": 1.2034790738543173e-05, "loss": 0.0157, "step": 62860 }, { "grad_norm": 0.19100576639175415, "learning_rate": 1.2021344331319284e-05, "loss": 0.0142, "step": 62870 }, { "grad_norm": 0.16941215097904205, "learning_rate": 1.200790441360894e-05, "loss": 0.0181, "step": 62880 }, { "grad_norm": 0.16957125067710876, "learning_rate": 1.199447098770864e-05, "loss": 0.0124, "step": 62890 }, { "grad_norm": 0.25580140948295593, "learning_rate": 1.198104405591381e-05, "loss": 0.0275, "step": 62900 }, { "grad_norm": 0.19946305453777313, "learning_rate": 1.1967623620518697e-05, "loss": 0.0137, "step": 62910 }, { "grad_norm": 0.321251779794693, "learning_rate": 1.1954209683816554e-05, "loss": 0.0166, "step": 62920 }, { "grad_norm": 0.14461703598499298, "learning_rate": 1.1940802248099402e-05, "loss": 0.0142, "step": 62930 }, { "grad_norm": 0.27110233902931213, "learning_rate": 1.1927401315658232e-05, "loss": 0.0172, "step": 62940 }, { "grad_norm": 0.3087044358253479, "learning_rate": 1.1914006888782898e-05, "loss": 0.0176, "step": 62950 }, { "grad_norm": 0.2776985168457031, "learning_rate": 1.1900618969762106e-05, "loss": 0.016, "step": 62960 }, { "grad_norm": 0.25357431173324585, "learning_rate": 1.188723756088354e-05, "loss": 0.0126, "step": 62970 }, { "grad_norm": 0.22182102501392365, "learning_rate": 1.1873862664433672e-05, "loss": 0.0137, "step": 62980 }, { "grad_norm": 0.2048182338476181, "learning_rate": 1.1860494282697927e-05, "loss": 0.0124, "step": 62990 }, { "grad_norm": 0.22208037972450256, "learning_rate": 1.1847132417960588e-05, "loss": 0.0125, "step": 63000 }, { "grad_norm": 0.406464546918869, "learning_rate": 1.1833777072504832e-05, "loss": 0.019, "step": 63010 }, { "grad_norm": 0.21081244945526123, "learning_rate": 1.1820428248612731e-05, "loss": 0.0122, "step": 63020 }, { "grad_norm": 0.38263434171676636, "learning_rate": 1.1807085948565245e-05, "loss": 0.0162, "step": 63030 }, { "grad_norm": 0.18156185746192932, "learning_rate": 1.1793750174642172e-05, "loss": 0.0115, "step": 63040 }, { "grad_norm": 0.25202903151512146, "learning_rate": 1.1780420929122254e-05, "loss": 0.0155, "step": 63050 }, { "grad_norm": 0.265959233045578, "learning_rate": 1.176709821428309e-05, "loss": 0.017, "step": 63060 }, { "grad_norm": 0.36458420753479004, "learning_rate": 1.1753782032401173e-05, "loss": 0.0194, "step": 63070 }, { "grad_norm": 0.3363581597805023, "learning_rate": 1.1740472385751866e-05, "loss": 0.0194, "step": 63080 }, { "grad_norm": 0.17060990631580353, "learning_rate": 1.172716927660943e-05, "loss": 0.0122, "step": 63090 }, { "grad_norm": 0.30806538462638855, "learning_rate": 1.1713872707247015e-05, "loss": 0.0173, "step": 63100 }, { "grad_norm": 0.18666042387485504, "learning_rate": 1.1700582679936595e-05, "loss": 0.0171, "step": 63110 }, { "grad_norm": 0.2685381770133972, "learning_rate": 1.1687299196949136e-05, "loss": 0.0211, "step": 63120 }, { "grad_norm": 0.46923890709877014, "learning_rate": 1.1674022260554374e-05, "loss": 0.019, "step": 63130 }, { "grad_norm": 0.2027440369129181, "learning_rate": 1.1660751873020987e-05, "loss": 0.0158, "step": 63140 }, { "grad_norm": 0.3509024381637573, "learning_rate": 1.1647488036616538e-05, "loss": 0.0159, "step": 63150 }, { "grad_norm": 0.14981424808502197, "learning_rate": 1.1634230753607417e-05, "loss": 0.0186, "step": 63160 }, { "grad_norm": 0.18108049035072327, "learning_rate": 1.1620980026258982e-05, "loss": 0.0159, "step": 63170 }, { "grad_norm": 0.21784517168998718, "learning_rate": 1.1607735856835373e-05, "loss": 0.0118, "step": 63180 }, { "grad_norm": 0.17385676503181458, "learning_rate": 1.1594498247599677e-05, "loss": 0.0122, "step": 63190 }, { "grad_norm": 0.23552198708057404, "learning_rate": 1.158126720081384e-05, "loss": 0.0188, "step": 63200 }, { "grad_norm": 0.31650036573410034, "learning_rate": 1.156804271873868e-05, "loss": 0.0167, "step": 63210 }, { "grad_norm": 0.28212228417396545, "learning_rate": 1.1554824803633924e-05, "loss": 0.0154, "step": 63220 }, { "grad_norm": 0.3055218756198883, "learning_rate": 1.154161345775811e-05, "loss": 0.013, "step": 63230 }, { "grad_norm": 0.17884665727615356, "learning_rate": 1.1528408683368724e-05, "loss": 0.0161, "step": 63240 }, { "grad_norm": 0.325111985206604, "learning_rate": 1.1515210482722088e-05, "loss": 0.0169, "step": 63250 }, { "grad_norm": 0.1289576292037964, "learning_rate": 1.150201885807342e-05, "loss": 0.0211, "step": 63260 }, { "grad_norm": 0.23338784277439117, "learning_rate": 1.1488833811676807e-05, "loss": 0.0135, "step": 63270 }, { "grad_norm": 0.20459690690040588, "learning_rate": 1.1475655345785213e-05, "loss": 0.0159, "step": 63280 }, { "grad_norm": 0.1409185528755188, "learning_rate": 1.1462483462650481e-05, "loss": 0.0134, "step": 63290 }, { "grad_norm": 0.17651623487472534, "learning_rate": 1.1449318164523331e-05, "loss": 0.0174, "step": 63300 }, { "grad_norm": 0.34035369753837585, "learning_rate": 1.1436159453653334e-05, "loss": 0.0145, "step": 63310 }, { "grad_norm": 0.28601494431495667, "learning_rate": 1.1423007332288955e-05, "loss": 0.0163, "step": 63320 }, { "grad_norm": 0.2818712294101715, "learning_rate": 1.1409861802677546e-05, "loss": 0.0148, "step": 63330 }, { "grad_norm": 0.3566758334636688, "learning_rate": 1.1396722867065313e-05, "loss": 0.0144, "step": 63340 }, { "grad_norm": 0.24881155788898468, "learning_rate": 1.1383590527697352e-05, "loss": 0.0177, "step": 63350 }, { "grad_norm": 0.20713774859905243, "learning_rate": 1.137046478681758e-05, "loss": 0.0125, "step": 63360 }, { "grad_norm": 0.21750925481319427, "learning_rate": 1.1357345646668888e-05, "loss": 0.028, "step": 63370 }, { "grad_norm": 0.2769322395324707, "learning_rate": 1.1344233109492924e-05, "loss": 0.0165, "step": 63380 }, { "grad_norm": 0.2217826247215271, "learning_rate": 1.1331127177530292e-05, "loss": 0.0143, "step": 63390 }, { "grad_norm": 0.21500150859355927, "learning_rate": 1.1318027853020441e-05, "loss": 0.0115, "step": 63400 }, { "grad_norm": 0.9364979863166809, "learning_rate": 1.1304935138201645e-05, "loss": 0.0141, "step": 63410 }, { "grad_norm": 0.20433299243450165, "learning_rate": 1.1291849035311153e-05, "loss": 0.0144, "step": 63420 }, { "grad_norm": 0.26266297698020935, "learning_rate": 1.1278769546584972e-05, "loss": 0.0155, "step": 63430 }, { "grad_norm": 0.25714707374572754, "learning_rate": 1.1265696674258052e-05, "loss": 0.0118, "step": 63440 }, { "grad_norm": 0.17980021238327026, "learning_rate": 1.1252630420564186e-05, "loss": 0.0161, "step": 63450 }, { "grad_norm": 0.8736885190010071, "learning_rate": 1.1239570787736036e-05, "loss": 0.0202, "step": 63460 }, { "grad_norm": 0.4379744529724121, "learning_rate": 1.1226517778005135e-05, "loss": 0.022, "step": 63470 }, { "grad_norm": 0.5189601182937622, "learning_rate": 1.1213471393601893e-05, "loss": 0.0128, "step": 63480 }, { "grad_norm": 0.4011845588684082, "learning_rate": 1.1200431636755587e-05, "loss": 0.0213, "step": 63490 }, { "grad_norm": 0.3575383424758911, "learning_rate": 1.1187398509694336e-05, "loss": 0.0175, "step": 63500 }, { "grad_norm": 0.17087523639202118, "learning_rate": 1.1174372014645146e-05, "loss": 0.0125, "step": 63510 }, { "grad_norm": 0.22849948704242706, "learning_rate": 1.1161352153833899e-05, "loss": 0.0202, "step": 63520 }, { "grad_norm": 0.1802576333284378, "learning_rate": 1.1148338929485325e-05, "loss": 0.0128, "step": 63530 }, { "grad_norm": 0.17830555140972137, "learning_rate": 1.113533234382304e-05, "loss": 0.0152, "step": 63540 }, { "grad_norm": 0.2209617644548416, "learning_rate": 1.1122332399069513e-05, "loss": 0.0155, "step": 63550 }, { "grad_norm": 0.26134875416755676, "learning_rate": 1.1109339097446047e-05, "loss": 0.0191, "step": 63560 }, { "grad_norm": 0.29380542039871216, "learning_rate": 1.1096352441172897e-05, "loss": 0.0173, "step": 63570 }, { "grad_norm": 0.2794063985347748, "learning_rate": 1.1083372432469086e-05, "loss": 0.0122, "step": 63580 }, { "grad_norm": 0.1852661669254303, "learning_rate": 1.107039907355255e-05, "loss": 0.0199, "step": 63590 }, { "grad_norm": 0.2829529345035553, "learning_rate": 1.1057432366640103e-05, "loss": 0.0121, "step": 63600 }, { "grad_norm": 0.3096056282520294, "learning_rate": 1.1044472313947352e-05, "loss": 0.0203, "step": 63610 }, { "grad_norm": 0.31927186250686646, "learning_rate": 1.1031518917688877e-05, "loss": 0.0193, "step": 63620 }, { "grad_norm": 0.7968941330909729, "learning_rate": 1.1018572180078007e-05, "loss": 0.0238, "step": 63630 }, { "grad_norm": 0.23438580334186554, "learning_rate": 1.1005632103327018e-05, "loss": 0.0119, "step": 63640 }, { "grad_norm": 0.2161823958158493, "learning_rate": 1.0992698689646996e-05, "loss": 0.0124, "step": 63650 }, { "grad_norm": 0.20099611580371857, "learning_rate": 1.0979771941247919e-05, "loss": 0.0174, "step": 63660 }, { "grad_norm": 0.2050066739320755, "learning_rate": 1.0966851860338611e-05, "loss": 0.0158, "step": 63670 }, { "grad_norm": 0.14898614585399628, "learning_rate": 1.0953938449126766e-05, "loss": 0.0153, "step": 63680 }, { "grad_norm": 0.4449041485786438, "learning_rate": 1.0941031709818933e-05, "loss": 0.0215, "step": 63690 }, { "grad_norm": 0.1980358064174652, "learning_rate": 1.0928131644620509e-05, "loss": 0.0229, "step": 63700 }, { "grad_norm": 0.2154635488986969, "learning_rate": 1.0915238255735766e-05, "loss": 0.013, "step": 63710 }, { "grad_norm": 0.15268753468990326, "learning_rate": 1.0902351545367833e-05, "loss": 0.0137, "step": 63720 }, { "grad_norm": 0.2505949139595032, "learning_rate": 1.0889471515718702e-05, "loss": 0.0151, "step": 63730 }, { "grad_norm": 0.16859154403209686, "learning_rate": 1.087659816898921e-05, "loss": 0.0124, "step": 63740 }, { "grad_norm": 0.2760348320007324, "learning_rate": 1.0863731507379082e-05, "loss": 0.014, "step": 63750 }, { "grad_norm": 0.2697637379169464, "learning_rate": 1.0850871533086827e-05, "loss": 0.0144, "step": 63760 }, { "grad_norm": 0.29865992069244385, "learning_rate": 1.0838018248309927e-05, "loss": 0.0121, "step": 63770 }, { "grad_norm": 0.38408970832824707, "learning_rate": 1.0825171655244615e-05, "loss": 0.0178, "step": 63780 }, { "grad_norm": 0.2418876588344574, "learning_rate": 1.0812331756086025e-05, "loss": 0.0161, "step": 63790 }, { "grad_norm": 0.2713901102542877, "learning_rate": 1.079949855302817e-05, "loss": 0.0129, "step": 63800 }, { "grad_norm": 0.29413747787475586, "learning_rate": 1.0786672048263852e-05, "loss": 0.0172, "step": 63810 }, { "grad_norm": 0.18135708570480347, "learning_rate": 1.0773852243984817e-05, "loss": 0.0192, "step": 63820 }, { "grad_norm": 0.27975350618362427, "learning_rate": 1.0761039142381586e-05, "loss": 0.0155, "step": 63830 }, { "grad_norm": 0.22565288841724396, "learning_rate": 1.0748232745643577e-05, "loss": 0.0165, "step": 63840 }, { "grad_norm": 0.21656033396720886, "learning_rate": 1.0735433055959055e-05, "loss": 0.016, "step": 63850 }, { "grad_norm": 0.23874889314174652, "learning_rate": 1.0722640075515133e-05, "loss": 0.0143, "step": 63860 }, { "grad_norm": 0.15307791531085968, "learning_rate": 1.0709853806497795e-05, "loss": 0.0119, "step": 63870 }, { "grad_norm": 0.7672516703605652, "learning_rate": 1.0697074251091831e-05, "loss": 0.019, "step": 63880 }, { "grad_norm": 0.19006824493408203, "learning_rate": 1.0684301411480962e-05, "loss": 0.0163, "step": 63890 }, { "grad_norm": 0.20483629405498505, "learning_rate": 1.067153528984769e-05, "loss": 0.0122, "step": 63900 }, { "grad_norm": 0.3590266704559326, "learning_rate": 1.0658775888373395e-05, "loss": 0.0159, "step": 63910 }, { "grad_norm": 0.18545745313167572, "learning_rate": 1.0646023209238314e-05, "loss": 0.0154, "step": 63920 }, { "grad_norm": 0.1767590194940567, "learning_rate": 1.0633277254621537e-05, "loss": 0.0122, "step": 63930 }, { "grad_norm": 0.17156188189983368, "learning_rate": 1.0620538026700994e-05, "loss": 0.0146, "step": 63940 }, { "grad_norm": 0.2985459566116333, "learning_rate": 1.0607805527653486e-05, "loss": 0.0157, "step": 63950 }, { "grad_norm": 0.4676505923271179, "learning_rate": 1.059507975965462e-05, "loss": 0.0176, "step": 63960 }, { "grad_norm": 0.19910773634910583, "learning_rate": 1.0582360724878898e-05, "loss": 0.0124, "step": 63970 }, { "grad_norm": 0.2432989627122879, "learning_rate": 1.0569648425499651e-05, "loss": 0.0163, "step": 63980 }, { "grad_norm": 0.1593886911869049, "learning_rate": 1.0556942863689063e-05, "loss": 0.0141, "step": 63990 }, { "grad_norm": 0.3003239035606384, "learning_rate": 1.054424404161819e-05, "loss": 0.0128, "step": 64000 }, { "grad_norm": 0.18893501162528992, "learning_rate": 1.053155196145686e-05, "loss": 0.014, "step": 64010 }, { "grad_norm": 0.3009240925312042, "learning_rate": 1.0518866625373863e-05, "loss": 0.0123, "step": 64020 }, { "grad_norm": 0.11827653646469116, "learning_rate": 1.0506188035536735e-05, "loss": 0.0214, "step": 64030 }, { "grad_norm": 0.14695824682712555, "learning_rate": 1.0493516194111919e-05, "loss": 0.0181, "step": 64040 }, { "grad_norm": 0.23790119588375092, "learning_rate": 1.0480851103264688e-05, "loss": 0.0131, "step": 64050 }, { "grad_norm": 0.8017176389694214, "learning_rate": 1.046819276515913e-05, "loss": 0.0166, "step": 64060 }, { "grad_norm": 0.3655679225921631, "learning_rate": 1.0455541181958256e-05, "loss": 0.016, "step": 64070 }, { "grad_norm": 0.40753114223480225, "learning_rate": 1.0442896355823822e-05, "loss": 0.0152, "step": 64080 }, { "grad_norm": 0.23284050822257996, "learning_rate": 1.0430258288916539e-05, "loss": 0.0133, "step": 64090 }, { "grad_norm": 0.12760469317436218, "learning_rate": 1.0417626983395868e-05, "loss": 0.0155, "step": 64100 }, { "grad_norm": 0.244886115193367, "learning_rate": 1.0405002441420165e-05, "loss": 0.0122, "step": 64110 }, { "grad_norm": 0.2251352220773697, "learning_rate": 1.039238466514662e-05, "loss": 0.0147, "step": 64120 }, { "grad_norm": 0.1775483787059784, "learning_rate": 1.0379773656731262e-05, "loss": 0.0168, "step": 64130 }, { "grad_norm": 0.24359896779060364, "learning_rate": 1.0367169418328986e-05, "loss": 0.0139, "step": 64140 }, { "grad_norm": 0.4013436436653137, "learning_rate": 1.0354571952093484e-05, "loss": 0.0169, "step": 64150 }, { "grad_norm": 0.22990800440311432, "learning_rate": 1.034198126017733e-05, "loss": 0.0175, "step": 64160 }, { "grad_norm": 0.22690507769584656, "learning_rate": 1.0329397344731928e-05, "loss": 0.0144, "step": 64170 }, { "grad_norm": 0.2745170593261719, "learning_rate": 1.0316820207907524e-05, "loss": 0.0182, "step": 64180 }, { "grad_norm": 0.22636710107326508, "learning_rate": 1.0304249851853215e-05, "loss": 0.0146, "step": 64190 }, { "grad_norm": 0.29162654280662537, "learning_rate": 1.0291686278716933e-05, "loss": 0.0127, "step": 64200 }, { "grad_norm": 0.24790549278259277, "learning_rate": 1.0279129490645418e-05, "loss": 0.0137, "step": 64210 }, { "grad_norm": 0.21525946259498596, "learning_rate": 1.0266579489784328e-05, "loss": 0.0157, "step": 64220 }, { "grad_norm": 0.2283935695886612, "learning_rate": 1.0254036278278084e-05, "loss": 0.0147, "step": 64230 }, { "grad_norm": 0.18267953395843506, "learning_rate": 1.0241499858269982e-05, "loss": 0.0109, "step": 64240 }, { "grad_norm": 0.24379301071166992, "learning_rate": 1.0228970231902169e-05, "loss": 0.0127, "step": 64250 }, { "grad_norm": 0.3061056435108185, "learning_rate": 1.0216447401315582e-05, "loss": 0.0131, "step": 64260 }, { "grad_norm": 0.2644636929035187, "learning_rate": 1.020393136865007e-05, "loss": 0.0118, "step": 64270 }, { "grad_norm": 0.35003983974456787, "learning_rate": 1.0191422136044242e-05, "loss": 0.02, "step": 64280 }, { "grad_norm": 0.20120814442634583, "learning_rate": 1.017891970563563e-05, "loss": 0.0136, "step": 64290 }, { "grad_norm": 0.19256623089313507, "learning_rate": 1.0166424079560516e-05, "loss": 0.0151, "step": 64300 }, { "grad_norm": 0.23697501420974731, "learning_rate": 1.0153935259954078e-05, "loss": 0.0121, "step": 64310 }, { "grad_norm": 0.1959504783153534, "learning_rate": 1.0141453248950311e-05, "loss": 0.0168, "step": 64320 }, { "grad_norm": 0.2148093432188034, "learning_rate": 1.0128978048682054e-05, "loss": 0.0162, "step": 64330 }, { "grad_norm": 0.3438093066215515, "learning_rate": 1.0116509661280982e-05, "loss": 0.014, "step": 64340 }, { "grad_norm": 0.19249507784843445, "learning_rate": 1.0104048088877576e-05, "loss": 0.0119, "step": 64350 }, { "grad_norm": 0.2867085635662079, "learning_rate": 1.0091593333601201e-05, "loss": 0.0154, "step": 64360 }, { "grad_norm": 0.3374037444591522, "learning_rate": 1.0079145397580031e-05, "loss": 0.0192, "step": 64370 }, { "grad_norm": 0.4166593849658966, "learning_rate": 1.006670428294107e-05, "loss": 0.0203, "step": 64380 }, { "grad_norm": 0.22328147292137146, "learning_rate": 1.0054269991810166e-05, "loss": 0.0175, "step": 64390 }, { "grad_norm": 0.1739894300699234, "learning_rate": 1.0041842526312024e-05, "loss": 0.0128, "step": 64400 }, { "grad_norm": 0.2506529986858368, "learning_rate": 1.0029421888570101e-05, "loss": 0.0122, "step": 64410 }, { "grad_norm": 0.26622405648231506, "learning_rate": 1.0017008080706813e-05, "loss": 0.0229, "step": 64420 }, { "grad_norm": 0.32341691851615906, "learning_rate": 1.0004601104843287e-05, "loss": 0.0136, "step": 64430 }, { "grad_norm": 0.4513513445854187, "learning_rate": 9.992200963099562e-06, "loss": 0.0204, "step": 64440 }, { "grad_norm": 0.1742604821920395, "learning_rate": 9.979807657594486e-06, "loss": 0.0173, "step": 64450 }, { "grad_norm": 0.24366886913776398, "learning_rate": 9.967421190445703e-06, "loss": 0.0128, "step": 64460 }, { "grad_norm": 0.1797311007976532, "learning_rate": 9.955041563769769e-06, "loss": 0.0144, "step": 64470 }, { "grad_norm": 0.9497100710868835, "learning_rate": 9.942668779681974e-06, "loss": 0.0138, "step": 64480 }, { "grad_norm": 0.2485690414905548, "learning_rate": 9.930302840296541e-06, "loss": 0.0153, "step": 64490 }, { "grad_norm": 0.15375515818595886, "learning_rate": 9.917943747726426e-06, "loss": 0.0149, "step": 64500 }, { "grad_norm": 1.0117107629776, "learning_rate": 9.905591504083484e-06, "loss": 0.0137, "step": 64510 }, { "grad_norm": 0.24269895255565643, "learning_rate": 9.893246111478382e-06, "loss": 0.0157, "step": 64520 }, { "grad_norm": 0.3853413760662079, "learning_rate": 9.88090757202057e-06, "loss": 0.0138, "step": 64530 }, { "grad_norm": 0.29365816712379456, "learning_rate": 9.868575887818421e-06, "loss": 0.0228, "step": 64540 }, { "grad_norm": 0.14419636130332947, "learning_rate": 9.856251060979044e-06, "loss": 0.0121, "step": 64550 }, { "grad_norm": 0.44468382000923157, "learning_rate": 9.843933093608426e-06, "loss": 0.0174, "step": 64560 }, { "grad_norm": 0.22450408339500427, "learning_rate": 9.831621987811368e-06, "loss": 0.0175, "step": 64570 }, { "grad_norm": 0.250994473695755, "learning_rate": 9.819317745691509e-06, "loss": 0.0122, "step": 64580 }, { "grad_norm": 0.24555328488349915, "learning_rate": 9.8070203693513e-06, "loss": 0.0129, "step": 64590 }, { "grad_norm": 0.23089955747127533, "learning_rate": 9.794729860892048e-06, "loss": 0.0133, "step": 64600 }, { "grad_norm": 0.20137707889080048, "learning_rate": 9.782446222413827e-06, "loss": 0.0186, "step": 64610 }, { "grad_norm": 0.8007503747940063, "learning_rate": 9.770169456015598e-06, "loss": 0.019, "step": 64620 }, { "grad_norm": 0.12799914181232452, "learning_rate": 9.75789956379512e-06, "loss": 0.0127, "step": 64630 }, { "grad_norm": 0.26006996631622314, "learning_rate": 9.74563654784898e-06, "loss": 0.0143, "step": 64640 }, { "grad_norm": 0.27669763565063477, "learning_rate": 9.733380410272596e-06, "loss": 0.0138, "step": 64650 }, { "grad_norm": 0.24868707358837128, "learning_rate": 9.721131153160207e-06, "loss": 0.013, "step": 64660 }, { "grad_norm": 0.17851032316684723, "learning_rate": 9.708888778604886e-06, "loss": 0.0105, "step": 64670 }, { "grad_norm": 0.17965948581695557, "learning_rate": 9.696653288698488e-06, "loss": 0.0126, "step": 64680 }, { "grad_norm": 0.24547260999679565, "learning_rate": 9.684424685531763e-06, "loss": 0.0167, "step": 64690 }, { "grad_norm": 0.24038618803024292, "learning_rate": 9.672202971194216e-06, "loss": 0.0211, "step": 64700 }, { "grad_norm": 0.09851238131523132, "learning_rate": 9.659988147774213e-06, "loss": 0.0154, "step": 64710 }, { "grad_norm": 0.24438953399658203, "learning_rate": 9.647780217358942e-06, "loss": 0.0132, "step": 64720 }, { "grad_norm": 0.19311223924160004, "learning_rate": 9.635579182034376e-06, "loss": 0.011, "step": 64730 }, { "grad_norm": 0.3228036165237427, "learning_rate": 9.623385043885386e-06, "loss": 0.0215, "step": 64740 }, { "grad_norm": 0.18280629813671112, "learning_rate": 9.61119780499557e-06, "loss": 0.0109, "step": 64750 }, { "grad_norm": 0.18092131614685059, "learning_rate": 9.599017467447418e-06, "loss": 0.0107, "step": 64760 }, { "grad_norm": 0.29427820444107056, "learning_rate": 9.586844033322206e-06, "loss": 0.0111, "step": 64770 }, { "grad_norm": 0.35099199414253235, "learning_rate": 9.574677504700052e-06, "loss": 0.0192, "step": 64780 }, { "grad_norm": 0.1902250349521637, "learning_rate": 9.562517883659877e-06, "loss": 0.0185, "step": 64790 }, { "grad_norm": 0.23634383082389832, "learning_rate": 9.55036517227944e-06, "loss": 0.014, "step": 64800 }, { "grad_norm": 0.23064087331295013, "learning_rate": 9.538219372635282e-06, "loss": 0.0126, "step": 64810 }, { "grad_norm": 0.29326915740966797, "learning_rate": 9.526080486802802e-06, "loss": 0.0156, "step": 64820 }, { "grad_norm": 0.19596168398857117, "learning_rate": 9.513948516856203e-06, "loss": 0.013, "step": 64830 }, { "grad_norm": 0.1336105763912201, "learning_rate": 9.501823464868504e-06, "loss": 0.0101, "step": 64840 }, { "grad_norm": 0.16222035884857178, "learning_rate": 9.489705332911547e-06, "loss": 0.0118, "step": 64850 }, { "grad_norm": 0.37515270709991455, "learning_rate": 9.477594123055994e-06, "loss": 0.0137, "step": 64860 }, { "grad_norm": 0.2407342940568924, "learning_rate": 9.465489837371321e-06, "loss": 0.0225, "step": 64870 }, { "grad_norm": 0.22065845131874084, "learning_rate": 9.453392477925794e-06, "loss": 0.0131, "step": 64880 }, { "grad_norm": 0.20914039015769958, "learning_rate": 9.441302046786566e-06, "loss": 0.0152, "step": 64890 }, { "grad_norm": 0.33302927017211914, "learning_rate": 9.429218546019519e-06, "loss": 0.0156, "step": 64900 }, { "grad_norm": 0.39023613929748535, "learning_rate": 9.41714197768941e-06, "loss": 0.0207, "step": 64910 }, { "grad_norm": 0.19365426898002625, "learning_rate": 9.405072343859805e-06, "loss": 0.0123, "step": 64920 }, { "grad_norm": 0.3153393268585205, "learning_rate": 9.393009646593043e-06, "loss": 0.0146, "step": 64930 }, { "grad_norm": 0.2627466320991516, "learning_rate": 9.38095388795035e-06, "loss": 0.0145, "step": 64940 }, { "grad_norm": 0.4131721556186676, "learning_rate": 9.36890506999169e-06, "loss": 0.0139, "step": 64950 }, { "grad_norm": 0.154349222779274, "learning_rate": 9.356863194775894e-06, "loss": 0.0132, "step": 64960 }, { "grad_norm": 0.2925010919570923, "learning_rate": 9.344828264360583e-06, "loss": 0.0145, "step": 64970 }, { "grad_norm": 0.2585504949092865, "learning_rate": 9.332800280802201e-06, "loss": 0.0164, "step": 64980 }, { "grad_norm": 0.13130800426006317, "learning_rate": 9.32077924615602e-06, "loss": 0.0115, "step": 64990 }, { "grad_norm": 0.2204546481370926, "learning_rate": 9.308765162476063e-06, "loss": 0.0137, "step": 65000 }, { "grad_norm": 0.21769239008426666, "learning_rate": 9.296758031815239e-06, "loss": 0.0188, "step": 65010 }, { "grad_norm": 0.34817543625831604, "learning_rate": 9.284757856225229e-06, "loss": 0.012, "step": 65020 }, { "grad_norm": 0.19210000336170197, "learning_rate": 9.272764637756538e-06, "loss": 0.0146, "step": 65030 }, { "grad_norm": 0.28259947896003723, "learning_rate": 9.260778378458479e-06, "loss": 0.0175, "step": 65040 }, { "grad_norm": 0.1987917125225067, "learning_rate": 9.248799080379172e-06, "loss": 0.0143, "step": 65050 }, { "grad_norm": 0.24357344210147858, "learning_rate": 9.236826745565558e-06, "loss": 0.0193, "step": 65060 }, { "grad_norm": 0.1924048662185669, "learning_rate": 9.224861376063388e-06, "loss": 0.0182, "step": 65070 }, { "grad_norm": 0.1753280609846115, "learning_rate": 9.212902973917192e-06, "loss": 0.0145, "step": 65080 }, { "grad_norm": 0.18377472460269928, "learning_rate": 9.20095154117035e-06, "loss": 0.0161, "step": 65090 }, { "grad_norm": 0.2507942020893097, "learning_rate": 9.189007079865036e-06, "loss": 0.0165, "step": 65100 }, { "grad_norm": 0.1696626842021942, "learning_rate": 9.177069592042226e-06, "loss": 0.0141, "step": 65110 }, { "grad_norm": 0.22809535264968872, "learning_rate": 9.165139079741724e-06, "loss": 0.0172, "step": 65120 }, { "grad_norm": 0.3275577127933502, "learning_rate": 9.153215545002098e-06, "loss": 0.0199, "step": 65130 }, { "grad_norm": 0.19220365583896637, "learning_rate": 9.141298989860798e-06, "loss": 0.0183, "step": 65140 }, { "grad_norm": 0.0706908106803894, "learning_rate": 9.129389416353994e-06, "loss": 0.0112, "step": 65150 }, { "grad_norm": 0.30669787526130676, "learning_rate": 9.11748682651673e-06, "loss": 0.0111, "step": 65160 }, { "grad_norm": 0.22318719327449799, "learning_rate": 9.105591222382837e-06, "loss": 0.0145, "step": 65170 }, { "grad_norm": 0.21343766152858734, "learning_rate": 9.093702605984915e-06, "loss": 0.0139, "step": 65180 }, { "grad_norm": 0.25809112191200256, "learning_rate": 9.081820979354455e-06, "loss": 0.0131, "step": 65190 }, { "grad_norm": 0.29064857959747314, "learning_rate": 9.069946344521663e-06, "loss": 0.0135, "step": 65200 }, { "grad_norm": 0.21691365540027618, "learning_rate": 9.058078703515598e-06, "loss": 0.0118, "step": 65210 }, { "grad_norm": 0.3256058096885681, "learning_rate": 9.046218058364125e-06, "loss": 0.02, "step": 65220 }, { "grad_norm": 0.4211615324020386, "learning_rate": 9.034364411093893e-06, "loss": 0.0146, "step": 65230 }, { "grad_norm": 0.15400640666484833, "learning_rate": 9.022517763730371e-06, "loss": 0.0156, "step": 65240 }, { "grad_norm": 0.25315341353416443, "learning_rate": 9.010678118297827e-06, "loss": 0.0162, "step": 65250 }, { "grad_norm": 0.24325481057167053, "learning_rate": 8.998845476819345e-06, "loss": 0.0156, "step": 65260 }, { "grad_norm": 0.2661627233028412, "learning_rate": 8.987019841316773e-06, "loss": 0.0143, "step": 65270 }, { "grad_norm": 0.1932230144739151, "learning_rate": 8.975201213810802e-06, "loss": 0.0103, "step": 65280 }, { "grad_norm": 0.23400259017944336, "learning_rate": 8.963389596320915e-06, "loss": 0.014, "step": 65290 }, { "grad_norm": 0.1948838084936142, "learning_rate": 8.951584990865391e-06, "loss": 0.0134, "step": 65300 }, { "grad_norm": 0.22074902057647705, "learning_rate": 8.939787399461319e-06, "loss": 0.0158, "step": 65310 }, { "grad_norm": 0.4620567858219147, "learning_rate": 8.927996824124591e-06, "loss": 0.0155, "step": 65320 }, { "grad_norm": 0.3163760304450989, "learning_rate": 8.916213266869854e-06, "loss": 0.0167, "step": 65330 }, { "grad_norm": 0.17160119116306305, "learning_rate": 8.904436729710658e-06, "loss": 0.0141, "step": 65340 }, { "grad_norm": 0.25397971272468567, "learning_rate": 8.892667214659245e-06, "loss": 0.0107, "step": 65350 }, { "grad_norm": 0.21090228855609894, "learning_rate": 8.880904723726713e-06, "loss": 0.0121, "step": 65360 }, { "grad_norm": 0.24372300505638123, "learning_rate": 8.869149258922971e-06, "loss": 0.0129, "step": 65370 }, { "grad_norm": 0.347212553024292, "learning_rate": 8.857400822256662e-06, "loss": 0.0127, "step": 65380 }, { "grad_norm": 0.12829051911830902, "learning_rate": 8.845659415735324e-06, "loss": 0.0178, "step": 65390 }, { "grad_norm": 0.1960497945547104, "learning_rate": 8.83392504136521e-06, "loss": 0.0161, "step": 65400 }, { "grad_norm": 0.11710423976182938, "learning_rate": 8.822197701151407e-06, "loss": 0.0147, "step": 65410 }, { "grad_norm": 0.2034565955400467, "learning_rate": 8.810477397097804e-06, "loss": 0.0102, "step": 65420 }, { "grad_norm": 0.15702645480632782, "learning_rate": 8.798764131207077e-06, "loss": 0.0155, "step": 65430 }, { "grad_norm": 0.25273141264915466, "learning_rate": 8.787057905480706e-06, "loss": 0.0151, "step": 65440 }, { "grad_norm": 0.21443995833396912, "learning_rate": 8.775358721918958e-06, "loss": 0.0125, "step": 65450 }, { "grad_norm": 0.37302127480506897, "learning_rate": 8.763666582520923e-06, "loss": 0.0148, "step": 65460 }, { "grad_norm": 0.249351367354393, "learning_rate": 8.751981489284445e-06, "loss": 0.0136, "step": 65470 }, { "grad_norm": 0.23207764327526093, "learning_rate": 8.740303444206188e-06, "loss": 0.0132, "step": 65480 }, { "grad_norm": 0.11612141132354736, "learning_rate": 8.72863244928162e-06, "loss": 0.011, "step": 65490 }, { "grad_norm": 0.19741877913475037, "learning_rate": 8.716968506504991e-06, "loss": 0.019, "step": 65500 }, { "grad_norm": 0.8042680025100708, "learning_rate": 8.70531161786935e-06, "loss": 0.0116, "step": 65510 }, { "grad_norm": 0.21878664195537567, "learning_rate": 8.693661785366558e-06, "loss": 0.0225, "step": 65520 }, { "grad_norm": 0.15638281404972076, "learning_rate": 8.682019010987208e-06, "loss": 0.0208, "step": 65530 }, { "grad_norm": 0.16833345592021942, "learning_rate": 8.670383296720786e-06, "loss": 0.01, "step": 65540 }, { "grad_norm": 0.28703126311302185, "learning_rate": 8.658754644555478e-06, "loss": 0.011, "step": 65550 }, { "grad_norm": 0.19357898831367493, "learning_rate": 8.647133056478313e-06, "loss": 0.016, "step": 65560 }, { "grad_norm": 0.6387572288513184, "learning_rate": 8.635518534475123e-06, "loss": 0.0171, "step": 65570 }, { "grad_norm": 0.1817665696144104, "learning_rate": 8.623911080530467e-06, "loss": 0.0168, "step": 65580 }, { "grad_norm": 0.2649601995944977, "learning_rate": 8.61231069662779e-06, "loss": 0.0138, "step": 65590 }, { "grad_norm": 0.14567923545837402, "learning_rate": 8.600717384749252e-06, "loss": 0.012, "step": 65600 }, { "grad_norm": 0.3202335834503174, "learning_rate": 8.58913114687584e-06, "loss": 0.0135, "step": 65610 }, { "grad_norm": 0.3593510389328003, "learning_rate": 8.57755198498732e-06, "loss": 0.0188, "step": 65620 }, { "grad_norm": 0.16583265364170074, "learning_rate": 8.565979901062265e-06, "loss": 0.0223, "step": 65630 }, { "grad_norm": 0.17012204229831696, "learning_rate": 8.554414897078033e-06, "loss": 0.0143, "step": 65640 }, { "grad_norm": 0.1544560045003891, "learning_rate": 8.542856975010727e-06, "loss": 0.0114, "step": 65650 }, { "grad_norm": 0.17851325869560242, "learning_rate": 8.531306136835337e-06, "loss": 0.0102, "step": 65660 }, { "grad_norm": 0.2537996768951416, "learning_rate": 8.519762384525548e-06, "loss": 0.0142, "step": 65670 }, { "grad_norm": 0.2478475570678711, "learning_rate": 8.508225720053875e-06, "loss": 0.0119, "step": 65680 }, { "grad_norm": 0.28503602743148804, "learning_rate": 8.496696145391625e-06, "loss": 0.0132, "step": 65690 }, { "grad_norm": 0.19463704526424408, "learning_rate": 8.485173662508889e-06, "loss": 0.0122, "step": 65700 }, { "grad_norm": 0.1735149621963501, "learning_rate": 8.473658273374536e-06, "loss": 0.0135, "step": 65710 }, { "grad_norm": 0.28906118869781494, "learning_rate": 8.462149979956253e-06, "loss": 0.0135, "step": 65720 }, { "grad_norm": 0.3180815875530243, "learning_rate": 8.450648784220461e-06, "loss": 0.0177, "step": 65730 }, { "grad_norm": 0.25787168741226196, "learning_rate": 8.439154688132417e-06, "loss": 0.0143, "step": 65740 }, { "grad_norm": 0.2686995267868042, "learning_rate": 8.427667693656143e-06, "loss": 0.0166, "step": 65750 }, { "grad_norm": 0.20425689220428467, "learning_rate": 8.416187802754454e-06, "loss": 0.0152, "step": 65760 }, { "grad_norm": 0.2628706693649292, "learning_rate": 8.404715017388965e-06, "loss": 0.0143, "step": 65770 }, { "grad_norm": 0.2767013609409332, "learning_rate": 8.393249339520015e-06, "loss": 0.0155, "step": 65780 }, { "grad_norm": 0.17242570221424103, "learning_rate": 8.381790771106834e-06, "loss": 0.0123, "step": 65790 }, { "grad_norm": 0.3108120858669281, "learning_rate": 8.370339314107339e-06, "loss": 0.0092, "step": 65800 }, { "grad_norm": 0.251364529132843, "learning_rate": 8.358894970478281e-06, "loss": 0.0123, "step": 65810 }, { "grad_norm": 0.25058314204216003, "learning_rate": 8.347457742175196e-06, "loss": 0.0132, "step": 65820 }, { "grad_norm": 0.29834792017936707, "learning_rate": 8.33602763115236e-06, "loss": 0.0144, "step": 65830 }, { "grad_norm": 0.2898118197917938, "learning_rate": 8.324604639362916e-06, "loss": 0.0183, "step": 65840 }, { "grad_norm": 0.20192793011665344, "learning_rate": 8.31318876875869e-06, "loss": 0.012, "step": 65850 }, { "grad_norm": 0.20023924112319946, "learning_rate": 8.30178002129039e-06, "loss": 0.0122, "step": 65860 }, { "grad_norm": 0.22717790305614471, "learning_rate": 8.290378398907423e-06, "loss": 0.0115, "step": 65870 }, { "grad_norm": 0.2656925320625305, "learning_rate": 8.278983903558029e-06, "loss": 0.0188, "step": 65880 }, { "grad_norm": 0.22094114124774933, "learning_rate": 8.26759653718921e-06, "loss": 0.0119, "step": 65890 }, { "grad_norm": 0.21770313382148743, "learning_rate": 8.25621630174676e-06, "loss": 0.0139, "step": 65900 }, { "grad_norm": 0.14656519889831543, "learning_rate": 8.244843199175261e-06, "loss": 0.0213, "step": 65910 }, { "grad_norm": 0.22300010919570923, "learning_rate": 8.233477231418046e-06, "loss": 0.0097, "step": 65920 }, { "grad_norm": 0.17560678720474243, "learning_rate": 8.22211840041725e-06, "loss": 0.0144, "step": 65930 }, { "grad_norm": 0.4602450430393219, "learning_rate": 8.210766708113792e-06, "loss": 0.0195, "step": 65940 }, { "grad_norm": 0.2510773837566376, "learning_rate": 8.199422156447367e-06, "loss": 0.0132, "step": 65950 }, { "grad_norm": 0.25027745962142944, "learning_rate": 8.188084747356451e-06, "loss": 0.0187, "step": 65960 }, { "grad_norm": 0.2033836990594864, "learning_rate": 8.176754482778299e-06, "loss": 0.0105, "step": 65970 }, { "grad_norm": 0.2262023687362671, "learning_rate": 8.165431364648918e-06, "loss": 0.0208, "step": 65980 }, { "grad_norm": 0.2303183227777481, "learning_rate": 8.154115394903162e-06, "loss": 0.0137, "step": 65990 }, { "grad_norm": 0.22138060629367828, "learning_rate": 8.142806575474582e-06, "loss": 0.0147, "step": 66000 }, { "grad_norm": 0.2424066811800003, "learning_rate": 8.131504908295562e-06, "loss": 0.0139, "step": 66010 }, { "grad_norm": 0.23118175566196442, "learning_rate": 8.120210395297262e-06, "loss": 0.0115, "step": 66020 }, { "grad_norm": 0.2512112557888031, "learning_rate": 8.108923038409565e-06, "loss": 0.0171, "step": 66030 }, { "grad_norm": 0.30068209767341614, "learning_rate": 8.097642839561226e-06, "loss": 0.0156, "step": 66040 }, { "grad_norm": 0.5349729061126709, "learning_rate": 8.08636980067966e-06, "loss": 0.0181, "step": 66050 }, { "grad_norm": 0.16358892619609833, "learning_rate": 8.075103923691186e-06, "loss": 0.0118, "step": 66060 }, { "grad_norm": 0.1527557522058487, "learning_rate": 8.063845210520793e-06, "loss": 0.0121, "step": 66070 }, { "grad_norm": 0.2117648869752884, "learning_rate": 8.052593663092295e-06, "loss": 0.0166, "step": 66080 }, { "grad_norm": 0.23114916682243347, "learning_rate": 8.04134928332827e-06, "loss": 0.0133, "step": 66090 }, { "grad_norm": 0.24162845313549042, "learning_rate": 8.030112073150086e-06, "loss": 0.0155, "step": 66100 }, { "grad_norm": 0.21654479205608368, "learning_rate": 8.018882034477881e-06, "loss": 0.0113, "step": 66110 }, { "grad_norm": 0.16092269122600555, "learning_rate": 8.007659169230541e-06, "loss": 0.0119, "step": 66120 }, { "grad_norm": 0.2783711552619934, "learning_rate": 7.996443479325755e-06, "loss": 0.0127, "step": 66130 }, { "grad_norm": 0.41423991322517395, "learning_rate": 7.985234966679977e-06, "loss": 0.0135, "step": 66140 }, { "grad_norm": 0.4379727244377136, "learning_rate": 7.974033633208438e-06, "loss": 0.0132, "step": 66150 }, { "grad_norm": 0.18853473663330078, "learning_rate": 7.962839480825135e-06, "loss": 0.0124, "step": 66160 }, { "grad_norm": 0.38924625515937805, "learning_rate": 7.951652511442858e-06, "loss": 0.0136, "step": 66170 }, { "grad_norm": 0.18854667246341705, "learning_rate": 7.940472726973125e-06, "loss": 0.0145, "step": 66180 }, { "grad_norm": 0.18689046800136566, "learning_rate": 7.929300129326289e-06, "loss": 0.013, "step": 66190 }, { "grad_norm": 0.24324914813041687, "learning_rate": 7.91813472041142e-06, "loss": 0.0122, "step": 66200 }, { "grad_norm": 0.21350087225437164, "learning_rate": 7.906976502136376e-06, "loss": 0.0131, "step": 66210 }, { "grad_norm": 0.1478373259305954, "learning_rate": 7.89582547640782e-06, "loss": 0.0159, "step": 66220 }, { "grad_norm": 0.30085498094558716, "learning_rate": 7.884681645131115e-06, "loss": 0.0151, "step": 66230 }, { "grad_norm": 0.2914828360080719, "learning_rate": 7.87354501021048e-06, "loss": 0.0134, "step": 66240 }, { "grad_norm": 0.2183752954006195, "learning_rate": 7.86241557354882e-06, "loss": 0.0128, "step": 66250 }, { "grad_norm": 0.18735632300376892, "learning_rate": 7.8512933370479e-06, "loss": 0.0132, "step": 66260 }, { "grad_norm": 0.2276463657617569, "learning_rate": 7.840178302608158e-06, "loss": 0.0119, "step": 66270 }, { "grad_norm": 0.873108983039856, "learning_rate": 7.82907047212888e-06, "loss": 0.014, "step": 66280 }, { "grad_norm": 0.20001982152462006, "learning_rate": 7.81796984750809e-06, "loss": 0.0143, "step": 66290 }, { "grad_norm": 0.16743920743465424, "learning_rate": 7.806876430642546e-06, "loss": 0.0131, "step": 66300 }, { "grad_norm": 0.396721214056015, "learning_rate": 7.795790223427862e-06, "loss": 0.0165, "step": 66310 }, { "grad_norm": 0.3507241904735565, "learning_rate": 7.784711227758324e-06, "loss": 0.0164, "step": 66320 }, { "grad_norm": 0.18246202170848846, "learning_rate": 7.773639445527053e-06, "loss": 0.0154, "step": 66330 }, { "grad_norm": 0.11997046321630478, "learning_rate": 7.762574878625905e-06, "loss": 0.0147, "step": 66340 }, { "grad_norm": 0.17904353141784668, "learning_rate": 7.751517528945513e-06, "loss": 0.0129, "step": 66350 }, { "grad_norm": 0.37772879004478455, "learning_rate": 7.740467398375278e-06, "loss": 0.0166, "step": 66360 }, { "grad_norm": 0.2656500041484833, "learning_rate": 7.729424488803378e-06, "loss": 0.0141, "step": 66370 }, { "grad_norm": 0.20660564303398132, "learning_rate": 7.71838880211671e-06, "loss": 0.0136, "step": 66380 }, { "grad_norm": 0.3003290593624115, "learning_rate": 7.707360340200997e-06, "loss": 0.0093, "step": 66390 }, { "grad_norm": 0.159606471657753, "learning_rate": 7.696339104940697e-06, "loss": 0.0182, "step": 66400 }, { "grad_norm": 0.12256310135126114, "learning_rate": 7.685325098219038e-06, "loss": 0.0125, "step": 66410 }, { "grad_norm": 0.1784525215625763, "learning_rate": 7.674318321918017e-06, "loss": 0.0137, "step": 66420 }, { "grad_norm": 0.1656436175107956, "learning_rate": 7.663318777918366e-06, "loss": 0.0159, "step": 66430 }, { "grad_norm": 0.22164252400398254, "learning_rate": 7.652326468099647e-06, "loss": 0.0199, "step": 66440 }, { "grad_norm": 0.21218614280223846, "learning_rate": 7.641341394340096e-06, "loss": 0.0112, "step": 66450 }, { "grad_norm": 0.5147440433502197, "learning_rate": 7.630363558516818e-06, "loss": 0.0146, "step": 66460 }, { "grad_norm": 0.19560785591602325, "learning_rate": 7.619392962505578e-06, "loss": 0.0167, "step": 66470 }, { "grad_norm": 0.2347905933856964, "learning_rate": 7.6084296081809725e-06, "loss": 0.0128, "step": 66480 }, { "grad_norm": 0.3267754018306732, "learning_rate": 7.597473497416347e-06, "loss": 0.0149, "step": 66490 }, { "grad_norm": 0.10474855452775955, "learning_rate": 7.586524632083764e-06, "loss": 0.0112, "step": 66500 }, { "grad_norm": 0.18299293518066406, "learning_rate": 7.5755830140541326e-06, "loss": 0.0122, "step": 66510 }, { "grad_norm": 0.14511680603027344, "learning_rate": 7.5646486451970475e-06, "loss": 0.0132, "step": 66520 }, { "grad_norm": 0.1483357548713684, "learning_rate": 7.553721527380897e-06, "loss": 0.0134, "step": 66530 }, { "grad_norm": 0.29491570591926575, "learning_rate": 7.542801662472826e-06, "loss": 0.0151, "step": 66540 }, { "grad_norm": 0.5321530699729919, "learning_rate": 7.5318890523387475e-06, "loss": 0.0129, "step": 66550 }, { "grad_norm": 0.12913446128368378, "learning_rate": 7.52098369884332e-06, "loss": 0.0095, "step": 66560 }, { "grad_norm": 0.23710379004478455, "learning_rate": 7.510085603849992e-06, "loss": 0.0185, "step": 66570 }, { "grad_norm": 0.16410472989082336, "learning_rate": 7.499194769220918e-06, "loss": 0.013, "step": 66580 }, { "grad_norm": 0.606158971786499, "learning_rate": 7.48831119681706e-06, "loss": 0.0213, "step": 66590 }, { "grad_norm": 0.323965847492218, "learning_rate": 7.477434888498119e-06, "loss": 0.0148, "step": 66600 }, { "grad_norm": 0.31275802850723267, "learning_rate": 7.466565846122564e-06, "loss": 0.0201, "step": 66610 }, { "grad_norm": 0.20842719078063965, "learning_rate": 7.455704071547626e-06, "loss": 0.0151, "step": 66620 }, { "grad_norm": 0.21493709087371826, "learning_rate": 7.444849566629247e-06, "loss": 0.0164, "step": 66630 }, { "grad_norm": 0.2428281307220459, "learning_rate": 7.434002333222212e-06, "loss": 0.0104, "step": 66640 }, { "grad_norm": 0.4739057421684265, "learning_rate": 7.423162373179976e-06, "loss": 0.0229, "step": 66650 }, { "grad_norm": 0.28176403045654297, "learning_rate": 7.412329688354835e-06, "loss": 0.0135, "step": 66660 }, { "grad_norm": 0.28232884407043457, "learning_rate": 7.40150428059776e-06, "loss": 0.009, "step": 66670 }, { "grad_norm": 0.30684104561805725, "learning_rate": 7.3906861517585354e-06, "loss": 0.0135, "step": 66680 }, { "grad_norm": 0.26832303404808044, "learning_rate": 7.37987530368569e-06, "loss": 0.0148, "step": 66690 }, { "grad_norm": 0.1730795055627823, "learning_rate": 7.369071738226474e-06, "loss": 0.0135, "step": 66700 }, { "grad_norm": 0.16316872835159302, "learning_rate": 7.358275457226954e-06, "loss": 0.0118, "step": 66710 }, { "grad_norm": 0.23573067784309387, "learning_rate": 7.347486462531899e-06, "loss": 0.0143, "step": 66720 }, { "grad_norm": 0.26102662086486816, "learning_rate": 7.336704755984858e-06, "loss": 0.0145, "step": 66730 }, { "grad_norm": 0.14775261282920837, "learning_rate": 7.325930339428133e-06, "loss": 0.0161, "step": 66740 }, { "grad_norm": 0.6006825566291809, "learning_rate": 7.315163214702769e-06, "loss": 0.0156, "step": 66750 }, { "grad_norm": 0.2518087327480316, "learning_rate": 7.304403383648595e-06, "loss": 0.0147, "step": 66760 }, { "grad_norm": 0.17460714280605316, "learning_rate": 7.293650848104139e-06, "loss": 0.0185, "step": 66770 }, { "grad_norm": 0.1312212198972702, "learning_rate": 7.2829056099067374e-06, "loss": 0.0137, "step": 66780 }, { "grad_norm": 0.19581179320812225, "learning_rate": 7.2721676708924494e-06, "loss": 0.0142, "step": 66790 }, { "grad_norm": 0.46657848358154297, "learning_rate": 7.261437032896096e-06, "loss": 0.023, "step": 66800 }, { "grad_norm": 0.20167909562587738, "learning_rate": 7.250713697751255e-06, "loss": 0.0158, "step": 66810 }, { "grad_norm": 0.2026352435350418, "learning_rate": 7.239997667290255e-06, "loss": 0.0114, "step": 66820 }, { "grad_norm": 0.11581362783908844, "learning_rate": 7.2292889433441425e-06, "loss": 0.0146, "step": 66830 }, { "grad_norm": 0.3529985845088959, "learning_rate": 7.218587527742793e-06, "loss": 0.0149, "step": 66840 }, { "grad_norm": 0.29025596380233765, "learning_rate": 7.207893422314749e-06, "loss": 0.0141, "step": 66850 }, { "grad_norm": 0.28915607929229736, "learning_rate": 7.1972066288873545e-06, "loss": 0.012, "step": 66860 }, { "grad_norm": 0.2683699131011963, "learning_rate": 7.186527149286687e-06, "loss": 0.0242, "step": 66870 }, { "grad_norm": 0.4133109748363495, "learning_rate": 7.175854985337576e-06, "loss": 0.0133, "step": 66880 }, { "grad_norm": 0.21933971345424652, "learning_rate": 7.16519013886362e-06, "loss": 0.0137, "step": 66890 }, { "grad_norm": 0.2722699046134949, "learning_rate": 7.154532611687109e-06, "loss": 0.0152, "step": 66900 }, { "grad_norm": 0.21636147797107697, "learning_rate": 7.143882405629177e-06, "loss": 0.0133, "step": 66910 }, { "grad_norm": 0.16270799934864044, "learning_rate": 7.133239522509605e-06, "loss": 0.0111, "step": 66920 }, { "grad_norm": 0.1655185967683792, "learning_rate": 7.1226039641469956e-06, "loss": 0.0126, "step": 66930 }, { "grad_norm": 0.19926764070987701, "learning_rate": 7.111975732358678e-06, "loss": 0.01, "step": 66940 }, { "grad_norm": 0.15298621356487274, "learning_rate": 7.101354828960693e-06, "loss": 0.012, "step": 66950 }, { "grad_norm": 0.17753739655017853, "learning_rate": 7.090741255767918e-06, "loss": 0.0111, "step": 66960 }, { "grad_norm": 0.26670292019844055, "learning_rate": 7.080135014593875e-06, "loss": 0.0132, "step": 66970 }, { "grad_norm": 0.2650458514690399, "learning_rate": 7.069536107250896e-06, "loss": 0.0108, "step": 66980 }, { "grad_norm": 0.21390829980373383, "learning_rate": 7.058944535550049e-06, "loss": 0.0129, "step": 66990 }, { "grad_norm": 0.19710110127925873, "learning_rate": 7.048360301301138e-06, "loss": 0.0127, "step": 67000 }, { "grad_norm": 0.2727247476577759, "learning_rate": 7.03778340631272e-06, "loss": 0.0152, "step": 67010 }, { "grad_norm": 0.2607077360153198, "learning_rate": 7.0272138523921e-06, "loss": 0.0155, "step": 67020 }, { "grad_norm": 0.2617398798465729, "learning_rate": 7.016651641345334e-06, "loss": 0.0201, "step": 67030 }, { "grad_norm": 0.2734626233577728, "learning_rate": 7.0060967749772e-06, "loss": 0.0125, "step": 67040 }, { "grad_norm": 0.4374639093875885, "learning_rate": 6.995549255091238e-06, "loss": 0.0217, "step": 67050 }, { "grad_norm": 0.2406790852546692, "learning_rate": 6.98500908348973e-06, "loss": 0.0175, "step": 67060 }, { "grad_norm": 0.32577386498451233, "learning_rate": 6.974476261973711e-06, "loss": 0.0128, "step": 67070 }, { "grad_norm": 0.3252185881137848, "learning_rate": 6.963950792342949e-06, "loss": 0.0117, "step": 67080 }, { "grad_norm": 0.17595741152763367, "learning_rate": 6.9534326763959715e-06, "loss": 0.0088, "step": 67090 }, { "grad_norm": 0.23175707459449768, "learning_rate": 6.942921915929995e-06, "loss": 0.0116, "step": 67100 }, { "grad_norm": 0.2368069589138031, "learning_rate": 6.9324185127410735e-06, "loss": 0.0218, "step": 67110 }, { "grad_norm": 0.44444265961647034, "learning_rate": 6.921922468623921e-06, "loss": 0.0132, "step": 67120 }, { "grad_norm": 0.19466789066791534, "learning_rate": 6.911433785372023e-06, "loss": 0.0149, "step": 67130 }, { "grad_norm": 0.2188010811805725, "learning_rate": 6.900952464777632e-06, "loss": 0.0182, "step": 67140 }, { "grad_norm": 0.29008302092552185, "learning_rate": 6.8904785086316815e-06, "loss": 0.0122, "step": 67150 }, { "grad_norm": 0.278055340051651, "learning_rate": 6.880011918723927e-06, "loss": 0.0114, "step": 67160 }, { "grad_norm": 0.3408415913581848, "learning_rate": 6.86955269684279e-06, "loss": 0.0127, "step": 67170 }, { "grad_norm": 0.2992589771747589, "learning_rate": 6.859100844775473e-06, "loss": 0.0245, "step": 67180 }, { "grad_norm": 0.17632733285427094, "learning_rate": 6.84865636430792e-06, "loss": 0.0169, "step": 67190 }, { "grad_norm": 0.24695247411727905, "learning_rate": 6.838219257224804e-06, "loss": 0.0112, "step": 67200 }, { "grad_norm": 0.25387394428253174, "learning_rate": 6.827789525309536e-06, "loss": 0.0139, "step": 67210 }, { "grad_norm": 0.3119005560874939, "learning_rate": 6.81736717034428e-06, "loss": 0.0136, "step": 67220 }, { "grad_norm": 0.2555665075778961, "learning_rate": 6.806952194109933e-06, "loss": 0.0225, "step": 67230 }, { "grad_norm": 0.10276803374290466, "learning_rate": 6.796544598386112e-06, "loss": 0.0115, "step": 67240 }, { "grad_norm": 0.2034897804260254, "learning_rate": 6.786144384951204e-06, "loss": 0.0177, "step": 67250 }, { "grad_norm": 0.20213554799556732, "learning_rate": 6.775751555582322e-06, "loss": 0.012, "step": 67260 }, { "grad_norm": 0.49426546692848206, "learning_rate": 6.76536611205531e-06, "loss": 0.0155, "step": 67270 }, { "grad_norm": 0.3281289041042328, "learning_rate": 6.754988056144762e-06, "loss": 0.0172, "step": 67280 }, { "grad_norm": 0.1182033121585846, "learning_rate": 6.744617389624014e-06, "loss": 0.0112, "step": 67290 }, { "grad_norm": 0.24607054889202118, "learning_rate": 6.734254114265087e-06, "loss": 0.0132, "step": 67300 }, { "grad_norm": 0.28168654441833496, "learning_rate": 6.723898231838843e-06, "loss": 0.0114, "step": 67310 }, { "grad_norm": 0.23198741674423218, "learning_rate": 6.713549744114766e-06, "loss": 0.0147, "step": 67320 }, { "grad_norm": 0.18401627242565155, "learning_rate": 6.703208652861159e-06, "loss": 0.0149, "step": 67330 }, { "grad_norm": 0.2066373974084854, "learning_rate": 6.69287495984503e-06, "loss": 0.0149, "step": 67340 }, { "grad_norm": 0.2006249576807022, "learning_rate": 6.6825486668321e-06, "loss": 0.0129, "step": 67350 }, { "grad_norm": 0.13673745095729828, "learning_rate": 6.672229775586886e-06, "loss": 0.0147, "step": 67360 }, { "grad_norm": 0.19512596726417542, "learning_rate": 6.661918287872576e-06, "loss": 0.0116, "step": 67370 }, { "grad_norm": 0.17982624471187592, "learning_rate": 6.6516142054511346e-06, "loss": 0.0153, "step": 67380 }, { "grad_norm": 0.2867700159549713, "learning_rate": 6.641317530083241e-06, "loss": 0.0117, "step": 67390 }, { "grad_norm": 0.30355435609817505, "learning_rate": 6.631028263528322e-06, "loss": 0.0149, "step": 67400 }, { "grad_norm": 0.1687110960483551, "learning_rate": 6.620746407544537e-06, "loss": 0.0138, "step": 67410 }, { "grad_norm": 0.2043817937374115, "learning_rate": 6.610471963888742e-06, "loss": 0.0157, "step": 67420 }, { "grad_norm": 0.17426323890686035, "learning_rate": 6.600204934316606e-06, "loss": 0.0107, "step": 67430 }, { "grad_norm": 0.16835758090019226, "learning_rate": 6.589945320582452e-06, "loss": 0.0118, "step": 67440 }, { "grad_norm": 0.21161842346191406, "learning_rate": 6.579693124439374e-06, "loss": 0.0172, "step": 67450 }, { "grad_norm": 0.2464224249124527, "learning_rate": 6.56944834763919e-06, "loss": 0.0127, "step": 67460 }, { "grad_norm": 0.2655923068523407, "learning_rate": 6.5592109919324575e-06, "loss": 0.0193, "step": 67470 }, { "grad_norm": 0.26761388778686523, "learning_rate": 6.54898105906846e-06, "loss": 0.0182, "step": 67480 }, { "grad_norm": 0.13419489562511444, "learning_rate": 6.5387585507952155e-06, "loss": 0.0179, "step": 67490 }, { "grad_norm": 0.24763979017734528, "learning_rate": 6.528543468859461e-06, "loss": 0.0139, "step": 67500 }, { "grad_norm": 0.22395674884319305, "learning_rate": 6.518335815006682e-06, "loss": 0.0154, "step": 67510 }, { "grad_norm": 0.3156868517398834, "learning_rate": 6.5081355909810845e-06, "loss": 0.014, "step": 67520 }, { "grad_norm": 0.24212537705898285, "learning_rate": 6.497942798525608e-06, "loss": 0.0179, "step": 67530 }, { "grad_norm": 0.5647813081741333, "learning_rate": 6.487757439381936e-06, "loss": 0.0199, "step": 67540 }, { "grad_norm": 0.35870954394340515, "learning_rate": 6.477579515290433e-06, "loss": 0.0127, "step": 67550 }, { "grad_norm": 0.15377284586429596, "learning_rate": 6.467409027990273e-06, "loss": 0.0167, "step": 67560 }, { "grad_norm": 0.23601852357387543, "learning_rate": 6.457245979219279e-06, "loss": 0.017, "step": 67570 }, { "grad_norm": 0.3039679527282715, "learning_rate": 6.447090370714054e-06, "loss": 0.0183, "step": 67580 }, { "grad_norm": 0.12767678499221802, "learning_rate": 6.436942204209917e-06, "loss": 0.0194, "step": 67590 }, { "grad_norm": 0.20494307577610016, "learning_rate": 6.4268014814408804e-06, "loss": 0.0193, "step": 67600 }, { "grad_norm": 0.24158544838428497, "learning_rate": 6.41666820413977e-06, "loss": 0.0142, "step": 67610 }, { "grad_norm": 0.24067653715610504, "learning_rate": 6.406542374038033e-06, "loss": 0.016, "step": 67620 }, { "grad_norm": 0.21860653162002563, "learning_rate": 6.396423992865935e-06, "loss": 0.0157, "step": 67630 }, { "grad_norm": 0.210557222366333, "learning_rate": 6.386313062352412e-06, "loss": 0.0142, "step": 67640 }, { "grad_norm": 0.3110707998275757, "learning_rate": 6.376209584225152e-06, "loss": 0.0124, "step": 67650 }, { "grad_norm": 0.22605274617671967, "learning_rate": 6.366113560210557e-06, "loss": 0.0098, "step": 67660 }, { "grad_norm": 0.4025517702102661, "learning_rate": 6.356024992033766e-06, "loss": 0.0215, "step": 67670 }, { "grad_norm": 0.20037449896335602, "learning_rate": 6.345943881418648e-06, "loss": 0.0124, "step": 67680 }, { "grad_norm": 0.23183459043502808, "learning_rate": 6.33587023008777e-06, "loss": 0.0128, "step": 67690 }, { "grad_norm": 0.19301122426986694, "learning_rate": 6.32580403976245e-06, "loss": 0.015, "step": 67700 }, { "grad_norm": 0.21052156388759613, "learning_rate": 6.31574531216273e-06, "loss": 0.0129, "step": 67710 }, { "grad_norm": 0.3933112621307373, "learning_rate": 6.305694049007371e-06, "loss": 0.02, "step": 67720 }, { "grad_norm": 0.27336585521698, "learning_rate": 6.2956502520138575e-06, "loss": 0.015, "step": 67730 }, { "grad_norm": 0.2236975133419037, "learning_rate": 6.285613922898409e-06, "loss": 0.0099, "step": 67740 }, { "grad_norm": 0.1837151050567627, "learning_rate": 6.275585063375927e-06, "loss": 0.0158, "step": 67750 }, { "grad_norm": 0.2119634747505188, "learning_rate": 6.265563675160113e-06, "loss": 0.0141, "step": 67760 }, { "grad_norm": 0.16922582685947418, "learning_rate": 6.25554975996332e-06, "loss": 0.0089, "step": 67770 }, { "grad_norm": 0.2131797969341278, "learning_rate": 6.245543319496661e-06, "loss": 0.0151, "step": 67780 }, { "grad_norm": 0.1993831843137741, "learning_rate": 6.2355443554699685e-06, "loss": 0.0168, "step": 67790 }, { "grad_norm": 0.132177472114563, "learning_rate": 6.225552869591766e-06, "loss": 0.0109, "step": 67800 }, { "grad_norm": 0.18301044404506683, "learning_rate": 6.215568863569365e-06, "loss": 0.0181, "step": 67810 }, { "grad_norm": 0.45366939902305603, "learning_rate": 6.205592339108712e-06, "loss": 0.0195, "step": 67820 }, { "grad_norm": 0.22460304200649261, "learning_rate": 6.195623297914577e-06, "loss": 0.0138, "step": 67830 }, { "grad_norm": 0.31127476692199707, "learning_rate": 6.185661741690357e-06, "loss": 0.0128, "step": 67840 }, { "grad_norm": 0.33859774470329285, "learning_rate": 6.1757076721382145e-06, "loss": 0.0163, "step": 67850 }, { "grad_norm": 0.20510567724704742, "learning_rate": 6.165761090959038e-06, "loss": 0.0193, "step": 67860 }, { "grad_norm": 0.23260392248630524, "learning_rate": 6.155821999852424e-06, "loss": 0.0152, "step": 67870 }, { "grad_norm": 0.2179306000471115, "learning_rate": 6.145890400516696e-06, "loss": 0.0132, "step": 67880 }, { "grad_norm": 0.20341891050338745, "learning_rate": 6.1359662946488816e-06, "loss": 0.013, "step": 67890 }, { "grad_norm": 0.2303960919380188, "learning_rate": 6.12604968394474e-06, "loss": 0.0148, "step": 67900 }, { "grad_norm": 0.187620148062706, "learning_rate": 6.11614057009875e-06, "loss": 0.0158, "step": 67910 }, { "grad_norm": 0.19727419316768646, "learning_rate": 6.106238954804111e-06, "loss": 0.0153, "step": 67920 }, { "grad_norm": 0.14556629955768585, "learning_rate": 6.096344839752738e-06, "loss": 0.0123, "step": 67930 }, { "grad_norm": 0.12042037397623062, "learning_rate": 6.086458226635278e-06, "loss": 0.0154, "step": 67940 }, { "grad_norm": 0.2958913743495941, "learning_rate": 6.076579117141046e-06, "loss": 0.0206, "step": 67950 }, { "grad_norm": 0.3478211760520935, "learning_rate": 6.066707512958153e-06, "loss": 0.0168, "step": 67960 }, { "grad_norm": 0.23408961296081543, "learning_rate": 6.056843415773361e-06, "loss": 0.0157, "step": 67970 }, { "grad_norm": 0.23849041759967804, "learning_rate": 6.0469868272721776e-06, "loss": 0.0127, "step": 67980 }, { "grad_norm": 0.1892017126083374, "learning_rate": 6.037137749138844e-06, "loss": 0.0168, "step": 67990 }, { "grad_norm": 0.3120902478694916, "learning_rate": 6.027296183056252e-06, "loss": 0.0213, "step": 68000 }, { "grad_norm": 0.2347114086151123, "learning_rate": 6.017462130706114e-06, "loss": 0.0139, "step": 68010 }, { "grad_norm": 0.15660148859024048, "learning_rate": 6.0076355937687516e-06, "loss": 0.0177, "step": 68020 }, { "grad_norm": 0.18289132416248322, "learning_rate": 5.9978165739232925e-06, "loss": 0.0108, "step": 68030 }, { "grad_norm": 0.376238614320755, "learning_rate": 5.988005072847508e-06, "loss": 0.0106, "step": 68040 }, { "grad_norm": 0.18232473731040955, "learning_rate": 5.978201092217928e-06, "loss": 0.021, "step": 68050 }, { "grad_norm": 0.2243288904428482, "learning_rate": 5.9684046337097895e-06, "loss": 0.0108, "step": 68060 }, { "grad_norm": 0.14527902007102966, "learning_rate": 5.958615698997016e-06, "loss": 0.0131, "step": 68070 }, { "grad_norm": 0.27091512084007263, "learning_rate": 5.948834289752303e-06, "loss": 0.0127, "step": 68080 }, { "grad_norm": 0.30810344219207764, "learning_rate": 5.939060407646996e-06, "loss": 0.0189, "step": 68090 }, { "grad_norm": 0.19916652143001556, "learning_rate": 5.929294054351198e-06, "loss": 0.0166, "step": 68100 }, { "grad_norm": 0.47176945209503174, "learning_rate": 5.919535231533707e-06, "loss": 0.0135, "step": 68110 }, { "grad_norm": 0.20677223801612854, "learning_rate": 5.909783940862046e-06, "loss": 0.0124, "step": 68120 }, { "grad_norm": 0.2627597451210022, "learning_rate": 5.900040184002436e-06, "loss": 0.0158, "step": 68130 }, { "grad_norm": 0.26714274287223816, "learning_rate": 5.890303962619831e-06, "loss": 0.0123, "step": 68140 }, { "grad_norm": 0.330342173576355, "learning_rate": 5.88057527837787e-06, "loss": 0.0159, "step": 68150 }, { "grad_norm": 0.16654221713542938, "learning_rate": 5.870854132938924e-06, "loss": 0.017, "step": 68160 }, { "grad_norm": 0.3063760995864868, "learning_rate": 5.861140527964071e-06, "loss": 0.0163, "step": 68170 }, { "grad_norm": 0.16354194283485413, "learning_rate": 5.851434465113098e-06, "loss": 0.0117, "step": 68180 }, { "grad_norm": 0.17436034977436066, "learning_rate": 5.841735946044524e-06, "loss": 0.0129, "step": 68190 }, { "grad_norm": 0.24212588369846344, "learning_rate": 5.832044972415523e-06, "loss": 0.0168, "step": 68200 }, { "grad_norm": 0.33108553290367126, "learning_rate": 5.8223615458820655e-06, "loss": 0.0133, "step": 68210 }, { "grad_norm": 0.12486717104911804, "learning_rate": 5.812685668098733e-06, "loss": 0.0139, "step": 68220 }, { "grad_norm": 0.23048478364944458, "learning_rate": 5.803017340718913e-06, "loss": 0.0159, "step": 68230 }, { "grad_norm": 0.20959904789924622, "learning_rate": 5.793356565394636e-06, "loss": 0.0131, "step": 68240 }, { "grad_norm": 0.24706995487213135, "learning_rate": 5.783703343776658e-06, "loss": 0.0116, "step": 68250 }, { "grad_norm": 0.19903336465358734, "learning_rate": 5.774057677514477e-06, "loss": 0.0117, "step": 68260 }, { "grad_norm": 0.198921799659729, "learning_rate": 5.764419568256235e-06, "loss": 0.0124, "step": 68270 }, { "grad_norm": 0.2692584991455078, "learning_rate": 5.754789017648865e-06, "loss": 0.0228, "step": 68280 }, { "grad_norm": 0.09218350052833557, "learning_rate": 5.745166027337934e-06, "loss": 0.0151, "step": 68290 }, { "grad_norm": 0.21857601404190063, "learning_rate": 5.735550598967754e-06, "loss": 0.0158, "step": 68300 }, { "grad_norm": 0.2648054361343384, "learning_rate": 5.725942734181339e-06, "loss": 0.016, "step": 68310 }, { "grad_norm": 0.24381765723228455, "learning_rate": 5.7163424346204146e-06, "loss": 0.0125, "step": 68320 }, { "grad_norm": 0.22240526974201202, "learning_rate": 5.706749701925407e-06, "loss": 0.0114, "step": 68330 }, { "grad_norm": 0.17284199595451355, "learning_rate": 5.697164537735461e-06, "loss": 0.0112, "step": 68340 }, { "grad_norm": 0.34421205520629883, "learning_rate": 5.687586943688406e-06, "loss": 0.0138, "step": 68350 }, { "grad_norm": 0.14576415717601776, "learning_rate": 5.678016921420792e-06, "loss": 0.0128, "step": 68360 }, { "grad_norm": 0.18213094770908356, "learning_rate": 5.668454472567875e-06, "loss": 0.0206, "step": 68370 }, { "grad_norm": 0.3075789213180542, "learning_rate": 5.658899598763617e-06, "loss": 0.0158, "step": 68380 }, { "grad_norm": 0.2774377763271332, "learning_rate": 5.6493523016407035e-06, "loss": 0.0136, "step": 68390 }, { "grad_norm": 0.16622896492481232, "learning_rate": 5.6398125828304615e-06, "loss": 0.0135, "step": 68400 }, { "grad_norm": 0.19655492901802063, "learning_rate": 5.630280443963015e-06, "loss": 0.0194, "step": 68410 }, { "grad_norm": 0.2294033318758011, "learning_rate": 5.620755886667112e-06, "loss": 0.0139, "step": 68420 }, { "grad_norm": 0.3590867817401886, "learning_rate": 5.611238912570271e-06, "loss": 0.0151, "step": 68430 }, { "grad_norm": 0.3091208040714264, "learning_rate": 5.601729523298649e-06, "loss": 0.0136, "step": 68440 }, { "grad_norm": 0.19865794479846954, "learning_rate": 5.592227720477161e-06, "loss": 0.0144, "step": 68450 }, { "grad_norm": 0.250356525182724, "learning_rate": 5.582733505729415e-06, "loss": 0.0152, "step": 68460 }, { "grad_norm": 0.3202044665813446, "learning_rate": 5.573246880677668e-06, "loss": 0.0176, "step": 68470 }, { "grad_norm": 0.29594963788986206, "learning_rate": 5.56376784694298e-06, "loss": 0.0183, "step": 68480 }, { "grad_norm": 0.33259615302085876, "learning_rate": 5.554296406145027e-06, "loss": 0.0194, "step": 68490 }, { "grad_norm": 0.17220251262187958, "learning_rate": 5.544832559902219e-06, "loss": 0.0144, "step": 68500 }, { "grad_norm": 0.3222741186618805, "learning_rate": 5.53537630983168e-06, "loss": 0.0158, "step": 68510 }, { "grad_norm": 0.19353020191192627, "learning_rate": 5.5259276575492125e-06, "loss": 0.0147, "step": 68520 }, { "grad_norm": 0.2766171097755432, "learning_rate": 5.516486604669357e-06, "loss": 0.0196, "step": 68530 }, { "grad_norm": 0.20508822798728943, "learning_rate": 5.507053152805303e-06, "loss": 0.0196, "step": 68540 }, { "grad_norm": 0.2408292293548584, "learning_rate": 5.497627303568975e-06, "loss": 0.0164, "step": 68550 }, { "grad_norm": 0.09236177802085876, "learning_rate": 5.488209058571003e-06, "loss": 0.0093, "step": 68560 }, { "grad_norm": 0.2777661085128784, "learning_rate": 5.4787984194207054e-06, "loss": 0.0115, "step": 68570 }, { "grad_norm": 0.20233868062496185, "learning_rate": 5.469395387726095e-06, "loss": 0.0123, "step": 68580 }, { "grad_norm": 0.23063719272613525, "learning_rate": 5.45999996509391e-06, "loss": 0.0123, "step": 68590 }, { "grad_norm": 0.22275014221668243, "learning_rate": 5.450612153129536e-06, "loss": 0.0152, "step": 68600 }, { "grad_norm": 0.22558943927288055, "learning_rate": 5.4412319534371426e-06, "loss": 0.0151, "step": 68610 }, { "grad_norm": 0.22662432491779327, "learning_rate": 5.431859367619513e-06, "loss": 0.0106, "step": 68620 }, { "grad_norm": 0.30912914872169495, "learning_rate": 5.422494397278172e-06, "loss": 0.0158, "step": 68630 }, { "grad_norm": 0.39302337169647217, "learning_rate": 5.413137044013344e-06, "loss": 0.0144, "step": 68640 }, { "grad_norm": 0.20741067826747894, "learning_rate": 5.403787309423941e-06, "loss": 0.0151, "step": 68650 }, { "grad_norm": 0.18314775824546814, "learning_rate": 5.3944451951075835e-06, "loss": 0.0147, "step": 68660 }, { "grad_norm": 0.1758710741996765, "learning_rate": 5.385110702660562e-06, "loss": 0.0126, "step": 68670 }, { "grad_norm": 0.15882115066051483, "learning_rate": 5.375783833677922e-06, "loss": 0.0195, "step": 68680 }, { "grad_norm": 0.27138906717300415, "learning_rate": 5.36646458975334e-06, "loss": 0.0159, "step": 68690 }, { "grad_norm": 0.20349116623401642, "learning_rate": 5.3571529724792294e-06, "loss": 0.0105, "step": 68700 }, { "grad_norm": 0.3192633390426636, "learning_rate": 5.347848983446702e-06, "loss": 0.0117, "step": 68710 }, { "grad_norm": 0.24573537707328796, "learning_rate": 5.3385526242455185e-06, "loss": 0.0144, "step": 68720 }, { "grad_norm": 0.17443805932998657, "learning_rate": 5.329263896464226e-06, "loss": 0.0135, "step": 68730 }, { "grad_norm": 0.16985423862934113, "learning_rate": 5.3199828016899715e-06, "loss": 0.0148, "step": 68740 }, { "grad_norm": 0.26287683844566345, "learning_rate": 5.310709341508657e-06, "loss": 0.0166, "step": 68750 }, { "grad_norm": 0.18651515245437622, "learning_rate": 5.301443517504861e-06, "loss": 0.0109, "step": 68760 }, { "grad_norm": 0.21389827132225037, "learning_rate": 5.292185331261862e-06, "loss": 0.0111, "step": 68770 }, { "grad_norm": 0.27599573135375977, "learning_rate": 5.28293478436162e-06, "loss": 0.0114, "step": 68780 }, { "grad_norm": 0.17390510439872742, "learning_rate": 5.273691878384829e-06, "loss": 0.0153, "step": 68790 }, { "grad_norm": 0.17570984363555908, "learning_rate": 5.264456614910812e-06, "loss": 0.0134, "step": 68800 }, { "grad_norm": 0.20558999478816986, "learning_rate": 5.255228995517647e-06, "loss": 0.019, "step": 68810 }, { "grad_norm": 0.17048242688179016, "learning_rate": 5.246009021782067e-06, "loss": 0.0118, "step": 68820 }, { "grad_norm": 0.21917620301246643, "learning_rate": 5.2367966952795225e-06, "loss": 0.0214, "step": 68830 }, { "grad_norm": 0.22529438138008118, "learning_rate": 5.2275920175841485e-06, "loss": 0.0146, "step": 68840 }, { "grad_norm": 0.14408685266971588, "learning_rate": 5.218394990268771e-06, "loss": 0.0133, "step": 68850 }, { "grad_norm": 0.3385046720504761, "learning_rate": 5.209205614904916e-06, "loss": 0.0157, "step": 68860 }, { "grad_norm": 0.31924623250961304, "learning_rate": 5.200023893062777e-06, "loss": 0.0122, "step": 68870 }, { "grad_norm": 0.19726137816905975, "learning_rate": 5.190849826311289e-06, "loss": 0.0118, "step": 68880 }, { "grad_norm": 0.3217684328556061, "learning_rate": 5.181683416218025e-06, "loss": 0.014, "step": 68890 }, { "grad_norm": 0.18969844281673431, "learning_rate": 5.172524664349276e-06, "loss": 0.009, "step": 68900 }, { "grad_norm": 0.31447064876556396, "learning_rate": 5.1633735722700416e-06, "loss": 0.0113, "step": 68910 }, { "grad_norm": 0.1358659267425537, "learning_rate": 5.154230141543958e-06, "loss": 0.0181, "step": 68920 }, { "grad_norm": 0.44938239455223083, "learning_rate": 5.145094373733433e-06, "loss": 0.0147, "step": 68930 }, { "grad_norm": 0.21973921358585358, "learning_rate": 5.135966270399478e-06, "loss": 0.0124, "step": 68940 }, { "grad_norm": 0.5032336711883545, "learning_rate": 5.126845833101857e-06, "loss": 0.0166, "step": 68950 }, { "grad_norm": 0.19430090487003326, "learning_rate": 5.1177330633989994e-06, "loss": 0.0121, "step": 68960 }, { "grad_norm": 0.3199518918991089, "learning_rate": 5.108627962848033e-06, "loss": 0.0281, "step": 68970 }, { "grad_norm": 0.16768790781497955, "learning_rate": 5.099530533004759e-06, "loss": 0.0153, "step": 68980 }, { "grad_norm": 0.1549002081155777, "learning_rate": 5.090440775423699e-06, "loss": 0.0132, "step": 68990 }, { "grad_norm": 0.31474217772483826, "learning_rate": 5.081358691658022e-06, "loss": 0.0123, "step": 69000 }, { "grad_norm": 0.310615211725235, "learning_rate": 5.072284283259621e-06, "loss": 0.0165, "step": 69010 }, { "grad_norm": 0.13637401163578033, "learning_rate": 5.063217551779053e-06, "loss": 0.013, "step": 69020 }, { "grad_norm": 0.2583564519882202, "learning_rate": 5.054158498765588e-06, "loss": 0.0107, "step": 69030 }, { "grad_norm": 0.2886904776096344, "learning_rate": 5.0451071257671625e-06, "loss": 0.0146, "step": 69040 }, { "grad_norm": 0.3340311050415039, "learning_rate": 5.036063434330407e-06, "loss": 0.0184, "step": 69050 }, { "grad_norm": 0.11617635935544968, "learning_rate": 5.027027426000652e-06, "loss": 0.0133, "step": 69060 }, { "grad_norm": 0.314054936170578, "learning_rate": 5.017999102321886e-06, "loss": 0.0159, "step": 69070 }, { "grad_norm": 0.42284873127937317, "learning_rate": 5.0089784648368224e-06, "loss": 0.0153, "step": 69080 }, { "grad_norm": 0.45730355381965637, "learning_rate": 4.999965515086829e-06, "loss": 0.0116, "step": 69090 }, { "grad_norm": 0.19283422827720642, "learning_rate": 4.990960254611976e-06, "loss": 0.0107, "step": 69100 }, { "grad_norm": 0.45647844672203064, "learning_rate": 4.9819626849510194e-06, "loss": 0.0123, "step": 69110 }, { "grad_norm": 0.23646579682826996, "learning_rate": 4.97297280764138e-06, "loss": 0.0181, "step": 69120 }, { "grad_norm": 0.19898389279842377, "learning_rate": 4.963990624219211e-06, "loss": 0.0199, "step": 69130 }, { "grad_norm": 0.3398112952709198, "learning_rate": 4.955016136219298e-06, "loss": 0.0148, "step": 69140 }, { "grad_norm": 0.20378074049949646, "learning_rate": 4.946049345175146e-06, "loss": 0.0129, "step": 69150 }, { "grad_norm": 0.4330587685108185, "learning_rate": 4.937090252618937e-06, "loss": 0.0115, "step": 69160 }, { "grad_norm": 0.17840713262557983, "learning_rate": 4.928138860081521e-06, "loss": 0.0173, "step": 69170 }, { "grad_norm": 0.19075417518615723, "learning_rate": 4.919195169092472e-06, "loss": 0.0169, "step": 69180 }, { "grad_norm": 0.1286921352148056, "learning_rate": 4.910259181179994e-06, "loss": 0.0113, "step": 69190 }, { "grad_norm": 0.255429208278656, "learning_rate": 4.901330897871015e-06, "loss": 0.0179, "step": 69200 }, { "grad_norm": 0.18192823231220245, "learning_rate": 4.8924103206911375e-06, "loss": 0.012, "step": 69210 }, { "grad_norm": 0.17038002610206604, "learning_rate": 4.883497451164637e-06, "loss": 0.0148, "step": 69220 }, { "grad_norm": 0.33095529675483704, "learning_rate": 4.874592290814484e-06, "loss": 0.0144, "step": 69230 }, { "grad_norm": 0.43415743112564087, "learning_rate": 4.865694841162327e-06, "loss": 0.0222, "step": 69240 }, { "grad_norm": 0.16767993569374084, "learning_rate": 4.856805103728496e-06, "loss": 0.0119, "step": 69250 }, { "grad_norm": 0.18570460379123688, "learning_rate": 4.847923080032007e-06, "loss": 0.0139, "step": 69260 }, { "grad_norm": 0.20375514030456543, "learning_rate": 4.8390487715905486e-06, "loss": 0.0129, "step": 69270 }, { "grad_norm": 0.3351147472858429, "learning_rate": 4.8301821799205e-06, "loss": 0.0119, "step": 69280 }, { "grad_norm": 0.1142229363322258, "learning_rate": 4.821323306536918e-06, "loss": 0.0084, "step": 69290 }, { "grad_norm": 0.17150269448757172, "learning_rate": 4.8124721529535455e-06, "loss": 0.014, "step": 69300 }, { "grad_norm": 0.3300918936729431, "learning_rate": 4.803628720682807e-06, "loss": 0.0213, "step": 69310 }, { "grad_norm": 0.19297440350055695, "learning_rate": 4.794793011235776e-06, "loss": 0.0117, "step": 69320 }, { "grad_norm": 0.38009148836135864, "learning_rate": 4.78596502612228e-06, "loss": 0.0199, "step": 69330 }, { "grad_norm": 0.2328152060508728, "learning_rate": 4.777144766850738e-06, "loss": 0.0161, "step": 69340 }, { "grad_norm": 0.2120913416147232, "learning_rate": 4.768332234928313e-06, "loss": 0.0176, "step": 69350 }, { "grad_norm": 0.1515486240386963, "learning_rate": 4.759527431860828e-06, "loss": 0.0092, "step": 69360 }, { "grad_norm": 0.2778726816177368, "learning_rate": 4.750730359152755e-06, "loss": 0.0207, "step": 69370 }, { "grad_norm": 0.290804386138916, "learning_rate": 4.741941018307311e-06, "loss": 0.0136, "step": 69380 }, { "grad_norm": 0.28641602396965027, "learning_rate": 4.733159410826321e-06, "loss": 0.0193, "step": 69390 }, { "grad_norm": 0.14856770634651184, "learning_rate": 4.724385538210357e-06, "loss": 0.0128, "step": 69400 }, { "grad_norm": 0.2436075359582901, "learning_rate": 4.715619401958599e-06, "loss": 0.0137, "step": 69410 }, { "grad_norm": 0.25766319036483765, "learning_rate": 4.706861003568958e-06, "loss": 0.0162, "step": 69420 }, { "grad_norm": 0.14900605380535126, "learning_rate": 4.698110344538003e-06, "loss": 0.0137, "step": 69430 }, { "grad_norm": 0.3331241011619568, "learning_rate": 4.689367426360975e-06, "loss": 0.0148, "step": 69440 }, { "grad_norm": 0.3142268657684326, "learning_rate": 4.680632250531819e-06, "loss": 0.0098, "step": 69450 }, { "grad_norm": 0.16336661577224731, "learning_rate": 4.671904818543115e-06, "loss": 0.0126, "step": 69460 }, { "grad_norm": 0.28134700655937195, "learning_rate": 4.66318513188615e-06, "loss": 0.012, "step": 69470 }, { "grad_norm": 0.20868033170700073, "learning_rate": 4.654473192050884e-06, "loss": 0.0142, "step": 69480 }, { "grad_norm": 0.20232339203357697, "learning_rate": 4.64576900052594e-06, "loss": 0.0132, "step": 69490 }, { "grad_norm": 0.2189362496137619, "learning_rate": 4.637072558798638e-06, "loss": 0.0095, "step": 69500 }, { "grad_norm": 0.29597294330596924, "learning_rate": 4.628383868354969e-06, "loss": 0.016, "step": 69510 }, { "grad_norm": 0.22751300036907196, "learning_rate": 4.6197029306795595e-06, "loss": 0.0134, "step": 69520 }, { "grad_norm": 0.14014317095279694, "learning_rate": 4.611029747255779e-06, "loss": 0.0102, "step": 69530 }, { "grad_norm": 0.23639346659183502, "learning_rate": 4.6023643195656164e-06, "loss": 0.0135, "step": 69540 }, { "grad_norm": 0.1092841625213623, "learning_rate": 4.593706649089768e-06, "loss": 0.0107, "step": 69550 }, { "grad_norm": 0.3517815172672272, "learning_rate": 4.585056737307597e-06, "loss": 0.0141, "step": 69560 }, { "grad_norm": 0.2846706211566925, "learning_rate": 4.576414585697103e-06, "loss": 0.0116, "step": 69570 }, { "grad_norm": 0.26010289788246155, "learning_rate": 4.567780195735044e-06, "loss": 0.013, "step": 69580 }, { "grad_norm": 0.19293420016765594, "learning_rate": 4.559153568896757e-06, "loss": 0.0121, "step": 69590 }, { "grad_norm": 0.49945732951164246, "learning_rate": 4.550534706656329e-06, "loss": 0.0138, "step": 69600 }, { "grad_norm": 0.13585276901721954, "learning_rate": 4.541923610486465e-06, "loss": 0.0117, "step": 69610 }, { "grad_norm": 0.27385184168815613, "learning_rate": 4.533320281858578e-06, "loss": 0.0122, "step": 69620 }, { "grad_norm": 0.21966636180877686, "learning_rate": 4.52472472224274e-06, "loss": 0.0177, "step": 69630 }, { "grad_norm": 0.16855928301811218, "learning_rate": 4.51613693310769e-06, "loss": 0.0149, "step": 69640 }, { "grad_norm": 1.8396183252334595, "learning_rate": 4.507556915920868e-06, "loss": 0.0165, "step": 69650 }, { "grad_norm": 0.2180638164281845, "learning_rate": 4.498984672148332e-06, "loss": 0.0133, "step": 69660 }, { "grad_norm": 0.28532955050468445, "learning_rate": 4.490420203254864e-06, "loss": 0.0148, "step": 69670 }, { "grad_norm": 0.3569595515727997, "learning_rate": 4.481863510703893e-06, "loss": 0.0149, "step": 69680 }, { "grad_norm": 0.1082480251789093, "learning_rate": 4.473314595957523e-06, "loss": 0.0103, "step": 69690 }, { "grad_norm": 0.2397415190935135, "learning_rate": 4.464773460476535e-06, "loss": 0.0158, "step": 69700 }, { "grad_norm": 0.146466925740242, "learning_rate": 4.456240105720372e-06, "loss": 0.0121, "step": 69710 }, { "grad_norm": 0.13092149794101715, "learning_rate": 4.4477145331471405e-06, "loss": 0.0222, "step": 69720 }, { "grad_norm": 0.2613750696182251, "learning_rate": 4.439196744213653e-06, "loss": 0.0131, "step": 69730 }, { "grad_norm": 0.18214941024780273, "learning_rate": 4.430686740375339e-06, "loss": 0.0143, "step": 69740 }, { "grad_norm": 0.1726163625717163, "learning_rate": 4.422184523086342e-06, "loss": 0.0133, "step": 69750 }, { "grad_norm": 0.2981415092945099, "learning_rate": 4.41369009379946e-06, "loss": 0.0135, "step": 69760 }, { "grad_norm": 0.2894788086414337, "learning_rate": 4.405203453966139e-06, "loss": 0.0143, "step": 69770 }, { "grad_norm": 0.4600125849246979, "learning_rate": 4.396724605036539e-06, "loss": 0.0138, "step": 69780 }, { "grad_norm": 0.26609230041503906, "learning_rate": 4.388253548459437e-06, "loss": 0.013, "step": 69790 }, { "grad_norm": 0.15806402266025543, "learning_rate": 4.3797902856823395e-06, "loss": 0.0122, "step": 69800 }, { "grad_norm": 0.12476009130477905, "learning_rate": 4.371334818151357e-06, "loss": 0.013, "step": 69810 }, { "grad_norm": 0.2629356384277344, "learning_rate": 4.362887147311306e-06, "loss": 0.0109, "step": 69820 }, { "grad_norm": 0.1732344776391983, "learning_rate": 4.354447274605672e-06, "loss": 0.0118, "step": 69830 }, { "grad_norm": 0.2484450489282608, "learning_rate": 4.346015201476572e-06, "loss": 0.0186, "step": 69840 }, { "grad_norm": 0.15737135708332062, "learning_rate": 4.337590929364855e-06, "loss": 0.0145, "step": 69850 }, { "grad_norm": 0.288287490606308, "learning_rate": 4.329174459709973e-06, "loss": 0.0123, "step": 69860 }, { "grad_norm": 0.1589587926864624, "learning_rate": 4.320765793950071e-06, "loss": 0.0174, "step": 69870 }, { "grad_norm": 0.3534490168094635, "learning_rate": 4.312364933521962e-06, "loss": 0.0129, "step": 69880 }, { "grad_norm": 0.15379875898361206, "learning_rate": 4.303971879861129e-06, "loss": 0.0137, "step": 69890 }, { "grad_norm": 0.30530306696891785, "learning_rate": 4.295586634401716e-06, "loss": 0.0143, "step": 69900 }, { "grad_norm": 0.15912742912769318, "learning_rate": 4.287209198576536e-06, "loss": 0.0089, "step": 69910 }, { "grad_norm": 0.1811404824256897, "learning_rate": 4.278839573817045e-06, "loss": 0.016, "step": 69920 }, { "grad_norm": 0.30601879954338074, "learning_rate": 4.270477761553399e-06, "loss": 0.0151, "step": 69930 }, { "grad_norm": 0.18430417776107788, "learning_rate": 4.262123763214393e-06, "loss": 0.0112, "step": 69940 }, { "grad_norm": 0.17681866884231567, "learning_rate": 4.253777580227508e-06, "loss": 0.0106, "step": 69950 }, { "grad_norm": 0.24783381819725037, "learning_rate": 4.2454392140188755e-06, "loss": 0.0153, "step": 69960 }, { "grad_norm": 0.21906498074531555, "learning_rate": 4.2371086660132785e-06, "loss": 0.0154, "step": 69970 }, { "grad_norm": 0.2701629400253296, "learning_rate": 4.228785937634205e-06, "loss": 0.0123, "step": 69980 }, { "grad_norm": 0.17139269411563873, "learning_rate": 4.220471030303758e-06, "loss": 0.0109, "step": 69990 }, { "grad_norm": 0.21209245920181274, "learning_rate": 4.212163945442754e-06, "loss": 0.0078, "step": 70000 }, { "grad_norm": 0.36058124899864197, "learning_rate": 4.203864684470621e-06, "loss": 0.0116, "step": 70010 }, { "grad_norm": 0.12761825323104858, "learning_rate": 4.19557324880549e-06, "loss": 0.0124, "step": 70020 }, { "grad_norm": 0.19218593835830688, "learning_rate": 4.187289639864145e-06, "loss": 0.0109, "step": 70030 }, { "grad_norm": 0.16342034935951233, "learning_rate": 4.1790138590619974e-06, "loss": 0.0136, "step": 70040 }, { "grad_norm": 0.21425364911556244, "learning_rate": 4.170745907813195e-06, "loss": 0.0163, "step": 70050 }, { "grad_norm": 0.247722789645195, "learning_rate": 4.162485787530479e-06, "loss": 0.0138, "step": 70060 }, { "grad_norm": 0.19754914939403534, "learning_rate": 4.154233499625282e-06, "loss": 0.0126, "step": 70070 }, { "grad_norm": 0.12932562828063965, "learning_rate": 4.145989045507692e-06, "loss": 0.0119, "step": 70080 }, { "grad_norm": 0.13711676001548767, "learning_rate": 4.1377524265864666e-06, "loss": 0.016, "step": 70090 }, { "grad_norm": 1.0509321689605713, "learning_rate": 4.1295236442690175e-06, "loss": 0.0149, "step": 70100 }, { "grad_norm": 0.3132372498512268, "learning_rate": 4.121302699961421e-06, "loss": 0.014, "step": 70110 }, { "grad_norm": 0.36156049370765686, "learning_rate": 4.113089595068403e-06, "loss": 0.0148, "step": 70120 }, { "grad_norm": 0.31909963488578796, "learning_rate": 4.104884330993364e-06, "loss": 0.0153, "step": 70130 }, { "grad_norm": 0.3042975664138794, "learning_rate": 4.0966869091383585e-06, "loss": 0.0166, "step": 70140 }, { "grad_norm": 0.22167131304740906, "learning_rate": 4.088497330904101e-06, "loss": 0.015, "step": 70150 }, { "grad_norm": 0.1866436004638672, "learning_rate": 4.080315597689976e-06, "loss": 0.0111, "step": 70160 }, { "grad_norm": 0.20907539129257202, "learning_rate": 4.072141710893995e-06, "loss": 0.0195, "step": 70170 }, { "grad_norm": 0.25636231899261475, "learning_rate": 4.063975671912879e-06, "loss": 0.023, "step": 70180 }, { "grad_norm": 0.24376162886619568, "learning_rate": 4.055817482141949e-06, "loss": 0.01, "step": 70190 }, { "grad_norm": 0.2637793719768524, "learning_rate": 4.047667142975259e-06, "loss": 0.0129, "step": 70200 }, { "grad_norm": 0.33189257979393005, "learning_rate": 4.039524655805443e-06, "loss": 0.0172, "step": 70210 }, { "grad_norm": 0.17709816992282867, "learning_rate": 4.03139002202384e-06, "loss": 0.0099, "step": 70220 }, { "grad_norm": 0.1480991542339325, "learning_rate": 4.023263243020447e-06, "loss": 0.0136, "step": 70230 }, { "grad_norm": 0.11466710269451141, "learning_rate": 4.015144320183884e-06, "loss": 0.0148, "step": 70240 }, { "grad_norm": 0.24549627304077148, "learning_rate": 4.007033254901482e-06, "loss": 0.0113, "step": 70250 }, { "grad_norm": 0.24312490224838257, "learning_rate": 3.9989300485591795e-06, "loss": 0.012, "step": 70260 }, { "grad_norm": 0.22752267122268677, "learning_rate": 3.990834702541601e-06, "loss": 0.0125, "step": 70270 }, { "grad_norm": 0.40264439582824707, "learning_rate": 3.982747218232019e-06, "loss": 0.0099, "step": 70280 }, { "grad_norm": 0.340343713760376, "learning_rate": 3.97466759701236e-06, "loss": 0.0157, "step": 70290 }, { "grad_norm": 0.2639731764793396, "learning_rate": 3.966595840263226e-06, "loss": 0.0124, "step": 70300 }, { "grad_norm": 0.26674774289131165, "learning_rate": 3.958531949363836e-06, "loss": 0.0122, "step": 70310 }, { "grad_norm": 0.2242213785648346, "learning_rate": 3.950475925692098e-06, "loss": 0.0156, "step": 70320 }, { "grad_norm": 0.23041044175624847, "learning_rate": 3.9424277706245685e-06, "loss": 0.0124, "step": 70330 }, { "grad_norm": 0.20741868019104004, "learning_rate": 3.934387485536451e-06, "loss": 0.0155, "step": 70340 }, { "grad_norm": 0.14919684827327728, "learning_rate": 3.926355071801619e-06, "loss": 0.0121, "step": 70350 }, { "grad_norm": 0.2570219933986664, "learning_rate": 3.9183305307925965e-06, "loss": 0.0127, "step": 70360 }, { "grad_norm": 0.22245362401008606, "learning_rate": 3.910313863880533e-06, "loss": 0.0123, "step": 70370 }, { "grad_norm": 0.11992932856082916, "learning_rate": 3.902305072435292e-06, "loss": 0.0104, "step": 70380 }, { "grad_norm": 0.19065307080745697, "learning_rate": 3.894304157825329e-06, "loss": 0.012, "step": 70390 }, { "grad_norm": 0.17733143270015717, "learning_rate": 3.886311121417791e-06, "loss": 0.0114, "step": 70400 }, { "grad_norm": 0.1612129509449005, "learning_rate": 3.878325964578472e-06, "loss": 0.0104, "step": 70410 }, { "grad_norm": 0.17824943363666534, "learning_rate": 3.870348688671815e-06, "loss": 0.0108, "step": 70420 }, { "grad_norm": 0.28142035007476807, "learning_rate": 3.862379295060931e-06, "loss": 0.0107, "step": 70430 }, { "grad_norm": 0.24203883111476898, "learning_rate": 3.854417785107534e-06, "loss": 0.0168, "step": 70440 }, { "grad_norm": 0.3255351185798645, "learning_rate": 3.8464641601720755e-06, "loss": 0.015, "step": 70450 }, { "grad_norm": 0.386902391910553, "learning_rate": 3.838518421613579e-06, "loss": 0.0203, "step": 70460 }, { "grad_norm": 0.29627618193626404, "learning_rate": 3.830580570789766e-06, "loss": 0.0201, "step": 70470 }, { "grad_norm": 0.17035090923309326, "learning_rate": 3.822650609057005e-06, "loss": 0.012, "step": 70480 }, { "grad_norm": 0.15820910036563873, "learning_rate": 3.814728537770285e-06, "loss": 0.0132, "step": 70490 }, { "grad_norm": 0.2590981423854828, "learning_rate": 3.806814358283306e-06, "loss": 0.0166, "step": 70500 }, { "grad_norm": 0.25200510025024414, "learning_rate": 3.7989080719483596e-06, "loss": 0.0097, "step": 70510 }, { "grad_norm": 0.17028118669986725, "learning_rate": 3.7910096801164143e-06, "loss": 0.015, "step": 70520 }, { "grad_norm": 0.2595033347606659, "learning_rate": 3.7831191841371016e-06, "loss": 0.0114, "step": 70530 }, { "grad_norm": 0.878629744052887, "learning_rate": 3.775236585358688e-06, "loss": 0.0192, "step": 70540 }, { "grad_norm": 0.23054994642734528, "learning_rate": 3.7673618851280843e-06, "loss": 0.015, "step": 70550 }, { "grad_norm": 0.269160658121109, "learning_rate": 3.759495084790887e-06, "loss": 0.0129, "step": 70560 }, { "grad_norm": 0.26591747999191284, "learning_rate": 3.751636185691282e-06, "loss": 0.0132, "step": 70570 }, { "grad_norm": 0.19029152393341064, "learning_rate": 3.7437851891721607e-06, "loss": 0.0125, "step": 70580 }, { "grad_norm": 0.40677422285079956, "learning_rate": 3.7359420965750404e-06, "loss": 0.0157, "step": 70590 }, { "grad_norm": 0.15216945111751556, "learning_rate": 3.7281069092400922e-06, "loss": 0.0105, "step": 70600 }, { "grad_norm": 0.25689607858657837, "learning_rate": 3.7202796285061348e-06, "loss": 0.0107, "step": 70610 }, { "grad_norm": 0.199461430311203, "learning_rate": 3.712460255710637e-06, "loss": 0.0166, "step": 70620 }, { "grad_norm": 0.22491371631622314, "learning_rate": 3.704648792189719e-06, "loss": 0.0117, "step": 70630 }, { "grad_norm": 0.2894037663936615, "learning_rate": 3.696845239278124e-06, "loss": 0.0133, "step": 70640 }, { "grad_norm": 0.2804933190345764, "learning_rate": 3.689049598309302e-06, "loss": 0.0126, "step": 70650 }, { "grad_norm": 0.5238708853721619, "learning_rate": 3.681261870615288e-06, "loss": 0.0211, "step": 70660 }, { "grad_norm": 0.2944938838481903, "learning_rate": 3.6734820575268004e-06, "loss": 0.0265, "step": 70670 }, { "grad_norm": 0.12213878333568573, "learning_rate": 3.665710160373209e-06, "loss": 0.0189, "step": 70680 }, { "grad_norm": 0.15793442726135254, "learning_rate": 3.65794618048248e-06, "loss": 0.0102, "step": 70690 }, { "grad_norm": 0.14519286155700684, "learning_rate": 3.6501901191813125e-06, "loss": 0.0114, "step": 70700 }, { "grad_norm": 0.12170840799808502, "learning_rate": 3.642441977794975e-06, "loss": 0.0094, "step": 70710 }, { "grad_norm": 0.26114577054977417, "learning_rate": 3.634701757647424e-06, "loss": 0.0114, "step": 70720 }, { "grad_norm": 0.14831861853599548, "learning_rate": 3.6269694600612468e-06, "loss": 0.0082, "step": 70730 }, { "grad_norm": 0.26712578535079956, "learning_rate": 3.619245086357681e-06, "loss": 0.0173, "step": 70740 }, { "grad_norm": 0.3512619137763977, "learning_rate": 3.611528637856615e-06, "loss": 0.0114, "step": 70750 }, { "grad_norm": 0.3430233895778656, "learning_rate": 3.6038201158765884e-06, "loss": 0.0142, "step": 70760 }, { "grad_norm": 0.6285650134086609, "learning_rate": 3.5961195217347534e-06, "loss": 0.0215, "step": 70770 }, { "grad_norm": 0.12676158547401428, "learning_rate": 3.588426856746946e-06, "loss": 0.0133, "step": 70780 }, { "grad_norm": 0.13908794522285461, "learning_rate": 3.5807421222276316e-06, "loss": 0.0224, "step": 70790 }, { "grad_norm": 0.20932024717330933, "learning_rate": 3.5730653194899156e-06, "loss": 0.012, "step": 70800 }, { "grad_norm": 0.22755639255046844, "learning_rate": 3.565396449845554e-06, "loss": 0.0109, "step": 70810 }, { "grad_norm": 0.45790377259254456, "learning_rate": 3.557735514604954e-06, "loss": 0.0146, "step": 70820 }, { "grad_norm": 0.1816926747560501, "learning_rate": 3.5500825150771633e-06, "loss": 0.0142, "step": 70830 }, { "grad_norm": 0.21023817360401154, "learning_rate": 3.5424374525698466e-06, "loss": 0.0113, "step": 70840 }, { "grad_norm": 0.2179459184408188, "learning_rate": 3.5348003283893704e-06, "loss": 0.0115, "step": 70850 }, { "grad_norm": 0.0970102995634079, "learning_rate": 3.527171143840685e-06, "loss": 0.0146, "step": 70860 }, { "grad_norm": 0.16364453732967377, "learning_rate": 3.519549900227409e-06, "loss": 0.017, "step": 70870 }, { "grad_norm": 0.4017125964164734, "learning_rate": 3.511936598851828e-06, "loss": 0.0163, "step": 70880 }, { "grad_norm": 0.18755574524402618, "learning_rate": 3.504331241014813e-06, "loss": 0.0157, "step": 70890 }, { "grad_norm": 0.2702358067035675, "learning_rate": 3.4967338280159414e-06, "loss": 0.0112, "step": 70900 }, { "grad_norm": 0.1641334593296051, "learning_rate": 3.4891443611533846e-06, "loss": 0.0166, "step": 70910 }, { "grad_norm": 0.21735206246376038, "learning_rate": 3.481562841723984e-06, "loss": 0.0175, "step": 70920 }, { "grad_norm": 0.1783340573310852, "learning_rate": 3.473989271023215e-06, "loss": 0.0104, "step": 70930 }, { "grad_norm": 0.25903886556625366, "learning_rate": 3.4664236503451854e-06, "loss": 0.0143, "step": 70940 }, { "grad_norm": 0.18361464142799377, "learning_rate": 3.4588659809826674e-06, "loss": 0.0098, "step": 70950 }, { "grad_norm": 0.3119083642959595, "learning_rate": 3.451316264227039e-06, "loss": 0.0157, "step": 70960 }, { "grad_norm": 0.20622234046459198, "learning_rate": 3.4437745013683574e-06, "loss": 0.0112, "step": 70970 }, { "grad_norm": 0.22848796844482422, "learning_rate": 3.4362406936952916e-06, "loss": 0.0143, "step": 70980 }, { "grad_norm": 0.19112226366996765, "learning_rate": 3.4287148424951786e-06, "loss": 0.0135, "step": 70990 }, { "grad_norm": 0.19034495949745178, "learning_rate": 3.4211969490539617e-06, "loss": 0.0148, "step": 71000 }, { "grad_norm": 0.22300902009010315, "learning_rate": 3.4136870146562584e-06, "loss": 0.0149, "step": 71010 }, { "grad_norm": 0.2380085438489914, "learning_rate": 3.406185040585308e-06, "loss": 0.0121, "step": 71020 }, { "grad_norm": 0.26274776458740234, "learning_rate": 3.3986910281229966e-06, "loss": 0.0155, "step": 71030 }, { "grad_norm": 0.2792733609676361, "learning_rate": 3.391204978549828e-06, "loss": 0.0137, "step": 71040 }, { "grad_norm": 0.23792937397956848, "learning_rate": 3.3837268931449785e-06, "loss": 0.019, "step": 71050 }, { "grad_norm": 0.23192338645458221, "learning_rate": 3.376256773186248e-06, "loss": 0.0143, "step": 71060 }, { "grad_norm": 0.19061866402626038, "learning_rate": 3.3687946199500664e-06, "loss": 0.0214, "step": 71070 }, { "grad_norm": 0.16156286001205444, "learning_rate": 3.3613404347115295e-06, "loss": 0.0102, "step": 71080 }, { "grad_norm": 0.10895701497793198, "learning_rate": 3.35389421874433e-06, "loss": 0.0079, "step": 71090 }, { "grad_norm": 0.10470033437013626, "learning_rate": 3.3464559733208446e-06, "loss": 0.0146, "step": 71100 }, { "grad_norm": 0.33695188164711, "learning_rate": 3.3390256997120505e-06, "loss": 0.0173, "step": 71110 }, { "grad_norm": 0.2660501301288605, "learning_rate": 3.331603399187583e-06, "loss": 0.0132, "step": 71120 }, { "grad_norm": 0.13225455582141876, "learning_rate": 3.324189073015721e-06, "loss": 0.0105, "step": 71130 }, { "grad_norm": 0.2159639596939087, "learning_rate": 3.3167827224633465e-06, "loss": 0.014, "step": 71140 }, { "grad_norm": 0.26726871728897095, "learning_rate": 3.309384348796024e-06, "loss": 0.0167, "step": 71150 }, { "grad_norm": 0.24125750362873077, "learning_rate": 3.301993953277921e-06, "loss": 0.0144, "step": 71160 }, { "grad_norm": 0.32877984642982483, "learning_rate": 3.29461153717186e-06, "loss": 0.012, "step": 71170 }, { "grad_norm": 0.26316237449645996, "learning_rate": 3.287237101739293e-06, "loss": 0.0162, "step": 71180 }, { "grad_norm": 0.37003204226493835, "learning_rate": 3.2798706482403075e-06, "loss": 0.0194, "step": 71190 }, { "grad_norm": 0.3388337790966034, "learning_rate": 3.2725121779336285e-06, "loss": 0.0141, "step": 71200 }, { "grad_norm": 0.30173563957214355, "learning_rate": 3.265161692076618e-06, "loss": 0.0183, "step": 71210 }, { "grad_norm": 0.118876151740551, "learning_rate": 3.257819191925282e-06, "loss": 0.0101, "step": 71220 }, { "grad_norm": 0.24605681002140045, "learning_rate": 3.2504846787342392e-06, "loss": 0.0124, "step": 71230 }, { "grad_norm": 0.23858505487442017, "learning_rate": 3.243158153756759e-06, "loss": 0.0125, "step": 71240 }, { "grad_norm": 0.11572329699993134, "learning_rate": 3.2358396182447514e-06, "loss": 0.0101, "step": 71250 }, { "grad_norm": 0.20344175398349762, "learning_rate": 3.2285290734487496e-06, "loss": 0.0116, "step": 71260 }, { "grad_norm": 0.17114244401454926, "learning_rate": 3.221226520617926e-06, "loss": 0.0158, "step": 71270 }, { "grad_norm": 0.21955706179141998, "learning_rate": 3.213931961000094e-06, "loss": 0.0186, "step": 71280 }, { "grad_norm": 0.28326138854026794, "learning_rate": 3.2066453958416733e-06, "loss": 0.0091, "step": 71290 }, { "grad_norm": 0.29483532905578613, "learning_rate": 3.199366826387773e-06, "loss": 0.0149, "step": 71300 }, { "grad_norm": 0.2642099857330322, "learning_rate": 3.192096253882071e-06, "loss": 0.0145, "step": 71310 }, { "grad_norm": 0.2594606280326843, "learning_rate": 3.1848336795669177e-06, "loss": 0.0133, "step": 71320 }, { "grad_norm": 0.11798039078712463, "learning_rate": 3.1775791046833035e-06, "loss": 0.011, "step": 71330 }, { "grad_norm": 0.14424222707748413, "learning_rate": 3.170332530470804e-06, "loss": 0.0124, "step": 71340 }, { "grad_norm": 0.36826595664024353, "learning_rate": 3.1630939581677012e-06, "loss": 0.0182, "step": 71350 }, { "grad_norm": 0.1927110105752945, "learning_rate": 3.155863389010838e-06, "loss": 0.0142, "step": 71360 }, { "grad_norm": 0.1448354870080948, "learning_rate": 3.1486408242357323e-06, "loss": 0.0145, "step": 71370 }, { "grad_norm": 0.5245060324668884, "learning_rate": 3.1414262650765248e-06, "loss": 0.0281, "step": 71380 }, { "grad_norm": 0.2858167588710785, "learning_rate": 3.134219712765979e-06, "loss": 0.0168, "step": 71390 }, { "grad_norm": 0.1585555225610733, "learning_rate": 3.1270211685355044e-06, "loss": 0.0142, "step": 71400 }, { "grad_norm": 0.23550371825695038, "learning_rate": 3.1198306336151338e-06, "loss": 0.013, "step": 71410 }, { "grad_norm": 0.27410808205604553, "learning_rate": 3.11264810923354e-06, "loss": 0.0138, "step": 71420 }, { "grad_norm": 0.10168591886758804, "learning_rate": 3.105473596618008e-06, "loss": 0.009, "step": 71430 }, { "grad_norm": 0.20741838216781616, "learning_rate": 3.0983070969944683e-06, "loss": 0.0116, "step": 71440 }, { "grad_norm": 0.626613438129425, "learning_rate": 3.09114861158748e-06, "loss": 0.0119, "step": 71450 }, { "grad_norm": 0.13755325973033905, "learning_rate": 3.0839981416202314e-06, "loss": 0.0113, "step": 71460 }, { "grad_norm": 0.2142248898744583, "learning_rate": 3.0768556883145507e-06, "loss": 0.0111, "step": 71470 }, { "grad_norm": 0.19751961529254913, "learning_rate": 3.0697212528908834e-06, "loss": 0.0118, "step": 71480 }, { "grad_norm": 0.2547900378704071, "learning_rate": 3.0625948365683e-06, "loss": 0.0137, "step": 71490 }, { "grad_norm": 0.16956089437007904, "learning_rate": 3.055476440564531e-06, "loss": 0.0168, "step": 71500 }, { "grad_norm": 0.19903181493282318, "learning_rate": 3.0483660660958924e-06, "loss": 0.0135, "step": 71510 }, { "grad_norm": 0.35729318857192993, "learning_rate": 3.0412637143773624e-06, "loss": 0.0191, "step": 71520 }, { "grad_norm": 0.23558735847473145, "learning_rate": 3.034169386622554e-06, "loss": 0.0103, "step": 71530 }, { "grad_norm": 0.17359481751918793, "learning_rate": 3.027083084043658e-06, "loss": 0.0118, "step": 71540 }, { "grad_norm": 0.18787522614002228, "learning_rate": 3.0200048078515676e-06, "loss": 0.0134, "step": 71550 }, { "grad_norm": 0.2454688400030136, "learning_rate": 3.012934559255742e-06, "loss": 0.0124, "step": 71560 }, { "grad_norm": 0.24529512226581573, "learning_rate": 3.0058723394642994e-06, "loss": 0.0149, "step": 71570 }, { "grad_norm": 0.2130126804113388, "learning_rate": 2.998818149683985e-06, "loss": 0.0189, "step": 71580 }, { "grad_norm": 0.2111039161682129, "learning_rate": 2.9917719911201627e-06, "loss": 0.0135, "step": 71590 }, { "grad_norm": 0.2610170841217041, "learning_rate": 2.9847338649768352e-06, "loss": 0.0131, "step": 71600 }, { "grad_norm": 0.3330453932285309, "learning_rate": 2.977703772456608e-06, "loss": 0.0142, "step": 71610 }, { "grad_norm": 0.22303937375545502, "learning_rate": 2.9706817147607535e-06, "loss": 0.0122, "step": 71620 }, { "grad_norm": 0.27307504415512085, "learning_rate": 2.9636676930891393e-06, "loss": 0.0133, "step": 71630 }, { "grad_norm": 0.27647313475608826, "learning_rate": 2.9566617086402625e-06, "loss": 0.0123, "step": 71640 }, { "grad_norm": 0.2349361628293991, "learning_rate": 2.9496637626112655e-06, "loss": 0.0122, "step": 71650 }, { "grad_norm": 0.3256811499595642, "learning_rate": 2.9426738561979027e-06, "loss": 0.0117, "step": 71660 }, { "grad_norm": 0.38627493381500244, "learning_rate": 2.9356919905945524e-06, "loss": 0.0276, "step": 71670 }, { "grad_norm": 0.19528962671756744, "learning_rate": 2.928718166994243e-06, "loss": 0.0133, "step": 71680 }, { "grad_norm": 0.4982285499572754, "learning_rate": 2.9217523865885833e-06, "loss": 0.0141, "step": 71690 }, { "grad_norm": 0.22180819511413574, "learning_rate": 2.9147946505678537e-06, "loss": 0.0138, "step": 71700 }, { "grad_norm": 0.10838650166988373, "learning_rate": 2.9078449601209313e-06, "loss": 0.011, "step": 71710 }, { "grad_norm": 0.3769826889038086, "learning_rate": 2.900903316435333e-06, "loss": 0.0138, "step": 71720 }, { "grad_norm": 0.280381441116333, "learning_rate": 2.8939697206971983e-06, "loss": 0.0143, "step": 71730 }, { "grad_norm": 0.22823458909988403, "learning_rate": 2.8870441740912746e-06, "loss": 0.0138, "step": 71740 }, { "grad_norm": 0.239036425948143, "learning_rate": 2.8801266778009762e-06, "loss": 0.0145, "step": 71750 }, { "grad_norm": 0.18229064345359802, "learning_rate": 2.8732172330082798e-06, "loss": 0.0173, "step": 71760 }, { "grad_norm": 0.17181065678596497, "learning_rate": 2.8663158408938517e-06, "loss": 0.015, "step": 71770 }, { "grad_norm": 0.1710011214017868, "learning_rate": 2.859422502636938e-06, "loss": 0.0124, "step": 71780 }, { "grad_norm": 0.23944513499736786, "learning_rate": 2.8525372194154076e-06, "loss": 0.0108, "step": 71790 }, { "grad_norm": 0.5019873380661011, "learning_rate": 2.8456599924057913e-06, "loss": 0.013, "step": 71800 }, { "grad_norm": 0.12433643639087677, "learning_rate": 2.8387908227831995e-06, "loss": 0.0136, "step": 71810 }, { "grad_norm": 0.13967765867710114, "learning_rate": 2.831929711721404e-06, "loss": 0.0134, "step": 71820 }, { "grad_norm": 0.2840562164783478, "learning_rate": 2.825076660392767e-06, "loss": 0.0136, "step": 71830 }, { "grad_norm": 0.12350259721279144, "learning_rate": 2.8182316699682908e-06, "loss": 0.0202, "step": 71840 }, { "grad_norm": 0.11882294714450836, "learning_rate": 2.811394741617601e-06, "loss": 0.0134, "step": 71850 }, { "grad_norm": 0.4142855703830719, "learning_rate": 2.8045658765089356e-06, "loss": 0.015, "step": 71860 }, { "grad_norm": 0.237574964761734, "learning_rate": 2.7977450758091605e-06, "loss": 0.0189, "step": 71870 }, { "grad_norm": 0.20965337753295898, "learning_rate": 2.790932340683783e-06, "loss": 0.015, "step": 71880 }, { "grad_norm": 0.25238552689552307, "learning_rate": 2.7841276722968823e-06, "loss": 0.0127, "step": 71890 }, { "grad_norm": 0.4354785084724426, "learning_rate": 2.7773310718112067e-06, "loss": 0.0132, "step": 71900 }, { "grad_norm": 0.21102453768253326, "learning_rate": 2.7705425403881102e-06, "loss": 0.0132, "step": 71910 }, { "grad_norm": 0.715911328792572, "learning_rate": 2.763762079187565e-06, "loss": 0.0172, "step": 71920 }, { "grad_norm": 0.13742054998874664, "learning_rate": 2.756989689368178e-06, "loss": 0.0127, "step": 71930 }, { "grad_norm": 0.2582387328147888, "learning_rate": 2.750225372087134e-06, "loss": 0.0154, "step": 71940 }, { "grad_norm": 0.34126630425453186, "learning_rate": 2.7434691285003033e-06, "loss": 0.0102, "step": 71950 }, { "grad_norm": 0.2328789085149765, "learning_rate": 2.7367209597621178e-06, "loss": 0.0153, "step": 71960 }, { "grad_norm": 0.18274006247520447, "learning_rate": 2.729980867025683e-06, "loss": 0.0169, "step": 71970 }, { "grad_norm": 0.2900982201099396, "learning_rate": 2.7232488514426724e-06, "loss": 0.0122, "step": 71980 }, { "grad_norm": 0.16520270705223083, "learning_rate": 2.7165249141634097e-06, "loss": 0.0105, "step": 71990 }, { "grad_norm": 0.21856184303760529, "learning_rate": 2.709809056336837e-06, "loss": 0.0132, "step": 72000 }, { "grad_norm": 0.23336289823055267, "learning_rate": 2.703101279110498e-06, "loss": 0.0128, "step": 72010 }, { "grad_norm": 0.31290432810783386, "learning_rate": 2.696401583630587e-06, "loss": 0.0112, "step": 72020 }, { "grad_norm": 0.16092449426651, "learning_rate": 2.689709971041887e-06, "loss": 0.011, "step": 72030 }, { "grad_norm": 0.2492312341928482, "learning_rate": 2.6830264424878126e-06, "loss": 0.01, "step": 72040 }, { "grad_norm": 0.20137913525104523, "learning_rate": 2.676350999110394e-06, "loss": 0.0212, "step": 72050 }, { "grad_norm": 0.16601938009262085, "learning_rate": 2.6696836420502856e-06, "loss": 0.0201, "step": 72060 }, { "grad_norm": 0.17991921305656433, "learning_rate": 2.663024372446765e-06, "loss": 0.0123, "step": 72070 }, { "grad_norm": 0.23048990964889526, "learning_rate": 2.6563731914377055e-06, "loss": 0.0099, "step": 72080 }, { "grad_norm": 0.2475748509168625, "learning_rate": 2.6497301001596087e-06, "loss": 0.0166, "step": 72090 }, { "grad_norm": 0.3310548961162567, "learning_rate": 2.643095099747611e-06, "loss": 0.0154, "step": 72100 }, { "grad_norm": 0.11723431944847107, "learning_rate": 2.636468191335445e-06, "loss": 0.0139, "step": 72110 }, { "grad_norm": 0.21513809263706207, "learning_rate": 2.629849376055471e-06, "loss": 0.0177, "step": 72120 }, { "grad_norm": 0.2095523625612259, "learning_rate": 2.6232386550386678e-06, "loss": 0.0113, "step": 72130 }, { "grad_norm": 0.2141009271144867, "learning_rate": 2.6166360294146097e-06, "loss": 0.0124, "step": 72140 }, { "grad_norm": 0.3337722420692444, "learning_rate": 2.6100415003115275e-06, "loss": 0.0181, "step": 72150 }, { "grad_norm": 0.2256215512752533, "learning_rate": 2.603455068856225e-06, "loss": 0.016, "step": 72160 }, { "grad_norm": 0.15390445291996002, "learning_rate": 2.5968767361741584e-06, "loss": 0.014, "step": 72170 }, { "grad_norm": 0.19927307963371277, "learning_rate": 2.590306503389378e-06, "loss": 0.0158, "step": 72180 }, { "grad_norm": 0.16932570934295654, "learning_rate": 2.583744371624558e-06, "loss": 0.0179, "step": 72190 }, { "grad_norm": 0.26290568709373474, "learning_rate": 2.57719034200099e-06, "loss": 0.0115, "step": 72200 }, { "grad_norm": 0.2577226459980011, "learning_rate": 2.570644415638568e-06, "loss": 0.013, "step": 72210 }, { "grad_norm": 0.34210869669914246, "learning_rate": 2.5641065936558295e-06, "loss": 0.0152, "step": 72220 }, { "grad_norm": 0.4596814215183258, "learning_rate": 2.557576877169898e-06, "loss": 0.0123, "step": 72230 }, { "grad_norm": 0.20560719072818756, "learning_rate": 2.5510552672965205e-06, "loss": 0.0091, "step": 72240 }, { "grad_norm": 0.19459731876850128, "learning_rate": 2.544541765150077e-06, "loss": 0.0171, "step": 72250 }, { "grad_norm": 0.18104222416877747, "learning_rate": 2.5380363718435165e-06, "loss": 0.0137, "step": 72260 }, { "grad_norm": 0.1629362851381302, "learning_rate": 2.5315390884884714e-06, "loss": 0.0101, "step": 72270 }, { "grad_norm": 0.2557613253593445, "learning_rate": 2.5250499161951214e-06, "loss": 0.015, "step": 72280 }, { "grad_norm": 0.2239384949207306, "learning_rate": 2.518568856072301e-06, "loss": 0.0112, "step": 72290 }, { "grad_norm": 0.1767413467168808, "learning_rate": 2.512095909227441e-06, "loss": 0.0135, "step": 72300 }, { "grad_norm": 0.23917914927005768, "learning_rate": 2.50563107676659e-06, "loss": 0.0134, "step": 72310 }, { "grad_norm": 0.14462526142597198, "learning_rate": 2.4991743597944152e-06, "loss": 0.0135, "step": 72320 }, { "grad_norm": 0.18731524050235748, "learning_rate": 2.492725759414205e-06, "loss": 0.0149, "step": 72330 }, { "grad_norm": 0.4916157126426697, "learning_rate": 2.4862852767278234e-06, "loss": 0.0087, "step": 72340 }, { "grad_norm": 0.21618980169296265, "learning_rate": 2.479852912835784e-06, "loss": 0.0131, "step": 72350 }, { "grad_norm": 0.2235795259475708, "learning_rate": 2.4734286688372075e-06, "loss": 0.0141, "step": 72360 }, { "grad_norm": 0.21829646825790405, "learning_rate": 2.467012545829811e-06, "loss": 0.0144, "step": 72370 }, { "grad_norm": 0.22250598669052124, "learning_rate": 2.460604544909945e-06, "loss": 0.0102, "step": 72380 }, { "grad_norm": 0.30422285199165344, "learning_rate": 2.454204667172555e-06, "loss": 0.0153, "step": 72390 }, { "grad_norm": 0.20552682876586914, "learning_rate": 2.447812913711217e-06, "loss": 0.0118, "step": 72400 }, { "grad_norm": 0.2811253070831299, "learning_rate": 2.44142928561808e-06, "loss": 0.0172, "step": 72410 }, { "grad_norm": 0.24101382493972778, "learning_rate": 2.435053783983965e-06, "loss": 0.0113, "step": 72420 }, { "grad_norm": 0.13723722100257874, "learning_rate": 2.4286864098982453e-06, "loss": 0.0134, "step": 72430 }, { "grad_norm": 0.12886619567871094, "learning_rate": 2.4223271644489397e-06, "loss": 0.016, "step": 72440 }, { "grad_norm": 0.15406441688537598, "learning_rate": 2.415976048722679e-06, "loss": 0.0113, "step": 72450 }, { "grad_norm": 0.17950895428657532, "learning_rate": 2.4096330638046673e-06, "loss": 0.008, "step": 72460 }, { "grad_norm": 0.20680001378059387, "learning_rate": 2.4032982107787816e-06, "loss": 0.0128, "step": 72470 }, { "grad_norm": 0.2228514403104782, "learning_rate": 2.396971490727451e-06, "loss": 0.0155, "step": 72480 }, { "grad_norm": 0.21077178418636322, "learning_rate": 2.3906529047317493e-06, "loss": 0.0122, "step": 72490 }, { "grad_norm": 0.13493745028972626, "learning_rate": 2.3843424538713465e-06, "loss": 0.0103, "step": 72500 }, { "grad_norm": 0.25706422328948975, "learning_rate": 2.3780401392245298e-06, "loss": 0.0123, "step": 72510 }, { "grad_norm": 0.170322448015213, "learning_rate": 2.3717459618681883e-06, "loss": 0.0149, "step": 72520 }, { "grad_norm": 0.18501915037631989, "learning_rate": 2.3654599228778274e-06, "loss": 0.0122, "step": 72530 }, { "grad_norm": 0.28049030900001526, "learning_rate": 2.3591820233275607e-06, "loss": 0.0171, "step": 72540 }, { "grad_norm": 0.20598365366458893, "learning_rate": 2.3529122642901024e-06, "loss": 0.0165, "step": 72550 }, { "grad_norm": 0.1931799054145813, "learning_rate": 2.3466506468367845e-06, "loss": 0.0213, "step": 72560 }, { "grad_norm": 0.3537050783634186, "learning_rate": 2.3403971720375506e-06, "loss": 0.0117, "step": 72570 }, { "grad_norm": 0.2502991855144501, "learning_rate": 2.334151840960952e-06, "loss": 0.0092, "step": 72580 }, { "grad_norm": 0.14789605140686035, "learning_rate": 2.3279146546741347e-06, "loss": 0.0114, "step": 72590 }, { "grad_norm": 0.18720290064811707, "learning_rate": 2.32168561424288e-06, "loss": 0.0118, "step": 72600 }, { "grad_norm": 0.49592363834381104, "learning_rate": 2.3154647207315307e-06, "loss": 0.0165, "step": 72610 }, { "grad_norm": 0.20578815042972565, "learning_rate": 2.309251975203103e-06, "loss": 0.0155, "step": 72620 }, { "grad_norm": 0.2770591676235199, "learning_rate": 2.303047378719159e-06, "loss": 0.0144, "step": 72630 }, { "grad_norm": 0.24629361927509308, "learning_rate": 2.2968509323399e-06, "loss": 0.0149, "step": 72640 }, { "grad_norm": 0.18472671508789062, "learning_rate": 2.290662637124147e-06, "loss": 0.0119, "step": 72650 }, { "grad_norm": 0.22276756167411804, "learning_rate": 2.2844824941292807e-06, "loss": 0.0087, "step": 72660 }, { "grad_norm": 0.3011378347873688, "learning_rate": 2.2783105044113406e-06, "loss": 0.0138, "step": 72670 }, { "grad_norm": 0.32518184185028076, "learning_rate": 2.272146669024944e-06, "loss": 0.0108, "step": 72680 }, { "grad_norm": 0.1590735763311386, "learning_rate": 2.26599098902332e-06, "loss": 0.0124, "step": 72690 }, { "grad_norm": 0.24796785414218903, "learning_rate": 2.2598434654583113e-06, "loss": 0.0137, "step": 72700 }, { "grad_norm": 0.3036697506904602, "learning_rate": 2.253704099380355e-06, "loss": 0.0189, "step": 72710 }, { "grad_norm": 0.2751617431640625, "learning_rate": 2.247572891838512e-06, "loss": 0.0149, "step": 72720 }, { "grad_norm": 0.26018282771110535, "learning_rate": 2.241449843880422e-06, "loss": 0.016, "step": 72730 }, { "grad_norm": 0.07197724282741547, "learning_rate": 2.235334956552354e-06, "loss": 0.0137, "step": 72740 }, { "grad_norm": 0.6602910757064819, "learning_rate": 2.2292282308991775e-06, "loss": 0.0152, "step": 72750 }, { "grad_norm": 0.19461296498775482, "learning_rate": 2.223129667964363e-06, "loss": 0.0134, "step": 72760 }, { "grad_norm": 0.33705300092697144, "learning_rate": 2.2170392687899834e-06, "loss": 0.0126, "step": 72770 }, { "grad_norm": 0.16027890145778656, "learning_rate": 2.210957034416733e-06, "loss": 0.016, "step": 72780 }, { "grad_norm": 0.28257685899734497, "learning_rate": 2.2048829658838867e-06, "loss": 0.0166, "step": 72790 }, { "grad_norm": 0.13633769750595093, "learning_rate": 2.1988170642293525e-06, "loss": 0.0103, "step": 72800 }, { "grad_norm": 0.17822928726673126, "learning_rate": 2.1927593304896075e-06, "loss": 0.0113, "step": 72810 }, { "grad_norm": 0.1346299797296524, "learning_rate": 2.1867097656997626e-06, "loss": 0.0126, "step": 72820 }, { "grad_norm": 0.20572972297668457, "learning_rate": 2.180668370893524e-06, "loss": 0.0138, "step": 72830 }, { "grad_norm": 0.3233377933502197, "learning_rate": 2.174635147103199e-06, "loss": 0.0125, "step": 72840 }, { "grad_norm": 0.18362535536289215, "learning_rate": 2.1686100953597075e-06, "loss": 0.0156, "step": 72850 }, { "grad_norm": 0.2358720600605011, "learning_rate": 2.1625932166925433e-06, "loss": 0.0107, "step": 72860 }, { "grad_norm": 0.17612311244010925, "learning_rate": 2.1565845121298556e-06, "loss": 0.0136, "step": 72870 }, { "grad_norm": 0.25030913949012756, "learning_rate": 2.150583982698351e-06, "loss": 0.0161, "step": 72880 }, { "grad_norm": 0.11541236191987991, "learning_rate": 2.144591629423359e-06, "loss": 0.0099, "step": 72890 }, { "grad_norm": 0.161396324634552, "learning_rate": 2.138607453328817e-06, "loss": 0.0149, "step": 72900 }, { "grad_norm": 0.2280549556016922, "learning_rate": 2.132631455437234e-06, "loss": 0.0131, "step": 72910 }, { "grad_norm": 0.2626156210899353, "learning_rate": 2.1266636367697714e-06, "loss": 0.021, "step": 72920 }, { "grad_norm": 0.2726728618144989, "learning_rate": 2.120703998346152e-06, "loss": 0.0128, "step": 72930 }, { "grad_norm": 0.2780972719192505, "learning_rate": 2.1147525411847114e-06, "loss": 0.0145, "step": 72940 }, { "grad_norm": 0.17888878285884857, "learning_rate": 2.108809266302403e-06, "loss": 0.0138, "step": 72950 }, { "grad_norm": 0.27914804220199585, "learning_rate": 2.102874174714764e-06, "loss": 0.0151, "step": 72960 }, { "grad_norm": 0.2176283746957779, "learning_rate": 2.0969472674359393e-06, "loss": 0.0118, "step": 72970 }, { "grad_norm": 0.09795217961072922, "learning_rate": 2.091028545478668e-06, "loss": 0.012, "step": 72980 }, { "grad_norm": 0.1855398714542389, "learning_rate": 2.08511800985432e-06, "loss": 0.0106, "step": 72990 }, { "grad_norm": 0.16196830570697784, "learning_rate": 2.07921566157282e-06, "loss": 0.0213, "step": 73000 }, { "grad_norm": 0.1976282298564911, "learning_rate": 2.073321501642728e-06, "loss": 0.0121, "step": 73010 }, { "grad_norm": 0.10969007760286331, "learning_rate": 2.0674355310711937e-06, "loss": 0.014, "step": 73020 }, { "grad_norm": 0.17679770290851593, "learning_rate": 2.0615577508639682e-06, "loss": 0.0089, "step": 73030 }, { "grad_norm": 0.10032123327255249, "learning_rate": 2.055688162025399e-06, "loss": 0.0107, "step": 73040 }, { "grad_norm": 0.1846601516008377, "learning_rate": 2.0498267655584546e-06, "loss": 0.0149, "step": 73050 }, { "grad_norm": 0.21044231951236725, "learning_rate": 2.0439735624646626e-06, "loss": 0.0133, "step": 73060 }, { "grad_norm": 0.07941550761461258, "learning_rate": 2.0381285537442008e-06, "loss": 0.0086, "step": 73070 }, { "grad_norm": 0.20176447927951813, "learning_rate": 2.032291740395803e-06, "loss": 0.0131, "step": 73080 }, { "grad_norm": 0.3791986107826233, "learning_rate": 2.0264631234168276e-06, "loss": 0.017, "step": 73090 }, { "grad_norm": 0.24032999575138092, "learning_rate": 2.0206427038032326e-06, "loss": 0.0186, "step": 73100 }, { "grad_norm": 0.2550540566444397, "learning_rate": 2.0148304825495457e-06, "loss": 0.0106, "step": 73110 }, { "grad_norm": 0.1967051774263382, "learning_rate": 2.0090264606489496e-06, "loss": 0.0112, "step": 73120 }, { "grad_norm": 0.37933632731437683, "learning_rate": 2.003230639093162e-06, "loss": 0.0144, "step": 73130 }, { "grad_norm": 0.3000602126121521, "learning_rate": 1.9974430188725525e-06, "loss": 0.0224, "step": 73140 }, { "grad_norm": 0.17368349432945251, "learning_rate": 1.9916636009760513e-06, "loss": 0.0154, "step": 73150 }, { "grad_norm": 0.12139259278774261, "learning_rate": 1.985892386391208e-06, "loss": 0.0171, "step": 73160 }, { "grad_norm": 0.24395465850830078, "learning_rate": 1.980129376104173e-06, "loss": 0.0116, "step": 73170 }, { "grad_norm": 0.18993422389030457, "learning_rate": 1.9743745710996796e-06, "loss": 0.0105, "step": 73180 }, { "grad_norm": 0.19643931090831757, "learning_rate": 1.9686279723610757e-06, "loss": 0.0139, "step": 73190 }, { "grad_norm": 0.1809406578540802, "learning_rate": 1.962889580870281e-06, "loss": 0.0108, "step": 73200 }, { "grad_norm": 0.16401049494743347, "learning_rate": 1.9571593976078385e-06, "loss": 0.0152, "step": 73210 }, { "grad_norm": 0.2548041045665741, "learning_rate": 1.9514374235528876e-06, "loss": 0.0124, "step": 73220 }, { "grad_norm": 0.24407035112380981, "learning_rate": 1.9457236596831408e-06, "loss": 0.009, "step": 73230 }, { "grad_norm": 0.25596120953559875, "learning_rate": 1.940018106974939e-06, "loss": 0.0145, "step": 73240 }, { "grad_norm": 0.22910039126873016, "learning_rate": 1.9343207664032025e-06, "loss": 0.0104, "step": 73250 }, { "grad_norm": 0.17415444552898407, "learning_rate": 1.928631638941436e-06, "loss": 0.0113, "step": 73260 }, { "grad_norm": 0.22226284444332123, "learning_rate": 1.922950725561784e-06, "loss": 0.0123, "step": 73270 }, { "grad_norm": 0.26650986075401306, "learning_rate": 1.9172780272349312e-06, "loss": 0.0102, "step": 73280 }, { "grad_norm": 0.15930934250354767, "learning_rate": 1.911613544930202e-06, "loss": 0.0141, "step": 73290 }, { "grad_norm": 0.20373032987117767, "learning_rate": 1.9059572796155e-06, "loss": 0.0116, "step": 73300 }, { "grad_norm": 0.2042316347360611, "learning_rate": 1.9003092322573192e-06, "loss": 0.0128, "step": 73310 }, { "grad_norm": 0.25393566489219666, "learning_rate": 1.8946694038207647e-06, "loss": 0.0138, "step": 73320 }, { "grad_norm": 0.2632872462272644, "learning_rate": 1.8890377952695215e-06, "loss": 0.0118, "step": 73330 }, { "grad_norm": 0.14908044040203094, "learning_rate": 1.8834144075658865e-06, "loss": 0.0104, "step": 73340 }, { "grad_norm": 0.34159019589424133, "learning_rate": 1.8777992416707357e-06, "loss": 0.0124, "step": 73350 }, { "grad_norm": 0.18863536417484283, "learning_rate": 1.8721922985435458e-06, "loss": 0.0107, "step": 73360 }, { "grad_norm": 0.35071855783462524, "learning_rate": 1.8665935791424006e-06, "loss": 0.0213, "step": 73370 }, { "grad_norm": 0.1918717324733734, "learning_rate": 1.8610030844239512e-06, "loss": 0.0106, "step": 73380 }, { "grad_norm": 0.18379053473472595, "learning_rate": 1.8554208153434838e-06, "loss": 0.0134, "step": 73390 }, { "grad_norm": 0.11906290799379349, "learning_rate": 1.8498467728548296e-06, "loss": 0.0115, "step": 73400 }, { "grad_norm": 0.24198777973651886, "learning_rate": 1.8442809579104547e-06, "loss": 0.0113, "step": 73410 }, { "grad_norm": 0.33750537037849426, "learning_rate": 1.8387233714614039e-06, "loss": 0.0151, "step": 73420 }, { "grad_norm": 0.14454945921897888, "learning_rate": 1.8331740144573174e-06, "loss": 0.0215, "step": 73430 }, { "grad_norm": 0.30115634202957153, "learning_rate": 1.8276328878464255e-06, "loss": 0.0171, "step": 73440 }, { "grad_norm": 0.49868419766426086, "learning_rate": 1.82209999257556e-06, "loss": 0.0202, "step": 73450 }, { "grad_norm": 0.159150630235672, "learning_rate": 1.8165753295901311e-06, "loss": 0.0114, "step": 73460 }, { "grad_norm": 0.1892177164554596, "learning_rate": 1.8110588998341616e-06, "loss": 0.0136, "step": 73470 }, { "grad_norm": 0.18255509436130524, "learning_rate": 1.805550704250253e-06, "loss": 0.0187, "step": 73480 }, { "grad_norm": 0.20942817628383636, "learning_rate": 1.8000507437796077e-06, "loss": 0.0118, "step": 73490 }, { "grad_norm": 0.24934715032577515, "learning_rate": 1.7945590193620242e-06, "loss": 0.0131, "step": 73500 }, { "grad_norm": 0.18434929847717285, "learning_rate": 1.7890755319358742e-06, "loss": 0.0116, "step": 73510 }, { "grad_norm": 0.2762903571128845, "learning_rate": 1.7836002824381525e-06, "loss": 0.0169, "step": 73520 }, { "grad_norm": 0.17903339862823486, "learning_rate": 1.7781332718044165e-06, "loss": 0.0115, "step": 73530 }, { "grad_norm": 0.25505515933036804, "learning_rate": 1.772674500968835e-06, "loss": 0.013, "step": 73540 }, { "grad_norm": 0.2558244466781616, "learning_rate": 1.7672239708641624e-06, "loss": 0.0138, "step": 73550 }, { "grad_norm": 0.16699793934822083, "learning_rate": 1.7617816824217371e-06, "loss": 0.0126, "step": 73560 }, { "grad_norm": 0.23300297558307648, "learning_rate": 1.7563476365715148e-06, "loss": 0.0196, "step": 73570 }, { "grad_norm": 0.1398376226425171, "learning_rate": 1.750921834242003e-06, "loss": 0.0163, "step": 73580 }, { "grad_norm": 0.33017462491989136, "learning_rate": 1.7455042763603436e-06, "loss": 0.0191, "step": 73590 }, { "grad_norm": 0.1896389275789261, "learning_rate": 1.7400949638522345e-06, "loss": 0.0106, "step": 73600 }, { "grad_norm": 0.1672956794500351, "learning_rate": 1.734693897641987e-06, "loss": 0.0254, "step": 73610 }, { "grad_norm": 0.26871123909950256, "learning_rate": 1.7293010786524955e-06, "loss": 0.0117, "step": 73620 }, { "grad_norm": 0.19817054271697998, "learning_rate": 1.72391650780524e-06, "loss": 0.0142, "step": 73630 }, { "grad_norm": 0.259185254573822, "learning_rate": 1.718540186020301e-06, "loss": 0.0155, "step": 73640 }, { "grad_norm": 0.13979563117027283, "learning_rate": 1.7131721142163437e-06, "loss": 0.0162, "step": 73650 }, { "grad_norm": 0.2544076442718506, "learning_rate": 1.7078122933106233e-06, "loss": 0.0172, "step": 73660 }, { "grad_norm": 0.16418804228305817, "learning_rate": 1.7024607242189905e-06, "loss": 0.0158, "step": 73670 }, { "grad_norm": 0.1999898999929428, "learning_rate": 1.6971174078558749e-06, "loss": 0.0099, "step": 73680 }, { "grad_norm": 0.17247603833675385, "learning_rate": 1.6917823451343073e-06, "loss": 0.0141, "step": 73690 }, { "grad_norm": 0.33970698714256287, "learning_rate": 1.6864555369659142e-06, "loss": 0.0131, "step": 73700 }, { "grad_norm": 0.20303428173065186, "learning_rate": 1.681136984260878e-06, "loss": 0.0151, "step": 73710 }, { "grad_norm": 0.19836591184139252, "learning_rate": 1.6758266879280172e-06, "loss": 0.0143, "step": 73720 }, { "grad_norm": 0.40460917353630066, "learning_rate": 1.6705246488746996e-06, "loss": 0.0135, "step": 73730 }, { "grad_norm": 0.21954560279846191, "learning_rate": 1.6652308680069062e-06, "loss": 0.011, "step": 73740 }, { "grad_norm": 0.3032427728176117, "learning_rate": 1.6599453462292081e-06, "loss": 0.0109, "step": 73750 }, { "grad_norm": 0.6857705116271973, "learning_rate": 1.6546680844447326e-06, "loss": 0.0195, "step": 73760 }, { "grad_norm": 0.16226109862327576, "learning_rate": 1.6493990835552475e-06, "loss": 0.0103, "step": 73770 }, { "grad_norm": 0.15176980197429657, "learning_rate": 1.6441383444610547e-06, "loss": 0.013, "step": 73780 }, { "grad_norm": 0.25218865275382996, "learning_rate": 1.6388858680610908e-06, "loss": 0.0131, "step": 73790 }, { "grad_norm": 0.2328205108642578, "learning_rate": 1.6336416552528543e-06, "loss": 0.0133, "step": 73800 }, { "grad_norm": 0.12404470890760422, "learning_rate": 1.6284057069324288e-06, "loss": 0.0213, "step": 73810 }, { "grad_norm": 0.16952833533287048, "learning_rate": 1.623178023994504e-06, "loss": 0.0117, "step": 73820 }, { "grad_norm": 0.19379553198814392, "learning_rate": 1.6179586073323483e-06, "loss": 0.0134, "step": 73830 }, { "grad_norm": 0.39902380108833313, "learning_rate": 1.612747457837821e-06, "loss": 0.0152, "step": 73840 }, { "grad_norm": 0.274108350276947, "learning_rate": 1.607544576401354e-06, "loss": 0.0114, "step": 73850 }, { "grad_norm": 0.5123173594474792, "learning_rate": 1.6023499639119754e-06, "loss": 0.014, "step": 73860 }, { "grad_norm": 0.2400541752576828, "learning_rate": 1.5971636212573138e-06, "loss": 0.0106, "step": 73870 }, { "grad_norm": 0.2987861931324005, "learning_rate": 1.5919855493235714e-06, "loss": 0.0153, "step": 73880 }, { "grad_norm": 0.2567817270755768, "learning_rate": 1.586815748995535e-06, "loss": 0.011, "step": 73890 }, { "grad_norm": 0.24272844195365906, "learning_rate": 1.5816542211565866e-06, "loss": 0.0118, "step": 73900 }, { "grad_norm": 0.23108214139938354, "learning_rate": 1.5765009666886765e-06, "loss": 0.0101, "step": 73910 }, { "grad_norm": 0.2836073338985443, "learning_rate": 1.5713559864723781e-06, "loss": 0.0115, "step": 73920 }, { "grad_norm": 0.18552276492118835, "learning_rate": 1.5662192813868104e-06, "loss": 0.0101, "step": 73930 }, { "grad_norm": 0.1657407134771347, "learning_rate": 1.561090852309699e-06, "loss": 0.0192, "step": 73940 }, { "grad_norm": 0.25475427508354187, "learning_rate": 1.5559707001173651e-06, "loss": 0.0168, "step": 73950 }, { "grad_norm": 0.2664794921875, "learning_rate": 1.5508588256846757e-06, "loss": 0.0124, "step": 73960 }, { "grad_norm": 0.15773890912532806, "learning_rate": 1.5457552298851319e-06, "loss": 0.0107, "step": 73970 }, { "grad_norm": 0.25286000967025757, "learning_rate": 1.5406599135907918e-06, "loss": 0.0144, "step": 73980 }, { "grad_norm": 0.12159089744091034, "learning_rate": 1.5355728776723088e-06, "loss": 0.0104, "step": 73990 }, { "grad_norm": 0.16800770163536072, "learning_rate": 1.5304941229989155e-06, "loss": 0.0132, "step": 74000 }, { "grad_norm": 0.23822946846485138, "learning_rate": 1.5254236504384345e-06, "loss": 0.0127, "step": 74010 }, { "grad_norm": 0.1863868236541748, "learning_rate": 1.5203614608572726e-06, "loss": 0.0145, "step": 74020 }, { "grad_norm": 0.27667656540870667, "learning_rate": 1.5153075551204044e-06, "loss": 0.0112, "step": 74030 }, { "grad_norm": 0.22428609430789948, "learning_rate": 1.5102619340914225e-06, "loss": 0.021, "step": 74040 }, { "grad_norm": 0.23440368473529816, "learning_rate": 1.505224598632482e-06, "loss": 0.01, "step": 74050 }, { "grad_norm": 0.17550581693649292, "learning_rate": 1.5001955496043164e-06, "loss": 0.0171, "step": 74060 }, { "grad_norm": 0.2696712911128998, "learning_rate": 1.4951747878662604e-06, "loss": 0.01, "step": 74070 }, { "grad_norm": 0.12503454089164734, "learning_rate": 1.4901623142762221e-06, "loss": 0.0099, "step": 74080 }, { "grad_norm": 0.1947462409734726, "learning_rate": 1.4851581296907001e-06, "loss": 0.0157, "step": 74090 }, { "grad_norm": 1.4461145401000977, "learning_rate": 1.4801622349647714e-06, "loss": 0.0118, "step": 74100 }, { "grad_norm": 0.30805015563964844, "learning_rate": 1.4751746309520976e-06, "loss": 0.0129, "step": 74110 }, { "grad_norm": 0.22215214371681213, "learning_rate": 1.4701953185049132e-06, "loss": 0.0163, "step": 74120 }, { "grad_norm": 0.1658245325088501, "learning_rate": 1.4652242984740661e-06, "loss": 0.016, "step": 74130 }, { "grad_norm": 0.18835103511810303, "learning_rate": 1.4602615717089484e-06, "loss": 0.0219, "step": 74140 }, { "grad_norm": 0.25187450647354126, "learning_rate": 1.4553071390575657e-06, "loss": 0.0146, "step": 74150 }, { "grad_norm": 0.22988222539424896, "learning_rate": 1.450361001366496e-06, "loss": 0.0186, "step": 74160 }, { "grad_norm": 0.28887486457824707, "learning_rate": 1.4454231594809021e-06, "loss": 0.0131, "step": 74170 }, { "grad_norm": 0.30455559492111206, "learning_rate": 1.4404936142445036e-06, "loss": 0.0175, "step": 74180 }, { "grad_norm": 0.1958400309085846, "learning_rate": 1.4355723664996546e-06, "loss": 0.0109, "step": 74190 }, { "grad_norm": 0.2849753797054291, "learning_rate": 1.4306594170872433e-06, "loss": 0.0162, "step": 74200 }, { "grad_norm": 0.21684899926185608, "learning_rate": 1.4257547668467598e-06, "loss": 0.0163, "step": 74210 }, { "grad_norm": 0.23759448528289795, "learning_rate": 1.4208584166162886e-06, "loss": 0.0128, "step": 74220 }, { "grad_norm": 0.292586088180542, "learning_rate": 1.4159703672324553e-06, "loss": 0.0128, "step": 74230 }, { "grad_norm": 0.21289317309856415, "learning_rate": 1.4110906195305252e-06, "loss": 0.0138, "step": 74240 }, { "grad_norm": 0.15434513986110687, "learning_rate": 1.406219174344292e-06, "loss": 0.0114, "step": 74250 }, { "grad_norm": 0.1952080875635147, "learning_rate": 1.401356032506157e-06, "loss": 0.0128, "step": 74260 }, { "grad_norm": 0.23048503696918488, "learning_rate": 1.3965011948471051e-06, "loss": 0.0157, "step": 74270 }, { "grad_norm": 0.3089260458946228, "learning_rate": 1.3916546621966842e-06, "loss": 0.0118, "step": 74280 }, { "grad_norm": 0.21729497611522675, "learning_rate": 1.3868164353830482e-06, "loss": 0.0107, "step": 74290 }, { "grad_norm": 0.2830836772918701, "learning_rate": 1.3819865152329082e-06, "loss": 0.0109, "step": 74300 }, { "grad_norm": 0.20726366341114044, "learning_rate": 1.377164902571565e-06, "loss": 0.0157, "step": 74310 }, { "grad_norm": 1.3393774032592773, "learning_rate": 1.372351598222904e-06, "loss": 0.0119, "step": 74320 }, { "grad_norm": 0.21241720020771027, "learning_rate": 1.3675466030093843e-06, "loss": 0.0136, "step": 74330 }, { "grad_norm": 0.5966533422470093, "learning_rate": 1.3627499177520486e-06, "loss": 0.014, "step": 74340 }, { "grad_norm": 0.16155263781547546, "learning_rate": 1.3579615432705196e-06, "loss": 0.0162, "step": 74350 }, { "grad_norm": 1.001043438911438, "learning_rate": 1.3531814803829978e-06, "loss": 0.0146, "step": 74360 }, { "grad_norm": 0.17766687273979187, "learning_rate": 1.3484097299062803e-06, "loss": 0.0128, "step": 74370 }, { "grad_norm": 0.2973533272743225, "learning_rate": 1.343646292655698e-06, "loss": 0.0158, "step": 74380 }, { "grad_norm": 0.15681840479373932, "learning_rate": 1.3388911694452277e-06, "loss": 0.0116, "step": 74390 }, { "grad_norm": 0.3536394536495209, "learning_rate": 1.3341443610873638e-06, "loss": 0.0154, "step": 74400 }, { "grad_norm": 0.2518245577812195, "learning_rate": 1.329405868393213e-06, "loss": 0.0135, "step": 74410 }, { "grad_norm": 0.2641878128051758, "learning_rate": 1.3246756921724613e-06, "loss": 0.0156, "step": 74420 }, { "grad_norm": 0.19004862010478973, "learning_rate": 1.3199538332333506e-06, "loss": 0.0107, "step": 74430 }, { "grad_norm": 0.11651939153671265, "learning_rate": 1.3152402923827411e-06, "loss": 0.0153, "step": 74440 }, { "grad_norm": 0.19387131929397583, "learning_rate": 1.3105350704260277e-06, "loss": 0.0096, "step": 74450 }, { "grad_norm": 0.22158464789390564, "learning_rate": 1.305838168167206e-06, "loss": 0.0128, "step": 74460 }, { "grad_norm": 0.12260308861732483, "learning_rate": 1.301149586408862e-06, "loss": 0.0111, "step": 74470 }, { "grad_norm": 0.11002494394779205, "learning_rate": 1.2964693259521322e-06, "loss": 0.0154, "step": 74480 }, { "grad_norm": 0.1811838299036026, "learning_rate": 1.2917973875967548e-06, "loss": 0.0116, "step": 74490 }, { "grad_norm": 0.46287742257118225, "learning_rate": 1.2871337721410249e-06, "loss": 0.0184, "step": 74500 }, { "grad_norm": 0.2414950728416443, "learning_rate": 1.2824784803818379e-06, "loss": 0.0143, "step": 74510 }, { "grad_norm": 0.256146639585495, "learning_rate": 1.2778315131146467e-06, "loss": 0.0118, "step": 74520 }, { "grad_norm": 0.2647702693939209, "learning_rate": 1.2731928711334994e-06, "loss": 0.0108, "step": 74530 }, { "grad_norm": 0.15928734838962555, "learning_rate": 1.2685625552310122e-06, "loss": 0.0098, "step": 74540 }, { "grad_norm": 0.18510383367538452, "learning_rate": 1.263940566198374e-06, "loss": 0.015, "step": 74550 }, { "grad_norm": 0.10046885162591934, "learning_rate": 1.259326904825353e-06, "loss": 0.0135, "step": 74560 }, { "grad_norm": 0.19775913655757904, "learning_rate": 1.2547215719003137e-06, "loss": 0.0164, "step": 74570 }, { "grad_norm": 0.26944437623023987, "learning_rate": 1.2501245682101703e-06, "loss": 0.0117, "step": 74580 }, { "grad_norm": 1.077818512916565, "learning_rate": 1.2455358945404171e-06, "loss": 0.015, "step": 74590 }, { "grad_norm": 0.17353415489196777, "learning_rate": 1.2409555516751493e-06, "loss": 0.0128, "step": 74600 }, { "grad_norm": 0.2583039104938507, "learning_rate": 1.2363835403970125e-06, "loss": 0.0124, "step": 74610 }, { "grad_norm": 0.14985285699367523, "learning_rate": 1.2318198614872489e-06, "loss": 0.0125, "step": 74620 }, { "grad_norm": 0.2239268273115158, "learning_rate": 1.2272645157256457e-06, "loss": 0.0121, "step": 74630 }, { "grad_norm": 0.20610135793685913, "learning_rate": 1.222717503890608e-06, "loss": 0.0156, "step": 74640 }, { "grad_norm": 0.44563615322113037, "learning_rate": 1.218178826759081e-06, "loss": 0.0203, "step": 74650 }, { "grad_norm": 0.3727053701877594, "learning_rate": 1.213648485106611e-06, "loss": 0.0159, "step": 74660 }, { "grad_norm": 0.542785108089447, "learning_rate": 1.2091264797073066e-06, "loss": 0.0193, "step": 74670 }, { "grad_norm": 0.2853366732597351, "learning_rate": 1.2046128113338494e-06, "loss": 0.0237, "step": 74680 }, { "grad_norm": 0.2429700344800949, "learning_rate": 1.200107480757512e-06, "loss": 0.0162, "step": 74690 }, { "grad_norm": 0.21609874069690704, "learning_rate": 1.1956104887481168e-06, "loss": 0.0132, "step": 74700 }, { "grad_norm": 0.16803409159183502, "learning_rate": 1.191121836074094e-06, "loss": 0.0103, "step": 74710 }, { "grad_norm": 0.24673272669315338, "learning_rate": 1.1866415235024186e-06, "loss": 0.0095, "step": 74720 }, { "grad_norm": 0.1874484419822693, "learning_rate": 1.182169551798662e-06, "loss": 0.018, "step": 74730 }, { "grad_norm": 0.1735861748456955, "learning_rate": 1.177705921726957e-06, "loss": 0.0107, "step": 74740 }, { "grad_norm": 0.16625383496284485, "learning_rate": 1.173250634050016e-06, "loss": 0.0148, "step": 74750 }, { "grad_norm": 0.17477074265480042, "learning_rate": 1.1688036895291354e-06, "loss": 0.0114, "step": 74760 }, { "grad_norm": 0.2318808138370514, "learning_rate": 1.1643650889241574e-06, "loss": 0.0139, "step": 74770 }, { "grad_norm": 0.17357587814331055, "learning_rate": 1.1599348329935311e-06, "loss": 0.013, "step": 74780 }, { "grad_norm": 0.14331966638565063, "learning_rate": 1.1555129224942673e-06, "loss": 0.0205, "step": 74790 }, { "grad_norm": 0.15400530397891998, "learning_rate": 1.1510993581819396e-06, "loss": 0.0125, "step": 74800 }, { "grad_norm": 0.20997659862041473, "learning_rate": 1.1466941408107112e-06, "loss": 0.0133, "step": 74810 }, { "grad_norm": 0.11736132949590683, "learning_rate": 1.1422972711333247e-06, "loss": 0.0117, "step": 74820 }, { "grad_norm": 0.22592855989933014, "learning_rate": 1.1379087499010565e-06, "loss": 0.013, "step": 74830 }, { "grad_norm": 0.1996692270040512, "learning_rate": 1.1335285778638127e-06, "loss": 0.0105, "step": 74840 }, { "grad_norm": 0.42968031764030457, "learning_rate": 1.1291567557700333e-06, "loss": 0.0152, "step": 74850 }, { "grad_norm": 0.24569670855998993, "learning_rate": 1.124793284366743e-06, "loss": 0.0128, "step": 74860 }, { "grad_norm": 0.2624876797199249, "learning_rate": 1.1204381643995455e-06, "loss": 0.0113, "step": 74870 }, { "grad_norm": 0.3262380063533783, "learning_rate": 1.116091396612595e-06, "loss": 0.0145, "step": 74880 }, { "grad_norm": 0.22257691621780396, "learning_rate": 1.1117529817486538e-06, "loss": 0.0104, "step": 74890 }, { "grad_norm": 0.16106444597244263, "learning_rate": 1.107422920549034e-06, "loss": 0.0144, "step": 74900 }, { "grad_norm": 0.34660762548446655, "learning_rate": 1.1031012137536157e-06, "loss": 0.0169, "step": 74910 }, { "grad_norm": 0.21174579858779907, "learning_rate": 1.0987878621008695e-06, "loss": 0.0107, "step": 74920 }, { "grad_norm": 0.2982857823371887, "learning_rate": 1.0944828663278284e-06, "loss": 0.0143, "step": 74930 }, { "grad_norm": 0.33997026085853577, "learning_rate": 1.0901862271700925e-06, "loss": 0.0144, "step": 74940 }, { "grad_norm": 0.1729474812746048, "learning_rate": 1.0858979453618467e-06, "loss": 0.0137, "step": 74950 }, { "grad_norm": 0.2471083700656891, "learning_rate": 1.0816180216358441e-06, "loss": 0.0162, "step": 74960 }, { "grad_norm": 0.2033565491437912, "learning_rate": 1.0773464567233937e-06, "loss": 0.0118, "step": 74970 }, { "grad_norm": 0.31863918900489807, "learning_rate": 1.0730832513543953e-06, "loss": 0.012, "step": 74980 }, { "grad_norm": 0.13668209314346313, "learning_rate": 1.0688284062573218e-06, "loss": 0.0121, "step": 74990 }, { "grad_norm": 0.3616274893283844, "learning_rate": 1.0645819221591969e-06, "loss": 0.0129, "step": 75000 }, { "grad_norm": 0.23310939967632294, "learning_rate": 1.0603437997856347e-06, "loss": 0.0123, "step": 75010 }, { "grad_norm": 0.18306013941764832, "learning_rate": 1.0561140398608228e-06, "loss": 0.0122, "step": 75020 }, { "grad_norm": 0.08199108392000198, "learning_rate": 1.0518926431074937e-06, "loss": 0.0101, "step": 75030 }, { "grad_norm": 0.14879468083381653, "learning_rate": 1.0476796102469877e-06, "loss": 0.0128, "step": 75040 }, { "grad_norm": 0.17811021208763123, "learning_rate": 1.0434749419991786e-06, "loss": 0.014, "step": 75050 }, { "grad_norm": 0.15968628227710724, "learning_rate": 1.0392786390825415e-06, "loss": 0.0116, "step": 75060 }, { "grad_norm": 0.35586363077163696, "learning_rate": 1.0350907022141087e-06, "loss": 0.0146, "step": 75070 }, { "grad_norm": 0.3247632682323456, "learning_rate": 1.0309111321094744e-06, "loss": 0.0176, "step": 75080 }, { "grad_norm": 0.21508163213729858, "learning_rate": 1.026739929482834e-06, "loss": 0.01, "step": 75090 }, { "grad_norm": 0.29772403836250305, "learning_rate": 1.022577095046906e-06, "loss": 0.0169, "step": 75100 }, { "grad_norm": 0.3244505822658539, "learning_rate": 1.0184226295130217e-06, "loss": 0.0114, "step": 75110 }, { "grad_norm": 0.27581605315208435, "learning_rate": 1.0142765335910575e-06, "loss": 0.0143, "step": 75120 }, { "grad_norm": 0.31488704681396484, "learning_rate": 1.0101388079894746e-06, "loss": 0.0132, "step": 75130 }, { "grad_norm": 0.15265728533267975, "learning_rate": 1.0060094534152908e-06, "loss": 0.012, "step": 75140 }, { "grad_norm": 0.2771815061569214, "learning_rate": 1.0018884705741028e-06, "loss": 0.0111, "step": 75150 }, { "grad_norm": 0.17980828881263733, "learning_rate": 9.977758601700805e-07, "loss": 0.018, "step": 75160 }, { "grad_norm": 0.2357635349035263, "learning_rate": 9.93671622905945e-07, "loss": 0.019, "step": 75170 }, { "grad_norm": 0.17200523614883423, "learning_rate": 9.895757594830024e-07, "loss": 0.0111, "step": 75180 }, { "grad_norm": 0.11556033790111542, "learning_rate": 9.854882706011204e-07, "loss": 0.0098, "step": 75190 }, { "grad_norm": 0.2089778184890747, "learning_rate": 9.814091569587513e-07, "loss": 0.0116, "step": 75200 }, { "grad_norm": 0.40465885400772095, "learning_rate": 9.77338419252888e-07, "loss": 0.0148, "step": 75210 }, { "grad_norm": 0.30112189054489136, "learning_rate": 9.732760581791289e-07, "loss": 0.0117, "step": 75220 }, { "grad_norm": 0.2100004404783249, "learning_rate": 9.692220744315972e-07, "loss": 0.0169, "step": 75230 }, { "grad_norm": 0.16500039398670197, "learning_rate": 9.651764687030162e-07, "loss": 0.0106, "step": 75240 }, { "grad_norm": 0.24947863817214966, "learning_rate": 9.611392416846776e-07, "loss": 0.0164, "step": 75250 }, { "grad_norm": 0.19773022830486298, "learning_rate": 9.571103940664238e-07, "loss": 0.0162, "step": 75260 }, { "grad_norm": 0.11230410635471344, "learning_rate": 9.530899265366822e-07, "loss": 0.0143, "step": 75270 }, { "grad_norm": 0.18415363132953644, "learning_rate": 9.490778397824307e-07, "loss": 0.0091, "step": 75280 }, { "grad_norm": 0.35895463824272156, "learning_rate": 9.450741344892322e-07, "loss": 0.0141, "step": 75290 }, { "grad_norm": 0.3312452435493469, "learning_rate": 9.410788113412116e-07, "loss": 0.0135, "step": 75300 }, { "grad_norm": 0.2973821759223938, "learning_rate": 9.370918710210563e-07, "loss": 0.0131, "step": 75310 }, { "grad_norm": 0.3204384744167328, "learning_rate": 9.331133142100268e-07, "loss": 0.0132, "step": 75320 }, { "grad_norm": 0.2076157182455063, "learning_rate": 9.291431415879459e-07, "loss": 0.0111, "step": 75330 }, { "grad_norm": 0.1795833855867386, "learning_rate": 9.251813538332155e-07, "loss": 0.0158, "step": 75340 }, { "grad_norm": 0.2437044084072113, "learning_rate": 9.212279516227884e-07, "loss": 0.0151, "step": 75350 }, { "grad_norm": 0.15037992596626282, "learning_rate": 9.172829356322021e-07, "loss": 0.0109, "step": 75360 }, { "grad_norm": 0.11067163199186325, "learning_rate": 9.133463065355452e-07, "loss": 0.0087, "step": 75370 }, { "grad_norm": 0.3325028419494629, "learning_rate": 9.094180650054796e-07, "loss": 0.0121, "step": 75380 }, { "grad_norm": 0.8675794005393982, "learning_rate": 9.054982117132404e-07, "loss": 0.0149, "step": 75390 }, { "grad_norm": 0.27765730023384094, "learning_rate": 9.015867473286143e-07, "loss": 0.0135, "step": 75400 }, { "grad_norm": 0.21231532096862793, "learning_rate": 8.976836725199778e-07, "loss": 0.0097, "step": 75410 }, { "grad_norm": 0.3012353479862213, "learning_rate": 8.937889879542416e-07, "loss": 0.0117, "step": 75420 }, { "grad_norm": 0.13571873307228088, "learning_rate": 8.899026942969068e-07, "loss": 0.0103, "step": 75430 }, { "grad_norm": 0.15390653908252716, "learning_rate": 8.860247922120424e-07, "loss": 0.0162, "step": 75440 }, { "grad_norm": 0.3077457547187805, "learning_rate": 8.821552823622737e-07, "loss": 0.0124, "step": 75450 }, { "grad_norm": 0.1219395250082016, "learning_rate": 8.782941654087884e-07, "loss": 0.0148, "step": 75460 }, { "grad_norm": 0.2820930480957031, "learning_rate": 8.744414420113478e-07, "loss": 0.0131, "step": 75470 }, { "grad_norm": 0.17640328407287598, "learning_rate": 8.705971128282753e-07, "loss": 0.0108, "step": 75480 }, { "grad_norm": 0.21756206452846527, "learning_rate": 8.66761178516473e-07, "loss": 0.0103, "step": 75490 }, { "grad_norm": 0.21461325883865356, "learning_rate": 8.629336397313781e-07, "loss": 0.0106, "step": 75500 }, { "grad_norm": 0.16207855939865112, "learning_rate": 8.591144971270282e-07, "loss": 0.0138, "step": 75510 }, { "grad_norm": 0.30791959166526794, "learning_rate": 8.553037513560069e-07, "loss": 0.0165, "step": 75520 }, { "grad_norm": 0.16840103268623352, "learning_rate": 8.515014030694546e-07, "loss": 0.0108, "step": 75530 }, { "grad_norm": 0.1649792343378067, "learning_rate": 8.47707452917107e-07, "loss": 0.0122, "step": 75540 }, { "grad_norm": 0.1517370492219925, "learning_rate": 8.439219015472343e-07, "loss": 0.0164, "step": 75550 }, { "grad_norm": 0.15621444582939148, "learning_rate": 8.401447496066861e-07, "loss": 0.009, "step": 75560 }, { "grad_norm": 0.25421229004859924, "learning_rate": 8.363759977408792e-07, "loss": 0.0106, "step": 75570 }, { "grad_norm": 0.2204422950744629, "learning_rate": 8.326156465937817e-07, "loss": 0.015, "step": 75580 }, { "grad_norm": 0.6072835922241211, "learning_rate": 8.28863696807941e-07, "loss": 0.02, "step": 75590 }, { "grad_norm": 0.18813452124595642, "learning_rate": 8.251201490244609e-07, "loss": 0.0124, "step": 75600 }, { "grad_norm": 0.22240397334098816, "learning_rate": 8.213850038830129e-07, "loss": 0.0179, "step": 75610 }, { "grad_norm": 0.22198714315891266, "learning_rate": 8.176582620218309e-07, "loss": 0.014, "step": 75620 }, { "grad_norm": 0.20022179186344147, "learning_rate": 8.139399240777057e-07, "loss": 0.0107, "step": 75630 }, { "grad_norm": 0.25212326645851135, "learning_rate": 8.102299906860122e-07, "loss": 0.0121, "step": 75640 }, { "grad_norm": 0.3592337965965271, "learning_rate": 8.065284624806657e-07, "loss": 0.0167, "step": 75650 }, { "grad_norm": 0.18425245583057404, "learning_rate": 8.028353400941601e-07, "loss": 0.0146, "step": 75660 }, { "grad_norm": 0.11479339003562927, "learning_rate": 7.991506241575575e-07, "loss": 0.013, "step": 75670 }, { "grad_norm": 0.3550790250301361, "learning_rate": 7.954743153004541e-07, "loss": 0.0173, "step": 75680 }, { "grad_norm": 0.2495654672384262, "learning_rate": 7.918064141510528e-07, "loss": 0.018, "step": 75690 }, { "grad_norm": 0.14948704838752747, "learning_rate": 7.881469213360859e-07, "loss": 0.017, "step": 75700 }, { "grad_norm": 0.1658470332622528, "learning_rate": 7.844958374808642e-07, "loss": 0.0104, "step": 75710 }, { "grad_norm": 0.2578012943267822, "learning_rate": 7.808531632092608e-07, "loss": 0.012, "step": 75720 }, { "grad_norm": 0.4920015037059784, "learning_rate": 7.772188991436946e-07, "loss": 0.0119, "step": 75730 }, { "grad_norm": 0.157416433095932, "learning_rate": 7.735930459051799e-07, "loss": 0.0108, "step": 75740 }, { "grad_norm": 0.274106502532959, "learning_rate": 7.699756041132655e-07, "loss": 0.0121, "step": 75750 }, { "grad_norm": 0.25084027647972107, "learning_rate": 7.663665743860793e-07, "loss": 0.0111, "step": 75760 }, { "grad_norm": 0.3076801002025604, "learning_rate": 7.627659573403001e-07, "loss": 0.0136, "step": 75770 }, { "grad_norm": 0.3099020719528198, "learning_rate": 7.591737535911802e-07, "loss": 0.0119, "step": 75780 }, { "grad_norm": 0.1478620320558548, "learning_rate": 7.555899637525288e-07, "loss": 0.012, "step": 75790 }, { "grad_norm": 0.2886447608470917, "learning_rate": 7.520145884367058e-07, "loss": 0.0101, "step": 75800 }, { "grad_norm": 0.30662089586257935, "learning_rate": 7.484476282546615e-07, "loss": 0.0136, "step": 75810 }, { "grad_norm": 0.3441165089607239, "learning_rate": 7.448890838158806e-07, "loss": 0.0174, "step": 75820 }, { "grad_norm": 0.30560797452926636, "learning_rate": 7.413389557284267e-07, "loss": 0.0143, "step": 75830 }, { "grad_norm": 0.09370995312929153, "learning_rate": 7.377972445989201e-07, "loss": 0.0134, "step": 75840 }, { "grad_norm": 0.37065109610557556, "learning_rate": 7.342639510325377e-07, "loss": 0.0172, "step": 75850 }, { "grad_norm": 0.2773585021495819, "learning_rate": 7.307390756330246e-07, "loss": 0.0104, "step": 75860 }, { "grad_norm": 0.28908470273017883, "learning_rate": 7.272226190026876e-07, "loss": 0.009, "step": 75870 }, { "grad_norm": 0.14691373705863953, "learning_rate": 7.237145817423907e-07, "loss": 0.0116, "step": 75880 }, { "grad_norm": 0.16548889875411987, "learning_rate": 7.202149644515654e-07, "loss": 0.0122, "step": 75890 }, { "grad_norm": 0.2201811820268631, "learning_rate": 7.167237677281946e-07, "loss": 0.0136, "step": 75900 }, { "grad_norm": 0.2969520390033722, "learning_rate": 7.132409921688288e-07, "loss": 0.0141, "step": 75910 }, { "grad_norm": 0.11639277637004852, "learning_rate": 7.09766638368592e-07, "loss": 0.0098, "step": 75920 }, { "grad_norm": 0.23841238021850586, "learning_rate": 7.063007069211313e-07, "loss": 0.0127, "step": 75930 }, { "grad_norm": 0.3360111713409424, "learning_rate": 7.028431984187067e-07, "loss": 0.0113, "step": 75940 }, { "grad_norm": 0.16104871034622192, "learning_rate": 6.993941134520898e-07, "loss": 0.012, "step": 75950 }, { "grad_norm": 0.25735098123550415, "learning_rate": 6.959534526106537e-07, "loss": 0.0099, "step": 75960 }, { "grad_norm": 0.23488306999206543, "learning_rate": 6.925212164822948e-07, "loss": 0.0093, "step": 75970 }, { "grad_norm": 0.5074533224105835, "learning_rate": 6.890974056535049e-07, "loss": 0.0168, "step": 75980 }, { "grad_norm": 0.2213398516178131, "learning_rate": 6.856820207093106e-07, "loss": 0.0092, "step": 75990 }, { "grad_norm": 0.33392930030822754, "learning_rate": 6.822750622333063e-07, "loss": 0.0113, "step": 76000 }, { "grad_norm": 0.32731378078460693, "learning_rate": 6.788765308076539e-07, "loss": 0.0146, "step": 76010 }, { "grad_norm": 0.22040575742721558, "learning_rate": 6.754864270130668e-07, "loss": 0.011, "step": 76020 }, { "grad_norm": 0.21494989097118378, "learning_rate": 6.721047514288203e-07, "loss": 0.0124, "step": 76030 }, { "grad_norm": 0.4089977443218231, "learning_rate": 6.687315046327469e-07, "loss": 0.014, "step": 76040 }, { "grad_norm": 0.4023652672767639, "learning_rate": 6.65366687201252e-07, "loss": 0.0164, "step": 76050 }, { "grad_norm": 0.33602675795555115, "learning_rate": 6.620102997092814e-07, "loss": 0.0169, "step": 76060 }, { "grad_norm": 0.21175022423267365, "learning_rate": 6.586623427303596e-07, "loss": 0.0114, "step": 76070 }, { "grad_norm": 0.22138801217079163, "learning_rate": 6.553228168365455e-07, "loss": 0.018, "step": 76080 }, { "grad_norm": 1.6574175357818604, "learning_rate": 6.519917225984884e-07, "loss": 0.0151, "step": 76090 }, { "grad_norm": 0.22805540263652802, "learning_rate": 6.486690605853718e-07, "loss": 0.0133, "step": 76100 }, { "grad_norm": 0.3535352647304535, "learning_rate": 6.453548313649527e-07, "loss": 0.0132, "step": 76110 }, { "grad_norm": 0.34211623668670654, "learning_rate": 6.420490355035446e-07, "loss": 0.0145, "step": 76120 }, { "grad_norm": 0.2473304718732834, "learning_rate": 6.387516735660071e-07, "loss": 0.0152, "step": 76130 }, { "grad_norm": 0.22307591140270233, "learning_rate": 6.35462746115778e-07, "loss": 0.0116, "step": 76140 }, { "grad_norm": 0.25375714898109436, "learning_rate": 6.321822537148358e-07, "loss": 0.0182, "step": 76150 }, { "grad_norm": 0.13778769969940186, "learning_rate": 6.289101969237432e-07, "loss": 0.0125, "step": 76160 }, { "grad_norm": 0.23760788142681122, "learning_rate": 6.256465763015918e-07, "loss": 0.0105, "step": 76170 }, { "grad_norm": 0.1366509199142456, "learning_rate": 6.22391392406052e-07, "loss": 0.0123, "step": 76180 }, { "grad_norm": 0.16472665965557098, "learning_rate": 6.191446457933403e-07, "loss": 0.009, "step": 76190 }, { "grad_norm": 0.0987003892660141, "learning_rate": 6.159063370182406e-07, "loss": 0.0097, "step": 76200 }, { "grad_norm": 0.2525475025177002, "learning_rate": 6.126764666340879e-07, "loss": 0.0091, "step": 76210 }, { "grad_norm": 0.15532630681991577, "learning_rate": 6.094550351927852e-07, "loss": 0.0117, "step": 76220 }, { "grad_norm": 0.2964027523994446, "learning_rate": 6.062420432447757e-07, "loss": 0.015, "step": 76230 }, { "grad_norm": 0.1829824149608612, "learning_rate": 6.030374913390813e-07, "loss": 0.0104, "step": 76240 }, { "grad_norm": 0.37022900581359863, "learning_rate": 5.998413800232694e-07, "loss": 0.0123, "step": 76250 }, { "grad_norm": 0.18458908796310425, "learning_rate": 5.966537098434755e-07, "loss": 0.0114, "step": 76260 }, { "grad_norm": 2.7766826152801514, "learning_rate": 5.934744813443694e-07, "loss": 0.0109, "step": 76270 }, { "grad_norm": 0.2645915746688843, "learning_rate": 5.903036950692054e-07, "loss": 0.0121, "step": 76280 }, { "grad_norm": 0.1498536467552185, "learning_rate": 5.871413515597835e-07, "loss": 0.0125, "step": 76290 }, { "grad_norm": 0.28452542424201965, "learning_rate": 5.839874513564547e-07, "loss": 0.0107, "step": 76300 }, { "grad_norm": 0.3213084638118744, "learning_rate": 5.808419949981436e-07, "loss": 0.0179, "step": 76310 }, { "grad_norm": 0.2037166953086853, "learning_rate": 5.777049830223257e-07, "loss": 0.0153, "step": 76320 }, { "grad_norm": 0.15611110627651215, "learning_rate": 5.745764159650114e-07, "loss": 0.0113, "step": 76330 }, { "grad_norm": 0.16451877355575562, "learning_rate": 5.71456294360806e-07, "loss": 0.0103, "step": 76340 }, { "grad_norm": 0.27612540125846863, "learning_rate": 5.683446187428443e-07, "loss": 0.01, "step": 76350 }, { "grad_norm": 0.2916302978992462, "learning_rate": 5.652413896428288e-07, "loss": 0.0121, "step": 76360 }, { "grad_norm": 0.30078375339508057, "learning_rate": 5.621466075910131e-07, "loss": 0.0138, "step": 76370 }, { "grad_norm": 0.08333563059568405, "learning_rate": 5.590602731162187e-07, "loss": 0.0107, "step": 76380 }, { "grad_norm": 0.2485390305519104, "learning_rate": 5.559823867458125e-07, "loss": 0.0129, "step": 76390 }, { "grad_norm": 0.1553695946931839, "learning_rate": 5.529129490057128e-07, "loss": 0.0166, "step": 76400 }, { "grad_norm": 0.22265343368053436, "learning_rate": 5.498519604204167e-07, "loss": 0.0145, "step": 76410 }, { "grad_norm": 0.20292551815509796, "learning_rate": 5.467994215129501e-07, "loss": 0.0125, "step": 76420 }, { "grad_norm": 0.1614811271429062, "learning_rate": 5.437553328049183e-07, "loss": 0.0124, "step": 76430 }, { "grad_norm": 0.2636985778808594, "learning_rate": 5.407196948164661e-07, "loss": 0.0137, "step": 76440 }, { "grad_norm": 0.15109196305274963, "learning_rate": 5.376925080663009e-07, "loss": 0.0114, "step": 76450 }, { "grad_norm": 0.29943472146987915, "learning_rate": 5.346737730716977e-07, "loss": 0.0118, "step": 76460 }, { "grad_norm": 0.1634739637374878, "learning_rate": 5.316634903484607e-07, "loss": 0.0122, "step": 76470 }, { "grad_norm": 0.23378948867321014, "learning_rate": 5.286616604109728e-07, "loss": 0.012, "step": 76480 }, { "grad_norm": 0.18774576485157013, "learning_rate": 5.256682837721627e-07, "loss": 0.0181, "step": 76490 }, { "grad_norm": 0.3629697561264038, "learning_rate": 5.226833609435156e-07, "loss": 0.0123, "step": 76500 }, { "grad_norm": 0.15584447979927063, "learning_rate": 5.19706892435079e-07, "loss": 0.0098, "step": 76510 }, { "grad_norm": 0.29433581233024597, "learning_rate": 5.167388787554406e-07, "loss": 0.0128, "step": 76520 }, { "grad_norm": 0.2708092927932739, "learning_rate": 5.137793204117614e-07, "loss": 0.0132, "step": 76530 }, { "grad_norm": 0.348320871591568, "learning_rate": 5.108282179097479e-07, "loss": 0.0131, "step": 76540 }, { "grad_norm": 0.2918691337108612, "learning_rate": 5.078855717536523e-07, "loss": 0.0149, "step": 76550 }, { "grad_norm": 0.31385788321495056, "learning_rate": 5.049513824463059e-07, "loss": 0.0104, "step": 76560 }, { "grad_norm": 0.16537463665008545, "learning_rate": 5.020256504890742e-07, "loss": 0.0115, "step": 76570 }, { "grad_norm": 0.17323486506938934, "learning_rate": 4.991083763818849e-07, "loss": 0.0132, "step": 76580 }, { "grad_norm": 0.29733267426490784, "learning_rate": 4.961995606232228e-07, "loss": 0.0176, "step": 76590 }, { "grad_norm": 0.19777150452136993, "learning_rate": 4.932992037101236e-07, "loss": 0.0115, "step": 76600 }, { "grad_norm": 0.2735408842563629, "learning_rate": 4.904073061381798e-07, "loss": 0.0171, "step": 76610 }, { "grad_norm": 0.32912471890449524, "learning_rate": 4.875238684015349e-07, "loss": 0.0166, "step": 76620 }, { "grad_norm": 0.23259936273097992, "learning_rate": 4.846488909928948e-07, "loss": 0.0094, "step": 76630 }, { "grad_norm": 0.10927346348762512, "learning_rate": 4.81782374403511e-07, "loss": 0.017, "step": 76640 }, { "grad_norm": 0.15137112140655518, "learning_rate": 4.789243191231918e-07, "loss": 0.0092, "step": 76650 }, { "grad_norm": 0.2133006453514099, "learning_rate": 4.76074725640302e-07, "loss": 0.0092, "step": 76660 }, { "grad_norm": 0.21486328542232513, "learning_rate": 4.732335944417632e-07, "loss": 0.0204, "step": 76670 }, { "grad_norm": 0.2380450814962387, "learning_rate": 4.704009260130371e-07, "loss": 0.016, "step": 76680 }, { "grad_norm": 0.10261975973844528, "learning_rate": 4.675767208381587e-07, "loss": 0.0112, "step": 76690 }, { "grad_norm": 0.35568347573280334, "learning_rate": 4.647609793997032e-07, "loss": 0.0142, "step": 76700 }, { "grad_norm": 0.08092191070318222, "learning_rate": 4.619537021788023e-07, "loss": 0.0116, "step": 76710 }, { "grad_norm": 0.20677213370800018, "learning_rate": 4.5915488965515005e-07, "loss": 0.0151, "step": 76720 }, { "grad_norm": 0.2277132272720337, "learning_rate": 4.563645423069807e-07, "loss": 0.0146, "step": 76730 }, { "grad_norm": 0.3075593411922455, "learning_rate": 4.535826606110849e-07, "loss": 0.0124, "step": 76740 }, { "grad_norm": 0.24492356181144714, "learning_rate": 4.508092450428214e-07, "loss": 0.0144, "step": 76750 }, { "grad_norm": 0.1494653820991516, "learning_rate": 4.480442960760778e-07, "loss": 0.0122, "step": 76760 }, { "grad_norm": 0.18201862275600433, "learning_rate": 4.4528781418332053e-07, "loss": 0.0117, "step": 76770 }, { "grad_norm": 0.36063241958618164, "learning_rate": 4.4253979983555073e-07, "loss": 0.0189, "step": 76780 }, { "grad_norm": 0.12044063955545425, "learning_rate": 4.3980025350233154e-07, "loss": 0.0097, "step": 76790 }, { "grad_norm": 0.1808733493089676, "learning_rate": 4.370691756517664e-07, "loss": 0.0115, "step": 76800 }, { "grad_norm": 0.2581973075866699, "learning_rate": 4.3434656675053753e-07, "loss": 0.0094, "step": 76810 }, { "grad_norm": 0.22426174581050873, "learning_rate": 4.3163242726385613e-07, "loss": 0.0086, "step": 76820 }, { "grad_norm": 0.1040792316198349, "learning_rate": 4.289267576554956e-07, "loss": 0.0142, "step": 76830 }, { "grad_norm": 0.23722374439239502, "learning_rate": 4.262295583877807e-07, "loss": 0.0119, "step": 76840 }, { "grad_norm": 0.25085970759391785, "learning_rate": 4.235408299215815e-07, "loss": 0.0169, "step": 76850 }, { "grad_norm": 0.26762816309928894, "learning_rate": 4.2086057271634173e-07, "loss": 0.011, "step": 76860 }, { "grad_norm": 0.1961798220872879, "learning_rate": 4.181887872300394e-07, "loss": 0.0143, "step": 76870 }, { "grad_norm": 0.24766331911087036, "learning_rate": 4.155254739192038e-07, "loss": 0.0107, "step": 76880 }, { "grad_norm": 0.16920679807662964, "learning_rate": 4.128706332389265e-07, "loss": 0.0168, "step": 76890 }, { "grad_norm": 0.30464306473731995, "learning_rate": 4.1022426564284453e-07, "loss": 0.0114, "step": 76900 }, { "grad_norm": 0.14479194581508636, "learning_rate": 4.075863715831574e-07, "loss": 0.0142, "step": 76910 }, { "grad_norm": 0.28739964962005615, "learning_rate": 4.0495695151059887e-07, "loss": 0.014, "step": 76920 }, { "grad_norm": 0.2986762821674347, "learning_rate": 4.023360058744763e-07, "loss": 0.0102, "step": 76930 }, { "grad_norm": 0.11888590455055237, "learning_rate": 3.997235351226314e-07, "loss": 0.0087, "step": 76940 }, { "grad_norm": 0.348762184381485, "learning_rate": 3.971195397014571e-07, "loss": 0.0144, "step": 76950 }, { "grad_norm": 0.24566857516765594, "learning_rate": 3.9452402005591417e-07, "loss": 0.0123, "step": 76960 }, { "grad_norm": 0.15492820739746094, "learning_rate": 3.9193697662950334e-07, "loss": 0.0119, "step": 76970 }, { "grad_norm": 0.3754158020019531, "learning_rate": 3.893584098642822e-07, "loss": 0.0144, "step": 76980 }, { "grad_norm": 0.14636918902397156, "learning_rate": 3.867883202008538e-07, "loss": 0.0145, "step": 76990 }, { "grad_norm": 0.2252439260482788, "learning_rate": 3.842267080783779e-07, "loss": 0.0136, "step": 77000 }, { "grad_norm": 0.11315879225730896, "learning_rate": 3.816735739345656e-07, "loss": 0.013, "step": 77010 }, { "grad_norm": 0.26453134417533875, "learning_rate": 3.791289182056679e-07, "loss": 0.0128, "step": 77020 }, { "grad_norm": 0.20545147359371185, "learning_rate": 3.7659274132650913e-07, "loss": 0.015, "step": 77030 }, { "grad_norm": 0.24574878811836243, "learning_rate": 3.7406504373044826e-07, "loss": 0.0127, "step": 77040 }, { "grad_norm": 0.29188260436058044, "learning_rate": 3.7154582584939533e-07, "loss": 0.0143, "step": 77050 }, { "grad_norm": 0.2686549723148346, "learning_rate": 3.690350881138227e-07, "loss": 0.0133, "step": 77060 }, { "grad_norm": 0.2970260679721832, "learning_rate": 3.665328309527427e-07, "loss": 0.0116, "step": 77070 }, { "grad_norm": 0.174171581864357, "learning_rate": 3.640390547937245e-07, "loss": 0.0099, "step": 77080 }, { "grad_norm": 0.15322940051555634, "learning_rate": 3.615537600628882e-07, "loss": 0.0137, "step": 77090 }, { "grad_norm": 0.17522792518138885, "learning_rate": 3.5907694718489427e-07, "loss": 0.0146, "step": 77100 }, { "grad_norm": 0.25167667865753174, "learning_rate": 3.566086165829707e-07, "loss": 0.0197, "step": 77110 }, { "grad_norm": 0.38790684938430786, "learning_rate": 3.5414876867888024e-07, "loss": 0.0171, "step": 77120 }, { "grad_norm": 0.2514209747314453, "learning_rate": 3.5169740389295326e-07, "loss": 0.0153, "step": 77130 }, { "grad_norm": 0.3662934899330139, "learning_rate": 3.4925452264405466e-07, "loss": 0.0141, "step": 77140 }, { "grad_norm": 0.2884652018547058, "learning_rate": 3.468201253496062e-07, "loss": 0.0101, "step": 77150 }, { "grad_norm": 0.2362234741449356, "learning_rate": 3.443942124255861e-07, "loss": 0.0116, "step": 77160 }, { "grad_norm": 0.20575760304927826, "learning_rate": 3.4197678428650183e-07, "loss": 0.0143, "step": 77170 }, { "grad_norm": 0.1147342249751091, "learning_rate": 3.3956784134544504e-07, "loss": 0.009, "step": 77180 }, { "grad_norm": 0.26880496740341187, "learning_rate": 3.3716738401402547e-07, "loss": 0.0141, "step": 77190 }, { "grad_norm": 0.21163628995418549, "learning_rate": 3.3477541270241495e-07, "loss": 0.0097, "step": 77200 }, { "grad_norm": 0.2096630483865738, "learning_rate": 3.3239192781934215e-07, "loss": 0.0107, "step": 77210 }, { "grad_norm": 0.5275967121124268, "learning_rate": 3.3001692977207563e-07, "loss": 0.0119, "step": 77220 }, { "grad_norm": 0.21997781097888947, "learning_rate": 3.276504189664409e-07, "loss": 0.0146, "step": 77230 }, { "grad_norm": 0.20514334738254547, "learning_rate": 3.252923958068088e-07, "loss": 0.0125, "step": 77240 }, { "grad_norm": 0.09846647828817368, "learning_rate": 3.2294286069609046e-07, "loss": 0.0118, "step": 77250 }, { "grad_norm": 0.1701601892709732, "learning_rate": 3.2060181403577583e-07, "loss": 0.0131, "step": 77260 }, { "grad_norm": 0.2410535365343094, "learning_rate": 3.1826925622587823e-07, "loss": 0.0102, "step": 77270 }, { "grad_norm": 0.14330525696277618, "learning_rate": 3.159451876649622e-07, "loss": 0.0139, "step": 77280 }, { "grad_norm": 0.25896137952804565, "learning_rate": 3.1362960875015444e-07, "loss": 0.0121, "step": 77290 }, { "grad_norm": 0.3269180953502655, "learning_rate": 3.1132251987711637e-07, "loss": 0.0163, "step": 77300 }, { "grad_norm": 0.19062356650829315, "learning_rate": 3.0902392144007695e-07, "loss": 0.0114, "step": 77310 }, { "grad_norm": 0.18568599224090576, "learning_rate": 3.0673381383179436e-07, "loss": 0.0131, "step": 77320 }, { "grad_norm": 0.1855914443731308, "learning_rate": 3.0445219744358875e-07, "loss": 0.0089, "step": 77330 }, { "grad_norm": 0.2379802018404007, "learning_rate": 3.021790726653262e-07, "loss": 0.0124, "step": 77340 }, { "grad_norm": 0.3429736793041229, "learning_rate": 2.9991443988542366e-07, "loss": 0.0111, "step": 77350 }, { "grad_norm": 0.12880364060401917, "learning_rate": 2.976582994908439e-07, "loss": 0.0135, "step": 77360 }, { "grad_norm": 0.20245066285133362, "learning_rate": 2.954106518671007e-07, "loss": 0.0138, "step": 77370 }, { "grad_norm": 0.3410305082798004, "learning_rate": 2.9317149739825356e-07, "loss": 0.0102, "step": 77380 }, { "grad_norm": 0.4316920340061188, "learning_rate": 2.909408364669075e-07, "loss": 0.0123, "step": 77390 }, { "grad_norm": 0.2992596924304962, "learning_rate": 2.887186694542299e-07, "loss": 0.0179, "step": 77400 }, { "grad_norm": 0.30639365315437317, "learning_rate": 2.865049967399225e-07, "loss": 0.0149, "step": 77410 }, { "grad_norm": 0.19725576043128967, "learning_rate": 2.842998187022439e-07, "loss": 0.0142, "step": 77420 }, { "grad_norm": 0.2492510974407196, "learning_rate": 2.8210313571800374e-07, "loss": 0.0134, "step": 77430 }, { "grad_norm": 0.3552534282207489, "learning_rate": 2.7991494816255184e-07, "loss": 0.0104, "step": 77440 }, { "grad_norm": 0.20282326638698578, "learning_rate": 2.777352564097779e-07, "loss": 0.0124, "step": 77450 }, { "grad_norm": 0.1611558347940445, "learning_rate": 2.755640608321508e-07, "loss": 0.0148, "step": 77460 }, { "grad_norm": 0.3222716450691223, "learning_rate": 2.73401361800657e-07, "loss": 0.0121, "step": 77470 }, { "grad_norm": 0.1813291609287262, "learning_rate": 2.7124715968484537e-07, "loss": 0.0122, "step": 77480 }, { "grad_norm": 0.21775297820568085, "learning_rate": 2.691014548528104e-07, "loss": 0.0145, "step": 77490 }, { "grad_norm": 0.18177518248558044, "learning_rate": 2.669642476711864e-07, "loss": 0.0098, "step": 77500 }, { "grad_norm": 0.16409006714820862, "learning_rate": 2.648355385051815e-07, "loss": 0.0125, "step": 77510 }, { "grad_norm": 0.2282782942056656, "learning_rate": 2.627153277185157e-07, "loss": 0.0108, "step": 77520 }, { "grad_norm": 0.1809895783662796, "learning_rate": 2.606036156734881e-07, "loss": 0.0088, "step": 77530 }, { "grad_norm": 0.18699775636196136, "learning_rate": 2.5850040273092127e-07, "loss": 0.0103, "step": 77540 }, { "grad_norm": 0.25759562849998474, "learning_rate": 2.5640568925020536e-07, "loss": 0.0201, "step": 77550 }, { "grad_norm": 0.23045608401298523, "learning_rate": 2.5431947558927083e-07, "loss": 0.0159, "step": 77560 }, { "grad_norm": 0.9326906800270081, "learning_rate": 2.522417621045825e-07, "loss": 0.0139, "step": 77570 }, { "grad_norm": 0.1436348408460617, "learning_rate": 2.501725491511786e-07, "loss": 0.0153, "step": 77580 }, { "grad_norm": 0.12594515085220337, "learning_rate": 2.481118370826263e-07, "loss": 0.0117, "step": 77590 }, { "grad_norm": 0.2589523494243622, "learning_rate": 2.4605962625104393e-07, "loss": 0.0115, "step": 77600 }, { "grad_norm": 0.27683335542678833, "learning_rate": 2.440159170070955e-07, "loss": 0.0168, "step": 77610 }, { "grad_norm": 0.3096146881580353, "learning_rate": 2.41980709699996e-07, "loss": 0.0108, "step": 77620 }, { "grad_norm": 0.3150242567062378, "learning_rate": 2.3995400467751174e-07, "loss": 0.0123, "step": 77630 }, { "grad_norm": 0.2085603028535843, "learning_rate": 2.3793580228594902e-07, "loss": 0.0209, "step": 77640 }, { "grad_norm": 0.23216435313224792, "learning_rate": 2.3592610287015982e-07, "loss": 0.0106, "step": 77650 }, { "grad_norm": 0.17974425852298737, "learning_rate": 2.3392490677354718e-07, "loss": 0.0132, "step": 77660 }, { "grad_norm": 0.38054215908050537, "learning_rate": 2.3193221433806533e-07, "loss": 0.009, "step": 77670 }, { "grad_norm": 0.6714736223220825, "learning_rate": 2.299480259042086e-07, "loss": 0.0176, "step": 77680 }, { "grad_norm": 0.10126959532499313, "learning_rate": 2.2797234181102244e-07, "loss": 0.0109, "step": 77690 }, { "grad_norm": 0.2351425141096115, "learning_rate": 2.2600516239609238e-07, "loss": 0.0133, "step": 77700 }, { "grad_norm": 0.1641070544719696, "learning_rate": 2.240464879955606e-07, "loss": 0.0162, "step": 77710 }, { "grad_norm": 0.279431015253067, "learning_rate": 2.2209631894410387e-07, "loss": 0.0142, "step": 77720 }, { "grad_norm": 0.23799487948417664, "learning_rate": 2.2015465557496117e-07, "loss": 0.0118, "step": 77730 }, { "grad_norm": 0.20927923917770386, "learning_rate": 2.1822149821990602e-07, "loss": 0.0118, "step": 77740 }, { "grad_norm": 0.6667954325675964, "learning_rate": 2.1629684720926303e-07, "loss": 0.0136, "step": 77750 }, { "grad_norm": 0.18618570268154144, "learning_rate": 2.1438070287189693e-07, "loss": 0.0129, "step": 77760 }, { "grad_norm": 0.19354411959648132, "learning_rate": 2.1247306553523472e-07, "loss": 0.0151, "step": 77770 }, { "grad_norm": 0.2732159495353699, "learning_rate": 2.105739355252323e-07, "loss": 0.0154, "step": 77780 }, { "grad_norm": 0.15411578118801117, "learning_rate": 2.0868331316639678e-07, "loss": 0.013, "step": 77790 }, { "grad_norm": 0.12568293511867523, "learning_rate": 2.0680119878179193e-07, "loss": 0.0129, "step": 77800 }, { "grad_norm": 0.19101904332637787, "learning_rate": 2.0492759269301054e-07, "loss": 0.0142, "step": 77810 }, { "grad_norm": 0.48058730363845825, "learning_rate": 2.0306249522021315e-07, "loss": 0.0149, "step": 77820 }, { "grad_norm": 0.15652821958065033, "learning_rate": 2.0120590668207816e-07, "loss": 0.0093, "step": 77830 }, { "grad_norm": 0.5970901846885681, "learning_rate": 1.993578273958574e-07, "loss": 0.0107, "step": 77840 }, { "grad_norm": 0.13052956759929657, "learning_rate": 1.975182576773371e-07, "loss": 0.0106, "step": 77850 }, { "grad_norm": 0.393558531999588, "learning_rate": 1.9568719784083812e-07, "loss": 0.014, "step": 77860 }, { "grad_norm": 0.1583349108695984, "learning_rate": 1.9386464819924898e-07, "loss": 0.0181, "step": 77870 }, { "grad_norm": 0.18876224756240845, "learning_rate": 1.9205060906399285e-07, "loss": 0.0139, "step": 77880 }, { "grad_norm": 0.17135176062583923, "learning_rate": 1.9024508074503845e-07, "loss": 0.0117, "step": 77890 }, { "grad_norm": 0.1897803544998169, "learning_rate": 1.8844806355089452e-07, "loss": 0.0204, "step": 77900 }, { "grad_norm": 0.18387262523174286, "learning_rate": 1.8665955778863208e-07, "loss": 0.0219, "step": 77910 }, { "grad_norm": 0.17417772114276886, "learning_rate": 1.848795637638512e-07, "loss": 0.0124, "step": 77920 }, { "grad_norm": 0.3029000461101532, "learning_rate": 1.831080817807085e-07, "loss": 0.0153, "step": 77930 }, { "grad_norm": 0.2141748070716858, "learning_rate": 1.813451121418952e-07, "loss": 0.0212, "step": 77940 }, { "grad_norm": 0.22732530534267426, "learning_rate": 1.795906551486648e-07, "loss": 0.0109, "step": 77950 }, { "grad_norm": 0.20642656087875366, "learning_rate": 1.7784471110079414e-07, "loss": 0.0106, "step": 77960 }, { "grad_norm": 0.33922091126441956, "learning_rate": 1.7610728029662793e-07, "loss": 0.0138, "step": 77970 }, { "grad_norm": 0.13355199992656708, "learning_rate": 1.7437836303303979e-07, "loss": 0.0122, "step": 77980 }, { "grad_norm": 0.17100687325000763, "learning_rate": 1.726579596054545e-07, "loss": 0.0116, "step": 77990 }, { "grad_norm": 0.25473934412002563, "learning_rate": 1.7094607030784803e-07, "loss": 0.0138, "step": 78000 }, { "grad_norm": 0.22019614279270172, "learning_rate": 1.6924269543272532e-07, "loss": 0.0147, "step": 78010 }, { "grad_norm": 0.2186942845582962, "learning_rate": 1.6754783527115348e-07, "loss": 0.0111, "step": 78020 }, { "grad_norm": 0.21565665304660797, "learning_rate": 1.6586149011273422e-07, "loss": 0.0118, "step": 78030 }, { "grad_norm": 0.26237159967422485, "learning_rate": 1.6418366024562038e-07, "loss": 0.0216, "step": 78040 }, { "grad_norm": 0.2488923817873001, "learning_rate": 1.6251434595651037e-07, "loss": 0.0115, "step": 78050 }, { "grad_norm": 0.15666957199573517, "learning_rate": 1.608535475306372e-07, "loss": 0.0109, "step": 78060 }, { "grad_norm": 0.2917536795139313, "learning_rate": 1.5920126525179048e-07, "loss": 0.011, "step": 78070 }, { "grad_norm": 0.2521384060382843, "learning_rate": 1.5755749940229435e-07, "loss": 0.0106, "step": 78080 }, { "grad_norm": 0.36611998081207275, "learning_rate": 1.559222502630353e-07, "loss": 0.0124, "step": 78090 }, { "grad_norm": 0.24123547971248627, "learning_rate": 1.5429551811341757e-07, "loss": 0.0122, "step": 78100 }, { "grad_norm": 0.21021097898483276, "learning_rate": 1.5267730323141882e-07, "loss": 0.0155, "step": 78110 }, { "grad_norm": 0.1603155881166458, "learning_rate": 1.5106760589353454e-07, "loss": 0.0096, "step": 78120 }, { "grad_norm": 0.2068019062280655, "learning_rate": 1.4946642637483355e-07, "loss": 0.0099, "step": 78130 }, { "grad_norm": 0.25287455320358276, "learning_rate": 1.4787376494889703e-07, "loss": 0.01, "step": 78140 }, { "grad_norm": 0.24731265008449554, "learning_rate": 1.4628962188787955e-07, "loss": 0.0117, "step": 78150 }, { "grad_norm": 0.18753798305988312, "learning_rate": 1.4471399746247006e-07, "loss": 0.0161, "step": 78160 }, { "grad_norm": 0.18724074959754944, "learning_rate": 1.4314689194188103e-07, "loss": 0.0115, "step": 78170 }, { "grad_norm": 0.4343101978302002, "learning_rate": 1.4158830559390933e-07, "loss": 0.0181, "step": 78180 }, { "grad_norm": 0.1982218474149704, "learning_rate": 1.4003823868486422e-07, "loss": 0.0246, "step": 78190 }, { "grad_norm": 0.3143838047981262, "learning_rate": 1.3849669147960598e-07, "loss": 0.015, "step": 78200 }, { "grad_norm": 0.0971529483795166, "learning_rate": 1.3696366424155726e-07, "loss": 0.0117, "step": 78210 }, { "grad_norm": 0.6593595743179321, "learning_rate": 1.35439157232653e-07, "loss": 0.018, "step": 78220 }, { "grad_norm": 0.34230467677116394, "learning_rate": 1.3392317071340144e-07, "loss": 0.0104, "step": 78230 }, { "grad_norm": 0.3756946325302124, "learning_rate": 1.3241570494283984e-07, "loss": 0.013, "step": 78240 }, { "grad_norm": 0.4186871349811554, "learning_rate": 1.3091676017855103e-07, "loss": 0.0169, "step": 78250 }, { "grad_norm": 0.190072163939476, "learning_rate": 1.2942633667666904e-07, "loss": 0.0133, "step": 78260 }, { "grad_norm": 0.22248174250125885, "learning_rate": 1.2794443469185679e-07, "loss": 0.0173, "step": 78270 }, { "grad_norm": 0.1958102434873581, "learning_rate": 1.2647105447734508e-07, "loss": 0.0113, "step": 78280 }, { "grad_norm": 0.2118593007326126, "learning_rate": 1.2500619628488254e-07, "loss": 0.0104, "step": 78290 }, { "grad_norm": 0.19678330421447754, "learning_rate": 1.2354986036477446e-07, "loss": 0.016, "step": 78300 }, { "grad_norm": 0.18162932991981506, "learning_rate": 1.221020469658718e-07, "loss": 0.0118, "step": 78310 }, { "grad_norm": 0.2517935037612915, "learning_rate": 1.2066275633556556e-07, "loss": 0.0135, "step": 78320 }, { "grad_norm": 0.23704996705055237, "learning_rate": 1.192319887197979e-07, "loss": 0.0121, "step": 78330 }, { "grad_norm": 0.2643473744392395, "learning_rate": 1.1780974436303438e-07, "loss": 0.0107, "step": 78340 }, { "grad_norm": 0.15962670743465424, "learning_rate": 1.1639602350830836e-07, "loss": 0.0144, "step": 78350 }, { "grad_norm": 0.2503165006637573, "learning_rate": 1.1499082639718217e-07, "loss": 0.0147, "step": 78360 }, { "grad_norm": 0.1406359225511551, "learning_rate": 1.1359415326976374e-07, "loss": 0.0093, "step": 78370 }, { "grad_norm": 0.15227581560611725, "learning_rate": 1.1220600436470663e-07, "loss": 0.0162, "step": 78380 }, { "grad_norm": 0.19072774052619934, "learning_rate": 1.1082637991920996e-07, "loss": 0.0143, "step": 78390 }, { "grad_norm": 0.2626936137676239, "learning_rate": 1.0945528016901851e-07, "loss": 0.0134, "step": 78400 }, { "grad_norm": 0.27699828147888184, "learning_rate": 1.0809270534840599e-07, "loss": 0.0121, "step": 78410 }, { "grad_norm": 0.12754343450069427, "learning_rate": 1.067386556902028e-07, "loss": 0.0104, "step": 78420 }, { "grad_norm": 0.17767703533172607, "learning_rate": 1.05393131425785e-07, "loss": 0.0086, "step": 78430 }, { "grad_norm": 0.47604647278785706, "learning_rate": 1.0405613278505199e-07, "loss": 0.0161, "step": 78440 }, { "grad_norm": 0.31036657094955444, "learning_rate": 1.027276599964766e-07, "loss": 0.0115, "step": 78450 }, { "grad_norm": 0.17922335863113403, "learning_rate": 1.0140771328704391e-07, "loss": 0.0103, "step": 78460 }, { "grad_norm": 0.2566775679588318, "learning_rate": 1.0009629288231237e-07, "loss": 0.0105, "step": 78470 }, { "grad_norm": 0.272927463054657, "learning_rate": 9.879339900635276e-08, "loss": 0.0175, "step": 78480 }, { "grad_norm": 0.5215429067611694, "learning_rate": 9.74990318817981e-08, "loss": 0.0133, "step": 78490 }, { "grad_norm": 0.2809211313724518, "learning_rate": 9.621319172982701e-08, "loss": 0.0107, "step": 78500 }, { "grad_norm": 0.1736505627632141, "learning_rate": 9.493587877015264e-08, "loss": 0.0106, "step": 78510 }, { "grad_norm": 0.2836790084838867, "learning_rate": 9.366709322102263e-08, "loss": 0.0107, "step": 78520 }, { "grad_norm": 0.3242555856704712, "learning_rate": 9.240683529924688e-08, "loss": 0.0117, "step": 78530 }, { "grad_norm": 0.24757196009159088, "learning_rate": 9.115510522016979e-08, "loss": 0.0113, "step": 78540 }, { "grad_norm": 0.13930007815361023, "learning_rate": 8.991190319767583e-08, "loss": 0.0107, "step": 78550 }, { "grad_norm": 0.1231098622083664, "learning_rate": 8.867722944419509e-08, "loss": 0.011, "step": 78560 }, { "grad_norm": 0.19413071870803833, "learning_rate": 8.745108417069214e-08, "loss": 0.0116, "step": 78570 }, { "grad_norm": 0.17350585758686066, "learning_rate": 8.623346758669381e-08, "loss": 0.0106, "step": 78580 }, { "grad_norm": 0.22687019407749176, "learning_rate": 8.502437990025037e-08, "loss": 0.0115, "step": 78590 }, { "grad_norm": 0.13232967257499695, "learning_rate": 8.382382131795764e-08, "loss": 0.0108, "step": 78600 }, { "grad_norm": 0.11680332571268082, "learning_rate": 8.263179204496818e-08, "loss": 0.0177, "step": 78610 }, { "grad_norm": 0.2971747815608978, "learning_rate": 8.144829228496354e-08, "loss": 0.0176, "step": 78620 }, { "grad_norm": 0.2874131500720978, "learning_rate": 8.027332224016526e-08, "loss": 0.013, "step": 78630 }, { "grad_norm": 0.29315173625946045, "learning_rate": 7.910688211135164e-08, "loss": 0.0144, "step": 78640 }, { "grad_norm": 0.1922256350517273, "learning_rate": 7.794897209783546e-08, "loss": 0.0173, "step": 78650 }, { "grad_norm": 0.10866227746009827, "learning_rate": 7.679959239746403e-08, "loss": 0.0136, "step": 78660 }, { "grad_norm": 0.2016451209783554, "learning_rate": 7.565874320664135e-08, "loss": 0.0131, "step": 78670 }, { "grad_norm": 0.2869529128074646, "learning_rate": 7.45264247203059e-08, "loss": 0.013, "step": 78680 }, { "grad_norm": 0.2871364653110504, "learning_rate": 7.340263713194184e-08, "loss": 0.0112, "step": 78690 }, { "grad_norm": 0.20400497317314148, "learning_rate": 7.228738063356777e-08, "loss": 0.0116, "step": 78700 }, { "grad_norm": 0.4061574339866638, "learning_rate": 7.118065541575903e-08, "loss": 0.017, "step": 78710 }, { "grad_norm": 0.2022269368171692, "learning_rate": 7.008246166761435e-08, "loss": 0.0128, "step": 78720 }, { "grad_norm": 0.21172450482845306, "learning_rate": 6.899279957679472e-08, "loss": 0.0136, "step": 78730 }, { "grad_norm": 0.13965918123722076, "learning_rate": 6.791166932949011e-08, "loss": 0.0082, "step": 78740 }, { "grad_norm": 0.34243327379226685, "learning_rate": 6.683907111043608e-08, "loss": 0.0148, "step": 78750 }, { "grad_norm": 0.09815935045480728, "learning_rate": 6.577500510290824e-08, "loss": 0.0127, "step": 78760 }, { "grad_norm": 0.29531213641166687, "learning_rate": 6.47194714887278e-08, "loss": 0.024, "step": 78770 }, { "grad_norm": 0.17496171593666077, "learning_rate": 6.367247044825608e-08, "loss": 0.0113, "step": 78780 }, { "grad_norm": 0.14551585912704468, "learning_rate": 6.263400216039994e-08, "loss": 0.012, "step": 78790 }, { "grad_norm": 0.29089751839637756, "learning_rate": 6.160406680260078e-08, "loss": 0.0207, "step": 78800 }, { "grad_norm": 0.11478208750486374, "learning_rate": 6.058266455084561e-08, "loss": 0.0123, "step": 78810 }, { "grad_norm": 0.18059077858924866, "learning_rate": 5.956979557967257e-08, "loss": 0.0117, "step": 78820 }, { "grad_norm": 0.23362819850444794, "learning_rate": 5.856546006214325e-08, "loss": 0.0132, "step": 78830 }, { "grad_norm": 0.2053200751543045, "learning_rate": 5.7569658169881466e-08, "loss": 0.0111, "step": 78840 }, { "grad_norm": 0.4420333802700043, "learning_rate": 5.658239007303445e-08, "loss": 0.0159, "step": 78850 }, { "grad_norm": 0.3007148504257202, "learning_rate": 5.560365594030059e-08, "loss": 0.0118, "step": 78860 }, { "grad_norm": 0.25911104679107666, "learning_rate": 5.463345593891833e-08, "loss": 0.0137, "step": 78870 }, { "grad_norm": 0.3906126916408539, "learning_rate": 5.367179023467173e-08, "loss": 0.0156, "step": 78880 }, { "grad_norm": 0.2543913722038269, "learning_rate": 5.271865899187378e-08, "loss": 0.0105, "step": 78890 }, { "grad_norm": 0.36569562554359436, "learning_rate": 5.177406237340532e-08, "loss": 0.0316, "step": 78900 }, { "grad_norm": 0.21777406334877014, "learning_rate": 5.083800054065946e-08, "loss": 0.01, "step": 78910 }, { "grad_norm": 0.288921982049942, "learning_rate": 4.991047365358603e-08, "loss": 0.0125, "step": 78920 }, { "grad_norm": 0.5851228833198547, "learning_rate": 4.899148187067493e-08, "loss": 0.0216, "step": 78930 }, { "grad_norm": 0.12432494759559631, "learning_rate": 4.808102534895609e-08, "loss": 0.0093, "step": 78940 }, { "grad_norm": 0.14483876526355743, "learning_rate": 4.717910424400507e-08, "loss": 0.0102, "step": 78950 }, { "grad_norm": 0.2850014567375183, "learning_rate": 4.628571870993193e-08, "loss": 0.0115, "step": 78960 }, { "grad_norm": 0.2997974157333374, "learning_rate": 4.5400868899392324e-08, "loss": 0.0139, "step": 78970 }, { "grad_norm": 0.3335769474506378, "learning_rate": 4.4524554963576436e-08, "loss": 0.0157, "step": 78980 }, { "grad_norm": 0.2851256728172302, "learning_rate": 4.36567770522367e-08, "loss": 0.0114, "step": 78990 }, { "grad_norm": 0.1797732561826706, "learning_rate": 4.279753531364339e-08, "loss": 0.0136, "step": 79000 }, { "grad_norm": 0.279232919216156, "learning_rate": 4.1946829894617955e-08, "loss": 0.0167, "step": 79010 }, { "grad_norm": 0.0986630767583847, "learning_rate": 4.11046609405219e-08, "loss": 0.0231, "step": 79020 }, { "grad_norm": 0.2611841857433319, "learning_rate": 4.027102859526233e-08, "loss": 0.0095, "step": 79030 }, { "grad_norm": 0.7737959027290344, "learning_rate": 3.9445933001280856e-08, "loss": 0.0146, "step": 79040 }, { "grad_norm": 0.170450821518898, "learning_rate": 3.8629374299564704e-08, "loss": 0.017, "step": 79050 }, { "grad_norm": 0.16533303260803223, "learning_rate": 3.78213526296467e-08, "loss": 0.0121, "step": 79060 }, { "grad_norm": 0.2524893283843994, "learning_rate": 3.702186812958308e-08, "loss": 0.0146, "step": 79070 }, { "grad_norm": 0.23300747573375702, "learning_rate": 3.623092093599789e-08, "loss": 0.0159, "step": 79080 }, { "grad_norm": 0.36607635021209717, "learning_rate": 3.544851118403303e-08, "loss": 0.0122, "step": 79090 }, { "grad_norm": 0.18454386293888092, "learning_rate": 3.46746390073871e-08, "loss": 0.0142, "step": 79100 }, { "grad_norm": 0.2730787396430969, "learning_rate": 3.390930453829322e-08, "loss": 0.0175, "step": 79110 }, { "grad_norm": 0.2997169494628906, "learning_rate": 3.3152507907519005e-08, "loss": 0.0117, "step": 79120 }, { "grad_norm": 0.1744309365749359, "learning_rate": 3.240424924438323e-08, "loss": 0.0117, "step": 79130 }, { "grad_norm": 0.2043483704328537, "learning_rate": 3.1664528676750295e-08, "loss": 0.0135, "step": 79140 }, { "grad_norm": 0.24119797348976135, "learning_rate": 3.093334633100797e-08, "loss": 0.0206, "step": 79150 }, { "grad_norm": 0.25981608033180237, "learning_rate": 3.021070233210077e-08, "loss": 0.0156, "step": 79160 }, { "grad_norm": 0.5336216688156128, "learning_rate": 2.9496596803507692e-08, "loss": 0.0121, "step": 79170 }, { "grad_norm": 0.16437210142612457, "learning_rate": 2.879102986725335e-08, "loss": 0.012, "step": 79180 }, { "grad_norm": 0.2486262172460556, "learning_rate": 2.809400164389131e-08, "loss": 0.0151, "step": 79190 }, { "grad_norm": 0.4803626239299774, "learning_rate": 2.740551225253185e-08, "loss": 0.0124, "step": 79200 }, { "grad_norm": 0.2524105906486511, "learning_rate": 2.6725561810819754e-08, "loss": 0.0139, "step": 79210 }, { "grad_norm": 0.30714908242225647, "learning_rate": 2.605415043493431e-08, "loss": 0.0115, "step": 79220 }, { "grad_norm": 0.19642233848571777, "learning_rate": 2.5391278239605965e-08, "loss": 0.0127, "step": 79230 }, { "grad_norm": 0.23333553969860077, "learning_rate": 2.4736945338094118e-08, "loss": 0.0097, "step": 79240 }, { "grad_norm": 0.2873179018497467, "learning_rate": 2.4091151842214887e-08, "loss": 0.0133, "step": 79250 }, { "grad_norm": 0.2666356861591339, "learning_rate": 2.3453897862318884e-08, "loss": 0.0148, "step": 79260 }, { "grad_norm": 0.26522791385650635, "learning_rate": 2.2825183507285686e-08, "loss": 0.0109, "step": 79270 }, { "grad_norm": 0.19828788936138153, "learning_rate": 2.220500888455157e-08, "loss": 0.0157, "step": 79280 }, { "grad_norm": 0.1781986802816391, "learning_rate": 2.1593374100081777e-08, "loss": 0.0175, "step": 79290 }, { "grad_norm": 0.16375109553337097, "learning_rate": 2.09902792583927e-08, "loss": 0.0186, "step": 79300 }, { "grad_norm": 0.2557181417942047, "learning_rate": 2.0395724462540788e-08, "loss": 0.0115, "step": 79310 }, { "grad_norm": 0.33915290236473083, "learning_rate": 1.9809709814111453e-08, "loss": 0.016, "step": 79320 }, { "grad_norm": 0.24253469705581665, "learning_rate": 1.923223541324126e-08, "loss": 0.0174, "step": 79330 }, { "grad_norm": 0.19024862349033356, "learning_rate": 1.8663301358606833e-08, "loss": 0.0166, "step": 79340 }, { "grad_norm": 0.7115675806999207, "learning_rate": 1.8102907747419295e-08, "loss": 0.0158, "step": 79350 }, { "grad_norm": 0.362504780292511, "learning_rate": 1.7551054675435385e-08, "loss": 0.0137, "step": 79360 }, { "grad_norm": 0.13155236840248108, "learning_rate": 1.7007742236957447e-08, "loss": 0.0132, "step": 79370 }, { "grad_norm": 0.2344091832637787, "learning_rate": 1.647297052481678e-08, "loss": 0.0146, "step": 79380 }, { "grad_norm": 0.15593162178993225, "learning_rate": 1.5946739630390285e-08, "loss": 0.0162, "step": 79390 }, { "grad_norm": 0.1722780019044876, "learning_rate": 1.5429049643606032e-08, "loss": 0.0186, "step": 79400 }, { "grad_norm": 0.16616718471050262, "learning_rate": 1.4919900652909935e-08, "loss": 0.018, "step": 79410 }, { "grad_norm": 0.23168256878852844, "learning_rate": 1.4419292745310175e-08, "loss": 0.0148, "step": 79420 }, { "grad_norm": 0.16501998901367188, "learning_rate": 1.3927226006343885e-08, "loss": 0.0118, "step": 79430 }, { "grad_norm": 0.2825951874256134, "learning_rate": 1.3443700520093805e-08, "loss": 0.016, "step": 79440 }, { "grad_norm": 0.2167988270521164, "learning_rate": 1.296871636917718e-08, "loss": 0.0137, "step": 79450 }, { "grad_norm": 0.1956518292427063, "learning_rate": 1.2502273634762419e-08, "loss": 0.0134, "step": 79460 }, { "grad_norm": 0.3206793963909149, "learning_rate": 1.2044372396546876e-08, "loss": 0.0146, "step": 79470 }, { "grad_norm": 0.20659086108207703, "learning_rate": 1.1595012732773524e-08, "loss": 0.0141, "step": 79480 }, { "grad_norm": 0.20475135743618011, "learning_rate": 1.1154194720225386e-08, "loss": 0.0168, "step": 79490 }, { "grad_norm": 0.2971075177192688, "learning_rate": 1.0721918434231093e-08, "loss": 0.0113, "step": 79500 }, { "grad_norm": 0.1785849928855896, "learning_rate": 1.0298183948648232e-08, "loss": 0.0089, "step": 79510 }, { "grad_norm": 0.16786909103393555, "learning_rate": 9.88299133588555e-09, "loss": 0.0106, "step": 79520 }, { "grad_norm": 0.3237256705760956, "learning_rate": 9.476340666891847e-09, "loss": 0.0189, "step": 79530 }, { "grad_norm": 0.13191744685173035, "learning_rate": 9.078232011139332e-09, "loss": 0.0105, "step": 79540 }, { "grad_norm": 0.2208012193441391, "learning_rate": 8.68866543666802e-09, "loss": 0.0106, "step": 79550 }, { "grad_norm": 0.12624691426753998, "learning_rate": 8.307641010035782e-09, "loss": 0.0126, "step": 79560 }, { "grad_norm": 0.4664345383644104, "learning_rate": 7.93515879635165e-09, "loss": 0.0166, "step": 79570 }, { "grad_norm": 0.23966901004314423, "learning_rate": 7.571218859264706e-09, "loss": 0.0113, "step": 79580 }, { "grad_norm": 0.23275962471961975, "learning_rate": 7.215821260958544e-09, "loss": 0.0141, "step": 79590 }, { "grad_norm": 0.20350579917430878, "learning_rate": 6.868966062162363e-09, "loss": 0.0116, "step": 79600 }, { "grad_norm": 0.1973460167646408, "learning_rate": 6.530653322145419e-09, "loss": 0.0107, "step": 79610 }, { "grad_norm": 0.17247232794761658, "learning_rate": 6.200883098717025e-09, "loss": 0.01, "step": 79620 }, { "grad_norm": 0.345542311668396, "learning_rate": 5.879655448226551e-09, "loss": 0.016, "step": 79630 }, { "grad_norm": 0.14780814945697784, "learning_rate": 5.566970425557872e-09, "loss": 0.011, "step": 79640 }, { "grad_norm": 0.3983137011528015, "learning_rate": 5.26282808414047e-09, "loss": 0.0121, "step": 79650 }, { "grad_norm": 0.26662468910217285, "learning_rate": 4.967228475949437e-09, "loss": 0.0151, "step": 79660 }, { "grad_norm": 0.2290860414505005, "learning_rate": 4.680171651494369e-09, "loss": 0.0108, "step": 79670 }, { "grad_norm": 0.27440688014030457, "learning_rate": 4.40165765981937e-09, "loss": 0.0089, "step": 79680 }, { "grad_norm": 0.3352358639240265, "learning_rate": 4.131686548519698e-09, "loss": 0.0113, "step": 79690 }, { "grad_norm": 0.27713602781295776, "learning_rate": 3.8702583637251214e-09, "loss": 0.013, "step": 79700 }, { "grad_norm": 0.21250082552433014, "learning_rate": 3.617373150105463e-09, "loss": 0.0112, "step": 79710 }, { "grad_norm": 0.48672595620155334, "learning_rate": 3.3730309508706036e-09, "loss": 0.0118, "step": 79720 }, { "grad_norm": 0.2949310839176178, "learning_rate": 3.137231807781582e-09, "loss": 0.0122, "step": 79730 }, { "grad_norm": 0.15106746554374695, "learning_rate": 2.9099757611172894e-09, "loss": 0.0134, "step": 79740 }, { "grad_norm": 0.1520644575357437, "learning_rate": 2.6912628497133275e-09, "loss": 0.0115, "step": 79750 }, { "grad_norm": 0.27278512716293335, "learning_rate": 2.481093110945354e-09, "loss": 0.0144, "step": 79760 }, { "grad_norm": 0.12194862961769104, "learning_rate": 2.2794665807235327e-09, "loss": 0.0103, "step": 79770 }, { "grad_norm": 0.17785488069057465, "learning_rate": 2.0863832934980843e-09, "loss": 0.0093, "step": 79780 }, { "grad_norm": 0.17340533435344696, "learning_rate": 1.901843282264837e-09, "loss": 0.0139, "step": 79790 }, { "grad_norm": 0.17771439254283905, "learning_rate": 1.7258465785541245e-09, "loss": 0.0161, "step": 79800 }, { "grad_norm": 0.2146417200565338, "learning_rate": 1.5583932124418887e-09, "loss": 0.014, "step": 79810 }, { "grad_norm": 0.14461390674114227, "learning_rate": 1.3994832125441282e-09, "loss": 0.016, "step": 79820 }, { "grad_norm": 0.1550809144973755, "learning_rate": 1.2491166060057958e-09, "loss": 0.0116, "step": 79830 }, { "grad_norm": 0.14646607637405396, "learning_rate": 1.1072934185230032e-09, "loss": 0.0151, "step": 79840 }, { "grad_norm": 0.2993324100971222, "learning_rate": 9.740136743319195e-10, "loss": 0.0154, "step": 79850 }, { "grad_norm": 0.2028588056564331, "learning_rate": 8.492773962087696e-10, "loss": 0.0107, "step": 79860 }, { "grad_norm": 0.2236708551645279, "learning_rate": 7.330846054587337e-10, "loss": 0.0146, "step": 79870 }, { "grad_norm": 0.22716385126113892, "learning_rate": 6.254353219492526e-10, "loss": 0.0151, "step": 79880 }, { "grad_norm": 0.2075785994529724, "learning_rate": 5.263295640600685e-10, "loss": 0.0131, "step": 79890 }, { "grad_norm": 0.1598287969827652, "learning_rate": 4.357673487387359e-10, "loss": 0.0148, "step": 79900 }, { "grad_norm": 0.3783297836780548, "learning_rate": 3.537486914506616e-10, "loss": 0.0148, "step": 79910 }, { "grad_norm": 0.2018970251083374, "learning_rate": 2.8027360621241117e-10, "loss": 0.0121, "step": 79920 }, { "grad_norm": 0.22136607766151428, "learning_rate": 2.153421055806071e-10, "loss": 0.0128, "step": 79930 }, { "grad_norm": 0.18665163218975067, "learning_rate": 1.5895420064637733e-10, "loss": 0.0131, "step": 79940 }, { "grad_norm": 0.35647618770599365, "learning_rate": 1.1110990105200891e-10, "loss": 0.0127, "step": 79950 }, { "grad_norm": 0.3321819603443146, "learning_rate": 7.180921496874326e-11, "loss": 0.0166, "step": 79960 }, { "grad_norm": 0.23615679144859314, "learning_rate": 4.10521491134297e-11, "loss": 0.012, "step": 79970 }, { "grad_norm": 0.24507121741771698, "learning_rate": 1.883870874297422e-11, "loss": 0.0114, "step": 79980 }, { "grad_norm": 0.3560073673725128, "learning_rate": 5.1688976432373584e-12, "loss": 0.0146, "step": 79990 }, { "grad_norm": 0.22764228284358978, "learning_rate": 4.271816234080461e-14, "loss": 0.0128, "step": 80000 } ], "logging_steps": 10, "max_steps": 80000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }