{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00547645125958379, "grad_norm": 5.172877788543701, "learning_rate": 2e-05, "loss": 0.9408, "step": 10 }, { "epoch": 0.01095290251916758, "grad_norm": 3.29821515083313, "learning_rate": 2e-05, "loss": 0.4667, "step": 20 }, { "epoch": 0.01642935377875137, "grad_norm": 8.682180404663086, "learning_rate": 2e-05, "loss": 0.4354, "step": 30 }, { "epoch": 0.02190580503833516, "grad_norm": 2.988210439682007, "learning_rate": 2e-05, "loss": 0.3187, "step": 40 }, { "epoch": 0.027382256297918947, "grad_norm": 2.8658511638641357, "learning_rate": 2e-05, "loss": 0.2634, "step": 50 }, { "epoch": 0.03285870755750274, "grad_norm": 4.159895896911621, "learning_rate": 2e-05, "loss": 0.2714, "step": 60 }, { "epoch": 0.038335158817086525, "grad_norm": 4.234712600708008, "learning_rate": 2e-05, "loss": 0.2401, "step": 70 }, { "epoch": 0.04381161007667032, "grad_norm": 4.615501403808594, "learning_rate": 2e-05, "loss": 0.2764, "step": 80 }, { "epoch": 0.04928806133625411, "grad_norm": 2.7115259170532227, "learning_rate": 2e-05, "loss": 0.2515, "step": 90 }, { "epoch": 0.054764512595837894, "grad_norm": 4.8792500495910645, "learning_rate": 2e-05, "loss": 0.248, "step": 100 }, { "epoch": 0.060240963855421686, "grad_norm": 3.3276002407073975, "learning_rate": 2e-05, "loss": 0.2377, "step": 110 }, { "epoch": 0.06571741511500548, "grad_norm": 3.494762897491455, "learning_rate": 2e-05, "loss": 0.2317, "step": 120 }, { "epoch": 0.07119386637458927, "grad_norm": 3.085296392440796, "learning_rate": 2e-05, "loss": 0.2164, "step": 130 }, { "epoch": 0.07667031763417305, "grad_norm": 2.1236257553100586, "learning_rate": 2e-05, "loss": 0.2612, "step": 140 }, { "epoch": 0.08214676889375684, "grad_norm": 2.76648211479187, "learning_rate": 2e-05, "loss": 0.2113, "step": 150 }, { "epoch": 0.08762322015334063, "grad_norm": 5.641284942626953, "learning_rate": 2e-05, "loss": 0.2361, "step": 160 }, { "epoch": 0.09309967141292443, "grad_norm": 2.8492302894592285, "learning_rate": 2e-05, "loss": 0.2273, "step": 170 }, { "epoch": 0.09857612267250822, "grad_norm": 4.214968204498291, "learning_rate": 2e-05, "loss": 0.2411, "step": 180 }, { "epoch": 0.10405257393209201, "grad_norm": 4.820573329925537, "learning_rate": 2e-05, "loss": 0.182, "step": 190 }, { "epoch": 0.10952902519167579, "grad_norm": 2.3844950199127197, "learning_rate": 2e-05, "loss": 0.2221, "step": 200 }, { "epoch": 0.11500547645125958, "grad_norm": 3.7508792877197266, "learning_rate": 2e-05, "loss": 0.2787, "step": 210 }, { "epoch": 0.12048192771084337, "grad_norm": 3.6480345726013184, "learning_rate": 2e-05, "loss": 0.2308, "step": 220 }, { "epoch": 0.12595837897042717, "grad_norm": 3.0960395336151123, "learning_rate": 2e-05, "loss": 0.2141, "step": 230 }, { "epoch": 0.13143483023001096, "grad_norm": 2.7262496948242188, "learning_rate": 2e-05, "loss": 0.2295, "step": 240 }, { "epoch": 0.13691128148959475, "grad_norm": 3.7271182537078857, "learning_rate": 2e-05, "loss": 0.1957, "step": 250 }, { "epoch": 0.14238773274917854, "grad_norm": 3.30234432220459, "learning_rate": 2e-05, "loss": 0.233, "step": 260 }, { "epoch": 0.14786418400876233, "grad_norm": 2.1024515628814697, "learning_rate": 2e-05, "loss": 0.205, "step": 270 }, { "epoch": 0.1533406352683461, "grad_norm": 3.121746063232422, "learning_rate": 2e-05, "loss": 0.1806, "step": 280 }, { "epoch": 0.1588170865279299, "grad_norm": 5.09675407409668, "learning_rate": 2e-05, "loss": 0.2143, "step": 290 }, { "epoch": 0.16429353778751368, "grad_norm": 2.1228365898132324, "learning_rate": 2e-05, "loss": 0.1972, "step": 300 }, { "epoch": 0.16976998904709747, "grad_norm": 5.942009925842285, "learning_rate": 2e-05, "loss": 0.2367, "step": 310 }, { "epoch": 0.17524644030668127, "grad_norm": 3.828641653060913, "learning_rate": 2e-05, "loss": 0.2219, "step": 320 }, { "epoch": 0.18072289156626506, "grad_norm": 3.5164341926574707, "learning_rate": 2e-05, "loss": 0.2209, "step": 330 }, { "epoch": 0.18619934282584885, "grad_norm": 3.276562452316284, "learning_rate": 2e-05, "loss": 0.2066, "step": 340 }, { "epoch": 0.19167579408543264, "grad_norm": 2.173800468444824, "learning_rate": 2e-05, "loss": 0.1824, "step": 350 }, { "epoch": 0.19715224534501644, "grad_norm": 4.548515319824219, "learning_rate": 2e-05, "loss": 0.2085, "step": 360 }, { "epoch": 0.20262869660460023, "grad_norm": 5.658101558685303, "learning_rate": 2e-05, "loss": 0.2021, "step": 370 }, { "epoch": 0.20810514786418402, "grad_norm": 2.9604289531707764, "learning_rate": 2e-05, "loss": 0.2139, "step": 380 }, { "epoch": 0.21358159912376778, "grad_norm": 1.595267653465271, "learning_rate": 2e-05, "loss": 0.2441, "step": 390 }, { "epoch": 0.21905805038335158, "grad_norm": 2.9476821422576904, "learning_rate": 2e-05, "loss": 0.2221, "step": 400 }, { "epoch": 0.22453450164293537, "grad_norm": 6.530129909515381, "learning_rate": 2e-05, "loss": 0.2275, "step": 410 }, { "epoch": 0.23001095290251916, "grad_norm": 2.306079626083374, "learning_rate": 2e-05, "loss": 0.1799, "step": 420 }, { "epoch": 0.23548740416210295, "grad_norm": 3.0433263778686523, "learning_rate": 2e-05, "loss": 0.185, "step": 430 }, { "epoch": 0.24096385542168675, "grad_norm": 3.235048770904541, "learning_rate": 2e-05, "loss": 0.2363, "step": 440 }, { "epoch": 0.24644030668127054, "grad_norm": 2.9125173091888428, "learning_rate": 2e-05, "loss": 0.2158, "step": 450 }, { "epoch": 0.25191675794085433, "grad_norm": 1.9198905229568481, "learning_rate": 2e-05, "loss": 0.2514, "step": 460 }, { "epoch": 0.2573932092004381, "grad_norm": 4.257998943328857, "learning_rate": 2e-05, "loss": 0.2283, "step": 470 }, { "epoch": 0.2628696604600219, "grad_norm": 1.3411542177200317, "learning_rate": 2e-05, "loss": 0.1712, "step": 480 }, { "epoch": 0.2683461117196057, "grad_norm": 2.114000082015991, "learning_rate": 2e-05, "loss": 0.2022, "step": 490 }, { "epoch": 0.2738225629791895, "grad_norm": 2.4874045848846436, "learning_rate": 2e-05, "loss": 0.1744, "step": 500 }, { "epoch": 0.2792990142387733, "grad_norm": 3.700068950653076, "learning_rate": 2e-05, "loss": 0.2078, "step": 510 }, { "epoch": 0.2847754654983571, "grad_norm": 3.8135080337524414, "learning_rate": 2e-05, "loss": 0.1884, "step": 520 }, { "epoch": 0.2902519167579409, "grad_norm": 2.2092432975769043, "learning_rate": 2e-05, "loss": 0.1652, "step": 530 }, { "epoch": 0.29572836801752467, "grad_norm": 2.4670491218566895, "learning_rate": 2e-05, "loss": 0.2201, "step": 540 }, { "epoch": 0.30120481927710846, "grad_norm": 2.9962000846862793, "learning_rate": 2e-05, "loss": 0.1776, "step": 550 }, { "epoch": 0.3066812705366922, "grad_norm": 2.7531886100769043, "learning_rate": 2e-05, "loss": 0.1997, "step": 560 }, { "epoch": 0.312157721796276, "grad_norm": 3.9697697162628174, "learning_rate": 2e-05, "loss": 0.1689, "step": 570 }, { "epoch": 0.3176341730558598, "grad_norm": 2.352689743041992, "learning_rate": 2e-05, "loss": 0.2196, "step": 580 }, { "epoch": 0.3231106243154436, "grad_norm": 2.1512277126312256, "learning_rate": 2e-05, "loss": 0.2266, "step": 590 }, { "epoch": 0.32858707557502737, "grad_norm": 3.3206424713134766, "learning_rate": 2e-05, "loss": 0.1837, "step": 600 }, { "epoch": 0.33406352683461116, "grad_norm": 2.5991427898406982, "learning_rate": 2e-05, "loss": 0.1642, "step": 610 }, { "epoch": 0.33953997809419495, "grad_norm": 4.06941032409668, "learning_rate": 2e-05, "loss": 0.1954, "step": 620 }, { "epoch": 0.34501642935377874, "grad_norm": 2.006395101547241, "learning_rate": 2e-05, "loss": 0.1644, "step": 630 }, { "epoch": 0.35049288061336253, "grad_norm": 2.333529233932495, "learning_rate": 2e-05, "loss": 0.1766, "step": 640 }, { "epoch": 0.3559693318729463, "grad_norm": 1.8663638830184937, "learning_rate": 2e-05, "loss": 0.1427, "step": 650 }, { "epoch": 0.3614457831325301, "grad_norm": 1.4063774347305298, "learning_rate": 2e-05, "loss": 0.1886, "step": 660 }, { "epoch": 0.3669222343921139, "grad_norm": 2.3204855918884277, "learning_rate": 2e-05, "loss": 0.1794, "step": 670 }, { "epoch": 0.3723986856516977, "grad_norm": 2.0688014030456543, "learning_rate": 2e-05, "loss": 0.1788, "step": 680 }, { "epoch": 0.3778751369112815, "grad_norm": 2.9626317024230957, "learning_rate": 2e-05, "loss": 0.1931, "step": 690 }, { "epoch": 0.3833515881708653, "grad_norm": 2.1686134338378906, "learning_rate": 2e-05, "loss": 0.1797, "step": 700 }, { "epoch": 0.3888280394304491, "grad_norm": 3.220993995666504, "learning_rate": 2e-05, "loss": 0.2001, "step": 710 }, { "epoch": 0.39430449069003287, "grad_norm": 4.387556076049805, "learning_rate": 2e-05, "loss": 0.1679, "step": 720 }, { "epoch": 0.39978094194961666, "grad_norm": 6.711359024047852, "learning_rate": 2e-05, "loss": 0.1906, "step": 730 }, { "epoch": 0.40525739320920046, "grad_norm": 2.8974344730377197, "learning_rate": 2e-05, "loss": 0.1842, "step": 740 }, { "epoch": 0.41073384446878425, "grad_norm": 2.0242817401885986, "learning_rate": 2e-05, "loss": 0.1874, "step": 750 }, { "epoch": 0.41621029572836804, "grad_norm": 2.71280574798584, "learning_rate": 2e-05, "loss": 0.1226, "step": 760 }, { "epoch": 0.42168674698795183, "grad_norm": 2.318493604660034, "learning_rate": 2e-05, "loss": 0.2111, "step": 770 }, { "epoch": 0.42716319824753557, "grad_norm": 4.267580509185791, "learning_rate": 2e-05, "loss": 0.2008, "step": 780 }, { "epoch": 0.43263964950711936, "grad_norm": 2.4939076900482178, "learning_rate": 2e-05, "loss": 0.2053, "step": 790 }, { "epoch": 0.43811610076670315, "grad_norm": 1.9013803005218506, "learning_rate": 2e-05, "loss": 0.1724, "step": 800 }, { "epoch": 0.44359255202628695, "grad_norm": 3.4086337089538574, "learning_rate": 2e-05, "loss": 0.1688, "step": 810 }, { "epoch": 0.44906900328587074, "grad_norm": 4.253098011016846, "learning_rate": 2e-05, "loss": 0.2092, "step": 820 }, { "epoch": 0.45454545454545453, "grad_norm": 1.7755553722381592, "learning_rate": 2e-05, "loss": 0.1812, "step": 830 }, { "epoch": 0.4600219058050383, "grad_norm": 1.562054991722107, "learning_rate": 2e-05, "loss": 0.2296, "step": 840 }, { "epoch": 0.4654983570646221, "grad_norm": 1.4678446054458618, "learning_rate": 2e-05, "loss": 0.2006, "step": 850 }, { "epoch": 0.4709748083242059, "grad_norm": 2.852142810821533, "learning_rate": 2e-05, "loss": 0.1763, "step": 860 }, { "epoch": 0.4764512595837897, "grad_norm": 3.451470136642456, "learning_rate": 2e-05, "loss": 0.198, "step": 870 }, { "epoch": 0.4819277108433735, "grad_norm": 4.738248348236084, "learning_rate": 2e-05, "loss": 0.1803, "step": 880 }, { "epoch": 0.4874041621029573, "grad_norm": 2.169006824493408, "learning_rate": 2e-05, "loss": 0.1711, "step": 890 }, { "epoch": 0.4928806133625411, "grad_norm": 6.985939979553223, "learning_rate": 2e-05, "loss": 0.1828, "step": 900 }, { "epoch": 0.49835706462212487, "grad_norm": 3.6901047229766846, "learning_rate": 2e-05, "loss": 0.227, "step": 910 }, { "epoch": 0.5038335158817087, "grad_norm": 4.057900428771973, "learning_rate": 2e-05, "loss": 0.181, "step": 920 }, { "epoch": 0.5093099671412924, "grad_norm": 2.3261914253234863, "learning_rate": 2e-05, "loss": 0.2152, "step": 930 }, { "epoch": 0.5147864184008762, "grad_norm": 2.459638833999634, "learning_rate": 2e-05, "loss": 0.2261, "step": 940 }, { "epoch": 0.52026286966046, "grad_norm": 1.9431893825531006, "learning_rate": 2e-05, "loss": 0.1555, "step": 950 }, { "epoch": 0.5257393209200438, "grad_norm": 2.213655710220337, "learning_rate": 2e-05, "loss": 0.1966, "step": 960 }, { "epoch": 0.5312157721796276, "grad_norm": 3.0963807106018066, "learning_rate": 2e-05, "loss": 0.1672, "step": 970 }, { "epoch": 0.5366922234392114, "grad_norm": 1.3352348804473877, "learning_rate": 2e-05, "loss": 0.1688, "step": 980 }, { "epoch": 0.5421686746987951, "grad_norm": 4.130510330200195, "learning_rate": 2e-05, "loss": 0.2033, "step": 990 }, { "epoch": 0.547645125958379, "grad_norm": 1.72730553150177, "learning_rate": 2e-05, "loss": 0.1883, "step": 1000 }, { "epoch": 0.5531215772179627, "grad_norm": 2.0997588634490967, "learning_rate": 2e-05, "loss": 0.2085, "step": 1010 }, { "epoch": 0.5585980284775466, "grad_norm": 1.9637783765792847, "learning_rate": 2e-05, "loss": 0.1787, "step": 1020 }, { "epoch": 0.5640744797371303, "grad_norm": 5.923566818237305, "learning_rate": 2e-05, "loss": 0.179, "step": 1030 }, { "epoch": 0.5695509309967142, "grad_norm": 2.0748746395111084, "learning_rate": 2e-05, "loss": 0.1937, "step": 1040 }, { "epoch": 0.5750273822562979, "grad_norm": 3.298703193664551, "learning_rate": 2e-05, "loss": 0.1661, "step": 1050 }, { "epoch": 0.5805038335158818, "grad_norm": 2.482518196105957, "learning_rate": 2e-05, "loss": 0.1767, "step": 1060 }, { "epoch": 0.5859802847754655, "grad_norm": 1.430920124053955, "learning_rate": 2e-05, "loss": 0.1592, "step": 1070 }, { "epoch": 0.5914567360350493, "grad_norm": 2.223555326461792, "learning_rate": 2e-05, "loss": 0.1623, "step": 1080 }, { "epoch": 0.5969331872946331, "grad_norm": 3.7139480113983154, "learning_rate": 2e-05, "loss": 0.2036, "step": 1090 }, { "epoch": 0.6024096385542169, "grad_norm": 2.7747836112976074, "learning_rate": 2e-05, "loss": 0.159, "step": 1100 }, { "epoch": 0.6078860898138007, "grad_norm": 1.7586026191711426, "learning_rate": 2e-05, "loss": 0.1994, "step": 1110 }, { "epoch": 0.6133625410733844, "grad_norm": 1.7416315078735352, "learning_rate": 2e-05, "loss": 0.1974, "step": 1120 }, { "epoch": 0.6188389923329682, "grad_norm": 2.214825391769409, "learning_rate": 2e-05, "loss": 0.1584, "step": 1130 }, { "epoch": 0.624315443592552, "grad_norm": 4.937350273132324, "learning_rate": 2e-05, "loss": 0.1751, "step": 1140 }, { "epoch": 0.6297918948521358, "grad_norm": 2.918086290359497, "learning_rate": 2e-05, "loss": 0.1872, "step": 1150 }, { "epoch": 0.6352683461117196, "grad_norm": 2.486037254333496, "learning_rate": 2e-05, "loss": 0.1769, "step": 1160 }, { "epoch": 0.6407447973713034, "grad_norm": 1.8280752897262573, "learning_rate": 2e-05, "loss": 0.1948, "step": 1170 }, { "epoch": 0.6462212486308871, "grad_norm": 2.8489694595336914, "learning_rate": 2e-05, "loss": 0.1598, "step": 1180 }, { "epoch": 0.651697699890471, "grad_norm": 5.198742389678955, "learning_rate": 2e-05, "loss": 0.1503, "step": 1190 }, { "epoch": 0.6571741511500547, "grad_norm": 4.022942066192627, "learning_rate": 2e-05, "loss": 0.1242, "step": 1200 }, { "epoch": 0.6626506024096386, "grad_norm": 1.6303025484085083, "learning_rate": 2e-05, "loss": 0.1753, "step": 1210 }, { "epoch": 0.6681270536692223, "grad_norm": 2.142221450805664, "learning_rate": 2e-05, "loss": 0.1933, "step": 1220 }, { "epoch": 0.6736035049288062, "grad_norm": 1.7236963510513306, "learning_rate": 2e-05, "loss": 0.1632, "step": 1230 }, { "epoch": 0.6790799561883899, "grad_norm": 6.54170560836792, "learning_rate": 2e-05, "loss": 0.1745, "step": 1240 }, { "epoch": 0.6845564074479737, "grad_norm": 3.051344394683838, "learning_rate": 2e-05, "loss": 0.161, "step": 1250 }, { "epoch": 0.6900328587075575, "grad_norm": 4.368143558502197, "learning_rate": 2e-05, "loss": 0.1686, "step": 1260 }, { "epoch": 0.6955093099671413, "grad_norm": 2.810403347015381, "learning_rate": 2e-05, "loss": 0.1341, "step": 1270 }, { "epoch": 0.7009857612267251, "grad_norm": 3.573010206222534, "learning_rate": 2e-05, "loss": 0.1888, "step": 1280 }, { "epoch": 0.7064622124863089, "grad_norm": 3.028301954269409, "learning_rate": 2e-05, "loss": 0.1753, "step": 1290 }, { "epoch": 0.7119386637458927, "grad_norm": 3.8532004356384277, "learning_rate": 2e-05, "loss": 0.1733, "step": 1300 }, { "epoch": 0.7174151150054765, "grad_norm": 2.062229633331299, "learning_rate": 2e-05, "loss": 0.1533, "step": 1310 }, { "epoch": 0.7228915662650602, "grad_norm": 3.278475284576416, "learning_rate": 2e-05, "loss": 0.157, "step": 1320 }, { "epoch": 0.7283680175246441, "grad_norm": 2.650078058242798, "learning_rate": 2e-05, "loss": 0.168, "step": 1330 }, { "epoch": 0.7338444687842278, "grad_norm": 3.017892360687256, "learning_rate": 2e-05, "loss": 0.2381, "step": 1340 }, { "epoch": 0.7393209200438116, "grad_norm": 1.8926328420639038, "learning_rate": 2e-05, "loss": 0.1815, "step": 1350 }, { "epoch": 0.7447973713033954, "grad_norm": 1.819949746131897, "learning_rate": 2e-05, "loss": 0.1387, "step": 1360 }, { "epoch": 0.7502738225629791, "grad_norm": 1.5167309045791626, "learning_rate": 2e-05, "loss": 0.138, "step": 1370 }, { "epoch": 0.755750273822563, "grad_norm": 2.128439426422119, "learning_rate": 2e-05, "loss": 0.1566, "step": 1380 }, { "epoch": 0.7612267250821467, "grad_norm": 1.9905339479446411, "learning_rate": 2e-05, "loss": 0.1711, "step": 1390 }, { "epoch": 0.7667031763417306, "grad_norm": 1.624918818473816, "learning_rate": 2e-05, "loss": 0.1614, "step": 1400 }, { "epoch": 0.7721796276013143, "grad_norm": 2.2797772884368896, "learning_rate": 2e-05, "loss": 0.1839, "step": 1410 }, { "epoch": 0.7776560788608982, "grad_norm": 1.8958566188812256, "learning_rate": 2e-05, "loss": 0.1609, "step": 1420 }, { "epoch": 0.7831325301204819, "grad_norm": 4.018071174621582, "learning_rate": 2e-05, "loss": 0.2153, "step": 1430 }, { "epoch": 0.7886089813800657, "grad_norm": 2.038041353225708, "learning_rate": 2e-05, "loss": 0.1576, "step": 1440 }, { "epoch": 0.7940854326396495, "grad_norm": 3.009593963623047, "learning_rate": 2e-05, "loss": 0.1448, "step": 1450 }, { "epoch": 0.7995618838992333, "grad_norm": 2.0207109451293945, "learning_rate": 2e-05, "loss": 0.1642, "step": 1460 }, { "epoch": 0.8050383351588171, "grad_norm": 1.9745655059814453, "learning_rate": 2e-05, "loss": 0.1499, "step": 1470 }, { "epoch": 0.8105147864184009, "grad_norm": 2.5017263889312744, "learning_rate": 2e-05, "loss": 0.1865, "step": 1480 }, { "epoch": 0.8159912376779846, "grad_norm": 3.3768310546875, "learning_rate": 2e-05, "loss": 0.1439, "step": 1490 }, { "epoch": 0.8214676889375685, "grad_norm": 1.90123450756073, "learning_rate": 2e-05, "loss": 0.1712, "step": 1500 }, { "epoch": 0.8269441401971522, "grad_norm": 1.7746949195861816, "learning_rate": 2e-05, "loss": 0.1712, "step": 1510 }, { "epoch": 0.8324205914567361, "grad_norm": 2.588888645172119, "learning_rate": 2e-05, "loss": 0.189, "step": 1520 }, { "epoch": 0.8378970427163198, "grad_norm": 3.115365743637085, "learning_rate": 2e-05, "loss": 0.1595, "step": 1530 }, { "epoch": 0.8433734939759037, "grad_norm": 1.9716410636901855, "learning_rate": 2e-05, "loss": 0.1639, "step": 1540 }, { "epoch": 0.8488499452354874, "grad_norm": 1.8997513055801392, "learning_rate": 2e-05, "loss": 0.1843, "step": 1550 }, { "epoch": 0.8543263964950711, "grad_norm": 1.860077142715454, "learning_rate": 2e-05, "loss": 0.1748, "step": 1560 }, { "epoch": 0.859802847754655, "grad_norm": 2.047376871109009, "learning_rate": 2e-05, "loss": 0.1548, "step": 1570 }, { "epoch": 0.8652792990142387, "grad_norm": 2.242220401763916, "learning_rate": 2e-05, "loss": 0.1897, "step": 1580 }, { "epoch": 0.8707557502738226, "grad_norm": 1.3936264514923096, "learning_rate": 2e-05, "loss": 0.1529, "step": 1590 }, { "epoch": 0.8762322015334063, "grad_norm": 1.3506709337234497, "learning_rate": 2e-05, "loss": 0.1635, "step": 1600 }, { "epoch": 0.8817086527929902, "grad_norm": 2.024489641189575, "learning_rate": 2e-05, "loss": 0.1751, "step": 1610 }, { "epoch": 0.8871851040525739, "grad_norm": 3.0132129192352295, "learning_rate": 2e-05, "loss": 0.1447, "step": 1620 }, { "epoch": 0.8926615553121577, "grad_norm": 1.4840929508209229, "learning_rate": 2e-05, "loss": 0.1668, "step": 1630 }, { "epoch": 0.8981380065717415, "grad_norm": 5.782477378845215, "learning_rate": 2e-05, "loss": 0.186, "step": 1640 }, { "epoch": 0.9036144578313253, "grad_norm": 3.7930588722229004, "learning_rate": 2e-05, "loss": 0.1477, "step": 1650 }, { "epoch": 0.9090909090909091, "grad_norm": 1.4280755519866943, "learning_rate": 2e-05, "loss": 0.1733, "step": 1660 }, { "epoch": 0.9145673603504929, "grad_norm": 3.5071022510528564, "learning_rate": 2e-05, "loss": 0.1771, "step": 1670 }, { "epoch": 0.9200438116100766, "grad_norm": 1.890026330947876, "learning_rate": 2e-05, "loss": 0.1807, "step": 1680 }, { "epoch": 0.9255202628696605, "grad_norm": 4.8093647956848145, "learning_rate": 2e-05, "loss": 0.1783, "step": 1690 }, { "epoch": 0.9309967141292442, "grad_norm": 3.141622304916382, "learning_rate": 2e-05, "loss": 0.1425, "step": 1700 }, { "epoch": 0.9364731653888281, "grad_norm": 1.4867947101593018, "learning_rate": 2e-05, "loss": 0.1549, "step": 1710 }, { "epoch": 0.9419496166484118, "grad_norm": 2.396588087081909, "learning_rate": 2e-05, "loss": 0.1459, "step": 1720 }, { "epoch": 0.9474260679079957, "grad_norm": 2.241640329360962, "learning_rate": 2e-05, "loss": 0.2159, "step": 1730 }, { "epoch": 0.9529025191675794, "grad_norm": 2.0894391536712646, "learning_rate": 2e-05, "loss": 0.1824, "step": 1740 }, { "epoch": 0.9583789704271632, "grad_norm": 2.195657253265381, "learning_rate": 2e-05, "loss": 0.1773, "step": 1750 }, { "epoch": 0.963855421686747, "grad_norm": 1.9088704586029053, "learning_rate": 2e-05, "loss": 0.1671, "step": 1760 }, { "epoch": 0.9693318729463308, "grad_norm": 2.40929913520813, "learning_rate": 2e-05, "loss": 0.1483, "step": 1770 }, { "epoch": 0.9748083242059146, "grad_norm": 2.1379597187042236, "learning_rate": 2e-05, "loss": 0.1651, "step": 1780 }, { "epoch": 0.9802847754654983, "grad_norm": 1.5753893852233887, "learning_rate": 2e-05, "loss": 0.1521, "step": 1790 }, { "epoch": 0.9857612267250822, "grad_norm": 1.7690379619598389, "learning_rate": 2e-05, "loss": 0.1685, "step": 1800 }, { "epoch": 0.9912376779846659, "grad_norm": 3.3368117809295654, "learning_rate": 2e-05, "loss": 0.1808, "step": 1810 }, { "epoch": 0.9967141292442497, "grad_norm": 2.3334920406341553, "learning_rate": 2e-05, "loss": 0.1687, "step": 1820 }, { "epoch": 1.0, "eval_accuracy": 0.9350717996050485, "eval_f1": 0.8531654843973757, "eval_loss": 0.16189317405223846, "eval_precision": 0.7893026050251876, "eval_recall": 0.9282724884500407, "eval_runtime": 19.8282, "eval_samples_per_second": 311.929, "eval_steps_per_second": 19.518, "step": 1826 }, { "epoch": 1.0021905805038336, "grad_norm": 3.1540722846984863, "learning_rate": 2e-05, "loss": 0.1989, "step": 1830 }, { "epoch": 1.0076670317634173, "grad_norm": 2.7038586139678955, "learning_rate": 2e-05, "loss": 0.1663, "step": 1840 }, { "epoch": 1.013143483023001, "grad_norm": 2.185299873352051, "learning_rate": 2e-05, "loss": 0.1469, "step": 1850 }, { "epoch": 1.0186199342825848, "grad_norm": 4.436729431152344, "learning_rate": 2e-05, "loss": 0.1658, "step": 1860 }, { "epoch": 1.0240963855421688, "grad_norm": 2.3837592601776123, "learning_rate": 2e-05, "loss": 0.1563, "step": 1870 }, { "epoch": 1.0295728368017525, "grad_norm": 1.6888504028320312, "learning_rate": 2e-05, "loss": 0.1705, "step": 1880 }, { "epoch": 1.0350492880613362, "grad_norm": 1.7870920896530151, "learning_rate": 2e-05, "loss": 0.1754, "step": 1890 }, { "epoch": 1.04052573932092, "grad_norm": 2.037872314453125, "learning_rate": 2e-05, "loss": 0.134, "step": 1900 }, { "epoch": 1.046002190580504, "grad_norm": 1.956781029701233, "learning_rate": 2e-05, "loss": 0.1525, "step": 1910 }, { "epoch": 1.0514786418400877, "grad_norm": 3.9054486751556396, "learning_rate": 2e-05, "loss": 0.1405, "step": 1920 }, { "epoch": 1.0569550930996714, "grad_norm": 3.2423737049102783, "learning_rate": 2e-05, "loss": 0.1304, "step": 1930 }, { "epoch": 1.0624315443592551, "grad_norm": 1.7311038970947266, "learning_rate": 2e-05, "loss": 0.1288, "step": 1940 }, { "epoch": 1.067907995618839, "grad_norm": 3.4807159900665283, "learning_rate": 2e-05, "loss": 0.1723, "step": 1950 }, { "epoch": 1.0733844468784228, "grad_norm": 1.2659446001052856, "learning_rate": 2e-05, "loss": 0.1674, "step": 1960 }, { "epoch": 1.0788608981380066, "grad_norm": 2.4274561405181885, "learning_rate": 2e-05, "loss": 0.1634, "step": 1970 }, { "epoch": 1.0843373493975903, "grad_norm": 1.1292122602462769, "learning_rate": 2e-05, "loss": 0.1579, "step": 1980 }, { "epoch": 1.0898138006571743, "grad_norm": 4.752586364746094, "learning_rate": 2e-05, "loss": 0.1267, "step": 1990 }, { "epoch": 1.095290251916758, "grad_norm": 4.907914161682129, "learning_rate": 2e-05, "loss": 0.1444, "step": 2000 }, { "epoch": 1.1007667031763417, "grad_norm": 1.4939802885055542, "learning_rate": 2e-05, "loss": 0.1763, "step": 2010 }, { "epoch": 1.1062431544359255, "grad_norm": 4.959670066833496, "learning_rate": 2e-05, "loss": 0.1581, "step": 2020 }, { "epoch": 1.1117196056955092, "grad_norm": 1.6166772842407227, "learning_rate": 2e-05, "loss": 0.1379, "step": 2030 }, { "epoch": 1.1171960569550932, "grad_norm": 1.21837317943573, "learning_rate": 2e-05, "loss": 0.1259, "step": 2040 }, { "epoch": 1.122672508214677, "grad_norm": 2.209987163543701, "learning_rate": 2e-05, "loss": 0.134, "step": 2050 }, { "epoch": 1.1281489594742606, "grad_norm": 2.8201191425323486, "learning_rate": 2e-05, "loss": 0.1391, "step": 2060 }, { "epoch": 1.1336254107338444, "grad_norm": 2.6724655628204346, "learning_rate": 2e-05, "loss": 0.1731, "step": 2070 }, { "epoch": 1.1391018619934283, "grad_norm": 3.112408399581909, "learning_rate": 2e-05, "loss": 0.1419, "step": 2080 }, { "epoch": 1.144578313253012, "grad_norm": 6.945749759674072, "learning_rate": 2e-05, "loss": 0.1673, "step": 2090 }, { "epoch": 1.1500547645125958, "grad_norm": 4.609986782073975, "learning_rate": 2e-05, "loss": 0.1118, "step": 2100 }, { "epoch": 1.1555312157721795, "grad_norm": 1.9280059337615967, "learning_rate": 2e-05, "loss": 0.1697, "step": 2110 }, { "epoch": 1.1610076670317635, "grad_norm": 2.8931400775909424, "learning_rate": 2e-05, "loss": 0.1794, "step": 2120 }, { "epoch": 1.1664841182913472, "grad_norm": 1.505615234375, "learning_rate": 2e-05, "loss": 0.1438, "step": 2130 }, { "epoch": 1.171960569550931, "grad_norm": 1.2091026306152344, "learning_rate": 2e-05, "loss": 0.1409, "step": 2140 }, { "epoch": 1.1774370208105147, "grad_norm": 5.640398025512695, "learning_rate": 2e-05, "loss": 0.1374, "step": 2150 }, { "epoch": 1.1829134720700987, "grad_norm": 1.9069983959197998, "learning_rate": 2e-05, "loss": 0.1333, "step": 2160 }, { "epoch": 1.1883899233296824, "grad_norm": 2.034888982772827, "learning_rate": 2e-05, "loss": 0.185, "step": 2170 }, { "epoch": 1.1938663745892661, "grad_norm": 1.780856728553772, "learning_rate": 2e-05, "loss": 0.1463, "step": 2180 }, { "epoch": 1.1993428258488499, "grad_norm": 3.035339593887329, "learning_rate": 2e-05, "loss": 0.1662, "step": 2190 }, { "epoch": 1.2048192771084336, "grad_norm": 2.7439584732055664, "learning_rate": 2e-05, "loss": 0.1481, "step": 2200 }, { "epoch": 1.2102957283680176, "grad_norm": 4.901017189025879, "learning_rate": 2e-05, "loss": 0.1641, "step": 2210 }, { "epoch": 1.2157721796276013, "grad_norm": 2.227445125579834, "learning_rate": 2e-05, "loss": 0.1582, "step": 2220 }, { "epoch": 1.221248630887185, "grad_norm": 2.1216564178466797, "learning_rate": 2e-05, "loss": 0.1554, "step": 2230 }, { "epoch": 1.226725082146769, "grad_norm": 1.2567392587661743, "learning_rate": 2e-05, "loss": 0.1683, "step": 2240 }, { "epoch": 1.2322015334063527, "grad_norm": 1.426159381866455, "learning_rate": 2e-05, "loss": 0.17, "step": 2250 }, { "epoch": 1.2376779846659365, "grad_norm": 1.435729742050171, "learning_rate": 2e-05, "loss": 0.1404, "step": 2260 }, { "epoch": 1.2431544359255202, "grad_norm": 5.743936538696289, "learning_rate": 2e-05, "loss": 0.1665, "step": 2270 }, { "epoch": 1.248630887185104, "grad_norm": 1.7799255847930908, "learning_rate": 2e-05, "loss": 0.1538, "step": 2280 }, { "epoch": 1.254107338444688, "grad_norm": 2.466597318649292, "learning_rate": 2e-05, "loss": 0.1038, "step": 2290 }, { "epoch": 1.2595837897042717, "grad_norm": 2.635021686553955, "learning_rate": 2e-05, "loss": 0.1613, "step": 2300 }, { "epoch": 1.2650602409638554, "grad_norm": 2.1053247451782227, "learning_rate": 2e-05, "loss": 0.1169, "step": 2310 }, { "epoch": 1.2705366922234393, "grad_norm": 2.312171459197998, "learning_rate": 2e-05, "loss": 0.1574, "step": 2320 }, { "epoch": 1.276013143483023, "grad_norm": 4.142621994018555, "learning_rate": 2e-05, "loss": 0.1568, "step": 2330 }, { "epoch": 1.2814895947426068, "grad_norm": 3.278440237045288, "learning_rate": 2e-05, "loss": 0.1758, "step": 2340 }, { "epoch": 1.2869660460021906, "grad_norm": 2.5266401767730713, "learning_rate": 2e-05, "loss": 0.177, "step": 2350 }, { "epoch": 1.2924424972617743, "grad_norm": 2.4267191886901855, "learning_rate": 2e-05, "loss": 0.1485, "step": 2360 }, { "epoch": 1.297918948521358, "grad_norm": 1.5584640502929688, "learning_rate": 2e-05, "loss": 0.1583, "step": 2370 }, { "epoch": 1.303395399780942, "grad_norm": 2.881457805633545, "learning_rate": 2e-05, "loss": 0.1515, "step": 2380 }, { "epoch": 1.3088718510405257, "grad_norm": 4.441954612731934, "learning_rate": 2e-05, "loss": 0.1414, "step": 2390 }, { "epoch": 1.3143483023001095, "grad_norm": 5.456357479095459, "learning_rate": 2e-05, "loss": 0.1685, "step": 2400 }, { "epoch": 1.3198247535596934, "grad_norm": 2.449070930480957, "learning_rate": 2e-05, "loss": 0.1445, "step": 2410 }, { "epoch": 1.3253012048192772, "grad_norm": 2.4937679767608643, "learning_rate": 2e-05, "loss": 0.1528, "step": 2420 }, { "epoch": 1.330777656078861, "grad_norm": 1.7794448137283325, "learning_rate": 2e-05, "loss": 0.1601, "step": 2430 }, { "epoch": 1.3362541073384446, "grad_norm": 1.795912265777588, "learning_rate": 2e-05, "loss": 0.1542, "step": 2440 }, { "epoch": 1.3417305585980284, "grad_norm": 1.772538661956787, "learning_rate": 2e-05, "loss": 0.1297, "step": 2450 }, { "epoch": 1.3472070098576123, "grad_norm": 1.0752304792404175, "learning_rate": 2e-05, "loss": 0.13, "step": 2460 }, { "epoch": 1.352683461117196, "grad_norm": 2.1968908309936523, "learning_rate": 2e-05, "loss": 0.1373, "step": 2470 }, { "epoch": 1.3581599123767798, "grad_norm": 0.7487109303474426, "learning_rate": 2e-05, "loss": 0.1646, "step": 2480 }, { "epoch": 1.3636363636363638, "grad_norm": 2.1781516075134277, "learning_rate": 2e-05, "loss": 0.1852, "step": 2490 }, { "epoch": 1.3691128148959475, "grad_norm": 1.8818821907043457, "learning_rate": 2e-05, "loss": 0.1481, "step": 2500 }, { "epoch": 1.3745892661555312, "grad_norm": 2.2098746299743652, "learning_rate": 2e-05, "loss": 0.1823, "step": 2510 }, { "epoch": 1.380065717415115, "grad_norm": 1.5912271738052368, "learning_rate": 2e-05, "loss": 0.1968, "step": 2520 }, { "epoch": 1.3855421686746987, "grad_norm": 1.1806056499481201, "learning_rate": 2e-05, "loss": 0.1677, "step": 2530 }, { "epoch": 1.3910186199342827, "grad_norm": 1.9674164056777954, "learning_rate": 2e-05, "loss": 0.1273, "step": 2540 }, { "epoch": 1.3964950711938664, "grad_norm": 4.151760578155518, "learning_rate": 2e-05, "loss": 0.1658, "step": 2550 }, { "epoch": 1.4019715224534501, "grad_norm": 1.8803857564926147, "learning_rate": 2e-05, "loss": 0.1466, "step": 2560 }, { "epoch": 1.4074479737130339, "grad_norm": 2.625727891921997, "learning_rate": 2e-05, "loss": 0.1298, "step": 2570 }, { "epoch": 1.4129244249726178, "grad_norm": 2.6431047916412354, "learning_rate": 2e-05, "loss": 0.1503, "step": 2580 }, { "epoch": 1.4184008762322016, "grad_norm": 4.68942928314209, "learning_rate": 2e-05, "loss": 0.1375, "step": 2590 }, { "epoch": 1.4238773274917853, "grad_norm": 2.73363995552063, "learning_rate": 2e-05, "loss": 0.1712, "step": 2600 }, { "epoch": 1.429353778751369, "grad_norm": 3.2278857231140137, "learning_rate": 2e-05, "loss": 0.1425, "step": 2610 }, { "epoch": 1.4348302300109528, "grad_norm": 3.2857725620269775, "learning_rate": 2e-05, "loss": 0.1111, "step": 2620 }, { "epoch": 1.4403066812705367, "grad_norm": 1.6636910438537598, "learning_rate": 2e-05, "loss": 0.1231, "step": 2630 }, { "epoch": 1.4457831325301205, "grad_norm": 1.7655991315841675, "learning_rate": 2e-05, "loss": 0.1526, "step": 2640 }, { "epoch": 1.4512595837897042, "grad_norm": 2.4831273555755615, "learning_rate": 2e-05, "loss": 0.1626, "step": 2650 }, { "epoch": 1.4567360350492882, "grad_norm": 1.5845210552215576, "learning_rate": 2e-05, "loss": 0.1471, "step": 2660 }, { "epoch": 1.462212486308872, "grad_norm": 2.035768985748291, "learning_rate": 2e-05, "loss": 0.1353, "step": 2670 }, { "epoch": 1.4676889375684556, "grad_norm": 3.0364644527435303, "learning_rate": 2e-05, "loss": 0.1534, "step": 2680 }, { "epoch": 1.4731653888280394, "grad_norm": 1.0436877012252808, "learning_rate": 2e-05, "loss": 0.1384, "step": 2690 }, { "epoch": 1.4786418400876231, "grad_norm": 3.814385175704956, "learning_rate": 2e-05, "loss": 0.1571, "step": 2700 }, { "epoch": 1.484118291347207, "grad_norm": 4.043318748474121, "learning_rate": 2e-05, "loss": 0.1408, "step": 2710 }, { "epoch": 1.4895947426067908, "grad_norm": 2.101560354232788, "learning_rate": 2e-05, "loss": 0.1506, "step": 2720 }, { "epoch": 1.4950711938663745, "grad_norm": 3.871284008026123, "learning_rate": 2e-05, "loss": 0.1598, "step": 2730 }, { "epoch": 1.5005476451259585, "grad_norm": 1.0195356607437134, "learning_rate": 2e-05, "loss": 0.1625, "step": 2740 }, { "epoch": 1.5060240963855422, "grad_norm": 2.7019972801208496, "learning_rate": 2e-05, "loss": 0.1396, "step": 2750 }, { "epoch": 1.511500547645126, "grad_norm": 3.040086269378662, "learning_rate": 2e-05, "loss": 0.1503, "step": 2760 }, { "epoch": 1.5169769989047097, "grad_norm": 1.6536140441894531, "learning_rate": 2e-05, "loss": 0.1424, "step": 2770 }, { "epoch": 1.5224534501642935, "grad_norm": 2.9479269981384277, "learning_rate": 2e-05, "loss": 0.1537, "step": 2780 }, { "epoch": 1.5279299014238772, "grad_norm": 2.638228416442871, "learning_rate": 2e-05, "loss": 0.1517, "step": 2790 }, { "epoch": 1.5334063526834611, "grad_norm": 1.5154801607131958, "learning_rate": 2e-05, "loss": 0.1627, "step": 2800 }, { "epoch": 1.5388828039430449, "grad_norm": 4.037379264831543, "learning_rate": 2e-05, "loss": 0.1788, "step": 2810 }, { "epoch": 1.5443592552026288, "grad_norm": 3.5345592498779297, "learning_rate": 2e-05, "loss": 0.1768, "step": 2820 }, { "epoch": 1.5498357064622126, "grad_norm": 3.8549864292144775, "learning_rate": 2e-05, "loss": 0.1721, "step": 2830 }, { "epoch": 1.5553121577217963, "grad_norm": 3.5247507095336914, "learning_rate": 2e-05, "loss": 0.1305, "step": 2840 }, { "epoch": 1.56078860898138, "grad_norm": 2.387272834777832, "learning_rate": 2e-05, "loss": 0.1234, "step": 2850 }, { "epoch": 1.5662650602409638, "grad_norm": 3.007579803466797, "learning_rate": 2e-05, "loss": 0.152, "step": 2860 }, { "epoch": 1.5717415115005475, "grad_norm": 1.0041784048080444, "learning_rate": 2e-05, "loss": 0.1489, "step": 2870 }, { "epoch": 1.5772179627601315, "grad_norm": 3.3091013431549072, "learning_rate": 2e-05, "loss": 0.14, "step": 2880 }, { "epoch": 1.5826944140197152, "grad_norm": 1.844616174697876, "learning_rate": 2e-05, "loss": 0.1721, "step": 2890 }, { "epoch": 1.588170865279299, "grad_norm": 3.9923973083496094, "learning_rate": 2e-05, "loss": 0.1546, "step": 2900 }, { "epoch": 1.593647316538883, "grad_norm": 2.3511135578155518, "learning_rate": 2e-05, "loss": 0.1477, "step": 2910 }, { "epoch": 1.5991237677984667, "grad_norm": 2.524749994277954, "learning_rate": 2e-05, "loss": 0.1613, "step": 2920 }, { "epoch": 1.6046002190580504, "grad_norm": 1.5530831813812256, "learning_rate": 2e-05, "loss": 0.1445, "step": 2930 }, { "epoch": 1.6100766703176341, "grad_norm": 1.8088948726654053, "learning_rate": 2e-05, "loss": 0.1446, "step": 2940 }, { "epoch": 1.6155531215772179, "grad_norm": 1.5274639129638672, "learning_rate": 2e-05, "loss": 0.1453, "step": 2950 }, { "epoch": 1.6210295728368016, "grad_norm": 2.369565963745117, "learning_rate": 2e-05, "loss": 0.1487, "step": 2960 }, { "epoch": 1.6265060240963856, "grad_norm": 2.4283454418182373, "learning_rate": 2e-05, "loss": 0.1522, "step": 2970 }, { "epoch": 1.6319824753559693, "grad_norm": 4.117255687713623, "learning_rate": 2e-05, "loss": 0.1523, "step": 2980 }, { "epoch": 1.6374589266155533, "grad_norm": 2.1403403282165527, "learning_rate": 2e-05, "loss": 0.1558, "step": 2990 }, { "epoch": 1.642935377875137, "grad_norm": 3.7226603031158447, "learning_rate": 2e-05, "loss": 0.1635, "step": 3000 }, { "epoch": 1.6484118291347207, "grad_norm": 3.3474371433258057, "learning_rate": 2e-05, "loss": 0.1543, "step": 3010 }, { "epoch": 1.6538882803943045, "grad_norm": 2.174217700958252, "learning_rate": 2e-05, "loss": 0.1474, "step": 3020 }, { "epoch": 1.6593647316538882, "grad_norm": 1.7523736953735352, "learning_rate": 2e-05, "loss": 0.1487, "step": 3030 }, { "epoch": 1.664841182913472, "grad_norm": 2.573213577270508, "learning_rate": 2e-05, "loss": 0.1566, "step": 3040 }, { "epoch": 1.670317634173056, "grad_norm": 1.8312263488769531, "learning_rate": 2e-05, "loss": 0.1524, "step": 3050 }, { "epoch": 1.6757940854326396, "grad_norm": 1.8972638845443726, "learning_rate": 2e-05, "loss": 0.1129, "step": 3060 }, { "epoch": 1.6812705366922236, "grad_norm": 2.2399697303771973, "learning_rate": 2e-05, "loss": 0.1569, "step": 3070 }, { "epoch": 1.6867469879518073, "grad_norm": 2.9116086959838867, "learning_rate": 2e-05, "loss": 0.1531, "step": 3080 }, { "epoch": 1.692223439211391, "grad_norm": 2.098607063293457, "learning_rate": 2e-05, "loss": 0.1378, "step": 3090 }, { "epoch": 1.6976998904709748, "grad_norm": 1.720107913017273, "learning_rate": 2e-05, "loss": 0.1554, "step": 3100 }, { "epoch": 1.7031763417305585, "grad_norm": 2.0600640773773193, "learning_rate": 2e-05, "loss": 0.1541, "step": 3110 }, { "epoch": 1.7086527929901423, "grad_norm": 2.0780065059661865, "learning_rate": 2e-05, "loss": 0.1551, "step": 3120 }, { "epoch": 1.714129244249726, "grad_norm": 1.9723634719848633, "learning_rate": 2e-05, "loss": 0.1168, "step": 3130 }, { "epoch": 1.71960569550931, "grad_norm": 6.4908552169799805, "learning_rate": 2e-05, "loss": 0.1157, "step": 3140 }, { "epoch": 1.7250821467688937, "grad_norm": 2.1401596069335938, "learning_rate": 2e-05, "loss": 0.1419, "step": 3150 }, { "epoch": 1.7305585980284777, "grad_norm": 1.883585810661316, "learning_rate": 2e-05, "loss": 0.1428, "step": 3160 }, { "epoch": 1.7360350492880614, "grad_norm": 2.2904489040374756, "learning_rate": 2e-05, "loss": 0.1382, "step": 3170 }, { "epoch": 1.7415115005476451, "grad_norm": 2.3025336265563965, "learning_rate": 2e-05, "loss": 0.2024, "step": 3180 }, { "epoch": 1.7469879518072289, "grad_norm": 1.5613994598388672, "learning_rate": 2e-05, "loss": 0.1696, "step": 3190 }, { "epoch": 1.7524644030668126, "grad_norm": 1.7806004285812378, "learning_rate": 2e-05, "loss": 0.1474, "step": 3200 }, { "epoch": 1.7579408543263964, "grad_norm": 2.04266095161438, "learning_rate": 2e-05, "loss": 0.1537, "step": 3210 }, { "epoch": 1.7634173055859803, "grad_norm": 3.345473527908325, "learning_rate": 2e-05, "loss": 0.1411, "step": 3220 }, { "epoch": 1.768893756845564, "grad_norm": 2.1662192344665527, "learning_rate": 2e-05, "loss": 0.1536, "step": 3230 }, { "epoch": 1.774370208105148, "grad_norm": 1.1458584070205688, "learning_rate": 2e-05, "loss": 0.1642, "step": 3240 }, { "epoch": 1.7798466593647317, "grad_norm": 4.288283824920654, "learning_rate": 2e-05, "loss": 0.1419, "step": 3250 }, { "epoch": 1.7853231106243155, "grad_norm": 3.2075963020324707, "learning_rate": 2e-05, "loss": 0.1667, "step": 3260 }, { "epoch": 1.7907995618838992, "grad_norm": 2.8897817134857178, "learning_rate": 2e-05, "loss": 0.1646, "step": 3270 }, { "epoch": 1.796276013143483, "grad_norm": 2.2969679832458496, "learning_rate": 2e-05, "loss": 0.1573, "step": 3280 }, { "epoch": 1.8017524644030667, "grad_norm": 3.1827869415283203, "learning_rate": 2e-05, "loss": 0.1366, "step": 3290 }, { "epoch": 1.8072289156626506, "grad_norm": 3.3078675270080566, "learning_rate": 2e-05, "loss": 0.1342, "step": 3300 }, { "epoch": 1.8127053669222344, "grad_norm": 0.969814658164978, "learning_rate": 2e-05, "loss": 0.1314, "step": 3310 }, { "epoch": 1.8181818181818183, "grad_norm": 1.9750161170959473, "learning_rate": 2e-05, "loss": 0.1398, "step": 3320 }, { "epoch": 1.823658269441402, "grad_norm": 2.6312105655670166, "learning_rate": 2e-05, "loss": 0.1271, "step": 3330 }, { "epoch": 1.8291347207009858, "grad_norm": 5.169326305389404, "learning_rate": 2e-05, "loss": 0.136, "step": 3340 }, { "epoch": 1.8346111719605696, "grad_norm": 4.923961639404297, "learning_rate": 2e-05, "loss": 0.1516, "step": 3350 }, { "epoch": 1.8400876232201533, "grad_norm": 1.6556754112243652, "learning_rate": 2e-05, "loss": 0.1577, "step": 3360 }, { "epoch": 1.845564074479737, "grad_norm": 3.2922916412353516, "learning_rate": 2e-05, "loss": 0.1508, "step": 3370 }, { "epoch": 1.8510405257393208, "grad_norm": 1.2395728826522827, "learning_rate": 2e-05, "loss": 0.149, "step": 3380 }, { "epoch": 1.8565169769989047, "grad_norm": 1.776043176651001, "learning_rate": 2e-05, "loss": 0.1762, "step": 3390 }, { "epoch": 1.8619934282584885, "grad_norm": 3.395716667175293, "learning_rate": 2e-05, "loss": 0.1709, "step": 3400 }, { "epoch": 1.8674698795180724, "grad_norm": 3.3589627742767334, "learning_rate": 2e-05, "loss": 0.1943, "step": 3410 }, { "epoch": 1.8729463307776562, "grad_norm": 1.2186440229415894, "learning_rate": 2e-05, "loss": 0.1607, "step": 3420 }, { "epoch": 1.87842278203724, "grad_norm": 1.260779857635498, "learning_rate": 2e-05, "loss": 0.1522, "step": 3430 }, { "epoch": 1.8838992332968236, "grad_norm": 2.699249267578125, "learning_rate": 2e-05, "loss": 0.1493, "step": 3440 }, { "epoch": 1.8893756845564074, "grad_norm": 1.9771623611450195, "learning_rate": 2e-05, "loss": 0.1485, "step": 3450 }, { "epoch": 1.894852135815991, "grad_norm": 2.270580768585205, "learning_rate": 2e-05, "loss": 0.1534, "step": 3460 }, { "epoch": 1.900328587075575, "grad_norm": 1.3207887411117554, "learning_rate": 2e-05, "loss": 0.1383, "step": 3470 }, { "epoch": 1.9058050383351588, "grad_norm": 26.00341796875, "learning_rate": 2e-05, "loss": 0.1685, "step": 3480 }, { "epoch": 1.9112814895947428, "grad_norm": 2.4248104095458984, "learning_rate": 2e-05, "loss": 0.1252, "step": 3490 }, { "epoch": 1.9167579408543265, "grad_norm": 3.160520315170288, "learning_rate": 2e-05, "loss": 0.1452, "step": 3500 }, { "epoch": 1.9222343921139102, "grad_norm": 2.528468608856201, "learning_rate": 2e-05, "loss": 0.1168, "step": 3510 }, { "epoch": 1.927710843373494, "grad_norm": 1.9054774045944214, "learning_rate": 2e-05, "loss": 0.1375, "step": 3520 }, { "epoch": 1.9331872946330777, "grad_norm": 3.4692299365997314, "learning_rate": 2e-05, "loss": 0.1652, "step": 3530 }, { "epoch": 1.9386637458926614, "grad_norm": 1.626815915107727, "learning_rate": 2e-05, "loss": 0.1231, "step": 3540 }, { "epoch": 1.9441401971522454, "grad_norm": 3.7855207920074463, "learning_rate": 2e-05, "loss": 0.1492, "step": 3550 }, { "epoch": 1.9496166484118291, "grad_norm": 1.1849123239517212, "learning_rate": 2e-05, "loss": 0.1594, "step": 3560 }, { "epoch": 1.9550930996714129, "grad_norm": 2.7899911403656006, "learning_rate": 2e-05, "loss": 0.1601, "step": 3570 }, { "epoch": 1.9605695509309968, "grad_norm": 1.963122010231018, "learning_rate": 2e-05, "loss": 0.1301, "step": 3580 }, { "epoch": 1.9660460021905806, "grad_norm": 1.8025850057601929, "learning_rate": 2e-05, "loss": 0.156, "step": 3590 }, { "epoch": 1.9715224534501643, "grad_norm": 1.5995118618011475, "learning_rate": 2e-05, "loss": 0.1348, "step": 3600 }, { "epoch": 1.976998904709748, "grad_norm": 1.159638524055481, "learning_rate": 2e-05, "loss": 0.1288, "step": 3610 }, { "epoch": 1.9824753559693318, "grad_norm": 1.3912004232406616, "learning_rate": 2e-05, "loss": 0.1161, "step": 3620 }, { "epoch": 1.9879518072289155, "grad_norm": 1.0395070314407349, "learning_rate": 2e-05, "loss": 0.1386, "step": 3630 }, { "epoch": 1.9934282584884995, "grad_norm": 1.532216191291809, "learning_rate": 2e-05, "loss": 0.1213, "step": 3640 }, { "epoch": 1.9989047097480832, "grad_norm": 1.4489120244979858, "learning_rate": 2e-05, "loss": 0.1123, "step": 3650 }, { "epoch": 2.0, "eval_accuracy": 0.941111445007298, "eval_f1": 0.8677761381181066, "eval_loss": 0.15645764768123627, "eval_precision": 0.819916825171669, "eval_recall": 0.9215689826977082, "eval_runtime": 19.7981, "eval_samples_per_second": 312.404, "eval_steps_per_second": 19.547, "step": 3652 }, { "epoch": 2.004381161007667, "grad_norm": 1.69236159324646, "learning_rate": 2e-05, "loss": 0.1479, "step": 3660 }, { "epoch": 2.009857612267251, "grad_norm": 3.8225982189178467, "learning_rate": 2e-05, "loss": 0.1477, "step": 3670 }, { "epoch": 2.0153340635268346, "grad_norm": 4.383903980255127, "learning_rate": 2e-05, "loss": 0.1094, "step": 3680 }, { "epoch": 2.0208105147864184, "grad_norm": 3.1119155883789062, "learning_rate": 2e-05, "loss": 0.1602, "step": 3690 }, { "epoch": 2.026286966046002, "grad_norm": 3.3700242042541504, "learning_rate": 2e-05, "loss": 0.1303, "step": 3700 }, { "epoch": 2.031763417305586, "grad_norm": 1.0470126867294312, "learning_rate": 2e-05, "loss": 0.1174, "step": 3710 }, { "epoch": 2.0372398685651696, "grad_norm": 2.912874698638916, "learning_rate": 2e-05, "loss": 0.1336, "step": 3720 }, { "epoch": 2.0427163198247538, "grad_norm": 0.8620438575744629, "learning_rate": 2e-05, "loss": 0.1112, "step": 3730 }, { "epoch": 2.0481927710843375, "grad_norm": 2.3170716762542725, "learning_rate": 2e-05, "loss": 0.1208, "step": 3740 }, { "epoch": 2.0536692223439212, "grad_norm": 1.4915480613708496, "learning_rate": 2e-05, "loss": 0.136, "step": 3750 }, { "epoch": 2.059145673603505, "grad_norm": 1.7329208850860596, "learning_rate": 2e-05, "loss": 0.1437, "step": 3760 }, { "epoch": 2.0646221248630887, "grad_norm": 1.5879555940628052, "learning_rate": 2e-05, "loss": 0.1338, "step": 3770 }, { "epoch": 2.0700985761226725, "grad_norm": 0.7418123483657837, "learning_rate": 2e-05, "loss": 0.1013, "step": 3780 }, { "epoch": 2.075575027382256, "grad_norm": 1.0119812488555908, "learning_rate": 2e-05, "loss": 0.0841, "step": 3790 }, { "epoch": 2.08105147864184, "grad_norm": 1.383432149887085, "learning_rate": 2e-05, "loss": 0.1212, "step": 3800 }, { "epoch": 2.0865279299014237, "grad_norm": 2.614387273788452, "learning_rate": 2e-05, "loss": 0.1228, "step": 3810 }, { "epoch": 2.092004381161008, "grad_norm": 2.6762051582336426, "learning_rate": 2e-05, "loss": 0.1491, "step": 3820 }, { "epoch": 2.0974808324205916, "grad_norm": 3.3792619705200195, "learning_rate": 2e-05, "loss": 0.1161, "step": 3830 }, { "epoch": 2.1029572836801753, "grad_norm": 2.690113067626953, "learning_rate": 2e-05, "loss": 0.1093, "step": 3840 }, { "epoch": 2.108433734939759, "grad_norm": 1.5759937763214111, "learning_rate": 2e-05, "loss": 0.1406, "step": 3850 }, { "epoch": 2.113910186199343, "grad_norm": 1.4909275770187378, "learning_rate": 2e-05, "loss": 0.1108, "step": 3860 }, { "epoch": 2.1193866374589265, "grad_norm": 2.6127500534057617, "learning_rate": 2e-05, "loss": 0.1269, "step": 3870 }, { "epoch": 2.1248630887185103, "grad_norm": 2.5836493968963623, "learning_rate": 2e-05, "loss": 0.1396, "step": 3880 }, { "epoch": 2.130339539978094, "grad_norm": 1.385608434677124, "learning_rate": 2e-05, "loss": 0.127, "step": 3890 }, { "epoch": 2.135815991237678, "grad_norm": 3.3218297958374023, "learning_rate": 2e-05, "loss": 0.1056, "step": 3900 }, { "epoch": 2.141292442497262, "grad_norm": 1.8507598638534546, "learning_rate": 2e-05, "loss": 0.1453, "step": 3910 }, { "epoch": 2.1467688937568457, "grad_norm": 3.654327630996704, "learning_rate": 2e-05, "loss": 0.122, "step": 3920 }, { "epoch": 2.1522453450164294, "grad_norm": 3.592478036880493, "learning_rate": 2e-05, "loss": 0.1245, "step": 3930 }, { "epoch": 2.157721796276013, "grad_norm": 3.7161383628845215, "learning_rate": 2e-05, "loss": 0.1126, "step": 3940 }, { "epoch": 2.163198247535597, "grad_norm": 2.2989351749420166, "learning_rate": 2e-05, "loss": 0.0944, "step": 3950 }, { "epoch": 2.1686746987951806, "grad_norm": 2.9460718631744385, "learning_rate": 2e-05, "loss": 0.126, "step": 3960 }, { "epoch": 2.1741511500547643, "grad_norm": 3.1067349910736084, "learning_rate": 2e-05, "loss": 0.1436, "step": 3970 }, { "epoch": 2.1796276013143485, "grad_norm": 2.155015230178833, "learning_rate": 2e-05, "loss": 0.1033, "step": 3980 }, { "epoch": 2.1851040525739323, "grad_norm": 2.9963104724884033, "learning_rate": 2e-05, "loss": 0.1443, "step": 3990 }, { "epoch": 2.190580503833516, "grad_norm": 1.293370246887207, "learning_rate": 2e-05, "loss": 0.1093, "step": 4000 }, { "epoch": 2.1960569550930997, "grad_norm": 1.3873592615127563, "learning_rate": 2e-05, "loss": 0.1139, "step": 4010 }, { "epoch": 2.2015334063526835, "grad_norm": 1.8804830312728882, "learning_rate": 2e-05, "loss": 0.1554, "step": 4020 }, { "epoch": 2.207009857612267, "grad_norm": 4.313164710998535, "learning_rate": 2e-05, "loss": 0.1129, "step": 4030 }, { "epoch": 2.212486308871851, "grad_norm": 2.9426050186157227, "learning_rate": 2e-05, "loss": 0.1334, "step": 4040 }, { "epoch": 2.2179627601314347, "grad_norm": 2.560018539428711, "learning_rate": 2e-05, "loss": 0.1492, "step": 4050 }, { "epoch": 2.2234392113910184, "grad_norm": 1.6301517486572266, "learning_rate": 2e-05, "loss": 0.1308, "step": 4060 }, { "epoch": 2.2289156626506026, "grad_norm": 1.1607255935668945, "learning_rate": 2e-05, "loss": 0.1374, "step": 4070 }, { "epoch": 2.2343921139101863, "grad_norm": 4.422305107116699, "learning_rate": 2e-05, "loss": 0.1375, "step": 4080 }, { "epoch": 2.23986856516977, "grad_norm": 3.9398353099823, "learning_rate": 2e-05, "loss": 0.1526, "step": 4090 }, { "epoch": 2.245345016429354, "grad_norm": 4.186077117919922, "learning_rate": 2e-05, "loss": 0.1117, "step": 4100 }, { "epoch": 2.2508214676889375, "grad_norm": 3.083814859390259, "learning_rate": 2e-05, "loss": 0.1273, "step": 4110 }, { "epoch": 2.2562979189485213, "grad_norm": 1.9174625873565674, "learning_rate": 2e-05, "loss": 0.1036, "step": 4120 }, { "epoch": 2.261774370208105, "grad_norm": 1.3200234174728394, "learning_rate": 2e-05, "loss": 0.1246, "step": 4130 }, { "epoch": 2.2672508214676887, "grad_norm": 1.504086971282959, "learning_rate": 2e-05, "loss": 0.0944, "step": 4140 }, { "epoch": 2.2727272727272725, "grad_norm": 2.5579471588134766, "learning_rate": 2e-05, "loss": 0.1242, "step": 4150 }, { "epoch": 2.2782037239868567, "grad_norm": 2.304062843322754, "learning_rate": 2e-05, "loss": 0.1543, "step": 4160 }, { "epoch": 2.2836801752464404, "grad_norm": 1.507938265800476, "learning_rate": 2e-05, "loss": 0.1277, "step": 4170 }, { "epoch": 2.289156626506024, "grad_norm": 4.3036346435546875, "learning_rate": 2e-05, "loss": 0.1311, "step": 4180 }, { "epoch": 2.294633077765608, "grad_norm": 1.80647873878479, "learning_rate": 2e-05, "loss": 0.1403, "step": 4190 }, { "epoch": 2.3001095290251916, "grad_norm": 2.770962715148926, "learning_rate": 2e-05, "loss": 0.1521, "step": 4200 }, { "epoch": 2.3055859802847753, "grad_norm": 2.768677234649658, "learning_rate": 2e-05, "loss": 0.1314, "step": 4210 }, { "epoch": 2.311062431544359, "grad_norm": 1.7572500705718994, "learning_rate": 2e-05, "loss": 0.122, "step": 4220 }, { "epoch": 2.3165388828039433, "grad_norm": 1.1709873676300049, "learning_rate": 2e-05, "loss": 0.1236, "step": 4230 }, { "epoch": 2.322015334063527, "grad_norm": 17.13128089904785, "learning_rate": 2e-05, "loss": 0.1509, "step": 4240 }, { "epoch": 2.3274917853231107, "grad_norm": 2.660583019256592, "learning_rate": 2e-05, "loss": 0.1326, "step": 4250 }, { "epoch": 2.3329682365826945, "grad_norm": 1.643479347229004, "learning_rate": 2e-05, "loss": 0.1179, "step": 4260 }, { "epoch": 2.338444687842278, "grad_norm": 6.019737720489502, "learning_rate": 2e-05, "loss": 0.0953, "step": 4270 }, { "epoch": 2.343921139101862, "grad_norm": 1.986523985862732, "learning_rate": 2e-05, "loss": 0.1048, "step": 4280 }, { "epoch": 2.3493975903614457, "grad_norm": 1.796851634979248, "learning_rate": 2e-05, "loss": 0.115, "step": 4290 }, { "epoch": 2.3548740416210294, "grad_norm": 1.2359439134597778, "learning_rate": 2e-05, "loss": 0.1174, "step": 4300 }, { "epoch": 2.360350492880613, "grad_norm": 3.057445526123047, "learning_rate": 2e-05, "loss": 0.1458, "step": 4310 }, { "epoch": 2.3658269441401973, "grad_norm": 1.0161036252975464, "learning_rate": 2e-05, "loss": 0.1437, "step": 4320 }, { "epoch": 2.371303395399781, "grad_norm": 1.2098288536071777, "learning_rate": 2e-05, "loss": 0.1379, "step": 4330 }, { "epoch": 2.376779846659365, "grad_norm": 1.4055923223495483, "learning_rate": 2e-05, "loss": 0.1199, "step": 4340 }, { "epoch": 2.3822562979189486, "grad_norm": 2.134941816329956, "learning_rate": 2e-05, "loss": 0.1303, "step": 4350 }, { "epoch": 2.3877327491785323, "grad_norm": 2.351625680923462, "learning_rate": 2e-05, "loss": 0.1278, "step": 4360 }, { "epoch": 2.393209200438116, "grad_norm": 3.273850679397583, "learning_rate": 2e-05, "loss": 0.11, "step": 4370 }, { "epoch": 2.3986856516976998, "grad_norm": 2.0896518230438232, "learning_rate": 2e-05, "loss": 0.1193, "step": 4380 }, { "epoch": 2.4041621029572835, "grad_norm": 3.240591287612915, "learning_rate": 2e-05, "loss": 0.139, "step": 4390 }, { "epoch": 2.4096385542168672, "grad_norm": 4.579762935638428, "learning_rate": 2e-05, "loss": 0.1417, "step": 4400 }, { "epoch": 2.4151150054764514, "grad_norm": 2.048832654953003, "learning_rate": 2e-05, "loss": 0.1176, "step": 4410 }, { "epoch": 2.420591456736035, "grad_norm": 1.8488651514053345, "learning_rate": 2e-05, "loss": 0.1408, "step": 4420 }, { "epoch": 2.426067907995619, "grad_norm": 1.4034713506698608, "learning_rate": 2e-05, "loss": 0.1246, "step": 4430 }, { "epoch": 2.4315443592552026, "grad_norm": 1.0171767473220825, "learning_rate": 2e-05, "loss": 0.1118, "step": 4440 }, { "epoch": 2.4370208105147864, "grad_norm": 4.190380573272705, "learning_rate": 2e-05, "loss": 0.1527, "step": 4450 }, { "epoch": 2.44249726177437, "grad_norm": 3.9857051372528076, "learning_rate": 2e-05, "loss": 0.1153, "step": 4460 }, { "epoch": 2.447973713033954, "grad_norm": 1.4358816146850586, "learning_rate": 2e-05, "loss": 0.0983, "step": 4470 }, { "epoch": 2.453450164293538, "grad_norm": 1.8944737911224365, "learning_rate": 2e-05, "loss": 0.1382, "step": 4480 }, { "epoch": 2.4589266155531218, "grad_norm": 1.8662302494049072, "learning_rate": 2e-05, "loss": 0.1564, "step": 4490 }, { "epoch": 2.4644030668127055, "grad_norm": 1.050307035446167, "learning_rate": 2e-05, "loss": 0.1111, "step": 4500 }, { "epoch": 2.4698795180722892, "grad_norm": 1.3058151006698608, "learning_rate": 2e-05, "loss": 0.1526, "step": 4510 }, { "epoch": 2.475355969331873, "grad_norm": 2.779019355773926, "learning_rate": 2e-05, "loss": 0.1013, "step": 4520 }, { "epoch": 2.4808324205914567, "grad_norm": 3.649847984313965, "learning_rate": 2e-05, "loss": 0.1039, "step": 4530 }, { "epoch": 2.4863088718510404, "grad_norm": 1.4723719358444214, "learning_rate": 2e-05, "loss": 0.1304, "step": 4540 }, { "epoch": 2.491785323110624, "grad_norm": 2.51281476020813, "learning_rate": 2e-05, "loss": 0.1067, "step": 4550 }, { "epoch": 2.497261774370208, "grad_norm": 3.2945971488952637, "learning_rate": 2e-05, "loss": 0.1354, "step": 4560 }, { "epoch": 2.502738225629792, "grad_norm": 3.16933536529541, "learning_rate": 2e-05, "loss": 0.127, "step": 4570 }, { "epoch": 2.508214676889376, "grad_norm": 1.2082220315933228, "learning_rate": 2e-05, "loss": 0.1296, "step": 4580 }, { "epoch": 2.5136911281489596, "grad_norm": 4.029638767242432, "learning_rate": 2e-05, "loss": 0.1363, "step": 4590 }, { "epoch": 2.5191675794085433, "grad_norm": 1.5362796783447266, "learning_rate": 2e-05, "loss": 0.1275, "step": 4600 }, { "epoch": 2.524644030668127, "grad_norm": 2.8110194206237793, "learning_rate": 2e-05, "loss": 0.1401, "step": 4610 }, { "epoch": 2.5301204819277108, "grad_norm": 1.6804673671722412, "learning_rate": 2e-05, "loss": 0.1448, "step": 4620 }, { "epoch": 2.5355969331872945, "grad_norm": 7.145838260650635, "learning_rate": 2e-05, "loss": 0.1305, "step": 4630 }, { "epoch": 2.5410733844468787, "grad_norm": 2.6446447372436523, "learning_rate": 2e-05, "loss": 0.1242, "step": 4640 }, { "epoch": 2.546549835706462, "grad_norm": 1.742530107498169, "learning_rate": 2e-05, "loss": 0.1151, "step": 4650 }, { "epoch": 2.552026286966046, "grad_norm": 2.4224681854248047, "learning_rate": 2e-05, "loss": 0.1499, "step": 4660 }, { "epoch": 2.55750273822563, "grad_norm": 2.849701404571533, "learning_rate": 2e-05, "loss": 0.1365, "step": 4670 }, { "epoch": 2.5629791894852136, "grad_norm": 5.31744384765625, "learning_rate": 2e-05, "loss": 0.1308, "step": 4680 }, { "epoch": 2.5684556407447974, "grad_norm": 1.5912376642227173, "learning_rate": 2e-05, "loss": 0.1551, "step": 4690 }, { "epoch": 2.573932092004381, "grad_norm": 1.0725617408752441, "learning_rate": 2e-05, "loss": 0.1279, "step": 4700 }, { "epoch": 2.579408543263965, "grad_norm": 3.9630486965179443, "learning_rate": 2e-05, "loss": 0.1158, "step": 4710 }, { "epoch": 2.5848849945235486, "grad_norm": 1.6319750547409058, "learning_rate": 2e-05, "loss": 0.1576, "step": 4720 }, { "epoch": 2.5903614457831328, "grad_norm": 4.792392253875732, "learning_rate": 2e-05, "loss": 0.1598, "step": 4730 }, { "epoch": 2.595837897042716, "grad_norm": 2.365724563598633, "learning_rate": 2e-05, "loss": 0.1189, "step": 4740 }, { "epoch": 2.6013143483023002, "grad_norm": 4.436146259307861, "learning_rate": 2e-05, "loss": 0.1286, "step": 4750 }, { "epoch": 2.606790799561884, "grad_norm": 1.227371096611023, "learning_rate": 2e-05, "loss": 0.1465, "step": 4760 }, { "epoch": 2.6122672508214677, "grad_norm": 1.3392515182495117, "learning_rate": 2e-05, "loss": 0.1305, "step": 4770 }, { "epoch": 2.6177437020810514, "grad_norm": 2.5968856811523438, "learning_rate": 2e-05, "loss": 0.1089, "step": 4780 }, { "epoch": 2.623220153340635, "grad_norm": 2.4093873500823975, "learning_rate": 2e-05, "loss": 0.139, "step": 4790 }, { "epoch": 2.628696604600219, "grad_norm": 2.9262475967407227, "learning_rate": 2e-05, "loss": 0.1138, "step": 4800 }, { "epoch": 2.6341730558598027, "grad_norm": 1.900516152381897, "learning_rate": 2e-05, "loss": 0.0951, "step": 4810 }, { "epoch": 2.639649507119387, "grad_norm": 1.9894856214523315, "learning_rate": 2e-05, "loss": 0.184, "step": 4820 }, { "epoch": 2.6451259583789706, "grad_norm": 0.8725138306617737, "learning_rate": 2e-05, "loss": 0.1043, "step": 4830 }, { "epoch": 2.6506024096385543, "grad_norm": 2.6368794441223145, "learning_rate": 2e-05, "loss": 0.1496, "step": 4840 }, { "epoch": 2.656078860898138, "grad_norm": 2.7605247497558594, "learning_rate": 2e-05, "loss": 0.1188, "step": 4850 }, { "epoch": 2.661555312157722, "grad_norm": 1.7536276578903198, "learning_rate": 2e-05, "loss": 0.12, "step": 4860 }, { "epoch": 2.6670317634173055, "grad_norm": 2.9888434410095215, "learning_rate": 2e-05, "loss": 0.1251, "step": 4870 }, { "epoch": 2.6725082146768893, "grad_norm": 2.501553535461426, "learning_rate": 2e-05, "loss": 0.1317, "step": 4880 }, { "epoch": 2.6779846659364734, "grad_norm": 3.2755653858184814, "learning_rate": 2e-05, "loss": 0.1431, "step": 4890 }, { "epoch": 2.6834611171960567, "grad_norm": 1.1044738292694092, "learning_rate": 2e-05, "loss": 0.1177, "step": 4900 }, { "epoch": 2.688937568455641, "grad_norm": 2.04195237159729, "learning_rate": 2e-05, "loss": 0.1385, "step": 4910 }, { "epoch": 2.6944140197152247, "grad_norm": 1.8423049449920654, "learning_rate": 2e-05, "loss": 0.1524, "step": 4920 }, { "epoch": 2.6998904709748084, "grad_norm": 3.017038345336914, "learning_rate": 2e-05, "loss": 0.1309, "step": 4930 }, { "epoch": 2.705366922234392, "grad_norm": 2.5917625427246094, "learning_rate": 2e-05, "loss": 0.0968, "step": 4940 }, { "epoch": 2.710843373493976, "grad_norm": 2.059396266937256, "learning_rate": 2e-05, "loss": 0.136, "step": 4950 }, { "epoch": 2.7163198247535596, "grad_norm": 2.7622623443603516, "learning_rate": 2e-05, "loss": 0.1344, "step": 4960 }, { "epoch": 2.7217962760131433, "grad_norm": 2.050316333770752, "learning_rate": 2e-05, "loss": 0.139, "step": 4970 }, { "epoch": 2.7272727272727275, "grad_norm": 2.0974202156066895, "learning_rate": 2e-05, "loss": 0.133, "step": 4980 }, { "epoch": 2.732749178532311, "grad_norm": 4.3153533935546875, "learning_rate": 2e-05, "loss": 0.1513, "step": 4990 }, { "epoch": 2.738225629791895, "grad_norm": 1.2816858291625977, "learning_rate": 2e-05, "loss": 0.1581, "step": 5000 }, { "epoch": 2.7437020810514787, "grad_norm": 3.44547176361084, "learning_rate": 2e-05, "loss": 0.1349, "step": 5010 }, { "epoch": 2.7491785323110625, "grad_norm": 1.2258213758468628, "learning_rate": 2e-05, "loss": 0.1412, "step": 5020 }, { "epoch": 2.754654983570646, "grad_norm": 2.301241159439087, "learning_rate": 2e-05, "loss": 0.1013, "step": 5030 }, { "epoch": 2.76013143483023, "grad_norm": 2.467775821685791, "learning_rate": 2e-05, "loss": 0.1501, "step": 5040 }, { "epoch": 2.7656078860898137, "grad_norm": 7.51368522644043, "learning_rate": 2e-05, "loss": 0.1378, "step": 5050 }, { "epoch": 2.7710843373493974, "grad_norm": 4.798404693603516, "learning_rate": 2e-05, "loss": 0.1305, "step": 5060 }, { "epoch": 2.7765607886089816, "grad_norm": 1.396974802017212, "learning_rate": 2e-05, "loss": 0.1043, "step": 5070 }, { "epoch": 2.7820372398685653, "grad_norm": 1.14879310131073, "learning_rate": 2e-05, "loss": 0.12, "step": 5080 }, { "epoch": 2.787513691128149, "grad_norm": 1.2579361200332642, "learning_rate": 2e-05, "loss": 0.1501, "step": 5090 }, { "epoch": 2.792990142387733, "grad_norm": 2.2348382472991943, "learning_rate": 2e-05, "loss": 0.157, "step": 5100 }, { "epoch": 2.7984665936473165, "grad_norm": 1.5609731674194336, "learning_rate": 2e-05, "loss": 0.135, "step": 5110 }, { "epoch": 2.8039430449069003, "grad_norm": 2.0120913982391357, "learning_rate": 2e-05, "loss": 0.1123, "step": 5120 }, { "epoch": 2.809419496166484, "grad_norm": 2.593989610671997, "learning_rate": 2e-05, "loss": 0.1142, "step": 5130 }, { "epoch": 2.8148959474260677, "grad_norm": 1.574157953262329, "learning_rate": 2e-05, "loss": 0.1701, "step": 5140 }, { "epoch": 2.8203723986856515, "grad_norm": 3.661393642425537, "learning_rate": 2e-05, "loss": 0.1311, "step": 5150 }, { "epoch": 2.8258488499452357, "grad_norm": 0.9907870888710022, "learning_rate": 2e-05, "loss": 0.1421, "step": 5160 }, { "epoch": 2.8313253012048194, "grad_norm": 2.06791615486145, "learning_rate": 2e-05, "loss": 0.1353, "step": 5170 }, { "epoch": 2.836801752464403, "grad_norm": 3.9285051822662354, "learning_rate": 2e-05, "loss": 0.1373, "step": 5180 }, { "epoch": 2.842278203723987, "grad_norm": 1.7791670560836792, "learning_rate": 2e-05, "loss": 0.1018, "step": 5190 }, { "epoch": 2.8477546549835706, "grad_norm": 1.8296700716018677, "learning_rate": 2e-05, "loss": 0.1247, "step": 5200 }, { "epoch": 2.8532311062431543, "grad_norm": 4.70639181137085, "learning_rate": 2e-05, "loss": 0.1314, "step": 5210 }, { "epoch": 2.858707557502738, "grad_norm": 2.3807830810546875, "learning_rate": 2e-05, "loss": 0.1356, "step": 5220 }, { "epoch": 2.8641840087623223, "grad_norm": 1.6014913320541382, "learning_rate": 2e-05, "loss": 0.1146, "step": 5230 }, { "epoch": 2.8696604600219056, "grad_norm": 1.8272550106048584, "learning_rate": 2e-05, "loss": 0.1183, "step": 5240 }, { "epoch": 2.8751369112814897, "grad_norm": 1.299852728843689, "learning_rate": 2e-05, "loss": 0.1405, "step": 5250 }, { "epoch": 2.8806133625410735, "grad_norm": 1.1718955039978027, "learning_rate": 2e-05, "loss": 0.1461, "step": 5260 }, { "epoch": 2.886089813800657, "grad_norm": 5.982026100158691, "learning_rate": 2e-05, "loss": 0.1539, "step": 5270 }, { "epoch": 2.891566265060241, "grad_norm": 3.6748552322387695, "learning_rate": 2e-05, "loss": 0.1164, "step": 5280 }, { "epoch": 2.8970427163198247, "grad_norm": 2.9227006435394287, "learning_rate": 2e-05, "loss": 0.1557, "step": 5290 }, { "epoch": 2.9025191675794084, "grad_norm": 1.4735232591629028, "learning_rate": 2e-05, "loss": 0.1168, "step": 5300 }, { "epoch": 2.907995618838992, "grad_norm": 3.212067127227783, "learning_rate": 2e-05, "loss": 0.1489, "step": 5310 }, { "epoch": 2.9134720700985763, "grad_norm": 1.4650025367736816, "learning_rate": 2e-05, "loss": 0.1131, "step": 5320 }, { "epoch": 2.91894852135816, "grad_norm": 4.302365779876709, "learning_rate": 2e-05, "loss": 0.1705, "step": 5330 }, { "epoch": 2.924424972617744, "grad_norm": 2.2288858890533447, "learning_rate": 2e-05, "loss": 0.1485, "step": 5340 }, { "epoch": 2.9299014238773275, "grad_norm": 1.019721508026123, "learning_rate": 2e-05, "loss": 0.1331, "step": 5350 }, { "epoch": 2.9353778751369113, "grad_norm": 2.5705177783966064, "learning_rate": 2e-05, "loss": 0.122, "step": 5360 }, { "epoch": 2.940854326396495, "grad_norm": 1.639599084854126, "learning_rate": 2e-05, "loss": 0.1218, "step": 5370 }, { "epoch": 2.9463307776560788, "grad_norm": 3.7570650577545166, "learning_rate": 2e-05, "loss": 0.1491, "step": 5380 }, { "epoch": 2.9518072289156625, "grad_norm": 3.499650716781616, "learning_rate": 2e-05, "loss": 0.1335, "step": 5390 }, { "epoch": 2.9572836801752462, "grad_norm": 2.9174211025238037, "learning_rate": 2e-05, "loss": 0.1468, "step": 5400 }, { "epoch": 2.9627601314348304, "grad_norm": 3.045962333679199, "learning_rate": 2e-05, "loss": 0.1283, "step": 5410 }, { "epoch": 2.968236582694414, "grad_norm": 2.5407910346984863, "learning_rate": 2e-05, "loss": 0.1171, "step": 5420 }, { "epoch": 2.973713033953998, "grad_norm": 2.07804274559021, "learning_rate": 2e-05, "loss": 0.1127, "step": 5430 }, { "epoch": 2.9791894852135816, "grad_norm": 2.2112016677856445, "learning_rate": 2e-05, "loss": 0.1249, "step": 5440 }, { "epoch": 2.9846659364731654, "grad_norm": 4.099008083343506, "learning_rate": 2e-05, "loss": 0.141, "step": 5450 }, { "epoch": 2.990142387732749, "grad_norm": 1.716291069984436, "learning_rate": 2e-05, "loss": 0.1102, "step": 5460 }, { "epoch": 2.995618838992333, "grad_norm": 3.4466896057128906, "learning_rate": 2e-05, "loss": 0.1283, "step": 5470 } ], "logging_steps": 10, "max_steps": 5478, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7443682033477008.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }