| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 5478, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00547645125958379, | |
| "grad_norm": 5.172877788543701, | |
| "learning_rate": 2e-05, | |
| "loss": 0.9408, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01095290251916758, | |
| "grad_norm": 3.29821515083313, | |
| "learning_rate": 2e-05, | |
| "loss": 0.4667, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01642935377875137, | |
| "grad_norm": 8.682180404663086, | |
| "learning_rate": 2e-05, | |
| "loss": 0.4354, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.02190580503833516, | |
| "grad_norm": 2.988210439682007, | |
| "learning_rate": 2e-05, | |
| "loss": 0.3187, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.027382256297918947, | |
| "grad_norm": 2.8658511638641357, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2634, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03285870755750274, | |
| "grad_norm": 4.159895896911621, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2714, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.038335158817086525, | |
| "grad_norm": 4.234712600708008, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2401, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.04381161007667032, | |
| "grad_norm": 4.615501403808594, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2764, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.04928806133625411, | |
| "grad_norm": 2.7115259170532227, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2515, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.054764512595837894, | |
| "grad_norm": 4.8792500495910645, | |
| "learning_rate": 2e-05, | |
| "loss": 0.248, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.060240963855421686, | |
| "grad_norm": 3.3276002407073975, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2377, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.06571741511500548, | |
| "grad_norm": 3.494762897491455, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2317, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.07119386637458927, | |
| "grad_norm": 3.085296392440796, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2164, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.07667031763417305, | |
| "grad_norm": 2.1236257553100586, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2612, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.08214676889375684, | |
| "grad_norm": 2.76648211479187, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2113, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.08762322015334063, | |
| "grad_norm": 5.641284942626953, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2361, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.09309967141292443, | |
| "grad_norm": 2.8492302894592285, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2273, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.09857612267250822, | |
| "grad_norm": 4.214968204498291, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2411, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.10405257393209201, | |
| "grad_norm": 4.820573329925537, | |
| "learning_rate": 2e-05, | |
| "loss": 0.182, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.10952902519167579, | |
| "grad_norm": 2.3844950199127197, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2221, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11500547645125958, | |
| "grad_norm": 3.7508792877197266, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2787, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.12048192771084337, | |
| "grad_norm": 3.6480345726013184, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2308, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.12595837897042717, | |
| "grad_norm": 3.0960395336151123, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2141, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.13143483023001096, | |
| "grad_norm": 2.7262496948242188, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2295, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.13691128148959475, | |
| "grad_norm": 3.7271182537078857, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1957, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.14238773274917854, | |
| "grad_norm": 3.30234432220459, | |
| "learning_rate": 2e-05, | |
| "loss": 0.233, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.14786418400876233, | |
| "grad_norm": 2.1024515628814697, | |
| "learning_rate": 2e-05, | |
| "loss": 0.205, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.1533406352683461, | |
| "grad_norm": 3.121746063232422, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1806, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1588170865279299, | |
| "grad_norm": 5.09675407409668, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2143, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.16429353778751368, | |
| "grad_norm": 2.1228365898132324, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1972, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.16976998904709747, | |
| "grad_norm": 5.942009925842285, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2367, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.17524644030668127, | |
| "grad_norm": 3.828641653060913, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2219, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.18072289156626506, | |
| "grad_norm": 3.5164341926574707, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2209, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.18619934282584885, | |
| "grad_norm": 3.276562452316284, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2066, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.19167579408543264, | |
| "grad_norm": 2.173800468444824, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1824, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.19715224534501644, | |
| "grad_norm": 4.548515319824219, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2085, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.20262869660460023, | |
| "grad_norm": 5.658101558685303, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2021, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.20810514786418402, | |
| "grad_norm": 2.9604289531707764, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2139, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.21358159912376778, | |
| "grad_norm": 1.595267653465271, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2441, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.21905805038335158, | |
| "grad_norm": 2.9476821422576904, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2221, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.22453450164293537, | |
| "grad_norm": 6.530129909515381, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2275, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.23001095290251916, | |
| "grad_norm": 2.306079626083374, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1799, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.23548740416210295, | |
| "grad_norm": 3.0433263778686523, | |
| "learning_rate": 2e-05, | |
| "loss": 0.185, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.24096385542168675, | |
| "grad_norm": 3.235048770904541, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2363, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.24644030668127054, | |
| "grad_norm": 2.9125173091888428, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2158, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.25191675794085433, | |
| "grad_norm": 1.9198905229568481, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2514, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.2573932092004381, | |
| "grad_norm": 4.257998943328857, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2283, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.2628696604600219, | |
| "grad_norm": 1.3411542177200317, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1712, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.2683461117196057, | |
| "grad_norm": 2.114000082015991, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2022, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2738225629791895, | |
| "grad_norm": 2.4874045848846436, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1744, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2792990142387733, | |
| "grad_norm": 3.700068950653076, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2078, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.2847754654983571, | |
| "grad_norm": 3.8135080337524414, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1884, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.2902519167579409, | |
| "grad_norm": 2.2092432975769043, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1652, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.29572836801752467, | |
| "grad_norm": 2.4670491218566895, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2201, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.30120481927710846, | |
| "grad_norm": 2.9962000846862793, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1776, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.3066812705366922, | |
| "grad_norm": 2.7531886100769043, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1997, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.312157721796276, | |
| "grad_norm": 3.9697697162628174, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1689, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3176341730558598, | |
| "grad_norm": 2.352689743041992, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2196, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.3231106243154436, | |
| "grad_norm": 2.1512277126312256, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2266, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.32858707557502737, | |
| "grad_norm": 3.3206424713134766, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1837, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.33406352683461116, | |
| "grad_norm": 2.5991427898406982, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1642, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.33953997809419495, | |
| "grad_norm": 4.06941032409668, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1954, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.34501642935377874, | |
| "grad_norm": 2.006395101547241, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1644, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.35049288061336253, | |
| "grad_norm": 2.333529233932495, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1766, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.3559693318729463, | |
| "grad_norm": 1.8663638830184937, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1427, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3614457831325301, | |
| "grad_norm": 1.4063774347305298, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1886, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.3669222343921139, | |
| "grad_norm": 2.3204855918884277, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1794, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.3723986856516977, | |
| "grad_norm": 2.0688014030456543, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1788, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.3778751369112815, | |
| "grad_norm": 2.9626317024230957, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1931, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.3833515881708653, | |
| "grad_norm": 2.1686134338378906, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1797, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.3888280394304491, | |
| "grad_norm": 3.220993995666504, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2001, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.39430449069003287, | |
| "grad_norm": 4.387556076049805, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1679, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.39978094194961666, | |
| "grad_norm": 6.711359024047852, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1906, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.40525739320920046, | |
| "grad_norm": 2.8974344730377197, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1842, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.41073384446878425, | |
| "grad_norm": 2.0242817401885986, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1874, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.41621029572836804, | |
| "grad_norm": 2.71280574798584, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1226, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.42168674698795183, | |
| "grad_norm": 2.318493604660034, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2111, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.42716319824753557, | |
| "grad_norm": 4.267580509185791, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2008, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.43263964950711936, | |
| "grad_norm": 2.4939076900482178, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2053, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.43811610076670315, | |
| "grad_norm": 1.9013803005218506, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1724, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.44359255202628695, | |
| "grad_norm": 3.4086337089538574, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1688, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.44906900328587074, | |
| "grad_norm": 4.253098011016846, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2092, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.45454545454545453, | |
| "grad_norm": 1.7755553722381592, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1812, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.4600219058050383, | |
| "grad_norm": 1.562054991722107, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2296, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.4654983570646221, | |
| "grad_norm": 1.4678446054458618, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2006, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.4709748083242059, | |
| "grad_norm": 2.852142810821533, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1763, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.4764512595837897, | |
| "grad_norm": 3.451470136642456, | |
| "learning_rate": 2e-05, | |
| "loss": 0.198, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.4819277108433735, | |
| "grad_norm": 4.738248348236084, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1803, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.4874041621029573, | |
| "grad_norm": 2.169006824493408, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1711, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.4928806133625411, | |
| "grad_norm": 6.985939979553223, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1828, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.49835706462212487, | |
| "grad_norm": 3.6901047229766846, | |
| "learning_rate": 2e-05, | |
| "loss": 0.227, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.5038335158817087, | |
| "grad_norm": 4.057900428771973, | |
| "learning_rate": 2e-05, | |
| "loss": 0.181, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.5093099671412924, | |
| "grad_norm": 2.3261914253234863, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2152, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.5147864184008762, | |
| "grad_norm": 2.459638833999634, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2261, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.52026286966046, | |
| "grad_norm": 1.9431893825531006, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1555, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.5257393209200438, | |
| "grad_norm": 2.213655710220337, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1966, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.5312157721796276, | |
| "grad_norm": 3.0963807106018066, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1672, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.5366922234392114, | |
| "grad_norm": 1.3352348804473877, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1688, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.5421686746987951, | |
| "grad_norm": 4.130510330200195, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2033, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.547645125958379, | |
| "grad_norm": 1.72730553150177, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1883, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5531215772179627, | |
| "grad_norm": 2.0997588634490967, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2085, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.5585980284775466, | |
| "grad_norm": 1.9637783765792847, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1787, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.5640744797371303, | |
| "grad_norm": 5.923566818237305, | |
| "learning_rate": 2e-05, | |
| "loss": 0.179, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.5695509309967142, | |
| "grad_norm": 2.0748746395111084, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1937, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.5750273822562979, | |
| "grad_norm": 3.298703193664551, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1661, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.5805038335158818, | |
| "grad_norm": 2.482518196105957, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1767, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.5859802847754655, | |
| "grad_norm": 1.430920124053955, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1592, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.5914567360350493, | |
| "grad_norm": 2.223555326461792, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1623, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.5969331872946331, | |
| "grad_norm": 3.7139480113983154, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2036, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.6024096385542169, | |
| "grad_norm": 2.7747836112976074, | |
| "learning_rate": 2e-05, | |
| "loss": 0.159, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.6078860898138007, | |
| "grad_norm": 1.7586026191711426, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1994, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.6133625410733844, | |
| "grad_norm": 1.7416315078735352, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1974, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.6188389923329682, | |
| "grad_norm": 2.214825391769409, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1584, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.624315443592552, | |
| "grad_norm": 4.937350273132324, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1751, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.6297918948521358, | |
| "grad_norm": 2.918086290359497, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1872, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.6352683461117196, | |
| "grad_norm": 2.486037254333496, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1769, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.6407447973713034, | |
| "grad_norm": 1.8280752897262573, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1948, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.6462212486308871, | |
| "grad_norm": 2.8489694595336914, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1598, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.651697699890471, | |
| "grad_norm": 5.198742389678955, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1503, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.6571741511500547, | |
| "grad_norm": 4.022942066192627, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1242, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.6626506024096386, | |
| "grad_norm": 1.6303025484085083, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1753, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.6681270536692223, | |
| "grad_norm": 2.142221450805664, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1933, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.6736035049288062, | |
| "grad_norm": 1.7236963510513306, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1632, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.6790799561883899, | |
| "grad_norm": 6.54170560836792, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1745, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.6845564074479737, | |
| "grad_norm": 3.051344394683838, | |
| "learning_rate": 2e-05, | |
| "loss": 0.161, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.6900328587075575, | |
| "grad_norm": 4.368143558502197, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1686, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.6955093099671413, | |
| "grad_norm": 2.810403347015381, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1341, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.7009857612267251, | |
| "grad_norm": 3.573010206222534, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1888, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.7064622124863089, | |
| "grad_norm": 3.028301954269409, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1753, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.7119386637458927, | |
| "grad_norm": 3.8532004356384277, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1733, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7174151150054765, | |
| "grad_norm": 2.062229633331299, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1533, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.7228915662650602, | |
| "grad_norm": 3.278475284576416, | |
| "learning_rate": 2e-05, | |
| "loss": 0.157, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.7283680175246441, | |
| "grad_norm": 2.650078058242798, | |
| "learning_rate": 2e-05, | |
| "loss": 0.168, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.7338444687842278, | |
| "grad_norm": 3.017892360687256, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2381, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.7393209200438116, | |
| "grad_norm": 1.8926328420639038, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1815, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.7447973713033954, | |
| "grad_norm": 1.819949746131897, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1387, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.7502738225629791, | |
| "grad_norm": 1.5167309045791626, | |
| "learning_rate": 2e-05, | |
| "loss": 0.138, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.755750273822563, | |
| "grad_norm": 2.128439426422119, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1566, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.7612267250821467, | |
| "grad_norm": 1.9905339479446411, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1711, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.7667031763417306, | |
| "grad_norm": 1.624918818473816, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1614, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.7721796276013143, | |
| "grad_norm": 2.2797772884368896, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1839, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.7776560788608982, | |
| "grad_norm": 1.8958566188812256, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1609, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.7831325301204819, | |
| "grad_norm": 4.018071174621582, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2153, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.7886089813800657, | |
| "grad_norm": 2.038041353225708, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1576, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.7940854326396495, | |
| "grad_norm": 3.009593963623047, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1448, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.7995618838992333, | |
| "grad_norm": 2.0207109451293945, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1642, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.8050383351588171, | |
| "grad_norm": 1.9745655059814453, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1499, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.8105147864184009, | |
| "grad_norm": 2.5017263889312744, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1865, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.8159912376779846, | |
| "grad_norm": 3.3768310546875, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1439, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.8214676889375685, | |
| "grad_norm": 1.90123450756073, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1712, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8269441401971522, | |
| "grad_norm": 1.7746949195861816, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1712, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.8324205914567361, | |
| "grad_norm": 2.588888645172119, | |
| "learning_rate": 2e-05, | |
| "loss": 0.189, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.8378970427163198, | |
| "grad_norm": 3.115365743637085, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1595, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.8433734939759037, | |
| "grad_norm": 1.9716410636901855, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1639, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.8488499452354874, | |
| "grad_norm": 1.8997513055801392, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1843, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.8543263964950711, | |
| "grad_norm": 1.860077142715454, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1748, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.859802847754655, | |
| "grad_norm": 2.047376871109009, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1548, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.8652792990142387, | |
| "grad_norm": 2.242220401763916, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1897, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.8707557502738226, | |
| "grad_norm": 1.3936264514923096, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1529, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.8762322015334063, | |
| "grad_norm": 1.3506709337234497, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1635, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.8817086527929902, | |
| "grad_norm": 2.024489641189575, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1751, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.8871851040525739, | |
| "grad_norm": 3.0132129192352295, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1447, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.8926615553121577, | |
| "grad_norm": 1.4840929508209229, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1668, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.8981380065717415, | |
| "grad_norm": 5.782477378845215, | |
| "learning_rate": 2e-05, | |
| "loss": 0.186, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.9036144578313253, | |
| "grad_norm": 3.7930588722229004, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1477, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.9090909090909091, | |
| "grad_norm": 1.4280755519866943, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1733, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.9145673603504929, | |
| "grad_norm": 3.5071022510528564, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1771, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.9200438116100766, | |
| "grad_norm": 1.890026330947876, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1807, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.9255202628696605, | |
| "grad_norm": 4.8093647956848145, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1783, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.9309967141292442, | |
| "grad_norm": 3.141622304916382, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1425, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.9364731653888281, | |
| "grad_norm": 1.4867947101593018, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1549, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.9419496166484118, | |
| "grad_norm": 2.396588087081909, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1459, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.9474260679079957, | |
| "grad_norm": 2.241640329360962, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2159, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.9529025191675794, | |
| "grad_norm": 2.0894391536712646, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1824, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.9583789704271632, | |
| "grad_norm": 2.195657253265381, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1773, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.963855421686747, | |
| "grad_norm": 1.9088704586029053, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1671, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.9693318729463308, | |
| "grad_norm": 2.40929913520813, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1483, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.9748083242059146, | |
| "grad_norm": 2.1379597187042236, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1651, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.9802847754654983, | |
| "grad_norm": 1.5753893852233887, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1521, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.9857612267250822, | |
| "grad_norm": 1.7690379619598389, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1685, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.9912376779846659, | |
| "grad_norm": 3.3368117809295654, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1808, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.9967141292442497, | |
| "grad_norm": 2.3334920406341553, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1687, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.9350717996050485, | |
| "eval_f1": 0.8531654843973757, | |
| "eval_loss": 0.16189317405223846, | |
| "eval_precision": 0.7893026050251876, | |
| "eval_recall": 0.9282724884500407, | |
| "eval_runtime": 19.8282, | |
| "eval_samples_per_second": 311.929, | |
| "eval_steps_per_second": 19.518, | |
| "step": 1826 | |
| }, | |
| { | |
| "epoch": 1.0021905805038336, | |
| "grad_norm": 3.1540722846984863, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1989, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.0076670317634173, | |
| "grad_norm": 2.7038586139678955, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1663, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.013143483023001, | |
| "grad_norm": 2.185299873352051, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1469, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.0186199342825848, | |
| "grad_norm": 4.436729431152344, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1658, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.0240963855421688, | |
| "grad_norm": 2.3837592601776123, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1563, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.0295728368017525, | |
| "grad_norm": 1.6888504028320312, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1705, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.0350492880613362, | |
| "grad_norm": 1.7870920896530151, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1754, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.04052573932092, | |
| "grad_norm": 2.037872314453125, | |
| "learning_rate": 2e-05, | |
| "loss": 0.134, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.046002190580504, | |
| "grad_norm": 1.956781029701233, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1525, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.0514786418400877, | |
| "grad_norm": 3.9054486751556396, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1405, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.0569550930996714, | |
| "grad_norm": 3.2423737049102783, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1304, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.0624315443592551, | |
| "grad_norm": 1.7311038970947266, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1288, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.067907995618839, | |
| "grad_norm": 3.4807159900665283, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1723, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.0733844468784228, | |
| "grad_norm": 1.2659446001052856, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1674, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.0788608981380066, | |
| "grad_norm": 2.4274561405181885, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1634, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.0843373493975903, | |
| "grad_norm": 1.1292122602462769, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1579, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.0898138006571743, | |
| "grad_norm": 4.752586364746094, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1267, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.095290251916758, | |
| "grad_norm": 4.907914161682129, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1444, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.1007667031763417, | |
| "grad_norm": 1.4939802885055542, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1763, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.1062431544359255, | |
| "grad_norm": 4.959670066833496, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1581, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.1117196056955092, | |
| "grad_norm": 1.6166772842407227, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1379, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.1171960569550932, | |
| "grad_norm": 1.21837317943573, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1259, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.122672508214677, | |
| "grad_norm": 2.209987163543701, | |
| "learning_rate": 2e-05, | |
| "loss": 0.134, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.1281489594742606, | |
| "grad_norm": 2.8201191425323486, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1391, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.1336254107338444, | |
| "grad_norm": 2.6724655628204346, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1731, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.1391018619934283, | |
| "grad_norm": 3.112408399581909, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1419, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.144578313253012, | |
| "grad_norm": 6.945749759674072, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1673, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.1500547645125958, | |
| "grad_norm": 4.609986782073975, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1118, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.1555312157721795, | |
| "grad_norm": 1.9280059337615967, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1697, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.1610076670317635, | |
| "grad_norm": 2.8931400775909424, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1794, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.1664841182913472, | |
| "grad_norm": 1.505615234375, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1438, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.171960569550931, | |
| "grad_norm": 1.2091026306152344, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1409, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.1774370208105147, | |
| "grad_norm": 5.640398025512695, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1374, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.1829134720700987, | |
| "grad_norm": 1.9069983959197998, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1333, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.1883899233296824, | |
| "grad_norm": 2.034888982772827, | |
| "learning_rate": 2e-05, | |
| "loss": 0.185, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.1938663745892661, | |
| "grad_norm": 1.780856728553772, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1463, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.1993428258488499, | |
| "grad_norm": 3.035339593887329, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1662, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.2048192771084336, | |
| "grad_norm": 2.7439584732055664, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1481, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.2102957283680176, | |
| "grad_norm": 4.901017189025879, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1641, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.2157721796276013, | |
| "grad_norm": 2.227445125579834, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1582, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.221248630887185, | |
| "grad_norm": 2.1216564178466797, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1554, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.226725082146769, | |
| "grad_norm": 1.2567392587661743, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1683, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.2322015334063527, | |
| "grad_norm": 1.426159381866455, | |
| "learning_rate": 2e-05, | |
| "loss": 0.17, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.2376779846659365, | |
| "grad_norm": 1.435729742050171, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1404, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.2431544359255202, | |
| "grad_norm": 5.743936538696289, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1665, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.248630887185104, | |
| "grad_norm": 1.7799255847930908, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1538, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.254107338444688, | |
| "grad_norm": 2.466597318649292, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1038, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.2595837897042717, | |
| "grad_norm": 2.635021686553955, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1613, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.2650602409638554, | |
| "grad_norm": 2.1053247451782227, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1169, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.2705366922234393, | |
| "grad_norm": 2.312171459197998, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1574, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.276013143483023, | |
| "grad_norm": 4.142621994018555, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1568, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.2814895947426068, | |
| "grad_norm": 3.278440237045288, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1758, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.2869660460021906, | |
| "grad_norm": 2.5266401767730713, | |
| "learning_rate": 2e-05, | |
| "loss": 0.177, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.2924424972617743, | |
| "grad_norm": 2.4267191886901855, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1485, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.297918948521358, | |
| "grad_norm": 1.5584640502929688, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1583, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.303395399780942, | |
| "grad_norm": 2.881457805633545, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1515, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.3088718510405257, | |
| "grad_norm": 4.441954612731934, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1414, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.3143483023001095, | |
| "grad_norm": 5.456357479095459, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1685, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.3198247535596934, | |
| "grad_norm": 2.449070930480957, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1445, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.3253012048192772, | |
| "grad_norm": 2.4937679767608643, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1528, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.330777656078861, | |
| "grad_norm": 1.7794448137283325, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1601, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.3362541073384446, | |
| "grad_norm": 1.795912265777588, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1542, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.3417305585980284, | |
| "grad_norm": 1.772538661956787, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1297, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.3472070098576123, | |
| "grad_norm": 1.0752304792404175, | |
| "learning_rate": 2e-05, | |
| "loss": 0.13, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.352683461117196, | |
| "grad_norm": 2.1968908309936523, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1373, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.3581599123767798, | |
| "grad_norm": 0.7487109303474426, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1646, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.3636363636363638, | |
| "grad_norm": 2.1781516075134277, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1852, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.3691128148959475, | |
| "grad_norm": 1.8818821907043457, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1481, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.3745892661555312, | |
| "grad_norm": 2.2098746299743652, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1823, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 1.380065717415115, | |
| "grad_norm": 1.5912271738052368, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1968, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.3855421686746987, | |
| "grad_norm": 1.1806056499481201, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1677, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 1.3910186199342827, | |
| "grad_norm": 1.9674164056777954, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1273, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.3964950711938664, | |
| "grad_norm": 4.151760578155518, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1658, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.4019715224534501, | |
| "grad_norm": 1.8803857564926147, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1466, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.4074479737130339, | |
| "grad_norm": 2.625727891921997, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1298, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 1.4129244249726178, | |
| "grad_norm": 2.6431047916412354, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1503, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.4184008762322016, | |
| "grad_norm": 4.68942928314209, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1375, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.4238773274917853, | |
| "grad_norm": 2.73363995552063, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1712, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.429353778751369, | |
| "grad_norm": 3.2278857231140137, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1425, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.4348302300109528, | |
| "grad_norm": 3.2857725620269775, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1111, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.4403066812705367, | |
| "grad_norm": 1.6636910438537598, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1231, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 1.4457831325301205, | |
| "grad_norm": 1.7655991315841675, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1526, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.4512595837897042, | |
| "grad_norm": 2.4831273555755615, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1626, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.4567360350492882, | |
| "grad_norm": 1.5845210552215576, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1471, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.462212486308872, | |
| "grad_norm": 2.035768985748291, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1353, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.4676889375684556, | |
| "grad_norm": 3.0364644527435303, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1534, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.4731653888280394, | |
| "grad_norm": 1.0436877012252808, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1384, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 1.4786418400876231, | |
| "grad_norm": 3.814385175704956, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1571, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.484118291347207, | |
| "grad_norm": 4.043318748474121, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1408, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 1.4895947426067908, | |
| "grad_norm": 2.101560354232788, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1506, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.4950711938663745, | |
| "grad_norm": 3.871284008026123, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1598, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 1.5005476451259585, | |
| "grad_norm": 1.0195356607437134, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1625, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 1.5060240963855422, | |
| "grad_norm": 2.7019972801208496, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1396, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.511500547645126, | |
| "grad_norm": 3.040086269378662, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1503, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.5169769989047097, | |
| "grad_norm": 1.6536140441894531, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1424, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 1.5224534501642935, | |
| "grad_norm": 2.9479269981384277, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1537, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.5279299014238772, | |
| "grad_norm": 2.638228416442871, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1517, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 1.5334063526834611, | |
| "grad_norm": 1.5154801607131958, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1627, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.5388828039430449, | |
| "grad_norm": 4.037379264831543, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1788, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.5443592552026288, | |
| "grad_norm": 3.5345592498779297, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1768, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.5498357064622126, | |
| "grad_norm": 3.8549864292144775, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1721, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 1.5553121577217963, | |
| "grad_norm": 3.5247507095336914, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1305, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.56078860898138, | |
| "grad_norm": 2.387272834777832, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1234, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.5662650602409638, | |
| "grad_norm": 3.007579803466797, | |
| "learning_rate": 2e-05, | |
| "loss": 0.152, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.5717415115005475, | |
| "grad_norm": 1.0041784048080444, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1489, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.5772179627601315, | |
| "grad_norm": 3.3091013431549072, | |
| "learning_rate": 2e-05, | |
| "loss": 0.14, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.5826944140197152, | |
| "grad_norm": 1.844616174697876, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1721, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 1.588170865279299, | |
| "grad_norm": 3.9923973083496094, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1546, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.593647316538883, | |
| "grad_norm": 2.3511135578155518, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1477, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.5991237677984667, | |
| "grad_norm": 2.524749994277954, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1613, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.6046002190580504, | |
| "grad_norm": 1.5530831813812256, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1445, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.6100766703176341, | |
| "grad_norm": 1.8088948726654053, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1446, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.6155531215772179, | |
| "grad_norm": 1.5274639129638672, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1453, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.6210295728368016, | |
| "grad_norm": 2.369565963745117, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1487, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.6265060240963856, | |
| "grad_norm": 2.4283454418182373, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1522, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.6319824753559693, | |
| "grad_norm": 4.117255687713623, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1523, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.6374589266155533, | |
| "grad_norm": 2.1403403282165527, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1558, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.642935377875137, | |
| "grad_norm": 3.7226603031158447, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1635, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.6484118291347207, | |
| "grad_norm": 3.3474371433258057, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1543, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 1.6538882803943045, | |
| "grad_norm": 2.174217700958252, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1474, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.6593647316538882, | |
| "grad_norm": 1.7523736953735352, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1487, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 1.664841182913472, | |
| "grad_norm": 2.573213577270508, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1566, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.670317634173056, | |
| "grad_norm": 1.8312263488769531, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1524, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.6757940854326396, | |
| "grad_norm": 1.8972638845443726, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1129, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.6812705366922236, | |
| "grad_norm": 2.2399697303771973, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1569, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 1.6867469879518073, | |
| "grad_norm": 2.9116086959838867, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1531, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.692223439211391, | |
| "grad_norm": 2.098607063293457, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1378, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 1.6976998904709748, | |
| "grad_norm": 1.720107913017273, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1554, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.7031763417305585, | |
| "grad_norm": 2.0600640773773193, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1541, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 1.7086527929901423, | |
| "grad_norm": 2.0780065059661865, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1551, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.714129244249726, | |
| "grad_norm": 1.9723634719848633, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1168, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 1.71960569550931, | |
| "grad_norm": 6.4908552169799805, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1157, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.7250821467688937, | |
| "grad_norm": 2.1401596069335938, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1419, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.7305585980284777, | |
| "grad_norm": 1.883585810661316, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1428, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.7360350492880614, | |
| "grad_norm": 2.2904489040374756, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1382, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 1.7415115005476451, | |
| "grad_norm": 2.3025336265563965, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2024, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.7469879518072289, | |
| "grad_norm": 1.5613994598388672, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1696, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 1.7524644030668126, | |
| "grad_norm": 1.7806004285812378, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1474, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.7579408543263964, | |
| "grad_norm": 2.04266095161438, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1537, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 1.7634173055859803, | |
| "grad_norm": 3.345473527908325, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1411, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.768893756845564, | |
| "grad_norm": 2.1662192344665527, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1536, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.774370208105148, | |
| "grad_norm": 1.1458584070205688, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1642, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.7798466593647317, | |
| "grad_norm": 4.288283824920654, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1419, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.7853231106243155, | |
| "grad_norm": 3.2075963020324707, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1667, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.7907995618838992, | |
| "grad_norm": 2.8897817134857178, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1646, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.796276013143483, | |
| "grad_norm": 2.2969679832458496, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1573, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.8017524644030667, | |
| "grad_norm": 3.1827869415283203, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1366, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.8072289156626506, | |
| "grad_norm": 3.3078675270080566, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1342, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.8127053669222344, | |
| "grad_norm": 0.969814658164978, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1314, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.8181818181818183, | |
| "grad_norm": 1.9750161170959473, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1398, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.823658269441402, | |
| "grad_norm": 2.6312105655670166, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1271, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.8291347207009858, | |
| "grad_norm": 5.169326305389404, | |
| "learning_rate": 2e-05, | |
| "loss": 0.136, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.8346111719605696, | |
| "grad_norm": 4.923961639404297, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1516, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.8400876232201533, | |
| "grad_norm": 1.6556754112243652, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1577, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.845564074479737, | |
| "grad_norm": 3.2922916412353516, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1508, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.8510405257393208, | |
| "grad_norm": 1.2395728826522827, | |
| "learning_rate": 2e-05, | |
| "loss": 0.149, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.8565169769989047, | |
| "grad_norm": 1.776043176651001, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1762, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.8619934282584885, | |
| "grad_norm": 3.395716667175293, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1709, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.8674698795180724, | |
| "grad_norm": 3.3589627742767334, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1943, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.8729463307776562, | |
| "grad_norm": 1.2186440229415894, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1607, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.87842278203724, | |
| "grad_norm": 1.260779857635498, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1522, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.8838992332968236, | |
| "grad_norm": 2.699249267578125, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1493, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.8893756845564074, | |
| "grad_norm": 1.9771623611450195, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1485, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.894852135815991, | |
| "grad_norm": 2.270580768585205, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1534, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.900328587075575, | |
| "grad_norm": 1.3207887411117554, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1383, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.9058050383351588, | |
| "grad_norm": 26.00341796875, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1685, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.9112814895947428, | |
| "grad_norm": 2.4248104095458984, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1252, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.9167579408543265, | |
| "grad_norm": 3.160520315170288, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1452, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.9222343921139102, | |
| "grad_norm": 2.528468608856201, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1168, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.927710843373494, | |
| "grad_norm": 1.9054774045944214, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1375, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.9331872946330777, | |
| "grad_norm": 3.4692299365997314, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1652, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.9386637458926614, | |
| "grad_norm": 1.626815915107727, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1231, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.9441401971522454, | |
| "grad_norm": 3.7855207920074463, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1492, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.9496166484118291, | |
| "grad_norm": 1.1849123239517212, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1594, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.9550930996714129, | |
| "grad_norm": 2.7899911403656006, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1601, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.9605695509309968, | |
| "grad_norm": 1.963122010231018, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1301, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.9660460021905806, | |
| "grad_norm": 1.8025850057601929, | |
| "learning_rate": 2e-05, | |
| "loss": 0.156, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.9715224534501643, | |
| "grad_norm": 1.5995118618011475, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1348, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.976998904709748, | |
| "grad_norm": 1.159638524055481, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1288, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.9824753559693318, | |
| "grad_norm": 1.3912004232406616, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1161, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.9879518072289155, | |
| "grad_norm": 1.0395070314407349, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1386, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.9934282584884995, | |
| "grad_norm": 1.532216191291809, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1213, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.9989047097480832, | |
| "grad_norm": 1.4489120244979858, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1123, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.941111445007298, | |
| "eval_f1": 0.8677761381181066, | |
| "eval_loss": 0.15645764768123627, | |
| "eval_precision": 0.819916825171669, | |
| "eval_recall": 0.9215689826977082, | |
| "eval_runtime": 19.7981, | |
| "eval_samples_per_second": 312.404, | |
| "eval_steps_per_second": 19.547, | |
| "step": 3652 | |
| }, | |
| { | |
| "epoch": 2.004381161007667, | |
| "grad_norm": 1.69236159324646, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1479, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 2.009857612267251, | |
| "grad_norm": 3.8225982189178467, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1477, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 2.0153340635268346, | |
| "grad_norm": 4.383903980255127, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1094, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 2.0208105147864184, | |
| "grad_norm": 3.1119155883789062, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1602, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 2.026286966046002, | |
| "grad_norm": 3.3700242042541504, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1303, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.031763417305586, | |
| "grad_norm": 1.0470126867294312, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1174, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 2.0372398685651696, | |
| "grad_norm": 2.912874698638916, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1336, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 2.0427163198247538, | |
| "grad_norm": 0.8620438575744629, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1112, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 2.0481927710843375, | |
| "grad_norm": 2.3170716762542725, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1208, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 2.0536692223439212, | |
| "grad_norm": 1.4915480613708496, | |
| "learning_rate": 2e-05, | |
| "loss": 0.136, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.059145673603505, | |
| "grad_norm": 1.7329208850860596, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1437, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 2.0646221248630887, | |
| "grad_norm": 1.5879555940628052, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1338, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 2.0700985761226725, | |
| "grad_norm": 0.7418123483657837, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1013, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 2.075575027382256, | |
| "grad_norm": 1.0119812488555908, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0841, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 2.08105147864184, | |
| "grad_norm": 1.383432149887085, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1212, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.0865279299014237, | |
| "grad_norm": 2.614387273788452, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1228, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 2.092004381161008, | |
| "grad_norm": 2.6762051582336426, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1491, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 2.0974808324205916, | |
| "grad_norm": 3.3792619705200195, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1161, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 2.1029572836801753, | |
| "grad_norm": 2.690113067626953, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1093, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 2.108433734939759, | |
| "grad_norm": 1.5759937763214111, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1406, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.113910186199343, | |
| "grad_norm": 1.4909275770187378, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1108, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 2.1193866374589265, | |
| "grad_norm": 2.6127500534057617, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1269, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 2.1248630887185103, | |
| "grad_norm": 2.5836493968963623, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1396, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 2.130339539978094, | |
| "grad_norm": 1.385608434677124, | |
| "learning_rate": 2e-05, | |
| "loss": 0.127, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 2.135815991237678, | |
| "grad_norm": 3.3218297958374023, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1056, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.141292442497262, | |
| "grad_norm": 1.8507598638534546, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1453, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 2.1467688937568457, | |
| "grad_norm": 3.654327630996704, | |
| "learning_rate": 2e-05, | |
| "loss": 0.122, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 2.1522453450164294, | |
| "grad_norm": 3.592478036880493, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1245, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 2.157721796276013, | |
| "grad_norm": 3.7161383628845215, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1126, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 2.163198247535597, | |
| "grad_norm": 2.2989351749420166, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0944, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.1686746987951806, | |
| "grad_norm": 2.9460718631744385, | |
| "learning_rate": 2e-05, | |
| "loss": 0.126, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 2.1741511500547643, | |
| "grad_norm": 3.1067349910736084, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1436, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 2.1796276013143485, | |
| "grad_norm": 2.155015230178833, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1033, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 2.1851040525739323, | |
| "grad_norm": 2.9963104724884033, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1443, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 2.190580503833516, | |
| "grad_norm": 1.293370246887207, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1093, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.1960569550930997, | |
| "grad_norm": 1.3873592615127563, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1139, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 2.2015334063526835, | |
| "grad_norm": 1.8804830312728882, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1554, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 2.207009857612267, | |
| "grad_norm": 4.313164710998535, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1129, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 2.212486308871851, | |
| "grad_norm": 2.9426050186157227, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1334, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 2.2179627601314347, | |
| "grad_norm": 2.560018539428711, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1492, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.2234392113910184, | |
| "grad_norm": 1.6301517486572266, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1308, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 2.2289156626506026, | |
| "grad_norm": 1.1607255935668945, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1374, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 2.2343921139101863, | |
| "grad_norm": 4.422305107116699, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1375, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 2.23986856516977, | |
| "grad_norm": 3.9398353099823, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1526, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 2.245345016429354, | |
| "grad_norm": 4.186077117919922, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1117, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.2508214676889375, | |
| "grad_norm": 3.083814859390259, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1273, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 2.2562979189485213, | |
| "grad_norm": 1.9174625873565674, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1036, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 2.261774370208105, | |
| "grad_norm": 1.3200234174728394, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1246, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 2.2672508214676887, | |
| "grad_norm": 1.504086971282959, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0944, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 2.2727272727272725, | |
| "grad_norm": 2.5579471588134766, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1242, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.2782037239868567, | |
| "grad_norm": 2.304062843322754, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1543, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 2.2836801752464404, | |
| "grad_norm": 1.507938265800476, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1277, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 2.289156626506024, | |
| "grad_norm": 4.3036346435546875, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1311, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 2.294633077765608, | |
| "grad_norm": 1.80647873878479, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1403, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 2.3001095290251916, | |
| "grad_norm": 2.770962715148926, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1521, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.3055859802847753, | |
| "grad_norm": 2.768677234649658, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1314, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 2.311062431544359, | |
| "grad_norm": 1.7572500705718994, | |
| "learning_rate": 2e-05, | |
| "loss": 0.122, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 2.3165388828039433, | |
| "grad_norm": 1.1709873676300049, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1236, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 2.322015334063527, | |
| "grad_norm": 17.13128089904785, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1509, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 2.3274917853231107, | |
| "grad_norm": 2.660583019256592, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1326, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 2.3329682365826945, | |
| "grad_norm": 1.643479347229004, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1179, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 2.338444687842278, | |
| "grad_norm": 6.019737720489502, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0953, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 2.343921139101862, | |
| "grad_norm": 1.986523985862732, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1048, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 2.3493975903614457, | |
| "grad_norm": 1.796851634979248, | |
| "learning_rate": 2e-05, | |
| "loss": 0.115, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 2.3548740416210294, | |
| "grad_norm": 1.2359439134597778, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1174, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.360350492880613, | |
| "grad_norm": 3.057445526123047, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1458, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 2.3658269441401973, | |
| "grad_norm": 1.0161036252975464, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1437, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 2.371303395399781, | |
| "grad_norm": 1.2098288536071777, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1379, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 2.376779846659365, | |
| "grad_norm": 1.4055923223495483, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1199, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 2.3822562979189486, | |
| "grad_norm": 2.134941816329956, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1303, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.3877327491785323, | |
| "grad_norm": 2.351625680923462, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1278, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 2.393209200438116, | |
| "grad_norm": 3.273850679397583, | |
| "learning_rate": 2e-05, | |
| "loss": 0.11, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 2.3986856516976998, | |
| "grad_norm": 2.0896518230438232, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1193, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 2.4041621029572835, | |
| "grad_norm": 3.240591287612915, | |
| "learning_rate": 2e-05, | |
| "loss": 0.139, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 2.4096385542168672, | |
| "grad_norm": 4.579762935638428, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1417, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.4151150054764514, | |
| "grad_norm": 2.048832654953003, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1176, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 2.420591456736035, | |
| "grad_norm": 1.8488651514053345, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1408, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 2.426067907995619, | |
| "grad_norm": 1.4034713506698608, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1246, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 2.4315443592552026, | |
| "grad_norm": 1.0171767473220825, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1118, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 2.4370208105147864, | |
| "grad_norm": 4.190380573272705, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1527, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.44249726177437, | |
| "grad_norm": 3.9857051372528076, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1153, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 2.447973713033954, | |
| "grad_norm": 1.4358816146850586, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0983, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 2.453450164293538, | |
| "grad_norm": 1.8944737911224365, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1382, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 2.4589266155531218, | |
| "grad_norm": 1.8662302494049072, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1564, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 2.4644030668127055, | |
| "grad_norm": 1.050307035446167, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1111, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.4698795180722892, | |
| "grad_norm": 1.3058151006698608, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1526, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 2.475355969331873, | |
| "grad_norm": 2.779019355773926, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1013, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 2.4808324205914567, | |
| "grad_norm": 3.649847984313965, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1039, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 2.4863088718510404, | |
| "grad_norm": 1.4723719358444214, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1304, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 2.491785323110624, | |
| "grad_norm": 2.51281476020813, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1067, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.497261774370208, | |
| "grad_norm": 3.2945971488952637, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1354, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 2.502738225629792, | |
| "grad_norm": 3.16933536529541, | |
| "learning_rate": 2e-05, | |
| "loss": 0.127, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 2.508214676889376, | |
| "grad_norm": 1.2082220315933228, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1296, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 2.5136911281489596, | |
| "grad_norm": 4.029638767242432, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1363, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 2.5191675794085433, | |
| "grad_norm": 1.5362796783447266, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1275, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.524644030668127, | |
| "grad_norm": 2.8110194206237793, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1401, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 2.5301204819277108, | |
| "grad_norm": 1.6804673671722412, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1448, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 2.5355969331872945, | |
| "grad_norm": 7.145838260650635, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1305, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 2.5410733844468787, | |
| "grad_norm": 2.6446447372436523, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1242, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 2.546549835706462, | |
| "grad_norm": 1.742530107498169, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1151, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.552026286966046, | |
| "grad_norm": 2.4224681854248047, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1499, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 2.55750273822563, | |
| "grad_norm": 2.849701404571533, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1365, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 2.5629791894852136, | |
| "grad_norm": 5.31744384765625, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1308, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 2.5684556407447974, | |
| "grad_norm": 1.5912376642227173, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1551, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 2.573932092004381, | |
| "grad_norm": 1.0725617408752441, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1279, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.579408543263965, | |
| "grad_norm": 3.9630486965179443, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1158, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 2.5848849945235486, | |
| "grad_norm": 1.6319750547409058, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1576, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 2.5903614457831328, | |
| "grad_norm": 4.792392253875732, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1598, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 2.595837897042716, | |
| "grad_norm": 2.365724563598633, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1189, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 2.6013143483023002, | |
| "grad_norm": 4.436146259307861, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1286, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.606790799561884, | |
| "grad_norm": 1.227371096611023, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1465, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 2.6122672508214677, | |
| "grad_norm": 1.3392515182495117, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1305, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 2.6177437020810514, | |
| "grad_norm": 2.5968856811523438, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1089, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 2.623220153340635, | |
| "grad_norm": 2.4093873500823975, | |
| "learning_rate": 2e-05, | |
| "loss": 0.139, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 2.628696604600219, | |
| "grad_norm": 2.9262475967407227, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1138, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.6341730558598027, | |
| "grad_norm": 1.900516152381897, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0951, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 2.639649507119387, | |
| "grad_norm": 1.9894856214523315, | |
| "learning_rate": 2e-05, | |
| "loss": 0.184, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 2.6451259583789706, | |
| "grad_norm": 0.8725138306617737, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1043, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 2.6506024096385543, | |
| "grad_norm": 2.6368794441223145, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1496, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 2.656078860898138, | |
| "grad_norm": 2.7605247497558594, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1188, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.661555312157722, | |
| "grad_norm": 1.7536276578903198, | |
| "learning_rate": 2e-05, | |
| "loss": 0.12, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 2.6670317634173055, | |
| "grad_norm": 2.9888434410095215, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1251, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 2.6725082146768893, | |
| "grad_norm": 2.501553535461426, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1317, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 2.6779846659364734, | |
| "grad_norm": 3.2755653858184814, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1431, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 2.6834611171960567, | |
| "grad_norm": 1.1044738292694092, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1177, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.688937568455641, | |
| "grad_norm": 2.04195237159729, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1385, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 2.6944140197152247, | |
| "grad_norm": 1.8423049449920654, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1524, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 2.6998904709748084, | |
| "grad_norm": 3.017038345336914, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1309, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 2.705366922234392, | |
| "grad_norm": 2.5917625427246094, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0968, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 2.710843373493976, | |
| "grad_norm": 2.059396266937256, | |
| "learning_rate": 2e-05, | |
| "loss": 0.136, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.7163198247535596, | |
| "grad_norm": 2.7622623443603516, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1344, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 2.7217962760131433, | |
| "grad_norm": 2.050316333770752, | |
| "learning_rate": 2e-05, | |
| "loss": 0.139, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 2.7272727272727275, | |
| "grad_norm": 2.0974202156066895, | |
| "learning_rate": 2e-05, | |
| "loss": 0.133, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 2.732749178532311, | |
| "grad_norm": 4.3153533935546875, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1513, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 2.738225629791895, | |
| "grad_norm": 1.2816858291625977, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1581, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.7437020810514787, | |
| "grad_norm": 3.44547176361084, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1349, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 2.7491785323110625, | |
| "grad_norm": 1.2258213758468628, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1412, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 2.754654983570646, | |
| "grad_norm": 2.301241159439087, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1013, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 2.76013143483023, | |
| "grad_norm": 2.467775821685791, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1501, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 2.7656078860898137, | |
| "grad_norm": 7.51368522644043, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1378, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.7710843373493974, | |
| "grad_norm": 4.798404693603516, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1305, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 2.7765607886089816, | |
| "grad_norm": 1.396974802017212, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1043, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 2.7820372398685653, | |
| "grad_norm": 1.14879310131073, | |
| "learning_rate": 2e-05, | |
| "loss": 0.12, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 2.787513691128149, | |
| "grad_norm": 1.2579361200332642, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1501, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 2.792990142387733, | |
| "grad_norm": 2.2348382472991943, | |
| "learning_rate": 2e-05, | |
| "loss": 0.157, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.7984665936473165, | |
| "grad_norm": 1.5609731674194336, | |
| "learning_rate": 2e-05, | |
| "loss": 0.135, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 2.8039430449069003, | |
| "grad_norm": 2.0120913982391357, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1123, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 2.809419496166484, | |
| "grad_norm": 2.593989610671997, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1142, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 2.8148959474260677, | |
| "grad_norm": 1.574157953262329, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1701, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 2.8203723986856515, | |
| "grad_norm": 3.661393642425537, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1311, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.8258488499452357, | |
| "grad_norm": 0.9907870888710022, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1421, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 2.8313253012048194, | |
| "grad_norm": 2.06791615486145, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1353, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 2.836801752464403, | |
| "grad_norm": 3.9285051822662354, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1373, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 2.842278203723987, | |
| "grad_norm": 1.7791670560836792, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1018, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 2.8477546549835706, | |
| "grad_norm": 1.8296700716018677, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1247, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.8532311062431543, | |
| "grad_norm": 4.70639181137085, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1314, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 2.858707557502738, | |
| "grad_norm": 2.3807830810546875, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1356, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 2.8641840087623223, | |
| "grad_norm": 1.6014913320541382, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1146, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 2.8696604600219056, | |
| "grad_norm": 1.8272550106048584, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1183, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 2.8751369112814897, | |
| "grad_norm": 1.299852728843689, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1405, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 2.8806133625410735, | |
| "grad_norm": 1.1718955039978027, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1461, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 2.886089813800657, | |
| "grad_norm": 5.982026100158691, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1539, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 2.891566265060241, | |
| "grad_norm": 3.6748552322387695, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1164, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 2.8970427163198247, | |
| "grad_norm": 2.9227006435394287, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1557, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 2.9025191675794084, | |
| "grad_norm": 1.4735232591629028, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1168, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.907995618838992, | |
| "grad_norm": 3.212067127227783, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1489, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 2.9134720700985763, | |
| "grad_norm": 1.4650025367736816, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1131, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 2.91894852135816, | |
| "grad_norm": 4.302365779876709, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1705, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 2.924424972617744, | |
| "grad_norm": 2.2288858890533447, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1485, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 2.9299014238773275, | |
| "grad_norm": 1.019721508026123, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1331, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 2.9353778751369113, | |
| "grad_norm": 2.5705177783966064, | |
| "learning_rate": 2e-05, | |
| "loss": 0.122, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 2.940854326396495, | |
| "grad_norm": 1.639599084854126, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1218, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 2.9463307776560788, | |
| "grad_norm": 3.7570650577545166, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1491, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 2.9518072289156625, | |
| "grad_norm": 3.499650716781616, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1335, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 2.9572836801752462, | |
| "grad_norm": 2.9174211025238037, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1468, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.9627601314348304, | |
| "grad_norm": 3.045962333679199, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1283, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 2.968236582694414, | |
| "grad_norm": 2.5407910346984863, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1171, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 2.973713033953998, | |
| "grad_norm": 2.07804274559021, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1127, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 2.9791894852135816, | |
| "grad_norm": 2.2112016677856445, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1249, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 2.9846659364731654, | |
| "grad_norm": 4.099008083343506, | |
| "learning_rate": 2e-05, | |
| "loss": 0.141, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 2.990142387732749, | |
| "grad_norm": 1.716291069984436, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1102, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 2.995618838992333, | |
| "grad_norm": 3.4466896057128906, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1283, | |
| "step": 5470 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5478, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7443682033477008.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |