| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.7181719260065287, | |
| "eval_steps": 500, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.013601741022850925, | |
| "grad_norm": 2.812274217605591, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 5.2086, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02720348204570185, | |
| "grad_norm": 2.162468433380127, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 5.2051, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.040805223068552776, | |
| "grad_norm": 1.7208396196365356, | |
| "learning_rate": 6e-06, | |
| "loss": 5.2105, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0544069640914037, | |
| "grad_norm": 2.556692361831665, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 5.1625, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06800870511425462, | |
| "grad_norm": 8.235758781433105, | |
| "learning_rate": 1e-05, | |
| "loss": 4.6379, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.08161044613710555, | |
| "grad_norm": 10.163893699645996, | |
| "learning_rate": 1.2e-05, | |
| "loss": 3.8391, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.09521218715995647, | |
| "grad_norm": 11.116437911987305, | |
| "learning_rate": 1.4000000000000001e-05, | |
| "loss": 3.0016, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1088139281828074, | |
| "grad_norm": 10.696512222290039, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 2.4004, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.12241566920565833, | |
| "grad_norm": 12.367400169372559, | |
| "learning_rate": 1.8e-05, | |
| "loss": 1.8969, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.13601741022850924, | |
| "grad_norm": 13.723505973815918, | |
| "learning_rate": 2e-05, | |
| "loss": 1.4616, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.14961915125136016, | |
| "grad_norm": 22.034290313720703, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 1.2379, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.1632208922742111, | |
| "grad_norm": 8.55447769165039, | |
| "learning_rate": 2.4e-05, | |
| "loss": 1.0951, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.17682263329706202, | |
| "grad_norm": 11.120705604553223, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 0.9654, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.19042437431991294, | |
| "grad_norm": 13.087006568908691, | |
| "learning_rate": 2.8000000000000003e-05, | |
| "loss": 0.8391, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.20402611534276388, | |
| "grad_norm": 12.556931495666504, | |
| "learning_rate": 3e-05, | |
| "loss": 0.7075, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2176278563656148, | |
| "grad_norm": 14.245359420776367, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 0.6853, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2312295973884657, | |
| "grad_norm": 12.10800838470459, | |
| "learning_rate": 3.4000000000000007e-05, | |
| "loss": 0.606, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.24483133841131666, | |
| "grad_norm": 11.180713653564453, | |
| "learning_rate": 3.6e-05, | |
| "loss": 0.5734, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.2584330794341676, | |
| "grad_norm": 12.638975143432617, | |
| "learning_rate": 3.8e-05, | |
| "loss": 0.545, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.2720348204570185, | |
| "grad_norm": 11.650677680969238, | |
| "learning_rate": 4e-05, | |
| "loss": 0.5162, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.28563656147986943, | |
| "grad_norm": 11.61837387084961, | |
| "learning_rate": 4.2e-05, | |
| "loss": 0.491, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.2992383025027203, | |
| "grad_norm": 5.453939914703369, | |
| "learning_rate": 4.4000000000000006e-05, | |
| "loss": 0.4378, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.31284004352557127, | |
| "grad_norm": 7.908788204193115, | |
| "learning_rate": 4.600000000000001e-05, | |
| "loss": 0.4299, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3264417845484222, | |
| "grad_norm": 13.816142082214355, | |
| "learning_rate": 4.8e-05, | |
| "loss": 0.4113, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3400435255712731, | |
| "grad_norm": 11.430989265441895, | |
| "learning_rate": 5e-05, | |
| "loss": 0.4371, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.35364526659412404, | |
| "grad_norm": 6.766596794128418, | |
| "learning_rate": 5.2000000000000004e-05, | |
| "loss": 0.387, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.367247007616975, | |
| "grad_norm": 10.817462921142578, | |
| "learning_rate": 5.4000000000000005e-05, | |
| "loss": 0.3643, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.3808487486398259, | |
| "grad_norm": 6.278716564178467, | |
| "learning_rate": 5.6000000000000006e-05, | |
| "loss": 0.3477, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3944504896626768, | |
| "grad_norm": 13.244380950927734, | |
| "learning_rate": 5.8e-05, | |
| "loss": 0.3763, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.40805223068552776, | |
| "grad_norm": 4.513805866241455, | |
| "learning_rate": 6e-05, | |
| "loss": 0.3093, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.42165397170837865, | |
| "grad_norm": 4.978238582611084, | |
| "learning_rate": 6.2e-05, | |
| "loss": 0.306, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.4352557127312296, | |
| "grad_norm": 9.654162406921387, | |
| "learning_rate": 6.400000000000001e-05, | |
| "loss": 0.3157, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.44885745375408054, | |
| "grad_norm": 5.806159496307373, | |
| "learning_rate": 6.6e-05, | |
| "loss": 0.302, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.4624591947769314, | |
| "grad_norm": 6.283710479736328, | |
| "learning_rate": 6.800000000000001e-05, | |
| "loss": 0.2885, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.47606093579978237, | |
| "grad_norm": 6.1870951652526855, | |
| "learning_rate": 7e-05, | |
| "loss": 0.289, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.4896626768226333, | |
| "grad_norm": 3.082080364227295, | |
| "learning_rate": 7.2e-05, | |
| "loss": 0.2584, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5032644178454843, | |
| "grad_norm": 5.254792213439941, | |
| "learning_rate": 7.4e-05, | |
| "loss": 0.2752, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5168661588683352, | |
| "grad_norm": 6.462503433227539, | |
| "learning_rate": 7.6e-05, | |
| "loss": 0.2639, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.530467899891186, | |
| "grad_norm": 10.89343547821045, | |
| "learning_rate": 7.800000000000001e-05, | |
| "loss": 0.2893, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.544069640914037, | |
| "grad_norm": 5.2178192138671875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2415, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5576713819368879, | |
| "grad_norm": 8.687518119812012, | |
| "learning_rate": 8.2e-05, | |
| "loss": 0.2385, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5712731229597389, | |
| "grad_norm": 4.670180320739746, | |
| "learning_rate": 8.4e-05, | |
| "loss": 0.2297, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5848748639825898, | |
| "grad_norm": 3.699598550796509, | |
| "learning_rate": 8.6e-05, | |
| "loss": 0.2241, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5984766050054406, | |
| "grad_norm": 3.260232925415039, | |
| "learning_rate": 8.800000000000001e-05, | |
| "loss": 0.216, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.6120783460282916, | |
| "grad_norm": 3.4559569358825684, | |
| "learning_rate": 9e-05, | |
| "loss": 0.2275, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6256800870511425, | |
| "grad_norm": 3.7167000770568848, | |
| "learning_rate": 9.200000000000001e-05, | |
| "loss": 0.2133, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.6392818280739935, | |
| "grad_norm": 4.1776628494262695, | |
| "learning_rate": 9.4e-05, | |
| "loss": 0.2194, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6528835690968444, | |
| "grad_norm": 4.609129428863525, | |
| "learning_rate": 9.6e-05, | |
| "loss": 0.2088, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.6664853101196954, | |
| "grad_norm": 5.062413215637207, | |
| "learning_rate": 9.8e-05, | |
| "loss": 0.2185, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.6800870511425462, | |
| "grad_norm": 3.3144760131835938, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2176, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6936887921653971, | |
| "grad_norm": 2.5662150382995605, | |
| "learning_rate": 0.00010200000000000001, | |
| "loss": 0.2108, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.7072905331882481, | |
| "grad_norm": 2.718777656555176, | |
| "learning_rate": 0.00010400000000000001, | |
| "loss": 0.1734, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.720892274211099, | |
| "grad_norm": 2.934107780456543, | |
| "learning_rate": 0.00010600000000000002, | |
| "loss": 0.209, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.73449401523395, | |
| "grad_norm": 2.3642656803131104, | |
| "learning_rate": 0.00010800000000000001, | |
| "loss": 0.1958, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.7480957562568009, | |
| "grad_norm": 2.765012502670288, | |
| "learning_rate": 0.00011000000000000002, | |
| "loss": 0.1857, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.7616974972796517, | |
| "grad_norm": 2.482921600341797, | |
| "learning_rate": 0.00011200000000000001, | |
| "loss": 0.186, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.7752992383025027, | |
| "grad_norm": 2.26837420463562, | |
| "learning_rate": 0.00011399999999999999, | |
| "loss": 0.1783, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.7889009793253536, | |
| "grad_norm": 2.1319069862365723, | |
| "learning_rate": 0.000116, | |
| "loss": 0.1731, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.8025027203482046, | |
| "grad_norm": 1.4668622016906738, | |
| "learning_rate": 0.000118, | |
| "loss": 0.1854, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.8161044613710555, | |
| "grad_norm": 2.2079055309295654, | |
| "learning_rate": 0.00012, | |
| "loss": 0.1806, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8297062023939065, | |
| "grad_norm": 1.9991375207901, | |
| "learning_rate": 0.000122, | |
| "loss": 0.1682, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.8433079434167573, | |
| "grad_norm": 1.889404296875, | |
| "learning_rate": 0.000124, | |
| "loss": 0.162, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.8569096844396082, | |
| "grad_norm": 1.7021093368530273, | |
| "learning_rate": 0.000126, | |
| "loss": 0.1521, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.8705114254624592, | |
| "grad_norm": 2.153775215148926, | |
| "learning_rate": 0.00012800000000000002, | |
| "loss": 0.1656, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.8841131664853101, | |
| "grad_norm": 2.285336494445801, | |
| "learning_rate": 0.00013000000000000002, | |
| "loss": 0.1609, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.8977149075081611, | |
| "grad_norm": 1.8330312967300415, | |
| "learning_rate": 0.000132, | |
| "loss": 0.151, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.911316648531012, | |
| "grad_norm": 1.712404489517212, | |
| "learning_rate": 0.000134, | |
| "loss": 0.1577, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.9249183895538629, | |
| "grad_norm": 1.6210945844650269, | |
| "learning_rate": 0.00013600000000000003, | |
| "loss": 0.1641, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.9385201305767138, | |
| "grad_norm": 1.661000370979309, | |
| "learning_rate": 0.000138, | |
| "loss": 0.1543, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.9521218715995647, | |
| "grad_norm": 1.754515290260315, | |
| "learning_rate": 0.00014, | |
| "loss": 0.1621, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.9657236126224157, | |
| "grad_norm": 1.6415514945983887, | |
| "learning_rate": 0.000142, | |
| "loss": 0.144, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.9793253536452666, | |
| "grad_norm": 2.2498250007629395, | |
| "learning_rate": 0.000144, | |
| "loss": 0.1502, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.9929270946681176, | |
| "grad_norm": 1.6925517320632935, | |
| "learning_rate": 0.000146, | |
| "loss": 0.1546, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.0054406964091405, | |
| "grad_norm": 1.8807954788208008, | |
| "learning_rate": 0.000148, | |
| "loss": 0.1409, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.0190424374319913, | |
| "grad_norm": 1.611089825630188, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.1578, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.0326441784548421, | |
| "grad_norm": 1.1505298614501953, | |
| "learning_rate": 0.000152, | |
| "loss": 0.1371, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.0462459194776932, | |
| "grad_norm": 1.5354456901550293, | |
| "learning_rate": 0.000154, | |
| "loss": 0.1407, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.059847660500544, | |
| "grad_norm": 1.178806185722351, | |
| "learning_rate": 0.00015600000000000002, | |
| "loss": 0.141, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.073449401523395, | |
| "grad_norm": 1.3755444288253784, | |
| "learning_rate": 0.00015800000000000002, | |
| "loss": 0.1287, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.087051142546246, | |
| "grad_norm": 1.5800870656967163, | |
| "learning_rate": 0.00016, | |
| "loss": 0.1382, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.1006528835690967, | |
| "grad_norm": 1.1685993671417236, | |
| "learning_rate": 0.000162, | |
| "loss": 0.1363, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.1142546245919478, | |
| "grad_norm": 1.1938755512237549, | |
| "learning_rate": 0.000164, | |
| "loss": 0.1316, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.1278563656147986, | |
| "grad_norm": 1.2022426128387451, | |
| "learning_rate": 0.000166, | |
| "loss": 0.1313, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.1414581066376497, | |
| "grad_norm": 1.1900382041931152, | |
| "learning_rate": 0.000168, | |
| "loss": 0.1256, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.1550598476605005, | |
| "grad_norm": 1.067172884941101, | |
| "learning_rate": 0.00017, | |
| "loss": 0.1385, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.1686615886833516, | |
| "grad_norm": 1.4434224367141724, | |
| "learning_rate": 0.000172, | |
| "loss": 0.1382, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.1822633297062024, | |
| "grad_norm": 1.0884168148040771, | |
| "learning_rate": 0.000174, | |
| "loss": 0.1266, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.1958650707290532, | |
| "grad_norm": 1.3909893035888672, | |
| "learning_rate": 0.00017600000000000002, | |
| "loss": 0.1215, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.2094668117519043, | |
| "grad_norm": 0.9344027042388916, | |
| "learning_rate": 0.00017800000000000002, | |
| "loss": 0.119, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.2230685527747551, | |
| "grad_norm": 1.1694083213806152, | |
| "learning_rate": 0.00018, | |
| "loss": 0.1192, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.2366702937976062, | |
| "grad_norm": 0.9874443411827087, | |
| "learning_rate": 0.000182, | |
| "loss": 0.1278, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.250272034820457, | |
| "grad_norm": 1.2893680334091187, | |
| "learning_rate": 0.00018400000000000003, | |
| "loss": 0.1318, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.263873775843308, | |
| "grad_norm": 1.3346811532974243, | |
| "learning_rate": 0.00018600000000000002, | |
| "loss": 0.128, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.277475516866159, | |
| "grad_norm": 0.9889335632324219, | |
| "learning_rate": 0.000188, | |
| "loss": 0.1372, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.2910772578890097, | |
| "grad_norm": 1.2218222618103027, | |
| "learning_rate": 0.00019, | |
| "loss": 0.1307, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.3046789989118608, | |
| "grad_norm": 0.897546112537384, | |
| "learning_rate": 0.000192, | |
| "loss": 0.1217, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.3182807399347116, | |
| "grad_norm": 0.9673519730567932, | |
| "learning_rate": 0.000194, | |
| "loss": 0.1094, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.3318824809575625, | |
| "grad_norm": 0.9263612031936646, | |
| "learning_rate": 0.000196, | |
| "loss": 0.1111, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.3454842219804135, | |
| "grad_norm": 1.0785088539123535, | |
| "learning_rate": 0.00019800000000000002, | |
| "loss": 0.12, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.3590859630032643, | |
| "grad_norm": 0.8844039440155029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.1211, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.3726877040261154, | |
| "grad_norm": 0.8651229739189148, | |
| "learning_rate": 0.0001999999906373993, | |
| "loss": 0.1291, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.3862894450489662, | |
| "grad_norm": 1.0854979753494263, | |
| "learning_rate": 0.000199999962549599, | |
| "loss": 0.125, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.3998911860718173, | |
| "grad_norm": 1.303252935409546, | |
| "learning_rate": 0.00019999991573660427, | |
| "loss": 0.1295, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.4134929270946681, | |
| "grad_norm": 1.0601307153701782, | |
| "learning_rate": 0.00019999985019842397, | |
| "loss": 0.1363, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.427094668117519, | |
| "grad_norm": 0.8127331733703613, | |
| "learning_rate": 0.0001999997659350703, | |
| "loss": 0.1124, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.44069640914037, | |
| "grad_norm": 0.867559015750885, | |
| "learning_rate": 0.0001999996629465591, | |
| "loss": 0.1157, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.4542981501632208, | |
| "grad_norm": 0.8963221907615662, | |
| "learning_rate": 0.0001999995412329096, | |
| "loss": 0.1197, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.467899891186072, | |
| "grad_norm": 0.7211653590202332, | |
| "learning_rate": 0.00019999940079414464, | |
| "loss": 0.11, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.4815016322089227, | |
| "grad_norm": 0.8746846914291382, | |
| "learning_rate": 0.00019999924163029048, | |
| "loss": 0.104, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.4951033732317738, | |
| "grad_norm": 0.6718381643295288, | |
| "learning_rate": 0.00019999906374137694, | |
| "loss": 0.1041, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.5087051142546246, | |
| "grad_norm": 0.7146100997924805, | |
| "learning_rate": 0.00019999886712743732, | |
| "loss": 0.0894, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.5223068552774754, | |
| "grad_norm": 0.784561276435852, | |
| "learning_rate": 0.00019999865178850845, | |
| "loss": 0.0885, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.5359085963003265, | |
| "grad_norm": 0.610625147819519, | |
| "learning_rate": 0.00019999841772463066, | |
| "loss": 0.0853, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.5495103373231773, | |
| "grad_norm": 0.5619096159934998, | |
| "learning_rate": 0.00019999816493584775, | |
| "loss": 0.0788, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.5631120783460282, | |
| "grad_norm": 0.5502200126647949, | |
| "learning_rate": 0.00019999789342220708, | |
| "loss": 0.0802, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.5767138193688792, | |
| "grad_norm": 0.6030136942863464, | |
| "learning_rate": 0.0001999976031837595, | |
| "loss": 0.083, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.5903155603917303, | |
| "grad_norm": 0.647160530090332, | |
| "learning_rate": 0.00019999729422055928, | |
| "loss": 0.0897, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.603917301414581, | |
| "grad_norm": 0.7512218952178955, | |
| "learning_rate": 0.00019999696653266437, | |
| "loss": 0.08, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.617519042437432, | |
| "grad_norm": 0.5823985934257507, | |
| "learning_rate": 0.00019999662012013612, | |
| "loss": 0.0772, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.631120783460283, | |
| "grad_norm": 0.6494550108909607, | |
| "learning_rate": 0.00019999625498303932, | |
| "loss": 0.0772, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.6447225244831338, | |
| "grad_norm": 0.5662053823471069, | |
| "learning_rate": 0.00019999587112144244, | |
| "loss": 0.0695, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.6583242655059847, | |
| "grad_norm": 0.4981078803539276, | |
| "learning_rate": 0.0001999954685354173, | |
| "loss": 0.0742, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.6719260065288357, | |
| "grad_norm": 0.5595643520355225, | |
| "learning_rate": 0.00019999504722503927, | |
| "loss": 0.0693, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.6855277475516868, | |
| "grad_norm": 0.49906110763549805, | |
| "learning_rate": 0.00019999460719038732, | |
| "loss": 0.0692, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.6991294885745374, | |
| "grad_norm": 0.5288122296333313, | |
| "learning_rate": 0.00019999414843154375, | |
| "loss": 0.0689, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.7127312295973884, | |
| "grad_norm": 0.48555830121040344, | |
| "learning_rate": 0.00019999367094859452, | |
| "loss": 0.0655, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.7263329706202395, | |
| "grad_norm": 0.5002060532569885, | |
| "learning_rate": 0.00019999317474162905, | |
| "loss": 0.0707, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.7399347116430903, | |
| "grad_norm": 0.4510345458984375, | |
| "learning_rate": 0.0001999926598107402, | |
| "loss": 0.06, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.7535364526659412, | |
| "grad_norm": 0.5075559020042419, | |
| "learning_rate": 0.00019999212615602445, | |
| "loss": 0.0675, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.7671381936887922, | |
| "grad_norm": 0.5471305251121521, | |
| "learning_rate": 0.0001999915737775817, | |
| "loss": 0.0661, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.780739934711643, | |
| "grad_norm": 0.6649473905563354, | |
| "learning_rate": 0.00019999100267551538, | |
| "loss": 0.0746, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.7943416757344939, | |
| "grad_norm": 0.6705607175827026, | |
| "learning_rate": 0.00019999041284993245, | |
| "loss": 0.075, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.807943416757345, | |
| "grad_norm": 0.645964503288269, | |
| "learning_rate": 0.00019998980430094334, | |
| "loss": 0.0825, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.821545157780196, | |
| "grad_norm": 0.48304426670074463, | |
| "learning_rate": 0.00019998917702866202, | |
| "loss": 0.0726, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.8351468988030468, | |
| "grad_norm": 0.5829260349273682, | |
| "learning_rate": 0.00019998853103320592, | |
| "loss": 0.0674, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.8487486398258977, | |
| "grad_norm": 0.6563496589660645, | |
| "learning_rate": 0.00019998786631469603, | |
| "loss": 0.0666, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.8623503808487487, | |
| "grad_norm": 0.5735076069831848, | |
| "learning_rate": 0.00019998718287325676, | |
| "loss": 0.0686, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.8759521218715995, | |
| "grad_norm": 0.5888078212738037, | |
| "learning_rate": 0.0001999864807090162, | |
| "loss": 0.0728, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.8895538628944504, | |
| "grad_norm": 0.5383855700492859, | |
| "learning_rate": 0.00019998575982210572, | |
| "loss": 0.0691, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.9031556039173014, | |
| "grad_norm": 0.4812714159488678, | |
| "learning_rate": 0.0001999850202126604, | |
| "loss": 0.0591, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.9167573449401525, | |
| "grad_norm": 0.5054184794425964, | |
| "learning_rate": 0.00019998426188081865, | |
| "loss": 0.0657, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.9303590859630033, | |
| "grad_norm": 0.4280984103679657, | |
| "learning_rate": 0.0001999834848267225, | |
| "loss": 0.0619, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.9439608269858542, | |
| "grad_norm": 0.7245299816131592, | |
| "learning_rate": 0.0001999826890505175, | |
| "loss": 0.064, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.9575625680087052, | |
| "grad_norm": 0.5748353004455566, | |
| "learning_rate": 0.0001999818745523526, | |
| "loss": 0.0646, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.971164309031556, | |
| "grad_norm": 0.5662197470664978, | |
| "learning_rate": 0.00019998104133238034, | |
| "loss": 0.0646, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.9847660500544069, | |
| "grad_norm": 0.47684717178344727, | |
| "learning_rate": 0.00019998018939075673, | |
| "loss": 0.0626, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.998367791077258, | |
| "grad_norm": 0.4988132417201996, | |
| "learning_rate": 0.00019997931872764132, | |
| "loss": 0.0596, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.010881392818281, | |
| "grad_norm": 0.48226118087768555, | |
| "learning_rate": 0.0001999784293431971, | |
| "loss": 0.0579, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.0244831338411315, | |
| "grad_norm": 0.44725948572158813, | |
| "learning_rate": 0.0001999775212375907, | |
| "loss": 0.0575, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.0380848748639826, | |
| "grad_norm": 0.5599634051322937, | |
| "learning_rate": 0.00019997659441099206, | |
| "loss": 0.0594, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.0516866158868337, | |
| "grad_norm": 0.47687241435050964, | |
| "learning_rate": 0.00019997564886357476, | |
| "loss": 0.0601, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.0652883569096843, | |
| "grad_norm": 0.38273605704307556, | |
| "learning_rate": 0.0001999746845955159, | |
| "loss": 0.0551, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.0788900979325353, | |
| "grad_norm": 0.5022798180580139, | |
| "learning_rate": 0.00019997370160699602, | |
| "loss": 0.0628, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.0924918389553864, | |
| "grad_norm": 0.4850836992263794, | |
| "learning_rate": 0.00019997269989819916, | |
| "loss": 0.0562, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.1060935799782374, | |
| "grad_norm": 0.45668402314186096, | |
| "learning_rate": 0.0001999716794693129, | |
| "loss": 0.0542, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.119695321001088, | |
| "grad_norm": 0.4036906361579895, | |
| "learning_rate": 0.00019997064032052837, | |
| "loss": 0.0556, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.133297062023939, | |
| "grad_norm": 0.46593400835990906, | |
| "learning_rate": 0.00019996958245204009, | |
| "loss": 0.0563, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.14689880304679, | |
| "grad_norm": 0.4298928380012512, | |
| "learning_rate": 0.00019996850586404615, | |
| "loss": 0.0531, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.1605005440696408, | |
| "grad_norm": 0.40037456154823303, | |
| "learning_rate": 0.00019996741055674816, | |
| "loss": 0.0519, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.174102285092492, | |
| "grad_norm": 0.4456872344017029, | |
| "learning_rate": 0.00019996629653035126, | |
| "loss": 0.0477, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.187704026115343, | |
| "grad_norm": 0.5066975355148315, | |
| "learning_rate": 0.000199965163785064, | |
| "loss": 0.049, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.2013057671381935, | |
| "grad_norm": 0.39652055501937866, | |
| "learning_rate": 0.0001999640123210985, | |
| "loss": 0.0485, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.2149075081610445, | |
| "grad_norm": 0.41883769631385803, | |
| "learning_rate": 0.00019996284213867033, | |
| "loss": 0.051, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.2285092491838956, | |
| "grad_norm": 0.4462110996246338, | |
| "learning_rate": 0.0001999616532379987, | |
| "loss": 0.0515, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.2421109902067466, | |
| "grad_norm": 0.5779175758361816, | |
| "learning_rate": 0.00019996044561930622, | |
| "loss": 0.0569, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.2557127312295973, | |
| "grad_norm": 0.42402154207229614, | |
| "learning_rate": 0.00019995921928281894, | |
| "loss": 0.0519, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.2693144722524483, | |
| "grad_norm": 0.642371654510498, | |
| "learning_rate": 0.00019995797422876654, | |
| "loss": 0.0612, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.2829162132752994, | |
| "grad_norm": 0.44712305068969727, | |
| "learning_rate": 0.0001999567104573822, | |
| "loss": 0.0555, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.29651795429815, | |
| "grad_norm": 0.5107985138893127, | |
| "learning_rate": 0.0001999554279689025, | |
| "loss": 0.0629, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.310119695321001, | |
| "grad_norm": 0.4252520203590393, | |
| "learning_rate": 0.00019995412676356762, | |
| "loss": 0.0511, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.323721436343852, | |
| "grad_norm": 0.413331001996994, | |
| "learning_rate": 0.0001999528068416212, | |
| "loss": 0.0504, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.337323177366703, | |
| "grad_norm": 0.4216318726539612, | |
| "learning_rate": 0.0001999514682033104, | |
| "loss": 0.0539, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.3509249183895538, | |
| "grad_norm": 0.5241215825080872, | |
| "learning_rate": 0.0001999501108488859, | |
| "loss": 0.0551, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.364526659412405, | |
| "grad_norm": 0.48192480206489563, | |
| "learning_rate": 0.00019994873477860185, | |
| "loss": 0.0591, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.378128400435256, | |
| "grad_norm": 0.5567557215690613, | |
| "learning_rate": 0.00019994733999271596, | |
| "loss": 0.0522, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.3917301414581065, | |
| "grad_norm": 0.4027807414531708, | |
| "learning_rate": 0.00019994592649148933, | |
| "loss": 0.0524, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.4053318824809575, | |
| "grad_norm": 0.40923604369163513, | |
| "learning_rate": 0.0001999444942751867, | |
| "loss": 0.0533, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.4189336235038086, | |
| "grad_norm": 0.37654557824134827, | |
| "learning_rate": 0.00019994304334407622, | |
| "loss": 0.0482, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.432535364526659, | |
| "grad_norm": 0.37279561161994934, | |
| "learning_rate": 0.00019994157369842964, | |
| "loss": 0.0448, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.4461371055495102, | |
| "grad_norm": 0.34426528215408325, | |
| "learning_rate": 0.0001999400853385221, | |
| "loss": 0.0467, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.4597388465723613, | |
| "grad_norm": 0.4583146870136261, | |
| "learning_rate": 0.00019993857826463231, | |
| "loss": 0.0501, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.4733405875952124, | |
| "grad_norm": 0.4296802878379822, | |
| "learning_rate": 0.00019993705247704245, | |
| "loss": 0.0475, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.486942328618063, | |
| "grad_norm": 0.3417333960533142, | |
| "learning_rate": 0.00019993550797603828, | |
| "loss": 0.0475, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.500544069640914, | |
| "grad_norm": 0.3391024172306061, | |
| "learning_rate": 0.000199933944761909, | |
| "loss": 0.0428, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.514145810663765, | |
| "grad_norm": 0.4173300862312317, | |
| "learning_rate": 0.00019993236283494728, | |
| "loss": 0.0487, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.527747551686616, | |
| "grad_norm": 0.41140830516815186, | |
| "learning_rate": 0.00019993076219544938, | |
| "loss": 0.0499, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.5413492927094667, | |
| "grad_norm": 0.4615647494792938, | |
| "learning_rate": 0.00019992914284371497, | |
| "loss": 0.0485, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.554951033732318, | |
| "grad_norm": 0.4390008747577667, | |
| "learning_rate": 0.00019992750478004738, | |
| "loss": 0.0476, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.568552774755169, | |
| "grad_norm": 0.36641502380371094, | |
| "learning_rate": 0.00019992584800475322, | |
| "loss": 0.0421, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.5821545157780195, | |
| "grad_norm": 0.43274542689323425, | |
| "learning_rate": 0.00019992417251814282, | |
| "loss": 0.048, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.5957562568008705, | |
| "grad_norm": 0.39262470602989197, | |
| "learning_rate": 0.0001999224783205299, | |
| "loss": 0.0482, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.6093579978237216, | |
| "grad_norm": 0.3618634343147278, | |
| "learning_rate": 0.0001999207654122316, | |
| "loss": 0.0464, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.622959738846572, | |
| "grad_norm": 0.3481171727180481, | |
| "learning_rate": 0.0001999190337935688, | |
| "loss": 0.0478, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.6365614798694232, | |
| "grad_norm": 0.33801841735839844, | |
| "learning_rate": 0.0001999172834648657, | |
| "loss": 0.0444, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.6501632208922743, | |
| "grad_norm": 0.337028443813324, | |
| "learning_rate": 0.00019991551442645006, | |
| "loss": 0.0403, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.663764961915125, | |
| "grad_norm": 0.3380604684352875, | |
| "learning_rate": 0.0001999137266786531, | |
| "loss": 0.0483, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.677366702937976, | |
| "grad_norm": 0.44155481457710266, | |
| "learning_rate": 0.0001999119202218096, | |
| "loss": 0.045, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.690968443960827, | |
| "grad_norm": 0.43442797660827637, | |
| "learning_rate": 0.00019991009505625784, | |
| "loss": 0.0442, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.704570184983678, | |
| "grad_norm": 0.4352855980396271, | |
| "learning_rate": 0.00019990825118233957, | |
| "loss": 0.0463, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.7181719260065287, | |
| "grad_norm": 0.5073570609092712, | |
| "learning_rate": 0.00019990638860040006, | |
| "loss": 0.0492, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 73600, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 100, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.54846097526789e+21, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |