| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 100, |
| "global_step": 1827, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.016420361247947456, |
| "grad_norm": 0.24344406443731442, |
| "learning_rate": 1.092896174863388e-05, |
| "loss": 0.9892, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03284072249589491, |
| "grad_norm": 0.4125241392130406, |
| "learning_rate": 2.185792349726776e-05, |
| "loss": 1.0156, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04926108374384237, |
| "grad_norm": 0.6820350190299901, |
| "learning_rate": 3.2786885245901635e-05, |
| "loss": 0.9011, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06568144499178982, |
| "grad_norm": 0.3337377325006549, |
| "learning_rate": 4.371584699453552e-05, |
| "loss": 0.7712, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08210180623973727, |
| "grad_norm": 0.25141334294130874, |
| "learning_rate": 5.4644808743169406e-05, |
| "loss": 0.6344, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.09852216748768473, |
| "grad_norm": 0.24065515098304474, |
| "learning_rate": 6.557377049180327e-05, |
| "loss": 0.5561, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.11494252873563218, |
| "grad_norm": 0.22706179761932083, |
| "learning_rate": 7.650273224043716e-05, |
| "loss": 0.5197, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.13136288998357964, |
| "grad_norm": 0.27088874717723194, |
| "learning_rate": 8.743169398907104e-05, |
| "loss": 0.5356, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.1477832512315271, |
| "grad_norm": 0.20612532388807284, |
| "learning_rate": 9.836065573770493e-05, |
| "loss": 0.4876, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.16420361247947454, |
| "grad_norm": 0.19862741522784605, |
| "learning_rate": 0.00010928961748633881, |
| "loss": 0.4525, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.16420361247947454, |
| "eval_loss": 0.46812304854393005, |
| "eval_runtime": 183.2125, |
| "eval_samples_per_second": 23.639, |
| "eval_steps_per_second": 2.958, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.180623973727422, |
| "grad_norm": 0.2101845619328537, |
| "learning_rate": 0.00012021857923497268, |
| "loss": 0.506, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.19704433497536947, |
| "grad_norm": 0.2349100061009304, |
| "learning_rate": 0.00013114754098360654, |
| "loss": 0.4962, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2134646962233169, |
| "grad_norm": 0.28742618935606645, |
| "learning_rate": 0.00014207650273224045, |
| "loss": 0.4713, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.22988505747126436, |
| "grad_norm": 0.29754352649151433, |
| "learning_rate": 0.0001530054644808743, |
| "loss": 0.4533, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.24630541871921183, |
| "grad_norm": 0.24471382470916278, |
| "learning_rate": 0.0001639344262295082, |
| "loss": 0.4405, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2627257799671593, |
| "grad_norm": 0.25817141257326837, |
| "learning_rate": 0.00017486338797814208, |
| "loss": 0.4467, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2791461412151067, |
| "grad_norm": 0.2989945379301817, |
| "learning_rate": 0.00018579234972677597, |
| "loss": 0.4422, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.2955665024630542, |
| "grad_norm": 0.3228970636746939, |
| "learning_rate": 0.00019672131147540985, |
| "loss": 0.4353, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.31198686371100165, |
| "grad_norm": 0.20939874727194543, |
| "learning_rate": 0.00019999105344723812, |
| "loss": 0.4237, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.3284072249589491, |
| "grad_norm": 0.18763922543710346, |
| "learning_rate": 0.0001999472374506253, |
| "loss": 0.4298, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3284072249589491, |
| "eval_loss": 0.421655535697937, |
| "eval_runtime": 183.1202, |
| "eval_samples_per_second": 23.651, |
| "eval_steps_per_second": 2.96, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3448275862068966, |
| "grad_norm": 0.20346107785301223, |
| "learning_rate": 0.00019986692474561292, |
| "loss": 0.4229, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.361247947454844, |
| "grad_norm": 0.2148306578056735, |
| "learning_rate": 0.00019975014465916825, |
| "loss": 0.4065, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.37766830870279144, |
| "grad_norm": 0.232976318701319, |
| "learning_rate": 0.00019959693983467874, |
| "loss": 0.4122, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.39408866995073893, |
| "grad_norm": 0.26419924004160084, |
| "learning_rate": 0.00019940736621638, |
| "loss": 0.4045, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.41050903119868637, |
| "grad_norm": 0.26529960716769463, |
| "learning_rate": 0.00019918149302892746, |
| "loss": 0.4303, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.4269293924466338, |
| "grad_norm": 0.2070163052593257, |
| "learning_rate": 0.0001989194027521181, |
| "loss": 0.4064, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4433497536945813, |
| "grad_norm": 0.22108521642249446, |
| "learning_rate": 0.00019862119109077223, |
| "loss": 0.3908, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.45977011494252873, |
| "grad_norm": 0.22405910167315063, |
| "learning_rate": 0.00019828696693978615, |
| "loss": 0.4068, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.47619047619047616, |
| "grad_norm": 0.2185643225410028, |
| "learning_rate": 0.00019791685234436771, |
| "loss": 0.3992, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.49261083743842365, |
| "grad_norm": 0.19006575606804949, |
| "learning_rate": 0.0001975109824554707, |
| "loss": 0.4128, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.49261083743842365, |
| "eval_loss": 0.4074872136116028, |
| "eval_runtime": 182.3826, |
| "eval_samples_per_second": 23.747, |
| "eval_steps_per_second": 2.972, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5090311986863711, |
| "grad_norm": 0.19145583439083957, |
| "learning_rate": 0.0001970695054804429, |
| "loss": 0.4034, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5254515599343186, |
| "grad_norm": 0.2273380197603789, |
| "learning_rate": 0.00019659258262890683, |
| "loss": 0.4197, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.541871921182266, |
| "grad_norm": 0.22091852959390795, |
| "learning_rate": 0.00019608038805389252, |
| "loss": 0.3991, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5582922824302134, |
| "grad_norm": 0.22367242517683747, |
| "learning_rate": 0.00019553310878824373, |
| "loss": 0.3861, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5747126436781609, |
| "grad_norm": 0.17219186625619032, |
| "learning_rate": 0.00019495094467632113, |
| "loss": 0.3979, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5911330049261084, |
| "grad_norm": 0.19351650812107563, |
| "learning_rate": 0.00019433410830102722, |
| "loss": 0.3815, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6075533661740559, |
| "grad_norm": 0.23951228194879498, |
| "learning_rate": 0.00019368282490617964, |
| "loss": 0.4016, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.6239737274220033, |
| "grad_norm": 0.18151574403855258, |
| "learning_rate": 0.000192997332314261, |
| "loss": 0.4116, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.6403940886699507, |
| "grad_norm": 0.2062847844928879, |
| "learning_rate": 0.0001922778808395759, |
| "loss": 0.4316, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6568144499178982, |
| "grad_norm": 0.2100697627990869, |
| "learning_rate": 0.0001915247331968461, |
| "loss": 0.4247, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6568144499178982, |
| "eval_loss": 0.3982411026954651, |
| "eval_runtime": 182.3128, |
| "eval_samples_per_second": 23.756, |
| "eval_steps_per_second": 2.973, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6732348111658456, |
| "grad_norm": 0.2045711407622735, |
| "learning_rate": 0.00019073816440527778, |
| "loss": 0.4004, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6896551724137931, |
| "grad_norm": 0.2127829647814462, |
| "learning_rate": 0.00018991846168813544, |
| "loss": 0.3842, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.7060755336617406, |
| "grad_norm": 0.21955839561937393, |
| "learning_rate": 0.00018906592436785966, |
| "loss": 0.3999, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.722495894909688, |
| "grad_norm": 0.1655887989642121, |
| "learning_rate": 0.00018818086375676653, |
| "loss": 0.4066, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.7389162561576355, |
| "grad_norm": 0.24085315390514417, |
| "learning_rate": 0.00018726360304336894, |
| "loss": 0.3908, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7553366174055829, |
| "grad_norm": 0.21190935166305344, |
| "learning_rate": 0.00018631447717436115, |
| "loss": 0.4083, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.7717569786535303, |
| "grad_norm": 0.20895160435862614, |
| "learning_rate": 0.00018533383273230966, |
| "loss": 0.3995, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.7881773399014779, |
| "grad_norm": 0.2711577365645202, |
| "learning_rate": 0.0001843220278090954, |
| "loss": 0.3645, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.8045977011494253, |
| "grad_norm": 0.18438724587907176, |
| "learning_rate": 0.00018327943187515278, |
| "loss": 0.3823, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.8210180623973727, |
| "grad_norm": 0.2604449097960011, |
| "learning_rate": 0.000182206425644554, |
| "loss": 0.3922, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8210180623973727, |
| "eval_loss": 0.39160048961639404, |
| "eval_runtime": 182.2512, |
| "eval_samples_per_second": 23.764, |
| "eval_steps_per_second": 2.974, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8374384236453202, |
| "grad_norm": 0.17015787500883076, |
| "learning_rate": 0.0001811034009359877, |
| "loss": 0.3849, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.8538587848932676, |
| "grad_norm": 0.24475801339167882, |
| "learning_rate": 0.0001799707605296825, |
| "loss": 0.3985, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.8702791461412152, |
| "grad_norm": 0.19669186817709053, |
| "learning_rate": 0.00017880891802032775, |
| "loss": 0.3912, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.8866995073891626, |
| "grad_norm": 0.21566193043161921, |
| "learning_rate": 0.00017761829766604556, |
| "loss": 0.3618, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.90311986863711, |
| "grad_norm": 0.16411309691606532, |
| "learning_rate": 0.0001763993342334688, |
| "loss": 0.3742, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.9195402298850575, |
| "grad_norm": 0.20572670194937387, |
| "learning_rate": 0.00017515247283898165, |
| "loss": 0.4028, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.9359605911330049, |
| "grad_norm": 0.18095400431338843, |
| "learning_rate": 0.0001738781687861812, |
| "loss": 0.3687, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 0.17975479637736744, |
| "learning_rate": 0.0001725768873996188, |
| "loss": 0.409, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.9688013136288999, |
| "grad_norm": 0.18067906778640103, |
| "learning_rate": 0.00017124910385488238, |
| "loss": 0.3695, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.9852216748768473, |
| "grad_norm": 0.28537458222870044, |
| "learning_rate": 0.00016989530300508124, |
| "loss": 0.3634, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9852216748768473, |
| "eval_loss": 0.3875725269317627, |
| "eval_runtime": 184.7983, |
| "eval_samples_per_second": 23.436, |
| "eval_steps_per_second": 2.933, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.0016420361247949, |
| "grad_norm": 0.24626304392948783, |
| "learning_rate": 0.00016851597920379741, |
| "loss": 0.3756, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.0180623973727423, |
| "grad_norm": 0.2377389610644995, |
| "learning_rate": 0.00016711163612456758, |
| "loss": 0.3631, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.0344827586206897, |
| "grad_norm": 0.2001960352295221, |
| "learning_rate": 0.00016568278657696164, |
| "loss": 0.3469, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.0509031198686372, |
| "grad_norm": 0.24763506441381955, |
| "learning_rate": 0.00016422995231932548, |
| "loss": 0.3675, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.0673234811165846, |
| "grad_norm": 0.2770882265665396, |
| "learning_rate": 0.00016275366386825572, |
| "loss": 0.3556, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.083743842364532, |
| "grad_norm": 0.22949474804382808, |
| "learning_rate": 0.00016125446030487643, |
| "loss": 0.3837, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.1001642036124795, |
| "grad_norm": 0.2312352658660111, |
| "learning_rate": 0.00015973288907798842, |
| "loss": 0.3399, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.116584564860427, |
| "grad_norm": 0.3016093942936665, |
| "learning_rate": 0.0001581895058041629, |
| "loss": 0.353, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.1330049261083743, |
| "grad_norm": 0.20329768179994662, |
| "learning_rate": 0.00015662487406485273, |
| "loss": 0.3567, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.1494252873563218, |
| "grad_norm": 0.24197160472933935, |
| "learning_rate": 0.00015503956520059525, |
| "loss": 0.351, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.1494252873563218, |
| "eval_loss": 0.3857228755950928, |
| "eval_runtime": 182.3099, |
| "eval_samples_per_second": 23.756, |
| "eval_steps_per_second": 2.973, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.1658456486042692, |
| "grad_norm": 0.2224497243857267, |
| "learning_rate": 0.0001534341581023814, |
| "loss": 0.3603, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.1822660098522166, |
| "grad_norm": 0.20517751734717224, |
| "learning_rate": 0.00015180923900026848, |
| "loss": 0.3681, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.1986863711001643, |
| "grad_norm": 0.2731449513925482, |
| "learning_rate": 0.0001501654012493121, |
| "loss": 0.3812, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.2151067323481117, |
| "grad_norm": 0.24191327156963474, |
| "learning_rate": 0.0001485032451128971, |
| "loss": 0.3665, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.2315270935960592, |
| "grad_norm": 0.21627312539889124, |
| "learning_rate": 0.00014682337754354534, |
| "loss": 0.3564, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.2479474548440066, |
| "grad_norm": 0.17826400495436648, |
| "learning_rate": 0.00014512641196128115, |
| "loss": 0.3705, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.264367816091954, |
| "grad_norm": 0.24384067958647468, |
| "learning_rate": 0.000143412968029635, |
| "loss": 0.3659, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.2807881773399015, |
| "grad_norm": 0.2622553973852724, |
| "learning_rate": 0.00014168367142936735, |
| "loss": 0.3777, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.297208538587849, |
| "grad_norm": 0.24127081497225922, |
| "learning_rate": 0.00013993915362999515, |
| "loss": 0.3636, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.3136288998357963, |
| "grad_norm": 0.20872541872479608, |
| "learning_rate": 0.00013818005165920467, |
| "loss": 0.3613, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.3136288998357963, |
| "eval_loss": 0.3825320899486542, |
| "eval_runtime": 181.276, |
| "eval_samples_per_second": 23.892, |
| "eval_steps_per_second": 2.99, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.3300492610837438, |
| "grad_norm": 0.22884530054863364, |
| "learning_rate": 0.00013640700787023464, |
| "loss": 0.3487, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.3464696223316914, |
| "grad_norm": 0.3192682013485456, |
| "learning_rate": 0.00013462066970731454, |
| "loss": 0.3644, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.3628899835796386, |
| "grad_norm": 0.24655216410147654, |
| "learning_rate": 0.00013282168946924424, |
| "loss": 0.3571, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.3793103448275863, |
| "grad_norm": 0.18787989160895535, |
| "learning_rate": 0.00013101072407120057, |
| "loss": 0.351, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.3957307060755337, |
| "grad_norm": 0.1848697738688275, |
| "learning_rate": 0.0001291884348048584, |
| "loss": 0.3891, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.4121510673234812, |
| "grad_norm": 0.22237495334933807, |
| "learning_rate": 0.00012735548709691356, |
| "loss": 0.3774, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 0.2466680297782571, |
| "learning_rate": 0.0001255125502660958, |
| "loss": 0.3733, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.444991789819376, |
| "grad_norm": 0.2335451857601313, |
| "learning_rate": 0.0001236602972787604, |
| "loss": 0.3826, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.4614121510673235, |
| "grad_norm": 0.26118360054137263, |
| "learning_rate": 0.00012179940450314816, |
| "loss": 0.3742, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.477832512315271, |
| "grad_norm": 0.2378914948570509, |
| "learning_rate": 0.00011993055146240273, |
| "loss": 0.3655, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.477832512315271, |
| "eval_loss": 0.3792349696159363, |
| "eval_runtime": 181.1939, |
| "eval_samples_per_second": 23.903, |
| "eval_steps_per_second": 2.991, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.4942528735632183, |
| "grad_norm": 0.255487972388616, |
| "learning_rate": 0.00011805442058643621, |
| "loss": 0.3614, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.5106732348111658, |
| "grad_norm": 0.19677148734311173, |
| "learning_rate": 0.00011617169696273325, |
| "loss": 0.3607, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.5270935960591134, |
| "grad_norm": 0.2034148205786053, |
| "learning_rate": 0.00011428306808618456, |
| "loss": 0.3581, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.5435139573070606, |
| "grad_norm": 0.2172139938997764, |
| "learning_rate": 0.00011238922360804159, |
| "loss": 0.3513, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.5599343185550083, |
| "grad_norm": 0.2086331597480039, |
| "learning_rate": 0.00011049085508408348, |
| "loss": 0.3405, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.5763546798029555, |
| "grad_norm": 0.21489224754509587, |
| "learning_rate": 0.00010858865572208892, |
| "loss": 0.3696, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.5927750410509032, |
| "grad_norm": 0.18831018208877479, |
| "learning_rate": 0.00010668332012870437, |
| "loss": 0.3422, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.6091954022988506, |
| "grad_norm": 0.25159259320461275, |
| "learning_rate": 0.00010477554405580183, |
| "loss": 0.3452, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.625615763546798, |
| "grad_norm": 0.21031599290249967, |
| "learning_rate": 0.00010286602414641817, |
| "loss": 0.3521, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.6420361247947455, |
| "grad_norm": 0.19651891254691545, |
| "learning_rate": 0.00010095545768036913, |
| "loss": 0.3849, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.6420361247947455, |
| "eval_loss": 0.3762163817882538, |
| "eval_runtime": 181.2331, |
| "eval_samples_per_second": 23.897, |
| "eval_steps_per_second": 2.991, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.658456486042693, |
| "grad_norm": 0.19968900716509402, |
| "learning_rate": 9.904454231963089e-05, |
| "loss": 0.3854, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.6748768472906403, |
| "grad_norm": 0.20362598590099687, |
| "learning_rate": 9.713397585358188e-05, |
| "loss": 0.3768, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.6912972085385878, |
| "grad_norm": 0.22197361327539747, |
| "learning_rate": 9.52244559441982e-05, |
| "loss": 0.345, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.7077175697865354, |
| "grad_norm": 0.21984040718616688, |
| "learning_rate": 9.331667987129567e-05, |
| "loss": 0.3391, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.7241379310344827, |
| "grad_norm": 0.19128872792880933, |
| "learning_rate": 9.14113442779111e-05, |
| "loss": 0.3483, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.7405582922824303, |
| "grad_norm": 0.1799425595659442, |
| "learning_rate": 8.950914491591653e-05, |
| "loss": 0.3581, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.7569786535303775, |
| "grad_norm": 0.24469075185624548, |
| "learning_rate": 8.761077639195845e-05, |
| "loss": 0.3624, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.7733990147783252, |
| "grad_norm": 0.19922186890571247, |
| "learning_rate": 8.571693191381545e-05, |
| "loss": 0.3406, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.7898193760262726, |
| "grad_norm": 0.21371381860320704, |
| "learning_rate": 8.38283030372668e-05, |
| "loss": 0.3924, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.80623973727422, |
| "grad_norm": 0.22498092502689765, |
| "learning_rate": 8.194557941356382e-05, |
| "loss": 0.3373, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.80623973727422, |
| "eval_loss": 0.37362968921661377, |
| "eval_runtime": 182.0338, |
| "eval_samples_per_second": 23.792, |
| "eval_steps_per_second": 2.977, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.8226600985221675, |
| "grad_norm": 0.20923612184407323, |
| "learning_rate": 8.006944853759732e-05, |
| "loss": 0.3539, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.839080459770115, |
| "grad_norm": 0.197165934554251, |
| "learning_rate": 7.820059549685185e-05, |
| "loss": 0.3366, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.8555008210180624, |
| "grad_norm": 0.2479628844731751, |
| "learning_rate": 7.63397027212396e-05, |
| "loss": 0.3533, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.8719211822660098, |
| "grad_norm": 0.2048613348077339, |
| "learning_rate": 7.448744973390422e-05, |
| "loss": 0.3433, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.8883415435139574, |
| "grad_norm": 0.1978616785725276, |
| "learning_rate": 7.264451290308642e-05, |
| "loss": 0.358, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.9047619047619047, |
| "grad_norm": 0.23907776822210447, |
| "learning_rate": 7.081156519514162e-05, |
| "loss": 0.3373, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.9211822660098523, |
| "grad_norm": 0.22697517505792655, |
| "learning_rate": 6.898927592879945e-05, |
| "loss": 0.3677, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.9376026272577995, |
| "grad_norm": 0.19952355876882955, |
| "learning_rate": 6.71783105307558e-05, |
| "loss": 0.3586, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.9540229885057472, |
| "grad_norm": 0.23973505798542807, |
| "learning_rate": 6.537933029268545e-05, |
| "loss": 0.351, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.9704433497536946, |
| "grad_norm": 0.21045951152349507, |
| "learning_rate": 6.359299212976534e-05, |
| "loss": 0.358, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.9704433497536946, |
| "eval_loss": 0.37114256620407104, |
| "eval_runtime": 181.5295, |
| "eval_samples_per_second": 23.858, |
| "eval_steps_per_second": 2.986, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.986863711001642, |
| "grad_norm": 0.2178513568906315, |
| "learning_rate": 6.181994834079534e-05, |
| "loss": 0.3661, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.0032840722495897, |
| "grad_norm": 0.22255069074338651, |
| "learning_rate": 6.006084637000486e-05, |
| "loss": 0.3468, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.019704433497537, |
| "grad_norm": 0.2146784712653626, |
| "learning_rate": 5.8316328570632706e-05, |
| "loss": 0.3288, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.0361247947454846, |
| "grad_norm": 0.19842848576771832, |
| "learning_rate": 5.6587031970365034e-05, |
| "loss": 0.3428, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.052545155993432, |
| "grad_norm": 0.2480996901650553, |
| "learning_rate": 5.487358803871887e-05, |
| "loss": 0.317, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.0689655172413794, |
| "grad_norm": 0.25320522417631036, |
| "learning_rate": 5.3176622456454693e-05, |
| "loss": 0.3458, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.0853858784893267, |
| "grad_norm": 0.2761010309663585, |
| "learning_rate": 5.1496754887102924e-05, |
| "loss": 0.344, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.1018062397372743, |
| "grad_norm": 0.207714571300336, |
| "learning_rate": 4.98345987506879e-05, |
| "loss": 0.3346, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.1182266009852215, |
| "grad_norm": 0.26011454554235863, |
| "learning_rate": 4.8190760999731524e-05, |
| "loss": 0.308, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.134646962233169, |
| "grad_norm": 0.22344608598475113, |
| "learning_rate": 4.6565841897618615e-05, |
| "loss": 0.3476, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.134646962233169, |
| "eval_loss": 0.37267637252807617, |
| "eval_runtime": 181.518, |
| "eval_samples_per_second": 23.86, |
| "eval_steps_per_second": 2.986, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.1510673234811164, |
| "grad_norm": 0.2967215583254955, |
| "learning_rate": 4.496043479940478e-05, |
| "loss": 0.3257, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.167487684729064, |
| "grad_norm": 0.2606814382277013, |
| "learning_rate": 4.337512593514729e-05, |
| "loss": 0.3386, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.1839080459770113, |
| "grad_norm": 0.23459530147231222, |
| "learning_rate": 4.181049419583713e-05, |
| "loss": 0.3222, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.200328407224959, |
| "grad_norm": 0.22931006977757953, |
| "learning_rate": 4.026711092201162e-05, |
| "loss": 0.3485, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.2167487684729066, |
| "grad_norm": 0.21406254032335828, |
| "learning_rate": 3.8745539695123575e-05, |
| "loss": 0.3317, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.233169129720854, |
| "grad_norm": 0.2246772369192269, |
| "learning_rate": 3.724633613174429e-05, |
| "loss": 0.3195, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.2495894909688015, |
| "grad_norm": 0.20058597054661914, |
| "learning_rate": 3.577004768067456e-05, |
| "loss": 0.3353, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.2660098522167487, |
| "grad_norm": 0.19548049551690252, |
| "learning_rate": 3.431721342303839e-05, |
| "loss": 0.3435, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.2824302134646963, |
| "grad_norm": 0.3209884551646414, |
| "learning_rate": 3.288836387543247e-05, |
| "loss": 0.3065, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.2988505747126435, |
| "grad_norm": 0.22671041629202113, |
| "learning_rate": 3.148402079620261e-05, |
| "loss": 0.3318, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.2988505747126435, |
| "eval_loss": 0.3717256188392639, |
| "eval_runtime": 181.4415, |
| "eval_samples_per_second": 23.87, |
| "eval_steps_per_second": 2.987, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.315270935960591, |
| "grad_norm": 0.2030483834077384, |
| "learning_rate": 3.01046969949188e-05, |
| "loss": 0.3064, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.3316912972085384, |
| "grad_norm": 0.23788365593483962, |
| "learning_rate": 2.8750896145117657e-05, |
| "loss": 0.3179, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.348111658456486, |
| "grad_norm": 0.19692059961575673, |
| "learning_rate": 2.7423112600381206e-05, |
| "loss": 0.3131, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.3645320197044333, |
| "grad_norm": 0.2237818052949346, |
| "learning_rate": 2.6121831213818827e-05, |
| "loss": 0.3253, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "grad_norm": 0.19504196499026782, |
| "learning_rate": 2.4847527161018357e-05, |
| "loss": 0.3357, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.3973727422003286, |
| "grad_norm": 0.22539099442687743, |
| "learning_rate": 2.3600665766531217e-05, |
| "loss": 0.3046, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.413793103448276, |
| "grad_norm": 0.22439054668185862, |
| "learning_rate": 2.2381702333954434e-05, |
| "loss": 0.3077, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.4302134646962235, |
| "grad_norm": 0.2712680693548686, |
| "learning_rate": 2.119108197967228e-05, |
| "loss": 0.3363, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.4466338259441707, |
| "grad_norm": 0.24707851488677393, |
| "learning_rate": 2.002923947031753e-05, |
| "loss": 0.3383, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.4630541871921183, |
| "grad_norm": 0.2441408851266546, |
| "learning_rate": 1.8896599064012298e-05, |
| "loss": 0.3309, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.4630541871921183, |
| "eval_loss": 0.3708588480949402, |
| "eval_runtime": 183.0453, |
| "eval_samples_per_second": 23.661, |
| "eval_steps_per_second": 2.961, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.4794745484400655, |
| "grad_norm": 0.33265270775799227, |
| "learning_rate": 1.779357435544603e-05, |
| "loss": 0.3127, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.495894909688013, |
| "grad_norm": 0.27253862679732843, |
| "learning_rate": 1.6720568124847245e-05, |
| "loss": 0.3165, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.512315270935961, |
| "grad_norm": 0.2346747331686747, |
| "learning_rate": 1.5677972190904622e-05, |
| "loss": 0.3288, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.528735632183908, |
| "grad_norm": 0.22806719713684484, |
| "learning_rate": 1.4666167267690345e-05, |
| "loss": 0.3314, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.5451559934318553, |
| "grad_norm": 0.29265105480331116, |
| "learning_rate": 1.3685522825638897e-05, |
| "loss": 0.3397, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.561576354679803, |
| "grad_norm": 0.23724928573204634, |
| "learning_rate": 1.273639695663108e-05, |
| "loss": 0.3218, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.5779967159277506, |
| "grad_norm": 0.26141011452015106, |
| "learning_rate": 1.1819136243233487e-05, |
| "loss": 0.3177, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.594417077175698, |
| "grad_norm": 0.2031906424566249, |
| "learning_rate": 1.093407563214036e-05, |
| "loss": 0.3231, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.6108374384236455, |
| "grad_norm": 0.21755775030563373, |
| "learning_rate": 1.0081538311864569e-05, |
| "loss": 0.3339, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.6272577996715927, |
| "grad_norm": 0.21842386508748884, |
| "learning_rate": 9.261835594722213e-06, |
| "loss": 0.3141, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.6272577996715927, |
| "eval_loss": 0.3702554702758789, |
| "eval_runtime": 182.4061, |
| "eval_samples_per_second": 23.744, |
| "eval_steps_per_second": 2.971, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.6436781609195403, |
| "grad_norm": 0.236951072287543, |
| "learning_rate": 8.475266803153891e-06, |
| "loss": 0.3401, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.6600985221674875, |
| "grad_norm": 0.2525139633376538, |
| "learning_rate": 7.722119160424112e-06, |
| "loss": 0.3085, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.676518883415435, |
| "grad_norm": 0.2516229325471898, |
| "learning_rate": 7.002667685739006e-06, |
| "loss": 0.3022, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.692939244663383, |
| "grad_norm": 0.26050295140139745, |
| "learning_rate": 6.317175093820371e-06, |
| "loss": 0.3267, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.70935960591133, |
| "grad_norm": 0.18224262926232945, |
| "learning_rate": 5.6658916989727695e-06, |
| "loss": 0.303, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.7257799671592773, |
| "grad_norm": 0.24832596165336826, |
| "learning_rate": 5.049055323678886e-06, |
| "loss": 0.321, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.742200328407225, |
| "grad_norm": 0.2590753148743458, |
| "learning_rate": 4.466891211756297e-06, |
| "loss": 0.3262, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.7586206896551726, |
| "grad_norm": 0.2428540816617657, |
| "learning_rate": 3.919611946107493e-06, |
| "loss": 0.3349, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.77504105090312, |
| "grad_norm": 0.26458866020693184, |
| "learning_rate": 3.40741737109318e-06, |
| "loss": 0.3162, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.7914614121510675, |
| "grad_norm": 0.19323777891652238, |
| "learning_rate": 2.930494519557114e-06, |
| "loss": 0.3252, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.7914614121510675, |
| "eval_loss": 0.3698328137397766, |
| "eval_runtime": 180.1089, |
| "eval_samples_per_second": 24.047, |
| "eval_steps_per_second": 3.009, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.8078817733990147, |
| "grad_norm": 0.22306709767178198, |
| "learning_rate": 2.489017544529315e-06, |
| "loss": 0.3294, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.8243021346469623, |
| "grad_norm": 0.21362629340229825, |
| "learning_rate": 2.083147655632289e-06, |
| "loss": 0.3452, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.8407224958949095, |
| "grad_norm": 0.2451092641924001, |
| "learning_rate": 1.7130330602138644e-06, |
| "loss": 0.3244, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.24087531125607498, |
| "learning_rate": 1.378808909227769e-06, |
| "loss": 0.3331, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.873563218390805, |
| "grad_norm": 0.2263212098087392, |
| "learning_rate": 1.0805972478819425e-06, |
| "loss": 0.3214, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.889983579638752, |
| "grad_norm": 0.23088744114799653, |
| "learning_rate": 8.185069710725524e-07, |
| "loss": 0.3132, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.9064039408866993, |
| "grad_norm": 0.24275972838675813, |
| "learning_rate": 5.926337836199891e-07, |
| "loss": 0.3223, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.922824302134647, |
| "grad_norm": 0.23934161415457397, |
| "learning_rate": 4.0306016532126734e-07, |
| "loss": 0.3322, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.9392446633825946, |
| "grad_norm": 0.21022506463435206, |
| "learning_rate": 2.4985534083176166e-07, |
| "loss": 0.3284, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.955665024630542, |
| "grad_norm": 0.21555995324889698, |
| "learning_rate": 1.330752543871161e-07, |
| "loss": 0.3446, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.955665024630542, |
| "eval_loss": 0.3698555529117584, |
| "eval_runtime": 177.5603, |
| "eval_samples_per_second": 24.392, |
| "eval_steps_per_second": 3.052, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.9720853858784895, |
| "grad_norm": 0.2518284713990051, |
| "learning_rate": 5.2762549374685275e-08, |
| "loss": 0.3281, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.9885057471264367, |
| "grad_norm": 0.2209423779742796, |
| "learning_rate": 8.946552761890382e-09, |
| "loss": 0.3178, |
| "step": 1820 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 1827, |
| "total_flos": 5848557328269312.0, |
| "train_loss": 0.38001893711142287, |
| "train_runtime": 19748.2381, |
| "train_samples_per_second": 5.92, |
| "train_steps_per_second": 0.093 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1827, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5848557328269312.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|