{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 1350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007410151908114116, "grad_norm": 4.45390510559082, "learning_rate": 1.3333333333333334e-07, "loss": 1.2745, "step": 10 }, { "epoch": 0.014820303816228233, "grad_norm": 4.346654415130615, "learning_rate": 2.814814814814815e-07, "loss": 1.2289, "step": 20 }, { "epoch": 0.02223045572434235, "grad_norm": 3.9740023612976074, "learning_rate": 4.296296296296296e-07, "loss": 1.2757, "step": 30 }, { "epoch": 0.029640607632456465, "grad_norm": 3.878234386444092, "learning_rate": 5.777777777777777e-07, "loss": 1.2404, "step": 40 }, { "epoch": 0.037050759540570584, "grad_norm": 4.405137538909912, "learning_rate": 7.259259259259259e-07, "loss": 1.2627, "step": 50 }, { "epoch": 0.037050759540570584, "eval_loss": 1.2372760772705078, "eval_runtime": 208.6218, "eval_samples_per_second": 2.723, "eval_steps_per_second": 1.361, "step": 50 }, { "epoch": 0.0444609114486847, "grad_norm": 3.6006131172180176, "learning_rate": 8.740740740740741e-07, "loss": 1.2097, "step": 60 }, { "epoch": 0.051871063356798815, "grad_norm": 3.857161045074463, "learning_rate": 1.0222222222222221e-06, "loss": 1.0813, "step": 70 }, { "epoch": 0.05928121526491293, "grad_norm": 2.1402502059936523, "learning_rate": 1.1703703703703702e-06, "loss": 1.0065, "step": 80 }, { "epoch": 0.06669136717302705, "grad_norm": 2.2522549629211426, "learning_rate": 1.3185185185185184e-06, "loss": 0.885, "step": 90 }, { "epoch": 0.07410151908114117, "grad_norm": 1.8222051858901978, "learning_rate": 1.4666666666666665e-06, "loss": 0.8144, "step": 100 }, { "epoch": 0.07410151908114117, "eval_loss": 0.7972270846366882, "eval_runtime": 208.2662, "eval_samples_per_second": 2.727, "eval_steps_per_second": 1.364, "step": 100 }, { "epoch": 0.08151167098925528, "grad_norm": 1.2294323444366455, "learning_rate": 1.614814814814815e-06, "loss": 0.763, "step": 110 }, { "epoch": 0.0889218228973694, "grad_norm": 1.1122323274612427, "learning_rate": 1.762962962962963e-06, "loss": 0.7137, "step": 120 }, { "epoch": 0.09633197480548351, "grad_norm": 0.8735978007316589, "learning_rate": 1.9111111111111112e-06, "loss": 0.6903, "step": 130 }, { "epoch": 0.10374212671359763, "grad_norm": 0.7982610464096069, "learning_rate": 1.9999465148392903e-06, "loss": 0.6381, "step": 140 }, { "epoch": 0.11115227862171174, "grad_norm": 1.082419753074646, "learning_rate": 1.999344872485215e-06, "loss": 0.6285, "step": 150 }, { "epoch": 0.11115227862171174, "eval_loss": 0.6020215749740601, "eval_runtime": 208.1618, "eval_samples_per_second": 2.729, "eval_steps_per_second": 1.364, "step": 150 }, { "epoch": 0.11856243052982586, "grad_norm": 0.9158598780632019, "learning_rate": 1.9980751348850217e-06, "loss": 0.5996, "step": 160 }, { "epoch": 0.12597258243794, "grad_norm": 0.8013431429862976, "learning_rate": 1.996138150900478e-06, "loss": 0.5625, "step": 170 }, { "epoch": 0.1333827343460541, "grad_norm": 0.6254628300666809, "learning_rate": 1.9935352154697255e-06, "loss": 0.5658, "step": 180 }, { "epoch": 0.14079288625416822, "grad_norm": 0.7743931412696838, "learning_rate": 1.99026806874157e-06, "loss": 0.5308, "step": 190 }, { "epoch": 0.14820303816228234, "grad_norm": 0.8001664280891418, "learning_rate": 1.986338894912137e-06, "loss": 0.4969, "step": 200 }, { "epoch": 0.14820303816228234, "eval_loss": 0.5175904035568237, "eval_runtime": 208.2728, "eval_samples_per_second": 2.727, "eval_steps_per_second": 1.364, "step": 200 }, { "epoch": 0.15561319007039645, "grad_norm": 0.7069100141525269, "learning_rate": 1.9817503207646603e-06, "loss": 0.4996, "step": 210 }, { "epoch": 0.16302334197851057, "grad_norm": 1.052976131439209, "learning_rate": 1.9765054139133926e-06, "loss": 0.4658, "step": 220 }, { "epoch": 0.17043349388662468, "grad_norm": 0.739701509475708, "learning_rate": 1.970607680752804e-06, "loss": 0.4852, "step": 230 }, { "epoch": 0.1778436457947388, "grad_norm": 1.0828092098236084, "learning_rate": 1.9640610641134382e-06, "loss": 0.4793, "step": 240 }, { "epoch": 0.1852537977028529, "grad_norm": 0.8444927930831909, "learning_rate": 1.956869940626001e-06, "loss": 0.4198, "step": 250 }, { "epoch": 0.1852537977028529, "eval_loss": 0.45384249091148376, "eval_runtime": 208.3355, "eval_samples_per_second": 2.726, "eval_steps_per_second": 1.363, "step": 250 }, { "epoch": 0.19266394961096703, "grad_norm": 0.9700266122817993, "learning_rate": 1.9490391177954383e-06, "loss": 0.4512, "step": 260 }, { "epoch": 0.20007410151908114, "grad_norm": 0.8187345862388611, "learning_rate": 1.940573830786956e-06, "loss": 0.419, "step": 270 }, { "epoch": 0.20748425342719526, "grad_norm": 0.7705732583999634, "learning_rate": 1.9314797389261425e-06, "loss": 0.4329, "step": 280 }, { "epoch": 0.21489440533530937, "grad_norm": 0.8967396020889282, "learning_rate": 1.921762921915517e-06, "loss": 0.4267, "step": 290 }, { "epoch": 0.2223045572434235, "grad_norm": 0.8268992304801941, "learning_rate": 1.911429875770051e-06, "loss": 0.3825, "step": 300 }, { "epoch": 0.2223045572434235, "eval_loss": 0.3979480266571045, "eval_runtime": 208.4142, "eval_samples_per_second": 2.725, "eval_steps_per_second": 1.363, "step": 300 }, { "epoch": 0.2297147091515376, "grad_norm": 0.8976253867149353, "learning_rate": 1.9004875084743622e-06, "loss": 0.4011, "step": 310 }, { "epoch": 0.23712486105965172, "grad_norm": 1.0648952722549438, "learning_rate": 1.8889431353645002e-06, "loss": 0.3914, "step": 320 }, { "epoch": 0.24453501296776584, "grad_norm": 1.5843675136566162, "learning_rate": 1.8768044742374006e-06, "loss": 0.392, "step": 330 }, { "epoch": 0.25194516487588, "grad_norm": 1.2248514890670776, "learning_rate": 1.8640796401912805e-06, "loss": 0.3595, "step": 340 }, { "epoch": 0.2593553167839941, "grad_norm": 1.2559500932693481, "learning_rate": 1.8507771402004266e-06, "loss": 0.355, "step": 350 }, { "epoch": 0.2593553167839941, "eval_loss": 0.34925511479377747, "eval_runtime": 208.5081, "eval_samples_per_second": 2.724, "eval_steps_per_second": 1.362, "step": 350 }, { "epoch": 0.2667654686921082, "grad_norm": 1.2216342687606812, "learning_rate": 1.8369058674280002e-06, "loss": 0.315, "step": 360 }, { "epoch": 0.2741756206002223, "grad_norm": 1.0528122186660767, "learning_rate": 1.8224750952806621e-06, "loss": 0.3279, "step": 370 }, { "epoch": 0.28158577250833644, "grad_norm": 0.948635995388031, "learning_rate": 1.8074944712089923e-06, "loss": 0.3288, "step": 380 }, { "epoch": 0.28899592441645056, "grad_norm": 1.1430398225784302, "learning_rate": 1.791974010257848e-06, "loss": 0.3157, "step": 390 }, { "epoch": 0.29640607632456467, "grad_norm": 1.2426784038543701, "learning_rate": 1.7759240883709743e-06, "loss": 0.2976, "step": 400 }, { "epoch": 0.29640607632456467, "eval_loss": 0.3070007562637329, "eval_runtime": 208.3886, "eval_samples_per_second": 2.726, "eval_steps_per_second": 1.363, "step": 400 }, { "epoch": 0.3038162282326788, "grad_norm": 1.1574546098709106, "learning_rate": 1.7593554354543415e-06, "loss": 0.3031, "step": 410 }, { "epoch": 0.3112263801407929, "grad_norm": 1.3627562522888184, "learning_rate": 1.7422791282028455e-06, "loss": 0.2916, "step": 420 }, { "epoch": 0.318636532048907, "grad_norm": 1.4292229413986206, "learning_rate": 1.7247065826951692e-06, "loss": 0.2731, "step": 430 }, { "epoch": 0.32604668395702113, "grad_norm": 1.2023752927780151, "learning_rate": 1.706649546761755e-06, "loss": 0.2734, "step": 440 }, { "epoch": 0.33345683586513525, "grad_norm": 1.4670097827911377, "learning_rate": 1.6881200921309913e-06, "loss": 0.2547, "step": 450 }, { "epoch": 0.33345683586513525, "eval_loss": 0.2757605016231537, "eval_runtime": 208.5349, "eval_samples_per_second": 2.724, "eval_steps_per_second": 1.362, "step": 450 }, { "epoch": 0.34086698777324936, "grad_norm": 1.222509503364563, "learning_rate": 1.669130606358858e-06, "loss": 0.2549, "step": 460 }, { "epoch": 0.3482771396813635, "grad_norm": 1.0831719636917114, "learning_rate": 1.6496937845474371e-06, "loss": 0.2878, "step": 470 }, { "epoch": 0.3556872915894776, "grad_norm": 1.6136417388916016, "learning_rate": 1.6298226208578124e-06, "loss": 0.2495, "step": 480 }, { "epoch": 0.3630974434975917, "grad_norm": 1.2490746974945068, "learning_rate": 1.6095303998230431e-06, "loss": 0.2541, "step": 490 }, { "epoch": 0.3705075954057058, "grad_norm": 1.3669530153274536, "learning_rate": 1.5888306874670112e-06, "loss": 0.2537, "step": 500 }, { "epoch": 0.3705075954057058, "eval_loss": 0.2523694634437561, "eval_runtime": 208.3135, "eval_samples_per_second": 2.727, "eval_steps_per_second": 1.363, "step": 500 }, { "epoch": 0.37791774731381994, "grad_norm": 1.2288622856140137, "learning_rate": 1.567737322235084e-06, "loss": 0.2494, "step": 510 }, { "epoch": 0.38532789922193406, "grad_norm": 1.4230667352676392, "learning_rate": 1.546264405742654e-06, "loss": 0.2437, "step": 520 }, { "epoch": 0.39273805113004817, "grad_norm": 2.109126091003418, "learning_rate": 1.5244262933477398e-06, "loss": 0.2418, "step": 530 }, { "epoch": 0.4001482030381623, "grad_norm": 1.7550323009490967, "learning_rate": 1.5022375845539534e-06, "loss": 0.2358, "step": 540 }, { "epoch": 0.4075583549462764, "grad_norm": 1.5699337720870972, "learning_rate": 1.4797131132502464e-06, "loss": 0.2403, "step": 550 }, { "epoch": 0.4075583549462764, "eval_loss": 0.23371076583862305, "eval_runtime": 209.4346, "eval_samples_per_second": 2.712, "eval_steps_per_second": 1.356, "step": 550 }, { "epoch": 0.4149685068543905, "grad_norm": 1.7377654314041138, "learning_rate": 1.4568679377939617e-06, "loss": 0.2189, "step": 560 }, { "epoch": 0.42237865876250463, "grad_norm": 1.5883984565734863, "learning_rate": 1.4337173309438233e-06, "loss": 0.2268, "step": 570 }, { "epoch": 0.42978881067061875, "grad_norm": 1.6136311292648315, "learning_rate": 1.4102767696495883e-06, "loss": 0.2187, "step": 580 }, { "epoch": 0.43719896257873286, "grad_norm": 1.569895625114441, "learning_rate": 1.3865619247051915e-06, "loss": 0.2132, "step": 590 }, { "epoch": 0.444609114486847, "grad_norm": 1.6274341344833374, "learning_rate": 1.3625886502723008e-06, "loss": 0.2022, "step": 600 }, { "epoch": 0.444609114486847, "eval_loss": 0.21966929733753204, "eval_runtime": 209.6627, "eval_samples_per_second": 2.709, "eval_steps_per_second": 1.355, "step": 600 }, { "epoch": 0.4520192663949611, "grad_norm": 1.511763095855713, "learning_rate": 1.338372973281281e-06, "loss": 0.2026, "step": 610 }, { "epoch": 0.4594294183030752, "grad_norm": 1.2806991338729858, "learning_rate": 1.3139310827166612e-06, "loss": 0.217, "step": 620 }, { "epoch": 0.4668395702111893, "grad_norm": 1.5671716928482056, "learning_rate": 1.2892793187942586e-06, "loss": 0.2002, "step": 630 }, { "epoch": 0.47424972211930344, "grad_norm": 1.7969197034835815, "learning_rate": 1.2644341620372023e-06, "loss": 0.2004, "step": 640 }, { "epoch": 0.48165987402741756, "grad_norm": 1.4059193134307861, "learning_rate": 1.2394122222581555e-06, "loss": 0.2023, "step": 650 }, { "epoch": 0.48165987402741756, "eval_loss": 0.2078457921743393, "eval_runtime": 209.6676, "eval_samples_per_second": 2.709, "eval_steps_per_second": 1.355, "step": 650 }, { "epoch": 0.48907002593553167, "grad_norm": 1.4455323219299316, "learning_rate": 1.214230227455106e-06, "loss": 0.2059, "step": 660 }, { "epoch": 0.4964801778436458, "grad_norm": 1.5389137268066406, "learning_rate": 1.1889050126281403e-06, "loss": 0.2017, "step": 670 }, { "epoch": 0.50389032975176, "grad_norm": 1.3474076986312866, "learning_rate": 1.1634535085246902e-06, "loss": 0.1841, "step": 680 }, { "epoch": 0.5113004816598741, "grad_norm": 1.399523138999939, "learning_rate": 1.1378927303207636e-06, "loss": 0.2018, "step": 690 }, { "epoch": 0.5187106335679882, "grad_norm": 1.4829585552215576, "learning_rate": 1.112239766245735e-06, "loss": 0.1985, "step": 700 }, { "epoch": 0.5187106335679882, "eval_loss": 0.1990778148174286, "eval_runtime": 209.7592, "eval_samples_per_second": 2.708, "eval_steps_per_second": 1.354, "step": 700 }, { "epoch": 0.5261207854761023, "grad_norm": 1.460558295249939, "learning_rate": 1.0865117661582956e-06, "loss": 0.1789, "step": 710 }, { "epoch": 0.5335309373842164, "grad_norm": 1.1899714469909668, "learning_rate": 1.0607259300812045e-06, "loss": 0.2072, "step": 720 }, { "epoch": 0.5409410892923305, "grad_norm": 1.4120274782180786, "learning_rate": 1.034899496702501e-06, "loss": 0.1887, "step": 730 }, { "epoch": 0.5483512412004447, "grad_norm": 1.3966922760009766, "learning_rate": 1.0090497318508686e-06, "loss": 0.1723, "step": 740 }, { "epoch": 0.5557613931085588, "grad_norm": 1.6820755004882812, "learning_rate": 9.831939169528563e-07, "loss": 0.195, "step": 750 }, { "epoch": 0.5557613931085588, "eval_loss": 0.19260452687740326, "eval_runtime": 209.5514, "eval_samples_per_second": 2.711, "eval_steps_per_second": 1.355, "step": 750 }, { "epoch": 0.5631715450166729, "grad_norm": 1.6537615060806274, "learning_rate": 9.57349337479669e-07, "loss": 0.1922, "step": 760 }, { "epoch": 0.570581696924787, "grad_norm": 1.4756197929382324, "learning_rate": 9.315332713912591e-07, "loss": 0.1771, "step": 770 }, { "epoch": 0.5779918488329011, "grad_norm": 1.5430341958999634, "learning_rate": 9.057629775854314e-07, "loss": 0.1693, "step": 780 }, { "epoch": 0.5854020007410152, "grad_norm": 1.7049578428268433, "learning_rate": 8.800556843597001e-07, "loss": 0.1875, "step": 790 }, { "epoch": 0.5928121526491293, "grad_norm": 1.6176693439483643, "learning_rate": 8.544285778936002e-07, "loss": 0.1737, "step": 800 }, { "epoch": 0.5928121526491293, "eval_loss": 0.18725676834583282, "eval_runtime": 209.6187, "eval_samples_per_second": 2.71, "eval_steps_per_second": 1.355, "step": 800 }, { "epoch": 0.6002223045572435, "grad_norm": 1.9868125915527344, "learning_rate": 8.288987907591518e-07, "loss": 0.1754, "step": 810 }, { "epoch": 0.6076324564653576, "grad_norm": 1.5091502666473389, "learning_rate": 8.034833904671697e-07, "loss": 0.1815, "step": 820 }, { "epoch": 0.6150426083734717, "grad_norm": 2.074658155441284, "learning_rate": 7.781993680570655e-07, "loss": 0.1834, "step": 830 }, { "epoch": 0.6224527602815858, "grad_norm": 1.2912707328796387, "learning_rate": 7.530636267377706e-07, "loss": 0.1816, "step": 840 }, { "epoch": 0.6298629121896999, "grad_norm": 1.5129729509353638, "learning_rate": 7.280929705873818e-07, "loss": 0.1864, "step": 850 }, { "epoch": 0.6298629121896999, "eval_loss": 0.18269772827625275, "eval_runtime": 209.5398, "eval_samples_per_second": 2.711, "eval_steps_per_second": 1.355, "step": 850 }, { "epoch": 0.637273064097814, "grad_norm": 1.635215163230896, "learning_rate": 7.033040933190774e-07, "loss": 0.1815, "step": 860 }, { "epoch": 0.6446832160059282, "grad_norm": 1.7670304775238037, "learning_rate": 6.787135671208126e-07, "loss": 0.1873, "step": 870 }, { "epoch": 0.6520933679140423, "grad_norm": 1.7472981214523315, "learning_rate": 6.543378315762633e-07, "loss": 0.19, "step": 880 }, { "epoch": 0.6595035198221564, "grad_norm": 1.8691086769104004, "learning_rate": 6.301931826744189e-07, "loss": 0.1914, "step": 890 }, { "epoch": 0.6669136717302705, "grad_norm": 1.5207786560058594, "learning_rate": 6.062957619151703e-07, "loss": 0.1805, "step": 900 }, { "epoch": 0.6669136717302705, "eval_loss": 0.17914490401744843, "eval_runtime": 209.5686, "eval_samples_per_second": 2.71, "eval_steps_per_second": 1.355, "step": 900 }, { "epoch": 0.6743238236383846, "grad_norm": 1.5552278757095337, "learning_rate": 5.826615455181821e-07, "loss": 0.1887, "step": 910 }, { "epoch": 0.6817339755464987, "grad_norm": 1.5720982551574707, "learning_rate": 5.593063337422594e-07, "loss": 0.1743, "step": 920 }, { "epoch": 0.6891441274546128, "grad_norm": 1.781369924545288, "learning_rate": 5.362457403223495e-07, "loss": 0.1772, "step": 930 }, { "epoch": 0.696554279362727, "grad_norm": 1.4669294357299805, "learning_rate": 5.134951820312401e-07, "loss": 0.1897, "step": 940 }, { "epoch": 0.7039644312708411, "grad_norm": 1.097123384475708, "learning_rate": 4.91069868372937e-07, "loss": 0.1744, "step": 950 }, { "epoch": 0.7039644312708411, "eval_loss": 0.1766466647386551, "eval_runtime": 209.6411, "eval_samples_per_second": 2.709, "eval_steps_per_second": 1.355, "step": 950 }, { "epoch": 0.7113745831789552, "grad_norm": 1.3973758220672607, "learning_rate": 4.689847914146041e-07, "loss": 0.1683, "step": 960 }, { "epoch": 0.7187847350870693, "grad_norm": 1.736214280128479, "learning_rate": 4.472547157638673e-07, "loss": 0.1824, "step": 970 }, { "epoch": 0.7261948869951834, "grad_norm": 1.3411122560501099, "learning_rate": 4.258941686981864e-07, "loss": 0.183, "step": 980 }, { "epoch": 0.7336050389032975, "grad_norm": 1.4088919162750244, "learning_rate": 4.0491743045288564e-07, "loss": 0.1826, "step": 990 }, { "epoch": 0.7410151908114117, "grad_norm": 1.4413363933563232, "learning_rate": 3.843385246743417e-07, "loss": 0.1853, "step": 1000 }, { "epoch": 0.7410151908114117, "eval_loss": 0.17452707886695862, "eval_runtime": 209.5833, "eval_samples_per_second": 2.71, "eval_steps_per_second": 1.355, "step": 1000 }, { "epoch": 0.7484253427195258, "grad_norm": 1.4395395517349243, "learning_rate": 3.6417120904471244e-07, "loss": 0.1612, "step": 1010 }, { "epoch": 0.7558354946276399, "grad_norm": 1.5045926570892334, "learning_rate": 3.4442896608446647e-07, "loss": 0.1717, "step": 1020 }, { "epoch": 0.763245646535754, "grad_norm": 1.546872854232788, "learning_rate": 3.2512499413887253e-07, "loss": 0.1665, "step": 1030 }, { "epoch": 0.7706557984438681, "grad_norm": 1.5870492458343506, "learning_rate": 3.0627219855446664e-07, "loss": 0.1653, "step": 1040 }, { "epoch": 0.7780659503519822, "grad_norm": 1.694765329360962, "learning_rate": 2.87883183051398e-07, "loss": 0.1626, "step": 1050 }, { "epoch": 0.7780659503519822, "eval_loss": 0.17290721833705902, "eval_runtime": 209.5418, "eval_samples_per_second": 2.711, "eval_steps_per_second": 1.355, "step": 1050 }, { "epoch": 0.7854761022600963, "grad_norm": 2.193631649017334, "learning_rate": 2.699702412974254e-07, "loss": 0.1637, "step": 1060 }, { "epoch": 0.7928862541682105, "grad_norm": 1.4417670965194702, "learning_rate": 2.525453486891908e-07, "loss": 0.177, "step": 1070 }, { "epoch": 0.8002964060763246, "grad_norm": 1.472188949584961, "learning_rate": 2.356201543462678e-07, "loss": 0.1444, "step": 1080 }, { "epoch": 0.8077065579844387, "grad_norm": 1.3810269832611084, "learning_rate": 2.192059733233408e-07, "loss": 0.1771, "step": 1090 }, { "epoch": 0.8151167098925528, "grad_norm": 1.764819622039795, "learning_rate": 2.03313779045713e-07, "loss": 0.1557, "step": 1100 }, { "epoch": 0.8151167098925528, "eval_loss": 0.17193768918514252, "eval_runtime": 209.8503, "eval_samples_per_second": 2.707, "eval_steps_per_second": 1.353, "step": 1100 }, { "epoch": 0.8225268618006669, "grad_norm": 1.3674615621566772, "learning_rate": 1.8795419597320717e-07, "loss": 0.165, "step": 1110 }, { "epoch": 0.829937013708781, "grad_norm": 1.6381874084472656, "learning_rate": 1.7313749249736264e-07, "loss": 0.1743, "step": 1120 }, { "epoch": 0.8373471656168952, "grad_norm": 1.665426254272461, "learning_rate": 1.5887357407667312e-07, "loss": 0.1535, "step": 1130 }, { "epoch": 0.8447573175250093, "grad_norm": 1.4278993606567383, "learning_rate": 1.451719766144589e-07, "loss": 0.1709, "step": 1140 }, { "epoch": 0.8521674694331234, "grad_norm": 1.9010162353515625, "learning_rate": 1.3204186008379925e-07, "loss": 0.1621, "step": 1150 }, { "epoch": 0.8521674694331234, "eval_loss": 0.1711866408586502, "eval_runtime": 209.8853, "eval_samples_per_second": 2.706, "eval_steps_per_second": 1.353, "step": 1150 }, { "epoch": 0.8595776213412375, "grad_norm": 1.5918656587600708, "learning_rate": 1.1949200240378577e-07, "loss": 0.1866, "step": 1160 }, { "epoch": 0.8669877732493516, "grad_norm": 1.593591570854187, "learning_rate": 1.0753079357119132e-07, "loss": 0.1605, "step": 1170 }, { "epoch": 0.8743979251574657, "grad_norm": 1.415703535079956, "learning_rate": 9.61662300514795e-08, "loss": 0.159, "step": 1180 }, { "epoch": 0.8818080770655798, "grad_norm": 1.7163190841674805, "learning_rate": 8.540590943290127e-08, "loss": 0.1783, "step": 1190 }, { "epoch": 0.889218228973694, "grad_norm": 1.6285511255264282, "learning_rate": 7.525702534725443e-08, "loss": 0.1682, "step": 1200 }, { "epoch": 0.889218228973694, "eval_loss": 0.17063578963279724, "eval_runtime": 209.8969, "eval_samples_per_second": 2.706, "eval_steps_per_second": 1.353, "step": 1200 }, { "epoch": 0.8966283808818081, "grad_norm": 1.5694634914398193, "learning_rate": 6.572636266070264e-08, "loss": 0.1612, "step": 1210 }, { "epoch": 0.9040385327899222, "grad_norm": 1.6561760902404785, "learning_rate": 5.682029293786672e-08, "loss": 0.1703, "step": 1220 }, { "epoch": 0.9114486846980363, "grad_norm": 1.4719998836517334, "learning_rate": 4.854477018222103e-08, "loss": 0.1909, "step": 1230 }, { "epoch": 0.9188588366061504, "grad_norm": 1.349411129951477, "learning_rate": 4.090532685564618e-08, "loss": 0.1851, "step": 1240 }, { "epoch": 0.9262689885142645, "grad_norm": 1.8222600221633911, "learning_rate": 3.390707017979311e-08, "loss": 0.1744, "step": 1250 }, { "epoch": 0.9262689885142645, "eval_loss": 0.17040617763996124, "eval_runtime": 210.0132, "eval_samples_per_second": 2.705, "eval_steps_per_second": 1.352, "step": 1250 }, { "epoch": 0.9336791404223787, "grad_norm": 1.5898839235305786, "learning_rate": 2.755467872173567e-08, "loss": 0.1565, "step": 1260 }, { "epoch": 0.9410892923304928, "grad_norm": 1.6416816711425781, "learning_rate": 2.185239926619431e-08, "loss": 0.1699, "step": 1270 }, { "epoch": 0.9484994442386069, "grad_norm": 2.081421375274658, "learning_rate": 1.6804043976418438e-08, "loss": 0.1601, "step": 1280 }, { "epoch": 0.955909596146721, "grad_norm": 1.6453341245651245, "learning_rate": 1.2412987845628498e-08, "loss": 0.1553, "step": 1290 }, { "epoch": 0.9633197480548351, "grad_norm": 1.314548134803772, "learning_rate": 8.682166440721727e-09, "loss": 0.1626, "step": 1300 }, { "epoch": 0.9633197480548351, "eval_loss": 0.1703246533870697, "eval_runtime": 209.9198, "eval_samples_per_second": 2.706, "eval_steps_per_second": 1.353, "step": 1300 }, { "epoch": 0.9707298999629492, "grad_norm": 1.4979052543640137, "learning_rate": 5.614073939747443e-09, "loss": 0.1671, "step": 1310 }, { "epoch": 0.9781400518710633, "grad_norm": 1.4579306840896606, "learning_rate": 3.210761464466638e-09, "loss": 0.1605, "step": 1320 }, { "epoch": 0.9855502037791775, "grad_norm": 1.3856755495071411, "learning_rate": 1.4738357091084174e-09, "loss": 0.1662, "step": 1330 }, { "epoch": 0.9929603556872916, "grad_norm": 1.5018383264541626, "learning_rate": 4.0445786624199175e-10, "loss": 0.1757, "step": 1340 }, { "epoch": 1.0, "grad_norm": 1.9634699821472168, "learning_rate": 3.3428504808696857e-12, "loss": 0.1735, "step": 1350 }, { "epoch": 1.0, "eval_loss": 0.1701827049255371, "eval_runtime": 209.8395, "eval_samples_per_second": 2.707, "eval_steps_per_second": 1.353, "step": 1350 } ], "logging_steps": 10, "max_steps": 1350, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.764415382282035e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }