| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 50, |
| "global_step": 1350, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.007410151908114116, |
| "grad_norm": 4.45390510559082, |
| "learning_rate": 1.3333333333333334e-07, |
| "loss": 1.2745, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.014820303816228233, |
| "grad_norm": 4.346654415130615, |
| "learning_rate": 2.814814814814815e-07, |
| "loss": 1.2289, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.02223045572434235, |
| "grad_norm": 3.9740023612976074, |
| "learning_rate": 4.296296296296296e-07, |
| "loss": 1.2757, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.029640607632456465, |
| "grad_norm": 3.878234386444092, |
| "learning_rate": 5.777777777777777e-07, |
| "loss": 1.2404, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.037050759540570584, |
| "grad_norm": 4.405137538909912, |
| "learning_rate": 7.259259259259259e-07, |
| "loss": 1.2627, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.037050759540570584, |
| "eval_loss": 1.2372760772705078, |
| "eval_runtime": 208.6218, |
| "eval_samples_per_second": 2.723, |
| "eval_steps_per_second": 1.361, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0444609114486847, |
| "grad_norm": 3.6006131172180176, |
| "learning_rate": 8.740740740740741e-07, |
| "loss": 1.2097, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.051871063356798815, |
| "grad_norm": 3.857161045074463, |
| "learning_rate": 1.0222222222222221e-06, |
| "loss": 1.0813, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.05928121526491293, |
| "grad_norm": 2.1402502059936523, |
| "learning_rate": 1.1703703703703702e-06, |
| "loss": 1.0065, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.06669136717302705, |
| "grad_norm": 2.2522549629211426, |
| "learning_rate": 1.3185185185185184e-06, |
| "loss": 0.885, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.07410151908114117, |
| "grad_norm": 1.8222051858901978, |
| "learning_rate": 1.4666666666666665e-06, |
| "loss": 0.8144, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.07410151908114117, |
| "eval_loss": 0.7972270846366882, |
| "eval_runtime": 208.2662, |
| "eval_samples_per_second": 2.727, |
| "eval_steps_per_second": 1.364, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08151167098925528, |
| "grad_norm": 1.2294323444366455, |
| "learning_rate": 1.614814814814815e-06, |
| "loss": 0.763, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.0889218228973694, |
| "grad_norm": 1.1122323274612427, |
| "learning_rate": 1.762962962962963e-06, |
| "loss": 0.7137, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.09633197480548351, |
| "grad_norm": 0.8735978007316589, |
| "learning_rate": 1.9111111111111112e-06, |
| "loss": 0.6903, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.10374212671359763, |
| "grad_norm": 0.7982610464096069, |
| "learning_rate": 1.9999465148392903e-06, |
| "loss": 0.6381, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.11115227862171174, |
| "grad_norm": 1.082419753074646, |
| "learning_rate": 1.999344872485215e-06, |
| "loss": 0.6285, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.11115227862171174, |
| "eval_loss": 0.6020215749740601, |
| "eval_runtime": 208.1618, |
| "eval_samples_per_second": 2.729, |
| "eval_steps_per_second": 1.364, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.11856243052982586, |
| "grad_norm": 0.9158598780632019, |
| "learning_rate": 1.9980751348850217e-06, |
| "loss": 0.5996, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.12597258243794, |
| "grad_norm": 0.8013431429862976, |
| "learning_rate": 1.996138150900478e-06, |
| "loss": 0.5625, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.1333827343460541, |
| "grad_norm": 0.6254628300666809, |
| "learning_rate": 1.9935352154697255e-06, |
| "loss": 0.5658, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.14079288625416822, |
| "grad_norm": 0.7743931412696838, |
| "learning_rate": 1.99026806874157e-06, |
| "loss": 0.5308, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.14820303816228234, |
| "grad_norm": 0.8001664280891418, |
| "learning_rate": 1.986338894912137e-06, |
| "loss": 0.4969, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.14820303816228234, |
| "eval_loss": 0.5175904035568237, |
| "eval_runtime": 208.2728, |
| "eval_samples_per_second": 2.727, |
| "eval_steps_per_second": 1.364, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.15561319007039645, |
| "grad_norm": 0.7069100141525269, |
| "learning_rate": 1.9817503207646603e-06, |
| "loss": 0.4996, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.16302334197851057, |
| "grad_norm": 1.052976131439209, |
| "learning_rate": 1.9765054139133926e-06, |
| "loss": 0.4658, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.17043349388662468, |
| "grad_norm": 0.739701509475708, |
| "learning_rate": 1.970607680752804e-06, |
| "loss": 0.4852, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.1778436457947388, |
| "grad_norm": 1.0828092098236084, |
| "learning_rate": 1.9640610641134382e-06, |
| "loss": 0.4793, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.1852537977028529, |
| "grad_norm": 0.8444927930831909, |
| "learning_rate": 1.956869940626001e-06, |
| "loss": 0.4198, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.1852537977028529, |
| "eval_loss": 0.45384249091148376, |
| "eval_runtime": 208.3355, |
| "eval_samples_per_second": 2.726, |
| "eval_steps_per_second": 1.363, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.19266394961096703, |
| "grad_norm": 0.9700266122817993, |
| "learning_rate": 1.9490391177954383e-06, |
| "loss": 0.4512, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.20007410151908114, |
| "grad_norm": 0.8187345862388611, |
| "learning_rate": 1.940573830786956e-06, |
| "loss": 0.419, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.20748425342719526, |
| "grad_norm": 0.7705732583999634, |
| "learning_rate": 1.9314797389261425e-06, |
| "loss": 0.4329, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.21489440533530937, |
| "grad_norm": 0.8967396020889282, |
| "learning_rate": 1.921762921915517e-06, |
| "loss": 0.4267, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.2223045572434235, |
| "grad_norm": 0.8268992304801941, |
| "learning_rate": 1.911429875770051e-06, |
| "loss": 0.3825, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.2223045572434235, |
| "eval_loss": 0.3979480266571045, |
| "eval_runtime": 208.4142, |
| "eval_samples_per_second": 2.725, |
| "eval_steps_per_second": 1.363, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.2297147091515376, |
| "grad_norm": 0.8976253867149353, |
| "learning_rate": 1.9004875084743622e-06, |
| "loss": 0.4011, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.23712486105965172, |
| "grad_norm": 1.0648952722549438, |
| "learning_rate": 1.8889431353645002e-06, |
| "loss": 0.3914, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.24453501296776584, |
| "grad_norm": 1.5843675136566162, |
| "learning_rate": 1.8768044742374006e-06, |
| "loss": 0.392, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.25194516487588, |
| "grad_norm": 1.2248514890670776, |
| "learning_rate": 1.8640796401912805e-06, |
| "loss": 0.3595, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.2593553167839941, |
| "grad_norm": 1.2559500932693481, |
| "learning_rate": 1.8507771402004266e-06, |
| "loss": 0.355, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.2593553167839941, |
| "eval_loss": 0.34925511479377747, |
| "eval_runtime": 208.5081, |
| "eval_samples_per_second": 2.724, |
| "eval_steps_per_second": 1.362, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.2667654686921082, |
| "grad_norm": 1.2216342687606812, |
| "learning_rate": 1.8369058674280002e-06, |
| "loss": 0.315, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.2741756206002223, |
| "grad_norm": 1.0528122186660767, |
| "learning_rate": 1.8224750952806621e-06, |
| "loss": 0.3279, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.28158577250833644, |
| "grad_norm": 0.948635995388031, |
| "learning_rate": 1.8074944712089923e-06, |
| "loss": 0.3288, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.28899592441645056, |
| "grad_norm": 1.1430398225784302, |
| "learning_rate": 1.791974010257848e-06, |
| "loss": 0.3157, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.29640607632456467, |
| "grad_norm": 1.2426784038543701, |
| "learning_rate": 1.7759240883709743e-06, |
| "loss": 0.2976, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.29640607632456467, |
| "eval_loss": 0.3070007562637329, |
| "eval_runtime": 208.3886, |
| "eval_samples_per_second": 2.726, |
| "eval_steps_per_second": 1.363, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.3038162282326788, |
| "grad_norm": 1.1574546098709106, |
| "learning_rate": 1.7593554354543415e-06, |
| "loss": 0.3031, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.3112263801407929, |
| "grad_norm": 1.3627562522888184, |
| "learning_rate": 1.7422791282028455e-06, |
| "loss": 0.2916, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.318636532048907, |
| "grad_norm": 1.4292229413986206, |
| "learning_rate": 1.7247065826951692e-06, |
| "loss": 0.2731, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.32604668395702113, |
| "grad_norm": 1.2023752927780151, |
| "learning_rate": 1.706649546761755e-06, |
| "loss": 0.2734, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.33345683586513525, |
| "grad_norm": 1.4670097827911377, |
| "learning_rate": 1.6881200921309913e-06, |
| "loss": 0.2547, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.33345683586513525, |
| "eval_loss": 0.2757605016231537, |
| "eval_runtime": 208.5349, |
| "eval_samples_per_second": 2.724, |
| "eval_steps_per_second": 1.362, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.34086698777324936, |
| "grad_norm": 1.222509503364563, |
| "learning_rate": 1.669130606358858e-06, |
| "loss": 0.2549, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.3482771396813635, |
| "grad_norm": 1.0831719636917114, |
| "learning_rate": 1.6496937845474371e-06, |
| "loss": 0.2878, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.3556872915894776, |
| "grad_norm": 1.6136417388916016, |
| "learning_rate": 1.6298226208578124e-06, |
| "loss": 0.2495, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.3630974434975917, |
| "grad_norm": 1.2490746974945068, |
| "learning_rate": 1.6095303998230431e-06, |
| "loss": 0.2541, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.3705075954057058, |
| "grad_norm": 1.3669530153274536, |
| "learning_rate": 1.5888306874670112e-06, |
| "loss": 0.2537, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3705075954057058, |
| "eval_loss": 0.2523694634437561, |
| "eval_runtime": 208.3135, |
| "eval_samples_per_second": 2.727, |
| "eval_steps_per_second": 1.363, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.37791774731381994, |
| "grad_norm": 1.2288622856140137, |
| "learning_rate": 1.567737322235084e-06, |
| "loss": 0.2494, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.38532789922193406, |
| "grad_norm": 1.4230667352676392, |
| "learning_rate": 1.546264405742654e-06, |
| "loss": 0.2437, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.39273805113004817, |
| "grad_norm": 2.109126091003418, |
| "learning_rate": 1.5244262933477398e-06, |
| "loss": 0.2418, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.4001482030381623, |
| "grad_norm": 1.7550323009490967, |
| "learning_rate": 1.5022375845539534e-06, |
| "loss": 0.2358, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.4075583549462764, |
| "grad_norm": 1.5699337720870972, |
| "learning_rate": 1.4797131132502464e-06, |
| "loss": 0.2403, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.4075583549462764, |
| "eval_loss": 0.23371076583862305, |
| "eval_runtime": 209.4346, |
| "eval_samples_per_second": 2.712, |
| "eval_steps_per_second": 1.356, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.4149685068543905, |
| "grad_norm": 1.7377654314041138, |
| "learning_rate": 1.4568679377939617e-06, |
| "loss": 0.2189, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.42237865876250463, |
| "grad_norm": 1.5883984565734863, |
| "learning_rate": 1.4337173309438233e-06, |
| "loss": 0.2268, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.42978881067061875, |
| "grad_norm": 1.6136311292648315, |
| "learning_rate": 1.4102767696495883e-06, |
| "loss": 0.2187, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.43719896257873286, |
| "grad_norm": 1.569895625114441, |
| "learning_rate": 1.3865619247051915e-06, |
| "loss": 0.2132, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.444609114486847, |
| "grad_norm": 1.6274341344833374, |
| "learning_rate": 1.3625886502723008e-06, |
| "loss": 0.2022, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.444609114486847, |
| "eval_loss": 0.21966929733753204, |
| "eval_runtime": 209.6627, |
| "eval_samples_per_second": 2.709, |
| "eval_steps_per_second": 1.355, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.4520192663949611, |
| "grad_norm": 1.511763095855713, |
| "learning_rate": 1.338372973281281e-06, |
| "loss": 0.2026, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.4594294183030752, |
| "grad_norm": 1.2806991338729858, |
| "learning_rate": 1.3139310827166612e-06, |
| "loss": 0.217, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.4668395702111893, |
| "grad_norm": 1.5671716928482056, |
| "learning_rate": 1.2892793187942586e-06, |
| "loss": 0.2002, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.47424972211930344, |
| "grad_norm": 1.7969197034835815, |
| "learning_rate": 1.2644341620372023e-06, |
| "loss": 0.2004, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.48165987402741756, |
| "grad_norm": 1.4059193134307861, |
| "learning_rate": 1.2394122222581555e-06, |
| "loss": 0.2023, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.48165987402741756, |
| "eval_loss": 0.2078457921743393, |
| "eval_runtime": 209.6676, |
| "eval_samples_per_second": 2.709, |
| "eval_steps_per_second": 1.355, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.48907002593553167, |
| "grad_norm": 1.4455323219299316, |
| "learning_rate": 1.214230227455106e-06, |
| "loss": 0.2059, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.4964801778436458, |
| "grad_norm": 1.5389137268066406, |
| "learning_rate": 1.1889050126281403e-06, |
| "loss": 0.2017, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.50389032975176, |
| "grad_norm": 1.3474076986312866, |
| "learning_rate": 1.1634535085246902e-06, |
| "loss": 0.1841, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.5113004816598741, |
| "grad_norm": 1.399523138999939, |
| "learning_rate": 1.1378927303207636e-06, |
| "loss": 0.2018, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.5187106335679882, |
| "grad_norm": 1.4829585552215576, |
| "learning_rate": 1.112239766245735e-06, |
| "loss": 0.1985, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.5187106335679882, |
| "eval_loss": 0.1990778148174286, |
| "eval_runtime": 209.7592, |
| "eval_samples_per_second": 2.708, |
| "eval_steps_per_second": 1.354, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.5261207854761023, |
| "grad_norm": 1.460558295249939, |
| "learning_rate": 1.0865117661582956e-06, |
| "loss": 0.1789, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.5335309373842164, |
| "grad_norm": 1.1899714469909668, |
| "learning_rate": 1.0607259300812045e-06, |
| "loss": 0.2072, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.5409410892923305, |
| "grad_norm": 1.4120274782180786, |
| "learning_rate": 1.034899496702501e-06, |
| "loss": 0.1887, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.5483512412004447, |
| "grad_norm": 1.3966922760009766, |
| "learning_rate": 1.0090497318508686e-06, |
| "loss": 0.1723, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.5557613931085588, |
| "grad_norm": 1.6820755004882812, |
| "learning_rate": 9.831939169528563e-07, |
| "loss": 0.195, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.5557613931085588, |
| "eval_loss": 0.19260452687740326, |
| "eval_runtime": 209.5514, |
| "eval_samples_per_second": 2.711, |
| "eval_steps_per_second": 1.355, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.5631715450166729, |
| "grad_norm": 1.6537615060806274, |
| "learning_rate": 9.57349337479669e-07, |
| "loss": 0.1922, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.570581696924787, |
| "grad_norm": 1.4756197929382324, |
| "learning_rate": 9.315332713912591e-07, |
| "loss": 0.1771, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.5779918488329011, |
| "grad_norm": 1.5430341958999634, |
| "learning_rate": 9.057629775854314e-07, |
| "loss": 0.1693, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.5854020007410152, |
| "grad_norm": 1.7049578428268433, |
| "learning_rate": 8.800556843597001e-07, |
| "loss": 0.1875, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.5928121526491293, |
| "grad_norm": 1.6176693439483643, |
| "learning_rate": 8.544285778936002e-07, |
| "loss": 0.1737, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5928121526491293, |
| "eval_loss": 0.18725676834583282, |
| "eval_runtime": 209.6187, |
| "eval_samples_per_second": 2.71, |
| "eval_steps_per_second": 1.355, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.6002223045572435, |
| "grad_norm": 1.9868125915527344, |
| "learning_rate": 8.288987907591518e-07, |
| "loss": 0.1754, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.6076324564653576, |
| "grad_norm": 1.5091502666473389, |
| "learning_rate": 8.034833904671697e-07, |
| "loss": 0.1815, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.6150426083734717, |
| "grad_norm": 2.074658155441284, |
| "learning_rate": 7.781993680570655e-07, |
| "loss": 0.1834, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.6224527602815858, |
| "grad_norm": 1.2912707328796387, |
| "learning_rate": 7.530636267377706e-07, |
| "loss": 0.1816, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.6298629121896999, |
| "grad_norm": 1.5129729509353638, |
| "learning_rate": 7.280929705873818e-07, |
| "loss": 0.1864, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.6298629121896999, |
| "eval_loss": 0.18269772827625275, |
| "eval_runtime": 209.5398, |
| "eval_samples_per_second": 2.711, |
| "eval_steps_per_second": 1.355, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.637273064097814, |
| "grad_norm": 1.635215163230896, |
| "learning_rate": 7.033040933190774e-07, |
| "loss": 0.1815, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.6446832160059282, |
| "grad_norm": 1.7670304775238037, |
| "learning_rate": 6.787135671208126e-07, |
| "loss": 0.1873, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.6520933679140423, |
| "grad_norm": 1.7472981214523315, |
| "learning_rate": 6.543378315762633e-07, |
| "loss": 0.19, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.6595035198221564, |
| "grad_norm": 1.8691086769104004, |
| "learning_rate": 6.301931826744189e-07, |
| "loss": 0.1914, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.6669136717302705, |
| "grad_norm": 1.5207786560058594, |
| "learning_rate": 6.062957619151703e-07, |
| "loss": 0.1805, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.6669136717302705, |
| "eval_loss": 0.17914490401744843, |
| "eval_runtime": 209.5686, |
| "eval_samples_per_second": 2.71, |
| "eval_steps_per_second": 1.355, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.6743238236383846, |
| "grad_norm": 1.5552278757095337, |
| "learning_rate": 5.826615455181821e-07, |
| "loss": 0.1887, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.6817339755464987, |
| "grad_norm": 1.5720982551574707, |
| "learning_rate": 5.593063337422594e-07, |
| "loss": 0.1743, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.6891441274546128, |
| "grad_norm": 1.781369924545288, |
| "learning_rate": 5.362457403223495e-07, |
| "loss": 0.1772, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.696554279362727, |
| "grad_norm": 1.4669294357299805, |
| "learning_rate": 5.134951820312401e-07, |
| "loss": 0.1897, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.7039644312708411, |
| "grad_norm": 1.097123384475708, |
| "learning_rate": 4.91069868372937e-07, |
| "loss": 0.1744, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.7039644312708411, |
| "eval_loss": 0.1766466647386551, |
| "eval_runtime": 209.6411, |
| "eval_samples_per_second": 2.709, |
| "eval_steps_per_second": 1.355, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.7113745831789552, |
| "grad_norm": 1.3973758220672607, |
| "learning_rate": 4.689847914146041e-07, |
| "loss": 0.1683, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.7187847350870693, |
| "grad_norm": 1.736214280128479, |
| "learning_rate": 4.472547157638673e-07, |
| "loss": 0.1824, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.7261948869951834, |
| "grad_norm": 1.3411122560501099, |
| "learning_rate": 4.258941686981864e-07, |
| "loss": 0.183, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.7336050389032975, |
| "grad_norm": 1.4088919162750244, |
| "learning_rate": 4.0491743045288564e-07, |
| "loss": 0.1826, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.7410151908114117, |
| "grad_norm": 1.4413363933563232, |
| "learning_rate": 3.843385246743417e-07, |
| "loss": 0.1853, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.7410151908114117, |
| "eval_loss": 0.17452707886695862, |
| "eval_runtime": 209.5833, |
| "eval_samples_per_second": 2.71, |
| "eval_steps_per_second": 1.355, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.7484253427195258, |
| "grad_norm": 1.4395395517349243, |
| "learning_rate": 3.6417120904471244e-07, |
| "loss": 0.1612, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.7558354946276399, |
| "grad_norm": 1.5045926570892334, |
| "learning_rate": 3.4442896608446647e-07, |
| "loss": 0.1717, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.763245646535754, |
| "grad_norm": 1.546872854232788, |
| "learning_rate": 3.2512499413887253e-07, |
| "loss": 0.1665, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.7706557984438681, |
| "grad_norm": 1.5870492458343506, |
| "learning_rate": 3.0627219855446664e-07, |
| "loss": 0.1653, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.7780659503519822, |
| "grad_norm": 1.694765329360962, |
| "learning_rate": 2.87883183051398e-07, |
| "loss": 0.1626, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.7780659503519822, |
| "eval_loss": 0.17290721833705902, |
| "eval_runtime": 209.5418, |
| "eval_samples_per_second": 2.711, |
| "eval_steps_per_second": 1.355, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.7854761022600963, |
| "grad_norm": 2.193631649017334, |
| "learning_rate": 2.699702412974254e-07, |
| "loss": 0.1637, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.7928862541682105, |
| "grad_norm": 1.4417670965194702, |
| "learning_rate": 2.525453486891908e-07, |
| "loss": 0.177, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.8002964060763246, |
| "grad_norm": 1.472188949584961, |
| "learning_rate": 2.356201543462678e-07, |
| "loss": 0.1444, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.8077065579844387, |
| "grad_norm": 1.3810269832611084, |
| "learning_rate": 2.192059733233408e-07, |
| "loss": 0.1771, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.8151167098925528, |
| "grad_norm": 1.764819622039795, |
| "learning_rate": 2.03313779045713e-07, |
| "loss": 0.1557, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.8151167098925528, |
| "eval_loss": 0.17193768918514252, |
| "eval_runtime": 209.8503, |
| "eval_samples_per_second": 2.707, |
| "eval_steps_per_second": 1.353, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.8225268618006669, |
| "grad_norm": 1.3674615621566772, |
| "learning_rate": 1.8795419597320717e-07, |
| "loss": 0.165, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.829937013708781, |
| "grad_norm": 1.6381874084472656, |
| "learning_rate": 1.7313749249736264e-07, |
| "loss": 0.1743, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.8373471656168952, |
| "grad_norm": 1.665426254272461, |
| "learning_rate": 1.5887357407667312e-07, |
| "loss": 0.1535, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.8447573175250093, |
| "grad_norm": 1.4278993606567383, |
| "learning_rate": 1.451719766144589e-07, |
| "loss": 0.1709, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.8521674694331234, |
| "grad_norm": 1.9010162353515625, |
| "learning_rate": 1.3204186008379925e-07, |
| "loss": 0.1621, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.8521674694331234, |
| "eval_loss": 0.1711866408586502, |
| "eval_runtime": 209.8853, |
| "eval_samples_per_second": 2.706, |
| "eval_steps_per_second": 1.353, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.8595776213412375, |
| "grad_norm": 1.5918656587600708, |
| "learning_rate": 1.1949200240378577e-07, |
| "loss": 0.1866, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.8669877732493516, |
| "grad_norm": 1.593591570854187, |
| "learning_rate": 1.0753079357119132e-07, |
| "loss": 0.1605, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.8743979251574657, |
| "grad_norm": 1.415703535079956, |
| "learning_rate": 9.61662300514795e-08, |
| "loss": 0.159, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.8818080770655798, |
| "grad_norm": 1.7163190841674805, |
| "learning_rate": 8.540590943290127e-08, |
| "loss": 0.1783, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.889218228973694, |
| "grad_norm": 1.6285511255264282, |
| "learning_rate": 7.525702534725443e-08, |
| "loss": 0.1682, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.889218228973694, |
| "eval_loss": 0.17063578963279724, |
| "eval_runtime": 209.8969, |
| "eval_samples_per_second": 2.706, |
| "eval_steps_per_second": 1.353, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.8966283808818081, |
| "grad_norm": 1.5694634914398193, |
| "learning_rate": 6.572636266070264e-08, |
| "loss": 0.1612, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.9040385327899222, |
| "grad_norm": 1.6561760902404785, |
| "learning_rate": 5.682029293786672e-08, |
| "loss": 0.1703, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.9114486846980363, |
| "grad_norm": 1.4719998836517334, |
| "learning_rate": 4.854477018222103e-08, |
| "loss": 0.1909, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.9188588366061504, |
| "grad_norm": 1.349411129951477, |
| "learning_rate": 4.090532685564618e-08, |
| "loss": 0.1851, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.9262689885142645, |
| "grad_norm": 1.8222600221633911, |
| "learning_rate": 3.390707017979311e-08, |
| "loss": 0.1744, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.9262689885142645, |
| "eval_loss": 0.17040617763996124, |
| "eval_runtime": 210.0132, |
| "eval_samples_per_second": 2.705, |
| "eval_steps_per_second": 1.352, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.9336791404223787, |
| "grad_norm": 1.5898839235305786, |
| "learning_rate": 2.755467872173567e-08, |
| "loss": 0.1565, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.9410892923304928, |
| "grad_norm": 1.6416816711425781, |
| "learning_rate": 2.185239926619431e-08, |
| "loss": 0.1699, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.9484994442386069, |
| "grad_norm": 2.081421375274658, |
| "learning_rate": 1.6804043976418438e-08, |
| "loss": 0.1601, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.955909596146721, |
| "grad_norm": 1.6453341245651245, |
| "learning_rate": 1.2412987845628498e-08, |
| "loss": 0.1553, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.9633197480548351, |
| "grad_norm": 1.314548134803772, |
| "learning_rate": 8.682166440721727e-09, |
| "loss": 0.1626, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.9633197480548351, |
| "eval_loss": 0.1703246533870697, |
| "eval_runtime": 209.9198, |
| "eval_samples_per_second": 2.706, |
| "eval_steps_per_second": 1.353, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.9707298999629492, |
| "grad_norm": 1.4979052543640137, |
| "learning_rate": 5.614073939747443e-09, |
| "loss": 0.1671, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.9781400518710633, |
| "grad_norm": 1.4579306840896606, |
| "learning_rate": 3.210761464466638e-09, |
| "loss": 0.1605, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.9855502037791775, |
| "grad_norm": 1.3856755495071411, |
| "learning_rate": 1.4738357091084174e-09, |
| "loss": 0.1662, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.9929603556872916, |
| "grad_norm": 1.5018383264541626, |
| "learning_rate": 4.0445786624199175e-10, |
| "loss": 0.1757, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.9634699821472168, |
| "learning_rate": 3.3428504808696857e-12, |
| "loss": 0.1735, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.1701827049255371, |
| "eval_runtime": 209.8395, |
| "eval_samples_per_second": 2.707, |
| "eval_steps_per_second": 1.353, |
| "step": 1350 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1350, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.764415382282035e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|