| { | |
| "best_global_step": 24500, | |
| "best_metric": 1.4431298971176147, | |
| "best_model_checkpoint": "./ar-diffusion-checkpoints/checkpoint-24500", | |
| "epoch": 2.09991539112376, | |
| "eval_steps": 250, | |
| "global_step": 27301, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.003845858010922237, | |
| "grad_norm": 8.077690124511719, | |
| "learning_rate": 6.579999999999999e-05, | |
| "loss": 10.7559, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.007691716021844474, | |
| "grad_norm": 7.270859241485596, | |
| "learning_rate": 0.00013299999999999998, | |
| "loss": 6.4993, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01153757403276671, | |
| "grad_norm": 6.350255012512207, | |
| "learning_rate": 0.00013976839086798278, | |
| "loss": 5.8214, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.015383432043688947, | |
| "grad_norm": 5.809306621551514, | |
| "learning_rate": 0.00013951104738796366, | |
| "loss": 5.3663, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.019229290054611183, | |
| "grad_norm": 3.9576303958892822, | |
| "learning_rate": 0.00013925370390794456, | |
| "loss": 5.3697, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.019229290054611183, | |
| "eval_loss": 5.2464423179626465, | |
| "eval_runtime": 18.6939, | |
| "eval_samples_per_second": 53.493, | |
| "eval_steps_per_second": 13.373, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.02307514806553342, | |
| "grad_norm": 5.081186771392822, | |
| "learning_rate": 0.00013899636042792544, | |
| "loss": 5.0419, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.02692100607645566, | |
| "grad_norm": 5.957707405090332, | |
| "learning_rate": 0.0001387390169479063, | |
| "loss": 4.8305, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.030766864087377895, | |
| "grad_norm": 3.9519667625427246, | |
| "learning_rate": 0.0001384816734678872, | |
| "loss": 5.1118, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.03461272209830013, | |
| "grad_norm": 2.498075485229492, | |
| "learning_rate": 0.00013822432998786808, | |
| "loss": 5.0262, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.038458580109222366, | |
| "grad_norm": 4.084473609924316, | |
| "learning_rate": 0.00013796698650784896, | |
| "loss": 5.0738, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.038458580109222366, | |
| "eval_loss": 4.9789862632751465, | |
| "eval_runtime": 18.8768, | |
| "eval_samples_per_second": 52.975, | |
| "eval_steps_per_second": 13.244, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0423044381201446, | |
| "grad_norm": 6.3689374923706055, | |
| "learning_rate": 0.00013771478989743022, | |
| "loss": 4.9228, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.04615029613106684, | |
| "grad_norm": 3.9407873153686523, | |
| "learning_rate": 0.0001374574464174111, | |
| "loss": 4.976, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.04999615414198908, | |
| "grad_norm": 4.298041343688965, | |
| "learning_rate": 0.00013720010293739198, | |
| "loss": 4.6802, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.05384201215291132, | |
| "grad_norm": 3.756016492843628, | |
| "learning_rate": 0.0001369427594573729, | |
| "loss": 4.7095, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.05768787016383355, | |
| "grad_norm": 4.344913959503174, | |
| "learning_rate": 0.00013668541597735377, | |
| "loss": 4.8664, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.05768787016383355, | |
| "eval_loss": 4.762838363647461, | |
| "eval_runtime": 18.772, | |
| "eval_samples_per_second": 53.271, | |
| "eval_steps_per_second": 13.318, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.06153372817475579, | |
| "grad_norm": 4.1537275314331055, | |
| "learning_rate": 0.00013642807249733465, | |
| "loss": 4.9688, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.06537958618567802, | |
| "grad_norm": 4.85400915145874, | |
| "learning_rate": 0.00013617072901731553, | |
| "loss": 4.8658, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.06922544419660026, | |
| "grad_norm": 4.026614189147949, | |
| "learning_rate": 0.0001359133855372964, | |
| "loss": 4.893, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.0730713022075225, | |
| "grad_norm": 3.84721040725708, | |
| "learning_rate": 0.0001356560420572773, | |
| "loss": 4.6926, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.07691716021844473, | |
| "grad_norm": 9.182045936584473, | |
| "learning_rate": 0.00013539869857725817, | |
| "loss": 4.881, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.07691716021844473, | |
| "eval_loss": 4.709664344787598, | |
| "eval_runtime": 18.8053, | |
| "eval_samples_per_second": 53.177, | |
| "eval_steps_per_second": 13.294, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.08076301822936698, | |
| "grad_norm": 5.442048072814941, | |
| "learning_rate": 0.00013514135509723907, | |
| "loss": 4.6134, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.0846088762402892, | |
| "grad_norm": 4.779583930969238, | |
| "learning_rate": 0.00013488401161721995, | |
| "loss": 4.7226, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.08845473425121145, | |
| "grad_norm": 3.221238851547241, | |
| "learning_rate": 0.0001346266681372008, | |
| "loss": 4.6837, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.09230059226213368, | |
| "grad_norm": 5.55983304977417, | |
| "learning_rate": 0.0001343693246571817, | |
| "loss": 4.672, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.09614645027305592, | |
| "grad_norm": 6.964417934417725, | |
| "learning_rate": 0.0001341119811771626, | |
| "loss": 4.9043, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.09614645027305592, | |
| "eval_loss": 4.7052001953125, | |
| "eval_runtime": 18.9307, | |
| "eval_samples_per_second": 52.824, | |
| "eval_steps_per_second": 13.206, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.09999230828397816, | |
| "grad_norm": 7.476005554199219, | |
| "learning_rate": 0.00013385463769714347, | |
| "loss": 4.7776, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.10383816629490039, | |
| "grad_norm": 3.4916040897369385, | |
| "learning_rate": 0.00013359729421712435, | |
| "loss": 4.7738, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.10768402430582263, | |
| "grad_norm": 4.028671741485596, | |
| "learning_rate": 0.00013333995073710526, | |
| "loss": 4.6459, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.11152988231674486, | |
| "grad_norm": 4.597095489501953, | |
| "learning_rate": 0.0001330826072570861, | |
| "loss": 4.6778, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.1153757403276671, | |
| "grad_norm": 5.779391288757324, | |
| "learning_rate": 0.000132825263777067, | |
| "loss": 4.7938, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.1153757403276671, | |
| "eval_loss": 4.696172714233398, | |
| "eval_runtime": 18.8705, | |
| "eval_samples_per_second": 52.993, | |
| "eval_steps_per_second": 13.248, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.11922159833858934, | |
| "grad_norm": 3.801748752593994, | |
| "learning_rate": 0.0001325679202970479, | |
| "loss": 4.7912, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.12306745634951158, | |
| "grad_norm": 8.367344856262207, | |
| "learning_rate": 0.00013231057681702878, | |
| "loss": 4.7281, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.12691331436043382, | |
| "grad_norm": 4.299734592437744, | |
| "learning_rate": 0.00013205323333700966, | |
| "loss": 4.7263, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.13075917237135604, | |
| "grad_norm": 6.152933597564697, | |
| "learning_rate": 0.00013179588985699054, | |
| "loss": 4.8519, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.13460503038227828, | |
| "grad_norm": 4.300355434417725, | |
| "learning_rate": 0.00013153854637697142, | |
| "loss": 4.8359, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.13460503038227828, | |
| "eval_loss": 4.635708808898926, | |
| "eval_runtime": 18.5455, | |
| "eval_samples_per_second": 53.922, | |
| "eval_steps_per_second": 13.48, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.13845088839320052, | |
| "grad_norm": 2.1330080032348633, | |
| "learning_rate": 0.0001312812028969523, | |
| "loss": 4.807, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.14229674640412276, | |
| "grad_norm": 4.667717456817627, | |
| "learning_rate": 0.00013102385941693318, | |
| "loss": 4.6633, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.146142604415045, | |
| "grad_norm": 6.904145240783691, | |
| "learning_rate": 0.00013076651593691408, | |
| "loss": 4.7899, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.14998846242596722, | |
| "grad_norm": 2.930926561355591, | |
| "learning_rate": 0.00013050917245689496, | |
| "loss": 4.6692, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.15383432043688947, | |
| "grad_norm": 3.6246345043182373, | |
| "learning_rate": 0.00013025182897687584, | |
| "loss": 4.781, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.15383432043688947, | |
| "eval_loss": 4.620576858520508, | |
| "eval_runtime": 18.7692, | |
| "eval_samples_per_second": 53.279, | |
| "eval_steps_per_second": 13.32, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.1576801784478117, | |
| "grad_norm": 3.5292210578918457, | |
| "learning_rate": 0.00012999448549685672, | |
| "loss": 4.7815, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.16152603645873395, | |
| "grad_norm": 4.665738105773926, | |
| "learning_rate": 0.0001297371420168376, | |
| "loss": 4.6789, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.16537189446965617, | |
| "grad_norm": 4.332949161529541, | |
| "learning_rate": 0.00012947979853681848, | |
| "loss": 4.5991, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.1692177524805784, | |
| "grad_norm": 3.8279120922088623, | |
| "learning_rate": 0.00012922245505679936, | |
| "loss": 4.5791, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.17306361049150065, | |
| "grad_norm": 1.9522042274475098, | |
| "learning_rate": 0.00012896511157678027, | |
| "loss": 4.5643, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.17306361049150065, | |
| "eval_loss": 4.609655857086182, | |
| "eval_runtime": 18.946, | |
| "eval_samples_per_second": 52.782, | |
| "eval_steps_per_second": 13.195, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.1769094685024229, | |
| "grad_norm": 4.264033794403076, | |
| "learning_rate": 0.00012870776809676115, | |
| "loss": 4.6666, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.18075532651334514, | |
| "grad_norm": 4.572433948516846, | |
| "learning_rate": 0.000128450424616742, | |
| "loss": 4.6096, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.18460118452426735, | |
| "grad_norm": 3.8559391498565674, | |
| "learning_rate": 0.0001281930811367229, | |
| "loss": 4.6425, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.1884470425351896, | |
| "grad_norm": 2.9414010047912598, | |
| "learning_rate": 0.0001279357376567038, | |
| "loss": 4.6336, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.19229290054611184, | |
| "grad_norm": 4.745160102844238, | |
| "learning_rate": 0.00012767839417668467, | |
| "loss": 4.6792, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.19229290054611184, | |
| "eval_loss": 4.558788776397705, | |
| "eval_runtime": 18.9882, | |
| "eval_samples_per_second": 52.664, | |
| "eval_steps_per_second": 13.166, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.19613875855703408, | |
| "grad_norm": 2.456908702850342, | |
| "learning_rate": 0.00012742105069666555, | |
| "loss": 4.3847, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.19998461656795632, | |
| "grad_norm": 5.154629707336426, | |
| "learning_rate": 0.00012716370721664645, | |
| "loss": 4.6019, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.20383047457887854, | |
| "grad_norm": 3.0423479080200195, | |
| "learning_rate": 0.0001269063637366273, | |
| "loss": 4.4796, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.20767633258980078, | |
| "grad_norm": 4.218437194824219, | |
| "learning_rate": 0.00012664902025660819, | |
| "loss": 4.5566, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.21152219060072303, | |
| "grad_norm": 5.20380163192749, | |
| "learning_rate": 0.0001263916767765891, | |
| "loss": 4.3311, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.21152219060072303, | |
| "eval_loss": 4.574987888336182, | |
| "eval_runtime": 18.8565, | |
| "eval_samples_per_second": 53.032, | |
| "eval_steps_per_second": 13.258, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.21536804861164527, | |
| "grad_norm": 4.369246482849121, | |
| "learning_rate": 0.00012613433329656997, | |
| "loss": 4.4131, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.21921390662256748, | |
| "grad_norm": 5.0442376136779785, | |
| "learning_rate": 0.00012587698981655085, | |
| "loss": 4.5027, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.22305976463348973, | |
| "grad_norm": 3.6387200355529785, | |
| "learning_rate": 0.00012561964633653173, | |
| "loss": 4.6659, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.22690562264441197, | |
| "grad_norm": 3.7960562705993652, | |
| "learning_rate": 0.0001253623028565126, | |
| "loss": 4.4826, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.2307514806553342, | |
| "grad_norm": 4.273965835571289, | |
| "learning_rate": 0.0001251049593764935, | |
| "loss": 4.5869, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.2307514806553342, | |
| "eval_loss": 4.55267858505249, | |
| "eval_runtime": 18.9735, | |
| "eval_samples_per_second": 52.705, | |
| "eval_steps_per_second": 13.176, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.23459733866625646, | |
| "grad_norm": 4.74845027923584, | |
| "learning_rate": 0.00012484761589647437, | |
| "loss": 4.6248, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.23844319667717867, | |
| "grad_norm": 6.299524784088135, | |
| "learning_rate": 0.00012459027241645528, | |
| "loss": 4.5457, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.2422890546881009, | |
| "grad_norm": 5.853606700897217, | |
| "learning_rate": 0.00012433292893643616, | |
| "loss": 4.5135, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.24613491269902316, | |
| "grad_norm": 3.1514365673065186, | |
| "learning_rate": 0.00012407558545641704, | |
| "loss": 4.672, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.2499807707099454, | |
| "grad_norm": 8.455827713012695, | |
| "learning_rate": 0.00012381824197639792, | |
| "loss": 4.6545, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.2499807707099454, | |
| "eval_loss": 4.550297737121582, | |
| "eval_runtime": 18.9801, | |
| "eval_samples_per_second": 52.687, | |
| "eval_steps_per_second": 13.172, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.25382662872086764, | |
| "grad_norm": 2.8094310760498047, | |
| "learning_rate": 0.0001235608984963788, | |
| "loss": 4.5392, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.2576724867317899, | |
| "grad_norm": 3.2565436363220215, | |
| "learning_rate": 0.00012330355501635968, | |
| "loss": 4.481, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.26151834474271207, | |
| "grad_norm": 3.5588488578796387, | |
| "learning_rate": 0.00012304621153634056, | |
| "loss": 4.5543, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.2653642027536343, | |
| "grad_norm": 3.0696310997009277, | |
| "learning_rate": 0.00012278886805632146, | |
| "loss": 4.5858, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.26921006076455656, | |
| "grad_norm": 3.886117935180664, | |
| "learning_rate": 0.00012253152457630234, | |
| "loss": 4.4694, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.26921006076455656, | |
| "eval_loss": 4.488556861877441, | |
| "eval_runtime": 18.9212, | |
| "eval_samples_per_second": 52.851, | |
| "eval_steps_per_second": 13.213, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.2730559187754788, | |
| "grad_norm": 3.794307231903076, | |
| "learning_rate": 0.00012227418109628322, | |
| "loss": 4.4994, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.27690177678640104, | |
| "grad_norm": 3.5770812034606934, | |
| "learning_rate": 0.00012201683761626409, | |
| "loss": 4.5888, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.2807476347973233, | |
| "grad_norm": 4.770874500274658, | |
| "learning_rate": 0.00012175949413624498, | |
| "loss": 4.5644, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.28459349280824553, | |
| "grad_norm": 3.4447147846221924, | |
| "learning_rate": 0.00012150215065622586, | |
| "loss": 4.5301, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.2884393508191678, | |
| "grad_norm": 4.76978063583374, | |
| "learning_rate": 0.00012124480717620675, | |
| "loss": 4.5563, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.2884393508191678, | |
| "eval_loss": 4.53049898147583, | |
| "eval_runtime": 18.9074, | |
| "eval_samples_per_second": 52.889, | |
| "eval_steps_per_second": 13.222, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.29228520883009, | |
| "grad_norm": 5.7456512451171875, | |
| "learning_rate": 0.00012098746369618763, | |
| "loss": 4.5612, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.2961310668410122, | |
| "grad_norm": 5.577849864959717, | |
| "learning_rate": 0.00012073012021616851, | |
| "loss": 4.4629, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.29997692485193445, | |
| "grad_norm": 4.432284832000732, | |
| "learning_rate": 0.00012047277673614939, | |
| "loss": 4.6661, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.3038227828628567, | |
| "grad_norm": 5.174475193023682, | |
| "learning_rate": 0.00012021543325613027, | |
| "loss": 4.4835, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.30766864087377893, | |
| "grad_norm": 3.5657413005828857, | |
| "learning_rate": 0.00011995808977611117, | |
| "loss": 4.4894, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.30766864087377893, | |
| "eval_loss": 4.5068535804748535, | |
| "eval_runtime": 18.7951, | |
| "eval_samples_per_second": 53.205, | |
| "eval_steps_per_second": 13.301, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.3115144988847012, | |
| "grad_norm": 3.854024648666382, | |
| "learning_rate": 0.00011970074629609205, | |
| "loss": 4.4989, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.3153603568956234, | |
| "grad_norm": 4.0870490074157715, | |
| "learning_rate": 0.00011944340281607294, | |
| "loss": 4.3779, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.31920621490654566, | |
| "grad_norm": 4.4627251625061035, | |
| "learning_rate": 0.0001191860593360538, | |
| "loss": 4.5526, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.3230520729174679, | |
| "grad_norm": 7.568991184234619, | |
| "learning_rate": 0.00011892871585603468, | |
| "loss": 4.6285, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.32689793092839015, | |
| "grad_norm": 4.214425086975098, | |
| "learning_rate": 0.00011867137237601558, | |
| "loss": 4.5328, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.32689793092839015, | |
| "eval_loss": 4.511099815368652, | |
| "eval_runtime": 18.7154, | |
| "eval_samples_per_second": 53.432, | |
| "eval_steps_per_second": 13.358, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.33074378893931233, | |
| "grad_norm": 2.3888497352600098, | |
| "learning_rate": 0.00011841402889599646, | |
| "loss": 4.5408, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.3345896469502346, | |
| "grad_norm": 3.128143548965454, | |
| "learning_rate": 0.00011815668541597735, | |
| "loss": 4.3879, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.3384355049611568, | |
| "grad_norm": 4.353067874908447, | |
| "learning_rate": 0.00011789934193595823, | |
| "loss": 4.5091, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.34228136297207906, | |
| "grad_norm": 4.771759986877441, | |
| "learning_rate": 0.00011764199845593911, | |
| "loss": 4.407, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.3461272209830013, | |
| "grad_norm": 2.9524829387664795, | |
| "learning_rate": 0.00011738465497591999, | |
| "loss": 4.3798, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.3461272209830013, | |
| "eval_loss": 4.479401588439941, | |
| "eval_runtime": 18.8172, | |
| "eval_samples_per_second": 53.143, | |
| "eval_steps_per_second": 13.286, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.34997307899392355, | |
| "grad_norm": 4.825377941131592, | |
| "learning_rate": 0.00011712731149590087, | |
| "loss": 4.5321, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.3538189370048458, | |
| "grad_norm": 3.5786240100860596, | |
| "learning_rate": 0.00011686996801588176, | |
| "loss": 4.5819, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.35766479501576803, | |
| "grad_norm": 4.445742130279541, | |
| "learning_rate": 0.00011661262453586264, | |
| "loss": 4.5954, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.3615106530266903, | |
| "grad_norm": 4.670301914215088, | |
| "learning_rate": 0.00011635528105584354, | |
| "loss": 4.3381, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.36535651103761246, | |
| "grad_norm": 3.0563037395477295, | |
| "learning_rate": 0.0001160979375758244, | |
| "loss": 4.4451, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.36535651103761246, | |
| "eval_loss": 4.503940582275391, | |
| "eval_runtime": 19.0274, | |
| "eval_samples_per_second": 52.556, | |
| "eval_steps_per_second": 13.139, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.3692023690485347, | |
| "grad_norm": 4.921920299530029, | |
| "learning_rate": 0.00011584059409580528, | |
| "loss": 4.5505, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.37304822705945695, | |
| "grad_norm": 4.440188407897949, | |
| "learning_rate": 0.00011558325061578617, | |
| "loss": 4.5339, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.3768940850703792, | |
| "grad_norm": 4.123379707336426, | |
| "learning_rate": 0.00011532590713576705, | |
| "loss": 4.5001, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.38073994308130144, | |
| "grad_norm": 3.6461265087127686, | |
| "learning_rate": 0.00011506856365574795, | |
| "loss": 4.4704, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.3845858010922237, | |
| "grad_norm": 4.586422443389893, | |
| "learning_rate": 0.00011481122017572883, | |
| "loss": 4.5607, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.3845858010922237, | |
| "eval_loss": 4.414160251617432, | |
| "eval_runtime": 18.6554, | |
| "eval_samples_per_second": 53.604, | |
| "eval_steps_per_second": 13.401, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.3884316591031459, | |
| "grad_norm": 2.658412456512451, | |
| "learning_rate": 0.00011455387669570971, | |
| "loss": 4.5453, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.39227751711406816, | |
| "grad_norm": 2.231886148452759, | |
| "learning_rate": 0.00011429653321569059, | |
| "loss": 4.5524, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.3961233751249904, | |
| "grad_norm": 4.202503204345703, | |
| "learning_rate": 0.00011403918973567147, | |
| "loss": 4.5274, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.39996923313591265, | |
| "grad_norm": 2.8525800704956055, | |
| "learning_rate": 0.00011378184625565236, | |
| "loss": 4.5095, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.40381509114683484, | |
| "grad_norm": 3.2517142295837402, | |
| "learning_rate": 0.00011352964964523362, | |
| "loss": 4.5043, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.40381509114683484, | |
| "eval_loss": 4.595612525939941, | |
| "eval_runtime": 18.9024, | |
| "eval_samples_per_second": 52.903, | |
| "eval_steps_per_second": 13.226, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.4076609491577571, | |
| "grad_norm": 5.091184616088867, | |
| "learning_rate": 0.00011327745303481488, | |
| "loss": 4.4768, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.4115068071686793, | |
| "grad_norm": 6.631587028503418, | |
| "learning_rate": 0.00011302010955479578, | |
| "loss": 4.5572, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.41535266517960157, | |
| "grad_norm": 3.529118299484253, | |
| "learning_rate": 0.00011276276607477666, | |
| "loss": 4.6685, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.4191985231905238, | |
| "grad_norm": 3.1017537117004395, | |
| "learning_rate": 0.00011250542259475754, | |
| "loss": 4.4271, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.42304438120144605, | |
| "grad_norm": 3.930664300918579, | |
| "learning_rate": 0.00011224807911473842, | |
| "loss": 4.5501, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.42304438120144605, | |
| "eval_loss": 4.486245632171631, | |
| "eval_runtime": 18.9209, | |
| "eval_samples_per_second": 52.851, | |
| "eval_steps_per_second": 13.213, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.4268902392123683, | |
| "grad_norm": 4.470078945159912, | |
| "learning_rate": 0.00011199073563471931, | |
| "loss": 4.543, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.43073609722329054, | |
| "grad_norm": 5.099395751953125, | |
| "learning_rate": 0.00011173339215470019, | |
| "loss": 4.4515, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.4345819552342128, | |
| "grad_norm": 3.210951805114746, | |
| "learning_rate": 0.00011147604867468107, | |
| "loss": 4.4605, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.43842781324513497, | |
| "grad_norm": 4.092874050140381, | |
| "learning_rate": 0.00011121870519466196, | |
| "loss": 4.6267, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.4422736712560572, | |
| "grad_norm": 2.756460666656494, | |
| "learning_rate": 0.00011096136171464283, | |
| "loss": 4.4338, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.4422736712560572, | |
| "eval_loss": 4.457804203033447, | |
| "eval_runtime": 18.7914, | |
| "eval_samples_per_second": 53.216, | |
| "eval_steps_per_second": 13.304, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.44611952926697945, | |
| "grad_norm": 5.140827178955078, | |
| "learning_rate": 0.00011070401823462372, | |
| "loss": 4.5102, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.4499653872779017, | |
| "grad_norm": 6.364997863769531, | |
| "learning_rate": 0.0001104466747546046, | |
| "loss": 4.5594, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.45381124528882394, | |
| "grad_norm": 5.3479695320129395, | |
| "learning_rate": 0.00011018933127458548, | |
| "loss": 4.4067, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.4576571032997462, | |
| "grad_norm": 3.728893518447876, | |
| "learning_rate": 0.00010993198779456637, | |
| "loss": 4.4689, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.4615029613106684, | |
| "grad_norm": 6.3881611824035645, | |
| "learning_rate": 0.00010967464431454724, | |
| "loss": 4.5641, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.4615029613106684, | |
| "eval_loss": 4.457447052001953, | |
| "eval_runtime": 18.8382, | |
| "eval_samples_per_second": 53.084, | |
| "eval_steps_per_second": 13.271, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.46534881932159067, | |
| "grad_norm": 3.6767919063568115, | |
| "learning_rate": 0.00010941730083452813, | |
| "loss": 4.5798, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.4691946773325129, | |
| "grad_norm": 3.8597254753112793, | |
| "learning_rate": 0.00010915995735450901, | |
| "loss": 4.5867, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.4730405353434351, | |
| "grad_norm": 2.8041980266571045, | |
| "learning_rate": 0.0001089026138744899, | |
| "loss": 4.4825, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.47688639335435734, | |
| "grad_norm": 3.3872950077056885, | |
| "learning_rate": 0.00010864527039447078, | |
| "loss": 4.5624, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.4807322513652796, | |
| "grad_norm": 3.698118209838867, | |
| "learning_rate": 0.00010838792691445166, | |
| "loss": 4.4889, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.4807322513652796, | |
| "eval_loss": 4.451441287994385, | |
| "eval_runtime": 19.2349, | |
| "eval_samples_per_second": 51.989, | |
| "eval_steps_per_second": 12.997, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.4845781093762018, | |
| "grad_norm": 3.7140421867370605, | |
| "learning_rate": 0.00010813058343443254, | |
| "loss": 4.4654, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.48842396738712407, | |
| "grad_norm": 3.095348834991455, | |
| "learning_rate": 0.00010787323995441342, | |
| "loss": 4.4761, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.4922698253980463, | |
| "grad_norm": 3.289018392562866, | |
| "learning_rate": 0.00010761589647439432, | |
| "loss": 4.5459, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.49611568340896856, | |
| "grad_norm": 3.9891817569732666, | |
| "learning_rate": 0.0001073585529943752, | |
| "loss": 4.3685, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.4999615414198908, | |
| "grad_norm": 4.315449237823486, | |
| "learning_rate": 0.00010710120951435608, | |
| "loss": 4.4197, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.4999615414198908, | |
| "eval_loss": 4.4507598876953125, | |
| "eval_runtime": 18.8652, | |
| "eval_samples_per_second": 53.008, | |
| "eval_steps_per_second": 13.252, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.503807399430813, | |
| "grad_norm": 4.299264430999756, | |
| "learning_rate": 0.00010684386603433697, | |
| "loss": 4.6103, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.5076532574417353, | |
| "grad_norm": 4.186795234680176, | |
| "learning_rate": 0.00010659166942391823, | |
| "loss": 4.5303, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.5114991154526575, | |
| "grad_norm": 2.925708293914795, | |
| "learning_rate": 0.00010633432594389911, | |
| "loss": 4.4265, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.5153449734635798, | |
| "grad_norm": 6.368393421173096, | |
| "learning_rate": 0.00010607698246388, | |
| "loss": 4.3358, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.519190831474502, | |
| "grad_norm": 4.947482585906982, | |
| "learning_rate": 0.00010581963898386088, | |
| "loss": 4.5812, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.519190831474502, | |
| "eval_loss": 4.466405868530273, | |
| "eval_runtime": 18.8333, | |
| "eval_samples_per_second": 53.097, | |
| "eval_steps_per_second": 13.274, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.5230366894854241, | |
| "grad_norm": 2.469914674758911, | |
| "learning_rate": 0.00010556229550384175, | |
| "loss": 4.3623, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.5268825474963464, | |
| "grad_norm": 5.027404308319092, | |
| "learning_rate": 0.00010530495202382264, | |
| "loss": 4.5466, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.5307284055072686, | |
| "grad_norm": 4.797220706939697, | |
| "learning_rate": 0.00010504760854380352, | |
| "loss": 4.4486, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.5345742635181909, | |
| "grad_norm": 5.403319358825684, | |
| "learning_rate": 0.00010479026506378442, | |
| "loss": 4.4919, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.5384201215291131, | |
| "grad_norm": 4.601899147033691, | |
| "learning_rate": 0.0001045329215837653, | |
| "loss": 4.4703, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.5384201215291131, | |
| "eval_loss": 4.411437034606934, | |
| "eval_runtime": 18.8691, | |
| "eval_samples_per_second": 52.997, | |
| "eval_steps_per_second": 13.249, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.5422659795400354, | |
| "grad_norm": 5.943952560424805, | |
| "learning_rate": 0.00010427557810374618, | |
| "loss": 4.3737, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.5461118375509576, | |
| "grad_norm": 4.010414123535156, | |
| "learning_rate": 0.00010401823462372706, | |
| "loss": 4.5472, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.5499576955618799, | |
| "grad_norm": 3.5218944549560547, | |
| "learning_rate": 0.00010376089114370794, | |
| "loss": 4.5854, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.5538035535728021, | |
| "grad_norm": 9.44631290435791, | |
| "learning_rate": 0.00010350354766368883, | |
| "loss": 4.3883, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.5576494115837243, | |
| "grad_norm": 4.5443434715271, | |
| "learning_rate": 0.00010324620418366971, | |
| "loss": 4.6685, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.5576494115837243, | |
| "eval_loss": 4.4039154052734375, | |
| "eval_runtime": 18.856, | |
| "eval_samples_per_second": 53.034, | |
| "eval_steps_per_second": 13.258, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.5614952695946466, | |
| "grad_norm": 3.646768569946289, | |
| "learning_rate": 0.0001029888607036506, | |
| "loss": 4.5259, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.5653411276055688, | |
| "grad_norm": 3.510744571685791, | |
| "learning_rate": 0.00010273151722363148, | |
| "loss": 4.4461, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.5691869856164911, | |
| "grad_norm": 3.874558687210083, | |
| "learning_rate": 0.00010247417374361235, | |
| "loss": 4.3743, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.5730328436274132, | |
| "grad_norm": 2.755722761154175, | |
| "learning_rate": 0.00010221683026359324, | |
| "loss": 4.4979, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.5768787016383355, | |
| "grad_norm": 3.5653252601623535, | |
| "learning_rate": 0.00010195948678357412, | |
| "loss": 4.5442, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.5768787016383355, | |
| "eval_loss": 4.44308614730835, | |
| "eval_runtime": 18.8004, | |
| "eval_samples_per_second": 53.19, | |
| "eval_steps_per_second": 13.298, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.5807245596492577, | |
| "grad_norm": 3.4961936473846436, | |
| "learning_rate": 0.00010170214330355501, | |
| "loss": 4.5194, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.58457041766018, | |
| "grad_norm": 2.529500961303711, | |
| "learning_rate": 0.00010144479982353589, | |
| "loss": 4.3337, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.5884162756711022, | |
| "grad_norm": 3.346160888671875, | |
| "learning_rate": 0.00010118745634351679, | |
| "loss": 4.5422, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.5922621336820244, | |
| "grad_norm": 3.8311049938201904, | |
| "learning_rate": 0.00010093011286349765, | |
| "loss": 4.4191, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.5961079916929467, | |
| "grad_norm": 4.324901580810547, | |
| "learning_rate": 0.00010067276938347853, | |
| "loss": 4.4613, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.5961079916929467, | |
| "eval_loss": 4.4118547439575195, | |
| "eval_runtime": 18.9517, | |
| "eval_samples_per_second": 52.766, | |
| "eval_steps_per_second": 13.191, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.5999538497038689, | |
| "grad_norm": 3.888192653656006, | |
| "learning_rate": 0.00010041542590345943, | |
| "loss": 4.5492, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.6037997077147912, | |
| "grad_norm": 2.718320608139038, | |
| "learning_rate": 0.0001001580824234403, | |
| "loss": 4.5371, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.6076455657257134, | |
| "grad_norm": 3.5970869064331055, | |
| "learning_rate": 9.99007389434212e-05, | |
| "loss": 4.4835, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.6114914237366357, | |
| "grad_norm": 4.563399314880371, | |
| "learning_rate": 9.964339546340208e-05, | |
| "loss": 4.4494, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.6153372817475579, | |
| "grad_norm": 5.080177307128906, | |
| "learning_rate": 9.938605198338294e-05, | |
| "loss": 4.6072, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.6153372817475579, | |
| "eval_loss": 4.428142547607422, | |
| "eval_runtime": 18.8815, | |
| "eval_samples_per_second": 52.962, | |
| "eval_steps_per_second": 13.241, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.6191831397584802, | |
| "grad_norm": 4.333257675170898, | |
| "learning_rate": 9.912870850336384e-05, | |
| "loss": 4.3148, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.6230289977694023, | |
| "grad_norm": 5.497674465179443, | |
| "learning_rate": 9.887136502334472e-05, | |
| "loss": 4.5952, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.6268748557803245, | |
| "grad_norm": 4.110482215881348, | |
| "learning_rate": 9.861402154332561e-05, | |
| "loss": 4.5036, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.6307207137912468, | |
| "grad_norm": 3.9359841346740723, | |
| "learning_rate": 9.835667806330649e-05, | |
| "loss": 4.409, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.634566571802169, | |
| "grad_norm": 4.095981597900391, | |
| "learning_rate": 9.809933458328738e-05, | |
| "loss": 4.3515, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.634566571802169, | |
| "eval_loss": 4.438499927520752, | |
| "eval_runtime": 18.9189, | |
| "eval_samples_per_second": 52.857, | |
| "eval_steps_per_second": 13.214, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.6384124298130913, | |
| "grad_norm": 4.357822895050049, | |
| "learning_rate": 9.784199110326825e-05, | |
| "loss": 4.3767, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.6422582878240135, | |
| "grad_norm": 3.039700508117676, | |
| "learning_rate": 9.758979449284952e-05, | |
| "loss": 4.4542, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.6461041458349358, | |
| "grad_norm": 6.7661919593811035, | |
| "learning_rate": 9.73324510128304e-05, | |
| "loss": 4.4073, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.649950003845858, | |
| "grad_norm": 4.223692893981934, | |
| "learning_rate": 9.70751075328113e-05, | |
| "loss": 4.4904, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.6537958618567803, | |
| "grad_norm": 4.621217250823975, | |
| "learning_rate": 9.681776405279216e-05, | |
| "loss": 4.7717, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.6537958618567803, | |
| "eval_loss": 4.392988204956055, | |
| "eval_runtime": 18.8399, | |
| "eval_samples_per_second": 53.079, | |
| "eval_steps_per_second": 13.27, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.6576417198677025, | |
| "grad_norm": 2.6913883686065674, | |
| "learning_rate": 9.656042057277304e-05, | |
| "loss": 4.4409, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.6614875778786247, | |
| "grad_norm": 3.749894618988037, | |
| "learning_rate": 9.630307709275394e-05, | |
| "loss": 4.5101, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.665333435889547, | |
| "grad_norm": 4.93977165222168, | |
| "learning_rate": 9.604573361273482e-05, | |
| "loss": 4.4504, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.6691792939004692, | |
| "grad_norm": 4.311313152313232, | |
| "learning_rate": 9.578839013271571e-05, | |
| "loss": 4.4857, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.6730251519113915, | |
| "grad_norm": 3.646656036376953, | |
| "learning_rate": 9.553104665269659e-05, | |
| "loss": 4.387, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.6730251519113915, | |
| "eval_loss": 4.401506423950195, | |
| "eval_runtime": 18.7931, | |
| "eval_samples_per_second": 53.211, | |
| "eval_steps_per_second": 13.303, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.6768710099223136, | |
| "grad_norm": 4.352843284606934, | |
| "learning_rate": 9.527370317267746e-05, | |
| "loss": 4.5279, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.6807168679332359, | |
| "grad_norm": 3.890216827392578, | |
| "learning_rate": 9.501635969265835e-05, | |
| "loss": 4.4485, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.6845627259441581, | |
| "grad_norm": 3.4119713306427, | |
| "learning_rate": 9.475901621263923e-05, | |
| "loss": 4.4428, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.6884085839550804, | |
| "grad_norm": 7.813595294952393, | |
| "learning_rate": 9.450167273262012e-05, | |
| "loss": 4.3308, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.6922544419660026, | |
| "grad_norm": 3.079829692840576, | |
| "learning_rate": 9.4244329252601e-05, | |
| "loss": 4.368, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.6922544419660026, | |
| "eval_loss": 4.393312931060791, | |
| "eval_runtime": 18.7727, | |
| "eval_samples_per_second": 53.269, | |
| "eval_steps_per_second": 13.317, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.6961002999769248, | |
| "grad_norm": 9.26623821258545, | |
| "learning_rate": 9.39869857725819e-05, | |
| "loss": 4.3073, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.6999461579878471, | |
| "grad_norm": 3.5981953144073486, | |
| "learning_rate": 9.372964229256276e-05, | |
| "loss": 4.3923, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.7037920159987693, | |
| "grad_norm": 3.734813690185547, | |
| "learning_rate": 9.347229881254364e-05, | |
| "loss": 4.2449, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.7076378740096916, | |
| "grad_norm": 5.646871566772461, | |
| "learning_rate": 9.321495533252453e-05, | |
| "loss": 4.3953, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.7114837320206138, | |
| "grad_norm": 4.284733295440674, | |
| "learning_rate": 9.295761185250541e-05, | |
| "loss": 4.475, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.7114837320206138, | |
| "eval_loss": 4.348310470581055, | |
| "eval_runtime": 19.0285, | |
| "eval_samples_per_second": 52.553, | |
| "eval_steps_per_second": 13.138, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.7153295900315361, | |
| "grad_norm": 5.92791223526001, | |
| "learning_rate": 9.27002683724863e-05, | |
| "loss": 4.5493, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.7191754480424583, | |
| "grad_norm": 4.768808841705322, | |
| "learning_rate": 9.244292489246719e-05, | |
| "loss": 4.2508, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.7230213060533806, | |
| "grad_norm": 3.473097562789917, | |
| "learning_rate": 9.218558141244805e-05, | |
| "loss": 4.4534, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.7268671640643027, | |
| "grad_norm": 10.189091682434082, | |
| "learning_rate": 9.192823793242895e-05, | |
| "loss": 4.3883, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.7307130220752249, | |
| "grad_norm": 1.9577853679656982, | |
| "learning_rate": 9.167089445240982e-05, | |
| "loss": 4.3191, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.7307130220752249, | |
| "eval_loss": 4.328299045562744, | |
| "eval_runtime": 18.8631, | |
| "eval_samples_per_second": 53.014, | |
| "eval_steps_per_second": 13.253, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.7345588800861472, | |
| "grad_norm": 3.9685990810394287, | |
| "learning_rate": 9.141355097239072e-05, | |
| "loss": 4.325, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.7384047380970694, | |
| "grad_norm": 5.303285121917725, | |
| "learning_rate": 9.11562074923716e-05, | |
| "loss": 4.4277, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.7422505961079917, | |
| "grad_norm": 2.70599627494812, | |
| "learning_rate": 9.089886401235249e-05, | |
| "loss": 4.4329, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.7460964541189139, | |
| "grad_norm": 4.711449146270752, | |
| "learning_rate": 9.064152053233336e-05, | |
| "loss": 4.251, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.7499423121298362, | |
| "grad_norm": 3.0169851779937744, | |
| "learning_rate": 9.038417705231424e-05, | |
| "loss": 4.3483, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.7499423121298362, | |
| "eval_loss": 4.341108322143555, | |
| "eval_runtime": 18.9063, | |
| "eval_samples_per_second": 52.893, | |
| "eval_steps_per_second": 13.223, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.7537881701407584, | |
| "grad_norm": 3.375880002975464, | |
| "learning_rate": 9.012683357229513e-05, | |
| "loss": 4.313, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.7576340281516807, | |
| "grad_norm": 1.707850456237793, | |
| "learning_rate": 8.986949009227601e-05, | |
| "loss": 4.3062, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.7614798861626029, | |
| "grad_norm": 3.6718738079071045, | |
| "learning_rate": 8.96121466122569e-05, | |
| "loss": 4.4415, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.7653257441735252, | |
| "grad_norm": 3.5382699966430664, | |
| "learning_rate": 8.935480313223778e-05, | |
| "loss": 4.3754, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.7691716021844474, | |
| "grad_norm": 4.678229808807373, | |
| "learning_rate": 8.909745965221865e-05, | |
| "loss": 4.4404, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.7691716021844474, | |
| "eval_loss": 4.3746819496154785, | |
| "eval_runtime": 18.7221, | |
| "eval_samples_per_second": 53.413, | |
| "eval_steps_per_second": 13.353, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.7730174601953695, | |
| "grad_norm": 3.490699529647827, | |
| "learning_rate": 8.884011617219954e-05, | |
| "loss": 4.5294, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 0.7768633182062918, | |
| "grad_norm": 4.614148139953613, | |
| "learning_rate": 8.858277269218042e-05, | |
| "loss": 4.2371, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.780709176217214, | |
| "grad_norm": 5.6906962394714355, | |
| "learning_rate": 8.832542921216132e-05, | |
| "loss": 4.5472, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 0.7845550342281363, | |
| "grad_norm": 4.382456302642822, | |
| "learning_rate": 8.80680857321422e-05, | |
| "loss": 4.4282, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.7884008922390585, | |
| "grad_norm": 4.546772003173828, | |
| "learning_rate": 8.781074225212309e-05, | |
| "loss": 4.4004, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 0.7884008922390585, | |
| "eval_loss": 4.373971462249756, | |
| "eval_runtime": 18.9303, | |
| "eval_samples_per_second": 52.825, | |
| "eval_steps_per_second": 13.206, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 0.7922467502499808, | |
| "grad_norm": 3.784317970275879, | |
| "learning_rate": 8.755339877210395e-05, | |
| "loss": 4.4422, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.796092608260903, | |
| "grad_norm": 3.11979341506958, | |
| "learning_rate": 8.729605529208483e-05, | |
| "loss": 4.4909, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 0.7999384662718253, | |
| "grad_norm": 4.9711012840271, | |
| "learning_rate": 8.703871181206573e-05, | |
| "loss": 4.2955, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.8037843242827475, | |
| "grad_norm": 3.7663426399230957, | |
| "learning_rate": 8.678136833204661e-05, | |
| "loss": 4.5105, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 0.8076301822936697, | |
| "grad_norm": 4.679628372192383, | |
| "learning_rate": 8.65240248520275e-05, | |
| "loss": 4.5038, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.8076301822936697, | |
| "eval_loss": 4.3565592765808105, | |
| "eval_runtime": 18.9119, | |
| "eval_samples_per_second": 52.877, | |
| "eval_steps_per_second": 13.219, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.811476040304592, | |
| "grad_norm": 4.561670303344727, | |
| "learning_rate": 8.626668137200838e-05, | |
| "loss": 4.6428, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 0.8153218983155142, | |
| "grad_norm": 3.155518054962158, | |
| "learning_rate": 8.600933789198925e-05, | |
| "loss": 4.4605, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.8191677563264365, | |
| "grad_norm": 4.021768093109131, | |
| "learning_rate": 8.575199441197014e-05, | |
| "loss": 4.2982, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 0.8230136143373586, | |
| "grad_norm": 4.348796844482422, | |
| "learning_rate": 8.549465093195102e-05, | |
| "loss": 4.649, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.8268594723482809, | |
| "grad_norm": 4.647562503814697, | |
| "learning_rate": 8.523730745193191e-05, | |
| "loss": 4.2873, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 0.8268594723482809, | |
| "eval_loss": 4.3483662605285645, | |
| "eval_runtime": 18.9227, | |
| "eval_samples_per_second": 52.847, | |
| "eval_steps_per_second": 13.212, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 0.8307053303592031, | |
| "grad_norm": 3.9260427951812744, | |
| "learning_rate": 8.497996397191279e-05, | |
| "loss": 4.3823, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.8345511883701254, | |
| "grad_norm": 3.7108564376831055, | |
| "learning_rate": 8.472262049189368e-05, | |
| "loss": 4.42, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 0.8383970463810476, | |
| "grad_norm": 4.9123663902282715, | |
| "learning_rate": 8.446527701187455e-05, | |
| "loss": 4.5828, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.8422429043919698, | |
| "grad_norm": 3.7289183139801025, | |
| "learning_rate": 8.420793353185543e-05, | |
| "loss": 4.3134, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 0.8460887624028921, | |
| "grad_norm": 4.0350542068481445, | |
| "learning_rate": 8.395059005183632e-05, | |
| "loss": 4.3768, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.8460887624028921, | |
| "eval_loss": 4.307990074157715, | |
| "eval_runtime": 18.7713, | |
| "eval_samples_per_second": 53.273, | |
| "eval_steps_per_second": 13.318, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.8499346204138143, | |
| "grad_norm": 5.336431503295898, | |
| "learning_rate": 8.36983934414176e-05, | |
| "loss": 4.3977, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 0.8537804784247366, | |
| "grad_norm": 4.175157070159912, | |
| "learning_rate": 8.344104996139847e-05, | |
| "loss": 4.4053, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.8576263364356588, | |
| "grad_norm": 4.384688377380371, | |
| "learning_rate": 8.318370648137934e-05, | |
| "loss": 4.26, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 0.8614721944465811, | |
| "grad_norm": 3.6022467613220215, | |
| "learning_rate": 8.292636300136024e-05, | |
| "loss": 4.2993, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.8653180524575033, | |
| "grad_norm": 4.252429485321045, | |
| "learning_rate": 8.266901952134112e-05, | |
| "loss": 4.299, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 0.8653180524575033, | |
| "eval_loss": 4.334308624267578, | |
| "eval_runtime": 18.9071, | |
| "eval_samples_per_second": 52.89, | |
| "eval_steps_per_second": 13.223, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 0.8691639104684256, | |
| "grad_norm": 3.4003775119781494, | |
| "learning_rate": 8.241167604132201e-05, | |
| "loss": 4.2806, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.8730097684793477, | |
| "grad_norm": 3.7436835765838623, | |
| "learning_rate": 8.215433256130289e-05, | |
| "loss": 4.2694, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 0.8768556264902699, | |
| "grad_norm": 2.8963701725006104, | |
| "learning_rate": 8.189698908128376e-05, | |
| "loss": 4.362, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.8807014845011922, | |
| "grad_norm": 3.3496339321136475, | |
| "learning_rate": 8.163964560126465e-05, | |
| "loss": 4.3698, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 0.8845473425121144, | |
| "grad_norm": 4.4007487297058105, | |
| "learning_rate": 8.138230212124553e-05, | |
| "loss": 4.2994, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.8845473425121144, | |
| "eval_loss": 4.315768241882324, | |
| "eval_runtime": 18.8056, | |
| "eval_samples_per_second": 53.176, | |
| "eval_steps_per_second": 13.294, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.8883932005230367, | |
| "grad_norm": 5.072123050689697, | |
| "learning_rate": 8.112495864122642e-05, | |
| "loss": 4.5564, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 0.8922390585339589, | |
| "grad_norm": 3.130788564682007, | |
| "learning_rate": 8.08676151612073e-05, | |
| "loss": 4.427, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.8960849165448812, | |
| "grad_norm": 2.615147352218628, | |
| "learning_rate": 8.06102716811882e-05, | |
| "loss": 4.3831, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 0.8999307745558034, | |
| "grad_norm": 8.039403915405273, | |
| "learning_rate": 8.035292820116906e-05, | |
| "loss": 4.3388, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.9037766325667257, | |
| "grad_norm": 2.6177854537963867, | |
| "learning_rate": 8.009558472114994e-05, | |
| "loss": 1.4931, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 0.9037766325667257, | |
| "eval_loss": 1.534182071685791, | |
| "eval_runtime": 18.0719, | |
| "eval_samples_per_second": 55.335, | |
| "eval_steps_per_second": 13.834, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 0.9076224905776479, | |
| "grad_norm": 1.4090014696121216, | |
| "learning_rate": 7.983824124113084e-05, | |
| "loss": 1.5524, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.9114683485885701, | |
| "grad_norm": 1.4773452281951904, | |
| "learning_rate": 7.958089776111171e-05, | |
| "loss": 1.4703, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 0.9153142065994924, | |
| "grad_norm": 1.7350648641586304, | |
| "learning_rate": 7.932355428109261e-05, | |
| "loss": 1.4752, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.9191600646104146, | |
| "grad_norm": 1.9704972505569458, | |
| "learning_rate": 7.906621080107349e-05, | |
| "loss": 1.5257, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 0.9230059226213368, | |
| "grad_norm": 1.6183151006698608, | |
| "learning_rate": 7.880886732105437e-05, | |
| "loss": 1.4704, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.9230059226213368, | |
| "eval_loss": 1.5159597396850586, | |
| "eval_runtime": 17.891, | |
| "eval_samples_per_second": 55.894, | |
| "eval_steps_per_second": 13.974, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.926851780632259, | |
| "grad_norm": 1.736138939857483, | |
| "learning_rate": 7.855152384103525e-05, | |
| "loss": 1.5304, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 0.9306976386431813, | |
| "grad_norm": 1.807916283607483, | |
| "learning_rate": 7.829418036101613e-05, | |
| "loss": 1.4984, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.9345434966541035, | |
| "grad_norm": 1.1977109909057617, | |
| "learning_rate": 7.803683688099702e-05, | |
| "loss": 1.4307, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 0.9383893546650258, | |
| "grad_norm": 0.8386535048484802, | |
| "learning_rate": 7.77794934009779e-05, | |
| "loss": 1.444, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.942235212675948, | |
| "grad_norm": 1.395053744316101, | |
| "learning_rate": 7.752214992095878e-05, | |
| "loss": 1.4866, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 0.942235212675948, | |
| "eval_loss": 1.5108226537704468, | |
| "eval_runtime": 18.0888, | |
| "eval_samples_per_second": 55.283, | |
| "eval_steps_per_second": 13.821, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 0.9460810706868702, | |
| "grad_norm": 1.5271111726760864, | |
| "learning_rate": 7.726480644093966e-05, | |
| "loss": 1.4849, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.9499269286977925, | |
| "grad_norm": 3.0610506534576416, | |
| "learning_rate": 7.700746296092054e-05, | |
| "loss": 1.4613, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 0.9537727867087147, | |
| "grad_norm": 1.8968026638031006, | |
| "learning_rate": 7.675011948090143e-05, | |
| "loss": 1.591, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.957618644719637, | |
| "grad_norm": 1.748979926109314, | |
| "learning_rate": 7.649277600088231e-05, | |
| "loss": 1.4781, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 0.9614645027305592, | |
| "grad_norm": 1.6586661338806152, | |
| "learning_rate": 7.62354325208632e-05, | |
| "loss": 1.4668, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.9614645027305592, | |
| "eval_loss": 1.5503162145614624, | |
| "eval_runtime": 17.9222, | |
| "eval_samples_per_second": 55.797, | |
| "eval_steps_per_second": 13.949, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.9653103607414815, | |
| "grad_norm": 7.388810634613037, | |
| "learning_rate": 7.597808904084407e-05, | |
| "loss": 1.5683, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 0.9691562187524037, | |
| "grad_norm": 1.5548075437545776, | |
| "learning_rate": 7.572074556082496e-05, | |
| "loss": 1.4956, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.973002076763326, | |
| "grad_norm": 1.5935887098312378, | |
| "learning_rate": 7.546340208080584e-05, | |
| "loss": 1.5363, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 0.9768479347742481, | |
| "grad_norm": 1.985238790512085, | |
| "learning_rate": 7.520605860078672e-05, | |
| "loss": 1.5314, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.9806937927851703, | |
| "grad_norm": 1.5040565729141235, | |
| "learning_rate": 7.494871512076762e-05, | |
| "loss": 1.5108, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 0.9806937927851703, | |
| "eval_loss": 1.5085468292236328, | |
| "eval_runtime": 18.0531, | |
| "eval_samples_per_second": 55.392, | |
| "eval_steps_per_second": 13.848, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 0.9845396507960926, | |
| "grad_norm": 1.2956914901733398, | |
| "learning_rate": 7.46913716407485e-05, | |
| "loss": 1.4287, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.9883855088070148, | |
| "grad_norm": 1.1903409957885742, | |
| "learning_rate": 7.443402816072938e-05, | |
| "loss": 1.5583, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 0.9922313668179371, | |
| "grad_norm": 1.9069184064865112, | |
| "learning_rate": 7.417668468071026e-05, | |
| "loss": 1.5214, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.9960772248288593, | |
| "grad_norm": 1.7362926006317139, | |
| "learning_rate": 7.391934120069114e-05, | |
| "loss": 1.55, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 0.9999230828397816, | |
| "grad_norm": 1.2136348485946655, | |
| "learning_rate": 7.366199772067203e-05, | |
| "loss": 1.5035, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.9999230828397816, | |
| "eval_loss": 1.5033278465270996, | |
| "eval_runtime": 18.189, | |
| "eval_samples_per_second": 54.978, | |
| "eval_steps_per_second": 13.745, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.0037689408507038, | |
| "grad_norm": 1.291033387184143, | |
| "learning_rate": 7.340465424065291e-05, | |
| "loss": 1.4455, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 1.007614798861626, | |
| "grad_norm": 1.247129201889038, | |
| "learning_rate": 7.31473107606338e-05, | |
| "loss": 1.4629, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.0114606568725482, | |
| "grad_norm": 1.2177772521972656, | |
| "learning_rate": 7.288996728061467e-05, | |
| "loss": 1.5715, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 1.0153065148834706, | |
| "grad_norm": 1.2471716403961182, | |
| "learning_rate": 7.263262380059556e-05, | |
| "loss": 1.4244, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.0191523728943928, | |
| "grad_norm": 0.8932450413703918, | |
| "learning_rate": 7.237528032057644e-05, | |
| "loss": 1.4278, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 1.0191523728943928, | |
| "eval_loss": 1.5201970338821411, | |
| "eval_runtime": 17.9356, | |
| "eval_samples_per_second": 55.755, | |
| "eval_steps_per_second": 13.939, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 1.022998230905315, | |
| "grad_norm": 1.9957834482192993, | |
| "learning_rate": 7.211793684055732e-05, | |
| "loss": 1.5017, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.0268440889162371, | |
| "grad_norm": 1.432619571685791, | |
| "learning_rate": 7.186059336053821e-05, | |
| "loss": 1.4271, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 1.0306899469271595, | |
| "grad_norm": 1.3298619985580444, | |
| "learning_rate": 7.16032498805191e-05, | |
| "loss": 1.5726, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 1.0345358049380817, | |
| "grad_norm": 10.102746963500977, | |
| "learning_rate": 7.134590640049997e-05, | |
| "loss": 1.3938, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 1.038381662949004, | |
| "grad_norm": 1.9288721084594727, | |
| "learning_rate": 7.108856292048085e-05, | |
| "loss": 1.4264, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.038381662949004, | |
| "eval_loss": 1.5168194770812988, | |
| "eval_runtime": 18.139, | |
| "eval_samples_per_second": 55.13, | |
| "eval_steps_per_second": 13.782, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.042227520959926, | |
| "grad_norm": 2.8053858280181885, | |
| "learning_rate": 7.083121944046175e-05, | |
| "loss": 1.5338, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 1.0460733789708483, | |
| "grad_norm": 1.2761131525039673, | |
| "learning_rate": 7.057387596044263e-05, | |
| "loss": 1.4137, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.0499192369817707, | |
| "grad_norm": 1.614910364151001, | |
| "learning_rate": 7.03165324804235e-05, | |
| "loss": 1.4634, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 1.0537650949926929, | |
| "grad_norm": 1.8560376167297363, | |
| "learning_rate": 7.00591890004044e-05, | |
| "loss": 1.5173, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.057610953003615, | |
| "grad_norm": 1.3471609354019165, | |
| "learning_rate": 6.980184552038528e-05, | |
| "loss": 1.4887, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.057610953003615, | |
| "eval_loss": 1.5006794929504395, | |
| "eval_runtime": 18.2151, | |
| "eval_samples_per_second": 54.9, | |
| "eval_steps_per_second": 13.725, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.0614568110145373, | |
| "grad_norm": 1.661996841430664, | |
| "learning_rate": 6.954450204036616e-05, | |
| "loss": 1.4428, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.0653026690254597, | |
| "grad_norm": 1.2982336282730103, | |
| "learning_rate": 6.928715856034704e-05, | |
| "loss": 1.4565, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 1.0691485270363819, | |
| "grad_norm": 0.9250918626785278, | |
| "learning_rate": 6.902981508032792e-05, | |
| "loss": 1.5353, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.072994385047304, | |
| "grad_norm": 1.8084945678710938, | |
| "learning_rate": 6.877247160030881e-05, | |
| "loss": 1.5047, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 1.0768402430582262, | |
| "grad_norm": 1.1049927473068237, | |
| "learning_rate": 6.851512812028969e-05, | |
| "loss": 1.5058, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.0768402430582262, | |
| "eval_loss": 1.5043680667877197, | |
| "eval_runtime": 18.1464, | |
| "eval_samples_per_second": 55.107, | |
| "eval_steps_per_second": 13.777, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.0806861010691486, | |
| "grad_norm": 1.7406409978866577, | |
| "learning_rate": 6.825778464027057e-05, | |
| "loss": 1.3945, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 1.0845319590800708, | |
| "grad_norm": 1.1657389402389526, | |
| "learning_rate": 6.800044116025146e-05, | |
| "loss": 1.4528, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 1.088377817090993, | |
| "grad_norm": 1.380635380744934, | |
| "learning_rate": 6.774309768023234e-05, | |
| "loss": 1.442, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 1.0922236751019152, | |
| "grad_norm": 1.7555848360061646, | |
| "learning_rate": 6.748575420021322e-05, | |
| "loss": 1.5061, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.0960695331128374, | |
| "grad_norm": 1.6465975046157837, | |
| "learning_rate": 6.72284107201941e-05, | |
| "loss": 1.5004, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.0960695331128374, | |
| "eval_loss": 1.5090863704681396, | |
| "eval_runtime": 18.0174, | |
| "eval_samples_per_second": 55.502, | |
| "eval_steps_per_second": 13.876, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.0999153911237598, | |
| "grad_norm": 2.0214383602142334, | |
| "learning_rate": 6.697106724017498e-05, | |
| "loss": 1.5436, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.103761249134682, | |
| "grad_norm": 1.399170160293579, | |
| "learning_rate": 6.671372376015588e-05, | |
| "loss": 1.5242, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 1.1076071071456042, | |
| "grad_norm": 2.1806626319885254, | |
| "learning_rate": 6.645638028013676e-05, | |
| "loss": 1.4609, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.1114529651565264, | |
| "grad_norm": 1.1671562194824219, | |
| "learning_rate": 6.619903680011763e-05, | |
| "loss": 1.3789, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 1.1152988231674485, | |
| "grad_norm": 1.0041520595550537, | |
| "learning_rate": 6.594169332009851e-05, | |
| "loss": 1.4909, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.1152988231674485, | |
| "eval_loss": 1.509366750717163, | |
| "eval_runtime": 18.0148, | |
| "eval_samples_per_second": 55.51, | |
| "eval_steps_per_second": 13.877, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.119144681178371, | |
| "grad_norm": 1.9716360569000244, | |
| "learning_rate": 6.568434984007941e-05, | |
| "loss": 1.5349, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 1.1229905391892931, | |
| "grad_norm": 0.710033655166626, | |
| "learning_rate": 6.542700636006029e-05, | |
| "loss": 1.4107, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.1268363972002153, | |
| "grad_norm": 1.4398375749588013, | |
| "learning_rate": 6.516966288004117e-05, | |
| "loss": 1.4185, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 1.1306822552111375, | |
| "grad_norm": 2.5566532611846924, | |
| "learning_rate": 6.491231940002206e-05, | |
| "loss": 1.5758, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.13452811322206, | |
| "grad_norm": 1.2500799894332886, | |
| "learning_rate": 6.465497592000294e-05, | |
| "loss": 1.4751, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.13452811322206, | |
| "eval_loss": 1.4990500211715698, | |
| "eval_runtime": 17.9979, | |
| "eval_samples_per_second": 55.562, | |
| "eval_steps_per_second": 13.891, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.1383739712329821, | |
| "grad_norm": 1.5937495231628418, | |
| "learning_rate": 6.439763243998382e-05, | |
| "loss": 1.5215, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.1422198292439043, | |
| "grad_norm": 1.362358570098877, | |
| "learning_rate": 6.41402889599647e-05, | |
| "loss": 1.5125, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 1.1460656872548265, | |
| "grad_norm": 2.1192502975463867, | |
| "learning_rate": 6.388294547994558e-05, | |
| "loss": 1.4485, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.149911545265749, | |
| "grad_norm": 1.4089174270629883, | |
| "learning_rate": 6.362560199992647e-05, | |
| "loss": 1.5331, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 1.153757403276671, | |
| "grad_norm": 1.3750373125076294, | |
| "learning_rate": 6.336825851990735e-05, | |
| "loss": 1.5177, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.153757403276671, | |
| "eval_loss": 1.5118192434310913, | |
| "eval_runtime": 17.9213, | |
| "eval_samples_per_second": 55.799, | |
| "eval_steps_per_second": 13.95, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.1576032612875933, | |
| "grad_norm": 1.5460007190704346, | |
| "learning_rate": 6.311091503988823e-05, | |
| "loss": 1.442, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 1.1614491192985155, | |
| "grad_norm": 1.001439094543457, | |
| "learning_rate": 6.285357155986911e-05, | |
| "loss": 1.5308, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.1652949773094377, | |
| "grad_norm": 0.8740602731704712, | |
| "learning_rate": 6.259622807985e-05, | |
| "loss": 1.455, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 1.16914083532036, | |
| "grad_norm": 2.034207820892334, | |
| "learning_rate": 6.233888459983088e-05, | |
| "loss": 1.5089, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 1.1729866933312822, | |
| "grad_norm": 1.8656599521636963, | |
| "learning_rate": 6.208154111981176e-05, | |
| "loss": 1.5368, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 1.1729866933312822, | |
| "eval_loss": 1.4986381530761719, | |
| "eval_runtime": 18.1736, | |
| "eval_samples_per_second": 55.025, | |
| "eval_steps_per_second": 13.756, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 1.1768325513422044, | |
| "grad_norm": 1.2697277069091797, | |
| "learning_rate": 6.182419763979266e-05, | |
| "loss": 1.4239, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 1.1806784093531266, | |
| "grad_norm": 1.1131771802902222, | |
| "learning_rate": 6.156685415977354e-05, | |
| "loss": 1.4309, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 1.1845242673640488, | |
| "grad_norm": 1.5322145223617554, | |
| "learning_rate": 6.130951067975442e-05, | |
| "loss": 1.4793, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 1.1883701253749712, | |
| "grad_norm": 1.1703407764434814, | |
| "learning_rate": 6.10521671997353e-05, | |
| "loss": 1.4761, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 1.1922159833858934, | |
| "grad_norm": 1.4056655168533325, | |
| "learning_rate": 6.079482371971618e-05, | |
| "loss": 1.5311, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.1922159833858934, | |
| "eval_loss": 1.4925825595855713, | |
| "eval_runtime": 18.2116, | |
| "eval_samples_per_second": 54.91, | |
| "eval_steps_per_second": 13.727, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.1960618413968156, | |
| "grad_norm": 2.7062911987304688, | |
| "learning_rate": 6.053748023969707e-05, | |
| "loss": 1.4145, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 1.1999076994077378, | |
| "grad_norm": 1.5163620710372925, | |
| "learning_rate": 6.028013675967794e-05, | |
| "loss": 1.4322, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 1.2037535574186602, | |
| "grad_norm": 1.342063546180725, | |
| "learning_rate": 6.002279327965883e-05, | |
| "loss": 1.4696, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 1.2075994154295824, | |
| "grad_norm": 1.8180099725723267, | |
| "learning_rate": 5.9765449799639715e-05, | |
| "loss": 1.4647, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 1.2114452734405046, | |
| "grad_norm": 1.951982855796814, | |
| "learning_rate": 5.9508106319620595e-05, | |
| "loss": 1.5141, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 1.2114452734405046, | |
| "eval_loss": 1.4893933534622192, | |
| "eval_runtime": 18.1951, | |
| "eval_samples_per_second": 54.96, | |
| "eval_steps_per_second": 13.74, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 1.2152911314514268, | |
| "grad_norm": 1.7536894083023071, | |
| "learning_rate": 5.925076283960148e-05, | |
| "loss": 1.514, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 1.2191369894623492, | |
| "grad_norm": 1.1857939958572388, | |
| "learning_rate": 5.899341935958237e-05, | |
| "loss": 1.4745, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 1.2229828474732714, | |
| "grad_norm": 1.2500842809677124, | |
| "learning_rate": 5.873607587956324e-05, | |
| "loss": 1.4325, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 1.2268287054841935, | |
| "grad_norm": 2.025336742401123, | |
| "learning_rate": 5.847873239954413e-05, | |
| "loss": 1.4913, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 1.2306745634951157, | |
| "grad_norm": 1.1440426111221313, | |
| "learning_rate": 5.8221388919525014e-05, | |
| "loss": 1.451, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.2306745634951157, | |
| "eval_loss": 1.492313265800476, | |
| "eval_runtime": 18.0024, | |
| "eval_samples_per_second": 55.548, | |
| "eval_steps_per_second": 13.887, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.234520421506038, | |
| "grad_norm": 1.1019631624221802, | |
| "learning_rate": 5.796404543950589e-05, | |
| "loss": 1.3918, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 1.2383662795169603, | |
| "grad_norm": 1.7206593751907349, | |
| "learning_rate": 5.770670195948678e-05, | |
| "loss": 1.4726, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 1.2422121375278825, | |
| "grad_norm": 1.9747880697250366, | |
| "learning_rate": 5.7449358479467666e-05, | |
| "loss": 1.4829, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 1.2460579955388047, | |
| "grad_norm": 1.605573058128357, | |
| "learning_rate": 5.719201499944854e-05, | |
| "loss": 1.4476, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 1.2499038535497269, | |
| "grad_norm": 1.180405616760254, | |
| "learning_rate": 5.6934671519429426e-05, | |
| "loss": 1.3904, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 1.2499038535497269, | |
| "eval_loss": 1.4850120544433594, | |
| "eval_runtime": 18.0422, | |
| "eval_samples_per_second": 55.426, | |
| "eval_steps_per_second": 13.856, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 1.253749711560649, | |
| "grad_norm": 1.9959101676940918, | |
| "learning_rate": 5.667732803941031e-05, | |
| "loss": 1.4512, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 1.2575955695715715, | |
| "grad_norm": 1.8853541612625122, | |
| "learning_rate": 5.641998455939119e-05, | |
| "loss": 1.458, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 1.2614414275824937, | |
| "grad_norm": 1.4618902206420898, | |
| "learning_rate": 5.616264107937208e-05, | |
| "loss": 1.4968, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 1.2652872855934159, | |
| "grad_norm": 1.4913650751113892, | |
| "learning_rate": 5.5905297599352965e-05, | |
| "loss": 1.3966, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 1.2691331436043383, | |
| "grad_norm": 1.3095403909683228, | |
| "learning_rate": 5.564795411933384e-05, | |
| "loss": 1.4484, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.2691331436043383, | |
| "eval_loss": 1.4897910356521606, | |
| "eval_runtime": 18.0248, | |
| "eval_samples_per_second": 55.479, | |
| "eval_steps_per_second": 13.87, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.2729790016152602, | |
| "grad_norm": 1.4080452919006348, | |
| "learning_rate": 5.5390610639314724e-05, | |
| "loss": 1.4667, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 1.2768248596261826, | |
| "grad_norm": 1.6634443998336792, | |
| "learning_rate": 5.513326715929561e-05, | |
| "loss": 1.4619, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 1.2806707176371048, | |
| "grad_norm": 2.0469400882720947, | |
| "learning_rate": 5.487592367927649e-05, | |
| "loss": 1.4105, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 1.284516575648027, | |
| "grad_norm": 1.5735753774642944, | |
| "learning_rate": 5.461858019925738e-05, | |
| "loss": 1.4002, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 1.2883624336589494, | |
| "grad_norm": 1.43183434009552, | |
| "learning_rate": 5.436123671923826e-05, | |
| "loss": 1.4586, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 1.2883624336589494, | |
| "eval_loss": 1.4708431959152222, | |
| "eval_runtime": 18.2152, | |
| "eval_samples_per_second": 54.899, | |
| "eval_steps_per_second": 13.725, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 1.2922082916698716, | |
| "grad_norm": 1.6342015266418457, | |
| "learning_rate": 5.4103893239219136e-05, | |
| "loss": 1.4113, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 1.2960541496807938, | |
| "grad_norm": 3.80155873298645, | |
| "learning_rate": 5.384654975920002e-05, | |
| "loss": 1.4793, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 1.299900007691716, | |
| "grad_norm": 1.4240097999572754, | |
| "learning_rate": 5.358920627918091e-05, | |
| "loss": 1.4072, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.3037458657026382, | |
| "grad_norm": 1.4548074007034302, | |
| "learning_rate": 5.333186279916179e-05, | |
| "loss": 1.4275, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 1.3075917237135606, | |
| "grad_norm": 1.7287901639938354, | |
| "learning_rate": 5.3074519319142675e-05, | |
| "loss": 1.4741, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.3075917237135606, | |
| "eval_loss": 1.4836150407791138, | |
| "eval_runtime": 18.0219, | |
| "eval_samples_per_second": 55.488, | |
| "eval_steps_per_second": 13.872, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.3114375817244828, | |
| "grad_norm": 1.732088327407837, | |
| "learning_rate": 5.281717583912356e-05, | |
| "loss": 1.5014, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 1.315283439735405, | |
| "grad_norm": 2.144697427749634, | |
| "learning_rate": 5.2559832359104435e-05, | |
| "loss": 1.4436, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.3191292977463271, | |
| "grad_norm": 1.649965763092041, | |
| "learning_rate": 5.230248887908532e-05, | |
| "loss": 1.4334, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 1.3229751557572493, | |
| "grad_norm": 0.8667518496513367, | |
| "learning_rate": 5.204514539906621e-05, | |
| "loss": 1.487, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.3268210137681717, | |
| "grad_norm": 1.4567649364471436, | |
| "learning_rate": 5.178780191904709e-05, | |
| "loss": 1.4714, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 1.3268210137681717, | |
| "eval_loss": 1.479749321937561, | |
| "eval_runtime": 17.9466, | |
| "eval_samples_per_second": 55.721, | |
| "eval_steps_per_second": 13.93, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 1.330666871779094, | |
| "grad_norm": 1.8523489236831665, | |
| "learning_rate": 5.1530458439027974e-05, | |
| "loss": 1.4718, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.3345127297900161, | |
| "grad_norm": 1.091204047203064, | |
| "learning_rate": 5.127311495900886e-05, | |
| "loss": 1.4012, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 1.3383585878009385, | |
| "grad_norm": 1.8271427154541016, | |
| "learning_rate": 5.101577147898973e-05, | |
| "loss": 1.4547, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.3422044458118605, | |
| "grad_norm": 1.8682465553283691, | |
| "learning_rate": 5.075842799897062e-05, | |
| "loss": 1.4373, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 1.346050303822783, | |
| "grad_norm": 2.1932857036590576, | |
| "learning_rate": 5.0501084518951506e-05, | |
| "loss": 1.4628, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.346050303822783, | |
| "eval_loss": 1.4871113300323486, | |
| "eval_runtime": 17.9165, | |
| "eval_samples_per_second": 55.814, | |
| "eval_steps_per_second": 13.954, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.349896161833705, | |
| "grad_norm": 1.6970813274383545, | |
| "learning_rate": 5.0243741038932386e-05, | |
| "loss": 1.4442, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 1.3537420198446273, | |
| "grad_norm": 1.0942292213439941, | |
| "learning_rate": 4.998639755891327e-05, | |
| "loss": 1.4769, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.3575878778555497, | |
| "grad_norm": 1.720035195350647, | |
| "learning_rate": 4.972905407889416e-05, | |
| "loss": 1.4519, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 1.3614337358664719, | |
| "grad_norm": 0.8887185454368591, | |
| "learning_rate": 4.947171059887503e-05, | |
| "loss": 1.4201, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.365279593877394, | |
| "grad_norm": 1.9557030200958252, | |
| "learning_rate": 4.921436711885592e-05, | |
| "loss": 1.4848, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 1.365279593877394, | |
| "eval_loss": 1.476893424987793, | |
| "eval_runtime": 17.9988, | |
| "eval_samples_per_second": 55.559, | |
| "eval_steps_per_second": 13.89, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 1.3691254518883162, | |
| "grad_norm": 1.471414566040039, | |
| "learning_rate": 4.8957023638836804e-05, | |
| "loss": 1.4541, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.3729713098992384, | |
| "grad_norm": 1.350690484046936, | |
| "learning_rate": 4.8699680158817684e-05, | |
| "loss": 1.3954, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 1.3768171679101608, | |
| "grad_norm": 0.7363431453704834, | |
| "learning_rate": 4.844233667879857e-05, | |
| "loss": 1.4919, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.380663025921083, | |
| "grad_norm": 1.8820909261703491, | |
| "learning_rate": 4.818499319877946e-05, | |
| "loss": 1.4177, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 1.3845088839320052, | |
| "grad_norm": 0.8440986275672913, | |
| "learning_rate": 4.792764971876033e-05, | |
| "loss": 1.3995, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.3845088839320052, | |
| "eval_loss": 1.4794726371765137, | |
| "eval_runtime": 17.9989, | |
| "eval_samples_per_second": 55.559, | |
| "eval_steps_per_second": 13.89, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.3883547419429274, | |
| "grad_norm": 1.6790105104446411, | |
| "learning_rate": 4.7670306238741216e-05, | |
| "loss": 1.4791, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 1.3922005999538496, | |
| "grad_norm": 1.1840436458587646, | |
| "learning_rate": 4.74129627587221e-05, | |
| "loss": 1.4021, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.396046457964772, | |
| "grad_norm": 1.7883968353271484, | |
| "learning_rate": 4.715561927870298e-05, | |
| "loss": 1.4637, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 1.3998923159756942, | |
| "grad_norm": 1.2177505493164062, | |
| "learning_rate": 4.689827579868387e-05, | |
| "loss": 1.5123, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.4037381739866164, | |
| "grad_norm": 1.439232349395752, | |
| "learning_rate": 4.6640932318664756e-05, | |
| "loss": 1.4579, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 1.4037381739866164, | |
| "eval_loss": 1.4953014850616455, | |
| "eval_runtime": 17.9127, | |
| "eval_samples_per_second": 55.826, | |
| "eval_steps_per_second": 13.957, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 1.4075840319975388, | |
| "grad_norm": 2.0796408653259277, | |
| "learning_rate": 4.638358883864563e-05, | |
| "loss": 1.4295, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.4114298900084608, | |
| "grad_norm": 1.3032926321029663, | |
| "learning_rate": 4.6126245358626515e-05, | |
| "loss": 1.4733, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 1.4152757480193832, | |
| "grad_norm": 0.9058660864830017, | |
| "learning_rate": 4.58689018786074e-05, | |
| "loss": 1.4446, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.4191216060303053, | |
| "grad_norm": 2.05460786819458, | |
| "learning_rate": 4.561155839858828e-05, | |
| "loss": 1.4133, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 1.4229674640412275, | |
| "grad_norm": 0.8309249877929688, | |
| "learning_rate": 4.535421491856917e-05, | |
| "loss": 1.456, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.4229674640412275, | |
| "eval_loss": 1.480312466621399, | |
| "eval_runtime": 18.2137, | |
| "eval_samples_per_second": 54.904, | |
| "eval_steps_per_second": 13.726, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.42681332205215, | |
| "grad_norm": 1.0496591329574585, | |
| "learning_rate": 4.5096871438550054e-05, | |
| "loss": 1.3723, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 1.4306591800630721, | |
| "grad_norm": 1.273758053779602, | |
| "learning_rate": 4.483952795853093e-05, | |
| "loss": 1.4747, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 1.4345050380739943, | |
| "grad_norm": 1.3594483137130737, | |
| "learning_rate": 4.458218447851181e-05, | |
| "loss": 1.564, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 1.4383508960849165, | |
| "grad_norm": 1.773634672164917, | |
| "learning_rate": 4.43248409984927e-05, | |
| "loss": 1.4344, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 1.4421967540958387, | |
| "grad_norm": 0.7939924001693726, | |
| "learning_rate": 4.406749751847358e-05, | |
| "loss": 1.3798, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 1.4421967540958387, | |
| "eval_loss": 1.4680087566375732, | |
| "eval_runtime": 18.0287, | |
| "eval_samples_per_second": 55.467, | |
| "eval_steps_per_second": 13.867, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 1.446042612106761, | |
| "grad_norm": 1.4785016775131226, | |
| "learning_rate": 4.3810154038454466e-05, | |
| "loss": 1.5316, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 1.4498884701176833, | |
| "grad_norm": 2.1929142475128174, | |
| "learning_rate": 4.355281055843535e-05, | |
| "loss": 1.4498, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 1.4537343281286055, | |
| "grad_norm": 1.816432237625122, | |
| "learning_rate": 4.3295467078416225e-05, | |
| "loss": 1.5089, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 1.4575801861395277, | |
| "grad_norm": 2.589778423309326, | |
| "learning_rate": 4.303812359839711e-05, | |
| "loss": 1.4011, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 1.4614260441504499, | |
| "grad_norm": 1.6828664541244507, | |
| "learning_rate": 4.2780780118378e-05, | |
| "loss": 1.3803, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.4614260441504499, | |
| "eval_loss": 1.4737956523895264, | |
| "eval_runtime": 17.9628, | |
| "eval_samples_per_second": 55.67, | |
| "eval_steps_per_second": 13.918, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.4652719021613723, | |
| "grad_norm": 1.3094508647918701, | |
| "learning_rate": 4.252343663835888e-05, | |
| "loss": 1.4726, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 1.4691177601722945, | |
| "grad_norm": 2.1354212760925293, | |
| "learning_rate": 4.2266093158339764e-05, | |
| "loss": 1.4343, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 1.4729636181832166, | |
| "grad_norm": 1.395593523979187, | |
| "learning_rate": 4.200874967832065e-05, | |
| "loss": 1.4834, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 1.476809476194139, | |
| "grad_norm": 0.8917800784111023, | |
| "learning_rate": 4.1751406198301524e-05, | |
| "loss": 1.4625, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 1.480655334205061, | |
| "grad_norm": 2.179772138595581, | |
| "learning_rate": 4.149406271828241e-05, | |
| "loss": 1.4832, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 1.480655334205061, | |
| "eval_loss": 1.480191946029663, | |
| "eval_runtime": 17.952, | |
| "eval_samples_per_second": 55.704, | |
| "eval_steps_per_second": 13.926, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 1.4845011922159834, | |
| "grad_norm": 1.3308861255645752, | |
| "learning_rate": 4.12367192382633e-05, | |
| "loss": 1.4555, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 1.4883470502269056, | |
| "grad_norm": 1.6867352724075317, | |
| "learning_rate": 4.0979375758244176e-05, | |
| "loss": 1.4116, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 1.4921929082378278, | |
| "grad_norm": 2.161247491836548, | |
| "learning_rate": 4.072203227822506e-05, | |
| "loss": 1.4262, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 1.4960387662487502, | |
| "grad_norm": 1.717690110206604, | |
| "learning_rate": 4.046468879820595e-05, | |
| "loss": 1.3896, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 1.4998846242596724, | |
| "grad_norm": 1.0118234157562256, | |
| "learning_rate": 4.020734531818682e-05, | |
| "loss": 1.4503, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.4998846242596724, | |
| "eval_loss": 1.478628396987915, | |
| "eval_runtime": 18.0209, | |
| "eval_samples_per_second": 55.491, | |
| "eval_steps_per_second": 13.873, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.5037304822705946, | |
| "grad_norm": 0.8779070377349854, | |
| "learning_rate": 3.995000183816771e-05, | |
| "loss": 1.3728, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 1.5075763402815168, | |
| "grad_norm": 1.6068123579025269, | |
| "learning_rate": 3.9692658358148595e-05, | |
| "loss": 1.5204, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 1.511422198292439, | |
| "grad_norm": 1.7712832689285278, | |
| "learning_rate": 3.9435314878129475e-05, | |
| "loss": 1.514, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 1.5152680563033614, | |
| "grad_norm": 1.2519572973251343, | |
| "learning_rate": 3.917797139811036e-05, | |
| "loss": 1.3953, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 1.5191139143142836, | |
| "grad_norm": 1.5644786357879639, | |
| "learning_rate": 3.892062791809125e-05, | |
| "loss": 1.4772, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 1.5191139143142836, | |
| "eval_loss": 1.4710900783538818, | |
| "eval_runtime": 18.4205, | |
| "eval_samples_per_second": 54.287, | |
| "eval_steps_per_second": 13.572, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 1.5229597723252057, | |
| "grad_norm": 1.6755670309066772, | |
| "learning_rate": 3.866328443807212e-05, | |
| "loss": 1.4148, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 1.5268056303361282, | |
| "grad_norm": 1.7168843746185303, | |
| "learning_rate": 3.840594095805301e-05, | |
| "loss": 1.4211, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 1.5306514883470501, | |
| "grad_norm": 1.5205817222595215, | |
| "learning_rate": 3.8148597478033894e-05, | |
| "loss": 1.4663, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.5344973463579725, | |
| "grad_norm": 1.608231544494629, | |
| "learning_rate": 3.789125399801477e-05, | |
| "loss": 1.3634, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 1.5383432043688947, | |
| "grad_norm": 1.5260729789733887, | |
| "learning_rate": 3.763391051799566e-05, | |
| "loss": 1.4114, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.5383432043688947, | |
| "eval_loss": 1.4733539819717407, | |
| "eval_runtime": 18.105, | |
| "eval_samples_per_second": 55.233, | |
| "eval_steps_per_second": 13.808, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.542189062379817, | |
| "grad_norm": 1.4523636102676392, | |
| "learning_rate": 3.7376567037976546e-05, | |
| "loss": 1.4538, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 1.5460349203907393, | |
| "grad_norm": 1.854066252708435, | |
| "learning_rate": 3.7119223557957426e-05, | |
| "loss": 1.4532, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.5498807784016613, | |
| "grad_norm": 1.8892920017242432, | |
| "learning_rate": 3.6861880077938306e-05, | |
| "loss": 1.4301, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 1.5537266364125837, | |
| "grad_norm": 1.2957504987716675, | |
| "learning_rate": 3.6609683467519574e-05, | |
| "loss": 1.4613, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.5575724944235059, | |
| "grad_norm": 1.9040348529815674, | |
| "learning_rate": 3.635233998750046e-05, | |
| "loss": 1.3847, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 1.5575724944235059, | |
| "eval_loss": 1.4672300815582275, | |
| "eval_runtime": 17.9888, | |
| "eval_samples_per_second": 55.59, | |
| "eval_steps_per_second": 13.898, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 1.561418352434428, | |
| "grad_norm": 1.4990596771240234, | |
| "learning_rate": 3.609499650748134e-05, | |
| "loss": 1.4243, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.5652642104453505, | |
| "grad_norm": 2.344515562057495, | |
| "learning_rate": 3.583765302746222e-05, | |
| "loss": 1.4971, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 1.5691100684562724, | |
| "grad_norm": 2.2836570739746094, | |
| "learning_rate": 3.5580309547443106e-05, | |
| "loss": 1.4641, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 1.5729559264671948, | |
| "grad_norm": 1.0165778398513794, | |
| "learning_rate": 3.5322966067423986e-05, | |
| "loss": 1.4268, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 1.576801784478117, | |
| "grad_norm": 0.5663600564002991, | |
| "learning_rate": 3.506562258740487e-05, | |
| "loss": 1.3487, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.576801784478117, | |
| "eval_loss": 1.4733059406280518, | |
| "eval_runtime": 18.0399, | |
| "eval_samples_per_second": 55.433, | |
| "eval_steps_per_second": 13.858, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.5806476424890392, | |
| "grad_norm": 1.36208176612854, | |
| "learning_rate": 3.480827910738575e-05, | |
| "loss": 1.3615, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 1.5844935004999616, | |
| "grad_norm": 1.6889315843582153, | |
| "learning_rate": 3.455093562736664e-05, | |
| "loss": 1.4174, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 1.5883393585108838, | |
| "grad_norm": 1.2735401391983032, | |
| "learning_rate": 3.429359214734752e-05, | |
| "loss": 1.4482, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 1.592185216521806, | |
| "grad_norm": 1.668188452720642, | |
| "learning_rate": 3.4036248667328405e-05, | |
| "loss": 1.4193, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 1.5960310745327284, | |
| "grad_norm": 1.8626503944396973, | |
| "learning_rate": 3.3778905187309284e-05, | |
| "loss": 1.4477, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 1.5960310745327284, | |
| "eval_loss": 1.4779850244522095, | |
| "eval_runtime": 18.0373, | |
| "eval_samples_per_second": 55.441, | |
| "eval_steps_per_second": 13.86, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 1.5998769325436504, | |
| "grad_norm": 1.2189550399780273, | |
| "learning_rate": 3.352156170729017e-05, | |
| "loss": 1.5325, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.6037227905545728, | |
| "grad_norm": 2.126854658126831, | |
| "learning_rate": 3.326421822727105e-05, | |
| "loss": 1.5096, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 1.607568648565495, | |
| "grad_norm": 1.7529182434082031, | |
| "learning_rate": 3.300687474725194e-05, | |
| "loss": 1.4629, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 1.6114145065764172, | |
| "grad_norm": 2.2533035278320312, | |
| "learning_rate": 3.2749531267232824e-05, | |
| "loss": 1.4266, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 1.6152603645873396, | |
| "grad_norm": 1.6632803678512573, | |
| "learning_rate": 3.24921877872137e-05, | |
| "loss": 1.5018, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.6152603645873396, | |
| "eval_loss": 1.467063307762146, | |
| "eval_runtime": 18.0767, | |
| "eval_samples_per_second": 55.32, | |
| "eval_steps_per_second": 13.83, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.6191062225982615, | |
| "grad_norm": 2.016814708709717, | |
| "learning_rate": 3.223484430719458e-05, | |
| "loss": 1.434, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 1.622952080609184, | |
| "grad_norm": 1.5766371488571167, | |
| "learning_rate": 3.197750082717547e-05, | |
| "loss": 1.4249, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 1.6267979386201061, | |
| "grad_norm": 2.3865230083465576, | |
| "learning_rate": 3.172015734715635e-05, | |
| "loss": 1.6, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 1.6306437966310283, | |
| "grad_norm": 1.193731427192688, | |
| "learning_rate": 3.1462813867137236e-05, | |
| "loss": 1.5674, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 1.6344896546419507, | |
| "grad_norm": 1.4854563474655151, | |
| "learning_rate": 3.120547038711812e-05, | |
| "loss": 1.4788, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 1.6344896546419507, | |
| "eval_loss": 1.4725981950759888, | |
| "eval_runtime": 18.2185, | |
| "eval_samples_per_second": 54.889, | |
| "eval_steps_per_second": 13.722, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 1.6383355126528727, | |
| "grad_norm": 1.3907707929611206, | |
| "learning_rate": 3.0948126907099e-05, | |
| "loss": 1.4752, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 1.642181370663795, | |
| "grad_norm": 1.5267348289489746, | |
| "learning_rate": 3.069078342707988e-05, | |
| "loss": 1.4198, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 1.6460272286747173, | |
| "grad_norm": 1.2138367891311646, | |
| "learning_rate": 3.0433439947060768e-05, | |
| "loss": 1.4302, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 1.6498730866856395, | |
| "grad_norm": 1.3399436473846436, | |
| "learning_rate": 3.017609646704165e-05, | |
| "loss": 1.5098, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 1.6537189446965619, | |
| "grad_norm": 1.543906569480896, | |
| "learning_rate": 2.991875298702253e-05, | |
| "loss": 1.4577, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.6537189446965619, | |
| "eval_loss": 1.475114345550537, | |
| "eval_runtime": 18.0585, | |
| "eval_samples_per_second": 55.376, | |
| "eval_steps_per_second": 13.844, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.657564802707484, | |
| "grad_norm": 1.2780442237854004, | |
| "learning_rate": 2.9661409507003417e-05, | |
| "loss": 1.5179, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 1.6614106607184063, | |
| "grad_norm": 1.206725835800171, | |
| "learning_rate": 2.94040660269843e-05, | |
| "loss": 1.4438, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 1.6652565187293287, | |
| "grad_norm": 2.1834638118743896, | |
| "learning_rate": 2.914672254696518e-05, | |
| "loss": 1.4783, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 1.6691023767402506, | |
| "grad_norm": 1.5568137168884277, | |
| "learning_rate": 2.8889379066946066e-05, | |
| "loss": 1.38, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 1.672948234751173, | |
| "grad_norm": 1.6938014030456543, | |
| "learning_rate": 2.863203558692695e-05, | |
| "loss": 1.3754, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 1.672948234751173, | |
| "eval_loss": 1.466833472251892, | |
| "eval_runtime": 18.1069, | |
| "eval_samples_per_second": 55.228, | |
| "eval_steps_per_second": 13.807, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 1.6767940927620952, | |
| "grad_norm": 1.3192166090011597, | |
| "learning_rate": 2.837469210690783e-05, | |
| "loss": 1.4388, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 1.6806399507730174, | |
| "grad_norm": 2.0135934352874756, | |
| "learning_rate": 2.8117348626888716e-05, | |
| "loss": 1.429, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 1.6844858087839398, | |
| "grad_norm": 1.4457674026489258, | |
| "learning_rate": 2.78600051468696e-05, | |
| "loss": 1.5154, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 1.6883316667948618, | |
| "grad_norm": 1.225411295890808, | |
| "learning_rate": 2.760266166685048e-05, | |
| "loss": 1.4658, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 1.6921775248057842, | |
| "grad_norm": 1.8256678581237793, | |
| "learning_rate": 2.7345318186831365e-05, | |
| "loss": 1.5004, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.6921775248057842, | |
| "eval_loss": 1.4664525985717773, | |
| "eval_runtime": 18.0331, | |
| "eval_samples_per_second": 55.454, | |
| "eval_steps_per_second": 13.863, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.6960233828167064, | |
| "grad_norm": 0.8262001276016235, | |
| "learning_rate": 2.7087974706812248e-05, | |
| "loss": 1.4304, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 1.6998692408276286, | |
| "grad_norm": 1.6224443912506104, | |
| "learning_rate": 2.6830631226793128e-05, | |
| "loss": 1.4127, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 1.703715098838551, | |
| "grad_norm": 1.3338160514831543, | |
| "learning_rate": 2.6573287746774014e-05, | |
| "loss": 1.4842, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 1.707560956849473, | |
| "grad_norm": 1.940238356590271, | |
| "learning_rate": 2.6315944266754897e-05, | |
| "loss": 1.4279, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 1.7114068148603954, | |
| "grad_norm": 2.091132164001465, | |
| "learning_rate": 2.6058600786735777e-05, | |
| "loss": 1.3779, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 1.7114068148603954, | |
| "eval_loss": 1.457463264465332, | |
| "eval_runtime": 18.1835, | |
| "eval_samples_per_second": 54.995, | |
| "eval_steps_per_second": 13.749, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 1.7152526728713176, | |
| "grad_norm": 1.4367913007736206, | |
| "learning_rate": 2.5801257306716663e-05, | |
| "loss": 1.4821, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 1.7190985308822397, | |
| "grad_norm": 1.9735435247421265, | |
| "learning_rate": 2.5543913826697546e-05, | |
| "loss": 1.3754, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 1.7229443888931621, | |
| "grad_norm": 1.4968055486679077, | |
| "learning_rate": 2.5286570346678426e-05, | |
| "loss": 1.4045, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 1.7267902469040843, | |
| "grad_norm": 1.0449949502944946, | |
| "learning_rate": 2.5029226866659312e-05, | |
| "loss": 1.4458, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 1.7306361049150065, | |
| "grad_norm": 1.164890170097351, | |
| "learning_rate": 2.4771883386640196e-05, | |
| "loss": 1.4407, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.7306361049150065, | |
| "eval_loss": 1.4607012271881104, | |
| "eval_runtime": 18.2079, | |
| "eval_samples_per_second": 54.921, | |
| "eval_steps_per_second": 13.73, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.734481962925929, | |
| "grad_norm": 0.9285104870796204, | |
| "learning_rate": 2.4514539906621075e-05, | |
| "loss": 1.4243, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 1.738327820936851, | |
| "grad_norm": 1.2848355770111084, | |
| "learning_rate": 2.4257196426601962e-05, | |
| "loss": 1.4596, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 1.7421736789477733, | |
| "grad_norm": 1.4614371061325073, | |
| "learning_rate": 2.3999852946582845e-05, | |
| "loss": 1.3918, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 1.7460195369586955, | |
| "grad_norm": 0.9543781876564026, | |
| "learning_rate": 2.3742509466563724e-05, | |
| "loss": 1.4044, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 1.7498653949696177, | |
| "grad_norm": 1.602250099182129, | |
| "learning_rate": 2.348516598654461e-05, | |
| "loss": 1.4607, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 1.7498653949696177, | |
| "eval_loss": 1.4677520990371704, | |
| "eval_runtime": 18.158, | |
| "eval_samples_per_second": 55.072, | |
| "eval_steps_per_second": 13.768, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 1.75371125298054, | |
| "grad_norm": 1.1664291620254517, | |
| "learning_rate": 2.3227822506525494e-05, | |
| "loss": 1.5153, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 1.757557110991462, | |
| "grad_norm": 1.472679853439331, | |
| "learning_rate": 2.2970479026506374e-05, | |
| "loss": 1.4774, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 1.7614029690023845, | |
| "grad_norm": 1.7927029132843018, | |
| "learning_rate": 2.271313554648726e-05, | |
| "loss": 1.4551, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 1.7652488270133067, | |
| "grad_norm": 2.9085824489593506, | |
| "learning_rate": 2.2455792066468143e-05, | |
| "loss": 1.4474, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 1.7690946850242288, | |
| "grad_norm": 1.8322957754135132, | |
| "learning_rate": 2.2198448586449026e-05, | |
| "loss": 1.4642, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.7690946850242288, | |
| "eval_loss": 1.4676103591918945, | |
| "eval_runtime": 17.9158, | |
| "eval_samples_per_second": 55.817, | |
| "eval_steps_per_second": 13.954, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.7729405430351513, | |
| "grad_norm": 0.7428656220436096, | |
| "learning_rate": 2.194110510642991e-05, | |
| "loss": 1.4475, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 1.7767864010460732, | |
| "grad_norm": 1.4552706480026245, | |
| "learning_rate": 2.1683761626410793e-05, | |
| "loss": 1.517, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 1.7806322590569956, | |
| "grad_norm": 1.1563323736190796, | |
| "learning_rate": 2.1426418146391676e-05, | |
| "loss": 1.4806, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 1.7844781170679178, | |
| "grad_norm": 1.7244662046432495, | |
| "learning_rate": 2.116907466637256e-05, | |
| "loss": 1.4492, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 1.78832397507884, | |
| "grad_norm": 1.642321228981018, | |
| "learning_rate": 2.0911731186353442e-05, | |
| "loss": 1.4196, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 1.78832397507884, | |
| "eval_loss": 1.4725000858306885, | |
| "eval_runtime": 18.1814, | |
| "eval_samples_per_second": 55.001, | |
| "eval_steps_per_second": 13.75, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 1.7921698330897624, | |
| "grad_norm": 1.1381646394729614, | |
| "learning_rate": 2.0654387706334325e-05, | |
| "loss": 1.4653, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 1.7960156911006846, | |
| "grad_norm": 1.2550010681152344, | |
| "learning_rate": 2.0397044226315208e-05, | |
| "loss": 1.4836, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 1.7998615491116068, | |
| "grad_norm": 1.4335628747940063, | |
| "learning_rate": 2.013970074629609e-05, | |
| "loss": 1.4403, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 1.8037074071225292, | |
| "grad_norm": 1.8901276588439941, | |
| "learning_rate": 1.9882357266276974e-05, | |
| "loss": 1.4562, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 1.8075532651334512, | |
| "grad_norm": 1.2078189849853516, | |
| "learning_rate": 1.9625013786257857e-05, | |
| "loss": 1.4221, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.8075532651334512, | |
| "eval_loss": 1.4660383462905884, | |
| "eval_runtime": 18.0656, | |
| "eval_samples_per_second": 55.354, | |
| "eval_steps_per_second": 13.838, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.8113991231443736, | |
| "grad_norm": 1.6915593147277832, | |
| "learning_rate": 1.936767030623874e-05, | |
| "loss": 1.4296, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 1.8152449811552958, | |
| "grad_norm": 1.9247820377349854, | |
| "learning_rate": 1.9110326826219623e-05, | |
| "loss": 1.4513, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 1.819090839166218, | |
| "grad_norm": 2.794621229171753, | |
| "learning_rate": 1.8852983346200506e-05, | |
| "loss": 1.4381, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 1.8229366971771404, | |
| "grad_norm": 1.3829151391983032, | |
| "learning_rate": 1.859563986618139e-05, | |
| "loss": 1.4344, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 1.8267825551880623, | |
| "grad_norm": 1.8067855834960938, | |
| "learning_rate": 1.8338296386162273e-05, | |
| "loss": 1.4337, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 1.8267825551880623, | |
| "eval_loss": 1.4543312788009644, | |
| "eval_runtime": 18.2116, | |
| "eval_samples_per_second": 54.91, | |
| "eval_steps_per_second": 13.727, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 1.8306284131989847, | |
| "grad_norm": 1.829542875289917, | |
| "learning_rate": 1.8080952906143156e-05, | |
| "loss": 1.3986, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 1.834474271209907, | |
| "grad_norm": 1.8767279386520386, | |
| "learning_rate": 1.782360942612404e-05, | |
| "loss": 1.4873, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 1.838320129220829, | |
| "grad_norm": 0.9735344052314758, | |
| "learning_rate": 1.7566265946104922e-05, | |
| "loss": 1.4105, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 1.8421659872317515, | |
| "grad_norm": 1.5424654483795166, | |
| "learning_rate": 1.7308922466085805e-05, | |
| "loss": 1.4357, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 1.8460118452426735, | |
| "grad_norm": 0.9316624999046326, | |
| "learning_rate": 1.7051578986066688e-05, | |
| "loss": 1.4616, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.8460118452426735, | |
| "eval_loss": 1.4611330032348633, | |
| "eval_runtime": 18.0655, | |
| "eval_samples_per_second": 55.354, | |
| "eval_steps_per_second": 13.839, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.8498577032535959, | |
| "grad_norm": 1.3933135271072388, | |
| "learning_rate": 1.679423550604757e-05, | |
| "loss": 1.45, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 1.853703561264518, | |
| "grad_norm": 1.1157580614089966, | |
| "learning_rate": 1.6536892026028454e-05, | |
| "loss": 1.4916, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 1.8575494192754403, | |
| "grad_norm": 1.7401970624923706, | |
| "learning_rate": 1.6279548546009337e-05, | |
| "loss": 1.4563, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 1.8613952772863627, | |
| "grad_norm": 1.4699925184249878, | |
| "learning_rate": 1.602220506599022e-05, | |
| "loss": 1.4211, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 1.8652411352972849, | |
| "grad_norm": 1.1760289669036865, | |
| "learning_rate": 1.5764861585971103e-05, | |
| "loss": 1.4212, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 1.8652411352972849, | |
| "eval_loss": 1.460072636604309, | |
| "eval_runtime": 17.8176, | |
| "eval_samples_per_second": 56.124, | |
| "eval_steps_per_second": 14.031, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 1.869086993308207, | |
| "grad_norm": 1.8243287801742554, | |
| "learning_rate": 1.5507518105951986e-05, | |
| "loss": 1.4594, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 1.8729328513191295, | |
| "grad_norm": 0.8821312785148621, | |
| "learning_rate": 1.5250174625932868e-05, | |
| "loss": 1.3837, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 1.8767787093300514, | |
| "grad_norm": 1.673240065574646, | |
| "learning_rate": 1.4992831145913753e-05, | |
| "loss": 1.395, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 1.8806245673409738, | |
| "grad_norm": 1.4853135347366333, | |
| "learning_rate": 1.4735487665894636e-05, | |
| "loss": 1.5031, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 1.884470425351896, | |
| "grad_norm": 2.507054567337036, | |
| "learning_rate": 1.4478144185875517e-05, | |
| "loss": 1.3909, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.884470425351896, | |
| "eval_loss": 1.4431298971176147, | |
| "eval_runtime": 17.9815, | |
| "eval_samples_per_second": 55.613, | |
| "eval_steps_per_second": 13.903, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.8883162833628182, | |
| "grad_norm": 1.8027464151382446, | |
| "learning_rate": 1.4220800705856402e-05, | |
| "loss": 1.4855, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 1.8921621413737406, | |
| "grad_norm": 1.139756679534912, | |
| "learning_rate": 1.3963457225837285e-05, | |
| "loss": 1.3773, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 1.8960079993846626, | |
| "grad_norm": 1.377536654472351, | |
| "learning_rate": 1.3706113745818166e-05, | |
| "loss": 1.4274, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 1.899853857395585, | |
| "grad_norm": 1.2132219076156616, | |
| "learning_rate": 1.3448770265799051e-05, | |
| "loss": 1.3772, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 1.9036997154065072, | |
| "grad_norm": 1.7106857299804688, | |
| "learning_rate": 1.3191426785779932e-05, | |
| "loss": 1.41, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 1.9036997154065072, | |
| "eval_loss": 1.472328782081604, | |
| "eval_runtime": 18.0789, | |
| "eval_samples_per_second": 55.313, | |
| "eval_steps_per_second": 13.828, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 1.9075455734174294, | |
| "grad_norm": 0.9809736013412476, | |
| "learning_rate": 1.2939230175361197e-05, | |
| "loss": 1.4547, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 1.9113914314283518, | |
| "grad_norm": 1.476722240447998, | |
| "learning_rate": 1.2681886695342082e-05, | |
| "loss": 1.4546, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 1.9152372894392737, | |
| "grad_norm": 2.078511953353882, | |
| "learning_rate": 1.2424543215322965e-05, | |
| "loss": 1.4971, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 1.9190831474501961, | |
| "grad_norm": 0.7233028411865234, | |
| "learning_rate": 1.2167199735303847e-05, | |
| "loss": 1.3622, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 1.9229290054611183, | |
| "grad_norm": 1.3686310052871704, | |
| "learning_rate": 1.1909856255284731e-05, | |
| "loss": 1.5232, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.9229290054611183, | |
| "eval_loss": 1.461082935333252, | |
| "eval_runtime": 18.2695, | |
| "eval_samples_per_second": 54.736, | |
| "eval_steps_per_second": 13.684, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.9267748634720405, | |
| "grad_norm": 1.1179672479629517, | |
| "learning_rate": 1.1652512775265614e-05, | |
| "loss": 1.5076, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 1.930620721482963, | |
| "grad_norm": 0.9407248497009277, | |
| "learning_rate": 1.1395169295246496e-05, | |
| "loss": 1.468, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 1.9344665794938851, | |
| "grad_norm": 1.498488426208496, | |
| "learning_rate": 1.113782581522738e-05, | |
| "loss": 1.4566, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 1.9383124375048073, | |
| "grad_norm": 0.6983101963996887, | |
| "learning_rate": 1.0880482335208264e-05, | |
| "loss": 1.4621, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 1.9421582955157297, | |
| "grad_norm": 1.954953908920288, | |
| "learning_rate": 1.0623138855189145e-05, | |
| "loss": 1.417, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 1.9421582955157297, | |
| "eval_loss": 1.4591727256774902, | |
| "eval_runtime": 18.0732, | |
| "eval_samples_per_second": 55.331, | |
| "eval_steps_per_second": 13.833, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 1.9460041535266517, | |
| "grad_norm": 1.6467170715332031, | |
| "learning_rate": 1.036579537517003e-05, | |
| "loss": 1.4942, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 1.949850011537574, | |
| "grad_norm": 1.4509849548339844, | |
| "learning_rate": 1.0108451895150913e-05, | |
| "loss": 1.4539, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 1.9536958695484963, | |
| "grad_norm": 1.6131352186203003, | |
| "learning_rate": 9.851108415131796e-06, | |
| "loss": 1.3993, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 1.9575417275594185, | |
| "grad_norm": 1.880043387413025, | |
| "learning_rate": 9.593764935112679e-06, | |
| "loss": 1.4449, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 1.9613875855703409, | |
| "grad_norm": 1.3041406869888306, | |
| "learning_rate": 9.336421455093562e-06, | |
| "loss": 1.4918, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.9613875855703409, | |
| "eval_loss": 1.4548134803771973, | |
| "eval_runtime": 18.0544, | |
| "eval_samples_per_second": 55.388, | |
| "eval_steps_per_second": 13.847, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.9652334435812628, | |
| "grad_norm": 1.8318700790405273, | |
| "learning_rate": 9.079077975074445e-06, | |
| "loss": 1.42, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 1.9690793015921852, | |
| "grad_norm": 1.7966841459274292, | |
| "learning_rate": 8.821734495055328e-06, | |
| "loss": 1.3236, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 1.9729251596031074, | |
| "grad_norm": 0.7579635977745056, | |
| "learning_rate": 8.564391015036211e-06, | |
| "loss": 1.3957, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 1.9767710176140296, | |
| "grad_norm": 1.4515990018844604, | |
| "learning_rate": 8.307047535017094e-06, | |
| "loss": 1.3347, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 1.980616875624952, | |
| "grad_norm": 1.5671380758285522, | |
| "learning_rate": 8.049704054997977e-06, | |
| "loss": 1.4624, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 1.980616875624952, | |
| "eval_loss": 1.450337290763855, | |
| "eval_runtime": 17.9548, | |
| "eval_samples_per_second": 55.695, | |
| "eval_steps_per_second": 13.924, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 1.984462733635874, | |
| "grad_norm": 1.7020714282989502, | |
| "learning_rate": 7.79236057497886e-06, | |
| "loss": 1.3822, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 1.9883085916467964, | |
| "grad_norm": 1.297658920288086, | |
| "learning_rate": 7.535017094959743e-06, | |
| "loss": 1.4008, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 1.9921544496577186, | |
| "grad_norm": 1.8151623010635376, | |
| "learning_rate": 7.277673614940627e-06, | |
| "loss": 1.4408, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 1.9960003076686408, | |
| "grad_norm": 0.8869682550430298, | |
| "learning_rate": 7.02033013492151e-06, | |
| "loss": 1.4767, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 1.9998461656795632, | |
| "grad_norm": 1.898775339126587, | |
| "learning_rate": 6.762986654902392e-06, | |
| "loss": 1.5032, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.9998461656795632, | |
| "eval_loss": 1.4542045593261719, | |
| "eval_runtime": 18.0059, | |
| "eval_samples_per_second": 55.537, | |
| "eval_steps_per_second": 13.884, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.003692023690485, | |
| "grad_norm": 1.7356750965118408, | |
| "learning_rate": 6.505643174883276e-06, | |
| "loss": 1.3839, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 2.0075378817014076, | |
| "grad_norm": 2.3067352771759033, | |
| "learning_rate": 6.248299694864159e-06, | |
| "loss": 1.4348, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 2.01138373971233, | |
| "grad_norm": 1.343248724937439, | |
| "learning_rate": 5.990956214845041e-06, | |
| "loss": 1.3703, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 2.015229597723252, | |
| "grad_norm": 1.9424471855163574, | |
| "learning_rate": 5.733612734825925e-06, | |
| "loss": 1.4304, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 2.0190754557341744, | |
| "grad_norm": 1.5383673906326294, | |
| "learning_rate": 5.476269254806808e-06, | |
| "loss": 1.4118, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 2.0190754557341744, | |
| "eval_loss": 1.474881649017334, | |
| "eval_runtime": 18.1751, | |
| "eval_samples_per_second": 55.02, | |
| "eval_steps_per_second": 13.755, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 2.0229213137450963, | |
| "grad_norm": 1.803488850593567, | |
| "learning_rate": 5.2189257747876905e-06, | |
| "loss": 1.4537, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 2.0267671717560187, | |
| "grad_norm": 1.8623336553573608, | |
| "learning_rate": 4.961582294768574e-06, | |
| "loss": 1.3659, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 2.030613029766941, | |
| "grad_norm": 1.1901572942733765, | |
| "learning_rate": 4.7042388147494575e-06, | |
| "loss": 1.4175, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 2.034458887777863, | |
| "grad_norm": 1.2967520952224731, | |
| "learning_rate": 4.4468953347303406e-06, | |
| "loss": 1.458, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 2.0383047457887855, | |
| "grad_norm": 1.2987436056137085, | |
| "learning_rate": 4.189551854711224e-06, | |
| "loss": 1.3965, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.0383047457887855, | |
| "eval_loss": 1.4528058767318726, | |
| "eval_runtime": 18.2495, | |
| "eval_samples_per_second": 54.796, | |
| "eval_steps_per_second": 13.699, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.042150603799708, | |
| "grad_norm": 1.0049172639846802, | |
| "learning_rate": 3.932208374692107e-06, | |
| "loss": 1.3012, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 2.04599646181063, | |
| "grad_norm": 1.193533182144165, | |
| "learning_rate": 3.6748648946729894e-06, | |
| "loss": 1.4038, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 2.0498423198215523, | |
| "grad_norm": 1.6459178924560547, | |
| "learning_rate": 3.417521414653873e-06, | |
| "loss": 1.4089, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 2.0536881778324743, | |
| "grad_norm": 0.546062171459198, | |
| "learning_rate": 3.160177934634756e-06, | |
| "loss": 1.3675, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 2.0575340358433967, | |
| "grad_norm": 1.7894645929336548, | |
| "learning_rate": 2.9028344546156386e-06, | |
| "loss": 1.4585, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 2.0575340358433967, | |
| "eval_loss": 1.460014820098877, | |
| "eval_runtime": 18.2356, | |
| "eval_samples_per_second": 54.838, | |
| "eval_steps_per_second": 13.709, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 2.061379893854319, | |
| "grad_norm": 1.1368170976638794, | |
| "learning_rate": 2.645490974596522e-06, | |
| "loss": 1.4038, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 2.065225751865241, | |
| "grad_norm": 1.698556900024414, | |
| "learning_rate": 2.388147494577405e-06, | |
| "loss": 1.4592, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 2.0690716098761635, | |
| "grad_norm": 1.3114346265792847, | |
| "learning_rate": 2.130804014558288e-06, | |
| "loss": 1.4566, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 2.0729174678870854, | |
| "grad_norm": 1.7974728345870972, | |
| "learning_rate": 1.8734605345391713e-06, | |
| "loss": 1.5074, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 2.076763325898008, | |
| "grad_norm": 1.4648147821426392, | |
| "learning_rate": 1.6161170545200544e-06, | |
| "loss": 1.4478, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.076763325898008, | |
| "eval_loss": 1.4667593240737915, | |
| "eval_runtime": 18.1467, | |
| "eval_samples_per_second": 55.107, | |
| "eval_steps_per_second": 13.777, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.0806091839089302, | |
| "grad_norm": 0.9924139380455017, | |
| "learning_rate": 1.3587735745009373e-06, | |
| "loss": 1.5088, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 2.084455041919852, | |
| "grad_norm": 1.1177709102630615, | |
| "learning_rate": 1.1014300944818204e-06, | |
| "loss": 1.4285, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 2.0883008999307746, | |
| "grad_norm": 1.7112759351730347, | |
| "learning_rate": 8.440866144627034e-07, | |
| "loss": 1.433, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 2.0921467579416966, | |
| "grad_norm": 1.9338856935501099, | |
| "learning_rate": 5.867431344435866e-07, | |
| "loss": 1.4008, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 2.095992615952619, | |
| "grad_norm": 3.0200393199920654, | |
| "learning_rate": 3.2939965442446964e-07, | |
| "loss": 1.4285, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 2.095992615952619, | |
| "eval_loss": 1.4686814546585083, | |
| "eval_runtime": 18.028, | |
| "eval_samples_per_second": 55.469, | |
| "eval_steps_per_second": 13.867, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 2.0998384739635414, | |
| "grad_norm": 1.5137439966201782, | |
| "learning_rate": 7.205617440535274e-08, | |
| "loss": 1.4596, | |
| "step": 27300 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 27301, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |