diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,6721 +1,99 @@ { - "best_global_step": 38250, - "best_metric": -30.471132278442383, - "best_model_checkpoint": "./ar-diffusion-checkpoints-progressive-attention/checkpoint-37500", - "epoch": 2.999769248519345, + "best_global_step": 500, + "best_metric": 1.7362440824508667, + "best_model_checkpoint": "./ar-diffusion-checkpoints-progressive-attention/checkpoint-500", + "epoch": 0.038458580109222366, "eval_steps": 250, - "global_step": 39000, + "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003845858010922237, - "grad_norm": 13.142335891723633, - "learning_rate": 1.84e-05, - "loss": 12.0117, + "grad_norm": 8.293161392211914, + "learning_rate": 1.88e-05, + "loss": 8.4888, "step": 50 }, { "epoch": 0.007691716021844474, - "grad_norm": 5.153238296508789, - "learning_rate": 3.8e-05, - "loss": 5.8408, + "grad_norm": 2.511484384536743, + "learning_rate": 3.88e-05, + "loss": 2.9957, "step": 100 }, { "epoch": 0.01153757403276671, - "grad_norm": 7.471889019012451, - "learning_rate": 5.8e-05, - "loss": 4.5173, + "grad_norm": 3.1082534790039062, + "learning_rate": 5.88e-05, + "loss": 2.2168, "step": 150 }, { "epoch": 0.015383432043688947, - "grad_norm": 4.756172180175781, - "learning_rate": 7.800000000000001e-05, - "loss": 3.8946, + "grad_norm": 3.4769539833068848, + "learning_rate": 7.88e-05, + "loss": 1.9786, "step": 200 }, { "epoch": 0.019229290054611183, - "grad_norm": 4.0329108238220215, - "learning_rate": 9.8e-05, - "loss": 3.842, + "grad_norm": 1.49607253074646, + "learning_rate": 9.88e-05, + "loss": 1.9748, "step": 250 }, { "epoch": 0.019229290054611183, - "eval_loss": 3.7754223346710205, - "eval_runtime": 18.0149, - "eval_samples_per_second": 55.51, - "eval_steps_per_second": 13.877, + "eval_loss": 1.9188385009765625, + "eval_runtime": 16.8659, + "eval_samples_per_second": 59.291, + "eval_steps_per_second": 14.823, "step": 250 }, { "epoch": 0.02307514806553342, - "grad_norm": 7.410353660583496, - "learning_rate": 0.000118, - "loss": 3.5501, + "grad_norm": 2.4030940532684326, + "learning_rate": 0.0001188, + "loss": 1.7869, "step": 300 }, { "epoch": 0.02692100607645566, - "grad_norm": 4.085294246673584, - "learning_rate": 0.000138, - "loss": 3.4015, + "grad_norm": 8.926054000854492, + "learning_rate": 0.00013879999999999999, + "loss": 1.7282, "step": 350 }, { "epoch": 0.030766864087377895, - "grad_norm": 5.78131103515625, - "learning_rate": 0.00015800000000000002, - "loss": 3.4988, + "grad_norm": 2.0970349311828613, + "learning_rate": 0.0001588, + "loss": 1.8283, "step": 400 }, { "epoch": 0.03461272209830013, - "grad_norm": 3.2009644508361816, - "learning_rate": 0.00017800000000000002, - "loss": 3.4145, + "grad_norm": 1.1687132120132446, + "learning_rate": 0.0001788, + "loss": 1.7503, "step": 450 }, { "epoch": 0.038458580109222366, - "grad_norm": 4.380056858062744, - "learning_rate": 0.00019800000000000002, - "loss": 3.3179, + "grad_norm": 1.5801700353622437, + "learning_rate": 0.0001988, + "loss": 1.7329, "step": 500 }, { "epoch": 0.038458580109222366, - "eval_loss": 3.3119897842407227, - "eval_runtime": 17.9539, - "eval_samples_per_second": 55.698, - "eval_steps_per_second": 13.925, + "eval_loss": 1.7362440824508667, + "eval_runtime": 16.8354, + "eval_samples_per_second": 59.399, + "eval_steps_per_second": 14.85, "step": 500 - }, - { - "epoch": 0.0423044381201446, - "grad_norm": 4.549246788024902, - "learning_rate": 0.0001997662519803652, - "loss": 3.3535, - "step": 550 - }, - { - "epoch": 0.04615029613106684, - "grad_norm": 4.421209335327148, - "learning_rate": 0.0001995065319585487, - "loss": 3.252, - "step": 600 - }, - { - "epoch": 0.04999615414198908, - "grad_norm": 3.3526384830474854, - "learning_rate": 0.0001992468119367322, - "loss": 3.1192, - "step": 650 - }, - { - "epoch": 0.05384201215291132, - "grad_norm": 3.0326781272888184, - "learning_rate": 0.00019898709191491573, - "loss": 2.9977, - "step": 700 - }, - { - "epoch": 0.05768787016383355, - "grad_norm": 5.362135410308838, - "learning_rate": 0.00019872737189309924, - "loss": 3.2504, - "step": 750 - }, - { - "epoch": 0.05768787016383355, - "eval_loss": 3.141200542449951, - "eval_runtime": 17.9291, - "eval_samples_per_second": 55.775, - "eval_steps_per_second": 13.944, - "step": 750 - }, - { - "epoch": 0.06153372817475579, - "grad_norm": 4.031102180480957, - "learning_rate": 0.00019846765187128275, - "loss": 3.2034, - "step": 800 - }, - { - "epoch": 0.06537958618567802, - "grad_norm": 4.671544551849365, - "learning_rate": 0.0001982079318494663, - "loss": 3.1486, - "step": 850 - }, - { - "epoch": 0.06922544419660026, - "grad_norm": 2.8522539138793945, - "learning_rate": 0.0001979482118276498, - "loss": 3.1832, - "step": 900 - }, - { - "epoch": 0.0730713022075225, - "grad_norm": 1.8937668800354004, - "learning_rate": 0.00019768849180583334, - "loss": 2.9562, - "step": 950 - }, - { - "epoch": 0.07691716021844473, - "grad_norm": 3.348437547683716, - "learning_rate": 0.00019742877178401683, - "loss": 3.0789, - "step": 1000 - }, - { - "epoch": 0.07691716021844473, - "eval_loss": 3.0555787086486816, - "eval_runtime": 17.9146, - "eval_samples_per_second": 55.82, - "eval_steps_per_second": 13.955, - "step": 1000 - }, - { - "epoch": 0.08076301822936698, - "grad_norm": 2.3448729515075684, - "learning_rate": 0.00019716905176220034, - "loss": 2.9338, - "step": 1050 - }, - { - "epoch": 0.0846088762402892, - "grad_norm": 2.609321355819702, - "learning_rate": 0.00019690933174038388, - "loss": 3.0214, - "step": 1100 - }, - { - "epoch": 0.08845473425121145, - "grad_norm": 2.4843454360961914, - "learning_rate": 0.0001966496117185674, - "loss": 2.9918, - "step": 1150 - }, - { - "epoch": 0.09230059226213368, - "grad_norm": 4.419650077819824, - "learning_rate": 0.0001963898916967509, - "loss": 2.9023, - "step": 1200 - }, - { - "epoch": 0.09614645027305592, - "grad_norm": 3.8419339656829834, - "learning_rate": 0.00019613017167493445, - "loss": 3.0145, - "step": 1250 - }, - { - "epoch": 0.09614645027305592, - "eval_loss": 2.9297268390655518, - "eval_runtime": 17.9152, - "eval_samples_per_second": 55.819, - "eval_steps_per_second": 13.955, - "step": 1250 - }, - { - "epoch": 0.09999230828397816, - "grad_norm": 7.558617115020752, - "learning_rate": 0.00019587045165311796, - "loss": 2.9151, - "step": 1300 - }, - { - "epoch": 0.10383816629490039, - "grad_norm": 2.1020314693450928, - "learning_rate": 0.00019561073163130145, - "loss": 2.8712, - "step": 1350 - }, - { - "epoch": 0.10768402430582263, - "grad_norm": 2.2210774421691895, - "learning_rate": 0.000195351011609485, - "loss": 2.8326, - "step": 1400 - }, - { - "epoch": 0.11152988231674486, - "grad_norm": 6.891756534576416, - "learning_rate": 0.0001950912915876685, - "loss": 2.8415, - "step": 1450 - }, - { - "epoch": 0.1153757403276671, - "grad_norm": 4.5944061279296875, - "learning_rate": 0.000194831571565852, - "loss": 2.9745, - "step": 1500 - }, - { - "epoch": 0.1153757403276671, - "eval_loss": 2.887022018432617, - "eval_runtime": 17.9841, - "eval_samples_per_second": 55.605, - "eval_steps_per_second": 13.901, - "step": 1500 - }, - { - "epoch": 0.11922159833858934, - "grad_norm": 3.0902695655822754, - "learning_rate": 0.00019457185154403555, - "loss": 2.8636, - "step": 1550 - }, - { - "epoch": 0.12306745634951158, - "grad_norm": 5.035081386566162, - "learning_rate": 0.00019431213152221907, - "loss": 2.9387, - "step": 1600 - }, - { - "epoch": 0.12691331436043382, - "grad_norm": 2.6864142417907715, - "learning_rate": 0.00019405241150040255, - "loss": 2.8064, - "step": 1650 - }, - { - "epoch": 0.13075917237135604, - "grad_norm": 4.986116409301758, - "learning_rate": 0.0001937926914785861, - "loss": 2.9818, - "step": 1700 - }, - { - "epoch": 0.13460503038227828, - "grad_norm": 2.94878888130188, - "learning_rate": 0.0001935329714567696, - "loss": 2.8934, - "step": 1750 - }, - { - "epoch": 0.13460503038227828, - "eval_loss": 2.7845654487609863, - "eval_runtime": 17.7992, - "eval_samples_per_second": 56.182, - "eval_steps_per_second": 14.046, - "step": 1750 - }, - { - "epoch": 0.13845088839320052, - "grad_norm": 1.6211572885513306, - "learning_rate": 0.00019327844583538946, - "loss": 2.8786, - "step": 1800 - }, - { - "epoch": 0.14229674640412276, - "grad_norm": 2.8289225101470947, - "learning_rate": 0.000193018725813573, - "loss": 2.702, - "step": 1850 - }, - { - "epoch": 0.146142604415045, - "grad_norm": 3.591913938522339, - "learning_rate": 0.0001927590057917565, - "loss": 2.8623, - "step": 1900 - }, - { - "epoch": 0.14998846242596722, - "grad_norm": 2.2412045001983643, - "learning_rate": 0.00019249928576994002, - "loss": 2.7517, - "step": 1950 - }, - { - "epoch": 0.15383432043688947, - "grad_norm": 2.419358015060425, - "learning_rate": 0.00019223956574812353, - "loss": 2.8122, - "step": 2000 - }, - { - "epoch": 0.15383432043688947, - "eval_loss": 2.7242305278778076, - "eval_runtime": 18.2018, - "eval_samples_per_second": 54.94, - "eval_steps_per_second": 13.735, - "step": 2000 - }, - { - "epoch": 0.1576801784478117, - "grad_norm": 2.873013973236084, - "learning_rate": 0.00019197984572630705, - "loss": 2.8007, - "step": 2050 - }, - { - "epoch": 0.16152603645873395, - "grad_norm": 2.716311454772949, - "learning_rate": 0.00019172012570449056, - "loss": 2.7435, - "step": 2100 - }, - { - "epoch": 0.16537189446965617, - "grad_norm": 2.8333351612091064, - "learning_rate": 0.0001914604056826741, - "loss": 2.6415, - "step": 2150 - }, - { - "epoch": 0.1692177524805784, - "grad_norm": 2.460752248764038, - "learning_rate": 0.0001912006856608576, - "loss": 2.64, - "step": 2200 - }, - { - "epoch": 0.17306361049150065, - "grad_norm": 2.181720733642578, - "learning_rate": 0.00019094096563904112, - "loss": 2.5928, - "step": 2250 - }, - { - "epoch": 0.17306361049150065, - "eval_loss": 2.620633363723755, - "eval_runtime": 17.9269, - "eval_samples_per_second": 55.782, - "eval_steps_per_second": 13.946, - "step": 2250 - }, - { - "epoch": 0.1769094685024229, - "grad_norm": 2.5329666137695312, - "learning_rate": 0.00019068124561722464, - "loss": 2.7231, - "step": 2300 - }, - { - "epoch": 0.18075532651334514, - "grad_norm": 3.047774314880371, - "learning_rate": 0.00019042152559540815, - "loss": 2.6484, - "step": 2350 - }, - { - "epoch": 0.18460118452426735, - "grad_norm": 3.1008481979370117, - "learning_rate": 0.00019016180557359166, - "loss": 2.6458, - "step": 2400 - }, - { - "epoch": 0.1884470425351896, - "grad_norm": 2.8875646591186523, - "learning_rate": 0.0001899020855517752, - "loss": 2.6817, - "step": 2450 - }, - { - "epoch": 0.19229290054611184, - "grad_norm": 2.3887267112731934, - "learning_rate": 0.00018964236552995872, - "loss": 2.6372, - "step": 2500 - }, - { - "epoch": 0.19229290054611184, - "eval_loss": 2.6067891120910645, - "eval_runtime": 18.0696, - "eval_samples_per_second": 55.342, - "eval_steps_per_second": 13.835, - "step": 2500 - }, - { - "epoch": 0.19613875855703408, - "grad_norm": 2.686868190765381, - "learning_rate": 0.00018938264550814223, - "loss": 2.498, - "step": 2550 - }, - { - "epoch": 0.19998461656795632, - "grad_norm": 2.962398052215576, - "learning_rate": 0.00018912292548632574, - "loss": 2.6486, - "step": 2600 - }, - { - "epoch": 0.20383047457887854, - "grad_norm": 1.946420669555664, - "learning_rate": 0.00018886320546450925, - "loss": 2.5939, - "step": 2650 - }, - { - "epoch": 0.20767633258980078, - "grad_norm": 3.541088104248047, - "learning_rate": 0.0001886034854426928, - "loss": 2.5497, - "step": 2700 - }, - { - "epoch": 0.21152219060072303, - "grad_norm": 2.2219605445861816, - "learning_rate": 0.0001883437654208763, - "loss": 2.4533, - "step": 2750 - }, - { - "epoch": 0.21152219060072303, - "eval_loss": 2.572129011154175, - "eval_runtime": 17.9723, - "eval_samples_per_second": 55.641, - "eval_steps_per_second": 13.91, - "step": 2750 - }, - { - "epoch": 0.21536804861164527, - "grad_norm": 3.535885810852051, - "learning_rate": 0.00018808404539905982, - "loss": 2.4784, - "step": 2800 - }, - { - "epoch": 0.21921390662256748, - "grad_norm": 3.0803003311157227, - "learning_rate": 0.00018782432537724336, - "loss": 2.5209, - "step": 2850 - }, - { - "epoch": 0.22305976463348973, - "grad_norm": 2.4540822505950928, - "learning_rate": 0.00018756460535542685, - "loss": 2.6128, - "step": 2900 - }, - { - "epoch": 0.22690562264441197, - "grad_norm": 2.583606243133545, - "learning_rate": 0.00018730488533361036, - "loss": 2.464, - "step": 2950 - }, - { - "epoch": 0.2307514806553342, - "grad_norm": 3.963115930557251, - "learning_rate": 0.0001870451653117939, - "loss": 2.5803, - "step": 3000 - }, - { - "epoch": 0.2307514806553342, - "eval_loss": 2.4989535808563232, - "eval_runtime": 17.9286, - "eval_samples_per_second": 55.777, - "eval_steps_per_second": 13.944, - "step": 3000 - }, - { - "epoch": 0.23459733866625646, - "grad_norm": 3.3239638805389404, - "learning_rate": 0.0001867854452899774, - "loss": 2.5628, - "step": 3050 - }, - { - "epoch": 0.23844319667717867, - "grad_norm": 2.3675172328948975, - "learning_rate": 0.00018652572526816092, - "loss": 2.4826, - "step": 3100 - }, - { - "epoch": 0.2422890546881009, - "grad_norm": 2.950798988342285, - "learning_rate": 0.00018626600524634446, - "loss": 2.518, - "step": 3150 - }, - { - "epoch": 0.24613491269902316, - "grad_norm": 2.4416730403900146, - "learning_rate": 0.00018600628522452798, - "loss": 2.6569, - "step": 3200 - }, - { - "epoch": 0.2499807707099454, - "grad_norm": 3.964357376098633, - "learning_rate": 0.00018574656520271146, - "loss": 2.4808, - "step": 3250 - }, - { - "epoch": 0.2499807707099454, - "eval_loss": 2.4578912258148193, - "eval_runtime": 17.8773, - "eval_samples_per_second": 55.937, - "eval_steps_per_second": 13.984, - "step": 3250 - }, - { - "epoch": 0.25382662872086764, - "grad_norm": 2.267664670944214, - "learning_rate": 0.000185486845180895, - "loss": 2.3763, - "step": 3300 - }, - { - "epoch": 0.2576724867317899, - "grad_norm": 2.5599284172058105, - "learning_rate": 0.00018522712515907852, - "loss": 2.4935, - "step": 3350 - }, - { - "epoch": 0.26151834474271207, - "grad_norm": 1.8299232721328735, - "learning_rate": 0.00018496740513726206, - "loss": 2.4674, - "step": 3400 - }, - { - "epoch": 0.2653642027536343, - "grad_norm": 1.7302981615066528, - "learning_rate": 0.00018470768511544557, - "loss": 2.4121, - "step": 3450 - }, - { - "epoch": 0.26921006076455656, - "grad_norm": 1.7371883392333984, - "learning_rate": 0.00018444796509362908, - "loss": 2.3909, - "step": 3500 - }, - { - "epoch": 0.26921006076455656, - "eval_loss": 2.427211046218872, - "eval_runtime": 17.8942, - "eval_samples_per_second": 55.884, - "eval_steps_per_second": 13.971, - "step": 3500 - }, - { - "epoch": 0.2730559187754788, - "grad_norm": 3.6324281692504883, - "learning_rate": 0.0001841882450718126, - "loss": 2.4369, - "step": 3550 - }, - { - "epoch": 0.27690177678640104, - "grad_norm": 3.5725560188293457, - "learning_rate": 0.0001839285250499961, - "loss": 2.4912, - "step": 3600 - }, - { - "epoch": 0.2807476347973233, - "grad_norm": 2.492825508117676, - "learning_rate": 0.00018366880502817962, - "loss": 2.4626, - "step": 3650 - }, - { - "epoch": 0.28459349280824553, - "grad_norm": 2.1992433071136475, - "learning_rate": 0.00018340908500636316, - "loss": 2.337, - "step": 3700 - }, - { - "epoch": 0.2884393508191678, - "grad_norm": 2.287579298019409, - "learning_rate": 0.00018314936498454667, - "loss": 2.4293, - "step": 3750 - }, - { - "epoch": 0.2884393508191678, - "eval_loss": 2.403160810470581, - "eval_runtime": 18.0987, - "eval_samples_per_second": 55.252, - "eval_steps_per_second": 13.813, - "step": 3750 - }, - { - "epoch": 0.29228520883009, - "grad_norm": 2.5130691528320312, - "learning_rate": 0.00018288964496273019, - "loss": 2.3591, - "step": 3800 - }, - { - "epoch": 0.2961310668410122, - "grad_norm": 2.376176357269287, - "learning_rate": 0.0001826299249409137, - "loss": 2.3887, - "step": 3850 - }, - { - "epoch": 0.29997692485193445, - "grad_norm": 2.7165868282318115, - "learning_rate": 0.0001823702049190972, - "loss": 2.4395, - "step": 3900 - }, - { - "epoch": 0.3038227828628567, - "grad_norm": 2.4704644680023193, - "learning_rate": 0.00018211048489728072, - "loss": 2.3314, - "step": 3950 - }, - { - "epoch": 0.30766864087377893, - "grad_norm": 1.2658872604370117, - "learning_rate": 0.00018185076487546426, - "loss": 2.3133, - "step": 4000 - }, - { - "epoch": 0.30766864087377893, - "eval_loss": 2.3614165782928467, - "eval_runtime": 17.9421, - "eval_samples_per_second": 55.735, - "eval_steps_per_second": 13.934, - "step": 4000 - }, - { - "epoch": 0.3115144988847012, - "grad_norm": 1.7358697652816772, - "learning_rate": 0.00018159104485364778, - "loss": 2.4004, - "step": 4050 - }, - { - "epoch": 0.3153603568956234, - "grad_norm": 2.5807759761810303, - "learning_rate": 0.0001813313248318313, - "loss": 2.3131, - "step": 4100 - }, - { - "epoch": 0.31920621490654566, - "grad_norm": 2.713397741317749, - "learning_rate": 0.00018107160481001483, - "loss": 2.3477, - "step": 4150 - }, - { - "epoch": 0.3230520729174679, - "grad_norm": 2.12007737159729, - "learning_rate": 0.00018081188478819832, - "loss": 2.3955, - "step": 4200 - }, - { - "epoch": 0.32689793092839015, - "grad_norm": 3.2964043617248535, - "learning_rate": 0.00018055216476638186, - "loss": 2.388, - "step": 4250 - }, - { - "epoch": 0.32689793092839015, - "eval_loss": 2.338472604751587, - "eval_runtime": 17.8233, - "eval_samples_per_second": 56.106, - "eval_steps_per_second": 14.027, - "step": 4250 - }, - { - "epoch": 0.33074378893931233, - "grad_norm": 1.9492484331130981, - "learning_rate": 0.00018029244474456537, - "loss": 2.3843, - "step": 4300 - }, - { - "epoch": 0.3345896469502346, - "grad_norm": 1.6252554655075073, - "learning_rate": 0.00018003272472274888, - "loss": 2.2891, - "step": 4350 - }, - { - "epoch": 0.3384355049611568, - "grad_norm": 2.1422605514526367, - "learning_rate": 0.00017977300470093242, - "loss": 2.3104, - "step": 4400 - }, - { - "epoch": 0.34228136297207906, - "grad_norm": 2.899003267288208, - "learning_rate": 0.00017951328467911593, - "loss": 2.2582, - "step": 4450 - }, - { - "epoch": 0.3461272209830013, - "grad_norm": 1.2482810020446777, - "learning_rate": 0.00017925356465729945, - "loss": 2.2296, - "step": 4500 - }, - { - "epoch": 0.3461272209830013, - "eval_loss": 2.2705459594726562, - "eval_runtime": 18.0389, - "eval_samples_per_second": 55.436, - "eval_steps_per_second": 13.859, - "step": 4500 - }, - { - "epoch": 0.34997307899392355, - "grad_norm": 2.5873026847839355, - "learning_rate": 0.00017899384463548296, - "loss": 2.3245, - "step": 4550 - }, - { - "epoch": 0.3538189370048458, - "grad_norm": 2.199103355407715, - "learning_rate": 0.00017873412461366647, - "loss": 2.322, - "step": 4600 - }, - { - "epoch": 0.35766479501576803, - "grad_norm": 2.3591790199279785, - "learning_rate": 0.00017847440459184999, - "loss": 2.3104, - "step": 4650 - }, - { - "epoch": 0.3615106530266903, - "grad_norm": 2.2885327339172363, - "learning_rate": 0.00017821468457003353, - "loss": 2.2148, - "step": 4700 - }, - { - "epoch": 0.36535651103761246, - "grad_norm": 1.8642834424972534, - "learning_rate": 0.00017795496454821704, - "loss": 2.2191, - "step": 4750 - }, - { - "epoch": 0.36535651103761246, - "eval_loss": 2.264026403427124, - "eval_runtime": 17.8717, - "eval_samples_per_second": 55.955, - "eval_steps_per_second": 13.989, - "step": 4750 - }, - { - "epoch": 0.3692023690485347, - "grad_norm": 1.7243143320083618, - "learning_rate": 0.00017769524452640055, - "loss": 2.2907, - "step": 4800 - }, - { - "epoch": 0.37304822705945695, - "grad_norm": 2.168203592300415, - "learning_rate": 0.00017743552450458406, - "loss": 2.2912, - "step": 4850 - }, - { - "epoch": 0.3768940850703792, - "grad_norm": 1.8623658418655396, - "learning_rate": 0.00017717580448276758, - "loss": 2.2809, - "step": 4900 - }, - { - "epoch": 0.38073994308130144, - "grad_norm": 1.9947059154510498, - "learning_rate": 0.0001769160844609511, - "loss": 2.2542, - "step": 4950 - }, - { - "epoch": 0.3845858010922237, - "grad_norm": 3.0198140144348145, - "learning_rate": 0.00017665636443913463, - "loss": 2.2352, - "step": 5000 - }, - { - "epoch": 0.3845858010922237, - "eval_loss": 2.2291696071624756, - "eval_runtime": 17.8435, - "eval_samples_per_second": 56.043, - "eval_steps_per_second": 14.011, - "step": 5000 - }, - { - "epoch": 0.3884316591031459, - "grad_norm": 1.8238661289215088, - "learning_rate": 0.00017639664441731814, - "loss": 2.3479, - "step": 5050 - }, - { - "epoch": 0.39227751711406816, - "grad_norm": 1.2968848943710327, - "learning_rate": 0.00017613692439550165, - "loss": 2.2648, - "step": 5100 - }, - { - "epoch": 0.3961233751249904, - "grad_norm": 3.036280393600464, - "learning_rate": 0.00017587720437368517, - "loss": 2.1473, - "step": 5150 - }, - { - "epoch": 0.39996923313591265, - "grad_norm": 1.7635585069656372, - "learning_rate": 0.00017561748435186868, - "loss": 2.2847, - "step": 5200 - }, - { - "epoch": 0.40381509114683484, - "grad_norm": 1.4038410186767578, - "learning_rate": 0.00017535776433005222, - "loss": 2.2621, - "step": 5250 - }, - { - "epoch": 0.40381509114683484, - "eval_loss": 2.214146614074707, - "eval_runtime": 17.7776, - "eval_samples_per_second": 56.25, - "eval_steps_per_second": 14.063, - "step": 5250 - }, - { - "epoch": 0.4076609491577571, - "grad_norm": 1.6948449611663818, - "learning_rate": 0.00017509804430823573, - "loss": 2.1945, - "step": 5300 - }, - { - "epoch": 0.4115068071686793, - "grad_norm": 2.7417380809783936, - "learning_rate": 0.00017483832428641925, - "loss": 2.2123, - "step": 5350 - }, - { - "epoch": 0.41535266517960157, - "grad_norm": 2.041032075881958, - "learning_rate": 0.00017457860426460279, - "loss": 2.2078, - "step": 5400 - }, - { - "epoch": 0.4191985231905238, - "grad_norm": 2.755319356918335, - "learning_rate": 0.00017431888424278627, - "loss": 2.1637, - "step": 5450 - }, - { - "epoch": 0.42304438120144605, - "grad_norm": 4.371281623840332, - "learning_rate": 0.00017405916422096978, - "loss": 2.3042, - "step": 5500 - }, - { - "epoch": 0.42304438120144605, - "eval_loss": 2.1678926944732666, - "eval_runtime": 17.7194, - "eval_samples_per_second": 56.435, - "eval_steps_per_second": 14.109, - "step": 5500 - }, - { - "epoch": 0.4268902392123683, - "grad_norm": 2.5908162593841553, - "learning_rate": 0.00017379944419915332, - "loss": 2.1524, - "step": 5550 - }, - { - "epoch": 0.43073609722329054, - "grad_norm": 2.3773181438446045, - "learning_rate": 0.00017353972417733684, - "loss": 2.1081, - "step": 5600 - }, - { - "epoch": 0.4345819552342128, - "grad_norm": 1.7582311630249023, - "learning_rate": 0.00017328000415552035, - "loss": 2.1667, - "step": 5650 - }, - { - "epoch": 0.43842781324513497, - "grad_norm": 2.112551689147949, - "learning_rate": 0.0001730202841337039, - "loss": 2.3053, - "step": 5700 - }, - { - "epoch": 0.4422736712560572, - "grad_norm": 1.6515388488769531, - "learning_rate": 0.0001727605641118874, - "loss": 2.1822, - "step": 5750 - }, - { - "epoch": 0.4422736712560572, - "eval_loss": 2.1589457988739014, - "eval_runtime": 18.0719, - "eval_samples_per_second": 55.335, - "eval_steps_per_second": 13.834, - "step": 5750 - }, - { - "epoch": 0.44611952926697945, - "grad_norm": 2.3416104316711426, - "learning_rate": 0.0001725008440900709, - "loss": 2.2417, - "step": 5800 - }, - { - "epoch": 0.4499653872779017, - "grad_norm": 2.031121015548706, - "learning_rate": 0.00017224112406825443, - "loss": 2.2381, - "step": 5850 - }, - { - "epoch": 0.45381124528882394, - "grad_norm": 1.880008339881897, - "learning_rate": 0.00017198140404643794, - "loss": 2.117, - "step": 5900 - }, - { - "epoch": 0.4576571032997462, - "grad_norm": 2.0337576866149902, - "learning_rate": 0.00017172168402462148, - "loss": 2.2118, - "step": 5950 - }, - { - "epoch": 0.4615029613106684, - "grad_norm": 2.1007370948791504, - "learning_rate": 0.000171461964002805, - "loss": 2.1743, - "step": 6000 - }, - { - "epoch": 0.4615029613106684, - "eval_loss": 2.130246877670288, - "eval_runtime": 17.7749, - "eval_samples_per_second": 56.259, - "eval_steps_per_second": 14.065, - "step": 6000 - }, - { - "epoch": 0.46534881932159067, - "grad_norm": 2.155015230178833, - "learning_rate": 0.0001712022439809885, - "loss": 2.1705, - "step": 6050 - }, - { - "epoch": 0.4691946773325129, - "grad_norm": 1.6260873079299927, - "learning_rate": 0.00017094252395917202, - "loss": 2.1626, - "step": 6100 - }, - { - "epoch": 0.4730405353434351, - "grad_norm": 1.473543405532837, - "learning_rate": 0.00017068280393735553, - "loss": 2.1957, - "step": 6150 - }, - { - "epoch": 0.47688639335435734, - "grad_norm": 1.728969931602478, - "learning_rate": 0.00017042308391553905, - "loss": 2.1864, - "step": 6200 - }, - { - "epoch": 0.4807322513652796, - "grad_norm": 1.990909218788147, - "learning_rate": 0.00017016336389372259, - "loss": 2.1385, - "step": 6250 - }, - { - "epoch": 0.4807322513652796, - "eval_loss": 2.0987420082092285, - "eval_runtime": 17.8525, - "eval_samples_per_second": 56.015, - "eval_steps_per_second": 14.004, - "step": 6250 - }, - { - "epoch": 0.4845781093762018, - "grad_norm": 1.9462237358093262, - "learning_rate": 0.0001699036438719061, - "loss": 2.1086, - "step": 6300 - }, - { - "epoch": 0.48842396738712407, - "grad_norm": 1.9223551750183105, - "learning_rate": 0.0001696439238500896, - "loss": 2.1302, - "step": 6350 - }, - { - "epoch": 0.4922698253980463, - "grad_norm": 2.4909427165985107, - "learning_rate": 0.00016938420382827312, - "loss": 2.2133, - "step": 6400 - }, - { - "epoch": 0.49611568340896856, - "grad_norm": 2.9876811504364014, - "learning_rate": 0.00016912448380645664, - "loss": 2.1354, - "step": 6450 - }, - { - "epoch": 0.4999615414198908, - "grad_norm": 1.96725594997406, - "learning_rate": 0.00016886476378464015, - "loss": 2.0744, - "step": 6500 - }, - { - "epoch": 0.4999615414198908, - "eval_loss": 2.083791494369507, - "eval_runtime": 17.9344, - "eval_samples_per_second": 55.759, - "eval_steps_per_second": 13.94, - "step": 6500 - }, - { - "epoch": 0.503807399430813, - "grad_norm": 2.44826602935791, - "learning_rate": 0.0001686050437628237, - "loss": 2.1554, - "step": 6550 - }, - { - "epoch": 0.5076532574417353, - "grad_norm": 3.768937826156616, - "learning_rate": 0.0001683453237410072, - "loss": 2.1123, - "step": 6600 - }, - { - "epoch": 0.5114991154526575, - "grad_norm": 1.6884543895721436, - "learning_rate": 0.00016808560371919072, - "loss": 2.0963, - "step": 6650 - }, - { - "epoch": 0.5153449734635798, - "grad_norm": 3.2465906143188477, - "learning_rate": 0.00016782588369737426, - "loss": 2.0452, - "step": 6700 - }, - { - "epoch": 0.519190831474502, - "grad_norm": 3.1067276000976562, - "learning_rate": 0.00016756616367555774, - "loss": 2.1565, - "step": 6750 - }, - { - "epoch": 0.519190831474502, - "eval_loss": 2.0547163486480713, - "eval_runtime": 17.7909, - "eval_samples_per_second": 56.208, - "eval_steps_per_second": 14.052, - "step": 6750 - }, - { - "epoch": 0.5230366894854241, - "grad_norm": 1.615390419960022, - "learning_rate": 0.00016730644365374128, - "loss": 2.0498, - "step": 6800 - }, - { - "epoch": 0.5268825474963464, - "grad_norm": 3.027007818222046, - "learning_rate": 0.0001670467236319248, - "loss": 2.1043, - "step": 6850 - }, - { - "epoch": 0.5307284055072686, - "grad_norm": 2.288503408432007, - "learning_rate": 0.0001667870036101083, - "loss": 2.019, - "step": 6900 - }, - { - "epoch": 0.5345742635181909, - "grad_norm": 2.088984727859497, - "learning_rate": 0.00016652728358829185, - "loss": 2.1001, - "step": 6950 - }, - { - "epoch": 0.5384201215291131, - "grad_norm": 1.8852577209472656, - "learning_rate": 0.00016626756356647536, - "loss": 2.0464, - "step": 7000 - }, - { - "epoch": 0.5384201215291131, - "eval_loss": 2.042011022567749, - "eval_runtime": 17.752, - "eval_samples_per_second": 56.332, - "eval_steps_per_second": 14.083, - "step": 7000 - }, - { - "epoch": 0.5422659795400354, - "grad_norm": 2.664232015609741, - "learning_rate": 0.00016600784354465885, - "loss": 2.068, - "step": 7050 - }, - { - "epoch": 0.5461118375509576, - "grad_norm": 1.5149424076080322, - "learning_rate": 0.00016574812352284239, - "loss": 2.1542, - "step": 7100 - }, - { - "epoch": 0.5499576955618799, - "grad_norm": 1.6383947134017944, - "learning_rate": 0.0001654884035010259, - "loss": 2.1201, - "step": 7150 - }, - { - "epoch": 0.5538035535728021, - "grad_norm": 2.146315097808838, - "learning_rate": 0.0001652286834792094, - "loss": 2.084, - "step": 7200 - }, - { - "epoch": 0.5576494115837243, - "grad_norm": 3.742957592010498, - "learning_rate": 0.00016496896345739295, - "loss": 2.1831, - "step": 7250 - }, - { - "epoch": 0.5576494115837243, - "eval_loss": 2.0231454372406006, - "eval_runtime": 17.88, - "eval_samples_per_second": 55.928, - "eval_steps_per_second": 13.982, - "step": 7250 - }, - { - "epoch": 0.5614952695946466, - "grad_norm": 2.808338165283203, - "learning_rate": 0.00016470924343557646, - "loss": 2.067, - "step": 7300 - }, - { - "epoch": 0.5653411276055688, - "grad_norm": 1.7112871408462524, - "learning_rate": 0.00016444952341375998, - "loss": 2.0396, - "step": 7350 - }, - { - "epoch": 0.5691869856164911, - "grad_norm": 1.9912537336349487, - "learning_rate": 0.0001641898033919435, - "loss": 2.0466, - "step": 7400 - }, - { - "epoch": 0.5730328436274132, - "grad_norm": 1.753312110900879, - "learning_rate": 0.000163930083370127, - "loss": 2.1138, - "step": 7450 - }, - { - "epoch": 0.5768787016383355, - "grad_norm": 1.4950071573257446, - "learning_rate": 0.00016367036334831052, - "loss": 2.1113, - "step": 7500 - }, - { - "epoch": 0.5768787016383355, - "eval_loss": 2.014023542404175, - "eval_runtime": 17.708, - "eval_samples_per_second": 56.472, - "eval_steps_per_second": 14.118, - "step": 7500 - }, - { - "epoch": 0.5807245596492577, - "grad_norm": 1.7512377500534058, - "learning_rate": 0.00016341064332649406, - "loss": 2.0485, - "step": 7550 - }, - { - "epoch": 0.58457041766018, - "grad_norm": 1.7805777788162231, - "learning_rate": 0.00016315092330467757, - "loss": 1.9163, - "step": 7600 - }, - { - "epoch": 0.5884162756711022, - "grad_norm": 2.1236133575439453, - "learning_rate": 0.00016289120328286108, - "loss": 2.1195, - "step": 7650 - }, - { - "epoch": 0.5922621336820244, - "grad_norm": 1.6518317461013794, - "learning_rate": 0.0001626314832610446, - "loss": 1.9555, - "step": 7700 - }, - { - "epoch": 0.5961079916929467, - "grad_norm": 6.697778701782227, - "learning_rate": 0.0001623717632392281, - "loss": 2.0895, - "step": 7750 - }, - { - "epoch": 0.5961079916929467, - "eval_loss": 1.9862221479415894, - "eval_runtime": 17.9057, - "eval_samples_per_second": 55.848, - "eval_steps_per_second": 13.962, - "step": 7750 - }, - { - "epoch": 0.5999538497038689, - "grad_norm": 1.7682440280914307, - "learning_rate": 0.00016211204321741165, - "loss": 2.0741, - "step": 7800 - }, - { - "epoch": 0.6037997077147912, - "grad_norm": 1.3762458562850952, - "learning_rate": 0.00016185232319559516, - "loss": 1.9891, - "step": 7850 - }, - { - "epoch": 0.6076455657257134, - "grad_norm": 1.802490234375, - "learning_rate": 0.00016159260317377867, - "loss": 1.9971, - "step": 7900 - }, - { - "epoch": 0.6114914237366357, - "grad_norm": 1.8477146625518799, - "learning_rate": 0.0001613328831519622, - "loss": 2.0318, - "step": 7950 - }, - { - "epoch": 0.6153372817475579, - "grad_norm": 2.1450252532958984, - "learning_rate": 0.0001610731631301457, - "loss": 2.1171, - "step": 8000 - }, - { - "epoch": 0.6153372817475579, - "eval_loss": 1.9715532064437866, - "eval_runtime": 17.7264, - "eval_samples_per_second": 56.413, - "eval_steps_per_second": 14.103, - "step": 8000 - }, - { - "epoch": 0.6191831397584802, - "grad_norm": 2.724039077758789, - "learning_rate": 0.0001608134431083292, - "loss": 1.9564, - "step": 8050 - }, - { - "epoch": 0.6230289977694023, - "grad_norm": 2.1357760429382324, - "learning_rate": 0.00016055372308651275, - "loss": 2.044, - "step": 8100 - }, - { - "epoch": 0.6268748557803245, - "grad_norm": 2.747109889984131, - "learning_rate": 0.00016029400306469626, - "loss": 2.0157, - "step": 8150 - }, - { - "epoch": 0.6307207137912468, - "grad_norm": 2.979858160018921, - "learning_rate": 0.00016003428304287978, - "loss": 2.0219, - "step": 8200 - }, - { - "epoch": 0.634566571802169, - "grad_norm": 1.5936884880065918, - "learning_rate": 0.00015977456302106332, - "loss": 1.9364, - "step": 8250 - }, - { - "epoch": 0.634566571802169, - "eval_loss": 1.9464218616485596, - "eval_runtime": 17.7894, - "eval_samples_per_second": 56.213, - "eval_steps_per_second": 14.053, - "step": 8250 - }, - { - "epoch": 0.6384124298130913, - "grad_norm": 2.502946376800537, - "learning_rate": 0.00015951484299924683, - "loss": 1.9746, - "step": 8300 - }, - { - "epoch": 0.6422582878240135, - "grad_norm": 1.307023048400879, - "learning_rate": 0.00015925512297743034, - "loss": 1.9936, - "step": 8350 - }, - { - "epoch": 0.6461041458349358, - "grad_norm": 2.037578582763672, - "learning_rate": 0.00015899540295561385, - "loss": 1.9515, - "step": 8400 - }, - { - "epoch": 0.649950003845858, - "grad_norm": 1.4244835376739502, - "learning_rate": 0.00015873568293379737, - "loss": 1.9988, - "step": 8450 - }, - { - "epoch": 0.6537958618567803, - "grad_norm": 2.5362133979797363, - "learning_rate": 0.0001584759629119809, - "loss": 2.1192, - "step": 8500 - }, - { - "epoch": 0.6537958618567803, - "eval_loss": 1.9391300678253174, - "eval_runtime": 17.9326, - "eval_samples_per_second": 55.764, - "eval_steps_per_second": 13.941, - "step": 8500 - }, - { - "epoch": 0.6576417198677025, - "grad_norm": 1.2576918601989746, - "learning_rate": 0.00015821624289016442, - "loss": 1.9734, - "step": 8550 - }, - { - "epoch": 0.6614875778786247, - "grad_norm": 2.036285400390625, - "learning_rate": 0.00015795652286834793, - "loss": 1.9899, - "step": 8600 - }, - { - "epoch": 0.665333435889547, - "grad_norm": 1.6453627347946167, - "learning_rate": 0.00015769680284653145, - "loss": 1.9238, - "step": 8650 - }, - { - "epoch": 0.6691792939004692, - "grad_norm": 1.4801263809204102, - "learning_rate": 0.00015743708282471496, - "loss": 1.9491, - "step": 8700 - }, - { - "epoch": 0.6730251519113915, - "grad_norm": 2.176879405975342, - "learning_rate": 0.00015717736280289847, - "loss": 1.9754, - "step": 8750 - }, - { - "epoch": 0.6730251519113915, - "eval_loss": 1.9268388748168945, - "eval_runtime": 17.7426, - "eval_samples_per_second": 56.362, - "eval_steps_per_second": 14.09, - "step": 8750 - }, - { - "epoch": 0.6768710099223136, - "grad_norm": 2.347804546356201, - "learning_rate": 0.000156917642781082, - "loss": 2.0034, - "step": 8800 - }, - { - "epoch": 0.6807168679332359, - "grad_norm": 1.8260661363601685, - "learning_rate": 0.00015665792275926552, - "loss": 1.9851, - "step": 8850 - }, - { - "epoch": 0.6845627259441581, - "grad_norm": 2.697566509246826, - "learning_rate": 0.00015639820273744904, - "loss": 1.9259, - "step": 8900 - }, - { - "epoch": 0.6884085839550804, - "grad_norm": 2.5771334171295166, - "learning_rate": 0.00015613848271563255, - "loss": 1.9216, - "step": 8950 - }, - { - "epoch": 0.6922544419660026, - "grad_norm": 1.098548173904419, - "learning_rate": 0.00015587876269381606, - "loss": 1.8965, - "step": 9000 - }, - { - "epoch": 0.6922544419660026, - "eval_loss": 1.900924801826477, - "eval_runtime": 17.6818, - "eval_samples_per_second": 56.555, - "eval_steps_per_second": 14.139, - "step": 9000 - }, - { - "epoch": 0.6961002999769248, - "grad_norm": 3.657864809036255, - "learning_rate": 0.00015561904267199958, - "loss": 1.95, - "step": 9050 - }, - { - "epoch": 0.6999461579878471, - "grad_norm": 1.5260729789733887, - "learning_rate": 0.00015535932265018312, - "loss": 1.9086, - "step": 9100 - }, - { - "epoch": 0.7037920159987693, - "grad_norm": 1.523361086845398, - "learning_rate": 0.00015509960262836663, - "loss": 1.8866, - "step": 9150 - }, - { - "epoch": 0.7076378740096916, - "grad_norm": 2.4977364540100098, - "learning_rate": 0.00015483988260655017, - "loss": 1.9033, - "step": 9200 - }, - { - "epoch": 0.7114837320206138, - "grad_norm": 2.8058366775512695, - "learning_rate": 0.00015458016258473365, - "loss": 1.9326, - "step": 9250 - }, - { - "epoch": 0.7114837320206138, - "eval_loss": 1.9121650457382202, - "eval_runtime": 17.715, - "eval_samples_per_second": 56.449, - "eval_steps_per_second": 14.112, - "step": 9250 - }, - { - "epoch": 0.7153295900315361, - "grad_norm": 2.2067878246307373, - "learning_rate": 0.00015432044256291717, - "loss": 1.9433, - "step": 9300 - }, - { - "epoch": 0.7191754480424583, - "grad_norm": 2.8370649814605713, - "learning_rate": 0.0001540607225411007, - "loss": 1.8972, - "step": 9350 - }, - { - "epoch": 0.7230213060533806, - "grad_norm": 1.707829236984253, - "learning_rate": 0.00015380100251928422, - "loss": 1.9328, - "step": 9400 - }, - { - "epoch": 0.7268671640643027, - "grad_norm": 2.2784628868103027, - "learning_rate": 0.00015354128249746773, - "loss": 1.9882, - "step": 9450 - }, - { - "epoch": 0.7307130220752249, - "grad_norm": 0.9409534931182861, - "learning_rate": 0.00015328156247565127, - "loss": 1.865, - "step": 9500 - }, - { - "epoch": 0.7307130220752249, - "eval_loss": 1.8746944665908813, - "eval_runtime": 17.5929, - "eval_samples_per_second": 56.841, - "eval_steps_per_second": 14.21, - "step": 9500 - }, - { - "epoch": 0.7345588800861472, - "grad_norm": 1.6800929307937622, - "learning_rate": 0.00015302184245383479, - "loss": 1.8351, - "step": 9550 - }, - { - "epoch": 0.7384047380970694, - "grad_norm": 2.518232822418213, - "learning_rate": 0.00015276212243201827, - "loss": 1.9335, - "step": 9600 - }, - { - "epoch": 0.7422505961079917, - "grad_norm": 1.3104406595230103, - "learning_rate": 0.0001525024024102018, - "loss": 1.9058, - "step": 9650 - }, - { - "epoch": 0.7460964541189139, - "grad_norm": 1.8274422883987427, - "learning_rate": 0.00015224268238838532, - "loss": 1.8246, - "step": 9700 - }, - { - "epoch": 0.7499423121298362, - "grad_norm": 1.5810478925704956, - "learning_rate": 0.00015198296236656884, - "loss": 1.9504, - "step": 9750 - }, - { - "epoch": 0.7499423121298362, - "eval_loss": 1.8743510246276855, - "eval_runtime": 17.783, - "eval_samples_per_second": 56.233, - "eval_steps_per_second": 14.058, - "step": 9750 - }, - { - "epoch": 0.7537881701407584, - "grad_norm": 1.7910431623458862, - "learning_rate": 0.00015172324234475238, - "loss": 1.8525, - "step": 9800 - }, - { - "epoch": 0.7576340281516807, - "grad_norm": 0.7378084063529968, - "learning_rate": 0.0001514635223229359, - "loss": 1.8688, - "step": 9850 - }, - { - "epoch": 0.7614798861626029, - "grad_norm": 1.357282280921936, - "learning_rate": 0.0001512038023011194, - "loss": 1.855, - "step": 9900 - }, - { - "epoch": 0.7653257441735252, - "grad_norm": 2.7199490070343018, - "learning_rate": 0.00015094408227930292, - "loss": 1.8655, - "step": 9950 - }, - { - "epoch": 0.7691716021844474, - "grad_norm": 1.9351532459259033, - "learning_rate": 0.00015068436225748643, - "loss": 1.9272, - "step": 10000 - }, - { - "epoch": 0.7691716021844474, - "eval_loss": 1.8518301248550415, - "eval_runtime": 17.6154, - "eval_samples_per_second": 56.769, - "eval_steps_per_second": 14.192, - "step": 10000 - }, - { - "epoch": 0.7730174601953695, - "grad_norm": 1.4824655055999756, - "learning_rate": 0.00015042464223566997, - "loss": 1.9359, - "step": 10050 - }, - { - "epoch": 0.7768633182062918, - "grad_norm": 1.8489737510681152, - "learning_rate": 0.00015016492221385348, - "loss": 1.7524, - "step": 10100 - }, - { - "epoch": 0.780709176217214, - "grad_norm": 1.8048781156539917, - "learning_rate": 0.000149905202192037, - "loss": 1.9209, - "step": 10150 - }, - { - "epoch": 0.7845550342281363, - "grad_norm": 1.5031684637069702, - "learning_rate": 0.0001496454821702205, - "loss": 1.8224, - "step": 10200 - }, - { - "epoch": 0.7884008922390585, - "grad_norm": 3.326998710632324, - "learning_rate": 0.00014938576214840402, - "loss": 1.8585, - "step": 10250 - }, - { - "epoch": 0.7884008922390585, - "eval_loss": 1.840669870376587, - "eval_runtime": 17.7052, - "eval_samples_per_second": 56.48, - "eval_steps_per_second": 14.12, - "step": 10250 - }, - { - "epoch": 0.7922467502499808, - "grad_norm": 1.9124550819396973, - "learning_rate": 0.00014912604212658753, - "loss": 1.8978, - "step": 10300 - }, - { - "epoch": 0.796092608260903, - "grad_norm": 1.2113343477249146, - "learning_rate": 0.00014886632210477107, - "loss": 1.9814, - "step": 10350 - }, - { - "epoch": 0.7999384662718253, - "grad_norm": 3.08606219291687, - "learning_rate": 0.00014860660208295459, - "loss": 1.7926, - "step": 10400 - }, - { - "epoch": 0.8037843242827475, - "grad_norm": 3.1372311115264893, - "learning_rate": 0.0001483468820611381, - "loss": 1.9039, - "step": 10450 - }, - { - "epoch": 0.8076301822936697, - "grad_norm": 2.0522525310516357, - "learning_rate": 0.00014808716203932164, - "loss": 1.91, - "step": 10500 - }, - { - "epoch": 0.8076301822936697, - "eval_loss": 1.8211458921432495, - "eval_runtime": 17.6697, - "eval_samples_per_second": 56.594, - "eval_steps_per_second": 14.149, - "step": 10500 - }, - { - "epoch": 0.811476040304592, - "grad_norm": 2.5157995223999023, - "learning_rate": 0.00014782744201750512, - "loss": 1.9747, - "step": 10550 - }, - { - "epoch": 0.8153218983155142, - "grad_norm": 1.821583867073059, - "learning_rate": 0.00014756772199568864, - "loss": 1.8858, - "step": 10600 - }, - { - "epoch": 0.8191677563264365, - "grad_norm": 1.9474884271621704, - "learning_rate": 0.00014730800197387218, - "loss": 1.7796, - "step": 10650 - }, - { - "epoch": 0.8230136143373586, - "grad_norm": 1.889302372932434, - "learning_rate": 0.0001470482819520557, - "loss": 2.0, - "step": 10700 - }, - { - "epoch": 0.8268594723482809, - "grad_norm": 2.596470594406128, - "learning_rate": 0.0001467885619302392, - "loss": 1.8673, - "step": 10750 - }, - { - "epoch": 0.8268594723482809, - "eval_loss": 1.821048617362976, - "eval_runtime": 17.7145, - "eval_samples_per_second": 56.451, - "eval_steps_per_second": 14.113, - "step": 10750 - }, - { - "epoch": 0.8307053303592031, - "grad_norm": 1.4734866619110107, - "learning_rate": 0.00014652884190842274, - "loss": 1.794, - "step": 10800 - }, - { - "epoch": 0.8345511883701254, - "grad_norm": 2.4944119453430176, - "learning_rate": 0.00014626912188660625, - "loss": 1.8974, - "step": 10850 - }, - { - "epoch": 0.8383970463810476, - "grad_norm": 2.175258159637451, - "learning_rate": 0.00014600940186478977, - "loss": 1.9176, - "step": 10900 - }, - { - "epoch": 0.8422429043919698, - "grad_norm": 1.6867458820343018, - "learning_rate": 0.00014574968184297328, - "loss": 1.8364, - "step": 10950 - }, - { - "epoch": 0.8460887624028921, - "grad_norm": 1.8447539806365967, - "learning_rate": 0.0001454899618211568, - "loss": 1.7942, - "step": 11000 - }, - { - "epoch": 0.8460887624028921, - "eval_loss": 1.7994624376296997, - "eval_runtime": 17.4954, - "eval_samples_per_second": 57.158, - "eval_steps_per_second": 14.289, - "step": 11000 - }, - { - "epoch": 0.8499346204138143, - "grad_norm": 2.359623432159424, - "learning_rate": 0.00014523024179934033, - "loss": 1.8082, - "step": 11050 - }, - { - "epoch": 0.8537804784247366, - "grad_norm": 1.690730333328247, - "learning_rate": 0.00014497052177752385, - "loss": 1.9029, - "step": 11100 - }, - { - "epoch": 0.8576263364356588, - "grad_norm": 1.5211533308029175, - "learning_rate": 0.00014471080175570736, - "loss": 1.836, - "step": 11150 - }, - { - "epoch": 0.8614721944465811, - "grad_norm": 2.294943332672119, - "learning_rate": 0.00014445108173389087, - "loss": 1.8554, - "step": 11200 - }, - { - "epoch": 0.8653180524575033, - "grad_norm": 1.7020373344421387, - "learning_rate": 0.00014419136171207438, - "loss": 1.8487, - "step": 11250 - }, - { - "epoch": 0.8653180524575033, - "eval_loss": 1.8003779649734497, - "eval_runtime": 17.5162, - "eval_samples_per_second": 57.09, - "eval_steps_per_second": 14.272, - "step": 11250 - }, - { - "epoch": 0.8691639104684256, - "grad_norm": 1.6584821939468384, - "learning_rate": 0.0001439316416902579, - "loss": 1.8169, - "step": 11300 - }, - { - "epoch": 0.8730097684793477, - "grad_norm": 1.5835644006729126, - "learning_rate": 0.00014367192166844144, - "loss": 1.8344, - "step": 11350 - }, - { - "epoch": 0.8768556264902699, - "grad_norm": 1.37686026096344, - "learning_rate": 0.00014341220164662495, - "loss": 1.839, - "step": 11400 - }, - { - "epoch": 0.8807014845011922, - "grad_norm": 1.2032794952392578, - "learning_rate": 0.00014315248162480846, - "loss": 1.8392, - "step": 11450 - }, - { - "epoch": 0.8845473425121144, - "grad_norm": 2.025676727294922, - "learning_rate": 0.00014289276160299198, - "loss": 1.7939, - "step": 11500 - }, - { - "epoch": 0.8845473425121144, - "eval_loss": 1.7949305772781372, - "eval_runtime": 17.7014, - "eval_samples_per_second": 56.493, - "eval_steps_per_second": 14.123, - "step": 11500 - }, - { - "epoch": 0.8883932005230367, - "grad_norm": 2.710430383682251, - "learning_rate": 0.0001426330415811755, - "loss": 1.8816, - "step": 11550 - }, - { - "epoch": 0.8922390585339589, - "grad_norm": 1.4110090732574463, - "learning_rate": 0.000142373321559359, - "loss": 1.8762, - "step": 11600 - }, - { - "epoch": 0.8960849165448812, - "grad_norm": 1.4623394012451172, - "learning_rate": 0.00014211360153754254, - "loss": 1.8463, - "step": 11650 - }, - { - "epoch": 0.8999307745558034, - "grad_norm": 1.2035316228866577, - "learning_rate": 0.00014185388151572605, - "loss": 1.8143, - "step": 11700 - }, - { - "epoch": 0.9037766325667257, - "grad_norm": 3.2838220596313477, - "learning_rate": 0.0001415941614939096, - "loss": -16.199, - "step": 11750 - }, - { - "epoch": 0.9037766325667257, - "eval_loss": -26.813920974731445, - "eval_runtime": 17.131, - "eval_samples_per_second": 58.374, - "eval_steps_per_second": 14.593, - "step": 11750 - }, - { - "epoch": 0.9076224905776479, - "grad_norm": 9.025236129760742, - "learning_rate": 0.00014133963587252942, - "loss": -28.4398, - "step": 11800 - }, - { - "epoch": 0.9114683485885701, - "grad_norm": 9.17872428894043, - "learning_rate": 0.00014109030465158558, - "loss": -29.3786, - "step": 11850 - }, - { - "epoch": 0.9153142065994924, - "grad_norm": 13.099103927612305, - "learning_rate": 0.00014083058462976912, - "loss": -29.5958, - "step": 11900 - }, - { - "epoch": 0.9191600646104146, - "grad_norm": 7.575841426849365, - "learning_rate": 0.00014057086460795263, - "loss": -29.7462, - "step": 11950 - }, - { - "epoch": 0.9230059226213368, - "grad_norm": 8.827557563781738, - "learning_rate": 0.00014031114458613615, - "loss": -29.9022, - "step": 12000 - }, - { - "epoch": 0.9230059226213368, - "eval_loss": -29.983943939208984, - "eval_runtime": 17.1961, - "eval_samples_per_second": 58.153, - "eval_steps_per_second": 14.538, - "step": 12000 - }, - { - "epoch": 0.926851780632259, - "grad_norm": 9.088982582092285, - "learning_rate": 0.0001400514245643197, - "loss": -29.9348, - "step": 12050 - }, - { - "epoch": 0.9306976386431813, - "grad_norm": 3.507662534713745, - "learning_rate": 0.0001397917045425032, - "loss": -30.0066, - "step": 12100 - }, - { - "epoch": 0.9345434966541035, - "grad_norm": 5.43391227722168, - "learning_rate": 0.00013953198452068669, - "loss": -30.0515, - "step": 12150 - }, - { - "epoch": 0.9383893546650258, - "grad_norm": 6.1119704246521, - "learning_rate": 0.00013927226449887023, - "loss": -30.0768, - "step": 12200 - }, - { - "epoch": 0.942235212675948, - "grad_norm": 4.298177242279053, - "learning_rate": 0.00013901254447705374, - "loss": -30.0727, - "step": 12250 - }, - { - "epoch": 0.942235212675948, - "eval_loss": -30.140077590942383, - "eval_runtime": 17.3124, - "eval_samples_per_second": 57.762, - "eval_steps_per_second": 14.441, - "step": 12250 - }, - { - "epoch": 0.9460810706868702, - "grad_norm": 5.769038677215576, - "learning_rate": 0.00013875282445523725, - "loss": -30.0775, - "step": 12300 - }, - { - "epoch": 0.9499269286977925, - "grad_norm": 5.405306816101074, - "learning_rate": 0.0001384931044334208, - "loss": -30.1151, - "step": 12350 - }, - { - "epoch": 0.9537727867087147, - "grad_norm": 7.666799545288086, - "learning_rate": 0.0001382333844116043, - "loss": -30.1243, - "step": 12400 - }, - { - "epoch": 0.957618644719637, - "grad_norm": 8.419403076171875, - "learning_rate": 0.00013797366438978782, - "loss": -30.1474, - "step": 12450 - }, - { - "epoch": 0.9614645027305592, - "grad_norm": 2.181037187576294, - "learning_rate": 0.00013771394436797133, - "loss": -30.1508, - "step": 12500 - }, - { - "epoch": 0.9614645027305592, - "eval_loss": -30.163711547851562, - "eval_runtime": 17.0325, - "eval_samples_per_second": 58.711, - "eval_steps_per_second": 14.678, - "step": 12500 - }, - { - "epoch": 0.9653103607414815, - "grad_norm": 8.547541618347168, - "learning_rate": 0.00013745422434615484, - "loss": -30.1494, - "step": 12550 - }, - { - "epoch": 0.9691562187524037, - "grad_norm": 4.740926265716553, - "learning_rate": 0.00013719450432433838, - "loss": -30.1558, - "step": 12600 - }, - { - "epoch": 0.973002076763326, - "grad_norm": 5.869479179382324, - "learning_rate": 0.0001369347843025219, - "loss": -30.1616, - "step": 12650 - }, - { - "epoch": 0.9768479347742481, - "grad_norm": 7.4348673820495605, - "learning_rate": 0.0001366750642807054, - "loss": -30.1561, - "step": 12700 - }, - { - "epoch": 0.9806937927851703, - "grad_norm": 8.506593704223633, - "learning_rate": 0.00013641534425888892, - "loss": -30.1737, - "step": 12750 - }, - { - "epoch": 0.9806937927851703, - "eval_loss": -30.198650360107422, - "eval_runtime": 16.9999, - "eval_samples_per_second": 58.824, - "eval_steps_per_second": 14.706, - "step": 12750 - }, - { - "epoch": 0.9845396507960926, - "grad_norm": 5.528530120849609, - "learning_rate": 0.00013615562423707243, - "loss": -30.1728, - "step": 12800 - }, - { - "epoch": 0.9883855088070148, - "grad_norm": 3.630002021789551, - "learning_rate": 0.00013589590421525595, - "loss": -30.1832, - "step": 12850 - }, - { - "epoch": 0.9922313668179371, - "grad_norm": 5.898171424865723, - "learning_rate": 0.00013563618419343949, - "loss": -30.1849, - "step": 12900 - }, - { - "epoch": 0.9960772248288593, - "grad_norm": 5.75803279876709, - "learning_rate": 0.000135376464171623, - "loss": -30.1971, - "step": 12950 - }, - { - "epoch": 0.9999230828397816, - "grad_norm": 8.700019836425781, - "learning_rate": 0.0001351167441498065, - "loss": -30.2117, - "step": 13000 - }, - { - "epoch": 0.9999230828397816, - "eval_loss": -30.213665008544922, - "eval_runtime": 17.0537, - "eval_samples_per_second": 58.638, - "eval_steps_per_second": 14.66, - "step": 13000 - }, - { - "epoch": 1.0037689408507038, - "grad_norm": 6.23037052154541, - "learning_rate": 0.00013485702412799005, - "loss": -30.1957, - "step": 13050 - }, - { - "epoch": 1.007614798861626, - "grad_norm": 4.10649299621582, - "learning_rate": 0.00013459730410617354, - "loss": -30.2093, - "step": 13100 - }, - { - "epoch": 1.0114606568725482, - "grad_norm": 6.44740629196167, - "learning_rate": 0.00013433758408435705, - "loss": -30.1925, - "step": 13150 - }, - { - "epoch": 1.0153065148834706, - "grad_norm": 3.919144630432129, - "learning_rate": 0.0001340778640625406, - "loss": -30.2131, - "step": 13200 - }, - { - "epoch": 1.0191523728943928, - "grad_norm": 3.7862091064453125, - "learning_rate": 0.0001338181440407241, - "loss": -30.2237, - "step": 13250 - }, - { - "epoch": 1.0191523728943928, - "eval_loss": -30.221071243286133, - "eval_runtime": 17.0391, - "eval_samples_per_second": 58.688, - "eval_steps_per_second": 14.672, - "step": 13250 - }, - { - "epoch": 1.022998230905315, - "grad_norm": 2.9611716270446777, - "learning_rate": 0.00013355842401890762, - "loss": -30.2087, - "step": 13300 - }, - { - "epoch": 1.0268440889162371, - "grad_norm": 3.5864665508270264, - "learning_rate": 0.00013329870399709116, - "loss": -30.2231, - "step": 13350 - }, - { - "epoch": 1.0306899469271595, - "grad_norm": 4.609787940979004, - "learning_rate": 0.00013304417837571098, - "loss": -30.2137, - "step": 13400 - }, - { - "epoch": 1.0345358049380817, - "grad_norm": 13.944046974182129, - "learning_rate": 0.0001327844583538945, - "loss": -30.2348, - "step": 13450 - }, - { - "epoch": 1.038381662949004, - "grad_norm": 3.965855836868286, - "learning_rate": 0.00013252473833207803, - "loss": -30.2325, - "step": 13500 - }, - { - "epoch": 1.038381662949004, - "eval_loss": -30.246578216552734, - "eval_runtime": 17.0659, - "eval_samples_per_second": 58.596, - "eval_steps_per_second": 14.649, - "step": 13500 - }, - { - "epoch": 1.042227520959926, - "grad_norm": 11.001680374145508, - "learning_rate": 0.00013226501831026155, - "loss": -30.2413, - "step": 13550 - }, - { - "epoch": 1.0460733789708483, - "grad_norm": 6.924397945404053, - "learning_rate": 0.00013200529828844506, - "loss": -30.2411, - "step": 13600 - }, - { - "epoch": 1.0499192369817707, - "grad_norm": 4.740690231323242, - "learning_rate": 0.0001317455782666286, - "loss": -30.2412, - "step": 13650 - }, - { - "epoch": 1.0537650949926929, - "grad_norm": 6.716350078582764, - "learning_rate": 0.0001314858582448121, - "loss": -30.243, - "step": 13700 - }, - { - "epoch": 1.057610953003615, - "grad_norm": 3.5048656463623047, - "learning_rate": 0.0001312261382229956, - "loss": -30.2447, - "step": 13750 - }, - { - "epoch": 1.057610953003615, - "eval_loss": -30.26629066467285, - "eval_runtime": 17.0438, - "eval_samples_per_second": 58.672, - "eval_steps_per_second": 14.668, - "step": 13750 - }, - { - "epoch": 1.0614568110145373, - "grad_norm": 12.012964248657227, - "learning_rate": 0.00013096641820117914, - "loss": -30.2578, - "step": 13800 - }, - { - "epoch": 1.0653026690254597, - "grad_norm": 5.072702407836914, - "learning_rate": 0.00013070669817936265, - "loss": -30.2674, - "step": 13850 - }, - { - "epoch": 1.0691485270363819, - "grad_norm": 6.612403392791748, - "learning_rate": 0.00013044697815754616, - "loss": -30.281, - "step": 13900 - }, - { - "epoch": 1.072994385047304, - "grad_norm": 5.599041938781738, - "learning_rate": 0.0001301872581357297, - "loss": -30.2823, - "step": 13950 - }, - { - "epoch": 1.0768402430582262, - "grad_norm": 3.985121726989746, - "learning_rate": 0.00012992753811391322, - "loss": -30.2907, - "step": 14000 - }, - { - "epoch": 1.0768402430582262, - "eval_loss": -30.32036781311035, - "eval_runtime": 16.9552, - "eval_samples_per_second": 58.979, - "eval_steps_per_second": 14.745, - "step": 14000 - }, - { - "epoch": 1.0806861010691486, - "grad_norm": 8.776793479919434, - "learning_rate": 0.00012966781809209673, - "loss": -30.2985, - "step": 14050 - }, - { - "epoch": 1.0845319590800708, - "grad_norm": 6.127963066101074, - "learning_rate": 0.00012940809807028024, - "loss": -30.3066, - "step": 14100 - }, - { - "epoch": 1.088377817090993, - "grad_norm": 7.999585151672363, - "learning_rate": 0.00012914837804846375, - "loss": -30.2914, - "step": 14150 - }, - { - "epoch": 1.0922236751019152, - "grad_norm": 7.283746719360352, - "learning_rate": 0.0001288886580266473, - "loss": -30.3119, - "step": 14200 - }, - { - "epoch": 1.0960695331128374, - "grad_norm": 10.978534698486328, - "learning_rate": 0.0001286289380048308, - "loss": -30.3062, - "step": 14250 - }, - { - "epoch": 1.0960695331128374, - "eval_loss": -30.304922103881836, - "eval_runtime": 16.9338, - "eval_samples_per_second": 59.054, - "eval_steps_per_second": 14.763, - "step": 14250 - }, - { - "epoch": 1.0999153911237598, - "grad_norm": 4.0659918785095215, - "learning_rate": 0.00012836921798301432, - "loss": -30.3197, - "step": 14300 - }, - { - "epoch": 1.103761249134682, - "grad_norm": 6.115564346313477, - "learning_rate": 0.00012810949796119783, - "loss": -30.3324, - "step": 14350 - }, - { - "epoch": 1.1076071071456042, - "grad_norm": 7.9282331466674805, - "learning_rate": 0.00012784977793938135, - "loss": -30.3401, - "step": 14400 - }, - { - "epoch": 1.1114529651565264, - "grad_norm": 5.920327663421631, - "learning_rate": 0.00012759005791756486, - "loss": -30.3405, - "step": 14450 - }, - { - "epoch": 1.1152988231674485, - "grad_norm": 3.5994579792022705, - "learning_rate": 0.0001273303378957484, - "loss": -30.3499, - "step": 14500 - }, - { - "epoch": 1.1152988231674485, - "eval_loss": -30.38400650024414, - "eval_runtime": 16.9883, - "eval_samples_per_second": 58.864, - "eval_steps_per_second": 14.716, - "step": 14500 - }, - { - "epoch": 1.119144681178371, - "grad_norm": 4.9496750831604, - "learning_rate": 0.0001270706178739319, - "loss": -30.352, - "step": 14550 - }, - { - "epoch": 1.1229905391892931, - "grad_norm": 4.976698875427246, - "learning_rate": 0.00012681089785211542, - "loss": -30.3553, - "step": 14600 - }, - { - "epoch": 1.1268363972002153, - "grad_norm": 3.139697790145874, - "learning_rate": 0.00012655117783029896, - "loss": -30.357, - "step": 14650 - }, - { - "epoch": 1.1306822552111375, - "grad_norm": 5.011180400848389, - "learning_rate": 0.00012629145780848245, - "loss": -30.3574, - "step": 14700 - }, - { - "epoch": 1.13452811322206, - "grad_norm": 8.093709945678711, - "learning_rate": 0.00012603173778666596, - "loss": -30.3587, - "step": 14750 - }, - { - "epoch": 1.13452811322206, - "eval_loss": -30.386066436767578, - "eval_runtime": 16.9522, - "eval_samples_per_second": 58.989, - "eval_steps_per_second": 14.747, - "step": 14750 - }, - { - "epoch": 1.1383739712329821, - "grad_norm": 4.983211040496826, - "learning_rate": 0.0001257720177648495, - "loss": -30.3616, - "step": 14800 - }, - { - "epoch": 1.1422198292439043, - "grad_norm": 3.838836431503296, - "learning_rate": 0.00012551229774303301, - "loss": -30.3639, - "step": 14850 - }, - { - "epoch": 1.1460656872548265, - "grad_norm": 53.798492431640625, - "learning_rate": 0.00012525257772121653, - "loss": -30.3554, - "step": 14900 - }, - { - "epoch": 1.149911545265749, - "grad_norm": 2.782118558883667, - "learning_rate": 0.00012499285769940007, - "loss": -30.3685, - "step": 14950 - }, - { - "epoch": 1.153757403276671, - "grad_norm": 4.757026195526123, - "learning_rate": 0.00012473313767758355, - "loss": -30.3738, - "step": 15000 - }, - { - "epoch": 1.153757403276671, - "eval_loss": -30.39963722229004, - "eval_runtime": 16.9126, - "eval_samples_per_second": 59.128, - "eval_steps_per_second": 14.782, - "step": 15000 - }, - { - "epoch": 1.1576032612875933, - "grad_norm": 5.459719657897949, - "learning_rate": 0.0001244734176557671, - "loss": -30.3726, - "step": 15050 - }, - { - "epoch": 1.1614491192985155, - "grad_norm": 5.796365737915039, - "learning_rate": 0.0001242136976339506, - "loss": -30.3743, - "step": 15100 - }, - { - "epoch": 1.1652949773094377, - "grad_norm": 8.96560287475586, - "learning_rate": 0.00012395397761213412, - "loss": -30.3713, - "step": 15150 - }, - { - "epoch": 1.16914083532036, - "grad_norm": 3.6820108890533447, - "learning_rate": 0.00012369425759031766, - "loss": -30.3737, - "step": 15200 - }, - { - "epoch": 1.1729866933312822, - "grad_norm": 4.876701831817627, - "learning_rate": 0.00012343453756850117, - "loss": -30.3783, - "step": 15250 - }, - { - "epoch": 1.1729866933312822, - "eval_loss": -30.406328201293945, - "eval_runtime": 17.0903, - "eval_samples_per_second": 58.513, - "eval_steps_per_second": 14.628, - "step": 15250 - }, - { - "epoch": 1.1768325513422044, - "grad_norm": 6.255270957946777, - "learning_rate": 0.00012317481754668468, - "loss": -30.3819, - "step": 15300 - }, - { - "epoch": 1.1806784093531266, - "grad_norm": 3.348052978515625, - "learning_rate": 0.00012293068072617719, - "loss": -30.0431, - "step": 15350 - }, - { - "epoch": 1.1845242673640488, - "grad_norm": 6.610827922821045, - "learning_rate": 0.0001226709607043607, - "loss": -30.3818, - "step": 15400 - }, - { - "epoch": 1.1883701253749712, - "grad_norm": 5.610541343688965, - "learning_rate": 0.0001224112406825442, - "loss": -30.3844, - "step": 15450 - }, - { - "epoch": 1.1922159833858934, - "grad_norm": 6.599088191986084, - "learning_rate": 0.00012215152066072775, - "loss": -30.3857, - "step": 15500 - }, - { - "epoch": 1.1922159833858934, - "eval_loss": -30.417774200439453, - "eval_runtime": 17.1082, - "eval_samples_per_second": 58.451, - "eval_steps_per_second": 14.613, - "step": 15500 - }, - { - "epoch": 1.1960618413968156, - "grad_norm": 5.156207084655762, - "learning_rate": 0.00012189180063891126, - "loss": -30.3882, - "step": 15550 - }, - { - "epoch": 1.1999076994077378, - "grad_norm": 6.738996982574463, - "learning_rate": 0.00012163208061709476, - "loss": -30.3916, - "step": 15600 - }, - { - "epoch": 1.2037535574186602, - "grad_norm": 3.4522907733917236, - "learning_rate": 0.0001213723605952783, - "loss": -30.3917, - "step": 15650 - }, - { - "epoch": 1.2075994154295824, - "grad_norm": 2.062731981277466, - "learning_rate": 0.00012111264057346182, - "loss": -30.3951, - "step": 15700 - }, - { - "epoch": 1.2114452734405046, - "grad_norm": 6.478760242462158, - "learning_rate": 0.00012085292055164532, - "loss": -30.3963, - "step": 15750 - }, - { - "epoch": 1.2114452734405046, - "eval_loss": -30.418058395385742, - "eval_runtime": 16.8984, - "eval_samples_per_second": 59.177, - "eval_steps_per_second": 14.794, - "step": 15750 - }, - { - "epoch": 1.2152911314514268, - "grad_norm": 3.3898251056671143, - "learning_rate": 0.00012059320052982886, - "loss": -30.3988, - "step": 15800 - }, - { - "epoch": 1.2191369894623492, - "grad_norm": 5.295444965362549, - "learning_rate": 0.00012033348050801237, - "loss": -30.4014, - "step": 15850 - }, - { - "epoch": 1.2229828474732714, - "grad_norm": 3.9401540756225586, - "learning_rate": 0.0001200737604861959, - "loss": -30.4028, - "step": 15900 - }, - { - "epoch": 1.2268287054841935, - "grad_norm": 6.728087902069092, - "learning_rate": 0.00011981404046437941, - "loss": -30.403, - "step": 15950 - }, - { - "epoch": 1.2306745634951157, - "grad_norm": 4.500195503234863, - "learning_rate": 0.00011955432044256292, - "loss": -30.4058, - "step": 16000 - }, - { - "epoch": 1.2306745634951157, - "eval_loss": -30.41932487487793, - "eval_runtime": 16.9437, - "eval_samples_per_second": 59.019, - "eval_steps_per_second": 14.755, - "step": 16000 - }, - { - "epoch": 1.234520421506038, - "grad_norm": 4.910549163818359, - "learning_rate": 0.00011929460042074645, - "loss": -30.4013, - "step": 16050 - }, - { - "epoch": 1.2383662795169603, - "grad_norm": 3.5906295776367188, - "learning_rate": 0.00011903488039892996, - "loss": -30.4036, - "step": 16100 - }, - { - "epoch": 1.2422121375278825, - "grad_norm": 4.437080383300781, - "learning_rate": 0.00011877516037711347, - "loss": -30.4115, - "step": 16150 - }, - { - "epoch": 1.2460579955388047, - "grad_norm": 6.500924110412598, - "learning_rate": 0.000118515440355297, - "loss": -30.4133, - "step": 16200 - }, - { - "epoch": 1.2499038535497269, - "grad_norm": 5.0980143547058105, - "learning_rate": 0.00011825572033348051, - "loss": -30.4132, - "step": 16250 - }, - { - "epoch": 1.2499038535497269, - "eval_loss": -30.43558120727539, - "eval_runtime": 17.0068, - "eval_samples_per_second": 58.8, - "eval_steps_per_second": 14.7, - "step": 16250 - }, - { - "epoch": 1.253749711560649, - "grad_norm": 4.151224613189697, - "learning_rate": 0.00011799600031166402, - "loss": -30.4139, - "step": 16300 - }, - { - "epoch": 1.2575955695715715, - "grad_norm": 5.165994167327881, - "learning_rate": 0.00011773628028984755, - "loss": -30.4154, - "step": 16350 - }, - { - "epoch": 1.2614414275824937, - "grad_norm": 3.3392717838287354, - "learning_rate": 0.00011747656026803106, - "loss": -30.4177, - "step": 16400 - }, - { - "epoch": 1.2652872855934159, - "grad_norm": 5.80817985534668, - "learning_rate": 0.00011721684024621458, - "loss": -30.4187, - "step": 16450 - }, - { - "epoch": 1.2691331436043383, - "grad_norm": 4.31649112701416, - "learning_rate": 0.0001169571202243981, - "loss": -30.4183, - "step": 16500 - }, - { - "epoch": 1.2691331436043383, - "eval_loss": -30.43385124206543, - "eval_runtime": 16.9325, - "eval_samples_per_second": 59.058, - "eval_steps_per_second": 14.765, - "step": 16500 - }, - { - "epoch": 1.2729790016152602, - "grad_norm": 2.9843568801879883, - "learning_rate": 0.00011669740020258162, - "loss": -30.4202, - "step": 16550 - }, - { - "epoch": 1.2768248596261826, - "grad_norm": 2.4206202030181885, - "learning_rate": 0.00011643768018076516, - "loss": -30.4172, - "step": 16600 - }, - { - "epoch": 1.2806707176371048, - "grad_norm": 4.983964443206787, - "learning_rate": 0.00011617796015894867, - "loss": -30.4214, - "step": 16650 - }, - { - "epoch": 1.284516575648027, - "grad_norm": 1.8332034349441528, - "learning_rate": 0.00011591824013713217, - "loss": -30.4232, - "step": 16700 - }, - { - "epoch": 1.2883624336589494, - "grad_norm": 5.537328720092773, - "learning_rate": 0.00011565852011531571, - "loss": -30.4212, - "step": 16750 - }, - { - "epoch": 1.2883624336589494, - "eval_loss": -30.440034866333008, - "eval_runtime": 16.9934, - "eval_samples_per_second": 58.846, - "eval_steps_per_second": 14.712, - "step": 16750 - }, - { - "epoch": 1.2922082916698716, - "grad_norm": 4.200454235076904, - "learning_rate": 0.00011539880009349922, - "loss": -30.4222, - "step": 16800 - }, - { - "epoch": 1.2960541496807938, - "grad_norm": 3.8615283966064453, - "learning_rate": 0.00011513908007168272, - "loss": -30.4218, - "step": 16850 - }, - { - "epoch": 1.299900007691716, - "grad_norm": 4.854475975036621, - "learning_rate": 0.00011487936004986626, - "loss": -30.4196, - "step": 16900 - }, - { - "epoch": 1.3037458657026382, - "grad_norm": 3.457418441772461, - "learning_rate": 0.00011461964002804977, - "loss": -30.4238, - "step": 16950 - }, - { - "epoch": 1.3075917237135606, - "grad_norm": 3.361154317855835, - "learning_rate": 0.00011435992000623327, - "loss": -30.4236, - "step": 17000 - }, - { - "epoch": 1.3075917237135606, - "eval_loss": -30.437725067138672, - "eval_runtime": 16.8779, - "eval_samples_per_second": 59.249, - "eval_steps_per_second": 14.812, - "step": 17000 - }, - { - "epoch": 1.3114375817244828, - "grad_norm": 5.688414096832275, - "learning_rate": 0.00011410019998441681, - "loss": -30.4228, - "step": 17050 - }, - { - "epoch": 1.315283439735405, - "grad_norm": 4.790516376495361, - "learning_rate": 0.00011384047996260032, - "loss": -30.4245, - "step": 17100 - }, - { - "epoch": 1.3191292977463271, - "grad_norm": 4.5773539543151855, - "learning_rate": 0.00011358075994078384, - "loss": -30.4263, - "step": 17150 - }, - { - "epoch": 1.3229751557572493, - "grad_norm": 3.8185110092163086, - "learning_rate": 0.00011332103991896736, - "loss": -30.4269, - "step": 17200 - }, - { - "epoch": 1.3268210137681717, - "grad_norm": 2.325967788696289, - "learning_rate": 0.00011306131989715088, - "loss": -30.4272, - "step": 17250 - }, - { - "epoch": 1.3268210137681717, - "eval_loss": -30.445838928222656, - "eval_runtime": 16.9252, - "eval_samples_per_second": 59.083, - "eval_steps_per_second": 14.771, - "step": 17250 - }, - { - "epoch": 1.330666871779094, - "grad_norm": 2.729612112045288, - "learning_rate": 0.00011280159987533439, - "loss": -30.4271, - "step": 17300 - }, - { - "epoch": 1.3345127297900161, - "grad_norm": 3.422671318054199, - "learning_rate": 0.00011254187985351792, - "loss": -30.4248, - "step": 17350 - }, - { - "epoch": 1.3383585878009385, - "grad_norm": 3.345399856567383, - "learning_rate": 0.00011228215983170143, - "loss": -30.4281, - "step": 17400 - }, - { - "epoch": 1.3422044458118605, - "grad_norm": 3.8958306312561035, - "learning_rate": 0.00011202243980988496, - "loss": -30.4291, - "step": 17450 - }, - { - "epoch": 1.346050303822783, - "grad_norm": 6.417865753173828, - "learning_rate": 0.00011176271978806847, - "loss": -30.4293, - "step": 17500 - }, - { - "epoch": 1.346050303822783, - "eval_loss": -30.43779182434082, - "eval_runtime": 16.8798, - "eval_samples_per_second": 59.242, - "eval_steps_per_second": 14.811, - "step": 17500 - }, - { - "epoch": 1.349896161833705, - "grad_norm": 3.9683895111083984, - "learning_rate": 0.00011150299976625198, - "loss": -30.43, - "step": 17550 - }, - { - "epoch": 1.3537420198446273, - "grad_norm": 2.845992088317871, - "learning_rate": 0.00011124327974443551, - "loss": -30.4299, - "step": 17600 - }, - { - "epoch": 1.3575878778555497, - "grad_norm": 2.662853240966797, - "learning_rate": 0.00011098355972261902, - "loss": -30.4296, - "step": 17650 - }, - { - "epoch": 1.3614337358664719, - "grad_norm": 4.240821838378906, - "learning_rate": 0.00011072383970080253, - "loss": -30.4296, - "step": 17700 - }, - { - "epoch": 1.365279593877394, - "grad_norm": 3.456580400466919, - "learning_rate": 0.00011046411967898607, - "loss": -30.4301, - "step": 17750 - }, - { - "epoch": 1.365279593877394, - "eval_loss": -30.446062088012695, - "eval_runtime": 17.169, - "eval_samples_per_second": 58.244, - "eval_steps_per_second": 14.561, - "step": 17750 - }, - { - "epoch": 1.3691254518883162, - "grad_norm": 2.481637477874756, - "learning_rate": 0.00011020439965716957, - "loss": -30.43, - "step": 17800 - }, - { - "epoch": 1.3729713098992384, - "grad_norm": 5.5576558113098145, - "learning_rate": 0.00010994467963535309, - "loss": -30.4315, - "step": 17850 - }, - { - "epoch": 1.3768171679101608, - "grad_norm": 3.786583662033081, - "learning_rate": 0.00010968495961353663, - "loss": -30.4279, - "step": 17900 - }, - { - "epoch": 1.380663025921083, - "grad_norm": 3.966721773147583, - "learning_rate": 0.00010942523959172012, - "loss": -30.4325, - "step": 17950 - }, - { - "epoch": 1.3845088839320052, - "grad_norm": 5.2827582359313965, - "learning_rate": 0.00010916551956990364, - "loss": -30.433, - "step": 18000 - }, - { - "epoch": 1.3845088839320052, - "eval_loss": -30.445518493652344, - "eval_runtime": 16.9753, - "eval_samples_per_second": 58.909, - "eval_steps_per_second": 14.727, - "step": 18000 - }, - { - "epoch": 1.3883547419429274, - "grad_norm": 4.988203048706055, - "learning_rate": 0.00010890579954808718, - "loss": -30.4311, - "step": 18050 - }, - { - "epoch": 1.3922005999538496, - "grad_norm": 4.5886454582214355, - "learning_rate": 0.00010864607952627068, - "loss": -30.4332, - "step": 18100 - }, - { - "epoch": 1.396046457964772, - "grad_norm": 2.1658132076263428, - "learning_rate": 0.00010838635950445419, - "loss": -30.434, - "step": 18150 - }, - { - "epoch": 1.3998923159756942, - "grad_norm": 3.8353946208953857, - "learning_rate": 0.00010812663948263773, - "loss": -30.4336, - "step": 18200 - }, - { - "epoch": 1.4037381739866164, - "grad_norm": 4.345909118652344, - "learning_rate": 0.00010786691946082124, - "loss": -30.4363, - "step": 18250 - }, - { - "epoch": 1.4037381739866164, - "eval_loss": -30.447927474975586, - "eval_runtime": 16.9767, - "eval_samples_per_second": 58.904, - "eval_steps_per_second": 14.726, - "step": 18250 - }, - { - "epoch": 1.4075840319975388, - "grad_norm": 2.635801076889038, - "learning_rate": 0.00010760719943900477, - "loss": -30.4351, - "step": 18300 - }, - { - "epoch": 1.4114298900084608, - "grad_norm": 4.603756427764893, - "learning_rate": 0.00010734747941718828, - "loss": -30.4351, - "step": 18350 - }, - { - "epoch": 1.4152757480193832, - "grad_norm": 4.313344955444336, - "learning_rate": 0.0001070877593953718, - "loss": -30.4361, - "step": 18400 - }, - { - "epoch": 1.4191216060303053, - "grad_norm": 4.869143486022949, - "learning_rate": 0.00010682803937355532, - "loss": -30.4351, - "step": 18450 - }, - { - "epoch": 1.4229674640412275, - "grad_norm": 2.338141679763794, - "learning_rate": 0.00010656831935173883, - "loss": -30.435, - "step": 18500 - }, - { - "epoch": 1.4229674640412275, - "eval_loss": -30.445350646972656, - "eval_runtime": 16.9508, - "eval_samples_per_second": 58.994, - "eval_steps_per_second": 14.749, - "step": 18500 - }, - { - "epoch": 1.42681332205215, - "grad_norm": 3.5870954990386963, - "learning_rate": 0.00010630859932992235, - "loss": -30.4369, - "step": 18550 - }, - { - "epoch": 1.4306591800630721, - "grad_norm": 2.8752095699310303, - "learning_rate": 0.00010604887930810587, - "loss": -30.4345, - "step": 18600 - }, - { - "epoch": 1.4345050380739943, - "grad_norm": 3.7183678150177, - "learning_rate": 0.00010578915928628939, - "loss": -30.4362, - "step": 18650 - }, - { - "epoch": 1.4383508960849165, - "grad_norm": 3.3645851612091064, - "learning_rate": 0.0001055294392644729, - "loss": -30.4363, - "step": 18700 - }, - { - "epoch": 1.4421967540958387, - "grad_norm": 3.521252393722534, - "learning_rate": 0.00010526971924265642, - "loss": -30.4371, - "step": 18750 - }, - { - "epoch": 1.4421967540958387, - "eval_loss": -30.448963165283203, - "eval_runtime": 16.966, - "eval_samples_per_second": 58.941, - "eval_steps_per_second": 14.735, - "step": 18750 - }, - { - "epoch": 1.446042612106761, - "grad_norm": 4.74116325378418, - "learning_rate": 0.00010500999922083994, - "loss": -30.4378, - "step": 18800 - }, - { - "epoch": 1.4498884701176833, - "grad_norm": 3.7258331775665283, - "learning_rate": 0.00010475027919902345, - "loss": -30.4376, - "step": 18850 - }, - { - "epoch": 1.4537343281286055, - "grad_norm": 4.193502426147461, - "learning_rate": 0.00010449055917720698, - "loss": -30.4384, - "step": 18900 - }, - { - "epoch": 1.4575801861395277, - "grad_norm": 2.630093812942505, - "learning_rate": 0.00010423083915539049, - "loss": -30.4363, - "step": 18950 - }, - { - "epoch": 1.4614260441504499, - "grad_norm": 5.140982151031494, - "learning_rate": 0.000103971119133574, - "loss": -30.433, - "step": 19000 - }, - { - "epoch": 1.4614260441504499, - "eval_loss": -30.441884994506836, - "eval_runtime": 16.8889, - "eval_samples_per_second": 59.21, - "eval_steps_per_second": 14.803, - "step": 19000 - }, - { - "epoch": 1.4652719021613723, - "grad_norm": 4.889688968658447, - "learning_rate": 0.00010371139911175753, - "loss": -30.4382, - "step": 19050 - }, - { - "epoch": 1.4691177601722945, - "grad_norm": 3.332848072052002, - "learning_rate": 0.00010345167908994104, - "loss": -30.4385, - "step": 19100 - }, - { - "epoch": 1.4729636181832166, - "grad_norm": 3.6298229694366455, - "learning_rate": 0.00010319195906812458, - "loss": -30.4384, - "step": 19150 - }, - { - "epoch": 1.476809476194139, - "grad_norm": 3.3661463260650635, - "learning_rate": 0.0001029322390463081, - "loss": -30.4383, - "step": 19200 - }, - { - "epoch": 1.480655334205061, - "grad_norm": 3.7535996437072754, - "learning_rate": 0.0001026725190244916, - "loss": -30.4388, - "step": 19250 - }, - { - "epoch": 1.480655334205061, - "eval_loss": -30.449628829956055, - "eval_runtime": 16.977, - "eval_samples_per_second": 58.903, - "eval_steps_per_second": 14.726, - "step": 19250 - }, - { - "epoch": 1.4845011922159834, - "grad_norm": 2.860837936401367, - "learning_rate": 0.00010241279900267513, - "loss": -30.4381, - "step": 19300 - }, - { - "epoch": 1.4883470502269056, - "grad_norm": 3.559368848800659, - "learning_rate": 0.00010215307898085865, - "loss": -30.4401, - "step": 19350 - }, - { - "epoch": 1.4921929082378278, - "grad_norm": 4.874238967895508, - "learning_rate": 0.00010189335895904215, - "loss": -30.4403, - "step": 19400 - }, - { - "epoch": 1.4960387662487502, - "grad_norm": 3.3331515789031982, - "learning_rate": 0.00010163363893722569, - "loss": -30.4398, - "step": 19450 - }, - { - "epoch": 1.4998846242596724, - "grad_norm": 2.6575047969818115, - "learning_rate": 0.0001013739189154092, - "loss": -30.4338, - "step": 19500 - }, - { - "epoch": 1.4998846242596724, - "eval_loss": -30.449535369873047, - "eval_runtime": 16.7967, - "eval_samples_per_second": 59.536, - "eval_steps_per_second": 14.884, - "step": 19500 - }, - { - "epoch": 1.5037304822705946, - "grad_norm": 3.4910714626312256, - "learning_rate": 0.0001011141988935927, - "loss": -30.4402, - "step": 19550 - }, - { - "epoch": 1.5075763402815168, - "grad_norm": 3.4878039360046387, - "learning_rate": 0.00010085447887177624, - "loss": -30.4406, - "step": 19600 - }, - { - "epoch": 1.511422198292439, - "grad_norm": 1.7369427680969238, - "learning_rate": 0.00010059475884995975, - "loss": -30.441, - "step": 19650 - }, - { - "epoch": 1.5152680563033614, - "grad_norm": 2.82442045211792, - "learning_rate": 0.00010033503882814326, - "loss": -30.4407, - "step": 19700 - }, - { - "epoch": 1.5191139143142836, - "grad_norm": 3.5565500259399414, - "learning_rate": 0.00010007531880632679, - "loss": -30.4322, - "step": 19750 - }, - { - "epoch": 1.5191139143142836, - "eval_loss": -30.445045471191406, - "eval_runtime": 16.8955, - "eval_samples_per_second": 59.187, - "eval_steps_per_second": 14.797, - "step": 19750 - }, - { - "epoch": 1.5229597723252057, - "grad_norm": 2.4976837635040283, - "learning_rate": 9.98155987845103e-05, - "loss": -30.4389, - "step": 19800 - }, - { - "epoch": 1.5268056303361282, - "grad_norm": 2.888719320297241, - "learning_rate": 9.955587876269382e-05, - "loss": -30.4332, - "step": 19850 - }, - { - "epoch": 1.5306514883470501, - "grad_norm": 4.438738822937012, - "learning_rate": 9.929615874087734e-05, - "loss": -30.4415, - "step": 19900 - }, - { - "epoch": 1.5344973463579725, - "grad_norm": 1.5556750297546387, - "learning_rate": 9.903643871906085e-05, - "loss": -30.4422, - "step": 19950 - }, - { - "epoch": 1.5383432043688947, - "grad_norm": 3.4446284770965576, - "learning_rate": 9.877671869724438e-05, - "loss": -30.4399, - "step": 20000 - }, - { - "epoch": 1.5383432043688947, - "eval_loss": -30.452638626098633, - "eval_runtime": 16.7438, - "eval_samples_per_second": 59.724, - "eval_steps_per_second": 14.931, - "step": 20000 - }, - { - "epoch": 1.542189062379817, - "grad_norm": 2.2135956287384033, - "learning_rate": 9.85169986754279e-05, - "loss": -30.4423, - "step": 20050 - }, - { - "epoch": 1.5460349203907393, - "grad_norm": 3.5614891052246094, - "learning_rate": 9.825727865361141e-05, - "loss": -30.4419, - "step": 20100 - }, - { - "epoch": 1.5498807784016613, - "grad_norm": 3.5207202434539795, - "learning_rate": 9.799755863179493e-05, - "loss": -30.4427, - "step": 20150 - }, - { - "epoch": 1.5537266364125837, - "grad_norm": 2.915543794631958, - "learning_rate": 9.773783860997845e-05, - "loss": -30.4403, - "step": 20200 - }, - { - "epoch": 1.5575724944235059, - "grad_norm": 2.3293378353118896, - "learning_rate": 9.747811858816197e-05, - "loss": -30.4429, - "step": 20250 - }, - { - "epoch": 1.5575724944235059, - "eval_loss": -30.45326042175293, - "eval_runtime": 16.8328, - "eval_samples_per_second": 59.408, - "eval_steps_per_second": 14.852, - "step": 20250 - }, - { - "epoch": 1.561418352434428, - "grad_norm": 4.711287021636963, - "learning_rate": 9.721839856634549e-05, - "loss": -30.4434, - "step": 20300 - }, - { - "epoch": 1.5652642104453505, - "grad_norm": 1.71726393699646, - "learning_rate": 9.6958678544529e-05, - "loss": -30.4421, - "step": 20350 - }, - { - "epoch": 1.5691100684562724, - "grad_norm": 3.524540424346924, - "learning_rate": 9.669895852271252e-05, - "loss": -30.4437, - "step": 20400 - }, - { - "epoch": 1.5729559264671948, - "grad_norm": 3.0880210399627686, - "learning_rate": 9.643923850089604e-05, - "loss": -30.4424, - "step": 20450 - }, - { - "epoch": 1.576801784478117, - "grad_norm": 4.263181686401367, - "learning_rate": 9.617951847907955e-05, - "loss": -30.444, - "step": 20500 - }, - { - "epoch": 1.576801784478117, - "eval_loss": -30.45319175720215, - "eval_runtime": 16.9713, - "eval_samples_per_second": 58.923, - "eval_steps_per_second": 14.731, - "step": 20500 - }, - { - "epoch": 1.5806476424890392, - "grad_norm": 7.002903938293457, - "learning_rate": 9.591979845726308e-05, - "loss": -30.4429, - "step": 20550 - }, - { - "epoch": 1.5844935004999616, - "grad_norm": 3.438530445098877, - "learning_rate": 9.56600784354466e-05, - "loss": -30.4417, - "step": 20600 - }, - { - "epoch": 1.5883393585108838, - "grad_norm": 3.099423408508301, - "learning_rate": 9.54003584136301e-05, - "loss": -30.444, - "step": 20650 - }, - { - "epoch": 1.592185216521806, - "grad_norm": 3.872269868850708, - "learning_rate": 9.514063839181363e-05, - "loss": -30.4439, - "step": 20700 - }, - { - "epoch": 1.5960310745327284, - "grad_norm": 2.9675283432006836, - "learning_rate": 9.488091836999716e-05, - "loss": -30.4453, - "step": 20750 - }, - { - "epoch": 1.5960310745327284, - "eval_loss": -30.452062606811523, - "eval_runtime": 16.9677, - "eval_samples_per_second": 58.936, - "eval_steps_per_second": 14.734, - "step": 20750 - }, - { - "epoch": 1.5998769325436504, - "grad_norm": 3.0117275714874268, - "learning_rate": 9.462119834818067e-05, - "loss": -30.4438, - "step": 20800 - }, - { - "epoch": 1.6037227905545728, - "grad_norm": 3.3223464488983154, - "learning_rate": 9.436147832636418e-05, - "loss": -30.4445, - "step": 20850 - }, - { - "epoch": 1.607568648565495, - "grad_norm": 2.780449390411377, - "learning_rate": 9.410175830454771e-05, - "loss": -30.4441, - "step": 20900 - }, - { - "epoch": 1.6114145065764172, - "grad_norm": 2.49845027923584, - "learning_rate": 9.384203828273122e-05, - "loss": -30.4451, - "step": 20950 - }, - { - "epoch": 1.6152603645873396, - "grad_norm": 2.1651992797851562, - "learning_rate": 9.358231826091473e-05, - "loss": -30.4443, - "step": 21000 - }, - { - "epoch": 1.6152603645873396, - "eval_loss": -30.446992874145508, - "eval_runtime": 16.808, - "eval_samples_per_second": 59.495, - "eval_steps_per_second": 14.874, - "step": 21000 - }, - { - "epoch": 1.6191062225982615, - "grad_norm": 2.3256795406341553, - "learning_rate": 9.332259823909826e-05, - "loss": -30.4447, - "step": 21050 - }, - { - "epoch": 1.622952080609184, - "grad_norm": 3.201892852783203, - "learning_rate": 9.306287821728179e-05, - "loss": -30.4439, - "step": 21100 - }, - { - "epoch": 1.6267979386201061, - "grad_norm": 2.861118793487549, - "learning_rate": 9.280315819546528e-05, - "loss": -30.4453, - "step": 21150 - }, - { - "epoch": 1.6306437966310283, - "grad_norm": 2.3513760566711426, - "learning_rate": 9.254343817364881e-05, - "loss": -30.4454, - "step": 21200 - }, - { - "epoch": 1.6344896546419507, - "grad_norm": 2.680166482925415, - "learning_rate": 9.228371815183234e-05, - "loss": -30.4461, - "step": 21250 - }, - { - "epoch": 1.6344896546419507, - "eval_loss": -30.453556060791016, - "eval_runtime": 16.9036, - "eval_samples_per_second": 59.159, - "eval_steps_per_second": 14.79, - "step": 21250 - }, - { - "epoch": 1.6383355126528727, - "grad_norm": 3.896608352661133, - "learning_rate": 9.202399813001584e-05, - "loss": -30.4461, - "step": 21300 - }, - { - "epoch": 1.642181370663795, - "grad_norm": 3.4098875522613525, - "learning_rate": 9.176427810819936e-05, - "loss": -30.4464, - "step": 21350 - }, - { - "epoch": 1.6460272286747173, - "grad_norm": 2.555978536605835, - "learning_rate": 9.150455808638289e-05, - "loss": -30.4457, - "step": 21400 - }, - { - "epoch": 1.6498730866856395, - "grad_norm": 3.0330915451049805, - "learning_rate": 9.12448380645664e-05, - "loss": -30.4463, - "step": 21450 - }, - { - "epoch": 1.6537189446965619, - "grad_norm": 4.583216190338135, - "learning_rate": 9.098511804274992e-05, - "loss": -30.445, - "step": 21500 - }, - { - "epoch": 1.6537189446965619, - "eval_loss": -30.445945739746094, - "eval_runtime": 16.9709, - "eval_samples_per_second": 58.925, - "eval_steps_per_second": 14.731, - "step": 21500 - }, - { - "epoch": 1.657564802707484, - "grad_norm": 2.692993402481079, - "learning_rate": 9.072539802093344e-05, - "loss": -30.4452, - "step": 21550 - }, - { - "epoch": 1.6614106607184063, - "grad_norm": 3.2486062049865723, - "learning_rate": 9.046567799911695e-05, - "loss": -30.4466, - "step": 21600 - }, - { - "epoch": 1.6652565187293287, - "grad_norm": 3.774552345275879, - "learning_rate": 9.020595797730047e-05, - "loss": -30.4468, - "step": 21650 - }, - { - "epoch": 1.6691023767402506, - "grad_norm": 2.923283338546753, - "learning_rate": 8.9946237955484e-05, - "loss": -30.4472, - "step": 21700 - }, - { - "epoch": 1.672948234751173, - "grad_norm": 2.804466724395752, - "learning_rate": 8.96865179336675e-05, - "loss": -30.4471, - "step": 21750 - }, - { - "epoch": 1.672948234751173, - "eval_loss": -30.455671310424805, - "eval_runtime": 16.8224, - "eval_samples_per_second": 59.445, - "eval_steps_per_second": 14.861, - "step": 21750 - }, - { - "epoch": 1.6767940927620952, - "grad_norm": 2.091228485107422, - "learning_rate": 8.942679791185102e-05, - "loss": -30.4476, - "step": 21800 - }, - { - "epoch": 1.6806399507730174, - "grad_norm": 3.4288341999053955, - "learning_rate": 8.916707789003455e-05, - "loss": -30.4466, - "step": 21850 - }, - { - "epoch": 1.6844858087839398, - "grad_norm": 2.85774827003479, - "learning_rate": 8.890735786821807e-05, - "loss": -30.4471, - "step": 21900 - }, - { - "epoch": 1.6883316667948618, - "grad_norm": 3.7691524028778076, - "learning_rate": 8.864763784640159e-05, - "loss": -30.4473, - "step": 21950 - }, - { - "epoch": 1.6921775248057842, - "grad_norm": 3.015897035598755, - "learning_rate": 8.83879178245851e-05, - "loss": -30.4478, - "step": 22000 - }, - { - "epoch": 1.6921775248057842, - "eval_loss": -30.455175399780273, - "eval_runtime": 16.9353, - "eval_samples_per_second": 59.048, - "eval_steps_per_second": 14.762, - "step": 22000 - }, - { - "epoch": 1.6960233828167064, - "grad_norm": 2.7589683532714844, - "learning_rate": 8.812819780276862e-05, - "loss": -30.4483, - "step": 22050 - }, - { - "epoch": 1.6998692408276286, - "grad_norm": 3.7240102291107178, - "learning_rate": 8.786847778095214e-05, - "loss": -30.448, - "step": 22100 - }, - { - "epoch": 1.703715098838551, - "grad_norm": 3.830146074295044, - "learning_rate": 8.760875775913565e-05, - "loss": -30.448, - "step": 22150 - }, - { - "epoch": 1.707560956849473, - "grad_norm": 2.670171022415161, - "learning_rate": 8.734903773731918e-05, - "loss": -30.4485, - "step": 22200 - }, - { - "epoch": 1.7114068148603954, - "grad_norm": 2.8607327938079834, - "learning_rate": 8.708931771550269e-05, - "loss": -30.4486, - "step": 22250 - }, - { - "epoch": 1.7114068148603954, - "eval_loss": -30.454792022705078, - "eval_runtime": 16.9895, - "eval_samples_per_second": 58.86, - "eval_steps_per_second": 14.715, - "step": 22250 - }, - { - "epoch": 1.7152526728713176, - "grad_norm": 4.332964897155762, - "learning_rate": 8.682959769368622e-05, - "loss": -30.4481, - "step": 22300 - }, - { - "epoch": 1.7190985308822397, - "grad_norm": 3.3097095489501953, - "learning_rate": 8.656987767186973e-05, - "loss": -30.4482, - "step": 22350 - }, - { - "epoch": 1.7229443888931621, - "grad_norm": 2.8887455463409424, - "learning_rate": 8.631015765005324e-05, - "loss": -30.4493, - "step": 22400 - }, - { - "epoch": 1.7267902469040843, - "grad_norm": 2.452583074569702, - "learning_rate": 8.605043762823677e-05, - "loss": -30.4492, - "step": 22450 - }, - { - "epoch": 1.7306361049150065, - "grad_norm": 3.4899165630340576, - "learning_rate": 8.579071760642028e-05, - "loss": -30.4492, - "step": 22500 - }, - { - "epoch": 1.7306361049150065, - "eval_loss": -30.454774856567383, - "eval_runtime": 16.8487, - "eval_samples_per_second": 59.352, - "eval_steps_per_second": 14.838, - "step": 22500 - }, - { - "epoch": 1.734481962925929, - "grad_norm": 2.8990135192871094, - "learning_rate": 8.553099758460381e-05, - "loss": -30.448, - "step": 22550 - }, - { - "epoch": 1.738327820936851, - "grad_norm": 2.8500287532806396, - "learning_rate": 8.527127756278732e-05, - "loss": -30.4489, - "step": 22600 - }, - { - "epoch": 1.7421736789477733, - "grad_norm": 1.9518436193466187, - "learning_rate": 8.501155754097083e-05, - "loss": -30.4487, - "step": 22650 - }, - { - "epoch": 1.7460195369586955, - "grad_norm": 2.950298309326172, - "learning_rate": 8.475183751915436e-05, - "loss": -30.4484, - "step": 22700 - }, - { - "epoch": 1.7498653949696177, - "grad_norm": 3.065047264099121, - "learning_rate": 8.449211749733787e-05, - "loss": -30.4494, - "step": 22750 - }, - { - "epoch": 1.7498653949696177, - "eval_loss": -30.456340789794922, - "eval_runtime": 16.963, - "eval_samples_per_second": 58.952, - "eval_steps_per_second": 14.738, - "step": 22750 - }, - { - "epoch": 1.75371125298054, - "grad_norm": 2.389820098876953, - "learning_rate": 8.42323974755214e-05, - "loss": -30.4503, - "step": 22800 - }, - { - "epoch": 1.757557110991462, - "grad_norm": 4.366456985473633, - "learning_rate": 8.397267745370491e-05, - "loss": -30.4499, - "step": 22850 - }, - { - "epoch": 1.7614029690023845, - "grad_norm": 1.5029981136322021, - "learning_rate": 8.371295743188842e-05, - "loss": -30.4503, - "step": 22900 - }, - { - "epoch": 1.7652488270133067, - "grad_norm": 2.083980083465576, - "learning_rate": 8.345323741007195e-05, - "loss": -30.4501, - "step": 22950 - }, - { - "epoch": 1.7690946850242288, - "grad_norm": 3.1565330028533936, - "learning_rate": 8.319351738825546e-05, - "loss": -30.4494, - "step": 23000 - }, - { - "epoch": 1.7690946850242288, - "eval_loss": -30.45475959777832, - "eval_runtime": 16.8819, - "eval_samples_per_second": 59.235, - "eval_steps_per_second": 14.809, - "step": 23000 - }, - { - "epoch": 1.7729405430351513, - "grad_norm": 3.5162878036499023, - "learning_rate": 8.293379736643898e-05, - "loss": -30.4499, - "step": 23050 - }, - { - "epoch": 1.7767864010460732, - "grad_norm": 2.4113125801086426, - "learning_rate": 8.26740773446225e-05, - "loss": -30.4491, - "step": 23100 - }, - { - "epoch": 1.7806322590569956, - "grad_norm": 1.8185194730758667, - "learning_rate": 8.241435732280603e-05, - "loss": -30.4506, - "step": 23150 - }, - { - "epoch": 1.7844781170679178, - "grad_norm": 3.115666627883911, - "learning_rate": 8.215463730098953e-05, - "loss": -30.4495, - "step": 23200 - }, - { - "epoch": 1.78832397507884, - "grad_norm": 2.227865695953369, - "learning_rate": 8.189491727917305e-05, - "loss": -30.4502, - "step": 23250 - }, - { - "epoch": 1.78832397507884, - "eval_loss": -30.456247329711914, - "eval_runtime": 16.8967, - "eval_samples_per_second": 59.183, - "eval_steps_per_second": 14.796, - "step": 23250 - }, - { - "epoch": 1.7921698330897624, - "grad_norm": 2.8278579711914062, - "learning_rate": 8.163519725735658e-05, - "loss": -30.4501, - "step": 23300 - }, - { - "epoch": 1.7960156911006846, - "grad_norm": 1.8039246797561646, - "learning_rate": 8.13754772355401e-05, - "loss": -30.4503, - "step": 23350 - }, - { - "epoch": 1.7998615491116068, - "grad_norm": 2.9235339164733887, - "learning_rate": 8.11157572137236e-05, - "loss": -30.4506, - "step": 23400 - }, - { - "epoch": 1.8037074071225292, - "grad_norm": 1.375192642211914, - "learning_rate": 8.085603719190713e-05, - "loss": -30.4514, - "step": 23450 - }, - { - "epoch": 1.8075532651334512, - "grad_norm": 2.6810855865478516, - "learning_rate": 8.059631717009065e-05, - "loss": -30.451, - "step": 23500 - }, - { - "epoch": 1.8075532651334512, - "eval_loss": -30.456506729125977, - "eval_runtime": 16.8767, - "eval_samples_per_second": 59.253, - "eval_steps_per_second": 14.813, - "step": 23500 - }, - { - "epoch": 1.8113991231443736, - "grad_norm": 3.5937023162841797, - "learning_rate": 8.033659714827416e-05, - "loss": -30.4489, - "step": 23550 - }, - { - "epoch": 1.8152449811552958, - "grad_norm": 2.0678956508636475, - "learning_rate": 8.008207152689401e-05, - "loss": -30.4469, - "step": 23600 - }, - { - "epoch": 1.819090839166218, - "grad_norm": 3.0057597160339355, - "learning_rate": 7.982235150507754e-05, - "loss": -30.451, - "step": 23650 - }, - { - "epoch": 1.8229366971771404, - "grad_norm": 2.567552089691162, - "learning_rate": 7.956263148326105e-05, - "loss": -30.4511, - "step": 23700 - }, - { - "epoch": 1.8267825551880623, - "grad_norm": 2.3882832527160645, - "learning_rate": 7.930291146144456e-05, - "loss": -30.4512, - "step": 23750 - }, - { - "epoch": 1.8267825551880623, - "eval_loss": -30.457120895385742, - "eval_runtime": 16.7786, - "eval_samples_per_second": 59.6, - "eval_steps_per_second": 14.9, - "step": 23750 - }, - { - "epoch": 1.8306284131989847, - "grad_norm": 2.1864283084869385, - "learning_rate": 7.904319143962809e-05, - "loss": -30.4511, - "step": 23800 - }, - { - "epoch": 1.834474271209907, - "grad_norm": 2.5760741233825684, - "learning_rate": 7.87834714178116e-05, - "loss": -30.4512, - "step": 23850 - }, - { - "epoch": 1.838320129220829, - "grad_norm": 2.0894832611083984, - "learning_rate": 7.852375139599513e-05, - "loss": -30.4514, - "step": 23900 - }, - { - "epoch": 1.8421659872317515, - "grad_norm": 2.8342926502227783, - "learning_rate": 7.826403137417864e-05, - "loss": -30.4509, - "step": 23950 - }, - { - "epoch": 1.8460118452426735, - "grad_norm": 2.050407648086548, - "learning_rate": 7.800431135236215e-05, - "loss": -30.4518, - "step": 24000 - }, - { - "epoch": 1.8460118452426735, - "eval_loss": -30.43790626525879, - "eval_runtime": 16.6887, - "eval_samples_per_second": 59.921, - "eval_steps_per_second": 14.98, - "step": 24000 - }, - { - "epoch": 1.8498577032535959, - "grad_norm": 3.1422579288482666, - "learning_rate": 7.774459133054568e-05, - "loss": -30.4522, - "step": 24050 - }, - { - "epoch": 1.853703561264518, - "grad_norm": 2.7372827529907227, - "learning_rate": 7.748487130872919e-05, - "loss": -30.4523, - "step": 24100 - }, - { - "epoch": 1.8575494192754403, - "grad_norm": 2.3861794471740723, - "learning_rate": 7.72251512869127e-05, - "loss": -30.4523, - "step": 24150 - }, - { - "epoch": 1.8613952772863627, - "grad_norm": 2.221836805343628, - "learning_rate": 7.696543126509623e-05, - "loss": -30.453, - "step": 24200 - }, - { - "epoch": 1.8652411352972849, - "grad_norm": 3.2006218433380127, - "learning_rate": 7.670571124327974e-05, - "loss": -30.453, - "step": 24250 - }, - { - "epoch": 1.8652411352972849, - "eval_loss": -30.45906639099121, - "eval_runtime": 16.6647, - "eval_samples_per_second": 60.007, - "eval_steps_per_second": 15.002, - "step": 24250 - }, - { - "epoch": 1.869086993308207, - "grad_norm": 3.0464978218078613, - "learning_rate": 7.644599122146327e-05, - "loss": -30.4534, - "step": 24300 - }, - { - "epoch": 1.8729328513191295, - "grad_norm": 2.5272178649902344, - "learning_rate": 7.618627119964678e-05, - "loss": -30.4532, - "step": 24350 - }, - { - "epoch": 1.8767787093300514, - "grad_norm": 1.6283899545669556, - "learning_rate": 7.592655117783031e-05, - "loss": -30.4525, - "step": 24400 - }, - { - "epoch": 1.8806245673409738, - "grad_norm": 3.1398937702178955, - "learning_rate": 7.566683115601382e-05, - "loss": -30.4527, - "step": 24450 - }, - { - "epoch": 1.884470425351896, - "grad_norm": 2.703652858734131, - "learning_rate": 7.540711113419734e-05, - "loss": -30.453, - "step": 24500 - }, - { - "epoch": 1.884470425351896, - "eval_loss": -30.457090377807617, - "eval_runtime": 16.8086, - "eval_samples_per_second": 59.493, - "eval_steps_per_second": 14.873, - "step": 24500 - }, - { - "epoch": 1.8883162833628182, - "grad_norm": 2.3130578994750977, - "learning_rate": 7.514739111238086e-05, - "loss": -30.4525, - "step": 24550 - }, - { - "epoch": 1.8921621413737406, - "grad_norm": 1.720234990119934, - "learning_rate": 7.488767109056438e-05, - "loss": -30.4533, - "step": 24600 - }, - { - "epoch": 1.8960079993846626, - "grad_norm": 2.150315761566162, - "learning_rate": 7.462795106874789e-05, - "loss": -30.4528, - "step": 24650 - }, - { - "epoch": 1.899853857395585, - "grad_norm": 2.7258620262145996, - "learning_rate": 7.436823104693141e-05, - "loss": -30.4538, - "step": 24700 - }, - { - "epoch": 1.9036997154065072, - "grad_norm": 3.188485860824585, - "learning_rate": 7.410851102511494e-05, - "loss": -30.4522, - "step": 24750 - }, - { - "epoch": 1.9036997154065072, - "eval_loss": -30.41309356689453, - "eval_runtime": 16.8987, - "eval_samples_per_second": 59.176, - "eval_steps_per_second": 14.794, - "step": 24750 - }, - { - "epoch": 1.9075455734174294, - "grad_norm": 3.489452600479126, - "learning_rate": 7.38591798041711e-05, - "loss": -29.9165, - "step": 24800 - }, - { - "epoch": 1.9113914314283518, - "grad_norm": 2.326321601867676, - "learning_rate": 7.359945978235463e-05, - "loss": -30.4499, - "step": 24850 - }, - { - "epoch": 1.9152372894392737, - "grad_norm": 3.053919792175293, - "learning_rate": 7.333973976053814e-05, - "loss": -30.4527, - "step": 24900 - }, - { - "epoch": 1.9190831474501961, - "grad_norm": 3.227989435195923, - "learning_rate": 7.308001973872166e-05, - "loss": -30.4533, - "step": 24950 - }, - { - "epoch": 1.9229290054611183, - "grad_norm": 2.026668071746826, - "learning_rate": 7.282029971690518e-05, - "loss": -30.4532, - "step": 25000 - }, - { - "epoch": 1.9229290054611183, - "eval_loss": -30.459980010986328, - "eval_runtime": 16.8538, - "eval_samples_per_second": 59.334, - "eval_steps_per_second": 14.833, - "step": 25000 - }, - { - "epoch": 1.9267748634720405, - "grad_norm": 2.0744123458862305, - "learning_rate": 7.25605796950887e-05, - "loss": -30.4534, - "step": 25050 - }, - { - "epoch": 1.930620721482963, - "grad_norm": 1.4959275722503662, - "learning_rate": 7.230085967327222e-05, - "loss": -30.4542, - "step": 25100 - }, - { - "epoch": 1.9344665794938851, - "grad_norm": 2.123647689819336, - "learning_rate": 7.204113965145573e-05, - "loss": -30.4521, - "step": 25150 - }, - { - "epoch": 1.9383124375048073, - "grad_norm": 2.204134702682495, - "learning_rate": 7.178141962963926e-05, - "loss": -30.4519, - "step": 25200 - }, - { - "epoch": 1.9421582955157297, - "grad_norm": 1.7041330337524414, - "learning_rate": 7.152169960782277e-05, - "loss": -30.4539, - "step": 25250 - }, - { - "epoch": 1.9421582955157297, - "eval_loss": -30.45870590209961, - "eval_runtime": 16.7732, - "eval_samples_per_second": 59.619, - "eval_steps_per_second": 14.905, - "step": 25250 - }, - { - "epoch": 1.9460041535266517, - "grad_norm": 2.355837345123291, - "learning_rate": 7.126197958600629e-05, - "loss": -30.4536, - "step": 25300 - }, - { - "epoch": 1.949850011537574, - "grad_norm": 2.4595673084259033, - "learning_rate": 7.100225956418981e-05, - "loss": -30.4529, - "step": 25350 - }, - { - "epoch": 1.9536958695484963, - "grad_norm": 3.4583332538604736, - "learning_rate": 7.074253954237333e-05, - "loss": -30.4544, - "step": 25400 - }, - { - "epoch": 1.9575417275594185, - "grad_norm": 2.7398970127105713, - "learning_rate": 7.048281952055684e-05, - "loss": -30.4548, - "step": 25450 - }, - { - "epoch": 1.9613875855703409, - "grad_norm": 2.3612735271453857, - "learning_rate": 7.022309949874036e-05, - "loss": -30.454, - "step": 25500 - }, - { - "epoch": 1.9613875855703409, - "eval_loss": -30.459121704101562, - "eval_runtime": 16.7228, - "eval_samples_per_second": 59.799, - "eval_steps_per_second": 14.95, - "step": 25500 - }, - { - "epoch": 1.9652334435812628, - "grad_norm": 2.4880611896514893, - "learning_rate": 6.996337947692389e-05, - "loss": -30.4548, - "step": 25550 - }, - { - "epoch": 1.9690793015921852, - "grad_norm": 1.9865983724594116, - "learning_rate": 6.970365945510739e-05, - "loss": -30.4522, - "step": 25600 - }, - { - "epoch": 1.9729251596031074, - "grad_norm": 2.1370553970336914, - "learning_rate": 6.944393943329092e-05, - "loss": -30.4546, - "step": 25650 - }, - { - "epoch": 1.9767710176140296, - "grad_norm": 2.5807433128356934, - "learning_rate": 6.918421941147444e-05, - "loss": -30.4552, - "step": 25700 - }, - { - "epoch": 1.980616875624952, - "grad_norm": 2.2055130004882812, - "learning_rate": 6.892449938965794e-05, - "loss": -30.4554, - "step": 25750 - }, - { - "epoch": 1.980616875624952, - "eval_loss": -30.4610538482666, - "eval_runtime": 16.9285, - "eval_samples_per_second": 59.072, - "eval_steps_per_second": 14.768, - "step": 25750 - }, - { - "epoch": 1.984462733635874, - "grad_norm": 2.3505399227142334, - "learning_rate": 6.866477936784147e-05, - "loss": -30.4548, - "step": 25800 - }, - { - "epoch": 1.9883085916467964, - "grad_norm": 2.190473794937134, - "learning_rate": 6.8405059346025e-05, - "loss": -30.455, - "step": 25850 - }, - { - "epoch": 1.9921544496577186, - "grad_norm": 2.6653695106506348, - "learning_rate": 6.814533932420851e-05, - "loss": -30.4553, - "step": 25900 - }, - { - "epoch": 1.9960003076686408, - "grad_norm": 1.8680397272109985, - "learning_rate": 6.788561930239202e-05, - "loss": -30.456, - "step": 25950 - }, - { - "epoch": 1.9998461656795632, - "grad_norm": 2.7857582569122314, - "learning_rate": 6.762589928057555e-05, - "loss": -30.4556, - "step": 26000 - }, - { - "epoch": 1.9998461656795632, - "eval_loss": -30.46038246154785, - "eval_runtime": 16.741, - "eval_samples_per_second": 59.734, - "eval_steps_per_second": 14.933, - "step": 26000 - }, - { - "epoch": 2.003692023690485, - "grad_norm": 1.8869682550430298, - "learning_rate": 6.736617925875906e-05, - "loss": -30.4556, - "step": 26050 - }, - { - "epoch": 2.0075378817014076, - "grad_norm": 3.1017348766326904, - "learning_rate": 6.710645923694257e-05, - "loss": -30.4554, - "step": 26100 - }, - { - "epoch": 2.01138373971233, - "grad_norm": 2.1093878746032715, - "learning_rate": 6.68467392151261e-05, - "loss": -30.4563, - "step": 26150 - }, - { - "epoch": 2.015229597723252, - "grad_norm": 2.2458508014678955, - "learning_rate": 6.658701919330963e-05, - "loss": -30.4562, - "step": 26200 - }, - { - "epoch": 2.0190754557341744, - "grad_norm": 2.072725296020508, - "learning_rate": 6.632729917149312e-05, - "loss": -30.4556, - "step": 26250 - }, - { - "epoch": 2.0190754557341744, - "eval_loss": -30.46011734008789, - "eval_runtime": 16.7329, - "eval_samples_per_second": 59.762, - "eval_steps_per_second": 14.941, - "step": 26250 - }, - { - "epoch": 2.0229213137450963, - "grad_norm": 2.431828260421753, - "learning_rate": 6.606757914967665e-05, - "loss": -30.4549, - "step": 26300 - }, - { - "epoch": 2.0267671717560187, - "grad_norm": 3.0434165000915527, - "learning_rate": 6.580785912786018e-05, - "loss": -30.4562, - "step": 26350 - }, - { - "epoch": 2.030613029766941, - "grad_norm": 2.3322978019714355, - "learning_rate": 6.554813910604369e-05, - "loss": -30.4542, - "step": 26400 - }, - { - "epoch": 2.034458887777863, - "grad_norm": 3.0168204307556152, - "learning_rate": 6.52884190842272e-05, - "loss": -30.4566, - "step": 26450 - }, - { - "epoch": 2.0383047457887855, - "grad_norm": 1.6167207956314087, - "learning_rate": 6.502869906241073e-05, - "loss": -30.4568, - "step": 26500 - }, - { - "epoch": 2.0383047457887855, - "eval_loss": -30.460376739501953, - "eval_runtime": 16.813, - "eval_samples_per_second": 59.478, - "eval_steps_per_second": 14.869, - "step": 26500 - }, - { - "epoch": 2.042150603799708, - "grad_norm": 2.257894992828369, - "learning_rate": 6.476897904059424e-05, - "loss": -30.4568, - "step": 26550 - }, - { - "epoch": 2.04599646181063, - "grad_norm": 3.5271377563476562, - "learning_rate": 6.450925901877776e-05, - "loss": -30.4569, - "step": 26600 - }, - { - "epoch": 2.0498423198215523, - "grad_norm": 2.4597764015197754, - "learning_rate": 6.424953899696128e-05, - "loss": -30.4565, - "step": 26650 - }, - { - "epoch": 2.0536881778324743, - "grad_norm": 2.209780693054199, - "learning_rate": 6.39898189751448e-05, - "loss": -30.4567, - "step": 26700 - }, - { - "epoch": 2.0575340358433967, - "grad_norm": 2.887272834777832, - "learning_rate": 6.373009895332831e-05, - "loss": -30.4562, - "step": 26750 - }, - { - "epoch": 2.0575340358433967, - "eval_loss": -30.461790084838867, - "eval_runtime": 16.8176, - "eval_samples_per_second": 59.462, - "eval_steps_per_second": 14.865, - "step": 26750 - }, - { - "epoch": 2.061379893854319, - "grad_norm": 1.9459458589553833, - "learning_rate": 6.347037893151183e-05, - "loss": -30.4557, - "step": 26800 - }, - { - "epoch": 2.065225751865241, - "grad_norm": 1.4567102193832397, - "learning_rate": 6.321065890969535e-05, - "loss": -30.4562, - "step": 26850 - }, - { - "epoch": 2.0690716098761635, - "grad_norm": 2.8089401721954346, - "learning_rate": 6.295093888787887e-05, - "loss": -30.4562, - "step": 26900 - }, - { - "epoch": 2.0729174678870854, - "grad_norm": 1.6846117973327637, - "learning_rate": 6.269121886606239e-05, - "loss": -30.4563, - "step": 26950 - }, - { - "epoch": 2.076763325898008, - "grad_norm": 2.10868501663208, - "learning_rate": 6.243149884424591e-05, - "loss": -30.4575, - "step": 27000 - }, - { - "epoch": 2.076763325898008, - "eval_loss": -30.46200942993164, - "eval_runtime": 16.733, - "eval_samples_per_second": 59.762, - "eval_steps_per_second": 14.941, - "step": 27000 - }, - { - "epoch": 2.0806091839089302, - "grad_norm": 2.005777359008789, - "learning_rate": 6.217177882242942e-05, - "loss": -30.457, - "step": 27050 - }, - { - "epoch": 2.084455041919852, - "grad_norm": 3.0324509143829346, - "learning_rate": 6.191205880061294e-05, - "loss": -30.4566, - "step": 27100 - }, - { - "epoch": 2.0883008999307746, - "grad_norm": 2.6647162437438965, - "learning_rate": 6.165233877879646e-05, - "loss": -30.4574, - "step": 27150 - }, - { - "epoch": 2.0921467579416966, - "grad_norm": 2.078568458557129, - "learning_rate": 6.139261875697998e-05, - "loss": -30.4579, - "step": 27200 - }, - { - "epoch": 2.095992615952619, - "grad_norm": 1.8361953496932983, - "learning_rate": 6.11328987351635e-05, - "loss": -30.4566, - "step": 27250 - }, - { - "epoch": 2.095992615952619, - "eval_loss": -30.461936950683594, - "eval_runtime": 16.9333, - "eval_samples_per_second": 59.055, - "eval_steps_per_second": 14.764, - "step": 27250 - }, - { - "epoch": 2.0998384739635414, - "grad_norm": 2.176724433898926, - "learning_rate": 6.087317871334701e-05, - "loss": -30.4579, - "step": 27300 - }, - { - "epoch": 2.1036843319744634, - "grad_norm": 2.137570858001709, - "learning_rate": 6.0613458691530536e-05, - "loss": -30.4578, - "step": 27350 - }, - { - "epoch": 2.1075301899853858, - "grad_norm": 2.3757429122924805, - "learning_rate": 6.0353738669714055e-05, - "loss": -30.4576, - "step": 27400 - }, - { - "epoch": 2.111376047996308, - "grad_norm": 2.0854310989379883, - "learning_rate": 6.009401864789757e-05, - "loss": -30.4571, - "step": 27450 - }, - { - "epoch": 2.11522190600723, - "grad_norm": 2.1995458602905273, - "learning_rate": 5.983429862608109e-05, - "loss": -30.4575, - "step": 27500 - }, - { - "epoch": 2.11522190600723, - "eval_loss": -30.462617874145508, - "eval_runtime": 16.8806, - "eval_samples_per_second": 59.24, - "eval_steps_per_second": 14.81, - "step": 27500 - }, - { - "epoch": 2.1190677640181526, - "grad_norm": 1.9954910278320312, - "learning_rate": 5.957457860426461e-05, - "loss": -30.458, - "step": 27550 - }, - { - "epoch": 2.1229136220290745, - "grad_norm": 1.2620997428894043, - "learning_rate": 5.931485858244812e-05, - "loss": -30.4578, - "step": 27600 - }, - { - "epoch": 2.126759480039997, - "grad_norm": 1.8231252431869507, - "learning_rate": 5.905513856063164e-05, - "loss": -30.4571, - "step": 27650 - }, - { - "epoch": 2.1306053380509193, - "grad_norm": 2.8753602504730225, - "learning_rate": 5.879541853881516e-05, - "loss": -30.4576, - "step": 27700 - }, - { - "epoch": 2.1344511960618413, - "grad_norm": 2.1304397583007812, - "learning_rate": 5.8535698516998686e-05, - "loss": -30.4579, - "step": 27750 - }, - { - "epoch": 2.1344511960618413, - "eval_loss": -30.463788986206055, - "eval_runtime": 16.8747, - "eval_samples_per_second": 59.26, - "eval_steps_per_second": 14.815, - "step": 27750 - }, - { - "epoch": 2.1382970540727637, - "grad_norm": 2.8373219966888428, - "learning_rate": 5.827597849518219e-05, - "loss": -30.458, - "step": 27800 - }, - { - "epoch": 2.1421429120836857, - "grad_norm": 2.214996576309204, - "learning_rate": 5.801625847336571e-05, - "loss": -30.4577, - "step": 27850 - }, - { - "epoch": 2.145988770094608, - "grad_norm": 2.79176664352417, - "learning_rate": 5.775653845154924e-05, - "loss": -30.4586, - "step": 27900 - }, - { - "epoch": 2.1498346281055305, - "grad_norm": 3.1659934520721436, - "learning_rate": 5.7496818429732744e-05, - "loss": -30.4584, - "step": 27950 - }, - { - "epoch": 2.1536804861164525, - "grad_norm": 3.1562278270721436, - "learning_rate": 5.723709840791627e-05, - "loss": -30.4578, - "step": 28000 - }, - { - "epoch": 2.1536804861164525, - "eval_loss": -30.461944580078125, - "eval_runtime": 16.8316, - "eval_samples_per_second": 59.412, - "eval_steps_per_second": 14.853, - "step": 28000 - }, - { - "epoch": 2.157526344127375, - "grad_norm": 1.7737715244293213, - "learning_rate": 5.697737838609979e-05, - "loss": -30.4579, - "step": 28050 - }, - { - "epoch": 2.1613722021382973, - "grad_norm": 2.3100192546844482, - "learning_rate": 5.671765836428331e-05, - "loss": -30.4582, - "step": 28100 - }, - { - "epoch": 2.1652180601492192, - "grad_norm": 2.7968549728393555, - "learning_rate": 5.645793834246682e-05, - "loss": -30.4588, - "step": 28150 - }, - { - "epoch": 2.1690639181601417, - "grad_norm": 1.6727192401885986, - "learning_rate": 5.619821832065034e-05, - "loss": -30.4585, - "step": 28200 - }, - { - "epoch": 2.1729097761710636, - "grad_norm": 1.8535244464874268, - "learning_rate": 5.593849829883386e-05, - "loss": -30.4589, - "step": 28250 - }, - { - "epoch": 2.1729097761710636, - "eval_loss": -30.463205337524414, - "eval_runtime": 16.861, - "eval_samples_per_second": 59.308, - "eval_steps_per_second": 14.827, - "step": 28250 - }, - { - "epoch": 2.176755634181986, - "grad_norm": 1.9916008710861206, - "learning_rate": 5.5678778277017375e-05, - "loss": -30.4585, - "step": 28300 - }, - { - "epoch": 2.1806014921929084, - "grad_norm": 1.7134790420532227, - "learning_rate": 5.5419058255200894e-05, - "loss": -30.4589, - "step": 28350 - }, - { - "epoch": 2.1844473502038304, - "grad_norm": 1.7053008079528809, - "learning_rate": 5.5159338233384414e-05, - "loss": -30.4587, - "step": 28400 - }, - { - "epoch": 2.188293208214753, - "grad_norm": 2.964104413986206, - "learning_rate": 5.489961821156793e-05, - "loss": -30.4586, - "step": 28450 - }, - { - "epoch": 2.192139066225675, - "grad_norm": 1.901634931564331, - "learning_rate": 5.4639898189751446e-05, - "loss": -30.4595, - "step": 28500 - }, - { - "epoch": 2.192139066225675, - "eval_loss": -30.463890075683594, - "eval_runtime": 16.9187, - "eval_samples_per_second": 59.106, - "eval_steps_per_second": 14.777, - "step": 28500 - }, - { - "epoch": 2.195984924236597, - "grad_norm": 1.420623779296875, - "learning_rate": 5.438017816793497e-05, - "loss": -30.4589, - "step": 28550 - }, - { - "epoch": 2.1998307822475196, - "grad_norm": 2.0934250354766846, - "learning_rate": 5.412045814611849e-05, - "loss": -30.4585, - "step": 28600 - }, - { - "epoch": 2.2036766402584416, - "grad_norm": 1.6350470781326294, - "learning_rate": 5.3860738124302e-05, - "loss": -30.459, - "step": 28650 - }, - { - "epoch": 2.207522498269364, - "grad_norm": 1.5302081108093262, - "learning_rate": 5.3601018102485525e-05, - "loss": -30.4597, - "step": 28700 - }, - { - "epoch": 2.211368356280286, - "grad_norm": 1.9414024353027344, - "learning_rate": 5.3341298080669044e-05, - "loss": -30.4578, - "step": 28750 - }, - { - "epoch": 2.211368356280286, - "eval_loss": -30.459186553955078, - "eval_runtime": 16.8332, - "eval_samples_per_second": 59.406, - "eval_steps_per_second": 14.852, - "step": 28750 - }, - { - "epoch": 2.2152142142912084, - "grad_norm": 2.1578445434570312, - "learning_rate": 5.308157805885256e-05, - "loss": -30.4588, - "step": 28800 - }, - { - "epoch": 2.2190600723021308, - "grad_norm": 1.992924690246582, - "learning_rate": 5.282185803703608e-05, - "loss": -30.4565, - "step": 28850 - }, - { - "epoch": 2.2229059303130527, - "grad_norm": 1.7855414152145386, - "learning_rate": 5.2562138015219596e-05, - "loss": -30.4597, - "step": 28900 - }, - { - "epoch": 2.226751788323975, - "grad_norm": 2.323654890060425, - "learning_rate": 5.2302417993403116e-05, - "loss": -30.4597, - "step": 28950 - }, - { - "epoch": 2.230597646334897, - "grad_norm": 2.070629835128784, - "learning_rate": 5.204269797158663e-05, - "loss": -30.4596, - "step": 29000 - }, - { - "epoch": 2.230597646334897, - "eval_loss": -30.4643497467041, - "eval_runtime": 16.8617, - "eval_samples_per_second": 59.306, - "eval_steps_per_second": 14.827, - "step": 29000 - }, - { - "epoch": 2.2344435043458195, - "grad_norm": 2.0510427951812744, - "learning_rate": 5.178297794977015e-05, - "loss": -30.4589, - "step": 29050 - }, - { - "epoch": 2.238289362356742, - "grad_norm": 1.5907058715820312, - "learning_rate": 5.1523257927953675e-05, - "loss": -30.4597, - "step": 29100 - }, - { - "epoch": 2.242135220367664, - "grad_norm": 2.039212226867676, - "learning_rate": 5.126353790613718e-05, - "loss": -30.4604, - "step": 29150 - }, - { - "epoch": 2.2459810783785863, - "grad_norm": 1.4781228303909302, - "learning_rate": 5.10038178843207e-05, - "loss": -30.4605, - "step": 29200 - }, - { - "epoch": 2.2498269363895087, - "grad_norm": 1.5673744678497314, - "learning_rate": 5.074409786250423e-05, - "loss": -30.4595, - "step": 29250 - }, - { - "epoch": 2.2498269363895087, - "eval_loss": -30.46396827697754, - "eval_runtime": 16.8497, - "eval_samples_per_second": 59.348, - "eval_steps_per_second": 14.837, - "step": 29250 - }, - { - "epoch": 2.2536727944004307, - "grad_norm": 1.5207022428512573, - "learning_rate": 5.048437784068775e-05, - "loss": -30.4592, - "step": 29300 - }, - { - "epoch": 2.257518652411353, - "grad_norm": 1.9671216011047363, - "learning_rate": 5.022465781887126e-05, - "loss": -30.46, - "step": 29350 - }, - { - "epoch": 2.261364510422275, - "grad_norm": 1.6448663473129272, - "learning_rate": 4.996493779705478e-05, - "loss": -30.4604, - "step": 29400 - }, - { - "epoch": 2.2652103684331975, - "grad_norm": 2.366356372833252, - "learning_rate": 4.970521777523829e-05, - "loss": -30.4599, - "step": 29450 - }, - { - "epoch": 2.26905622644412, - "grad_norm": 2.137887716293335, - "learning_rate": 4.944549775342182e-05, - "loss": -30.4605, - "step": 29500 - }, - { - "epoch": 2.26905622644412, - "eval_loss": -30.46525001525879, - "eval_runtime": 16.856, - "eval_samples_per_second": 59.326, - "eval_steps_per_second": 14.832, - "step": 29500 - }, - { - "epoch": 2.272902084455042, - "grad_norm": 2.2537872791290283, - "learning_rate": 4.918577773160533e-05, - "loss": -30.4599, - "step": 29550 - }, - { - "epoch": 2.2767479424659642, - "grad_norm": 1.9328151941299438, - "learning_rate": 4.8926057709788844e-05, - "loss": -30.4602, - "step": 29600 - }, - { - "epoch": 2.280593800476886, - "grad_norm": 2.076399326324463, - "learning_rate": 4.866633768797237e-05, - "loss": -30.4607, - "step": 29650 - }, - { - "epoch": 2.2844396584878086, - "grad_norm": 1.552717924118042, - "learning_rate": 4.840661766615588e-05, - "loss": -30.46, - "step": 29700 - }, - { - "epoch": 2.288285516498731, - "grad_norm": 1.7150416374206543, - "learning_rate": 4.81468976443394e-05, - "loss": -30.4601, - "step": 29750 - }, - { - "epoch": 2.288285516498731, - "eval_loss": -30.464597702026367, - "eval_runtime": 16.899, - "eval_samples_per_second": 59.175, - "eval_steps_per_second": 14.794, - "step": 29750 - }, - { - "epoch": 2.292131374509653, - "grad_norm": 2.387169122695923, - "learning_rate": 4.788717762252292e-05, - "loss": -30.4604, - "step": 29800 - }, - { - "epoch": 2.2959772325205754, - "grad_norm": 2.8682591915130615, - "learning_rate": 4.762745760070644e-05, - "loss": -30.4607, - "step": 29850 - }, - { - "epoch": 2.299823090531498, - "grad_norm": 2.2639787197113037, - "learning_rate": 4.736773757888996e-05, - "loss": -30.4605, - "step": 29900 - }, - { - "epoch": 2.3036689485424198, - "grad_norm": 1.8516710996627808, - "learning_rate": 4.7108017557073475e-05, - "loss": -30.4601, - "step": 29950 - }, - { - "epoch": 2.307514806553342, - "grad_norm": 2.0204548835754395, - "learning_rate": 4.6848297535256994e-05, - "loss": -30.4602, - "step": 30000 - }, - { - "epoch": 2.307514806553342, - "eval_loss": -30.46541976928711, - "eval_runtime": 16.9539, - "eval_samples_per_second": 58.983, - "eval_steps_per_second": 14.746, - "step": 30000 - }, - { - "epoch": 2.311360664564264, - "grad_norm": 1.7279092073440552, - "learning_rate": 4.6588577513440514e-05, - "loss": -30.4612, - "step": 30050 - }, - { - "epoch": 2.3152065225751866, - "grad_norm": 1.8211647272109985, - "learning_rate": 4.632885749162403e-05, - "loss": -30.4612, - "step": 30100 - }, - { - "epoch": 2.3190523805861085, - "grad_norm": 2.0550918579101562, - "learning_rate": 4.6069137469807546e-05, - "loss": -30.4605, - "step": 30150 - }, - { - "epoch": 2.322898238597031, - "grad_norm": 1.8465496301651, - "learning_rate": 4.5809417447991066e-05, - "loss": -30.461, - "step": 30200 - }, - { - "epoch": 2.3267440966079533, - "grad_norm": 2.338982582092285, - "learning_rate": 4.5549697426174585e-05, - "loss": -30.4609, - "step": 30250 - }, - { - "epoch": 2.3267440966079533, - "eval_loss": -30.466022491455078, - "eval_runtime": 16.8437, - "eval_samples_per_second": 59.37, - "eval_steps_per_second": 14.842, - "step": 30250 - }, - { - "epoch": 2.3305899546188753, - "grad_norm": 2.0003364086151123, - "learning_rate": 4.5289977404358105e-05, - "loss": -30.4608, - "step": 30300 - }, - { - "epoch": 2.3344358126297977, - "grad_norm": 1.2678554058074951, - "learning_rate": 4.5030257382541625e-05, - "loss": -30.4611, - "step": 30350 - }, - { - "epoch": 2.33828167064072, - "grad_norm": 1.6590012311935425, - "learning_rate": 4.477053736072514e-05, - "loss": -30.4602, - "step": 30400 - }, - { - "epoch": 2.342127528651642, - "grad_norm": 2.2486422061920166, - "learning_rate": 4.4510817338908664e-05, - "loss": -30.4607, - "step": 30450 - }, - { - "epoch": 2.3459733866625645, - "grad_norm": 1.6569013595581055, - "learning_rate": 4.425109731709218e-05, - "loss": -30.4617, - "step": 30500 - }, - { - "epoch": 2.3459733866625645, - "eval_loss": -30.465835571289062, - "eval_runtime": 17.0015, - "eval_samples_per_second": 58.818, - "eval_steps_per_second": 14.705, - "step": 30500 - }, - { - "epoch": 2.3498192446734865, - "grad_norm": 1.567749261856079, - "learning_rate": 4.3991377295275696e-05, - "loss": -30.4615, - "step": 30550 - }, - { - "epoch": 2.353665102684409, - "grad_norm": 1.2779616117477417, - "learning_rate": 4.3731657273459216e-05, - "loss": -30.4614, - "step": 30600 - }, - { - "epoch": 2.3575109606953313, - "grad_norm": 1.9518656730651855, - "learning_rate": 4.347193725164273e-05, - "loss": -30.4614, - "step": 30650 - }, - { - "epoch": 2.3613568187062532, - "grad_norm": 1.6477199792861938, - "learning_rate": 4.321221722982625e-05, - "loss": -30.4617, - "step": 30700 - }, - { - "epoch": 2.3652026767171757, - "grad_norm": 1.1383932828903198, - "learning_rate": 4.295249720800977e-05, - "loss": -30.462, - "step": 30750 - }, - { - "epoch": 2.3652026767171757, - "eval_loss": -30.466596603393555, - "eval_runtime": 16.9152, - "eval_samples_per_second": 59.119, - "eval_steps_per_second": 14.78, - "step": 30750 - }, - { - "epoch": 2.3690485347280976, - "grad_norm": 1.8575818538665771, - "learning_rate": 4.269277718619328e-05, - "loss": -30.4619, - "step": 30800 - }, - { - "epoch": 2.37289439273902, - "grad_norm": 1.8746001720428467, - "learning_rate": 4.243305716437681e-05, - "loss": -30.4617, - "step": 30850 - }, - { - "epoch": 2.3767402507499424, - "grad_norm": 1.870802402496338, - "learning_rate": 4.217333714256032e-05, - "loss": -30.4619, - "step": 30900 - }, - { - "epoch": 2.3805861087608644, - "grad_norm": 1.6408015489578247, - "learning_rate": 4.191361712074384e-05, - "loss": -30.462, - "step": 30950 - }, - { - "epoch": 2.384431966771787, - "grad_norm": 1.2397960424423218, - "learning_rate": 4.165389709892736e-05, - "loss": -30.4618, - "step": 31000 - }, - { - "epoch": 2.384431966771787, - "eval_loss": -30.46613121032715, - "eval_runtime": 16.8356, - "eval_samples_per_second": 59.398, - "eval_steps_per_second": 14.85, - "step": 31000 - }, - { - "epoch": 2.3882778247827092, - "grad_norm": 1.8404053449630737, - "learning_rate": 4.139417707711087e-05, - "loss": -30.462, - "step": 31050 - }, - { - "epoch": 2.392123682793631, - "grad_norm": 2.467785596847534, - "learning_rate": 4.11344570552944e-05, - "loss": -30.4619, - "step": 31100 - }, - { - "epoch": 2.3959695408045536, - "grad_norm": 2.1997389793395996, - "learning_rate": 4.087473703347791e-05, - "loss": -30.462, - "step": 31150 - }, - { - "epoch": 2.3998153988154756, - "grad_norm": 2.1173219680786133, - "learning_rate": 4.061501701166143e-05, - "loss": -30.4621, - "step": 31200 - }, - { - "epoch": 2.403661256826398, - "grad_norm": 1.7325748205184937, - "learning_rate": 4.035529698984495e-05, - "loss": -30.4621, - "step": 31250 - }, - { - "epoch": 2.403661256826398, - "eval_loss": -30.46693229675293, - "eval_runtime": 16.8671, - "eval_samples_per_second": 59.287, - "eval_steps_per_second": 14.822, - "step": 31250 - }, - { - "epoch": 2.4075071148373204, - "grad_norm": 1.5806682109832764, - "learning_rate": 4.009557696802847e-05, - "loss": -30.4619, - "step": 31300 - }, - { - "epoch": 2.4113529728482423, - "grad_norm": 2.6714529991149902, - "learning_rate": 3.983585694621198e-05, - "loss": -30.4617, - "step": 31350 - }, - { - "epoch": 2.4151988308591648, - "grad_norm": 1.4386422634124756, - "learning_rate": 3.95761369243955e-05, - "loss": -30.462, - "step": 31400 - }, - { - "epoch": 2.4190446888700867, - "grad_norm": 1.7787779569625854, - "learning_rate": 3.931641690257902e-05, - "loss": -30.462, - "step": 31450 - }, - { - "epoch": 2.422890546881009, - "grad_norm": 1.5804333686828613, - "learning_rate": 3.905669688076254e-05, - "loss": -30.4619, - "step": 31500 - }, - { - "epoch": 2.422890546881009, - "eval_loss": -30.466127395629883, - "eval_runtime": 16.9571, - "eval_samples_per_second": 58.972, - "eval_steps_per_second": 14.743, - "step": 31500 - }, - { - "epoch": 2.4267364048919315, - "grad_norm": 2.5385239124298096, - "learning_rate": 3.879697685894606e-05, - "loss": -30.4623, - "step": 31550 - }, - { - "epoch": 2.4305822629028535, - "grad_norm": 2.142542839050293, - "learning_rate": 3.8537256837129574e-05, - "loss": -30.4624, - "step": 31600 - }, - { - "epoch": 2.434428120913776, - "grad_norm": 1.5194621086120605, - "learning_rate": 3.8277536815313094e-05, - "loss": -30.4625, - "step": 31650 - }, - { - "epoch": 2.4382739789246983, - "grad_norm": 1.2033811807632446, - "learning_rate": 3.8017816793496614e-05, - "loss": -30.4624, - "step": 31700 - }, - { - "epoch": 2.4421198369356203, - "grad_norm": 3.1859564781188965, - "learning_rate": 3.7758096771680126e-05, - "loss": -30.4621, - "step": 31750 - }, - { - "epoch": 2.4421198369356203, - "eval_loss": -30.46595573425293, - "eval_runtime": 16.8541, - "eval_samples_per_second": 59.333, - "eval_steps_per_second": 14.833, - "step": 31750 - }, - { - "epoch": 2.4459656949465427, - "grad_norm": 2.844355821609497, - "learning_rate": 3.749837674986365e-05, - "loss": -30.4622, - "step": 31800 - }, - { - "epoch": 2.4498115529574647, - "grad_norm": 1.9936915636062622, - "learning_rate": 3.7238656728047166e-05, - "loss": -30.4626, - "step": 31850 - }, - { - "epoch": 2.453657410968387, - "grad_norm": 1.0412544012069702, - "learning_rate": 3.6978936706230685e-05, - "loss": -30.4626, - "step": 31900 - }, - { - "epoch": 2.457503268979309, - "grad_norm": 2.455996513366699, - "learning_rate": 3.6719216684414205e-05, - "loss": -30.4624, - "step": 31950 - }, - { - "epoch": 2.4613491269902315, - "grad_norm": 1.5153650045394897, - "learning_rate": 3.645949666259772e-05, - "loss": -30.4627, - "step": 32000 - }, - { - "epoch": 2.4613491269902315, - "eval_loss": -30.466768264770508, - "eval_runtime": 17.0502, - "eval_samples_per_second": 58.65, - "eval_steps_per_second": 14.663, - "step": 32000 - }, - { - "epoch": 2.465194985001154, - "grad_norm": 1.4773210287094116, - "learning_rate": 3.6199776640781244e-05, - "loss": -30.4631, - "step": 32050 - }, - { - "epoch": 2.469040843012076, - "grad_norm": 2.181157112121582, - "learning_rate": 3.594005661896476e-05, - "loss": -30.4627, - "step": 32100 - }, - { - "epoch": 2.4728867010229982, - "grad_norm": 1.5130398273468018, - "learning_rate": 3.5680336597148277e-05, - "loss": -30.463, - "step": 32150 - }, - { - "epoch": 2.4767325590339206, - "grad_norm": 1.2299845218658447, - "learning_rate": 3.5420616575331796e-05, - "loss": -30.4632, - "step": 32200 - }, - { - "epoch": 2.4805784170448426, - "grad_norm": 1.3962304592132568, - "learning_rate": 3.516089655351531e-05, - "loss": -30.4632, - "step": 32250 - }, - { - "epoch": 2.4805784170448426, - "eval_loss": -30.467247009277344, - "eval_runtime": 16.7784, - "eval_samples_per_second": 59.6, - "eval_steps_per_second": 14.9, - "step": 32250 - }, - { - "epoch": 2.484424275055765, - "grad_norm": 1.6033949851989746, - "learning_rate": 3.490117653169883e-05, - "loss": -30.4632, - "step": 32300 - }, - { - "epoch": 2.488270133066687, - "grad_norm": 1.4287469387054443, - "learning_rate": 3.464145650988235e-05, - "loss": -30.4626, - "step": 32350 - }, - { - "epoch": 2.4921159910776094, - "grad_norm": 2.5753841400146484, - "learning_rate": 3.438173648806587e-05, - "loss": -30.4633, - "step": 32400 - }, - { - "epoch": 2.495961849088532, - "grad_norm": 1.5068596601486206, - "learning_rate": 3.412201646624939e-05, - "loss": -30.4631, - "step": 32450 - }, - { - "epoch": 2.4998077070994538, - "grad_norm": 1.6092793941497803, - "learning_rate": 3.38622964444329e-05, - "loss": -30.4633, - "step": 32500 - }, - { - "epoch": 2.4998077070994538, - "eval_loss": -30.467578887939453, - "eval_runtime": 16.8772, - "eval_samples_per_second": 59.252, - "eval_steps_per_second": 14.813, - "step": 32500 - }, - { - "epoch": 2.503653565110376, - "grad_norm": 0.964199423789978, - "learning_rate": 3.360257642261642e-05, - "loss": -30.463, - "step": 32550 - }, - { - "epoch": 2.507499423121298, - "grad_norm": 2.097522497177124, - "learning_rate": 3.334285640079994e-05, - "loss": -30.4637, - "step": 32600 - }, - { - "epoch": 2.5113452811322206, - "grad_norm": 1.688252568244934, - "learning_rate": 3.308313637898346e-05, - "loss": -30.4634, - "step": 32650 - }, - { - "epoch": 2.515191139143143, - "grad_norm": 1.733709454536438, - "learning_rate": 3.282341635716697e-05, - "loss": -30.4629, - "step": 32700 - }, - { - "epoch": 2.519036997154065, - "grad_norm": 1.897979974746704, - "learning_rate": 3.25636963353505e-05, - "loss": -30.4629, - "step": 32750 - }, - { - "epoch": 2.519036997154065, - "eval_loss": -30.46739387512207, - "eval_runtime": 16.8323, - "eval_samples_per_second": 59.41, - "eval_steps_per_second": 14.852, - "step": 32750 - }, - { - "epoch": 2.5228828551649873, - "grad_norm": 0.9445510506629944, - "learning_rate": 3.230397631353401e-05, - "loss": -30.4633, - "step": 32800 - }, - { - "epoch": 2.5267287131759097, - "grad_norm": 1.7918905019760132, - "learning_rate": 3.204425629171753e-05, - "loss": -30.4636, - "step": 32850 - }, - { - "epoch": 2.5305745711868317, - "grad_norm": 2.108637571334839, - "learning_rate": 3.178453626990105e-05, - "loss": -30.4635, - "step": 32900 - }, - { - "epoch": 2.534420429197754, - "grad_norm": 1.0502158403396606, - "learning_rate": 3.152481624808456e-05, - "loss": -30.4638, - "step": 32950 - }, - { - "epoch": 2.5382662872086765, - "grad_norm": 1.5265525579452515, - "learning_rate": 3.126509622626809e-05, - "loss": -30.4632, - "step": 33000 - }, - { - "epoch": 2.5382662872086765, - "eval_loss": -30.46713638305664, - "eval_runtime": 16.8812, - "eval_samples_per_second": 59.237, - "eval_steps_per_second": 14.809, - "step": 33000 - }, - { - "epoch": 2.5421121452195985, - "grad_norm": 1.2125709056854248, - "learning_rate": 3.10053762044516e-05, - "loss": -30.4629, - "step": 33050 - }, - { - "epoch": 2.5459580032305205, - "grad_norm": 1.4404875040054321, - "learning_rate": 3.0745656182635115e-05, - "loss": -30.464, - "step": 33100 - }, - { - "epoch": 2.549803861241443, - "grad_norm": 1.889853835105896, - "learning_rate": 3.048593616081864e-05, - "loss": -30.4633, - "step": 33150 - }, - { - "epoch": 2.5536497192523653, - "grad_norm": 1.402807593345642, - "learning_rate": 3.0226216139002155e-05, - "loss": -30.4639, - "step": 33200 - }, - { - "epoch": 2.5574955772632872, - "grad_norm": 1.044944405555725, - "learning_rate": 2.9966496117185678e-05, - "loss": -30.4639, - "step": 33250 - }, - { - "epoch": 2.5574955772632872, - "eval_loss": -30.468294143676758, - "eval_runtime": 16.9528, - "eval_samples_per_second": 58.987, - "eval_steps_per_second": 14.747, - "step": 33250 - }, - { - "epoch": 2.5613414352742097, - "grad_norm": 1.4782401323318481, - "learning_rate": 2.9706776095369194e-05, - "loss": -30.4639, - "step": 33300 - }, - { - "epoch": 2.565187293285132, - "grad_norm": 1.0335558652877808, - "learning_rate": 2.944705607355271e-05, - "loss": -30.4635, - "step": 33350 - }, - { - "epoch": 2.569033151296054, - "grad_norm": 1.1205657720565796, - "learning_rate": 2.918733605173623e-05, - "loss": -30.4643, - "step": 33400 - }, - { - "epoch": 2.5728790093069764, - "grad_norm": 1.6154303550720215, - "learning_rate": 2.8927616029919746e-05, - "loss": -30.4641, - "step": 33450 - }, - { - "epoch": 2.576724867317899, - "grad_norm": 1.305467128753662, - "learning_rate": 2.866789600810327e-05, - "loss": -30.4642, - "step": 33500 - }, - { - "epoch": 2.576724867317899, - "eval_loss": -30.468229293823242, - "eval_runtime": 16.8335, - "eval_samples_per_second": 59.405, - "eval_steps_per_second": 14.851, - "step": 33500 - }, - { - "epoch": 2.580570725328821, - "grad_norm": 1.4428735971450806, - "learning_rate": 2.8408175986286785e-05, - "loss": -30.4636, - "step": 33550 - }, - { - "epoch": 2.5844165833397432, - "grad_norm": 1.3220546245574951, - "learning_rate": 2.8148455964470305e-05, - "loss": -30.4638, - "step": 33600 - }, - { - "epoch": 2.588262441350665, - "grad_norm": 1.3444154262542725, - "learning_rate": 2.788873594265382e-05, - "loss": -30.464, - "step": 33650 - }, - { - "epoch": 2.5921082993615876, - "grad_norm": 1.2507829666137695, - "learning_rate": 2.7629015920837337e-05, - "loss": -30.464, - "step": 33700 - }, - { - "epoch": 2.5959541573725096, - "grad_norm": 1.4041513204574585, - "learning_rate": 2.7369295899020857e-05, - "loss": -30.4643, - "step": 33750 - }, - { - "epoch": 2.5959541573725096, - "eval_loss": -30.468578338623047, - "eval_runtime": 16.9279, - "eval_samples_per_second": 59.074, - "eval_steps_per_second": 14.769, - "step": 33750 - }, - { - "epoch": 2.599800015383432, - "grad_norm": 1.2238858938217163, - "learning_rate": 2.7109575877204373e-05, - "loss": -30.4641, - "step": 33800 - }, - { - "epoch": 2.6036458733943544, - "grad_norm": 0.8141158819198608, - "learning_rate": 2.6849855855387896e-05, - "loss": -30.4641, - "step": 33850 - }, - { - "epoch": 2.6074917314052763, - "grad_norm": 3.0219566822052, - "learning_rate": 2.6590135833571412e-05, - "loss": -30.4642, - "step": 33900 - }, - { - "epoch": 2.6113375894161988, - "grad_norm": 1.9201821088790894, - "learning_rate": 2.633041581175493e-05, - "loss": -30.4641, - "step": 33950 - }, - { - "epoch": 2.615183447427121, - "grad_norm": 1.4936727285385132, - "learning_rate": 2.6070695789938448e-05, - "loss": -30.4645, - "step": 34000 - }, - { - "epoch": 2.615183447427121, - "eval_loss": -30.46862030029297, - "eval_runtime": 16.8191, - "eval_samples_per_second": 59.456, - "eval_steps_per_second": 14.864, - "step": 34000 - }, - { - "epoch": 2.619029305438043, - "grad_norm": 0.9636225700378418, - "learning_rate": 2.5810975768121964e-05, - "loss": -30.4646, - "step": 34050 - }, - { - "epoch": 2.6228751634489655, - "grad_norm": 1.7576632499694824, - "learning_rate": 2.5551255746305487e-05, - "loss": -30.4644, - "step": 34100 - }, - { - "epoch": 2.626721021459888, - "grad_norm": 1.2852941751480103, - "learning_rate": 2.5291535724489e-05, - "loss": -30.4646, - "step": 34150 - }, - { - "epoch": 2.63056687947081, - "grad_norm": 1.7288777828216553, - "learning_rate": 2.5031815702672516e-05, - "loss": -30.4642, - "step": 34200 - }, - { - "epoch": 2.6344127374817323, - "grad_norm": 1.7094942331314087, - "learning_rate": 2.477209568085604e-05, - "loss": -30.4648, - "step": 34250 - }, - { - "epoch": 2.6344127374817323, - "eval_loss": -30.468746185302734, - "eval_runtime": 16.7565, - "eval_samples_per_second": 59.678, - "eval_steps_per_second": 14.92, - "step": 34250 - }, - { - "epoch": 2.6382585954926543, - "grad_norm": 1.2149583101272583, - "learning_rate": 2.451237565903956e-05, - "loss": -30.4644, - "step": 34300 - }, - { - "epoch": 2.6421044535035767, - "grad_norm": 1.0909748077392578, - "learning_rate": 2.4252655637223072e-05, - "loss": -30.4647, - "step": 34350 - }, - { - "epoch": 2.6459503115144987, - "grad_norm": 1.1797704696655273, - "learning_rate": 2.399293561540659e-05, - "loss": -30.4645, - "step": 34400 - }, - { - "epoch": 2.649796169525421, - "grad_norm": 1.0700589418411255, - "learning_rate": 2.373321559359011e-05, - "loss": -30.465, - "step": 34450 - }, - { - "epoch": 2.6536420275363435, - "grad_norm": 1.4579436779022217, - "learning_rate": 2.347349557177363e-05, - "loss": -30.4646, - "step": 34500 - }, - { - "epoch": 2.6536420275363435, - "eval_loss": -30.46938705444336, - "eval_runtime": 16.8439, - "eval_samples_per_second": 59.369, - "eval_steps_per_second": 14.842, - "step": 34500 - }, - { - "epoch": 2.6574878855472654, - "grad_norm": 0.7920024991035461, - "learning_rate": 2.3213775549957147e-05, - "loss": -30.4646, - "step": 34550 - }, - { - "epoch": 2.661333743558188, - "grad_norm": 1.470051646232605, - "learning_rate": 2.2954055528140667e-05, - "loss": -30.4651, - "step": 34600 - }, - { - "epoch": 2.6651796015691103, - "grad_norm": 0.7992119193077087, - "learning_rate": 2.2694335506324183e-05, - "loss": -30.4645, - "step": 34650 - }, - { - "epoch": 2.6690254595800322, - "grad_norm": 0.9150895476341248, - "learning_rate": 2.2434615484507702e-05, - "loss": -30.4651, - "step": 34700 - }, - { - "epoch": 2.6728713175909546, - "grad_norm": 1.7401885986328125, - "learning_rate": 2.217489546269122e-05, - "loss": -30.4645, - "step": 34750 - }, - { - "epoch": 2.6728713175909546, - "eval_loss": -30.468637466430664, - "eval_runtime": 16.8394, - "eval_samples_per_second": 59.384, - "eval_steps_per_second": 14.846, - "step": 34750 - }, - { - "epoch": 2.676717175601877, - "grad_norm": 1.014699935913086, - "learning_rate": 2.1915175440874738e-05, - "loss": -30.465, - "step": 34800 - }, - { - "epoch": 2.680563033612799, - "grad_norm": 0.9657973051071167, - "learning_rate": 2.1655455419058258e-05, - "loss": -30.465, - "step": 34850 - }, - { - "epoch": 2.684408891623721, - "grad_norm": 1.1033378839492798, - "learning_rate": 2.1395735397241774e-05, - "loss": -30.465, - "step": 34900 - }, - { - "epoch": 2.6882547496346434, - "grad_norm": 3.0065767765045166, - "learning_rate": 2.113601537542529e-05, - "loss": -30.4651, - "step": 34950 - }, - { - "epoch": 2.692100607645566, - "grad_norm": 1.226468563079834, - "learning_rate": 2.087629535360881e-05, - "loss": -30.4653, - "step": 35000 - }, - { - "epoch": 2.692100607645566, - "eval_loss": -30.46967315673828, - "eval_runtime": 16.9427, - "eval_samples_per_second": 59.022, - "eval_steps_per_second": 14.756, - "step": 35000 - }, - { - "epoch": 2.6959464656564878, - "grad_norm": 1.8203473091125488, - "learning_rate": 2.061657533179233e-05, - "loss": -30.4653, - "step": 35050 - }, - { - "epoch": 2.69979232366741, - "grad_norm": 1.8032426834106445, - "learning_rate": 2.0356855309975846e-05, - "loss": -30.4652, - "step": 35100 - }, - { - "epoch": 2.7036381816783326, - "grad_norm": 1.6480307579040527, - "learning_rate": 2.0097135288159365e-05, - "loss": -30.465, - "step": 35150 - }, - { - "epoch": 2.7074840396892546, - "grad_norm": 1.4139653444290161, - "learning_rate": 1.983741526634288e-05, - "loss": -30.4647, - "step": 35200 - }, - { - "epoch": 2.711329897700177, - "grad_norm": 1.9126203060150146, - "learning_rate": 1.95776952445264e-05, - "loss": -30.4651, - "step": 35250 - }, - { - "epoch": 2.711329897700177, - "eval_loss": -30.4685115814209, - "eval_runtime": 17.0035, - "eval_samples_per_second": 58.812, - "eval_steps_per_second": 14.703, - "step": 35250 - }, - { - "epoch": 2.7151757557110994, - "grad_norm": 1.5345407724380493, - "learning_rate": 1.931797522270992e-05, - "loss": -30.465, - "step": 35300 - }, - { - "epoch": 2.7190216137220213, - "grad_norm": 0.8343766331672668, - "learning_rate": 1.9058255200893437e-05, - "loss": -30.4653, - "step": 35350 - }, - { - "epoch": 2.7228674717329437, - "grad_norm": 0.7670063972473145, - "learning_rate": 1.8798535179076957e-05, - "loss": -30.4652, - "step": 35400 - }, - { - "epoch": 2.7267133297438657, - "grad_norm": 0.913152813911438, - "learning_rate": 1.8538815157260476e-05, - "loss": -30.4652, - "step": 35450 - }, - { - "epoch": 2.730559187754788, - "grad_norm": 1.1838785409927368, - "learning_rate": 1.8279095135443993e-05, - "loss": -30.4654, - "step": 35500 - }, - { - "epoch": 2.730559187754788, - "eval_loss": -30.469755172729492, - "eval_runtime": 16.8431, - "eval_samples_per_second": 59.371, - "eval_steps_per_second": 14.843, - "step": 35500 - }, - { - "epoch": 2.73440504576571, - "grad_norm": 1.0075780153274536, - "learning_rate": 1.801937511362751e-05, - "loss": -30.4655, - "step": 35550 - }, - { - "epoch": 2.7382509037766325, - "grad_norm": 1.856740117073059, - "learning_rate": 1.775965509181103e-05, - "loss": -30.4653, - "step": 35600 - }, - { - "epoch": 2.742096761787555, - "grad_norm": 0.6788634657859802, - "learning_rate": 1.7499935069994548e-05, - "loss": -30.4655, - "step": 35650 - }, - { - "epoch": 2.745942619798477, - "grad_norm": 0.9009542465209961, - "learning_rate": 1.7240215048178064e-05, - "loss": -30.4656, - "step": 35700 - }, - { - "epoch": 2.7497884778093993, - "grad_norm": 1.9731862545013428, - "learning_rate": 1.6980495026361584e-05, - "loss": -30.4643, - "step": 35750 - }, - { - "epoch": 2.7497884778093993, - "eval_loss": -30.46780014038086, - "eval_runtime": 16.8878, - "eval_samples_per_second": 59.214, - "eval_steps_per_second": 14.804, - "step": 35750 - }, - { - "epoch": 2.7536343358203217, - "grad_norm": 1.0164039134979248, - "learning_rate": 1.67207750045451e-05, - "loss": -30.4655, - "step": 35800 - }, - { - "epoch": 2.7574801938312437, - "grad_norm": 1.1127376556396484, - "learning_rate": 1.646105498272862e-05, - "loss": -30.4657, - "step": 35850 - }, - { - "epoch": 2.761326051842166, - "grad_norm": 1.2492812871932983, - "learning_rate": 1.6201334960912136e-05, - "loss": -30.4659, - "step": 35900 - }, - { - "epoch": 2.7651719098530885, - "grad_norm": 0.944760799407959, - "learning_rate": 1.5941614939095655e-05, - "loss": -30.4654, - "step": 35950 - }, - { - "epoch": 2.7690177678640104, - "grad_norm": 0.7091095447540283, - "learning_rate": 1.5681894917279175e-05, - "loss": -30.4654, - "step": 36000 - }, - { - "epoch": 2.7690177678640104, - "eval_loss": -30.46934700012207, - "eval_runtime": 16.7482, - "eval_samples_per_second": 59.708, - "eval_steps_per_second": 14.927, - "step": 36000 - }, - { - "epoch": 2.772863625874933, - "grad_norm": 2.303079128265381, - "learning_rate": 1.5422174895462695e-05, - "loss": -30.4658, - "step": 36050 - }, - { - "epoch": 2.776709483885855, - "grad_norm": 1.1409249305725098, - "learning_rate": 1.516245487364621e-05, - "loss": -30.4659, - "step": 36100 - }, - { - "epoch": 2.780555341896777, - "grad_norm": 0.9111425280570984, - "learning_rate": 1.4902734851829727e-05, - "loss": -30.4658, - "step": 36150 - }, - { - "epoch": 2.784401199907699, - "grad_norm": 1.3415731191635132, - "learning_rate": 1.4643014830013247e-05, - "loss": -30.4659, - "step": 36200 - }, - { - "epoch": 2.7882470579186216, - "grad_norm": 0.8532393574714661, - "learning_rate": 1.4383294808196765e-05, - "loss": -30.4658, - "step": 36250 - }, - { - "epoch": 2.7882470579186216, - "eval_loss": -30.469867706298828, - "eval_runtime": 17.0192, - "eval_samples_per_second": 58.757, - "eval_steps_per_second": 14.689, - "step": 36250 - }, - { - "epoch": 2.792092915929544, - "grad_norm": 1.6142336130142212, - "learning_rate": 1.4123574786380284e-05, - "loss": -30.4662, - "step": 36300 - }, - { - "epoch": 2.795938773940466, - "grad_norm": 1.9979597330093384, - "learning_rate": 1.3863854764563799e-05, - "loss": -30.4652, - "step": 36350 - }, - { - "epoch": 2.7997846319513884, - "grad_norm": 1.4881914854049683, - "learning_rate": 1.3604134742747318e-05, - "loss": -30.466, - "step": 36400 - }, - { - "epoch": 2.803630489962311, - "grad_norm": 2.0614233016967773, - "learning_rate": 1.3344414720930836e-05, - "loss": -30.4658, - "step": 36450 - }, - { - "epoch": 2.8074763479732328, - "grad_norm": 1.0104310512542725, - "learning_rate": 1.3084694699114356e-05, - "loss": -30.466, - "step": 36500 - }, - { - "epoch": 2.8074763479732328, - "eval_loss": -30.47022247314453, - "eval_runtime": 16.8782, - "eval_samples_per_second": 59.248, - "eval_steps_per_second": 14.812, - "step": 36500 - }, - { - "epoch": 2.811322205984155, - "grad_norm": 0.8461622595787048, - "learning_rate": 1.2824974677297874e-05, - "loss": -30.466, - "step": 36550 - }, - { - "epoch": 2.8151680639950776, - "grad_norm": 1.3280683755874634, - "learning_rate": 1.2565254655481392e-05, - "loss": -30.466, - "step": 36600 - }, - { - "epoch": 2.8190139220059995, - "grad_norm": 1.5778801441192627, - "learning_rate": 1.230553463366491e-05, - "loss": -30.4659, - "step": 36650 - }, - { - "epoch": 2.8228597800169215, - "grad_norm": 1.6054023504257202, - "learning_rate": 1.204581461184843e-05, - "loss": -30.4661, - "step": 36700 - }, - { - "epoch": 2.826705638027844, - "grad_norm": 1.3517789840698242, - "learning_rate": 1.1786094590031946e-05, - "loss": -30.4661, - "step": 36750 - }, - { - "epoch": 2.826705638027844, - "eval_loss": -30.470460891723633, - "eval_runtime": 16.9586, - "eval_samples_per_second": 58.967, - "eval_steps_per_second": 14.742, - "step": 36750 - }, - { - "epoch": 2.8305514960387663, - "grad_norm": 1.1399978399276733, - "learning_rate": 1.1526374568215465e-05, - "loss": -30.4658, - "step": 36800 - }, - { - "epoch": 2.8343973540496883, - "grad_norm": 0.8231783509254456, - "learning_rate": 1.1266654546398983e-05, - "loss": -30.4663, - "step": 36850 - }, - { - "epoch": 2.8382432120606107, - "grad_norm": 0.8370407819747925, - "learning_rate": 1.1006934524582501e-05, - "loss": -30.4662, - "step": 36900 - }, - { - "epoch": 2.842089070071533, - "grad_norm": 0.8367822170257568, - "learning_rate": 1.0747214502766019e-05, - "loss": -30.4662, - "step": 36950 - }, - { - "epoch": 2.845934928082455, - "grad_norm": 0.8438307642936707, - "learning_rate": 1.0487494480949537e-05, - "loss": -30.4661, - "step": 37000 - }, - { - "epoch": 2.845934928082455, - "eval_loss": -30.470365524291992, - "eval_runtime": 16.8139, - "eval_samples_per_second": 59.475, - "eval_steps_per_second": 14.869, - "step": 37000 - }, - { - "epoch": 2.8497807860933775, - "grad_norm": 0.8433590531349182, - "learning_rate": 1.0227774459133055e-05, - "loss": -30.4661, - "step": 37050 - }, - { - "epoch": 2.8536266441043, - "grad_norm": 0.6508035063743591, - "learning_rate": 9.968054437316573e-06, - "loss": -30.4663, - "step": 37100 - }, - { - "epoch": 2.857472502115222, - "grad_norm": 1.3801597356796265, - "learning_rate": 9.70833441550009e-06, - "loss": -30.4662, - "step": 37150 - }, - { - "epoch": 2.8613183601261443, - "grad_norm": 0.7447367906570435, - "learning_rate": 9.448614393683609e-06, - "loss": -30.4663, - "step": 37200 - }, - { - "epoch": 2.8651642181370662, - "grad_norm": 1.370768666267395, - "learning_rate": 9.188894371867128e-06, - "loss": -30.4661, - "step": 37250 - }, - { - "epoch": 2.8651642181370662, - "eval_loss": -30.470088958740234, - "eval_runtime": 16.7498, - "eval_samples_per_second": 59.702, - "eval_steps_per_second": 14.926, - "step": 37250 - }, - { - "epoch": 2.8690100761479886, - "grad_norm": 1.3880547285079956, - "learning_rate": 8.929174350050646e-06, - "loss": -30.4665, - "step": 37300 - }, - { - "epoch": 2.8728559341589106, - "grad_norm": 1.428084135055542, - "learning_rate": 8.669454328234164e-06, - "loss": -30.4665, - "step": 37350 - }, - { - "epoch": 2.876701792169833, - "grad_norm": 0.4759369492530823, - "learning_rate": 8.409734306417682e-06, - "loss": -30.4665, - "step": 37400 - }, - { - "epoch": 2.8805476501807554, - "grad_norm": 1.0323377847671509, - "learning_rate": 8.1500142846012e-06, - "loss": -30.4666, - "step": 37450 - }, - { - "epoch": 2.8843935081916774, - "grad_norm": 0.9753648638725281, - "learning_rate": 7.890294262784718e-06, - "loss": -30.4666, - "step": 37500 - }, - { - "epoch": 2.8843935081916774, - "eval_loss": -30.470531463623047, - "eval_runtime": 16.7944, - "eval_samples_per_second": 59.544, - "eval_steps_per_second": 14.886, - "step": 37500 - }, - { - "epoch": 2.8882393662026, - "grad_norm": 0.6514129042625427, - "learning_rate": 7.630574240968237e-06, - "loss": -30.4666, - "step": 37550 - }, - { - "epoch": 2.892085224213522, - "grad_norm": 1.0104998350143433, - "learning_rate": 7.3708542191517545e-06, - "loss": -30.4667, - "step": 37600 - }, - { - "epoch": 2.895931082224444, - "grad_norm": 0.713027834892273, - "learning_rate": 7.111134197335273e-06, - "loss": -30.4665, - "step": 37650 - }, - { - "epoch": 2.8997769402353666, - "grad_norm": 0.6001731157302856, - "learning_rate": 6.851414175518792e-06, - "loss": -30.4666, - "step": 37700 - }, - { - "epoch": 2.903622798246289, - "grad_norm": 0.6344665288925171, - "learning_rate": 6.591694153702309e-06, - "loss": -30.4664, - "step": 37750 - }, - { - "epoch": 2.903622798246289, - "eval_loss": -30.47093391418457, - "eval_runtime": 16.9622, - "eval_samples_per_second": 58.955, - "eval_steps_per_second": 14.739, - "step": 37750 - }, - { - "epoch": 2.907468656257211, - "grad_norm": 0.5279808640480042, - "learning_rate": 6.331974131885828e-06, - "loss": -30.4667, - "step": 37800 - }, - { - "epoch": 2.9113145142681334, - "grad_norm": 0.9036094546318054, - "learning_rate": 6.072254110069346e-06, - "loss": -30.4667, - "step": 37850 - }, - { - "epoch": 2.9151603722790553, - "grad_norm": 0.601686954498291, - "learning_rate": 5.812534088252864e-06, - "loss": -30.4668, - "step": 37900 - }, - { - "epoch": 2.9190062302899777, - "grad_norm": 1.0191963911056519, - "learning_rate": 5.552814066436382e-06, - "loss": -30.4667, - "step": 37950 - }, - { - "epoch": 2.9228520883008997, - "grad_norm": 0.9260501265525818, - "learning_rate": 5.2930940446198996e-06, - "loss": -30.4667, - "step": 38000 - }, - { - "epoch": 2.9228520883008997, - "eval_loss": -30.47071075439453, - "eval_runtime": 16.8261, - "eval_samples_per_second": 59.431, - "eval_steps_per_second": 14.858, - "step": 38000 - }, - { - "epoch": 2.926697946311822, - "grad_norm": 0.9110496044158936, - "learning_rate": 5.033374022803418e-06, - "loss": -30.4668, - "step": 38050 - }, - { - "epoch": 2.9305438043227445, - "grad_norm": 0.7347224950790405, - "learning_rate": 4.773654000986936e-06, - "loss": -30.4668, - "step": 38100 - }, - { - "epoch": 2.9343896623336665, - "grad_norm": 0.946358859539032, - "learning_rate": 4.513933979170454e-06, - "loss": -30.4669, - "step": 38150 - }, - { - "epoch": 2.938235520344589, - "grad_norm": 0.5716750621795654, - "learning_rate": 4.254213957353972e-06, - "loss": -30.4669, - "step": 38200 - }, - { - "epoch": 2.9420813783555113, - "grad_norm": 1.8405364751815796, - "learning_rate": 3.994493935537491e-06, - "loss": -30.4667, - "step": 38250 - }, - { - "epoch": 2.9420813783555113, - "eval_loss": -30.471132278442383, - "eval_runtime": 16.8489, - "eval_samples_per_second": 59.351, - "eval_steps_per_second": 14.838, - "step": 38250 - }, - { - "epoch": 2.9459272363664333, - "grad_norm": 0.6454173922538757, - "learning_rate": 3.734773913721009e-06, - "loss": -30.467, - "step": 38300 - }, - { - "epoch": 2.9497730943773557, - "grad_norm": 0.7232244610786438, - "learning_rate": 3.475053891904527e-06, - "loss": -30.4668, - "step": 38350 - }, - { - "epoch": 2.953618952388278, - "grad_norm": 1.0730299949645996, - "learning_rate": 3.215333870088045e-06, - "loss": -30.4668, - "step": 38400 - }, - { - "epoch": 2.9574648103992, - "grad_norm": 0.4886613190174103, - "learning_rate": 2.9556138482715634e-06, - "loss": -30.4669, - "step": 38450 - }, - { - "epoch": 2.961310668410122, - "grad_norm": 0.5141581296920776, - "learning_rate": 2.6958938264550813e-06, - "loss": -30.4669, - "step": 38500 - }, - { - "epoch": 2.961310668410122, - "eval_loss": -30.471107482910156, - "eval_runtime": 16.867, - "eval_samples_per_second": 59.287, - "eval_steps_per_second": 14.822, - "step": 38500 - }, - { - "epoch": 2.9651565264210444, - "grad_norm": 0.5108075737953186, - "learning_rate": 2.4361738046385997e-06, - "loss": -30.467, - "step": 38550 - }, - { - "epoch": 2.969002384431967, - "grad_norm": 0.608711838722229, - "learning_rate": 2.1764537828221176e-06, - "loss": -30.467, - "step": 38600 - }, - { - "epoch": 2.972848242442889, - "grad_norm": 0.9723970890045166, - "learning_rate": 1.916733761005636e-06, - "loss": -30.4672, - "step": 38650 - }, - { - "epoch": 2.976694100453811, - "grad_norm": 0.4361710846424103, - "learning_rate": 1.6570137391891543e-06, - "loss": -30.467, - "step": 38700 - }, - { - "epoch": 2.9805399584647336, - "grad_norm": 1.1543878316879272, - "learning_rate": 1.3972937173726722e-06, - "loss": -30.4669, - "step": 38750 - }, - { - "epoch": 2.9805399584647336, - "eval_loss": -30.47084617614746, - "eval_runtime": 16.7842, - "eval_samples_per_second": 59.58, - "eval_steps_per_second": 14.895, - "step": 38750 - }, - { - "epoch": 2.9843858164756556, - "grad_norm": 0.4299401640892029, - "learning_rate": 1.1375736955561905e-06, - "loss": -30.467, - "step": 38800 - }, - { - "epoch": 2.988231674486578, - "grad_norm": 0.8476674556732178, - "learning_rate": 8.778536737397087e-07, - "loss": -30.467, - "step": 38850 - }, - { - "epoch": 2.9920775324975004, - "grad_norm": 1.1194632053375244, - "learning_rate": 6.181336519232268e-07, - "loss": -30.4671, - "step": 38900 - }, - { - "epoch": 2.9959233905084224, - "grad_norm": 0.7896573543548584, - "learning_rate": 3.584136301067449e-07, - "loss": -30.4671, - "step": 38950 - }, - { - "epoch": 2.999769248519345, - "grad_norm": 0.32893452048301697, - "learning_rate": 9.869360829026311e-08, - "loss": -30.4671, - "step": 39000 - }, - { - "epoch": 2.999769248519345, - "eval_loss": -30.471084594726562, - "eval_runtime": 16.7188, - "eval_samples_per_second": 59.813, - "eval_steps_per_second": 14.953, - "step": 39000 } ], "logging_steps": 50,