{
  "best_global_step": 38000,
  "best_metric": 0.98431396484375,
  "best_model_checkpoint": "./ar-diffusion-checkpoints-fixed/checkpoint-38000",
  "epoch": 2.999769248519345,
  "eval_steps": 250,
  "global_step": 39000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003845858010922237,
      "grad_norm": 10.111166000366211,
      "learning_rate": 1.76e-05,
      "loss": 13.1594,
      "step": 50
    },
    {
      "epoch": 0.007691716021844474,
      "grad_norm": 5.981163501739502,
      "learning_rate": 3.76e-05,
      "loss": 7.7407,
      "step": 100
    },
    {
      "epoch": 0.01153757403276671,
      "grad_norm": 7.169628143310547,
      "learning_rate": 5.76e-05,
      "loss": 6.5841,
      "step": 150
    },
    {
      "epoch": 0.015383432043688947,
      "grad_norm": 9.481369972229004,
      "learning_rate": 7.76e-05,
      "loss": 5.8929,
      "step": 200
    },
    {
      "epoch": 0.019229290054611183,
      "grad_norm": 5.348191261291504,
      "learning_rate": 9.76e-05,
      "loss": 5.8651,
      "step": 250
    },
    {
      "epoch": 0.019229290054611183,
      "eval_loss": 5.691133975982666,
      "eval_runtime": 19.1859,
      "eval_samples_per_second": 52.122,
      "eval_steps_per_second": 13.03,
      "step": 250
    },
    {
      "epoch": 0.02307514806553342,
      "grad_norm": 9.455660820007324,
      "learning_rate": 0.0001176,
      "loss": 5.3783,
      "step": 300
    },
    {
      "epoch": 0.02692100607645566,
      "grad_norm": 11.51925277709961,
      "learning_rate": 0.00013759999999999998,
      "loss": 5.0764,
      "step": 350
    },
    {
      "epoch": 0.030766864087377895,
      "grad_norm": 5.797102928161621,
      "learning_rate": 0.0001576,
      "loss": 5.2967,
      "step": 400
    },
    {
      "epoch": 0.03461272209830013,
      "grad_norm": 3.018831253051758,
      "learning_rate": 0.0001776,
      "loss": 5.1947,
      "step": 450
    },
    {
      "epoch": 0.038458580109222366,
      "grad_norm": 5.181191921234131,
      "learning_rate": 0.0001976,
      "loss": 5.2159,
      "step": 500
    },
    {
      "epoch": 0.038458580109222366,
      "eval_loss": 5.163902759552002,
      "eval_runtime": 19.7141,
      "eval_samples_per_second": 50.725,
      "eval_steps_per_second": 12.681,
      "step": 500
    },
    {
      "epoch": 0.0423044381201446,
      "grad_norm": 6.368561744689941,
      "learning_rate": 0.0001997714463808015,
      "loss": 5.0723,
      "step": 550
    },
    {
      "epoch": 0.04615029613106684,
      "grad_norm": 4.666464328765869,
      "learning_rate": 0.00019951172635898504,
      "loss": 5.0754,
      "step": 600
    },
    {
      "epoch": 0.04999615414198908,
      "grad_norm": 9.395013809204102,
      "learning_rate": 0.00019925200633716855,
      "loss": 4.7718,
      "step": 650
    },
    {
      "epoch": 0.05384201215291132,
      "grad_norm": 4.279386520385742,
      "learning_rate": 0.00019899228631535204,
      "loss": 4.781,
      "step": 700
    },
    {
      "epoch": 0.05768787016383355,
      "grad_norm": 4.274901866912842,
      "learning_rate": 0.00019873256629353558,
      "loss": 4.9529,
      "step": 750
    },
    {
      "epoch": 0.05768787016383355,
      "eval_loss": 4.844481945037842,
      "eval_runtime": 18.9171,
      "eval_samples_per_second": 52.862,
      "eval_steps_per_second": 13.216,
      "step": 750
    },
    {
      "epoch": 0.06153372817475579,
      "grad_norm": 3.241445541381836,
      "learning_rate": 0.0001984728462717191,
      "loss": 5.0586,
      "step": 800
    },
    {
      "epoch": 0.06537958618567802,
      "grad_norm": 5.880845546722412,
      "learning_rate": 0.00019821312624990263,
      "loss": 4.952,
      "step": 850
    },
    {
      "epoch": 0.06922544419660026,
      "grad_norm": 3.81998610496521,
      "learning_rate": 0.00019795340622808614,
      "loss": 5.0012,
      "step": 900
    },
    {
      "epoch": 0.0730713022075225,
      "grad_norm": 4.52741003036499,
      "learning_rate": 0.00019769368620626966,
      "loss": 4.7775,
      "step": 950
    },
    {
      "epoch": 0.07691716021844473,
      "grad_norm": 27.68866729736328,
      "learning_rate": 0.00019743916058488948,
      "loss": 4.9852,
      "step": 1000
    },
    {
      "epoch": 0.07691716021844473,
      "eval_loss": 4.836514472961426,
      "eval_runtime": 19.7103,
      "eval_samples_per_second": 50.735,
      "eval_steps_per_second": 12.684,
      "step": 1000
    },
    {
      "epoch": 0.08076301822936698,
      "grad_norm": 5.79191255569458,
      "learning_rate": 0.00019717944056307302,
      "loss": 4.6731,
      "step": 1050
    },
    {
      "epoch": 0.0846088762402892,
      "grad_norm": 4.957877159118652,
      "learning_rate": 0.00019691972054125653,
      "loss": 4.7966,
      "step": 1100
    },
    {
      "epoch": 0.08845473425121145,
      "grad_norm": 3.2968597412109375,
      "learning_rate": 0.00019666000051944005,
      "loss": 4.752,
      "step": 1150
    },
    {
      "epoch": 0.09230059226213368,
      "grad_norm": 6.059363842010498,
      "learning_rate": 0.00019640028049762359,
      "loss": 4.7368,
      "step": 1200
    },
    {
      "epoch": 0.09614645027305592,
      "grad_norm": 5.9793171882629395,
      "learning_rate": 0.0001961405604758071,
      "loss": 4.9613,
      "step": 1250
    },
    {
      "epoch": 0.09614645027305592,
      "eval_loss": 4.76948356628418,
      "eval_runtime": 18.6703,
      "eval_samples_per_second": 53.561,
      "eval_steps_per_second": 13.39,
      "step": 1250
    },
    {
      "epoch": 0.09999230828397816,
      "grad_norm": 7.604544162750244,
      "learning_rate": 0.0001958808404539906,
      "loss": 4.8365,
      "step": 1300
    },
    {
      "epoch": 0.10383816629490039,
      "grad_norm": 6.4756083488464355,
      "learning_rate": 0.00019562112043217412,
      "loss": 4.8381,
      "step": 1350
    },
    {
      "epoch": 0.10768402430582263,
      "grad_norm": 3.605341672897339,
      "learning_rate": 0.00019536140041035764,
      "loss": 4.7004,
      "step": 1400
    },
    {
      "epoch": 0.11152988231674486,
      "grad_norm": 3.4805853366851807,
      "learning_rate": 0.00019510168038854115,
      "loss": 4.7293,
      "step": 1450
    },
    {
      "epoch": 0.1153757403276671,
      "grad_norm": 4.5123796463012695,
      "learning_rate": 0.0001948419603667247,
      "loss": 4.8315,
      "step": 1500
    },
    {
      "epoch": 0.1153757403276671,
      "eval_loss": 4.784451961517334,
      "eval_runtime": 18.5259,
      "eval_samples_per_second": 53.979,
      "eval_steps_per_second": 13.495,
      "step": 1500
    },
    {
      "epoch": 0.11922159833858934,
      "grad_norm": 3.392902135848999,
      "learning_rate": 0.0001945822403449082,
      "loss": 4.859,
      "step": 1550
    },
    {
      "epoch": 0.12306745634951158,
      "grad_norm": 4.783381938934326,
      "learning_rate": 0.00019432252032309172,
      "loss": 4.7791,
      "step": 1600
    },
    {
      "epoch": 0.12691331436043382,
      "grad_norm": 3.96708345413208,
      "learning_rate": 0.00019406280030127523,
      "loss": 4.7743,
      "step": 1650
    },
    {
      "epoch": 0.13075917237135604,
      "grad_norm": 5.381892681121826,
      "learning_rate": 0.00019380308027945874,
      "loss": 4.8915,
      "step": 1700
    },
    {
      "epoch": 0.13460503038227828,
      "grad_norm": 3.269899606704712,
      "learning_rate": 0.00019354336025764228,
      "loss": 4.8823,
      "step": 1750
    },
    {
      "epoch": 0.13460503038227828,
      "eval_loss": 4.689135551452637,
      "eval_runtime": 18.4665,
      "eval_samples_per_second": 54.152,
      "eval_steps_per_second": 13.538,
      "step": 1750
    },
    {
      "epoch": 0.13845088839320052,
      "grad_norm": 2.5163233280181885,
      "learning_rate": 0.0001932836402358258,
      "loss": 4.8513,
      "step": 1800
    },
    {
      "epoch": 0.14229674640412276,
      "grad_norm": 4.244402885437012,
      "learning_rate": 0.0001930239202140093,
      "loss": 4.6855,
      "step": 1850
    },
    {
      "epoch": 0.146142604415045,
      "grad_norm": 6.551025867462158,
      "learning_rate": 0.00019276420019219285,
      "loss": 4.8173,
      "step": 1900
    },
    {
      "epoch": 0.14998846242596722,
      "grad_norm": 2.791435956954956,
      "learning_rate": 0.00019250448017037633,
      "loss": 4.6983,
      "step": 1950
    },
    {
      "epoch": 0.15383432043688947,
      "grad_norm": 3.4012036323547363,
      "learning_rate": 0.00019224476014855984,
      "loss": 4.8178,
      "step": 2000
    },
    {
      "epoch": 0.15383432043688947,
      "eval_loss": 4.666446208953857,
      "eval_runtime": 18.3775,
      "eval_samples_per_second": 54.414,
      "eval_steps_per_second": 13.604,
      "step": 2000
    },
    {
      "epoch": 0.1576801784478117,
      "grad_norm": 6.46567964553833,
      "learning_rate": 0.00019198504012674338,
      "loss": 4.8124,
      "step": 2050
    },
    {
      "epoch": 0.16152603645873395,
      "grad_norm": 4.300732135772705,
      "learning_rate": 0.0001917253201049269,
      "loss": 4.7302,
      "step": 2100
    },
    {
      "epoch": 0.16537189446965617,
      "grad_norm": 4.140190601348877,
      "learning_rate": 0.0001914656000831104,
      "loss": 4.6498,
      "step": 2150
    },
    {
      "epoch": 0.1692177524805784,
      "grad_norm": 3.221662998199463,
      "learning_rate": 0.00019120588006129395,
      "loss": 4.6428,
      "step": 2200
    },
    {
      "epoch": 0.17306361049150065,
      "grad_norm": 2.6079111099243164,
      "learning_rate": 0.00019094616003947746,
      "loss": 4.6115,
      "step": 2250
    },
    {
      "epoch": 0.17306361049150065,
      "eval_loss": 4.661706924438477,
      "eval_runtime": 18.7413,
      "eval_samples_per_second": 53.358,
      "eval_steps_per_second": 13.34,
      "step": 2250
    },
    {
      "epoch": 0.1769094685024229,
      "grad_norm": 4.190133094787598,
      "learning_rate": 0.00019068644001766095,
      "loss": 4.7041,
      "step": 2300
    },
    {
      "epoch": 0.18075532651334514,
      "grad_norm": 5.242035388946533,
      "learning_rate": 0.0001904267199958445,
      "loss": 4.6656,
      "step": 2350
    },
    {
      "epoch": 0.18460118452426735,
      "grad_norm": 6.203541278839111,
      "learning_rate": 0.000190166999974028,
      "loss": 4.6835,
      "step": 2400
    },
    {
      "epoch": 0.1884470425351896,
      "grad_norm": 2.8591034412384033,
      "learning_rate": 0.00018990727995221151,
      "loss": 4.6751,
      "step": 2450
    },
    {
      "epoch": 0.19229290054611184,
      "grad_norm": 5.204433441162109,
      "learning_rate": 0.00018964755993039505,
      "loss": 4.7301,
      "step": 2500
    },
    {
      "epoch": 0.19229290054611184,
      "eval_loss": 4.597048282623291,
      "eval_runtime": 18.5098,
      "eval_samples_per_second": 54.025,
      "eval_steps_per_second": 13.506,
      "step": 2500
    },
    {
      "epoch": 0.19613875855703408,
      "grad_norm": 2.59525465965271,
      "learning_rate": 0.00018938783990857857,
      "loss": 4.4364,
      "step": 2550
    },
    {
      "epoch": 0.19998461656795632,
      "grad_norm": 3.844686985015869,
      "learning_rate": 0.00018912811988676208,
      "loss": 4.6437,
      "step": 2600
    },
    {
      "epoch": 0.20383047457887854,
      "grad_norm": 3.4633946418762207,
      "learning_rate": 0.0001888683998649456,
      "loss": 4.5016,
      "step": 2650
    },
    {
      "epoch": 0.20767633258980078,
      "grad_norm": 3.7852296829223633,
      "learning_rate": 0.0001886086798431291,
      "loss": 4.5845,
      "step": 2700
    },
    {
      "epoch": 0.21152219060072303,
      "grad_norm": 3.8716065883636475,
      "learning_rate": 0.00018834895982131265,
      "loss": 4.3669,
      "step": 2750
    },
    {
      "epoch": 0.21152219060072303,
      "eval_loss": 4.602295875549316,
      "eval_runtime": 18.4747,
      "eval_samples_per_second": 54.128,
      "eval_steps_per_second": 13.532,
      "step": 2750
    },
    {
      "epoch": 0.21536804861164527,
      "grad_norm": 3.9932167530059814,
      "learning_rate": 0.00018808923979949616,
      "loss": 4.4545,
      "step": 2800
    },
    {
      "epoch": 0.21921390662256748,
      "grad_norm": 4.182176113128662,
      "learning_rate": 0.00018782951977767967,
      "loss": 4.5355,
      "step": 2850
    },
    {
      "epoch": 0.22305976463348973,
      "grad_norm": 3.776895523071289,
      "learning_rate": 0.00018756979975586318,
      "loss": 4.6859,
      "step": 2900
    },
    {
      "epoch": 0.22690562264441197,
      "grad_norm": 3.9219324588775635,
      "learning_rate": 0.0001873100797340467,
      "loss": 4.5186,
      "step": 2950
    },
    {
      "epoch": 0.2307514806553342,
      "grad_norm": 3.7879323959350586,
      "learning_rate": 0.0001870503597122302,
      "loss": 4.612,
      "step": 3000
    },
    {
      "epoch": 0.2307514806553342,
      "eval_loss": 4.579595565795898,
      "eval_runtime": 18.476,
      "eval_samples_per_second": 54.124,
      "eval_steps_per_second": 13.531,
      "step": 3000
    },
    {
      "epoch": 0.23459733866625646,
      "grad_norm": 6.408226490020752,
      "learning_rate": 0.00018679063969041375,
      "loss": 4.6614,
      "step": 3050
    },
    {
      "epoch": 0.23844319667717867,
      "grad_norm": 6.529899597167969,
      "learning_rate": 0.00018653091966859726,
      "loss": 4.5576,
      "step": 3100
    },
    {
      "epoch": 0.2422890546881009,
      "grad_norm": 4.417705059051514,
      "learning_rate": 0.00018627119964678078,
      "loss": 4.5466,
      "step": 3150
    },
    {
      "epoch": 0.24613491269902316,
      "grad_norm": 3.052746295928955,
      "learning_rate": 0.0001860114796249643,
      "loss": 4.6959,
      "step": 3200
    },
    {
      "epoch": 0.2499807707099454,
      "grad_norm": 5.121955871582031,
      "learning_rate": 0.0001857517596031478,
      "loss": 4.6796,
      "step": 3250
    },
    {
      "epoch": 0.2499807707099454,
      "eval_loss": 4.576270580291748,
      "eval_runtime": 18.8896,
      "eval_samples_per_second": 52.939,
      "eval_steps_per_second": 13.235,
      "step": 3250
    },
    {
      "epoch": 0.25382662872086764,
      "grad_norm": 2.772292375564575,
      "learning_rate": 0.00018549203958133131,
      "loss": 4.534,
      "step": 3300
    },
    {
      "epoch": 0.2576724867317899,
      "grad_norm": 6.105401039123535,
      "learning_rate": 0.00018523231955951485,
      "loss": 4.4958,
      "step": 3350
    },
    {
      "epoch": 0.26151834474271207,
      "grad_norm": 2.742037057876587,
      "learning_rate": 0.00018497259953769837,
      "loss": 4.5835,
      "step": 3400
    },
    {
      "epoch": 0.2653642027536343,
      "grad_norm": 3.6125235557556152,
      "learning_rate": 0.0001847128795158819,
      "loss": 4.5917,
      "step": 3450
    },
    {
      "epoch": 0.26921006076455656,
      "grad_norm": 3.8596699237823486,
      "learning_rate": 0.00018445315949406542,
      "loss": 4.4909,
      "step": 3500
    },
    {
      "epoch": 0.26921006076455656,
      "eval_loss": 4.520543098449707,
      "eval_runtime": 18.7111,
      "eval_samples_per_second": 53.444,
      "eval_steps_per_second": 13.361,
      "step": 3500
    },
    {
      "epoch": 0.2730559187754788,
      "grad_norm": 3.639690399169922,
      "learning_rate": 0.0001841934394722489,
      "loss": 4.5127,
      "step": 3550
    },
    {
      "epoch": 0.27690177678640104,
      "grad_norm": 3.864473342895508,
      "learning_rate": 0.00018393371945043245,
      "loss": 4.599,
      "step": 3600
    },
    {
      "epoch": 0.2807476347973233,
      "grad_norm": 4.662705421447754,
      "learning_rate": 0.00018367399942861596,
      "loss": 4.5951,
      "step": 3650
    },
    {
      "epoch": 0.28459349280824553,
      "grad_norm": 3.066333532333374,
      "learning_rate": 0.00018341427940679947,
      "loss": 4.5583,
      "step": 3700
    },
    {
      "epoch": 0.2884393508191678,
      "grad_norm": 4.054274082183838,
      "learning_rate": 0.000183154559384983,
      "loss": 4.5732,
      "step": 3750
    },
    {
      "epoch": 0.2884393508191678,
      "eval_loss": 4.569947242736816,
      "eval_runtime": 18.5242,
      "eval_samples_per_second": 53.983,
      "eval_steps_per_second": 13.496,
      "step": 3750
    },
    {
      "epoch": 0.29228520883009,
      "grad_norm": 6.440130710601807,
      "learning_rate": 0.00018289483936316652,
      "loss": 4.6025,
      "step": 3800
    },
    {
      "epoch": 0.2961310668410122,
      "grad_norm": 5.849060535430908,
      "learning_rate": 0.00018263511934135004,
      "loss": 4.4932,
      "step": 3850
    },
    {
      "epoch": 0.29997692485193445,
      "grad_norm": 7.0537800788879395,
      "learning_rate": 0.00018237539931953355,
      "loss": 4.6986,
      "step": 3900
    },
    {
      "epoch": 0.3038227828628567,
      "grad_norm": 5.134806156158447,
      "learning_rate": 0.00018211567929771706,
      "loss": 4.5074,
      "step": 3950
    },
    {
      "epoch": 0.30766864087377893,
      "grad_norm": 2.872307777404785,
      "learning_rate": 0.00018185595927590058,
      "loss": 4.547,
      "step": 4000
    },
    {
      "epoch": 0.30766864087377893,
      "eval_loss": 4.5459160804748535,
      "eval_runtime": 18.4232,
      "eval_samples_per_second": 54.279,
      "eval_steps_per_second": 13.57,
      "step": 4000
    },
    {
      "epoch": 0.3115144988847012,
      "grad_norm": 5.374965190887451,
      "learning_rate": 0.00018159623925408412,
      "loss": 4.5327,
      "step": 4050
    },
    {
      "epoch": 0.3153603568956234,
      "grad_norm": 4.99652624130249,
      "learning_rate": 0.00018133651923226763,
      "loss": 4.3966,
      "step": 4100
    },
    {
      "epoch": 0.31920621490654566,
      "grad_norm": 6.257124423980713,
      "learning_rate": 0.00018107679921045114,
      "loss": 4.5792,
      "step": 4150
    },
    {
      "epoch": 0.3230520729174679,
      "grad_norm": 8.056533813476562,
      "learning_rate": 0.00018081707918863465,
      "loss": 4.6401,
      "step": 4200
    },
    {
      "epoch": 0.32689793092839015,
      "grad_norm": 4.024567127227783,
      "learning_rate": 0.00018055735916681817,
      "loss": 4.5516,
      "step": 4250
    },
    {
      "epoch": 0.32689793092839015,
      "eval_loss": 4.558110237121582,
      "eval_runtime": 18.5343,
      "eval_samples_per_second": 53.954,
      "eval_steps_per_second": 13.488,
      "step": 4250
    },
    {
      "epoch": 0.33074378893931233,
      "grad_norm": 2.4326066970825195,
      "learning_rate": 0.0001802976391450017,
      "loss": 4.5761,
      "step": 4300
    },
    {
      "epoch": 0.3345896469502346,
      "grad_norm": 2.7615299224853516,
      "learning_rate": 0.00018003791912318522,
      "loss": 4.4291,
      "step": 4350
    },
    {
      "epoch": 0.3384355049611568,
      "grad_norm": 3.9387362003326416,
      "learning_rate": 0.00017977819910136873,
      "loss": 4.5422,
      "step": 4400
    },
    {
      "epoch": 0.34228136297207906,
      "grad_norm": 23.72602653503418,
      "learning_rate": 0.00017951847907955227,
      "loss": 4.4379,
      "step": 4450
    },
    {
      "epoch": 0.3461272209830013,
      "grad_norm": 2.968930959701538,
      "learning_rate": 0.0001792639534581721,
      "loss": 4.433,
      "step": 4500
    },
    {
      "epoch": 0.3461272209830013,
      "eval_loss": 4.5359063148498535,
      "eval_runtime": 18.5422,
      "eval_samples_per_second": 53.931,
      "eval_steps_per_second": 13.483,
      "step": 4500
    },
    {
      "epoch": 0.34997307899392355,
      "grad_norm": 6.404330730438232,
      "learning_rate": 0.0001790042334363556,
      "loss": 4.5673,
      "step": 4550
    },
    {
      "epoch": 0.3538189370048458,
      "grad_norm": 10.212136268615723,
      "learning_rate": 0.00017874451341453912,
      "loss": 4.6249,
      "step": 4600
    },
    {
      "epoch": 0.35766479501576803,
      "grad_norm": 4.401816368103027,
      "learning_rate": 0.00017848479339272266,
      "loss": 4.6305,
      "step": 4650
    },
    {
      "epoch": 0.3615106530266903,
      "grad_norm": 4.710996150970459,
      "learning_rate": 0.00017822507337090617,
      "loss": 4.3731,
      "step": 4700
    },
    {
      "epoch": 0.36535651103761246,
      "grad_norm": 3.150613307952881,
      "learning_rate": 0.0001779653533490897,
      "loss": 4.4491,
      "step": 4750
    },
    {
      "epoch": 0.36535651103761246,
      "eval_loss": 4.564510345458984,
      "eval_runtime": 18.5575,
      "eval_samples_per_second": 53.886,
      "eval_steps_per_second": 13.472,
      "step": 4750
    },
    {
      "epoch": 0.3692023690485347,
      "grad_norm": 4.828207492828369,
      "learning_rate": 0.0001777056333272732,
      "loss": 4.5923,
      "step": 4800
    },
    {
      "epoch": 0.37304822705945695,
      "grad_norm": 3.780848264694214,
      "learning_rate": 0.0001774459133054567,
      "loss": 4.5544,
      "step": 4850
    },
    {
      "epoch": 0.3768940850703792,
      "grad_norm": 4.04913854598999,
      "learning_rate": 0.00017718619328364023,
      "loss": 4.5271,
      "step": 4900
    },
    {
      "epoch": 0.38073994308130144,
      "grad_norm": 4.097137451171875,
      "learning_rate": 0.00017692647326182377,
      "loss": 4.4929,
      "step": 4950
    },
    {
      "epoch": 0.3845858010922237,
      "grad_norm": 4.65788459777832,
      "learning_rate": 0.00017666675324000728,
      "loss": 4.5888,
      "step": 5000
    },
    {
      "epoch": 0.3845858010922237,
      "eval_loss": 4.465761661529541,
      "eval_runtime": 18.6518,
      "eval_samples_per_second": 53.614,
      "eval_steps_per_second": 13.404,
      "step": 5000
    },
    {
      "epoch": 0.3884316591031459,
      "grad_norm": 3.0376453399658203,
      "learning_rate": 0.00017640703321819082,
      "loss": 4.574,
      "step": 5050
    },
    {
      "epoch": 0.39227751711406816,
      "grad_norm": 2.6457693576812744,
      "learning_rate": 0.00017614731319637433,
      "loss": 4.5782,
      "step": 5100
    },
    {
      "epoch": 0.3961233751249904,
      "grad_norm": 4.438416957855225,
      "learning_rate": 0.00017588759317455782,
      "loss": 4.5586,
      "step": 5150
    },
    {
      "epoch": 0.39996923313591265,
      "grad_norm": 5.325882911682129,
      "learning_rate": 0.00017562787315274136,
      "loss": 4.5136,
      "step": 5200
    },
    {
      "epoch": 0.40381509114683484,
      "grad_norm": 1.816029667854309,
      "learning_rate": 0.00017536815313092487,
      "loss": 4.4819,
      "step": 5250
    },
    {
      "epoch": 0.40381509114683484,
      "eval_loss": 4.561609268188477,
      "eval_runtime": 18.5623,
      "eval_samples_per_second": 53.873,
      "eval_steps_per_second": 13.468,
      "step": 5250
    },
    {
      "epoch": 0.4076609491577571,
      "grad_norm": 4.011863708496094,
      "learning_rate": 0.00017510843310910838,
      "loss": 4.4284,
      "step": 5300
    },
    {
      "epoch": 0.4115068071686793,
      "grad_norm": 10.169037818908691,
      "learning_rate": 0.00017484871308729192,
      "loss": 4.5414,
      "step": 5350
    },
    {
      "epoch": 0.41535266517960157,
      "grad_norm": 3.9185502529144287,
      "learning_rate": 0.00017458899306547544,
      "loss": 4.6659,
      "step": 5400
    },
    {
      "epoch": 0.4191985231905238,
      "grad_norm": 2.873530864715576,
      "learning_rate": 0.00017432927304365895,
      "loss": 4.4482,
      "step": 5450
    },
    {
      "epoch": 0.42304438120144605,
      "grad_norm": 4.379590034484863,
      "learning_rate": 0.00017406955302184246,
      "loss": 4.5501,
      "step": 5500
    },
    {
      "epoch": 0.42304438120144605,
      "eval_loss": 4.513929843902588,
      "eval_runtime": 18.6196,
      "eval_samples_per_second": 53.707,
      "eval_steps_per_second": 13.427,
      "step": 5500
    },
    {
      "epoch": 0.4268902392123683,
      "grad_norm": 4.2113752365112305,
      "learning_rate": 0.00017380983300002597,
      "loss": 4.5454,
      "step": 5550
    },
    {
      "epoch": 0.43073609722329054,
      "grad_norm": 4.782048225402832,
      "learning_rate": 0.0001735501129782095,
      "loss": 4.4705,
      "step": 5600
    },
    {
      "epoch": 0.4345819552342128,
      "grad_norm": 2.6753036975860596,
      "learning_rate": 0.00017329039295639303,
      "loss": 4.4592,
      "step": 5650
    },
    {
      "epoch": 0.43842781324513497,
      "grad_norm": 5.012415885925293,
      "learning_rate": 0.00017303067293457654,
      "loss": 4.623,
      "step": 5700
    },
    {
      "epoch": 0.4422736712560572,
      "grad_norm": 3.0666699409484863,
      "learning_rate": 0.00017277095291276005,
      "loss": 4.4325,
      "step": 5750
    },
    {
      "epoch": 0.4422736712560572,
      "eval_loss": 4.489352703094482,
      "eval_runtime": 18.4948,
      "eval_samples_per_second": 54.069,
      "eval_steps_per_second": 13.517,
      "step": 5750
    },
    {
      "epoch": 0.44611952926697945,
      "grad_norm": 6.8570876121521,
      "learning_rate": 0.00017251123289094357,
      "loss": 4.5179,
      "step": 5800
    },
    {
      "epoch": 0.4499653872779017,
      "grad_norm": 7.190755844116211,
      "learning_rate": 0.00017225151286912708,
      "loss": 4.5877,
      "step": 5850
    },
    {
      "epoch": 0.45381124528882394,
      "grad_norm": 4.404886722564697,
      "learning_rate": 0.00017199179284731062,
      "loss": 4.4072,
      "step": 5900
    },
    {
      "epoch": 0.4576571032997462,
      "grad_norm": 3.0543084144592285,
      "learning_rate": 0.00017173207282549413,
      "loss": 4.4656,
      "step": 5950
    },
    {
      "epoch": 0.4615029613106684,
      "grad_norm": 6.7454514503479,
      "learning_rate": 0.00017147235280367764,
      "loss": 4.5688,
      "step": 6000
    },
    {
      "epoch": 0.4615029613106684,
      "eval_loss": 4.480144023895264,
      "eval_runtime": 18.5584,
      "eval_samples_per_second": 53.884,
      "eval_steps_per_second": 13.471,
      "step": 6000
    },
    {
      "epoch": 0.46534881932159067,
      "grad_norm": 5.5196661949157715,
      "learning_rate": 0.00017121263278186118,
      "loss": 4.5703,
      "step": 6050
    },
    {
      "epoch": 0.4691946773325129,
      "grad_norm": 4.253966331481934,
      "learning_rate": 0.00017095291276004467,
      "loss": 4.6012,
      "step": 6100
    },
    {
      "epoch": 0.4730405353434351,
      "grad_norm": 2.459376096725464,
      "learning_rate": 0.00017069319273822818,
      "loss": 4.5002,
      "step": 6150
    },
    {
      "epoch": 0.47688639335435734,
      "grad_norm": 4.933450698852539,
      "learning_rate": 0.00017043347271641172,
      "loss": 4.5703,
      "step": 6200
    },
    {
      "epoch": 0.4807322513652796,
      "grad_norm": 4.511186599731445,
      "learning_rate": 0.00017017375269459524,
      "loss": 4.4665,
      "step": 6250
    },
    {
      "epoch": 0.4807322513652796,
      "eval_loss": 4.485811233520508,
      "eval_runtime": 18.522,
      "eval_samples_per_second": 53.99,
      "eval_steps_per_second": 13.497,
      "step": 6250
    },
    {
      "epoch": 0.4845781093762018,
      "grad_norm": 5.634074687957764,
      "learning_rate": 0.00016991403267277875,
      "loss": 4.4616,
      "step": 6300
    },
    {
      "epoch": 0.48842396738712407,
      "grad_norm": 3.319650650024414,
      "learning_rate": 0.0001696543126509623,
      "loss": 4.4836,
      "step": 6350
    },
    {
      "epoch": 0.4922698253980463,
      "grad_norm": 3.306976079940796,
      "learning_rate": 0.00016939459262914577,
      "loss": 4.5256,
      "step": 6400
    },
    {
      "epoch": 0.49611568340896856,
      "grad_norm": 4.1797308921813965,
      "learning_rate": 0.0001691348726073293,
      "loss": 4.3822,
      "step": 6450
    },
    {
      "epoch": 0.4999615414198908,
      "grad_norm": 3.2349929809570312,
      "learning_rate": 0.00016887515258551283,
      "loss": 4.4384,
      "step": 6500
    },
    {
      "epoch": 0.4999615414198908,
      "eval_loss": 4.485826015472412,
      "eval_runtime": 18.614,
      "eval_samples_per_second": 53.723,
      "eval_steps_per_second": 13.431,
      "step": 6500
    },
    {
      "epoch": 0.503807399430813,
      "grad_norm": 3.325056791305542,
      "learning_rate": 0.00016861543256369634,
      "loss": 4.618,
      "step": 6550
    },
    {
      "epoch": 0.5076532574417353,
      "grad_norm": 4.026259899139404,
      "learning_rate": 0.0001683609069423162,
      "loss": 4.5485,
      "step": 6600
    },
    {
      "epoch": 0.5114991154526575,
      "grad_norm": 3.1270413398742676,
      "learning_rate": 0.00016810118692049973,
      "loss": 4.4426,
      "step": 6650
    },
    {
      "epoch": 0.5153449734635798,
      "grad_norm": 5.264435768127441,
      "learning_rate": 0.00016784146689868324,
      "loss": 4.3544,
      "step": 6700
    },
    {
      "epoch": 0.519190831474502,
      "grad_norm": 4.349465847015381,
      "learning_rate": 0.00016758694127730307,
      "loss": 4.5912,
      "step": 6750
    },
    {
      "epoch": 0.519190831474502,
      "eval_loss": 4.503914833068848,
      "eval_runtime": 18.5932,
      "eval_samples_per_second": 53.783,
      "eval_steps_per_second": 13.446,
      "step": 6750
    },
    {
      "epoch": 0.5230366894854241,
      "grad_norm": 2.5264992713928223,
      "learning_rate": 0.00016732722125548658,
      "loss": 4.3703,
      "step": 6800
    },
    {
      "epoch": 0.5268825474963464,
      "grad_norm": 4.934820175170898,
      "learning_rate": 0.00016706750123367012,
      "loss": 4.5652,
      "step": 6850
    },
    {
      "epoch": 0.5307284055072686,
      "grad_norm": 4.21425724029541,
      "learning_rate": 0.00016680778121185363,
      "loss": 4.4684,
      "step": 6900
    },
    {
      "epoch": 0.5345742635181909,
      "grad_norm": 5.111146926879883,
      "learning_rate": 0.00016654806119003715,
      "loss": 4.5119,
      "step": 6950
    },
    {
      "epoch": 0.5384201215291131,
      "grad_norm": 4.563775062561035,
      "learning_rate": 0.00016628834116822069,
      "loss": 4.489,
      "step": 7000
    },
    {
      "epoch": 0.5384201215291131,
      "eval_loss": 4.429446220397949,
      "eval_runtime": 18.6345,
      "eval_samples_per_second": 53.664,
      "eval_steps_per_second": 13.416,
      "step": 7000
    },
    {
      "epoch": 0.5422659795400354,
      "grad_norm": 6.055607795715332,
      "learning_rate": 0.00016602862114640417,
      "loss": 4.3805,
      "step": 7050
    },
    {
      "epoch": 0.5461118375509576,
      "grad_norm": 3.190605878829956,
      "learning_rate": 0.00016576890112458768,
      "loss": 4.561,
      "step": 7100
    },
    {
      "epoch": 0.5499576955618799,
      "grad_norm": 2.95857834815979,
      "learning_rate": 0.00016550918110277122,
      "loss": 4.5957,
      "step": 7150
    },
    {
      "epoch": 0.5538035535728021,
      "grad_norm": 5.055838108062744,
      "learning_rate": 0.00016524946108095474,
      "loss": 4.3916,
      "step": 7200
    },
    {
      "epoch": 0.5576494115837243,
      "grad_norm": 6.25083589553833,
      "learning_rate": 0.00016498974105913825,
      "loss": 4.6649,
      "step": 7250
    },
    {
      "epoch": 0.5576494115837243,
      "eval_loss": 4.42822265625,
      "eval_runtime": 18.5876,
      "eval_samples_per_second": 53.799,
      "eval_steps_per_second": 13.45,
      "step": 7250
    },
    {
      "epoch": 0.5614952695946466,
      "grad_norm": 3.6657636165618896,
      "learning_rate": 0.0001647300210373218,
      "loss": 4.5059,
      "step": 7300
    },
    {
      "epoch": 0.5653411276055688,
      "grad_norm": 3.6100645065307617,
      "learning_rate": 0.0001644703010155053,
      "loss": 4.4379,
      "step": 7350
    },
    {
      "epoch": 0.5691869856164911,
      "grad_norm": 3.535804271697998,
      "learning_rate": 0.0001642105809936888,
      "loss": 4.3881,
      "step": 7400
    },
    {
      "epoch": 0.5730328436274132,
      "grad_norm": 2.9636013507843018,
      "learning_rate": 0.00016395086097187233,
      "loss": 4.491,
      "step": 7450
    },
    {
      "epoch": 0.5768787016383355,
      "grad_norm": 2.9023678302764893,
      "learning_rate": 0.00016369114095005584,
      "loss": 4.5567,
      "step": 7500
    },
    {
      "epoch": 0.5768787016383355,
      "eval_loss": 4.459234237670898,
      "eval_runtime": 18.5868,
      "eval_samples_per_second": 53.802,
      "eval_steps_per_second": 13.45,
      "step": 7500
    },
    {
      "epoch": 0.5807245596492577,
      "grad_norm": 4.381130218505859,
      "learning_rate": 0.00016343142092823938,
      "loss": 4.521,
      "step": 7550
    },
    {
      "epoch": 0.58457041766018,
      "grad_norm": 2.533957004547119,
      "learning_rate": 0.0001631717009064229,
      "loss": 4.3233,
      "step": 7600
    },
    {
      "epoch": 0.5884162756711022,
      "grad_norm": 3.277646541595459,
      "learning_rate": 0.0001629119808846064,
      "loss": 4.5503,
      "step": 7650
    },
    {
      "epoch": 0.5922621336820244,
      "grad_norm": 3.871952772140503,
      "learning_rate": 0.00016265226086278992,
      "loss": 4.4196,
      "step": 7700
    },
    {
      "epoch": 0.5961079916929467,
      "grad_norm": 3.191589832305908,
      "learning_rate": 0.00016239254084097343,
      "loss": 4.479,
      "step": 7750
    },
    {
      "epoch": 0.5961079916929467,
      "eval_loss": 4.44810152053833,
      "eval_runtime": 18.425,
      "eval_samples_per_second": 54.274,
      "eval_steps_per_second": 13.569,
      "step": 7750
    },
    {
      "epoch": 0.5999538497038689,
      "grad_norm": 3.7022933959960938,
      "learning_rate": 0.00016213282081915695,
      "loss": 4.5402,
      "step": 7800
    },
    {
      "epoch": 0.6037997077147912,
      "grad_norm": 2.883859395980835,
      "learning_rate": 0.00016187310079734049,
      "loss": 4.5457,
      "step": 7850
    },
    {
      "epoch": 0.6076455657257134,
      "grad_norm": 3.5229415893554688,
      "learning_rate": 0.000161613380775524,
      "loss": 4.5185,
      "step": 7900
    },
    {
      "epoch": 0.6114914237366357,
      "grad_norm": 4.529599666595459,
      "learning_rate": 0.0001613536607537075,
      "loss": 4.4489,
      "step": 7950
    },
    {
      "epoch": 0.6153372817475579,
      "grad_norm": 5.1023850440979,
      "learning_rate": 0.00016109394073189102,
      "loss": 4.6043,
      "step": 8000
    },
    {
      "epoch": 0.6153372817475579,
      "eval_loss": 4.478011608123779,
      "eval_runtime": 18.576,
      "eval_samples_per_second": 53.833,
      "eval_steps_per_second": 13.458,
      "step": 8000
    },
    {
      "epoch": 0.6191831397584802,
      "grad_norm": 4.5992255210876465,
      "learning_rate": 0.00016083422071007454,
      "loss": 4.3146,
      "step": 8050
    },
    {
      "epoch": 0.6230289977694023,
      "grad_norm": 5.412031650543213,
      "learning_rate": 0.00016057450068825805,
      "loss": 4.5841,
      "step": 8100
    },
    {
      "epoch": 0.6268748557803245,
      "grad_norm": 4.4531779289245605,
      "learning_rate": 0.0001603147806664416,
      "loss": 4.5084,
      "step": 8150
    },
    {
      "epoch": 0.6307207137912468,
      "grad_norm": 3.913174867630005,
      "learning_rate": 0.0001600550606446251,
      "loss": 4.4183,
      "step": 8200
    },
    {
      "epoch": 0.634566571802169,
      "grad_norm": 3.3952407836914062,
      "learning_rate": 0.00015979534062280862,
      "loss": 4.3496,
      "step": 8250
    },
    {
      "epoch": 0.634566571802169,
      "eval_loss": 4.46174430847168,
      "eval_runtime": 18.5381,
      "eval_samples_per_second": 53.943,
      "eval_steps_per_second": 13.486,
      "step": 8250
    },
    {
      "epoch": 0.6384124298130913,
      "grad_norm": 4.156221389770508,
      "learning_rate": 0.00015953562060099213,
      "loss": 4.361,
      "step": 8300
    },
    {
      "epoch": 0.6422582878240135,
      "grad_norm": 3.2021920680999756,
      "learning_rate": 0.00015927590057917564,
      "loss": 4.473,
      "step": 8350
    },
    {
      "epoch": 0.6461041458349358,
      "grad_norm": 5.048036575317383,
      "learning_rate": 0.00015901618055735918,
      "loss": 4.4142,
      "step": 8400
    },
    {
      "epoch": 0.649950003845858,
      "grad_norm": 3.91768217086792,
      "learning_rate": 0.0001587564605355427,
      "loss": 4.4672,
      "step": 8450
    },
    {
      "epoch": 0.6537958618567803,
      "grad_norm": 9.229452133178711,
      "learning_rate": 0.0001584967405137262,
      "loss": 4.7837,
      "step": 8500
    },
    {
      "epoch": 0.6537958618567803,
      "eval_loss": 4.409055709838867,
      "eval_runtime": 18.5943,
      "eval_samples_per_second": 53.78,
      "eval_steps_per_second": 13.445,
      "step": 8500
    },
    {
      "epoch": 0.6576417198677025,
      "grad_norm": 2.7313661575317383,
      "learning_rate": 0.00015823702049190975,
      "loss": 4.4721,
      "step": 8550
    },
    {
      "epoch": 0.6614875778786247,
      "grad_norm": 4.160475730895996,
      "learning_rate": 0.00015797730047009326,
      "loss": 4.501,
      "step": 8600
    },
    {
      "epoch": 0.665333435889547,
      "grad_norm": 11.54045581817627,
      "learning_rate": 0.00015771758044827675,
      "loss": 4.4433,
      "step": 8650
    },
    {
      "epoch": 0.6691792939004692,
      "grad_norm": 4.087617874145508,
      "learning_rate": 0.00015745786042646029,
      "loss": 4.4981,
      "step": 8700
    },
    {
      "epoch": 0.6730251519113915,
      "grad_norm": 4.155121803283691,
      "learning_rate": 0.0001571981404046438,
      "loss": 4.3874,
      "step": 8750
    },
    {
      "epoch": 0.6730251519113915,
      "eval_loss": 4.418811321258545,
      "eval_runtime": 18.6306,
      "eval_samples_per_second": 53.675,
      "eval_steps_per_second": 13.419,
      "step": 8750
    },
    {
      "epoch": 0.6768710099223136,
      "grad_norm": 4.071916580200195,
      "learning_rate": 0.0001569384203828273,
      "loss": 4.5531,
      "step": 8800
    },
    {
      "epoch": 0.6807168679332359,
      "grad_norm": 3.395460605621338,
      "learning_rate": 0.00015667870036101085,
      "loss": 4.4609,
      "step": 8850
    },
    {
      "epoch": 0.6845627259441581,
      "grad_norm": 3.4933230876922607,
      "learning_rate": 0.00015641898033919436,
      "loss": 4.4536,
      "step": 8900
    },
    {
      "epoch": 0.6884085839550804,
      "grad_norm": 6.921072483062744,
      "learning_rate": 0.00015615926031737788,
      "loss": 4.3478,
      "step": 8950
    },
    {
      "epoch": 0.6922544419660026,
      "grad_norm": 3.920626401901245,
      "learning_rate": 0.0001558995402955614,
      "loss": 4.3761,
      "step": 9000
    },
    {
      "epoch": 0.6922544419660026,
      "eval_loss": 4.415992259979248,
      "eval_runtime": 18.5147,
      "eval_samples_per_second": 54.011,
      "eval_steps_per_second": 13.503,
      "step": 9000
    },
    {
      "epoch": 0.6961002999769248,
      "grad_norm": 7.213745594024658,
      "learning_rate": 0.0001556398202737449,
      "loss": 4.323,
      "step": 9050
    },
    {
      "epoch": 0.6999461579878471,
      "grad_norm": 3.2426984310150146,
      "learning_rate": 0.00015538010025192842,
      "loss": 4.3922,
      "step": 9100
    },
    {
      "epoch": 0.7037920159987693,
      "grad_norm": 3.256950855255127,
      "learning_rate": 0.00015512038023011196,
      "loss": 4.2602,
      "step": 9150
    },
    {
      "epoch": 0.7076378740096916,
      "grad_norm": 6.132264614105225,
      "learning_rate": 0.00015486066020829547,
      "loss": 4.3734,
      "step": 9200
    },
    {
      "epoch": 0.7114837320206138,
      "grad_norm": 3.921595573425293,
      "learning_rate": 0.00015460094018647898,
      "loss": 4.4776,
      "step": 9250
    },
    {
      "epoch": 0.7114837320206138,
      "eval_loss": 4.3921356201171875,
      "eval_runtime": 18.5511,
      "eval_samples_per_second": 53.905,
      "eval_steps_per_second": 13.476,
      "step": 9250
    },
    {
      "epoch": 0.7153295900315361,
      "grad_norm": 5.416064739227295,
      "learning_rate": 0.0001543412201646625,
      "loss": 4.5659,
      "step": 9300
    },
    {
      "epoch": 0.7191754480424583,
      "grad_norm": 4.542217254638672,
      "learning_rate": 0.000154081500142846,
      "loss": 4.2557,
      "step": 9350
    },
    {
      "epoch": 0.7230213060533806,
      "grad_norm": 3.7075681686401367,
      "learning_rate": 0.00015382178012102955,
      "loss": 4.453,
      "step": 9400
    },
    {
      "epoch": 0.7268671640643027,
      "grad_norm": 4.457496166229248,
      "learning_rate": 0.00015356206009921306,
      "loss": 4.3861,
      "step": 9450
    },
    {
      "epoch": 0.7307130220752249,
      "grad_norm": 1.7784981727600098,
      "learning_rate": 0.00015330234007739657,
      "loss": 4.3258,
      "step": 9500
    },
    {
      "epoch": 0.7307130220752249,
      "eval_loss": 4.357193470001221,
      "eval_runtime": 18.6372,
      "eval_samples_per_second": 53.656,
      "eval_steps_per_second": 13.414,
      "step": 9500
    },
    {
      "epoch": 0.7345588800861472,
      "grad_norm": 3.8532371520996094,
      "learning_rate": 0.0001530426200555801,
      "loss": 4.3473,
      "step": 9550
    },
    {
      "epoch": 0.7384047380970694,
      "grad_norm": 4.654659271240234,
      "learning_rate": 0.0001527829000337636,
      "loss": 4.457,
      "step": 9600
    },
    {
      "epoch": 0.7422505961079917,
      "grad_norm": 2.420182228088379,
      "learning_rate": 0.0001525231800119471,
      "loss": 4.4521,
      "step": 9650
    },
    {
      "epoch": 0.7460964541189139,
      "grad_norm": 4.189414978027344,
      "learning_rate": 0.00015226345999013065,
      "loss": 4.2569,
      "step": 9700
    },
    {
      "epoch": 0.7499423121298362,
      "grad_norm": 2.824084997177124,
      "learning_rate": 0.00015200373996831416,
      "loss": 4.3409,
      "step": 9750
    },
    {
      "epoch": 0.7499423121298362,
      "eval_loss": 4.378731727600098,
      "eval_runtime": 18.4857,
      "eval_samples_per_second": 54.096,
      "eval_steps_per_second": 13.524,
      "step": 9750
    },
    {
      "epoch": 0.7537881701407584,
      "grad_norm": 6.379781723022461,
      "learning_rate": 0.00015174401994649768,
      "loss": 4.3041,
      "step": 9800
    },
    {
      "epoch": 0.7576340281516807,
      "grad_norm": 1.7334113121032715,
      "learning_rate": 0.00015148429992468122,
      "loss": 4.3096,
      "step": 9850
    },
    {
      "epoch": 0.7614798861626029,
      "grad_norm": 4.287415027618408,
      "learning_rate": 0.00015122457990286473,
      "loss": 4.4411,
      "step": 9900
    },
    {
      "epoch": 0.7653257441735252,
      "grad_norm": 3.3184821605682373,
      "learning_rate": 0.00015096485988104821,
      "loss": 4.3992,
      "step": 9950
    },
    {
      "epoch": 0.7691716021844474,
      "grad_norm": 4.698968887329102,
      "learning_rate": 0.00015070513985923175,
      "loss": 4.4726,
      "step": 10000
    },
    {
      "epoch": 0.7691716021844474,
      "eval_loss": 4.408615589141846,
      "eval_runtime": 18.5408,
      "eval_samples_per_second": 53.935,
      "eval_steps_per_second": 13.484,
      "step": 10000
    },
    {
      "epoch": 0.7730174601953695,
      "grad_norm": 3.882775068283081,
      "learning_rate": 0.00015044541983741527,
      "loss": 4.5207,
      "step": 10050
    },
    {
      "epoch": 0.7768633182062918,
      "grad_norm": 5.814795017242432,
      "learning_rate": 0.0001501856998155988,
      "loss": 4.2462,
      "step": 10100
    },
    {
      "epoch": 0.780709176217214,
      "grad_norm": 4.733581066131592,
      "learning_rate": 0.00014992597979378232,
      "loss": 4.5563,
      "step": 10150
    },
    {
      "epoch": 0.7845550342281363,
      "grad_norm": 4.805403232574463,
      "learning_rate": 0.00014966625977196583,
      "loss": 4.4353,
      "step": 10200
    },
    {
      "epoch": 0.7884008922390585,
      "grad_norm": 5.814332008361816,
      "learning_rate": 0.00014940653975014935,
      "loss": 4.4004,
      "step": 10250
    },
    {
      "epoch": 0.7884008922390585,
      "eval_loss": 4.41144323348999,
      "eval_runtime": 18.5402,
      "eval_samples_per_second": 53.937,
      "eval_steps_per_second": 13.484,
      "step": 10250
    },
    {
      "epoch": 0.7922467502499808,
      "grad_norm": 5.321393013000488,
      "learning_rate": 0.00014914681972833286,
      "loss": 4.4383,
      "step": 10300
    },
    {
      "epoch": 0.796092608260903,
      "grad_norm": 3.681452751159668,
      "learning_rate": 0.00014888709970651637,
      "loss": 4.5094,
      "step": 10350
    },
    {
      "epoch": 0.7999384662718253,
      "grad_norm": 4.766401767730713,
      "learning_rate": 0.0001486273796846999,
      "loss": 4.3104,
      "step": 10400
    },
    {
      "epoch": 0.8037843242827475,
      "grad_norm": 4.676774024963379,
      "learning_rate": 0.00014836765966288342,
      "loss": 4.516,
      "step": 10450
    },
    {
      "epoch": 0.8076301822936697,
      "grad_norm": 3.623643159866333,
      "learning_rate": 0.00014810793964106694,
      "loss": 4.5146,
      "step": 10500
    },
    {
      "epoch": 0.8076301822936697,
      "eval_loss": 4.398375034332275,
      "eval_runtime": 18.4969,
      "eval_samples_per_second": 54.063,
      "eval_steps_per_second": 13.516,
      "step": 10500
    },
    {
      "epoch": 0.811476040304592,
      "grad_norm": 4.379317760467529,
      "learning_rate": 0.00014784821961925045,
      "loss": 4.6715,
      "step": 10550
    },
    {
      "epoch": 0.8153218983155142,
      "grad_norm": 3.034796714782715,
      "learning_rate": 0.00014758849959743396,
      "loss": 4.4511,
      "step": 10600
    },
    {
      "epoch": 0.8191677563264365,
      "grad_norm": 3.8016927242279053,
      "learning_rate": 0.00014732877957561748,
      "loss": 4.2966,
      "step": 10650
    },
    {
      "epoch": 0.8230136143373586,
      "grad_norm": 4.330080509185791,
      "learning_rate": 0.00014706905955380102,
      "loss": 4.6496,
      "step": 10700
    },
    {
      "epoch": 0.8268594723482809,
      "grad_norm": 8.032389640808105,
      "learning_rate": 0.00014680933953198453,
      "loss": 4.2758,
      "step": 10750
    },
    {
      "epoch": 0.8268594723482809,
      "eval_loss": 4.388455867767334,
      "eval_runtime": 18.481,
      "eval_samples_per_second": 54.11,
      "eval_steps_per_second": 13.527,
      "step": 10750
    },
    {
      "epoch": 0.8307053303592031,
      "grad_norm": 3.1724319458007812,
      "learning_rate": 0.00014654961951016807,
      "loss": 4.405,
      "step": 10800
    },
    {
      "epoch": 0.8345511883701254,
      "grad_norm": 3.529196262359619,
      "learning_rate": 0.00014628989948835155,
      "loss": 4.4433,
      "step": 10850
    },
    {
      "epoch": 0.8383970463810476,
      "grad_norm": 4.758362293243408,
      "learning_rate": 0.00014603017946653507,
      "loss": 4.5747,
      "step": 10900
    },
    {
      "epoch": 0.8422429043919698,
      "grad_norm": 3.524068832397461,
      "learning_rate": 0.0001457704594447186,
      "loss": 4.3469,
      "step": 10950
    },
    {
      "epoch": 0.8460887624028921,
      "grad_norm": 4.452401161193848,
      "learning_rate": 0.00014551073942290212,
      "loss": 4.3767,
      "step": 11000
    },
    {
      "epoch": 0.8460887624028921,
      "eval_loss": 4.352676868438721,
      "eval_runtime": 18.5817,
      "eval_samples_per_second": 53.816,
      "eval_steps_per_second": 13.454,
      "step": 11000
    },
    {
      "epoch": 0.8499346204138143,
      "grad_norm": 6.128251075744629,
      "learning_rate": 0.00014525101940108563,
      "loss": 4.3993,
      "step": 11050
    },
    {
      "epoch": 0.8537804784247366,
      "grad_norm": 3.9961323738098145,
      "learning_rate": 0.00014499129937926917,
      "loss": 4.406,
      "step": 11100
    },
    {
      "epoch": 0.8576263364356588,
      "grad_norm": 3.889711856842041,
      "learning_rate": 0.00014473157935745269,
      "loss": 4.2991,
      "step": 11150
    },
    {
      "epoch": 0.8614721944465811,
      "grad_norm": 3.5852463245391846,
      "learning_rate": 0.00014447185933563617,
      "loss": 4.2967,
      "step": 11200
    },
    {
      "epoch": 0.8653180524575033,
      "grad_norm": 3.343247652053833,
      "learning_rate": 0.0001442121393138197,
      "loss": 4.314,
      "step": 11250
    },
    {
      "epoch": 0.8653180524575033,
      "eval_loss": 4.387504577636719,
      "eval_runtime": 18.554,
      "eval_samples_per_second": 53.897,
      "eval_steps_per_second": 13.474,
      "step": 11250
    },
    {
      "epoch": 0.8691639104684256,
      "grad_norm": 3.5600407123565674,
      "learning_rate": 0.00014395241929200322,
      "loss": 4.2958,
      "step": 11300
    },
    {
      "epoch": 0.8730097684793477,
      "grad_norm": 4.299932956695557,
      "learning_rate": 0.00014369269927018674,
      "loss": 4.2966,
      "step": 11350
    },
    {
      "epoch": 0.8768556264902699,
      "grad_norm": 2.5763466358184814,
      "learning_rate": 0.00014343297924837028,
      "loss": 4.3739,
      "step": 11400
    },
    {
      "epoch": 0.8807014845011922,
      "grad_norm": 3.670653820037842,
      "learning_rate": 0.0001431732592265538,
      "loss": 4.3932,
      "step": 11450
    },
    {
      "epoch": 0.8845473425121144,
      "grad_norm": 5.472078800201416,
      "learning_rate": 0.0001429135392047373,
      "loss": 4.3183,
      "step": 11500
    },
    {
      "epoch": 0.8845473425121144,
      "eval_loss": 4.377117156982422,
      "eval_runtime": 18.5574,
      "eval_samples_per_second": 53.887,
      "eval_steps_per_second": 13.472,
      "step": 11500
    },
    {
      "epoch": 0.8883932005230367,
      "grad_norm": 4.711415767669678,
      "learning_rate": 0.00014265381918292082,
      "loss": 4.5701,
      "step": 11550
    },
    {
      "epoch": 0.8922390585339589,
      "grad_norm": 3.1737523078918457,
      "learning_rate": 0.00014239409916110433,
      "loss": 4.4522,
      "step": 11600
    },
    {
      "epoch": 0.8960849165448812,
      "grad_norm": 4.876018047332764,
      "learning_rate": 0.00014213437913928787,
      "loss": 4.3937,
      "step": 11650
    },
    {
      "epoch": 0.8999307745558034,
      "grad_norm": 7.117967128753662,
      "learning_rate": 0.00014187465911747138,
      "loss": 4.3585,
      "step": 11700
    },
    {
      "epoch": 0.9037766325667257,
      "grad_norm": 2.587160587310791,
      "learning_rate": 0.0001416149390956549,
      "loss": 1.5054,
      "step": 11750
    },
    {
      "epoch": 0.9037766325667257,
      "eval_loss": 1.5468424558639526,
      "eval_runtime": 17.9563,
      "eval_samples_per_second": 55.691,
      "eval_steps_per_second": 13.923,
      "step": 11750
    },
    {
      "epoch": 0.9076224905776479,
      "grad_norm": 1.4935526847839355,
      "learning_rate": 0.0001413552190738384,
      "loss": 1.5719,
      "step": 11800
    },
    {
      "epoch": 0.9114683485885701,
      "grad_norm": 1.4879201650619507,
      "learning_rate": 0.00014109549905202192,
      "loss": 1.4862,
      "step": 11850
    },
    {
      "epoch": 0.9153142065994924,
      "grad_norm": 1.7936193943023682,
      "learning_rate": 0.00014083577903020543,
      "loss": 1.488,
      "step": 11900
    },
    {
      "epoch": 0.9191600646104146,
      "grad_norm": 2.14953875541687,
      "learning_rate": 0.00014057605900838897,
      "loss": 1.5383,
      "step": 11950
    },
    {
      "epoch": 0.9230059226213368,
      "grad_norm": 1.5575013160705566,
      "learning_rate": 0.00014031633898657249,
      "loss": 1.4814,
      "step": 12000
    },
    {
      "epoch": 0.9230059226213368,
      "eval_loss": 1.5273067951202393,
      "eval_runtime": 17.7259,
      "eval_samples_per_second": 56.414,
      "eval_steps_per_second": 14.104,
      "step": 12000
    },
    {
      "epoch": 0.926851780632259,
      "grad_norm": 1.9142653942108154,
      "learning_rate": 0.000140056618964756,
      "loss": 1.5436,
      "step": 12050
    },
    {
      "epoch": 0.9306976386431813,
      "grad_norm": 1.9730989933013916,
      "learning_rate": 0.00013979689894293954,
      "loss": 1.5175,
      "step": 12100
    },
    {
      "epoch": 0.9345434966541035,
      "grad_norm": 0.9956797361373901,
      "learning_rate": 0.00013953717892112302,
      "loss": 1.4491,
      "step": 12150
    },
    {
      "epoch": 0.9383893546650258,
      "grad_norm": 1.0358608961105347,
      "learning_rate": 0.00013927745889930654,
      "loss": 1.4488,
      "step": 12200
    },
    {
      "epoch": 0.942235212675948,
      "grad_norm": 1.5184404850006104,
      "learning_rate": 0.00013901773887749008,
      "loss": 1.5037,
      "step": 12250
    },
    {
      "epoch": 0.942235212675948,
      "eval_loss": 1.5220181941986084,
      "eval_runtime": 17.7647,
      "eval_samples_per_second": 56.291,
      "eval_steps_per_second": 14.073,
      "step": 12250
    },
    {
      "epoch": 0.9460810706868702,
      "grad_norm": 1.3881593942642212,
      "learning_rate": 0.0001387580188556736,
      "loss": 1.4952,
      "step": 12300
    },
    {
      "epoch": 0.9499269286977925,
      "grad_norm": 2.319173574447632,
      "learning_rate": 0.0001384982988338571,
      "loss": 1.4667,
      "step": 12350
    },
    {
      "epoch": 0.9537727867087147,
      "grad_norm": 1.9042879343032837,
      "learning_rate": 0.00013823857881204064,
      "loss": 1.6038,
      "step": 12400
    },
    {
      "epoch": 0.957618644719637,
      "grad_norm": 1.9162698984146118,
      "learning_rate": 0.00013797885879022415,
      "loss": 1.4904,
      "step": 12450
    },
    {
      "epoch": 0.9614645027305592,
      "grad_norm": 2.0601863861083984,
      "learning_rate": 0.00013771913876840767,
      "loss": 1.4758,
      "step": 12500
    },
    {
      "epoch": 0.9614645027305592,
      "eval_loss": 1.5356587171554565,
      "eval_runtime": 17.7404,
      "eval_samples_per_second": 56.369,
      "eval_steps_per_second": 14.092,
      "step": 12500
    },
    {
      "epoch": 0.9653103607414815,
      "grad_norm": 1.888836145401001,
      "learning_rate": 0.00013745941874659118,
      "loss": 1.5681,
      "step": 12550
    },
    {
      "epoch": 0.9691562187524037,
      "grad_norm": 1.4329860210418701,
      "learning_rate": 0.0001371996987247747,
      "loss": 1.5018,
      "step": 12600
    },
    {
      "epoch": 0.973002076763326,
      "grad_norm": 1.969533920288086,
      "learning_rate": 0.00013693997870295823,
      "loss": 1.5494,
      "step": 12650
    },
    {
      "epoch": 0.9768479347742481,
      "grad_norm": 2.1219890117645264,
      "learning_rate": 0.00013668025868114175,
      "loss": 1.5386,
      "step": 12700
    },
    {
      "epoch": 0.9806937927851703,
      "grad_norm": 1.6632941961288452,
      "learning_rate": 0.00013642053865932526,
      "loss": 1.5253,
      "step": 12750
    },
    {
      "epoch": 0.9806937927851703,
      "eval_loss": 1.5147372484207153,
      "eval_runtime": 17.8072,
      "eval_samples_per_second": 56.157,
      "eval_steps_per_second": 14.039,
      "step": 12750
    },
    {
      "epoch": 0.9845396507960926,
      "grad_norm": 1.292913794517517,
      "learning_rate": 0.00013616081863750877,
      "loss": 1.4413,
      "step": 12800
    },
    {
      "epoch": 0.9883855088070148,
      "grad_norm": 1.1377824544906616,
      "learning_rate": 0.00013590109861569228,
      "loss": 1.5705,
      "step": 12850
    },
    {
      "epoch": 0.9922313668179371,
      "grad_norm": 1.656996726989746,
      "learning_rate": 0.0001356413785938758,
      "loss": 1.5311,
      "step": 12900
    },
    {
      "epoch": 0.9960772248288593,
      "grad_norm": 1.6639357805252075,
      "learning_rate": 0.00013538165857205934,
      "loss": 1.5676,
      "step": 12950
    },
    {
      "epoch": 0.9999230828397816,
      "grad_norm": 1.0893466472625732,
      "learning_rate": 0.00013512193855024285,
      "loss": 1.5215,
      "step": 13000
    },
    {
      "epoch": 0.9999230828397816,
      "eval_loss": 1.5119102001190186,
      "eval_runtime": 17.7291,
      "eval_samples_per_second": 56.404,
      "eval_steps_per_second": 14.101,
      "step": 13000
    },
    {
      "epoch": 1.0037689408507038,
      "grad_norm": 1.4789248704910278,
      "learning_rate": 0.00013486221852842636,
      "loss": 1.4551,
      "step": 13050
    },
    {
      "epoch": 1.007614798861626,
      "grad_norm": 1.3270663022994995,
      "learning_rate": 0.00013460249850660988,
      "loss": 1.4776,
      "step": 13100
    },
    {
      "epoch": 1.0114606568725482,
      "grad_norm": 1.3546854257583618,
      "learning_rate": 0.0001343427784847934,
      "loss": 1.5807,
      "step": 13150
    },
    {
      "epoch": 1.0153065148834706,
      "grad_norm": 1.303915023803711,
      "learning_rate": 0.0001340830584629769,
      "loss": 1.4309,
      "step": 13200
    },
    {
      "epoch": 1.0191523728943928,
      "grad_norm": 0.8854748606681824,
      "learning_rate": 0.00013382333844116044,
      "loss": 1.4395,
      "step": 13250
    },
    {
      "epoch": 1.0191523728943928,
      "eval_loss": 1.5238608121871948,
      "eval_runtime": 17.7588,
      "eval_samples_per_second": 56.31,
      "eval_steps_per_second": 14.077,
      "step": 13250
    },
    {
      "epoch": 1.022998230905315,
      "grad_norm": 1.5649653673171997,
      "learning_rate": 0.00013356361841934395,
      "loss": 1.5022,
      "step": 13300
    },
    {
      "epoch": 1.0268440889162371,
      "grad_norm": 1.6031616926193237,
      "learning_rate": 0.0001333038983975275,
      "loss": 1.4315,
      "step": 13350
    },
    {
      "epoch": 1.0306899469271595,
      "grad_norm": 1.3788844347000122,
      "learning_rate": 0.00013304417837571098,
      "loss": 1.578,
      "step": 13400
    },
    {
      "epoch": 1.0345358049380817,
      "grad_norm": 1.4347171783447266,
      "learning_rate": 0.0001327844583538945,
      "loss": 1.3993,
      "step": 13450
    },
    {
      "epoch": 1.038381662949004,
      "grad_norm": 1.9777193069458008,
      "learning_rate": 0.00013252473833207803,
      "loss": 1.4331,
      "step": 13500
    },
    {
      "epoch": 1.038381662949004,
      "eval_loss": 1.5231057405471802,
      "eval_runtime": 17.8114,
      "eval_samples_per_second": 56.144,
      "eval_steps_per_second": 14.036,
      "step": 13500
    },
    {
      "epoch": 1.042227520959926,
      "grad_norm": 2.056574583053589,
      "learning_rate": 0.00013226501831026155,
      "loss": 1.5387,
      "step": 13550
    },
| { | |
| "epoch": 1.0460733789708483, | |
| "grad_norm": 1.41805899143219, | |
| "learning_rate": 0.00013200529828844506, | |
| "loss": 1.4194, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.0499192369817707, | |
| "grad_norm": 1.5727626085281372, | |
| "learning_rate": 0.0001317455782666286, | |
| "loss": 1.4763, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 1.0537650949926929, | |
| "grad_norm": 1.8175796270370483, | |
| "learning_rate": 0.0001314858582448121, | |
| "loss": 1.5232, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.057610953003615, | |
| "grad_norm": 1.459721565246582, | |
| "learning_rate": 0.0001312261382229956, | |
| "loss": 1.4926, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.057610953003615, | |
| "eval_loss": 1.5073590278625488, | |
| "eval_runtime": 17.8208, | |
| "eval_samples_per_second": 56.114, | |
| "eval_steps_per_second": 14.029, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.0614568110145373, | |
| "grad_norm": 1.7236889600753784, | |
| "learning_rate": 0.00013096641820117914, | |
| "loss": 1.4485, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.0653026690254597, | |
| "grad_norm": 1.1652172803878784, | |
| "learning_rate": 0.00013070669817936265, | |
| "loss": 1.4706, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 1.0691485270363819, | |
| "grad_norm": 1.1279985904693604, | |
| "learning_rate": 0.00013044697815754616, | |
| "loss": 1.5507, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.072994385047304, | |
| "grad_norm": 2.2368061542510986, | |
| "learning_rate": 0.0001301872581357297, | |
| "loss": 1.5184, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 1.0768402430582262, | |
| "grad_norm": 1.1515541076660156, | |
| "learning_rate": 0.00012992753811391322, | |
| "loss": 1.5184, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.0768402430582262, | |
| "eval_loss": 1.5123000144958496, | |
| "eval_runtime": 17.8325, | |
| "eval_samples_per_second": 56.077, | |
| "eval_steps_per_second": 14.019, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.0806861010691486, | |
| "grad_norm": 2.1172475814819336, | |
| "learning_rate": 0.00012966781809209673, | |
| "loss": 1.4055, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 1.0845319590800708, | |
| "grad_norm": 1.196999430656433, | |
| "learning_rate": 0.00012940809807028024, | |
| "loss": 1.464, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 1.088377817090993, | |
| "grad_norm": 1.3582040071487427, | |
| "learning_rate": 0.00012914837804846375, | |
| "loss": 1.4502, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 1.0922236751019152, | |
| "grad_norm": 1.6588162183761597, | |
| "learning_rate": 0.0001288886580266473, | |
| "loss": 1.5174, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.0960695331128374, | |
| "grad_norm": 1.7531650066375732, | |
| "learning_rate": 0.0001286289380048308, | |
| "loss": 1.505, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.0960695331128374, | |
| "eval_loss": 1.5160688161849976, | |
| "eval_runtime": 17.764, | |
| "eval_samples_per_second": 56.294, | |
| "eval_steps_per_second": 14.073, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.0999153911237598, | |
| "grad_norm": 1.868784785270691, | |
| "learning_rate": 0.00012836921798301432, | |
| "loss": 1.5544, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.103761249134682, | |
| "grad_norm": 1.9493080377578735, | |
| "learning_rate": 0.00012810949796119783, | |
| "loss": 1.533, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 1.1076071071456042, | |
| "grad_norm": 0.7309526801109314, | |
| "learning_rate": 0.00012784977793938135, | |
| "loss": 1.4672, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.1114529651565264, | |
| "grad_norm": 1.3281447887420654, | |
| "learning_rate": 0.00012759005791756486, | |
| "loss": 1.3874, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 1.1152988231674485, | |
| "grad_norm": 1.0158611536026, | |
| "learning_rate": 0.0001273303378957484, | |
| "loss": 1.4966, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.1152988231674485, | |
| "eval_loss": 1.5160739421844482, | |
| "eval_runtime": 17.6672, | |
| "eval_samples_per_second": 56.602, | |
| "eval_steps_per_second": 14.151, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.119144681178371, | |
| "grad_norm": 1.6422228813171387, | |
| "learning_rate": 0.0001270706178739319, | |
| "loss": 1.5494, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 1.1229905391892931, | |
| "grad_norm": 0.7187716960906982, | |
| "learning_rate": 0.00012681089785211542, | |
| "loss": 1.4221, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.1268363972002153, | |
| "grad_norm": 1.2605098485946655, | |
| "learning_rate": 0.00012655117783029896, | |
| "loss": 1.4299, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 1.1306822552111375, | |
| "grad_norm": 2.598015069961548, | |
| "learning_rate": 0.00012629145780848245, | |
| "loss": 1.5757, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.13452811322206, | |
| "grad_norm": 1.4004614353179932, | |
| "learning_rate": 0.00012603173778666596, | |
| "loss": 1.482, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.13452811322206, | |
| "eval_loss": 1.5089725255966187, | |
| "eval_runtime": 17.9036, | |
| "eval_samples_per_second": 55.855, | |
| "eval_steps_per_second": 13.964, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.1383739712329821, | |
| "grad_norm": 1.3800735473632812, | |
| "learning_rate": 0.0001257720177648495, | |
| "loss": 1.5285, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.1422198292439043, | |
| "grad_norm": 1.3741459846496582, | |
| "learning_rate": 0.00012551229774303301, | |
| "loss": 1.5242, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 1.1460656872548265, | |
| "grad_norm": 2.232680559158325, | |
| "learning_rate": 0.00012525257772121653, | |
| "loss": 1.4483, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.149911545265749, | |
| "grad_norm": 1.4408409595489502, | |
| "learning_rate": 0.00012499285769940007, | |
| "loss": 1.5414, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 1.153757403276671, | |
| "grad_norm": 1.5221819877624512, | |
| "learning_rate": 0.00012473313767758355, | |
| "loss": 1.5246, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.153757403276671, | |
| "eval_loss": 1.516871452331543, | |
| "eval_runtime": 17.9308, | |
| "eval_samples_per_second": 55.77, | |
| "eval_steps_per_second": 13.943, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.1576032612875933, | |
| "grad_norm": 1.5970553159713745, | |
| "learning_rate": 0.0001244734176557671, | |
| "loss": 1.4515, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 1.1614491192985155, | |
| "grad_norm": 1.0201988220214844, | |
| "learning_rate": 0.0001242136976339506, | |
| "loss": 1.5431, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.1652949773094377, | |
| "grad_norm": 0.9830596446990967, | |
| "learning_rate": 0.00012395397761213412, | |
| "loss": 1.4581, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 1.16914083532036, | |
| "grad_norm": 2.3363943099975586, | |
| "learning_rate": 0.00012369425759031766, | |
| "loss": 1.5147, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 1.1729866933312822, | |
| "grad_norm": 1.8213731050491333, | |
| "learning_rate": 0.00012343453756850117, | |
| "loss": 1.5497, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 1.1729866933312822, | |
| "eval_loss": 1.5053696632385254, | |
| "eval_runtime": 17.7358, | |
| "eval_samples_per_second": 56.383, | |
| "eval_steps_per_second": 14.096, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 1.1768325513422044, | |
| "grad_norm": 1.1468195915222168, | |
| "learning_rate": 0.00012317481754668468, | |
| "loss": 1.4292, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 1.1806784093531266, | |
| "grad_norm": 1.2235878705978394, | |
| "learning_rate": 0.0001229150975248682, | |
| "loss": 1.4428, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 1.1845242673640488, | |
| "grad_norm": 1.5518691539764404, | |
| "learning_rate": 0.0001226553775030517, | |
| "loss": 1.4889, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 1.1883701253749712, | |
| "grad_norm": 1.217755913734436, | |
| "learning_rate": 0.00012239565748123522, | |
| "loss": 1.4841, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 1.1922159833858934, | |
| "grad_norm": 1.6093647480010986, | |
| "learning_rate": 0.00012213593745941876, | |
| "loss": 1.5461, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.1922159833858934, | |
| "eval_loss": 1.5013692378997803, | |
| "eval_runtime": 18.0052, | |
| "eval_samples_per_second": 55.539, | |
| "eval_steps_per_second": 13.885, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.1960618413968156, | |
| "grad_norm": 3.367638349533081, | |
| "learning_rate": 0.00012187621743760226, | |
| "loss": 1.4261, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 1.1999076994077378, | |
| "grad_norm": 1.6329169273376465, | |
| "learning_rate": 0.00012161649741578578, | |
| "loss": 1.4475, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 1.2037535574186602, | |
| "grad_norm": 1.397910475730896, | |
| "learning_rate": 0.00012135677739396932, | |
| "loss": 1.4747, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 1.2075994154295824, | |
| "grad_norm": 1.7463736534118652, | |
| "learning_rate": 0.00012109705737215283, | |
| "loss": 1.4745, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 1.2114452734405046, | |
| "grad_norm": 1.8097542524337769, | |
| "learning_rate": 0.00012083733735033633, | |
| "loss": 1.5276, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 1.2114452734405046, | |
| "eval_loss": 1.4955236911773682, | |
| "eval_runtime": 17.9054, | |
| "eval_samples_per_second": 55.849, | |
| "eval_steps_per_second": 13.962, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 1.2152911314514268, | |
| "grad_norm": 1.745730996131897, | |
| "learning_rate": 0.00012057761732851987, | |
| "loss": 1.5282, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 1.2191369894623492, | |
| "grad_norm": 1.3859128952026367, | |
| "learning_rate": 0.00012031789730670338, | |
| "loss": 1.4795, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 1.2229828474732714, | |
| "grad_norm": 1.5086127519607544, | |
| "learning_rate": 0.0001200581772848869, | |
| "loss": 1.437, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 1.2268287054841935, | |
| "grad_norm": 2.088292121887207, | |
| "learning_rate": 0.00011979845726307042, | |
| "loss": 1.5011, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 1.2306745634951157, | |
| "grad_norm": 1.2746011018753052, | |
| "learning_rate": 0.00011953873724125393, | |
| "loss": 1.4596, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.2306745634951157, | |
| "eval_loss": 1.4984314441680908, | |
| "eval_runtime": 17.8066, | |
| "eval_samples_per_second": 56.159, | |
| "eval_steps_per_second": 14.04, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.234520421506038, | |
| "grad_norm": 6.292486667633057, | |
| "learning_rate": 0.00011927901721943746, | |
| "loss": 1.3934, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 1.2383662795169603, | |
| "grad_norm": 1.6574532985687256, | |
| "learning_rate": 0.00011901929719762097, | |
| "loss": 1.4821, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 1.2422121375278825, | |
| "grad_norm": 1.8651037216186523, | |
| "learning_rate": 0.00011875957717580448, | |
| "loss": 1.49, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 1.2460579955388047, | |
| "grad_norm": 1.3768175840377808, | |
| "learning_rate": 0.00011849985715398801, | |
| "loss": 1.4579, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 1.2499038535497269, | |
| "grad_norm": 1.1569020748138428, | |
| "learning_rate": 0.00011824013713217152, | |
| "loss": 1.4029, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 1.2499038535497269, | |
| "eval_loss": 1.4893407821655273, | |
| "eval_runtime": 17.9552, | |
| "eval_samples_per_second": 55.694, | |
| "eval_steps_per_second": 13.924, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 1.253749711560649, | |
| "grad_norm": 1.8632296323776245, | |
| "learning_rate": 0.00011798041711035504, | |
| "loss": 1.4592, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 1.2575955695715715, | |
| "grad_norm": 1.8080470561981201, | |
| "learning_rate": 0.00011772069708853856, | |
| "loss": 1.4678, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 1.2614414275824937, | |
| "grad_norm": 1.4193981885910034, | |
| "learning_rate": 0.00011746097706672208, | |
| "loss": 1.5031, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 1.2652872855934159, | |
| "grad_norm": 1.5050238370895386, | |
| "learning_rate": 0.00011720125704490559, | |
| "loss": 1.4107, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 1.2691331436043383, | |
| "grad_norm": 1.12454092502594, | |
| "learning_rate": 0.00011694153702308911, | |
| "loss": 1.4572, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.2691331436043383, | |
| "eval_loss": 1.4972718954086304, | |
| "eval_runtime": 17.8677, | |
| "eval_samples_per_second": 55.967, | |
| "eval_steps_per_second": 13.992, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.2729790016152602, | |
| "grad_norm": 1.3523976802825928, | |
| "learning_rate": 0.00011668181700127263, | |
| "loss": 1.4796, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 1.2768248596261826, | |
| "grad_norm": 1.9770869016647339, | |
| "learning_rate": 0.00011642209697945614, | |
| "loss": 1.4756, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 1.2806707176371048, | |
| "grad_norm": 1.7973159551620483, | |
| "learning_rate": 0.00011616237695763967, | |
| "loss": 1.4163, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 1.284516575648027, | |
| "grad_norm": 1.3054739236831665, | |
| "learning_rate": 0.00011590265693582318, | |
| "loss": 1.4105, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 1.2883624336589494, | |
| "grad_norm": 1.457047939300537, | |
| "learning_rate": 0.00011564293691400672, | |
| "loss": 1.4698, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 1.2883624336589494, | |
| "eval_loss": 1.4798808097839355, | |
| "eval_runtime": 17.8696, | |
| "eval_samples_per_second": 55.961, | |
| "eval_steps_per_second": 13.99, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 1.2922082916698716, | |
| "grad_norm": 2.117663860321045, | |
| "learning_rate": 0.00011538841129262656, | |
| "loss": 1.4252, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 1.2960541496807938, | |
| "grad_norm": 3.7321341037750244, | |
| "learning_rate": 0.00011512869127081007, | |
| "loss": 1.4818, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 1.299900007691716, | |
| "grad_norm": 2.497528553009033, | |
| "learning_rate": 0.00011486897124899358, | |
| "loss": 1.4176, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.3037458657026382, | |
| "grad_norm": 1.6085398197174072, | |
| "learning_rate": 0.00011460925122717712, | |
| "loss": 1.4373, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 1.3075917237135606, | |
| "grad_norm": 1.8937525749206543, | |
| "learning_rate": 0.00011434953120536062, | |
| "loss": 1.4823, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.3075917237135606, | |
| "eval_loss": 1.4854488372802734, | |
| "eval_runtime": 17.8761, | |
| "eval_samples_per_second": 55.941, | |
| "eval_steps_per_second": 13.985, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.3114375817244828, | |
| "grad_norm": 1.6637665033340454, | |
| "learning_rate": 0.00011408981118354414, | |
| "loss": 1.5071, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 1.315283439735405, | |
| "grad_norm": 2.0815582275390625, | |
| "learning_rate": 0.00011383009116172768, | |
| "loss": 1.4606, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.3191292977463271, | |
| "grad_norm": 1.6492595672607422, | |
| "learning_rate": 0.00011357037113991117, | |
| "loss": 1.4408, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 1.3229751557572493, | |
| "grad_norm": 0.8617509603500366, | |
| "learning_rate": 0.00011331065111809469, | |
| "loss": 1.497, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.3268210137681717, | |
| "grad_norm": 1.6395294666290283, | |
| "learning_rate": 0.00011305093109627823, | |
| "loss": 1.4774, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 1.3268210137681717, | |
| "eval_loss": 1.4835026264190674, | |
| "eval_runtime": 18.0136, | |
| "eval_samples_per_second": 55.514, | |
| "eval_steps_per_second": 13.878, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 1.330666871779094, | |
| "grad_norm": 2.7765560150146484, | |
| "learning_rate": 0.00011279121107446174, | |
| "loss": 1.4864, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.3345127297900161, | |
| "grad_norm": 1.2104064226150513, | |
| "learning_rate": 0.00011253149105264524, | |
| "loss": 1.4075, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 1.3383585878009385, | |
| "grad_norm": 1.6772801876068115, | |
| "learning_rate": 0.00011227177103082878, | |
| "loss": 1.4629, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.3422044458118605, | |
| "grad_norm": 2.254371404647827, | |
| "learning_rate": 0.00011201205100901229, | |
| "loss": 1.4447, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 1.346050303822783, | |
| "grad_norm": 2.2015669345855713, | |
| "learning_rate": 0.00011175233098719582, | |
| "loss": 1.4664, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.346050303822783, | |
| "eval_loss": 1.496685266494751, | |
| "eval_runtime": 17.7963, | |
| "eval_samples_per_second": 56.191, | |
| "eval_steps_per_second": 14.048, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.349896161833705, | |
| "grad_norm": 1.740045428276062, | |
| "learning_rate": 0.00011149261096537933, | |
| "loss": 1.4486, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 1.3537420198446273, | |
| "grad_norm": 1.299919605255127, | |
| "learning_rate": 0.00011123289094356284, | |
| "loss": 1.4868, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.3575878778555497, | |
| "grad_norm": 1.6288009881973267, | |
| "learning_rate": 0.00011097317092174637, | |
| "loss": 1.4595, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 1.3614337358664719, | |
| "grad_norm": 0.8747851252555847, | |
| "learning_rate": 0.00011071345089992988, | |
| "loss": 1.4241, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.365279593877394, | |
| "grad_norm": 1.9510573148727417, | |
| "learning_rate": 0.0001104537308781134, | |
| "loss": 1.496, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 1.365279593877394, | |
| "eval_loss": 1.4812238216400146, | |
| "eval_runtime": 17.8955, | |
| "eval_samples_per_second": 55.88, | |
| "eval_steps_per_second": 13.97, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 1.3691254518883162, | |
| "grad_norm": 1.4853876829147339, | |
| "learning_rate": 0.00011019401085629692, | |
| "loss": 1.4645, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.3729713098992384, | |
| "grad_norm": 1.5125057697296143, | |
| "learning_rate": 0.00010993429083448044, | |
| "loss": 1.4052, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 1.3768171679101608, | |
| "grad_norm": 0.7320863008499146, | |
| "learning_rate": 0.00010967457081266395, | |
| "loss": 1.5016, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.380663025921083, | |
| "grad_norm": 1.9995285272598267, | |
| "learning_rate": 0.00010941485079084747, | |
| "loss": 1.4234, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 1.3845088839320052, | |
| "grad_norm": 0.8304823637008667, | |
| "learning_rate": 0.00010915513076903099, | |
| "loss": 1.4068, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.3845088839320052, | |
| "eval_loss": 1.4845945835113525, | |
| "eval_runtime": 17.7402, | |
| "eval_samples_per_second": 56.369, | |
| "eval_steps_per_second": 14.092, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.3883547419429274, | |
| "grad_norm": 1.8189436197280884, | |
| "learning_rate": 0.0001088954107472145, | |
| "loss": 1.4872, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 1.3922005999538496, | |
| "grad_norm": 1.4212762117385864, | |
| "learning_rate": 0.00010863569072539803, | |
| "loss": 1.4067, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.396046457964772, | |
| "grad_norm": 1.9733264446258545, | |
| "learning_rate": 0.00010837597070358154, | |
| "loss": 1.4787, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 1.3998923159756942, | |
| "grad_norm": 1.320064663887024, | |
| "learning_rate": 0.00010811625068176505, | |
| "loss": 1.5242, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.4037381739866164, | |
| "grad_norm": 1.4177141189575195, | |
| "learning_rate": 0.00010785653065994858, | |
| "loss": 1.4719, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 1.4037381739866164, | |
| "eval_loss": 1.4992233514785767, | |
| "eval_runtime": 17.8574, | |
| "eval_samples_per_second": 55.999, | |
| "eval_steps_per_second": 14.0, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 1.4075840319975388, | |
| "grad_norm": 1.8219791650772095, | |
| "learning_rate": 0.00010759681063813209, | |
| "loss": 1.4392, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.4114298900084608, | |
| "grad_norm": 1.3196603059768677, | |
| "learning_rate": 0.00010733709061631563, | |
| "loss": 1.4837, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 1.4152757480193832, | |
| "grad_norm": 1.01405668258667, | |
| "learning_rate": 0.00010707737059449914, | |
| "loss": 1.455, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.4191216060303053, | |
| "grad_norm": 1.8538917303085327, | |
| "learning_rate": 0.00010681765057268264, | |
| "loss": 1.4209, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 1.4229674640412275, | |
| "grad_norm": 0.8785907030105591, | |
| "learning_rate": 0.00010655793055086618, | |
| "loss": 1.4716, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.4229674640412275, | |
| "eval_loss": 1.485592246055603, | |
| "eval_runtime": 17.7246, | |
| "eval_samples_per_second": 56.419, | |
| "eval_steps_per_second": 14.105, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.42681332205215, | |
| "grad_norm": 1.0700381994247437, | |
| "learning_rate": 0.0001062982105290497, | |
| "loss": 1.3846, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 1.4306591800630721, | |
| "grad_norm": 1.2848351001739502, | |
| "learning_rate": 0.0001060384905072332, | |
| "loss": 1.4874, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 1.4345050380739943, | |
| "grad_norm": 1.4261386394500732, | |
| "learning_rate": 0.00010577877048541674, | |
| "loss": 1.5771, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 1.4383508960849165, | |
| "grad_norm": 1.3613426685333252, | |
| "learning_rate": 0.00010551905046360025, | |
| "loss": 1.4445, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 1.4421967540958387, | |
| "grad_norm": 0.8625685572624207, | |
| "learning_rate": 0.00010525933044178375, | |
| "loss": 1.3845, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 1.4421967540958387, | |
| "eval_loss": 1.4757392406463623, | |
| "eval_runtime": 17.8903, | |
| "eval_samples_per_second": 55.896, | |
| "eval_steps_per_second": 13.974, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 1.446042612106761, | |
| "grad_norm": 1.5575672388076782, | |
| "learning_rate": 0.00010499961041996729, | |
| "loss": 1.5387, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 1.4498884701176833, | |
| "grad_norm": 2.2173306941986084, | |
| "learning_rate": 0.0001047398903981508, | |
| "loss": 1.459, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 1.4537343281286055, | |
| "grad_norm": 1.904809832572937, | |
| "learning_rate": 0.00010448017037633431, | |
| "loss": 1.5229, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 1.4575801861395277, | |
| "grad_norm": 1.8832893371582031, | |
| "learning_rate": 0.00010422045035451784, | |
| "loss": 1.4093, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 1.4614260441504499, | |
| "grad_norm": 1.852971076965332, | |
| "learning_rate": 0.00010396073033270135, | |
| "loss": 1.389, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.4614260441504499, | |
| "eval_loss": 1.4788576364517212, | |
| "eval_runtime": 17.7751, | |
| "eval_samples_per_second": 56.258, | |
| "eval_steps_per_second": 14.065, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.4652719021613723, | |
| "grad_norm": 1.2875189781188965, | |
| "learning_rate": 0.00010370101031088487, | |
| "loss": 1.4826, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 1.4691177601722945, | |
| "grad_norm": 1.7036223411560059, | |
| "learning_rate": 0.00010344129028906839, | |
| "loss": 1.4392, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 1.4729636181832166, | |
| "grad_norm": 1.537514328956604, | |
| "learning_rate": 0.0001031815702672519, | |
| "loss": 1.4846, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 1.476809476194139, | |
| "grad_norm": 0.9159242510795593, | |
| "learning_rate": 0.00010292185024543543, | |
| "loss": 1.4668, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 1.480655334205061, | |
| "grad_norm": 3.47868013381958, | |
| "learning_rate": 0.00010266213022361894, | |
| "loss": 1.493, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 1.480655334205061, | |
| "eval_loss": 1.4858986139297485, | |
| "eval_runtime": 17.7738, | |
| "eval_samples_per_second": 56.263, | |
| "eval_steps_per_second": 14.066, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 1.4845011922159834, | |
| "grad_norm": 1.467437505722046, | |
| "learning_rate": 0.00010240241020180246, | |
| "loss": 1.4639, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 1.4883470502269056, | |
| "grad_norm": 1.2710049152374268, | |
| "learning_rate": 0.00010214269017998598, | |
| "loss": 1.4244, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 1.4921929082378278, | |
| "grad_norm": 2.0059661865234375, | |
| "learning_rate": 0.0001018829701581695, | |
| "loss": 1.4258, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 1.4960387662487502, | |
| "grad_norm": 1.7536308765411377, | |
| "learning_rate": 0.00010162325013635301, | |
| "loss": 1.396, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 1.4998846242596724, | |
| "grad_norm": 0.9684279561042786, | |
| "learning_rate": 0.00010136353011453655, | |
| "loss": 1.4598, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.4998846242596724, | |
| "eval_loss": 1.4841110706329346, | |
| "eval_runtime": 17.9151, | |
| "eval_samples_per_second": 55.819, | |
| "eval_steps_per_second": 13.955, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.5037304822705946, | |
| "grad_norm": 0.9244908690452576, | |
| "learning_rate": 0.00010110381009272005, | |
| "loss": 1.3834, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 1.5075763402815168, | |
| "grad_norm": 1.6488862037658691, | |
| "learning_rate": 0.00010084409007090356, | |
| "loss": 1.5311, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 1.511422198292439, | |
| "grad_norm": 1.9130067825317383, | |
| "learning_rate": 0.0001005843700490871, | |
| "loss": 1.5212, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 1.5152680563033614, | |
| "grad_norm": 1.326277256011963, | |
| "learning_rate": 0.0001003246500272706, | |
| "loss": 1.4067, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 1.5191139143142836, | |
| "grad_norm": 1.7258195877075195, | |
| "learning_rate": 0.00010006493000545411, | |
| "loss": 1.4844, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 1.5191139143142836, | |
| "eval_loss": 1.4816969633102417, | |
| "eval_runtime": 17.826, | |
| "eval_samples_per_second": 56.098, | |
| "eval_steps_per_second": 14.024, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 1.5229597723252057, | |
| "grad_norm": 1.8164838552474976, | |
| "learning_rate": 9.980520998363765e-05, | |
| "loss": 1.4233, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 1.5268056303361282, | |
| "grad_norm": 1.5884016752243042, | |
| "learning_rate": 9.954548996182115e-05, | |
| "loss": 1.4313, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 1.5306514883470501, | |
| "grad_norm": 1.5381648540496826, | |
| "learning_rate": 9.928576994000468e-05, | |
| "loss": 1.4789, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.5344973463579725, | |
| "grad_norm": 1.6448626518249512, | |
| "learning_rate": 9.90260499181882e-05, | |
| "loss": 1.3756, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 1.5383432043688947, | |
| "grad_norm": 1.6137230396270752, | |
| "learning_rate": 9.876632989637172e-05, | |
| "loss": 1.4171, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.5383432043688947, | |
| "eval_loss": 1.4770597219467163, | |
| "eval_runtime": 17.9365, | |
| "eval_samples_per_second": 55.752, | |
| "eval_steps_per_second": 13.938, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.542189062379817, | |
| "grad_norm": 1.7058050632476807, | |
| "learning_rate": 9.850660987455523e-05, | |
| "loss": 1.4646, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 1.5460349203907393, | |
| "grad_norm": 2.2624917030334473, | |
| "learning_rate": 9.824688985273876e-05, | |
| "loss": 1.4586, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.5498807784016613, | |
| "grad_norm": 2.216883420944214, | |
| "learning_rate": 9.798716983092227e-05, | |
| "loss": 1.4376, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 1.5537266364125837, | |
| "grad_norm": 1.9749584197998047, | |
| "learning_rate": 9.772744980910578e-05, | |
| "loss": 1.47, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.5575724944235059, | |
| "grad_norm": 2.185480833053589, | |
| "learning_rate": 9.746772978728931e-05, | |
| "loss": 1.39, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 1.5575724944235059, | |
| "eval_loss": 1.4724150896072388, | |
| "eval_runtime": 17.8191, | |
| "eval_samples_per_second": 56.12, | |
| "eval_steps_per_second": 14.03, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 1.561418352434428, | |
| "grad_norm": 1.5810290575027466, | |
| "learning_rate": 9.720800976547284e-05, | |
| "loss": 1.4267, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.5652642104453505, | |
| "grad_norm": 2.0329344272613525, | |
| "learning_rate": 9.694828974365633e-05, | |
| "loss": 1.5066, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 1.5691100684562724, | |
| "grad_norm": 2.7335126399993896, | |
| "learning_rate": 9.668856972183986e-05, | |
| "loss": 1.4749, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 1.5729559264671948, | |
| "grad_norm": 1.0576220750808716, | |
| "learning_rate": 9.642884970002339e-05, | |
| "loss": 1.4318, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 1.576801784478117, | |
| "grad_norm": 0.6857870817184448, | |
| "learning_rate": 9.616912967820689e-05, | |
| "loss": 1.3567, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.576801784478117, | |
| "eval_loss": 1.479669213294983, | |
| "eval_runtime": 17.8378, | |
| "eval_samples_per_second": 56.061, | |
| "eval_steps_per_second": 14.015, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.5806476424890392, | |
| "grad_norm": 1.430114507675171, | |
| "learning_rate": 9.590940965639041e-05, | |
| "loss": 1.3714, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 1.5844935004999616, | |
| "grad_norm": 1.7613717317581177, | |
| "learning_rate": 9.564968963457394e-05, | |
| "loss": 1.4285, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 1.5883393585108838, | |
| "grad_norm": 1.3678529262542725, | |
| "learning_rate": 9.538996961275745e-05, | |
| "loss": 1.4463, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 1.592185216521806, | |
| "grad_norm": 3.1135504245758057, | |
| "learning_rate": 9.513024959094097e-05, | |
| "loss": 1.4287, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 1.5960310745327284, | |
| "grad_norm": 1.7866570949554443, | |
| "learning_rate": 9.487052956912449e-05, | |
| "loss": 1.4515, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 1.5960310745327284, | |
| "eval_loss": 1.4817472696304321, | |
| "eval_runtime": 17.9267, | |
| "eval_samples_per_second": 55.783, | |
| "eval_steps_per_second": 13.946, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 1.5998769325436504, | |
| "grad_norm": 1.2406786680221558, | |
| "learning_rate": 9.4610809547308e-05, | |
| "loss": 1.5364, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.6037227905545728, | |
| "grad_norm": 1.9183951616287231, | |
| "learning_rate": 9.435108952549152e-05, | |
| "loss": 1.5169, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 1.607568648565495, | |
| "grad_norm": 1.852089524269104, | |
| "learning_rate": 9.409136950367504e-05, | |
| "loss": 1.4668, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 1.6114145065764172, | |
| "grad_norm": 2.3430335521698, | |
| "learning_rate": 9.383164948185856e-05, | |
| "loss": 1.4297, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 1.6152603645873396, | |
| "grad_norm": 1.9646743535995483, | |
| "learning_rate": 9.357192946004207e-05, | |
| "loss": 1.5143, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.6152603645873396, | |
| "eval_loss": 1.4690666198730469, | |
| "eval_runtime": 17.7563, | |
| "eval_samples_per_second": 56.318, | |
| "eval_steps_per_second": 14.079, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.6191062225982615, | |
| "grad_norm": 1.8204776048660278, | |
| "learning_rate": 9.33122094382256e-05, | |
| "loss": 1.4423, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 1.622952080609184, | |
| "grad_norm": 1.6150448322296143, | |
| "learning_rate": 9.305248941640912e-05, | |
| "loss": 1.4368, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 1.6267979386201061, | |
| "grad_norm": 2.4393765926361084, | |
| "learning_rate": 9.279276939459264e-05, | |
| "loss": 1.6115, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 1.6306437966310283, | |
| "grad_norm": 1.217399001121521, | |
| "learning_rate": 9.253304937277615e-05, | |
| "loss": 1.5792, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 1.6344896546419507, | |
| "grad_norm": 1.4467514753341675, | |
| "learning_rate": 9.227332935095967e-05, | |
| "loss": 1.4838, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 1.6344896546419507, | |
| "eval_loss": 1.4741238355636597, | |
| "eval_runtime": 17.7656, | |
| "eval_samples_per_second": 56.289, | |
| "eval_steps_per_second": 14.072, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 1.6383355126528727, | |
| "grad_norm": 1.4274511337280273, | |
| "learning_rate": 9.201360932914319e-05, | |
| "loss": 1.4747, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 1.642181370663795, | |
| "grad_norm": 1.5080102682113647, | |
| "learning_rate": 9.175908370776304e-05, | |
| "loss": 1.4262, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 1.6460272286747173, | |
| "grad_norm": 1.2717032432556152, | |
| "learning_rate": 9.149936368594655e-05, | |
| "loss": 1.4418, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 1.6498730866856395, | |
| "grad_norm": 1.3594080209732056, | |
| "learning_rate": 9.123964366413006e-05, | |
| "loss": 1.5184, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 1.6537189446965619, | |
| "grad_norm": 1.896607518196106, | |
| "learning_rate": 9.097992364231359e-05, | |
| "loss": 1.4702, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.6537189446965619, | |
| "eval_loss": 1.4777735471725464, | |
| "eval_runtime": 17.8161, | |
| "eval_samples_per_second": 56.129, | |
| "eval_steps_per_second": 14.032, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.657564802707484, | |
| "grad_norm": 1.3839844465255737, | |
| "learning_rate": 9.072020362049712e-05, | |
| "loss": 1.522, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 1.6614106607184063, | |
| "grad_norm": 1.187853455543518, | |
| "learning_rate": 9.046048359868063e-05, | |
| "loss": 1.4469, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 1.6652565187293287, | |
| "grad_norm": 2.8398866653442383, | |
| "learning_rate": 9.020076357686414e-05, | |
| "loss": 1.4875, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 1.6691023767402506, | |
| "grad_norm": 1.801963448524475, | |
| "learning_rate": 8.994104355504767e-05, | |
| "loss": 1.3892, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 1.672948234751173, | |
| "grad_norm": 1.7727116346359253, | |
| "learning_rate": 8.968132353323118e-05, | |
| "loss": 1.3886, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 1.672948234751173, | |
| "eval_loss": 1.4662117958068848, | |
| "eval_runtime": 17.7809, | |
| "eval_samples_per_second": 56.24, | |
| "eval_steps_per_second": 14.06, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 1.6767940927620952, | |
| "grad_norm": 1.7052053213119507, | |
| "learning_rate": 8.94216035114147e-05, | |
| "loss": 1.4483, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 1.6806399507730174, | |
| "grad_norm": 1.9812465906143188, | |
| "learning_rate": 8.916188348959822e-05, | |
| "loss": 1.4325, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 1.6844858087839398, | |
| "grad_norm": 1.2499721050262451, | |
| "learning_rate": 8.890216346778173e-05, | |
| "loss": 1.5246, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 1.6883316667948618, | |
| "grad_norm": 1.2503259181976318, | |
| "learning_rate": 8.864244344596525e-05, | |
| "loss": 1.4661, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 1.6921775248057842, | |
| "grad_norm": 2.201223134994507, | |
| "learning_rate": 8.838272342414877e-05, | |
| "loss": 1.5103, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.6921775248057842, | |
| "eval_loss": 1.470120906829834, | |
| "eval_runtime": 17.7715, | |
| "eval_samples_per_second": 56.27, | |
| "eval_steps_per_second": 14.067, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.6960233828167064, | |
| "grad_norm": 0.8225556015968323, | |
| "learning_rate": 8.81230034023323e-05, | |
| "loss": 1.4372, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 1.6998692408276286, | |
| "grad_norm": 1.495335578918457, | |
| "learning_rate": 8.78632833805158e-05, | |
| "loss": 1.4231, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 1.703715098838551, | |
| "grad_norm": 1.6913652420043945, | |
| "learning_rate": 8.760356335869933e-05, | |
| "loss": 1.4859, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 1.707560956849473, | |
| "grad_norm": 1.9825598001480103, | |
| "learning_rate": 8.734384333688285e-05, | |
| "loss": 1.4354, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 1.7114068148603954, | |
| "grad_norm": 2.212759017944336, | |
| "learning_rate": 8.708412331506635e-05, | |
| "loss": 1.3861, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 1.7114068148603954, | |
| "eval_loss": 1.4596961736679077, | |
| "eval_runtime": 17.8176, | |
| "eval_samples_per_second": 56.124, | |
| "eval_steps_per_second": 14.031, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 1.7152526728713176, | |
| "grad_norm": 1.4325975179672241, | |
| "learning_rate": 8.682440329324988e-05, | |
| "loss": 1.4805, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 1.7190985308822397, | |
| "grad_norm": 1.9796292781829834, | |
| "learning_rate": 8.65646832714334e-05, | |
| "loss": 1.3836, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 1.7229443888931621, | |
| "grad_norm": 1.6221562623977661, | |
| "learning_rate": 8.630496324961692e-05, | |
| "loss": 1.4176, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 1.7267902469040843, | |
| "grad_norm": 1.1431959867477417, | |
| "learning_rate": 8.604524322780043e-05, | |
| "loss": 1.4553, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 1.7306361049150065, | |
| "grad_norm": 1.1562083959579468, | |
| "learning_rate": 8.578552320598396e-05, | |
| "loss": 1.4489, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.7306361049150065, | |
| "eval_loss": 1.464021921157837, | |
| "eval_runtime": 17.8166, | |
| "eval_samples_per_second": 56.127, | |
| "eval_steps_per_second": 14.032, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.734481962925929, | |
| "grad_norm": 0.8531803488731384, | |
| "learning_rate": 8.552580318416747e-05, | |
| "loss": 1.4298, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 1.738327820936851, | |
| "grad_norm": 1.3987632989883423, | |
| "learning_rate": 8.526608316235098e-05, | |
| "loss": 1.4629, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 1.7421736789477733, | |
| "grad_norm": 1.4521870613098145, | |
| "learning_rate": 8.500636314053451e-05, | |
| "loss": 1.4005, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 1.7460195369586955, | |
| "grad_norm": 1.0557054281234741, | |
| "learning_rate": 8.474664311871803e-05, | |
| "loss": 1.4079, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 1.7498653949696177, | |
| "grad_norm": 1.5067927837371826, | |
| "learning_rate": 8.448692309690155e-05, | |
| "loss": 1.4664, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 1.7498653949696177, | |
| "eval_loss": 1.4728831052780151, | |
| "eval_runtime": 17.7456, | |
| "eval_samples_per_second": 56.352, | |
| "eval_steps_per_second": 14.088, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 1.75371125298054, | |
| "grad_norm": 1.3237221240997314, | |
| "learning_rate": 8.422720307508506e-05, | |
| "loss": 1.5266, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 1.757557110991462, | |
| "grad_norm": 1.4342700242996216, | |
| "learning_rate": 8.396748305326859e-05, | |
| "loss": 1.4929, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 1.7614029690023845, | |
| "grad_norm": 5.2202534675598145, | |
| "learning_rate": 8.37077630314521e-05, | |
| "loss": 1.457, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 1.7652488270133067, | |
| "grad_norm": 2.7584545612335205, | |
| "learning_rate": 8.344804300963561e-05, | |
| "loss": 1.4523, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 1.7690946850242288, | |
| "grad_norm": 1.8208624124526978, | |
| "learning_rate": 8.318832298781914e-05, | |
| "loss": 1.4746, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.7690946850242288, | |
| "eval_loss": 1.467396855354309, | |
| "eval_runtime": 17.663, | |
| "eval_samples_per_second": 56.616, | |
| "eval_steps_per_second": 14.154, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.7729405430351513, | |
| "grad_norm": 1.6651790142059326, | |
| "learning_rate": 8.292860296600265e-05, | |
| "loss": 1.4537, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 1.7767864010460732, | |
| "grad_norm": 1.5910587310791016, | |
| "learning_rate": 8.266888294418618e-05, | |
| "loss": 1.5305, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 1.7806322590569956, | |
| "grad_norm": 1.2711199522018433, | |
| "learning_rate": 8.240916292236969e-05, | |
| "loss": 1.4877, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 1.7844781170679178, | |
| "grad_norm": 1.7133463621139526, | |
| "learning_rate": 8.21494429005532e-05, | |
| "loss": 1.4547, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 1.78832397507884, | |
| "grad_norm": 1.6850301027297974, | |
| "learning_rate": 8.188972287873673e-05, | |
| "loss": 1.4228, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 1.78832397507884, | |
| "eval_loss": 1.4735645055770874, | |
| "eval_runtime": 17.7983, | |
| "eval_samples_per_second": 56.185, | |
| "eval_steps_per_second": 14.046, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 1.7921698330897624, | |
| "grad_norm": 1.1047898530960083, | |
| "learning_rate": 8.163000285692024e-05, | |
| "loss": 1.4817, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 1.7960156911006846, | |
| "grad_norm": 1.8970743417739868, | |
| "learning_rate": 8.137028283510376e-05, | |
| "loss": 1.4884, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 1.7998615491116068, | |
| "grad_norm": 1.4730740785598755, | |
| "learning_rate": 8.111056281328728e-05, | |
| "loss": 1.4436, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 1.8037074071225292, | |
| "grad_norm": 1.8288697004318237, | |
| "learning_rate": 8.08508427914708e-05, | |
| "loss": 1.4614, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 1.8075532651334512, | |
| "grad_norm": 1.2339516878128052, | |
| "learning_rate": 8.059112276965432e-05, | |
| "loss": 1.4276, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.8075532651334512, | |
| "eval_loss": 1.4655163288116455, | |
| "eval_runtime": 17.9781, | |
| "eval_samples_per_second": 55.623, | |
| "eval_steps_per_second": 13.906, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.8113991231443736, | |
| "grad_norm": 1.8125578165054321, | |
| "learning_rate": 8.033140274783783e-05, | |
| "loss": 1.4318, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 1.8152449811552958, | |
| "grad_norm": 1.941846489906311, | |
| "learning_rate": 8.007168272602136e-05, | |
| "loss": 1.4569, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 1.819090839166218, | |
| "grad_norm": 2.054161787033081, | |
| "learning_rate": 7.981196270420487e-05, | |
| "loss": 1.4426, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 1.8229366971771404, | |
| "grad_norm": 1.6102700233459473, | |
| "learning_rate": 7.955224268238839e-05, | |
| "loss": 1.4407, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 1.8267825551880623, | |
| "grad_norm": 1.7302616834640503, | |
| "learning_rate": 7.929252266057191e-05, | |
| "loss": 1.4432, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 1.8267825551880623, | |
| "eval_loss": 1.4570631980895996, | |
| "eval_runtime": 17.8893, | |
| "eval_samples_per_second": 55.899, | |
| "eval_steps_per_second": 13.975, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 1.8306284131989847, | |
| "grad_norm": 9.521604537963867, | |
| "learning_rate": 7.903280263875543e-05, | |
| "loss": 1.3995, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 1.834474271209907, | |
| "grad_norm": 1.9546847343444824, | |
| "learning_rate": 7.877308261693894e-05, | |
| "loss": 1.4957, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 1.838320129220829, | |
| "grad_norm": 1.173722505569458, | |
| "learning_rate": 7.851336259512246e-05, | |
| "loss": 1.4157, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 1.8421659872317515, | |
| "grad_norm": 1.5322128534317017, | |
| "learning_rate": 7.825364257330599e-05, | |
| "loss": 1.4453, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 1.8460118452426735, | |
| "grad_norm": 1.0376055240631104, | |
| "learning_rate": 7.799392255148949e-05, | |
| "loss": 1.4646, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.8460118452426735, | |
| "eval_loss": 1.4612687826156616, | |
| "eval_runtime": 17.984, | |
| "eval_samples_per_second": 55.605, | |
| "eval_steps_per_second": 13.901, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.8498577032535959, | |
| "grad_norm": 1.5047483444213867, | |
| "learning_rate": 7.773420252967302e-05, | |
| "loss": 1.4546, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 1.853703561264518, | |
| "grad_norm": 1.0463405847549438, | |
| "learning_rate": 7.747448250785654e-05, | |
| "loss": 1.5014, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 1.8575494192754403, | |
| "grad_norm": 1.8368524312973022, | |
| "learning_rate": 7.721476248604004e-05, | |
| "loss": 1.4616, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 1.8613952772863627, | |
| "grad_norm": 1.4084677696228027, | |
| "learning_rate": 7.695504246422357e-05, | |
| "loss": 1.4255, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 1.8652411352972849, | |
| "grad_norm": 1.2279951572418213, | |
| "learning_rate": 7.66953224424071e-05, | |
| "loss": 1.4254, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 1.8652411352972849, | |
| "eval_loss": 1.460336685180664, | |
| "eval_runtime": 17.7185, | |
| "eval_samples_per_second": 56.438, | |
| "eval_steps_per_second": 14.11, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 1.869086993308207, | |
| "grad_norm": 1.9729641675949097, | |
| "learning_rate": 7.643560242059061e-05, | |
| "loss": 1.4656, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 1.8729328513191295, | |
| "grad_norm": 0.9121168255805969, | |
| "learning_rate": 7.617588239877412e-05, | |
| "loss": 1.3949, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 1.8767787093300514, | |
| "grad_norm": 1.8953206539154053, | |
| "learning_rate": 7.591616237695765e-05, | |
| "loss": 1.4006, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 1.8806245673409738, | |
| "grad_norm": 1.5828944444656372, | |
| "learning_rate": 7.565644235514116e-05, | |
| "loss": 1.5085, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 1.884470425351896, | |
| "grad_norm": 2.027841329574585, | |
| "learning_rate": 7.539672233332467e-05, | |
| "loss": 1.3978, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.884470425351896, | |
| "eval_loss": 1.445096731185913, | |
| "eval_runtime": 18.4795, | |
| "eval_samples_per_second": 54.114, | |
| "eval_steps_per_second": 13.528, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.8883162833628182, | |
| "grad_norm": 3.0785481929779053, | |
| "learning_rate": 7.51370023115082e-05, | |
| "loss": 1.493, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 1.8921621413737406, | |
| "grad_norm": 1.2793898582458496, | |
| "learning_rate": 7.487728228969173e-05, | |
| "loss": 1.3865, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 1.8960079993846626, | |
| "grad_norm": 1.474137544631958, | |
| "learning_rate": 7.461756226787522e-05, | |
| "loss": 1.4316, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 1.899853857395585, | |
| "grad_norm": 1.270415186882019, | |
| "learning_rate": 7.435784224605875e-05, | |
| "loss": 1.3844, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 1.9036997154065072, | |
| "grad_norm": 1.3681602478027344, | |
| "learning_rate": 7.409812222424228e-05, | |
| "loss": 1.4202, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 1.9036997154065072, | |
| "eval_loss": 1.4705748558044434, | |
| "eval_runtime": 18.5933, | |
| "eval_samples_per_second": 53.783, | |
| "eval_steps_per_second": 13.446, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 1.9075455734174294, | |
| "grad_norm": 1.028225302696228, | |
| "learning_rate": 7.384359660286212e-05, | |
| "loss": 1.4658, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 1.9113914314283518, | |
| "grad_norm": 1.1178765296936035, | |
| "learning_rate": 7.358387658104564e-05, | |
| "loss": 1.4616, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 1.9152372894392737, | |
| "grad_norm": 2.1088242530822754, | |
| "learning_rate": 7.332415655922915e-05, | |
| "loss": 1.4916, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 1.9190831474501961, | |
| "grad_norm": 0.7096924781799316, | |
| "learning_rate": 7.306443653741267e-05, | |
| "loss": 1.3687, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 1.9229290054611183, | |
| "grad_norm": 1.4435713291168213, | |
| "learning_rate": 7.28047165155962e-05, | |
| "loss": 1.5273, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.9229290054611183, | |
| "eval_loss": 1.4638206958770752, | |
| "eval_runtime": 18.4856, | |
| "eval_samples_per_second": 54.096, | |
| "eval_steps_per_second": 13.524, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.9267748634720405, | |
| "grad_norm": 1.4893878698349, | |
| "learning_rate": 7.25449964937797e-05, | |
| "loss": 1.5067, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 1.930620721482963, | |
| "grad_norm": 0.8735935091972351, | |
| "learning_rate": 7.228527647196322e-05, | |
| "loss": 1.4671, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 1.9344665794938851, | |
| "grad_norm": 1.6086535453796387, | |
| "learning_rate": 7.202555645014675e-05, | |
| "loss": 1.4551, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 1.9383124375048073, | |
| "grad_norm": 0.683675229549408, | |
| "learning_rate": 7.176583642833027e-05, | |
| "loss": 1.4673, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 1.9421582955157297, | |
| "grad_norm": 1.9318158626556396, | |
| "learning_rate": 7.150611640651378e-05, | |
| "loss": 1.4199, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 1.9421582955157297, | |
| "eval_loss": 1.4574114084243774, | |
| "eval_runtime": 18.5222, | |
| "eval_samples_per_second": 53.989, | |
| "eval_steps_per_second": 13.497, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 1.9460041535266517, | |
| "grad_norm": 1.9871971607208252, | |
| "learning_rate": 7.12463963846973e-05, | |
| "loss": 1.5002, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 1.949850011537574, | |
| "grad_norm": 1.4302830696105957, | |
| "learning_rate": 7.098667636288082e-05, | |
| "loss": 1.46, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 1.9536958695484963, | |
| "grad_norm": 1.8389050960540771, | |
| "learning_rate": 7.072695634106434e-05, | |
| "loss": 1.4025, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 1.9575417275594185, | |
| "grad_norm": 1.7089191675186157, | |
| "learning_rate": 7.046723631924785e-05, | |
| "loss": 1.4507, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 1.9613875855703409, | |
| "grad_norm": 1.3698766231536865, | |
| "learning_rate": 7.020751629743138e-05, | |
| "loss": 1.4954, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.9613875855703409, | |
| "eval_loss": 1.454710841178894, | |
| "eval_runtime": 18.5708, | |
| "eval_samples_per_second": 53.848, | |
| "eval_steps_per_second": 13.462, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.9652334435812628, | |
| "grad_norm": 1.808030128479004, | |
| "learning_rate": 6.994779627561489e-05, | |
| "loss": 1.4254, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 1.9690793015921852, | |
| "grad_norm": 1.634099006652832, | |
| "learning_rate": 6.96880762537984e-05, | |
| "loss": 1.3325, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 1.9729251596031074, | |
| "grad_norm": 0.8857108354568481, | |
| "learning_rate": 6.942835623198193e-05, | |
| "loss": 1.3983, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 1.9767710176140296, | |
| "grad_norm": 1.6114498376846313, | |
| "learning_rate": 6.916863621016545e-05, | |
| "loss": 1.3348, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 1.980616875624952, | |
| "grad_norm": 1.4415462017059326, | |
| "learning_rate": 6.890891618834895e-05, | |
| "loss": 1.4617, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 1.980616875624952, | |
| "eval_loss": 1.4457746744155884, | |
| "eval_runtime": 17.9567, | |
| "eval_samples_per_second": 55.689, | |
| "eval_steps_per_second": 13.922, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 1.984462733635874, | |
| "grad_norm": 1.9614554643630981, | |
| "learning_rate": 6.864919616653248e-05, | |
| "loss": 1.3813, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 1.9883085916467964, | |
| "grad_norm": 1.2938437461853027, | |
| "learning_rate": 6.8389476144716e-05, | |
| "loss": 1.4047, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 1.9921544496577186, | |
| "grad_norm": 2.1129326820373535, | |
| "learning_rate": 6.812975612289952e-05, | |
| "loss": 1.4362, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 1.9960003076686408, | |
| "grad_norm": 0.8634279370307922, | |
| "learning_rate": 6.787003610108303e-05, | |
| "loss": 1.4805, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 1.9998461656795632, | |
| "grad_norm": 2.995699405670166, | |
| "learning_rate": 6.761031607926656e-05, | |
| "loss": 1.5073, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.9998461656795632, | |
| "eval_loss": 1.453719973564148, | |
| "eval_runtime": 18.0595, | |
| "eval_samples_per_second": 55.373, | |
| "eval_steps_per_second": 13.843, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.003692023690485, | |
| "grad_norm": 1.621793508529663, | |
| "learning_rate": 6.73557904578864e-05, | |
| "loss": 1.369, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 2.0075378817014076, | |
| "grad_norm": 2.223520278930664, | |
| "learning_rate": 6.709607043606992e-05, | |
| "loss": 1.4269, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 2.01138373971233, | |
| "grad_norm": 1.4860827922821045, | |
| "learning_rate": 6.683635041425344e-05, | |
| "loss": 1.3634, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 2.015229597723252, | |
| "grad_norm": 2.0796148777008057, | |
| "learning_rate": 6.657663039243696e-05, | |
| "loss": 1.4233, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 2.0190754557341744, | |
| "grad_norm": 1.6398444175720215, | |
| "learning_rate": 6.631691037062047e-05, | |
| "loss": 1.4058, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 2.0190754557341744, | |
| "eval_loss": 1.4733901023864746, | |
| "eval_runtime": 18.0349, | |
| "eval_samples_per_second": 55.448, | |
| "eval_steps_per_second": 13.862, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 2.0229213137450963, | |
| "grad_norm": 1.7550077438354492, | |
| "learning_rate": 6.605719034880399e-05, | |
| "loss": 1.4436, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 2.0267671717560187, | |
| "grad_norm": 2.3273561000823975, | |
| "learning_rate": 6.579747032698751e-05, | |
| "loss": 1.356, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 2.030613029766941, | |
| "grad_norm": 1.1432509422302246, | |
| "learning_rate": 6.553775030517103e-05, | |
| "loss": 1.4116, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 2.034458887777863, | |
| "grad_norm": 1.2345376014709473, | |
| "learning_rate": 6.527803028335455e-05, | |
| "loss": 1.4465, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 2.0383047457887855, | |
| "grad_norm": 1.485564112663269, | |
| "learning_rate": 6.501831026153807e-05, | |
| "loss": 1.3896, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.0383047457887855, | |
| "eval_loss": 1.4492217302322388, | |
| "eval_runtime": 17.9114, | |
| "eval_samples_per_second": 55.83, | |
| "eval_steps_per_second": 13.958, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.042150603799708, | |
| "grad_norm": 1.0810669660568237, | |
| "learning_rate": 6.475859023972158e-05, | |
| "loss": 1.2936, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 2.04599646181063, | |
| "grad_norm": 1.23382568359375, | |
| "learning_rate": 6.44988702179051e-05, | |
| "loss": 1.3922, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 2.0498423198215523, | |
| "grad_norm": 1.8218950033187866, | |
| "learning_rate": 6.423915019608862e-05, | |
| "loss": 1.4041, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 2.0536881778324743, | |
| "grad_norm": 0.6482899785041809, | |
| "learning_rate": 6.397943017427213e-05, | |
| "loss": 1.3609, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 2.0575340358433967, | |
| "grad_norm": 1.9538156986236572, | |
| "learning_rate": 6.371971015245566e-05, | |
| "loss": 1.4558, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 2.0575340358433967, | |
| "eval_loss": 1.454428791999817, | |
| "eval_runtime": 18.2441, | |
| "eval_samples_per_second": 54.812, | |
| "eval_steps_per_second": 13.703, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 2.061379893854319, | |
| "grad_norm": 1.087234377861023, | |
| "learning_rate": 6.345999013063917e-05, | |
| "loss": 1.3967, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 2.065225751865241, | |
| "grad_norm": 1.638533592224121, | |
| "learning_rate": 6.32002701088227e-05, | |
| "loss": 1.4558, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 2.0690716098761635, | |
| "grad_norm": 1.4552900791168213, | |
| "learning_rate": 6.294055008700621e-05, | |
| "loss": 1.4553, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 2.0729174678870854, | |
| "grad_norm": 1.6330054998397827, | |
| "learning_rate": 6.268083006518974e-05, | |
| "loss": 1.4977, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 2.076763325898008, | |
| "grad_norm": 1.497938632965088, | |
| "learning_rate": 6.242111004337325e-05, | |
| "loss": 1.4449, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.076763325898008, | |
| "eval_loss": 1.4610888957977295, | |
| "eval_runtime": 17.9657, | |
| "eval_samples_per_second": 55.662, | |
| "eval_steps_per_second": 13.915, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.0806091839089302, | |
| "grad_norm": 0.9695401191711426, | |
| "learning_rate": 6.216139002155676e-05, | |
| "loss": 1.4943, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 2.084455041919852, | |
| "grad_norm": 1.243717074394226, | |
| "learning_rate": 6.190166999974029e-05, | |
| "loss": 1.4199, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 2.0883008999307746, | |
| "grad_norm": 1.6151024103164673, | |
| "learning_rate": 6.16419499779238e-05, | |
| "loss": 1.4248, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 2.0921467579416966, | |
| "grad_norm": 1.7448607683181763, | |
| "learning_rate": 6.138222995610731e-05, | |
| "loss": 1.3944, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 2.095992615952619, | |
| "grad_norm": 3.0997345447540283, | |
| "learning_rate": 6.112250993429084e-05, | |
| "loss": 1.4174, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 2.095992615952619, | |
| "eval_loss": 1.4659229516983032, | |
| "eval_runtime": 18.0653, | |
| "eval_samples_per_second": 55.355, | |
| "eval_steps_per_second": 13.839, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 2.0998384739635414, | |
| "grad_norm": 1.637845754623413, | |
| "learning_rate": 6.086278991247436e-05, | |
| "loss": 1.446, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 2.1036843319744634, | |
| "grad_norm": 1.4263664484024048, | |
| "learning_rate": 6.060306989065787e-05, | |
| "loss": 0.9793, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 2.1075301899853858, | |
| "grad_norm": 0.8709418773651123, | |
| "learning_rate": 6.034334986884139e-05, | |
| "loss": 0.9715, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 2.111376047996308, | |
| "grad_norm": 0.8483341336250305, | |
| "learning_rate": 6.008362984702491e-05, | |
| "loss": 0.9163, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 2.11522190600723, | |
| "grad_norm": 2.070937156677246, | |
| "learning_rate": 5.9823909825208425e-05, | |
| "loss": 1.0047, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 2.11522190600723, | |
| "eval_loss": 1.0159448385238647, | |
| "eval_runtime": 17.7021, | |
| "eval_samples_per_second": 56.49, | |
| "eval_steps_per_second": 14.123, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 2.1190677640181526, | |
| "grad_norm": 0.9638277292251587, | |
| "learning_rate": 5.9564189803391944e-05, | |
| "loss": 0.9753, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 2.1229136220290745, | |
| "grad_norm": 1.1322181224822998, | |
| "learning_rate": 5.9304469781575464e-05, | |
| "loss": 0.948, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 2.126759480039997, | |
| "grad_norm": 1.144047737121582, | |
| "learning_rate": 5.904474975975898e-05, | |
| "loss": 1.009, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 2.1306053380509193, | |
| "grad_norm": 2.713625431060791, | |
| "learning_rate": 5.8785029737942496e-05, | |
| "loss": 1.0069, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 2.1344511960618413, | |
| "grad_norm": 1.1035822629928589, | |
| "learning_rate": 5.852530971612602e-05, | |
| "loss": 1.0013, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 2.1344511960618413, | |
| "eval_loss": 1.0144418478012085, | |
| "eval_runtime": 17.6881, | |
| "eval_samples_per_second": 56.535, | |
| "eval_steps_per_second": 14.134, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 2.1382970540727637, | |
| "grad_norm": 1.2658100128173828, | |
| "learning_rate": 5.826558969430954e-05, | |
| "loss": 1.0185, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 2.1421429120836857, | |
| "grad_norm": 0.9421238303184509, | |
| "learning_rate": 5.800586967249305e-05, | |
| "loss": 0.9892, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 2.145988770094608, | |
| "grad_norm": 0.9409565925598145, | |
| "learning_rate": 5.7746149650676575e-05, | |
| "loss": 0.9985, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 2.1498346281055305, | |
| "grad_norm": 1.445890188217163, | |
| "learning_rate": 5.7486429628860094e-05, | |
| "loss": 1.0147, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 2.1536804861164525, | |
| "grad_norm": 1.109020709991455, | |
| "learning_rate": 5.722670960704361e-05, | |
| "loss": 1.0093, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 2.1536804861164525, | |
| "eval_loss": 1.008616328239441, | |
| "eval_runtime": 17.6489, | |
| "eval_samples_per_second": 56.661, | |
| "eval_steps_per_second": 14.165, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 2.157526344127375, | |
| "grad_norm": 1.3012685775756836, | |
| "learning_rate": 5.696698958522713e-05, | |
| "loss": 1.0703, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 2.1613722021382973, | |
| "grad_norm": 0.8459142446517944, | |
| "learning_rate": 5.6707269563410646e-05, | |
| "loss": 1.0507, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 2.1652180601492192, | |
| "grad_norm": 0.6889505982398987, | |
| "learning_rate": 5.6447549541594166e-05, | |
| "loss": 1.0056, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 2.1690639181601417, | |
| "grad_norm": 1.2236456871032715, | |
| "learning_rate": 5.618782951977768e-05, | |
| "loss": 0.988, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 2.1729097761710636, | |
| "grad_norm": 1.3419203758239746, | |
| "learning_rate": 5.59281094979612e-05, | |
| "loss": 0.9467, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 2.1729097761710636, | |
| "eval_loss": 1.0140153169631958, | |
| "eval_runtime": 17.7299, | |
| "eval_samples_per_second": 56.402, | |
| "eval_steps_per_second": 14.1, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 2.176755634181986, | |
| "grad_norm": 0.8394871354103088, | |
| "learning_rate": 5.5668389476144725e-05, | |
| "loss": 0.9824, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 2.1806014921929084, | |
| "grad_norm": 0.7906908392906189, | |
| "learning_rate": 5.540866945432823e-05, | |
| "loss": 0.9915, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 2.1844473502038304, | |
| "grad_norm": 1.0852785110473633, | |
| "learning_rate": 5.514894943251175e-05, | |
| "loss": 0.9473, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 2.188293208214753, | |
| "grad_norm": 0.9187583923339844, | |
| "learning_rate": 5.488922941069528e-05, | |
| "loss": 0.9418, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 2.192139066225675, | |
| "grad_norm": 0.7200838923454285, | |
| "learning_rate": 5.4629509388878797e-05, | |
| "loss": 0.9776, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 2.192139066225675, | |
| "eval_loss": 1.0052642822265625, | |
| "eval_runtime": 17.5069, | |
| "eval_samples_per_second": 57.12, | |
| "eval_steps_per_second": 14.28, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 2.195984924236597, | |
| "grad_norm": 1.00751793384552, | |
| "learning_rate": 5.436978936706231e-05, | |
| "loss": 0.9479, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 2.1998307822475196, | |
| "grad_norm": 0.9239784479141235, | |
| "learning_rate": 5.411006934524583e-05, | |
| "loss": 0.9896, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 2.2036766402584416, | |
| "grad_norm": 0.9650816917419434, | |
| "learning_rate": 5.385034932342935e-05, | |
| "loss": 1.0081, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 2.207522498269364, | |
| "grad_norm": 1.455723524093628, | |
| "learning_rate": 5.359062930161286e-05, | |
| "loss": 0.9717, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 2.211368356280286, | |
| "grad_norm": 0.609380304813385, | |
| "learning_rate": 5.333090927979638e-05, | |
| "loss": 0.9774, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 2.211368356280286, | |
| "eval_loss": 1.0018259286880493, | |
| "eval_runtime": 17.3475, | |
| "eval_samples_per_second": 57.645, | |
| "eval_steps_per_second": 14.411, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 2.2152142142912084, | |
| "grad_norm": 0.9981700778007507, | |
| "learning_rate": 5.30711892579799e-05, | |
| "loss": 0.9873, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 2.2190600723021308, | |
| "grad_norm": 0.5452422499656677, | |
| "learning_rate": 5.2811469236163414e-05, | |
| "loss": 1.0078, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 2.2229059303130527, | |
| "grad_norm": 1.4739840030670166, | |
| "learning_rate": 5.255174921434693e-05, | |
| "loss": 0.9739, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 2.226751788323975, | |
| "grad_norm": 0.9250359535217285, | |
| "learning_rate": 5.229202919253045e-05, | |
| "loss": 0.9723, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 2.230597646334897, | |
| "grad_norm": 1.0843122005462646, | |
| "learning_rate": 5.203230917071398e-05, | |
| "loss": 0.989, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 2.230597646334897, | |
| "eval_loss": 0.9999537467956543, | |
| "eval_runtime": 17.2417, | |
| "eval_samples_per_second": 57.999, | |
| "eval_steps_per_second": 14.5, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 2.2344435043458195, | |
| "grad_norm": 0.5559306740760803, | |
| "learning_rate": 5.1772589148897485e-05, | |
| "loss": 0.9749, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 2.238289362356742, | |
| "grad_norm": 0.6720598936080933, | |
| "learning_rate": 5.151286912708101e-05, | |
| "loss": 0.9662, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 2.242135220367664, | |
| "grad_norm": 1.117200493812561, | |
| "learning_rate": 5.125314910526453e-05, | |
| "loss": 0.9552, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 2.2459810783785863, | |
| "grad_norm": 0.6818645000457764, | |
| "learning_rate": 5.0993429083448044e-05, | |
| "loss": 0.9032, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 2.2498269363895087, | |
| "grad_norm": 0.9796412587165833, | |
| "learning_rate": 5.0733709061631564e-05, | |
| "loss": 0.9429, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 2.2498269363895087, | |
| "eval_loss": 1.0121312141418457, | |
| "eval_runtime": 17.3211, | |
| "eval_samples_per_second": 57.733, | |
| "eval_steps_per_second": 14.433, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 2.2536727944004307, | |
| "grad_norm": 1.021713137626648, | |
| "learning_rate": 5.047398903981508e-05, | |
| "loss": 0.979, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 2.257518652411353, | |
| "grad_norm": 1.1321250200271606, | |
| "learning_rate": 5.02142690179986e-05, | |
| "loss": 1.0327, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 2.261364510422275, | |
| "grad_norm": 0.7670277953147888, | |
| "learning_rate": 4.9954548996182116e-05, | |
| "loss": 0.9668, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 2.2652103684331975, | |
| "grad_norm": 1.447698712348938, | |
| "learning_rate": 4.9694828974365635e-05, | |
| "loss": 0.9288, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 2.26905622644412, | |
| "grad_norm": 1.0438776016235352, | |
| "learning_rate": 4.9435108952549155e-05, | |
| "loss": 1.0154, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.26905622644412, | |
| "eval_loss": 1.0100510120391846, | |
| "eval_runtime": 17.4293, | |
| "eval_samples_per_second": 57.375, | |
| "eval_steps_per_second": 14.344, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.272902084455042, | |
| "grad_norm": 0.6814424991607666, | |
| "learning_rate": 4.9175388930732675e-05, | |
| "loss": 0.9563, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 2.2767479424659642, | |
| "grad_norm": 0.85086989402771, | |
| "learning_rate": 4.891566890891619e-05, | |
| "loss": 0.9778, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 2.280593800476886, | |
| "grad_norm": 1.1364527940750122, | |
| "learning_rate": 4.8655948887099714e-05, | |
| "loss": 0.996, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 2.2844396584878086, | |
| "grad_norm": 0.7810873985290527, | |
| "learning_rate": 4.839622886528323e-05, | |
| "loss": 1.0141, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 2.288285516498731, | |
| "grad_norm": 1.02638578414917, | |
| "learning_rate": 4.8136508843466746e-05, | |
| "loss": 0.97, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 2.288285516498731, | |
| "eval_loss": 1.0048705339431763, | |
| "eval_runtime": 17.487, | |
| "eval_samples_per_second": 57.185, | |
| "eval_steps_per_second": 14.296, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 2.292131374509653, | |
| "grad_norm": 1.334876298904419, | |
| "learning_rate": 4.7876788821650266e-05, | |
| "loss": 0.9533, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 2.2959772325205754, | |
| "grad_norm": 1.3690305948257446, | |
| "learning_rate": 4.761706879983378e-05, | |
| "loss": 0.992, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 2.299823090531498, | |
| "grad_norm": 1.525981068611145, | |
| "learning_rate": 4.7357348778017305e-05, | |
| "loss": 0.949, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 2.3036689485424198, | |
| "grad_norm": 0.6612236499786377, | |
| "learning_rate": 4.709762875620082e-05, | |
| "loss": 1.0112, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 2.307514806553342, | |
| "grad_norm": 0.8452871441841125, | |
| "learning_rate": 4.683790873438433e-05, | |
| "loss": 0.9858, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.307514806553342, | |
| "eval_loss": 0.9945911169052124, | |
| "eval_runtime": 17.2501, | |
| "eval_samples_per_second": 57.971, | |
| "eval_steps_per_second": 14.493, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.311360664564264, | |
| "grad_norm": 1.3450604677200317, | |
| "learning_rate": 4.657818871256786e-05, | |
| "loss": 0.973, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 2.3152065225751866, | |
| "grad_norm": 1.1462957859039307, | |
| "learning_rate": 4.631846869075137e-05, | |
| "loss": 0.9789, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 2.3190523805861085, | |
| "grad_norm": 1.0691301822662354, | |
| "learning_rate": 4.605874866893489e-05, | |
| "loss": 0.9937, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 2.322898238597031, | |
| "grad_norm": 1.527723789215088, | |
| "learning_rate": 4.579902864711841e-05, | |
| "loss": 0.9276, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 2.3267440966079533, | |
| "grad_norm": 0.7396986484527588, | |
| "learning_rate": 4.553930862530192e-05, | |
| "loss": 0.9648, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 2.3267440966079533, | |
| "eval_loss": 1.0029717683792114, | |
| "eval_runtime": 17.2662, | |
| "eval_samples_per_second": 57.917, | |
| "eval_steps_per_second": 14.479, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 2.3305899546188753, | |
| "grad_norm": 0.9694539308547974, | |
| "learning_rate": 4.527958860348545e-05, | |
| "loss": 0.9378, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 2.3344358126297977, | |
| "grad_norm": 1.161028504371643, | |
| "learning_rate": 4.501986858166896e-05, | |
| "loss": 0.9807, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 2.33828167064072, | |
| "grad_norm": 1.6975845098495483, | |
| "learning_rate": 4.476014855985248e-05, | |
| "loss": 0.9809, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 2.342127528651642, | |
| "grad_norm": 1.3228962421417236, | |
| "learning_rate": 4.4500428538036e-05, | |
| "loss": 1.0339, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 2.3459733866625645, | |
| "grad_norm": 1.0168397426605225, | |
| "learning_rate": 4.424070851621952e-05, | |
| "loss": 0.9544, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 2.3459733866625645, | |
| "eval_loss": 1.002646565437317, | |
| "eval_runtime": 17.3363, | |
| "eval_samples_per_second": 57.682, | |
| "eval_steps_per_second": 14.421, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 2.3498192446734865, | |
| "grad_norm": 0.9951680302619934, | |
| "learning_rate": 4.398098849440303e-05, | |
| "loss": 1.0062, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 2.353665102684409, | |
| "grad_norm": 0.5752933025360107, | |
| "learning_rate": 4.372126847258655e-05, | |
| "loss": 0.9649, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 2.3575109606953313, | |
| "grad_norm": 1.0051320791244507, | |
| "learning_rate": 4.346154845077007e-05, | |
| "loss": 0.9958, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 2.3613568187062532, | |
| "grad_norm": 0.7760717868804932, | |
| "learning_rate": 4.320182842895359e-05, | |
| "loss": 1.0125, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 2.3652026767171757, | |
| "grad_norm": 0.852301836013794, | |
| "learning_rate": 4.294210840713711e-05, | |
| "loss": 0.9523, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 2.3652026767171757, | |
| "eval_loss": 1.003655195236206, | |
| "eval_runtime": 17.3241, | |
| "eval_samples_per_second": 57.723, | |
| "eval_steps_per_second": 14.431, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 2.3690485347280976, | |
| "grad_norm": 0.9062100648880005, | |
| "learning_rate": 4.2682388385320624e-05, | |
| "loss": 0.9735, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 2.37289439273902, | |
| "grad_norm": 1.309615969657898, | |
| "learning_rate": 4.2427862763940476e-05, | |
| "loss": 0.9668, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 2.3767402507499424, | |
| "grad_norm": 1.0907591581344604, | |
| "learning_rate": 4.2168142742123995e-05, | |
| "loss": 0.9502, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 2.3805861087608644, | |
| "grad_norm": 1.0946288108825684, | |
| "learning_rate": 4.190842272030751e-05, | |
| "loss": 0.9545, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 2.384431966771787, | |
| "grad_norm": 1.225540280342102, | |
| "learning_rate": 4.164870269849103e-05, | |
| "loss": 0.9635, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.384431966771787, | |
| "eval_loss": 1.0031681060791016, | |
| "eval_runtime": 17.2062, | |
| "eval_samples_per_second": 58.119, | |
| "eval_steps_per_second": 14.53, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.3882778247827092, | |
| "grad_norm": 0.8717153668403625, | |
| "learning_rate": 4.138898267667455e-05, | |
| "loss": 1.0038, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 2.392123682793631, | |
| "grad_norm": 1.007270097732544, | |
| "learning_rate": 4.112926265485807e-05, | |
| "loss": 0.9858, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 2.3959695408045536, | |
| "grad_norm": 1.9409807920455933, | |
| "learning_rate": 4.0869542633041587e-05, | |
| "loss": 1.004, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 2.3998153988154756, | |
| "grad_norm": 0.6027572154998779, | |
| "learning_rate": 4.06098226112251e-05, | |
| "loss": 0.9958, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 2.403661256826398, | |
| "grad_norm": 0.8274515867233276, | |
| "learning_rate": 4.035010258940862e-05, | |
| "loss": 0.9852, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 2.403661256826398, | |
| "eval_loss": 1.0001976490020752, | |
| "eval_runtime": 17.2233, | |
| "eval_samples_per_second": 58.061, | |
| "eval_steps_per_second": 14.515, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 2.4075071148373204, | |
| "grad_norm": 0.7238942384719849, | |
| "learning_rate": 4.009038256759214e-05, | |
| "loss": 1.0062, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 2.4113529728482423, | |
| "grad_norm": 0.8912849426269531, | |
| "learning_rate": 3.983066254577565e-05, | |
| "loss": 0.9802, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 2.4151988308591648, | |
| "grad_norm": 1.1922829151153564, | |
| "learning_rate": 3.957094252395918e-05, | |
| "loss": 1.0317, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 2.4190446888700867, | |
| "grad_norm": 1.3773999214172363, | |
| "learning_rate": 3.931122250214269e-05, | |
| "loss": 0.9773, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 2.422890546881009, | |
| "grad_norm": 1.0747745037078857, | |
| "learning_rate": 3.905150248032621e-05, | |
| "loss": 1.0208, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.422890546881009, | |
| "eval_loss": 1.0009056329727173, | |
| "eval_runtime": 17.1069, | |
| "eval_samples_per_second": 58.456, | |
| "eval_steps_per_second": 14.614, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.4267364048919315, | |
| "grad_norm": 1.3409994840621948, | |
| "learning_rate": 3.879178245850973e-05, | |
| "loss": 0.9546, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 2.4305822629028535, | |
| "grad_norm": 1.482633352279663, | |
| "learning_rate": 3.853206243669324e-05, | |
| "loss": 1.0155, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 2.434428120913776, | |
| "grad_norm": 1.0458152294158936, | |
| "learning_rate": 3.827234241487677e-05, | |
| "loss": 0.989, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 2.4382739789246983, | |
| "grad_norm": 0.9805555939674377, | |
| "learning_rate": 3.801262239306028e-05, | |
| "loss": 0.9125, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 2.4421198369356203, | |
| "grad_norm": 1.5680670738220215, | |
| "learning_rate": 3.77529023712438e-05, | |
| "loss": 0.9713, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 2.4421198369356203, | |
| "eval_loss": 0.9981379508972168, | |
| "eval_runtime": 17.3284, | |
| "eval_samples_per_second": 57.709, | |
| "eval_steps_per_second": 14.427, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 2.4459656949465427, | |
| "grad_norm": 1.0316798686981201, | |
| "learning_rate": 3.749318234942732e-05, | |
| "loss": 0.9689, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 2.4498115529574647, | |
| "grad_norm": 0.6630721092224121, | |
| "learning_rate": 3.7233462327610834e-05, | |
| "loss": 0.9772, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 2.453657410968387, | |
| "grad_norm": 1.1662702560424805, | |
| "learning_rate": 3.6973742305794354e-05, | |
| "loss": 1.0165, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 2.457503268979309, | |
| "grad_norm": 1.0451244115829468, | |
| "learning_rate": 3.671402228397787e-05, | |
| "loss": 1.0233, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 2.4613491269902315, | |
| "grad_norm": 0.9077771306037903, | |
| "learning_rate": 3.645430226216139e-05, | |
| "loss": 0.9797, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.4613491269902315, | |
| "eval_loss": 0.9961766004562378, | |
| "eval_runtime": 17.2799, | |
| "eval_samples_per_second": 57.871, | |
| "eval_steps_per_second": 14.468, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.465194985001154, | |
| "grad_norm": 0.999718964099884, | |
| "learning_rate": 3.619458224034491e-05, | |
| "loss": 0.9725, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 2.469040843012076, | |
| "grad_norm": 2.740297794342041, | |
| "learning_rate": 3.5934862218528425e-05, | |
| "loss": 0.9838, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 2.4728867010229982, | |
| "grad_norm": 1.199425458908081, | |
| "learning_rate": 3.5675142196711945e-05, | |
| "loss": 0.9807, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 2.4767325590339206, | |
| "grad_norm": 0.7113758325576782, | |
| "learning_rate": 3.5415422174895465e-05, | |
| "loss": 1.0046, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 2.4805784170448426, | |
| "grad_norm": 0.9929390549659729, | |
| "learning_rate": 3.5155702153078984e-05, | |
| "loss": 0.9692, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 2.4805784170448426, | |
| "eval_loss": 1.002519965171814, | |
| "eval_runtime": 17.3042, | |
| "eval_samples_per_second": 57.789, | |
| "eval_steps_per_second": 14.447, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 2.484424275055765, | |
| "grad_norm": 0.8170703649520874, | |
| "learning_rate": 3.48959821312625e-05, | |
| "loss": 0.982, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 2.488270133066687, | |
| "grad_norm": 0.8909692168235779, | |
| "learning_rate": 3.463626210944602e-05, | |
| "loss": 0.9953, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 2.4921159910776094, | |
| "grad_norm": 1.806539535522461, | |
| "learning_rate": 3.4376542087629536e-05, | |
| "loss": 1.0069, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 2.495961849088532, | |
| "grad_norm": 1.4509518146514893, | |
| "learning_rate": 3.4116822065813056e-05, | |
| "loss": 0.9642, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 2.4998077070994538, | |
| "grad_norm": 1.0312175750732422, | |
| "learning_rate": 3.3857102043996575e-05, | |
| "loss": 0.9921, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.4998077070994538, | |
| "eval_loss": 0.9993879199028015, | |
| "eval_runtime": 17.3389, | |
| "eval_samples_per_second": 57.674, | |
| "eval_steps_per_second": 14.418, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.503653565110376, | |
| "grad_norm": 0.867513120174408, | |
| "learning_rate": 3.359738202218009e-05, | |
| "loss": 0.994, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 2.507499423121298, | |
| "grad_norm": 1.7425885200500488, | |
| "learning_rate": 3.3337662000363615e-05, | |
| "loss": 1.0068, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 2.5113452811322206, | |
| "grad_norm": 0.9053608775138855, | |
| "learning_rate": 3.307794197854713e-05, | |
| "loss": 0.9704, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 2.515191139143143, | |
| "grad_norm": 1.0533051490783691, | |
| "learning_rate": 3.281822195673064e-05, | |
| "loss": 0.9506, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 2.519036997154065, | |
| "grad_norm": 1.2495230436325073, | |
| "learning_rate": 3.255850193491417e-05, | |
| "loss": 0.9936, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 2.519036997154065, | |
| "eval_loss": 0.9990929961204529, | |
| "eval_runtime": 17.239, | |
| "eval_samples_per_second": 58.008, | |
| "eval_steps_per_second": 14.502, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 2.5228828551649873, | |
| "grad_norm": 0.8339760303497314, | |
| "learning_rate": 3.229878191309768e-05, | |
| "loss": 1.0008, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 2.5267287131759097, | |
| "grad_norm": 1.2839399576187134, | |
| "learning_rate": 3.20390618912812e-05, | |
| "loss": 1.0109, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 2.5305745711868317, | |
| "grad_norm": 1.2146100997924805, | |
| "learning_rate": 3.177934186946472e-05, | |
| "loss": 0.9834, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 2.534420429197754, | |
| "grad_norm": 0.7952923774719238, | |
| "learning_rate": 3.151962184764823e-05, | |
| "loss": 1.0013, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 2.5382662872086765, | |
| "grad_norm": 1.67001211643219, | |
| "learning_rate": 3.125990182583176e-05, | |
| "loss": 0.9797, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 2.5382662872086765, | |
| "eval_loss": 1.001037359237671, | |
| "eval_runtime": 17.347, | |
| "eval_samples_per_second": 57.647, | |
| "eval_steps_per_second": 14.412, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 2.5421121452195985, | |
| "grad_norm": 0.7536977529525757, | |
| "learning_rate": 3.100018180401527e-05, | |
| "loss": 0.9818, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 2.5459580032305205, | |
| "grad_norm": 1.3777302503585815, | |
| "learning_rate": 3.074046178219879e-05, | |
| "loss": 0.953, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 2.549803861241443, | |
| "grad_norm": 0.9286239743232727, | |
| "learning_rate": 3.048074176038231e-05, | |
| "loss": 1.028, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 2.5536497192523653, | |
| "grad_norm": 0.6741893291473389, | |
| "learning_rate": 3.022102173856583e-05, | |
| "loss": 0.9693, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 2.5574955772632872, | |
| "grad_norm": 1.3020586967468262, | |
| "learning_rate": 2.9961301716749346e-05, | |
| "loss": 0.9673, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 2.5574955772632872, | |
| "eval_loss": 1.0041394233703613, | |
| "eval_runtime": 17.32, | |
| "eval_samples_per_second": 57.737, | |
| "eval_steps_per_second": 14.434, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 2.5613414352742097, | |
| "grad_norm": 1.4816234111785889, | |
| "learning_rate": 2.9701581694932862e-05, | |
| "loss": 0.9964, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 2.565187293285132, | |
| "grad_norm": 0.8813285827636719, | |
| "learning_rate": 2.9441861673116382e-05, | |
| "loss": 0.966, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 2.569033151296054, | |
| "grad_norm": 1.0042293071746826, | |
| "learning_rate": 2.9182141651299898e-05, | |
| "loss": 0.9254, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 2.5728790093069764, | |
| "grad_norm": 1.5775707960128784, | |
| "learning_rate": 2.892242162948342e-05, | |
| "loss": 1.0215, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 2.576724867317899, | |
| "grad_norm": 1.0266311168670654, | |
| "learning_rate": 2.8662701607666937e-05, | |
| "loss": 0.9323, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 2.576724867317899, | |
| "eval_loss": 1.0002070665359497, | |
| "eval_runtime": 17.2317, | |
| "eval_samples_per_second": 58.033, | |
| "eval_steps_per_second": 14.508, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 2.580570725328821, | |
| "grad_norm": 0.5430648922920227, | |
| "learning_rate": 2.8402981585850453e-05, | |
| "loss": 0.9658, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 2.5844165833397432, | |
| "grad_norm": 0.6678454279899597, | |
| "learning_rate": 2.8143261564033973e-05, | |
| "loss": 1.0292, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 2.588262441350665, | |
| "grad_norm": 0.7208724021911621, | |
| "learning_rate": 2.788354154221749e-05, | |
| "loss": 0.9505, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 2.5921082993615876, | |
| "grad_norm": 1.2248526811599731, | |
| "learning_rate": 2.7623821520401012e-05, | |
| "loss": 0.9728, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 2.5959541573725096, | |
| "grad_norm": 1.0026588439941406, | |
| "learning_rate": 2.7364101498584525e-05, | |
| "loss": 0.9783, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 2.5959541573725096, | |
| "eval_loss": 0.9881900548934937, | |
| "eval_runtime": 17.2742, | |
| "eval_samples_per_second": 57.89, | |
| "eval_steps_per_second": 14.472, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 2.599800015383432, | |
| "grad_norm": 0.9579987525939941, | |
| "learning_rate": 2.710438147676804e-05, | |
| "loss": 0.9882, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 2.6036458733943544, | |
| "grad_norm": 1.0152076482772827, | |
| "learning_rate": 2.6844661454951564e-05, | |
| "loss": 1.041, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 2.6074917314052763, | |
| "grad_norm": 1.1370351314544678, | |
| "learning_rate": 2.658494143313508e-05, | |
| "loss": 0.9518, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 2.6113375894161988, | |
| "grad_norm": 0.9851937890052795, | |
| "learning_rate": 2.63252214113186e-05, | |
| "loss": 0.9125, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 2.615183447427121, | |
| "grad_norm": 0.8480270504951477, | |
| "learning_rate": 2.6065501389502116e-05, | |
| "loss": 0.9736, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.615183447427121, | |
| "eval_loss": 0.987713634967804, | |
| "eval_runtime": 17.2436, | |
| "eval_samples_per_second": 57.993, | |
| "eval_steps_per_second": 14.498, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.619029305438043, | |
| "grad_norm": 0.5307362079620361, | |
| "learning_rate": 2.580578136768564e-05, | |
| "loss": 1.0031, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 2.6228751634489655, | |
| "grad_norm": 1.1112557649612427, | |
| "learning_rate": 2.5546061345869156e-05, | |
| "loss": 0.9928, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 2.626721021459888, | |
| "grad_norm": 0.646759569644928, | |
| "learning_rate": 2.5286341324052672e-05, | |
| "loss": 1.0173, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 2.63056687947081, | |
| "grad_norm": 1.0114878416061401, | |
| "learning_rate": 2.502662130223619e-05, | |
| "loss": 0.9765, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 2.6344127374817323, | |
| "grad_norm": 0.8782021403312683, | |
| "learning_rate": 2.4766901280419708e-05, | |
| "loss": 0.9926, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 2.6344127374817323, | |
| "eval_loss": 0.9932020306587219, | |
| "eval_runtime": 17.1921, | |
| "eval_samples_per_second": 58.166, | |
| "eval_steps_per_second": 14.542, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 2.6382585954926543, | |
| "grad_norm": 1.0792268514633179, | |
| "learning_rate": 2.4507181258603227e-05, | |
| "loss": 1.0541, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 2.6421044535035767, | |
| "grad_norm": 0.9647793173789978, | |
| "learning_rate": 2.4247461236786744e-05, | |
| "loss": 0.9779, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 2.6459503115144987, | |
| "grad_norm": 1.0052498579025269, | |
| "learning_rate": 2.3987741214970263e-05, | |
| "loss": 0.9798, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 2.649796169525421, | |
| "grad_norm": 0.8122023344039917, | |
| "learning_rate": 2.3728021193153783e-05, | |
| "loss": 1.0205, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 2.6536420275363435, | |
| "grad_norm": 1.452087163925171, | |
| "learning_rate": 2.3468301171337302e-05, | |
| "loss": 0.9745, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.6536420275363435, | |
| "eval_loss": 0.9959968328475952, | |
| "eval_runtime": 17.2025, | |
| "eval_samples_per_second": 58.131, | |
| "eval_steps_per_second": 14.533, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.6574878855472654, | |
| "grad_norm": 1.3329054117202759, | |
| "learning_rate": 2.3208581149520815e-05, | |
| "loss": 1.0035, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 2.661333743558188, | |
| "grad_norm": 0.8142715096473694, | |
| "learning_rate": 2.2948861127704335e-05, | |
| "loss": 0.9783, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 2.6651796015691103, | |
| "grad_norm": 0.5424798130989075, | |
| "learning_rate": 2.2689141105887855e-05, | |
| "loss": 0.9754, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 2.6690254595800322, | |
| "grad_norm": 0.8890462517738342, | |
| "learning_rate": 2.2429421084071374e-05, | |
| "loss": 0.9707, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 2.6728713175909546, | |
| "grad_norm": 1.0329838991165161, | |
| "learning_rate": 2.216970106225489e-05, | |
| "loss": 0.9845, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 2.6728713175909546, | |
| "eval_loss": 0.9899721741676331, | |
| "eval_runtime": 17.3008, | |
| "eval_samples_per_second": 57.801, | |
| "eval_steps_per_second": 14.45, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 2.676717175601877, | |
| "grad_norm": 1.0697598457336426, | |
| "learning_rate": 2.190998104043841e-05, | |
| "loss": 1.0167, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 2.680563033612799, | |
| "grad_norm": 0.8134399056434631, | |
| "learning_rate": 2.1650261018621926e-05, | |
| "loss": 0.9001, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 2.684408891623721, | |
| "grad_norm": 1.0405962467193604, | |
| "learning_rate": 2.1390540996805446e-05, | |
| "loss": 1.0184, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 2.6882547496346434, | |
| "grad_norm": 1.3229318857192993, | |
| "learning_rate": 2.1130820974988962e-05, | |
| "loss": 0.9556, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 2.692100607645566, | |
| "grad_norm": 1.2907413244247437, | |
| "learning_rate": 2.087110095317248e-05, | |
| "loss": 0.9355, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.692100607645566, | |
| "eval_loss": 0.9918172359466553, | |
| "eval_runtime": 17.2562, | |
| "eval_samples_per_second": 57.95, | |
| "eval_steps_per_second": 14.488, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.6959464656564878, | |
| "grad_norm": 1.3044216632843018, | |
| "learning_rate": 2.0611380931356e-05, | |
| "loss": 0.9787, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 2.69979232366741, | |
| "grad_norm": 1.4273097515106201, | |
| "learning_rate": 2.0351660909539518e-05, | |
| "loss": 0.9775, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 2.7036381816783326, | |
| "grad_norm": 1.6097638607025146, | |
| "learning_rate": 2.0091940887723034e-05, | |
| "loss": 0.9926, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 2.7074840396892546, | |
| "grad_norm": 1.4893895387649536, | |
| "learning_rate": 1.9832220865906553e-05, | |
| "loss": 1.0435, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 2.711329897700177, | |
| "grad_norm": 1.1620233058929443, | |
| "learning_rate": 1.9572500844090073e-05, | |
| "loss": 0.9525, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 2.711329897700177, | |
| "eval_loss": 0.9926208257675171, | |
| "eval_runtime": 17.3977, | |
| "eval_samples_per_second": 57.479, | |
| "eval_steps_per_second": 14.37, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 2.7151757557110994, | |
| "grad_norm": 1.2087517976760864, | |
| "learning_rate": 1.931278082227359e-05, | |
| "loss": 1.0005, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 2.7190216137220213, | |
| "grad_norm": 0.8885460495948792, | |
| "learning_rate": 1.9058255200893437e-05, | |
| "loss": 0.9911, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 2.7228674717329437, | |
| "grad_norm": 0.9543077945709229, | |
| "learning_rate": 1.8798535179076957e-05, | |
| "loss": 1.0437, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 2.7267133297438657, | |
| "grad_norm": 1.2362306118011475, | |
| "learning_rate": 1.8538815157260476e-05, | |
| "loss": 0.9766, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 2.730559187754788, | |
| "grad_norm": 1.1946227550506592, | |
| "learning_rate": 1.8279095135443993e-05, | |
| "loss": 1.0333, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.730559187754788, | |
| "eval_loss": 0.9953948855400085, | |
| "eval_runtime": 17.1888, | |
| "eval_samples_per_second": 58.178, | |
| "eval_steps_per_second": 14.544, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.73440504576571, | |
| "grad_norm": 0.66939777135849, | |
| "learning_rate": 1.801937511362751e-05, | |
| "loss": 0.8892, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 2.7382509037766325, | |
| "grad_norm": 1.0852998495101929, | |
| "learning_rate": 1.775965509181103e-05, | |
| "loss": 0.9502, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 2.742096761787555, | |
| "grad_norm": 0.7603423595428467, | |
| "learning_rate": 1.7499935069994548e-05, | |
| "loss": 1.0034, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 2.745942619798477, | |
| "grad_norm": 0.4625702202320099, | |
| "learning_rate": 1.7240215048178064e-05, | |
| "loss": 1.0123, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 2.7497884778093993, | |
| "grad_norm": 1.4387953281402588, | |
| "learning_rate": 1.6980495026361584e-05, | |
| "loss": 0.9704, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 2.7497884778093993, | |
| "eval_loss": 0.9952225685119629, | |
| "eval_runtime": 17.4458, | |
| "eval_samples_per_second": 57.32, | |
| "eval_steps_per_second": 14.33, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 2.7536343358203217, | |
| "grad_norm": 0.6899126172065735, | |
| "learning_rate": 1.67207750045451e-05, | |
| "loss": 0.9627, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 2.7574801938312437, | |
| "grad_norm": 1.0329424142837524, | |
| "learning_rate": 1.646105498272862e-05, | |
| "loss": 0.9207, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 2.761326051842166, | |
| "grad_norm": 1.1055504083633423, | |
| "learning_rate": 1.6201334960912136e-05, | |
| "loss": 0.9834, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 2.7651719098530885, | |
| "grad_norm": 0.7458188533782959, | |
| "learning_rate": 1.5941614939095655e-05, | |
| "loss": 1.003, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 2.7690177678640104, | |
| "grad_norm": 1.112021803855896, | |
| "learning_rate": 1.5681894917279175e-05, | |
| "loss": 1.0001, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.7690177678640104, | |
| "eval_loss": 0.9910063147544861, | |
| "eval_runtime": 17.2718, | |
| "eval_samples_per_second": 57.898, | |
| "eval_steps_per_second": 14.474, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.772863625874933, | |
| "grad_norm": 0.5958703756332397, | |
| "learning_rate": 1.5422174895462695e-05, | |
| "loss": 0.9503, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 2.776709483885855, | |
| "grad_norm": 1.1087392568588257, | |
| "learning_rate": 1.516245487364621e-05, | |
| "loss": 0.9725, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 2.780555341896777, | |
| "grad_norm": 1.2012007236480713, | |
| "learning_rate": 1.4902734851829727e-05, | |
| "loss": 1.0098, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 2.784401199907699, | |
| "grad_norm": 0.9333285093307495, | |
| "learning_rate": 1.4643014830013247e-05, | |
| "loss": 0.9764, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 2.7882470579186216, | |
| "grad_norm": 0.9407594203948975, | |
| "learning_rate": 1.4383294808196765e-05, | |
| "loss": 0.9404, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 2.7882470579186216, | |
| "eval_loss": 0.9894633889198303, | |
| "eval_runtime": 17.3239, | |
| "eval_samples_per_second": 57.724, | |
| "eval_steps_per_second": 14.431, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 2.792092915929544, | |
| "grad_norm": 0.9794307947158813, | |
| "learning_rate": 1.4123574786380284e-05, | |
| "loss": 0.9445, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 2.795938773940466, | |
| "grad_norm": 1.1447358131408691, | |
| "learning_rate": 1.3863854764563799e-05, | |
| "loss": 1.0056, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 2.7997846319513884, | |
| "grad_norm": 0.8741857409477234, | |
| "learning_rate": 1.3604134742747318e-05, | |
| "loss": 0.9511, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 2.803630489962311, | |
| "grad_norm": 1.0769715309143066, | |
| "learning_rate": 1.3344414720930836e-05, | |
| "loss": 0.9789, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 2.8074763479732328, | |
| "grad_norm": 1.1045129299163818, | |
| "learning_rate": 1.3084694699114356e-05, | |
| "loss": 0.9958, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.8074763479732328, | |
| "eval_loss": 0.9914500117301941, | |
| "eval_runtime": 17.3793, | |
| "eval_samples_per_second": 57.54, | |
| "eval_steps_per_second": 14.385, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.811322205984155, | |
| "grad_norm": 0.9856983423233032, | |
| "learning_rate": 1.2824974677297874e-05, | |
| "loss": 0.9923, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 2.8151680639950776, | |
| "grad_norm": 1.2112038135528564, | |
| "learning_rate": 1.2565254655481392e-05, | |
| "loss": 0.9388, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 2.8190139220059995, | |
| "grad_norm": 1.1824342012405396, | |
| "learning_rate": 1.230553463366491e-05, | |
| "loss": 0.984, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 2.8228597800169215, | |
| "grad_norm": 1.3278725147247314, | |
| "learning_rate": 1.204581461184843e-05, | |
| "loss": 0.9835, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 2.826705638027844, | |
| "grad_norm": 1.4297553300857544, | |
| "learning_rate": 1.1786094590031946e-05, | |
| "loss": 0.9999, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 2.826705638027844, | |
| "eval_loss": 0.9847651720046997, | |
| "eval_runtime": 17.5097, | |
| "eval_samples_per_second": 57.111, | |
| "eval_steps_per_second": 14.278, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 2.8305514960387663, | |
| "grad_norm": 1.1944117546081543, | |
| "learning_rate": 1.1526374568215465e-05, | |
| "loss": 0.9758, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 2.8343973540496883, | |
| "grad_norm": 1.1629287004470825, | |
| "learning_rate": 1.1266654546398983e-05, | |
| "loss": 0.988, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 2.8382432120606107, | |
| "grad_norm": 1.598382592201233, | |
| "learning_rate": 1.1006934524582501e-05, | |
| "loss": 0.9679, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 2.842089070071533, | |
| "grad_norm": 0.9534172415733337, | |
| "learning_rate": 1.0747214502766019e-05, | |
| "loss": 0.9921, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 2.845934928082455, | |
| "grad_norm": 1.5716655254364014, | |
| "learning_rate": 1.0487494480949537e-05, | |
| "loss": 0.9898, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.845934928082455, | |
| "eval_loss": 0.9878412485122681, | |
| "eval_runtime": 17.2808, | |
| "eval_samples_per_second": 57.868, | |
| "eval_steps_per_second": 14.467, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.8497807860933775, | |
| "grad_norm": 1.3013421297073364, | |
| "learning_rate": 1.0227774459133055e-05, | |
| "loss": 0.9717, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 2.8536266441043, | |
| "grad_norm": 1.263071060180664, | |
| "learning_rate": 9.968054437316573e-06, | |
| "loss": 1.0259, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 2.857472502115222, | |
| "grad_norm": 1.1513851881027222, | |
| "learning_rate": 9.70833441550009e-06, | |
| "loss": 1.0015, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 2.8613183601261443, | |
| "grad_norm": 0.7431422472000122, | |
| "learning_rate": 9.448614393683609e-06, | |
| "loss": 0.999, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 2.8651642181370662, | |
| "grad_norm": 0.6744217872619629, | |
| "learning_rate": 9.188894371867128e-06, | |
| "loss": 0.9285, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 2.8651642181370662, | |
| "eval_loss": 0.994976282119751, | |
| "eval_runtime": 17.3294, | |
| "eval_samples_per_second": 57.705, | |
| "eval_steps_per_second": 14.426, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 2.8690100761479886, | |
| "grad_norm": 1.2962367534637451, | |
| "learning_rate": 8.929174350050646e-06, | |
| "loss": 0.9433, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 2.8728559341589106, | |
| "grad_norm": 0.9955423474311829, | |
| "learning_rate": 8.669454328234164e-06, | |
| "loss": 1.0447, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 2.876701792169833, | |
| "grad_norm": 0.5840064287185669, | |
| "learning_rate": 8.409734306417682e-06, | |
| "loss": 0.9504, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 2.8805476501807554, | |
| "grad_norm": 1.0777620077133179, | |
| "learning_rate": 8.1500142846012e-06, | |
| "loss": 0.9635, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 2.8843935081916774, | |
| "grad_norm": 0.9312844276428223, | |
| "learning_rate": 7.890294262784718e-06, | |
| "loss": 1.005, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.8843935081916774, | |
| "eval_loss": 0.9864250421524048, | |
| "eval_runtime": 17.2238, | |
| "eval_samples_per_second": 58.059, | |
| "eval_steps_per_second": 14.515, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.8882393662026, | |
| "grad_norm": 0.6800574660301208, | |
| "learning_rate": 7.630574240968237e-06, | |
| "loss": 0.983, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 2.892085224213522, | |
| "grad_norm": 0.9417561292648315, | |
| "learning_rate": 7.3708542191517545e-06, | |
| "loss": 0.973, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 2.895931082224444, | |
| "grad_norm": 0.6454310417175293, | |
| "learning_rate": 7.111134197335273e-06, | |
| "loss": 1.0493, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 2.8997769402353666, | |
| "grad_norm": 1.0287562608718872, | |
| "learning_rate": 6.851414175518792e-06, | |
| "loss": 0.9821, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 2.903622798246289, | |
| "grad_norm": 1.6874816417694092, | |
| "learning_rate": 6.591694153702309e-06, | |
| "loss": 0.9761, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 2.903622798246289, | |
| "eval_loss": 0.9886119961738586, | |
| "eval_runtime": 17.2542, | |
| "eval_samples_per_second": 57.957, | |
| "eval_steps_per_second": 14.489, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 2.907468656257211, | |
| "grad_norm": 0.7677489519119263, | |
| "learning_rate": 6.331974131885828e-06, | |
| "loss": 1.0421, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 2.9113145142681334, | |
| "grad_norm": 0.9784395098686218, | |
| "learning_rate": 6.072254110069346e-06, | |
| "loss": 0.9288, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 2.9151603722790553, | |
| "grad_norm": 0.6236763000488281, | |
| "learning_rate": 5.812534088252864e-06, | |
| "loss": 0.9777, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 2.9190062302899777, | |
| "grad_norm": 1.0675079822540283, | |
| "learning_rate": 5.552814066436382e-06, | |
| "loss": 0.9659, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 2.9228520883008997, | |
| "grad_norm": 0.9310262203216553, | |
| "learning_rate": 5.2930940446198996e-06, | |
| "loss": 0.9334, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.9228520883008997, | |
| "eval_loss": 0.98431396484375, | |
| "eval_runtime": 17.3967, | |
| "eval_samples_per_second": 57.482, | |
| "eval_steps_per_second": 14.371, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.926697946311822, | |
| "grad_norm": 0.9615415930747986, | |
| "learning_rate": 5.038568423239748e-06, | |
| "loss": 0.9715, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 2.9305438043227445, | |
| "grad_norm": 0.5959100127220154, | |
| "learning_rate": 4.778848401423266e-06, | |
| "loss": 0.936, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 2.9343896623336665, | |
| "grad_norm": 1.0181951522827148, | |
| "learning_rate": 4.519128379606784e-06, | |
| "loss": 0.969, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 2.938235520344589, | |
| "grad_norm": 0.8716458082199097, | |
| "learning_rate": 4.259408357790302e-06, | |
| "loss": 1.002, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 2.9420813783555113, | |
| "grad_norm": 1.6100435256958008, | |
| "learning_rate": 3.99968833597382e-06, | |
| "loss": 0.9425, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 2.9420813783555113, | |
| "eval_loss": 0.9931854009628296, | |
| "eval_runtime": 17.3406, | |
| "eval_samples_per_second": 57.668, | |
| "eval_steps_per_second": 14.417, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 2.9459272363664333, | |
| "grad_norm": 1.1552485227584839, | |
| "learning_rate": 3.7399683141573383e-06, | |
| "loss": 1.0424, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 2.9497730943773557, | |
| "grad_norm": 1.441148042678833, | |
| "learning_rate": 3.480248292340857e-06, | |
| "loss": 1.0688, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 2.953618952388278, | |
| "grad_norm": 0.8127447366714478, | |
| "learning_rate": 3.220528270524375e-06, | |
| "loss": 0.9497, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 2.9574648103992, | |
| "grad_norm": 0.9566346406936646, | |
| "learning_rate": 2.9608082487078933e-06, | |
| "loss": 0.9628, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 2.961310668410122, | |
| "grad_norm": 0.7955614924430847, | |
| "learning_rate": 2.7010882268914113e-06, | |
| "loss": 1.016, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 2.961310668410122, | |
| "eval_loss": 0.9919273257255554, | |
| "eval_runtime": 17.3829, | |
| "eval_samples_per_second": 57.528, | |
| "eval_steps_per_second": 14.382, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 2.9651565264210444, | |
| "grad_norm": 0.8041125535964966, | |
| "learning_rate": 2.4413682050749296e-06, | |
| "loss": 0.9631, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 2.969002384431967, | |
| "grad_norm": 1.0755919218063354, | |
| "learning_rate": 2.1816481832584475e-06, | |
| "loss": 0.9689, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 2.972848242442889, | |
| "grad_norm": 0.8630362153053284, | |
| "learning_rate": 1.9219281614419654e-06, | |
| "loss": 0.9822, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 2.976694100453811, | |
| "grad_norm": 0.6994553804397583, | |
| "learning_rate": 1.662208139625484e-06, | |
| "loss": 0.9706, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 2.9805399584647336, | |
| "grad_norm": 1.1746189594268799, | |
| "learning_rate": 1.4024881178090021e-06, | |
| "loss": 0.9695, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 2.9805399584647336, | |
| "eval_loss": 0.9868382215499878, | |
| "eval_runtime": 17.3783, | |
| "eval_samples_per_second": 57.543, | |
| "eval_steps_per_second": 14.386, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 2.9843858164756556, | |
| "grad_norm": 1.0636792182922363, | |
| "learning_rate": 1.14276809599252e-06, | |
| "loss": 1.0233, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 2.988231674486578, | |
| "grad_norm": 0.8173992037773132, | |
| "learning_rate": 8.830480741760383e-07, | |
| "loss": 0.963, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 2.9920775324975004, | |
| "grad_norm": 1.591539978981018, | |
| "learning_rate": 6.233280523595564e-07, | |
| "loss": 0.949, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 2.9959233905084224, | |
| "grad_norm": 0.6467046737670898, | |
| "learning_rate": 3.636080305430746e-07, | |
| "loss": 0.9649, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 2.999769248519345, | |
| "grad_norm": 0.9910799860954285, | |
| "learning_rate": 1.0388800872659275e-07, | |
| "loss": 1.0502, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 2.999769248519345, | |
| "eval_loss": 0.9894677400588989, | |
| "eval_runtime": 17.4751, | |
| "eval_samples_per_second": 57.224, | |
| "eval_steps_per_second": 14.306, | |
| "step": 39000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 39003, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
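
The object above follows the `trainer_state.json` layout that the Hugging Face `Trainer` writes next to each saved checkpoint: training entries in `log_history` carry `loss`, while evaluation entries carry `eval_loss` and throughput fields. A minimal sketch of how the log can be consumed, assuming the file sits at `trainer_state.json` in the working directory and that `matplotlib` is installed (both are assumptions for illustration, not part of the state file itself):

import json

import matplotlib.pyplot as plt

# Assumed path; adjust to the checkpoint directory this state file lives in.
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]
# Training steps log "loss"; evaluation steps log "eval_loss" instead.
train = [(e["step"], e["loss"]) for e in history if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in history if "eval_loss" in e]

plt.plot([s for s, _ in train], [v for _, v in train], label="train loss")
plt.plot([s for s, _ in evals], [v for _, v in evals], label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curves.png")  # hypothetical output name

Splitting on the presence of `loss` versus `eval_loss` is the simplest way to separate the two entry types, since every record shares the `step` and `epoch` keys.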