diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15154 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 16.0, + "eval_steps": 100, + "global_step": 108000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007407407407407408, + "grad_norm": 22.549795150756836, + "learning_rate": 1.9600000000000003e-06, + "loss": 10.1124, + "step": 50 + }, + { + "epoch": 0.014814814814814815, + "grad_norm": 22.49587631225586, + "learning_rate": 3.96e-06, + "loss": 7.7186, + "step": 100 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 8.133307456970215, + "learning_rate": 5.9600000000000005e-06, + "loss": 4.0014, + "step": 150 + }, + { + "epoch": 0.02962962962962963, + "grad_norm": 5.439042091369629, + "learning_rate": 7.960000000000002e-06, + "loss": 2.5781, + "step": 200 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 6.463360786437988, + "learning_rate": 9.960000000000001e-06, + "loss": 2.1167, + "step": 250 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 5.057401180267334, + "learning_rate": 1.196e-05, + "loss": 2.0461, + "step": 300 + }, + { + "epoch": 0.05185185185185185, + "grad_norm": 5.143754482269287, + "learning_rate": 1.396e-05, + "loss": 2.0597, + "step": 350 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 5.2624993324279785, + "learning_rate": 1.5960000000000003e-05, + "loss": 1.9737, + "step": 400 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 5.019354820251465, + "learning_rate": 1.796e-05, + "loss": 1.9023, + "step": 450 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 4.731785297393799, + "learning_rate": 1.9960000000000002e-05, + "loss": 1.9953, + "step": 500 + }, + { + "epoch": 0.08148148148148149, + "grad_norm": 4.751569747924805, + "learning_rate": 1.9996966947554476e-05, + "loss": 1.9253, + "step": 550 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 3.835200071334839, + "learning_rate": 1.9987620859825225e-05, + "loss": 1.8835, + "step": 600 + }, + { + "epoch": 0.0962962962962963, + "grad_norm": 4.402861595153809, + "learning_rate": 1.997196637669223e-05, + "loss": 1.8831, + "step": 650 + }, + { + "epoch": 0.1037037037037037, + "grad_norm": 3.8608696460723877, + "learning_rate": 1.9950013385862575e-05, + "loss": 1.92, + "step": 700 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 3.7692012786865234, + "learning_rate": 1.9921775753315793e-05, + "loss": 1.8961, + "step": 750 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 3.9636597633361816, + "learning_rate": 1.9887271314545823e-05, + "loss": 1.9051, + "step": 800 + }, + { + "epoch": 0.1259259259259259, + "grad_norm": 4.725071430206299, + "learning_rate": 1.984652186329575e-05, + "loss": 1.9338, + "step": 850 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 4.309209823608398, + "learning_rate": 1.9799553137792373e-05, + "loss": 1.9735, + "step": 900 + }, + { + "epoch": 0.14074074074074075, + "grad_norm": 4.000847816467285, + "learning_rate": 1.9746394804489425e-05, + "loss": 1.8845, + "step": 950 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 4.6890058517456055, + "learning_rate": 1.9687080439329585e-05, + "loss": 1.9339, + "step": 1000 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 4.7135701179504395, + "learning_rate": 1.9621647506537192e-05, + "loss": 1.8919, + "step": 1050 + }, + { + "epoch": 0.16296296296296298, + "grad_norm": 4.486835956573486, + "learning_rate": 1.955013733495505e-05, + "loss": 1.7963, + "step": 1100 + }, + { + "epoch": 0.17037037037037037, + "grad_norm": 4.228176593780518, + "learning_rate": 1.947259509194024e-05, + "loss": 1.8209, + "step": 1150 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 4.834776878356934, + "learning_rate": 1.9389069754835436e-05, + "loss": 1.8378, + "step": 1200 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 5.123416423797607, + "learning_rate": 1.9299614080033794e-05, + "loss": 1.96, + "step": 1250 + }, + { + "epoch": 0.1925925925925926, + "grad_norm": 4.351609230041504, + "learning_rate": 1.9204284569656848e-05, + "loss": 1.8308, + "step": 1300 + }, + { + "epoch": 0.2, + "grad_norm": 3.7220265865325928, + "learning_rate": 1.91031414358666e-05, + "loss": 1.8498, + "step": 1350 + }, + { + "epoch": 0.2074074074074074, + "grad_norm": 3.787611484527588, + "learning_rate": 1.8996248562834184e-05, + "loss": 1.7831, + "step": 1400 + }, + { + "epoch": 0.21481481481481482, + "grad_norm": 3.5662522315979004, + "learning_rate": 1.8883673466389286e-05, + "loss": 1.7446, + "step": 1450 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 3.7757928371429443, + "learning_rate": 1.876548725137569e-05, + "loss": 1.8315, + "step": 1500 + }, + { + "epoch": 0.22962962962962963, + "grad_norm": 4.004444122314453, + "learning_rate": 1.8641764566739933e-05, + "loss": 1.8419, + "step": 1550 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 3.936206817626953, + "learning_rate": 1.8512583558381422e-05, + "loss": 1.8409, + "step": 1600 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 3.8752267360687256, + "learning_rate": 1.8378025819793832e-05, + "loss": 1.8383, + "step": 1650 + }, + { + "epoch": 0.2518518518518518, + "grad_norm": 3.54079008102417, + "learning_rate": 1.823817634052888e-05, + "loss": 1.8094, + "step": 1700 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 4.669138431549072, + "learning_rate": 1.8093123452515122e-05, + "loss": 1.8002, + "step": 1750 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 4.1702704429626465, + "learning_rate": 1.7942958774265603e-05, + "loss": 1.8473, + "step": 1800 + }, + { + "epoch": 0.2740740740740741, + "grad_norm": 3.8928747177124023, + "learning_rate": 1.778777715300964e-05, + "loss": 1.8624, + "step": 1850 + }, + { + "epoch": 0.2814814814814815, + "grad_norm": 3.1108946800231934, + "learning_rate": 1.76276766047853e-05, + "loss": 1.8579, + "step": 1900 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 4.28292989730835, + "learning_rate": 1.746275825253033e-05, + "loss": 1.8081, + "step": 1950 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 3.518152952194214, + "learning_rate": 1.729312626221078e-05, + "loss": 1.7689, + "step": 2000 + }, + { + "epoch": 0.3037037037037037, + "grad_norm": 3.84735369682312, + "learning_rate": 1.7118887777027525e-05, + "loss": 1.7576, + "step": 2050 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 4.495599269866943, + "learning_rate": 1.694015284974233e-05, + "loss": 1.753, + "step": 2100 + }, + { + "epoch": 0.31851851851851853, + "grad_norm": 4.794450759887695, + "learning_rate": 1.6757034373166164e-05, + "loss": 1.7982, + "step": 2150 + }, + { + "epoch": 0.32592592592592595, + "grad_norm": 3.8519253730773926, + "learning_rate": 1.6569648008853686e-05, + "loss": 1.831, + "step": 2200 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 4.328887462615967, + "learning_rate": 1.6378112114048925e-05, + "loss": 1.8093, + "step": 2250 + }, + { + "epoch": 0.34074074074074073, + "grad_norm": 4.722980976104736, + "learning_rate": 1.61825476669283e-05, + "loss": 1.7542, + "step": 2300 + }, + { + "epoch": 0.34814814814814815, + "grad_norm": 3.8855786323547363, + "learning_rate": 1.5983078190188224e-05, + "loss": 1.78, + "step": 2350 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 3.9583353996276855, + "learning_rate": 1.577982967302552e-05, + "loss": 1.811, + "step": 2400 + }, + { + "epoch": 0.362962962962963, + "grad_norm": 3.5661697387695312, + "learning_rate": 1.5572930491559928e-05, + "loss": 1.7345, + "step": 2450 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 3.3534741401672363, + "learning_rate": 1.536251132774902e-05, + "loss": 1.8426, + "step": 2500 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 4.127283096313477, + "learning_rate": 1.5148705086846656e-05, + "loss": 1.8462, + "step": 2550 + }, + { + "epoch": 0.3851851851851852, + "grad_norm": 3.9793572425842285, + "learning_rate": 1.4931646813457183e-05, + "loss": 1.824, + "step": 2600 + }, + { + "epoch": 0.3925925925925926, + "grad_norm": 3.6798741817474365, + "learning_rate": 1.4711473606238373e-05, + "loss": 1.8243, + "step": 2650 + }, + { + "epoch": 0.4, + "grad_norm": 4.0039520263671875, + "learning_rate": 1.4488324531306963e-05, + "loss": 1.7302, + "step": 2700 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 4.815029144287109, + "learning_rate": 1.4262340534401525e-05, + "loss": 1.7768, + "step": 2750 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 4.897470474243164, + "learning_rate": 1.4033664351858107e-05, + "loss": 1.8168, + "step": 2800 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 4.236404895782471, + "learning_rate": 1.3802440420454915e-05, + "loss": 1.7363, + "step": 2850 + }, + { + "epoch": 0.42962962962962964, + "grad_norm": 3.841942548751831, + "learning_rate": 1.3568814786182938e-05, + "loss": 1.7471, + "step": 2900 + }, + { + "epoch": 0.43703703703703706, + "grad_norm": 4.591772556304932, + "learning_rate": 1.3332935012000171e-05, + "loss": 1.8046, + "step": 2950 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 4.35361385345459, + "learning_rate": 1.3094950084627698e-05, + "loss": 1.8287, + "step": 3000 + }, + { + "epoch": 0.45185185185185184, + "grad_norm": 3.7746520042419434, + "learning_rate": 1.2855010320446471e-05, + "loss": 1.838, + "step": 3050 + }, + { + "epoch": 0.45925925925925926, + "grad_norm": 6.708522796630859, + "learning_rate": 1.261326727055427e-05, + "loss": 1.7481, + "step": 3100 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 4.1498847007751465, + "learning_rate": 1.2369873625042784e-05, + "loss": 1.7127, + "step": 3150 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 4.49635124206543, + "learning_rate": 1.2124983116555271e-05, + "loss": 1.7455, + "step": 3200 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 3.5679614543914795, + "learning_rate": 1.187875042318573e-05, + "loss": 1.7606, + "step": 3250 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 3.869044780731201, + "learning_rate": 1.1631331070780895e-05, + "loss": 1.7529, + "step": 3300 + }, + { + "epoch": 0.4962962962962963, + "grad_norm": 3.826565980911255, + "learning_rate": 1.138288133470678e-05, + "loss": 1.7686, + "step": 3350 + }, + { + "epoch": 0.5037037037037037, + "grad_norm": 3.7426226139068604, + "learning_rate": 1.1133558141141823e-05, + "loss": 1.7185, + "step": 3400 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 3.6974334716796875, + "learning_rate": 1.088351896795891e-05, + "loss": 1.7201, + "step": 3450 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 4.5774054527282715, + "learning_rate": 1.0632921745259022e-05, + "loss": 1.7731, + "step": 3500 + }, + { + "epoch": 0.5259259259259259, + "grad_norm": 4.392045974731445, + "learning_rate": 1.0381924755619161e-05, + "loss": 1.7597, + "step": 3550 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 4.4499993324279785, + "learning_rate": 1.0130686534117704e-05, + "loss": 1.8088, + "step": 3600 + }, + { + "epoch": 0.5407407407407407, + "grad_norm": 4.234902858734131, + "learning_rate": 9.879365768200245e-06, + "loss": 1.7659, + "step": 3650 + }, + { + "epoch": 0.5481481481481482, + "grad_norm": 3.981933116912842, + "learning_rate": 9.62812119744919e-06, + "loss": 1.7617, + "step": 3700 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 3.9301373958587646, + "learning_rate": 9.377111513320452e-06, + "loss": 1.7294, + "step": 3750 + }, + { + "epoch": 0.562962962962963, + "grad_norm": 4.41324520111084, + "learning_rate": 9.12649525891052e-06, + "loss": 1.7386, + "step": 3800 + }, + { + "epoch": 0.5703703703703704, + "grad_norm": 4.823089599609375, + "learning_rate": 8.876430728817238e-06, + "loss": 1.7384, + "step": 3850 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 4.53611421585083, + "learning_rate": 8.627075869157543e-06, + "loss": 1.7353, + "step": 3900 + }, + { + "epoch": 0.5851851851851851, + "grad_norm": 3.9253506660461426, + "learning_rate": 8.37858817780532e-06, + "loss": 1.7514, + "step": 3950 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 4.049194812774658, + "learning_rate": 8.131124604912365e-06, + "loss": 1.7986, + "step": 4000 + }, + { + "epoch": 0.6, + "grad_norm": 3.7836596965789795, + "learning_rate": 7.884841453775301e-06, + "loss": 1.7423, + "step": 4050 + }, + { + "epoch": 0.6074074074074074, + "grad_norm": 4.229864120483398, + "learning_rate": 7.63989428211107e-06, + "loss": 1.772, + "step": 4100 + }, + { + "epoch": 0.6148148148148148, + "grad_norm": 3.277118444442749, + "learning_rate": 7.3964378038033515e-06, + "loss": 1.7633, + "step": 4150 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 3.6668293476104736, + "learning_rate": 7.154625791181937e-06, + "loss": 1.7836, + "step": 4200 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 4.401381015777588, + "learning_rate": 6.914610977896858e-06, + "loss": 1.8098, + "step": 4250 + }, + { + "epoch": 0.6370370370370371, + "grad_norm": 5.0834808349609375, + "learning_rate": 6.676544962448514e-06, + "loss": 1.8092, + "step": 4300 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 4.588042736053467, + "learning_rate": 6.44057811243483e-06, + "loss": 1.7334, + "step": 4350 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 4.436654567718506, + "learning_rate": 6.20685946957585e-06, + "loss": 1.8513, + "step": 4400 + }, + { + "epoch": 0.6592592592592592, + "grad_norm": 3.1908841133117676, + "learning_rate": 5.97553665557578e-06, + "loss": 1.7538, + "step": 4450 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 3.9574151039123535, + "learning_rate": 5.746755778881979e-06, + "loss": 1.738, + "step": 4500 + }, + { + "epoch": 0.674074074074074, + "grad_norm": 3.3809914588928223, + "learning_rate": 5.520661342399726e-06, + "loss": 1.7949, + "step": 4550 + }, + { + "epoch": 0.6814814814814815, + "grad_norm": 5.055593967437744, + "learning_rate": 5.297396152221066e-06, + "loss": 1.7811, + "step": 4600 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 4.099941253662109, + "learning_rate": 5.0771012274254515e-06, + "loss": 1.7209, + "step": 4650 + }, + { + "epoch": 0.6962962962962963, + "grad_norm": 4.286564826965332, + "learning_rate": 4.85991571100906e-06, + "loss": 1.7683, + "step": 4700 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 4.559092998504639, + "learning_rate": 4.645976781999073e-06, + "loss": 1.8504, + "step": 4750 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 5.575582981109619, + "learning_rate": 4.4354195688085e-06, + "loss": 1.7454, + "step": 4800 + }, + { + "epoch": 0.7185185185185186, + "grad_norm": 4.1070780754089355, + "learning_rate": 4.228377063886143e-06, + "loss": 1.7724, + "step": 4850 + }, + { + "epoch": 0.725925925925926, + "grad_norm": 4.384026527404785, + "learning_rate": 4.0249800397157425e-06, + "loss": 1.7621, + "step": 4900 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 3.8214433193206787, + "learning_rate": 3.825356966217246e-06, + "loss": 1.6777, + "step": 4950 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 5.473362445831299, + "learning_rate": 3.6296339296024853e-06, + "loss": 1.7927, + "step": 5000 + }, + { + "epoch": 0.7481481481481481, + "grad_norm": 5.610530853271484, + "learning_rate": 3.437934552736388e-06, + "loss": 1.7948, + "step": 5050 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 4.789555072784424, + "learning_rate": 3.2503799170541573e-06, + "loss": 1.7594, + "step": 5100 + }, + { + "epoch": 0.762962962962963, + "grad_norm": 4.272578239440918, + "learning_rate": 3.067088486083628e-06, + "loss": 1.7519, + "step": 5150 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 4.48111629486084, + "learning_rate": 2.888176030621148e-06, + "loss": 1.7246, + "step": 5200 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 3.5455284118652344, + "learning_rate": 2.713755555608295e-06, + "loss": 1.7379, + "step": 5250 + }, + { + "epoch": 0.7851851851851852, + "grad_norm": 4.136303901672363, + "learning_rate": 2.5439372287555164e-06, + "loss": 1.8543, + "step": 5300 + }, + { + "epoch": 0.7925925925925926, + "grad_norm": 3.6334547996520996, + "learning_rate": 2.3788283109578282e-06, + "loss": 1.7957, + "step": 5350 + }, + { + "epoch": 0.8, + "grad_norm": 3.3337242603302, + "learning_rate": 2.2185330885465626e-06, + "loss": 1.7577, + "step": 5400 + }, + { + "epoch": 0.8074074074074075, + "grad_norm": 4.086159706115723, + "learning_rate": 2.0631528074198624e-06, + "loss": 1.7125, + "step": 5450 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 4.198705673217773, + "learning_rate": 1.912785609093619e-06, + "loss": 1.8493, + "step": 5500 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 3.8114004135131836, + "learning_rate": 1.7675264687131699e-06, + "loss": 1.736, + "step": 5550 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 4.404809474945068, + "learning_rate": 1.6274671350649818e-06, + "loss": 1.6118, + "step": 5600 + }, + { + "epoch": 0.837037037037037, + "grad_norm": 5.045413017272949, + "learning_rate": 1.4926960726261342e-06, + "loss": 1.8435, + "step": 5650 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 3.8105881214141846, + "learning_rate": 1.3632984056882615e-06, + "loss": 1.8167, + "step": 5700 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 3.879211902618408, + "learning_rate": 1.2393558645912395e-06, + "loss": 1.8421, + "step": 5750 + }, + { + "epoch": 0.8592592592592593, + "grad_norm": 4.706577301025391, + "learning_rate": 1.1209467341005297e-06, + "loss": 1.7424, + "step": 5800 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 4.003446578979492, + "learning_rate": 1.0081458039608638e-06, + "loss": 1.7016, + "step": 5850 + }, + { + "epoch": 0.8740740740740741, + "grad_norm": 4.579277515411377, + "learning_rate": 9.010243216574233e-07, + "loss": 1.8125, + "step": 5900 + }, + { + "epoch": 0.8814814814814815, + "grad_norm": 3.8693718910217285, + "learning_rate": 7.996499474144115e-07, + "loss": 1.7651, + "step": 5950 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 4.174098014831543, + "learning_rate": 7.040867114593952e-07, + "loss": 1.741, + "step": 6000 + }, + { + "epoch": 0.8962962962962963, + "grad_norm": 4.958495616912842, + "learning_rate": 6.143949735804477e-07, + "loss": 1.7791, + "step": 6050 + }, + { + "epoch": 0.9037037037037037, + "grad_norm": 4.0520548820495605, + "learning_rate": 5.306313850016154e-07, + "loss": 1.7638, + "step": 6100 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 5.820758819580078, + "learning_rate": 4.5284885260078014e-07, + "loss": 1.7263, + "step": 6150 + }, + { + "epoch": 0.9185185185185185, + "grad_norm": 3.6252601146698, + "learning_rate": 3.8109650549255195e-07, + "loss": 1.736, + "step": 6200 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 4.056784152984619, + "learning_rate": 3.1541966399726287e-07, + "loss": 1.754, + "step": 6250 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 3.9577994346618652, + "learning_rate": 2.5585981101567627e-07, + "loss": 1.8165, + "step": 6300 + }, + { + "epoch": 0.9407407407407408, + "grad_norm": 4.5424089431762695, + "learning_rate": 2.024545658275079e-07, + "loss": 1.8013, + "step": 6350 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 3.9447548389434814, + "learning_rate": 1.5523766033027298e-07, + "loss": 1.7521, + "step": 6400 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 5.120405673980713, + "learning_rate": 1.1423891773350238e-07, + "loss": 1.7422, + "step": 6450 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 4.158134937286377, + "learning_rate": 7.948423372176384e-08, + "loss": 1.7485, + "step": 6500 + }, + { + "epoch": 0.9703703703703703, + "grad_norm": 4.426783561706543, + "learning_rate": 5.099556009838913e-08, + "loss": 1.8421, + "step": 6550 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 4.222548961639404, + "learning_rate": 2.8790890920249447e-08, + "loss": 1.7025, + "step": 6600 + }, + { + "epoch": 0.9851851851851852, + "grad_norm": 4.41562032699585, + "learning_rate": 1.2884251132316839e-08, + "loss": 1.79, + "step": 6650 + }, + { + "epoch": 0.9925925925925926, + "grad_norm": 4.172543048858643, + "learning_rate": 3.2856877092168895e-09, + "loss": 1.7572, + "step": 6700 + }, + { + "epoch": 1.0, + "grad_norm": 4.530770301818848, + "learning_rate": 1.2633093371405836e-12, + "loss": 1.7305, + "step": 6750 + }, + { + "epoch": 1.0074074074074073, + "grad_norm": 4.5686211585998535, + "learning_rate": 1.0485547568805592e-05, + "loss": 1.7918, + "step": 6800 + }, + { + "epoch": 1.0148148148148148, + "grad_norm": 4.193668842315674, + "learning_rate": 1.0364827091168057e-05, + "loss": 1.818, + "step": 6850 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 3.926337957382202, + "learning_rate": 1.0244053349399506e-05, + "loss": 1.7526, + "step": 6900 + }, + { + "epoch": 1.0296296296296297, + "grad_norm": 3.8789143562316895, + "learning_rate": 1.0123243976259578e-05, + "loss": 1.733, + "step": 6950 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 4.395325183868408, + "learning_rate": 1.000241660971001e-05, + "loss": 1.6787, + "step": 7000 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 4.981533527374268, + "learning_rate": 9.881588890339562e-06, + "loss": 1.7642, + "step": 7050 + }, + { + "epoch": 1.0518518518518518, + "grad_norm": 3.6582109928131104, + "learning_rate": 9.760778458788497e-06, + "loss": 1.7935, + "step": 7100 + }, + { + "epoch": 1.0592592592592593, + "grad_norm": 4.174370288848877, + "learning_rate": 9.640002953173087e-06, + "loss": 1.7471, + "step": 7150 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 4.729079246520996, + "learning_rate": 9.519280006510476e-06, + "loss": 1.7822, + "step": 7200 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 3.563659429550171, + "learning_rate": 9.398627244144298e-06, + "loss": 1.7268, + "step": 7250 + }, + { + "epoch": 1.0814814814814815, + "grad_norm": 4.399823188781738, + "learning_rate": 9.278062281171394e-06, + "loss": 1.7783, + "step": 7300 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 3.1910765171051025, + "learning_rate": 9.157602719870045e-06, + "loss": 1.8013, + "step": 7350 + }, + { + "epoch": 1.0962962962962963, + "grad_norm": 4.391103267669678, + "learning_rate": 9.037266147130064e-06, + "loss": 1.7556, + "step": 7400 + }, + { + "epoch": 1.1037037037037036, + "grad_norm": 4.619572162628174, + "learning_rate": 8.917070131885155e-06, + "loss": 1.8024, + "step": 7450 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 4.849502086639404, + "learning_rate": 8.797032222547856e-06, + "loss": 1.7657, + "step": 7500 + }, + { + "epoch": 1.1185185185185185, + "grad_norm": 3.781859874725342, + "learning_rate": 8.67716994444752e-06, + "loss": 1.7622, + "step": 7550 + }, + { + "epoch": 1.125925925925926, + "grad_norm": 3.9502782821655273, + "learning_rate": 8.557500797271638e-06, + "loss": 1.7528, + "step": 7600 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 4.41927433013916, + "learning_rate": 8.438042252510919e-06, + "loss": 1.7763, + "step": 7650 + }, + { + "epoch": 1.1407407407407408, + "grad_norm": 4.123870372772217, + "learning_rate": 8.318811750908481e-06, + "loss": 1.7615, + "step": 7700 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 4.480880260467529, + "learning_rate": 8.199826699913524e-06, + "loss": 1.7523, + "step": 7750 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 4.082483291625977, + "learning_rate": 8.081104471139885e-06, + "loss": 1.7517, + "step": 7800 + }, + { + "epoch": 1.162962962962963, + "grad_norm": 4.776106357574463, + "learning_rate": 7.962662397829805e-06, + "loss": 1.6679, + "step": 7850 + }, + { + "epoch": 1.1703703703703703, + "grad_norm": 5.85430908203125, + "learning_rate": 7.844517772323305e-06, + "loss": 1.6882, + "step": 7900 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 4.3917012214660645, + "learning_rate": 7.726687843533539e-06, + "loss": 1.7661, + "step": 7950 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 3.9886467456817627, + "learning_rate": 7.609189814428473e-06, + "loss": 1.7336, + "step": 8000 + }, + { + "epoch": 1.1925925925925926, + "grad_norm": 4.853082180023193, + "learning_rate": 7.492040839519299e-06, + "loss": 1.8087, + "step": 8050 + }, + { + "epoch": 1.2, + "grad_norm": 4.096999645233154, + "learning_rate": 7.37525802235588e-06, + "loss": 1.7323, + "step": 8100 + }, + { + "epoch": 1.2074074074074075, + "grad_norm": 4.180538177490234, + "learning_rate": 7.258858413029683e-06, + "loss": 1.8245, + "step": 8150 + }, + { + "epoch": 1.2148148148148148, + "grad_norm": 4.450080871582031, + "learning_rate": 7.142859005684486e-06, + "loss": 1.7485, + "step": 8200 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 5.08303689956665, + "learning_rate": 7.027276736035256e-06, + "loss": 1.7721, + "step": 8250 + }, + { + "epoch": 1.2296296296296296, + "grad_norm": 3.8852577209472656, + "learning_rate": 6.912128478895575e-06, + "loss": 1.7258, + "step": 8300 + }, + { + "epoch": 1.237037037037037, + "grad_norm": 4.540018081665039, + "learning_rate": 6.797431045713948e-06, + "loss": 1.8447, + "step": 8350 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 4.3245954513549805, + "learning_rate": 6.683201182119334e-06, + "loss": 1.7964, + "step": 8400 + }, + { + "epoch": 1.2518518518518518, + "grad_norm": 6.528281211853027, + "learning_rate": 6.569455565476361e-06, + "loss": 1.7047, + "step": 8450 + }, + { + "epoch": 1.2592592592592593, + "grad_norm": 4.52567195892334, + "learning_rate": 6.4562108024504065e-06, + "loss": 1.7325, + "step": 8500 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 3.1090915203094482, + "learning_rate": 6.343483426583085e-06, + "loss": 1.7662, + "step": 8550 + }, + { + "epoch": 1.2740740740740741, + "grad_norm": 4.740061283111572, + "learning_rate": 6.231289895878375e-06, + "loss": 1.7386, + "step": 8600 + }, + { + "epoch": 1.2814814814814814, + "grad_norm": 4.207073211669922, + "learning_rate": 6.119646590399768e-06, + "loss": 1.7749, + "step": 8650 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 3.976893901824951, + "learning_rate": 6.008569809878817e-06, + "loss": 1.7334, + "step": 8700 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 5.906888961791992, + "learning_rate": 5.898075771335408e-06, + "loss": 1.739, + "step": 8750 + }, + { + "epoch": 1.3037037037037038, + "grad_norm": 4.902368068695068, + "learning_rate": 5.788180606710076e-06, + "loss": 1.7981, + "step": 8800 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 5.860922813415527, + "learning_rate": 5.678900360508813e-06, + "loss": 1.7231, + "step": 8850 + }, + { + "epoch": 1.3185185185185184, + "grad_norm": 4.610461711883545, + "learning_rate": 5.570250987460557e-06, + "loss": 1.814, + "step": 8900 + }, + { + "epoch": 1.325925925925926, + "grad_norm": 4.3659844398498535, + "learning_rate": 5.462248350187851e-06, + "loss": 1.7117, + "step": 8950 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 5.581747531890869, + "learning_rate": 5.35490821689092e-06, + "loss": 1.7883, + "step": 9000 + }, + { + "epoch": 1.3407407407407408, + "grad_norm": 4.3234333992004395, + "learning_rate": 5.248246259045545e-06, + "loss": 1.7519, + "step": 9050 + }, + { + "epoch": 1.348148148148148, + "grad_norm": 4.9023308753967285, + "learning_rate": 5.142278049115043e-06, + "loss": 1.7388, + "step": 9100 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 5.058767318725586, + "learning_rate": 5.037019058276733e-06, + "loss": 1.7572, + "step": 9150 + }, + { + "epoch": 1.362962962962963, + "grad_norm": 4.163683891296387, + "learning_rate": 4.932484654163156e-06, + "loss": 1.8142, + "step": 9200 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 4.509832859039307, + "learning_rate": 4.828690098618429e-06, + "loss": 1.8259, + "step": 9250 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 4.533676624298096, + "learning_rate": 4.725650545470048e-06, + "loss": 1.7511, + "step": 9300 + }, + { + "epoch": 1.3851851851851853, + "grad_norm": 5.931670665740967, + "learning_rate": 4.62338103831645e-06, + "loss": 1.707, + "step": 9350 + }, + { + "epoch": 1.3925925925925926, + "grad_norm": 5.659622669219971, + "learning_rate": 4.521896508330672e-06, + "loss": 1.6732, + "step": 9400 + }, + { + "epoch": 1.4, + "grad_norm": 5.572442054748535, + "learning_rate": 4.421211772080429e-06, + "loss": 1.774, + "step": 9450 + }, + { + "epoch": 1.4074074074074074, + "grad_norm": 4.983119487762451, + "learning_rate": 4.321341529364921e-06, + "loss": 1.8113, + "step": 9500 + }, + { + "epoch": 1.4148148148148147, + "grad_norm": 4.496840953826904, + "learning_rate": 4.222300361068686e-06, + "loss": 1.8397, + "step": 9550 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 4.4754204750061035, + "learning_rate": 4.12410272703281e-06, + "loss": 1.7113, + "step": 9600 + }, + { + "epoch": 1.4296296296296296, + "grad_norm": 4.279294490814209, + "learning_rate": 4.026762963943822e-06, + "loss": 1.7018, + "step": 9650 + }, + { + "epoch": 1.4370370370370371, + "grad_norm": 4.388822078704834, + "learning_rate": 3.93029528324057e-06, + "loss": 1.6982, + "step": 9700 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 4.000326156616211, + "learning_rate": 3.83471376903936e-06, + "loss": 1.696, + "step": 9750 + }, + { + "epoch": 1.4518518518518517, + "grad_norm": 4.983404636383057, + "learning_rate": 3.740032376077698e-06, + "loss": 1.6858, + "step": 9800 + }, + { + "epoch": 1.4592592592592593, + "grad_norm": 3.823800563812256, + "learning_rate": 3.646264927676937e-06, + "loss": 1.7345, + "step": 9850 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 5.286291599273682, + "learning_rate": 3.5534251137240883e-06, + "loss": 1.6576, + "step": 9900 + }, + { + "epoch": 1.474074074074074, + "grad_norm": 4.020094394683838, + "learning_rate": 3.461526488673118e-06, + "loss": 1.7326, + "step": 9950 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 3.9249887466430664, + "learning_rate": 3.370582469566027e-06, + "loss": 1.7345, + "step": 10000 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 4.402612686157227, + "learning_rate": 3.2806063340739768e-06, + "loss": 1.7224, + "step": 10050 + }, + { + "epoch": 1.4962962962962962, + "grad_norm": 4.24970817565918, + "learning_rate": 3.1916112185587833e-06, + "loss": 1.761, + "step": 10100 + }, + { + "epoch": 1.5037037037037035, + "grad_norm": 5.631906509399414, + "learning_rate": 3.103610116155018e-06, + "loss": 1.7461, + "step": 10150 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 3.5104784965515137, + "learning_rate": 3.0166158748730456e-06, + "loss": 1.747, + "step": 10200 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 5.422842979431152, + "learning_rate": 2.930641195723224e-06, + "loss": 1.7131, + "step": 10250 + }, + { + "epoch": 1.525925925925926, + "grad_norm": 4.995806694030762, + "learning_rate": 2.845698630861593e-06, + "loss": 1.8228, + "step": 10300 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 4.923055171966553, + "learning_rate": 2.761800581757258e-06, + "loss": 1.7489, + "step": 10350 + }, + { + "epoch": 1.5407407407407407, + "grad_norm": 3.771245241165161, + "learning_rate": 2.6789592973818257e-06, + "loss": 1.7353, + "step": 10400 + }, + { + "epoch": 1.5481481481481483, + "grad_norm": 3.624614715576172, + "learning_rate": 2.5971868724210513e-06, + "loss": 1.7126, + "step": 10450 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 7.3490471839904785, + "learning_rate": 2.5164952455090575e-06, + "loss": 1.6574, + "step": 10500 + }, + { + "epoch": 1.5629629629629629, + "grad_norm": 3.961510419845581, + "learning_rate": 2.436896197485282e-06, + "loss": 1.8217, + "step": 10550 + }, + { + "epoch": 1.5703703703703704, + "grad_norm": 4.2067975997924805, + "learning_rate": 2.358401349674528e-06, + "loss": 1.7681, + "step": 10600 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 4.517464637756348, + "learning_rate": 2.2810221621902563e-06, + "loss": 1.7826, + "step": 10650 + }, + { + "epoch": 1.585185185185185, + "grad_norm": 4.264112949371338, + "learning_rate": 2.2047699322614234e-06, + "loss": 1.7051, + "step": 10700 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 4.641966819763184, + "learning_rate": 2.1296557925831164e-06, + "loss": 1.7346, + "step": 10750 + }, + { + "epoch": 1.6, + "grad_norm": 4.101382732391357, + "learning_rate": 2.0556907096911926e-06, + "loss": 1.7784, + "step": 10800 + }, + { + "epoch": 1.6074074074074074, + "grad_norm": 3.9751815795898438, + "learning_rate": 1.9828854823611776e-06, + "loss": 1.8022, + "step": 10850 + }, + { + "epoch": 1.6148148148148147, + "grad_norm": 4.810856819152832, + "learning_rate": 1.9112507400316814e-06, + "loss": 1.6955, + "step": 10900 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 4.649193286895752, + "learning_rate": 1.8407969412525006e-06, + "loss": 1.7858, + "step": 10950 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 4.422476291656494, + "learning_rate": 1.7715343721576973e-06, + "loss": 1.6827, + "step": 11000 + }, + { + "epoch": 1.637037037037037, + "grad_norm": 4.0928473472595215, + "learning_rate": 1.7034731449638287e-06, + "loss": 1.7319, + "step": 11050 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 4.212646007537842, + "learning_rate": 1.6366231964936019e-06, + "loss": 1.6627, + "step": 11100 + }, + { + "epoch": 1.651851851851852, + "grad_norm": 5.1110663414001465, + "learning_rate": 1.5709942867250972e-06, + "loss": 1.7207, + "step": 11150 + }, + { + "epoch": 1.6592592592592592, + "grad_norm": 4.896889686584473, + "learning_rate": 1.5065959973668355e-06, + "loss": 1.6812, + "step": 11200 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 5.54286003112793, + "learning_rate": 1.4434377304588643e-06, + "loss": 1.8249, + "step": 11250 + }, + { + "epoch": 1.674074074074074, + "grad_norm": 6.0234575271606445, + "learning_rate": 1.3815287070000727e-06, + "loss": 1.7191, + "step": 11300 + }, + { + "epoch": 1.6814814814814816, + "grad_norm": 5.337372779846191, + "learning_rate": 1.3208779656019466e-06, + "loss": 1.7623, + "step": 11350 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 4.075193405151367, + "learning_rate": 1.2614943611689446e-06, + "loss": 1.7792, + "step": 11400 + }, + { + "epoch": 1.6962962962962962, + "grad_norm": 3.773776054382324, + "learning_rate": 1.203386563605693e-06, + "loss": 1.7327, + "step": 11450 + }, + { + "epoch": 1.7037037037037037, + "grad_norm": 3.25433349609375, + "learning_rate": 1.146563056551202e-06, + "loss": 1.6928, + "step": 11500 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 3.913914442062378, + "learning_rate": 1.0910321361402654e-06, + "loss": 1.6769, + "step": 11550 + }, + { + "epoch": 1.7185185185185186, + "grad_norm": 3.303250551223755, + "learning_rate": 1.0368019097922344e-06, + "loss": 1.713, + "step": 11600 + }, + { + "epoch": 1.7259259259259259, + "grad_norm": 5.97332239151001, + "learning_rate": 9.838802950273551e-07, + "loss": 1.7482, + "step": 11650 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 4.753826141357422, + "learning_rate": 9.322750183108264e-07, + "loss": 1.7199, + "step": 11700 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 4.7902092933654785, + "learning_rate": 8.819936139247421e-07, + "loss": 1.6939, + "step": 11750 + }, + { + "epoch": 1.748148148148148, + "grad_norm": 4.5352630615234375, + "learning_rate": 8.33043422868095e-07, + "loss": 1.7428, + "step": 11800 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 3.912677526473999, + "learning_rate": 7.854315917850163e-07, + "loss": 1.6898, + "step": 11850 + }, + { + "epoch": 1.762962962962963, + "grad_norm": 5.150155067443848, + "learning_rate": 7.391650719213706e-07, + "loss": 1.6438, + "step": 11900 + }, + { + "epoch": 1.7703703703703704, + "grad_norm": 6.563386917114258, + "learning_rate": 6.942506181098851e-07, + "loss": 1.7661, + "step": 11950 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 3.9827239513397217, + "learning_rate": 6.506947877839587e-07, + "loss": 1.8036, + "step": 12000 + }, + { + "epoch": 1.7851851851851852, + "grad_norm": 4.162949562072754, + "learning_rate": 6.085039400202852e-07, + "loss": 1.772, + "step": 12050 + }, + { + "epoch": 1.7925925925925927, + "grad_norm": 4.235226154327393, + "learning_rate": 5.676842346104383e-07, + "loss": 1.6865, + "step": 12100 + }, + { + "epoch": 1.8, + "grad_norm": 4.130675315856934, + "learning_rate": 5.28241631161559e-07, + "loss": 1.6688, + "step": 12150 + }, + { + "epoch": 1.8074074074074074, + "grad_norm": 4.83511209487915, + "learning_rate": 4.901818882262532e-07, + "loss": 1.6613, + "step": 12200 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 4.381454944610596, + "learning_rate": 4.53510562461863e-07, + "loss": 1.7773, + "step": 12250 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 5.143768787384033, + "learning_rate": 4.182330078191976e-07, + "loss": 1.8461, + "step": 12300 + }, + { + "epoch": 1.8296296296296295, + "grad_norm": 4.87855339050293, + "learning_rate": 3.8435437476086466e-07, + "loss": 1.7111, + "step": 12350 + }, + { + "epoch": 1.837037037037037, + "grad_norm": 4.4321441650390625, + "learning_rate": 3.51879609509318e-07, + "loss": 1.7595, + "step": 12400 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 4.462729454040527, + "learning_rate": 3.2081345332471204e-07, + "loss": 1.8522, + "step": 12450 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 5.265483379364014, + "learning_rate": 2.911604418126901e-07, + "loss": 1.7855, + "step": 12500 + }, + { + "epoch": 1.8592592592592592, + "grad_norm": 6.335628509521484, + "learning_rate": 2.6292490426218955e-07, + "loss": 1.6626, + "step": 12550 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 5.781252384185791, + "learning_rate": 2.3611096301337623e-07, + "loss": 1.8229, + "step": 12600 + }, + { + "epoch": 1.8740740740740742, + "grad_norm": 4.649851322174072, + "learning_rate": 2.1072253285578602e-07, + "loss": 1.7732, + "step": 12650 + }, + { + "epoch": 1.8814814814814815, + "grad_norm": 4.953747749328613, + "learning_rate": 1.867633204567776e-07, + "loss": 1.6664, + "step": 12700 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 3.763566493988037, + "learning_rate": 1.6423682382036288e-07, + "loss": 1.764, + "step": 12750 + }, + { + "epoch": 1.8962962962962964, + "grad_norm": 5.626904487609863, + "learning_rate": 1.431463317765025e-07, + "loss": 1.7337, + "step": 12800 + }, + { + "epoch": 1.9037037037037037, + "grad_norm": 5.060257434844971, + "learning_rate": 1.2349492350094195e-07, + "loss": 1.8271, + "step": 12850 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 4.137857913970947, + "learning_rate": 1.0528546806566342e-07, + "loss": 1.7984, + "step": 12900 + }, + { + "epoch": 1.9185185185185185, + "grad_norm": 4.870020389556885, + "learning_rate": 8.852062402000095e-08, + "loss": 1.7068, + "step": 12950 + }, + { + "epoch": 1.925925925925926, + "grad_norm": 4.509660243988037, + "learning_rate": 7.320283900249636e-08, + "loss": 1.7385, + "step": 13000 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 4.607212543487549, + "learning_rate": 5.933434938354965e-08, + "loss": 1.7423, + "step": 13050 + }, + { + "epoch": 1.9407407407407407, + "grad_norm": 4.2224016189575195, + "learning_rate": 4.6917179938912315e-08, + "loss": 1.7218, + "step": 13100 + }, + { + "epoch": 1.9481481481481482, + "grad_norm": 4.699623107910156, + "learning_rate": 3.595314355407609e-08, + "loss": 1.7581, + "step": 13150 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 5.265933990478516, + "learning_rate": 2.6443840959590183e-08, + "loss": 1.7206, + "step": 13200 + }, + { + "epoch": 1.9629629629629628, + "grad_norm": 4.173360347747803, + "learning_rate": 1.839066049736271e-08, + "loss": 1.7231, + "step": 13250 + }, + { + "epoch": 1.9703703703703703, + "grad_norm": 4.037991523742676, + "learning_rate": 1.1794777917957245e-08, + "loss": 1.6786, + "step": 13300 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 3.815904140472412, + "learning_rate": 6.657156208946802e-09, + "loss": 1.7763, + "step": 13350 + }, + { + "epoch": 1.9851851851851852, + "grad_norm": 4.844634532928467, + "learning_rate": 2.9785454543074244e-09, + "loss": 1.7412, + "step": 13400 + }, + { + "epoch": 1.9925925925925925, + "grad_norm": 4.531811237335205, + "learning_rate": 7.594827249135517e-10, + "loss": 1.7515, + "step": 13450 + }, + { + "epoch": 2.0, + "grad_norm": 5.7891435623168945, + "learning_rate": 2.9200012852115266e-13, + "loss": 1.7226, + "step": 13500 + }, + { + "epoch": 2.0074074074074075, + "grad_norm": 5.030249118804932, + "learning_rate": 5.162960293961459e-06, + "loss": 1.702, + "step": 13550 + }, + { + "epoch": 2.0148148148148146, + "grad_norm": 4.552247047424316, + "learning_rate": 5.093503346649434e-06, + "loss": 1.6877, + "step": 13600 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 4.661364555358887, + "learning_rate": 5.024356765789562e-06, + "loss": 1.6621, + "step": 13650 + }, + { + "epoch": 2.0296296296296297, + "grad_norm": 4.184024333953857, + "learning_rate": 4.955524925333455e-06, + "loss": 1.811, + "step": 13700 + }, + { + "epoch": 2.037037037037037, + "grad_norm": 4.5465898513793945, + "learning_rate": 4.887012179323451e-06, + "loss": 1.7626, + "step": 13750 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 5.515983581542969, + "learning_rate": 4.818822861617165e-06, + "loss": 1.761, + "step": 13800 + }, + { + "epoch": 2.051851851851852, + "grad_norm": 4.300454616546631, + "learning_rate": 4.7509612856133645e-06, + "loss": 1.7684, + "step": 13850 + }, + { + "epoch": 2.0592592592592593, + "grad_norm": 4.138401508331299, + "learning_rate": 4.683431743979113e-06, + "loss": 1.6814, + "step": 13900 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 4.315591335296631, + "learning_rate": 4.616238508378233e-06, + "loss": 1.7425, + "step": 13950 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 5.489791393280029, + "learning_rate": 4.549385829201098e-06, + "loss": 1.7289, + "step": 14000 + }, + { + "epoch": 2.0814814814814815, + "grad_norm": 3.5941522121429443, + "learning_rate": 4.482877935295768e-06, + "loss": 1.7781, + "step": 14050 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 4.438997268676758, + "learning_rate": 4.416719033700483e-06, + "loss": 1.7008, + "step": 14100 + }, + { + "epoch": 2.096296296296296, + "grad_norm": 4.225255966186523, + "learning_rate": 4.350913309377562e-06, + "loss": 1.7366, + "step": 14150 + }, + { + "epoch": 2.1037037037037036, + "grad_norm": 5.48150110244751, + "learning_rate": 4.28546492494865e-06, + "loss": 1.784, + "step": 14200 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 4.1811299324035645, + "learning_rate": 4.220378020431424e-06, + "loss": 1.7335, + "step": 14250 + }, + { + "epoch": 2.1185185185185187, + "grad_norm": 4.268235683441162, + "learning_rate": 4.155656712977703e-06, + "loss": 1.7866, + "step": 14300 + }, + { + "epoch": 2.1259259259259258, + "grad_norm": 4.499932765960693, + "learning_rate": 4.091305096613023e-06, + "loss": 1.7826, + "step": 14350 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 5.271858215332031, + "learning_rate": 4.027327241977652e-06, + "loss": 1.7491, + "step": 14400 + }, + { + "epoch": 2.140740740740741, + "grad_norm": 5.147281169891357, + "learning_rate": 3.963727196069101e-06, + "loss": 1.8014, + "step": 14450 + }, + { + "epoch": 2.148148148148148, + "grad_norm": 4.847378730773926, + "learning_rate": 3.900508981986137e-06, + "loss": 1.6866, + "step": 14500 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 4.832731246948242, + "learning_rate": 3.8376765986742795e-06, + "loss": 1.811, + "step": 14550 + }, + { + "epoch": 2.162962962962963, + "grad_norm": 4.3363800048828125, + "learning_rate": 3.77523402067285e-06, + "loss": 1.7513, + "step": 14600 + }, + { + "epoch": 2.1703703703703705, + "grad_norm": 5.211742877960205, + "learning_rate": 3.71318519786356e-06, + "loss": 1.6949, + "step": 14650 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 5.9687113761901855, + "learning_rate": 3.6515340552206547e-06, + "loss": 1.6632, + "step": 14700 + }, + { + "epoch": 2.185185185185185, + "grad_norm": 4.977273464202881, + "learning_rate": 3.5902844925626334e-06, + "loss": 1.7158, + "step": 14750 + }, + { + "epoch": 2.1925925925925926, + "grad_norm": 4.584545612335205, + "learning_rate": 3.5294403843055604e-06, + "loss": 1.6568, + "step": 14800 + }, + { + "epoch": 2.2, + "grad_norm": 5.935274600982666, + "learning_rate": 3.4690055792179824e-06, + "loss": 1.6968, + "step": 14850 + }, + { + "epoch": 2.2074074074074073, + "grad_norm": 5.389420509338379, + "learning_rate": 3.408983900177486e-06, + "loss": 1.763, + "step": 14900 + }, + { + "epoch": 2.214814814814815, + "grad_norm": 5.381558895111084, + "learning_rate": 3.3493791439288503e-06, + "loss": 1.7165, + "step": 14950 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 4.326274871826172, + "learning_rate": 3.2901950808438975e-06, + "loss": 1.7434, + "step": 15000 + }, + { + "epoch": 2.2296296296296294, + "grad_norm": 5.2910919189453125, + "learning_rate": 3.2314354546829874e-06, + "loss": 1.7093, + "step": 15050 + }, + { + "epoch": 2.237037037037037, + "grad_norm": 3.7212393283843994, + "learning_rate": 3.173103982358211e-06, + "loss": 1.6989, + "step": 15100 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 5.0373759269714355, + "learning_rate": 3.11520435369825e-06, + "loss": 1.798, + "step": 15150 + }, + { + "epoch": 2.251851851851852, + "grad_norm": 5.25777006149292, + "learning_rate": 3.0577402312149963e-06, + "loss": 1.7995, + "step": 15200 + }, + { + "epoch": 2.259259259259259, + "grad_norm": 4.734623432159424, + "learning_rate": 3.0007152498718596e-06, + "loss": 1.7205, + "step": 15250 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 5.25666618347168, + "learning_rate": 2.9441330168538484e-06, + "loss": 1.8312, + "step": 15300 + }, + { + "epoch": 2.274074074074074, + "grad_norm": 4.524062156677246, + "learning_rate": 2.8879971113393755e-06, + "loss": 1.7134, + "step": 15350 + }, + { + "epoch": 2.2814814814814817, + "grad_norm": 5.685743808746338, + "learning_rate": 2.832311084273863e-06, + "loss": 1.7821, + "step": 15400 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 4.252753734588623, + "learning_rate": 2.7770784581451205e-06, + "loss": 1.7991, + "step": 15450 + }, + { + "epoch": 2.2962962962962963, + "grad_norm": 5.265336990356445, + "learning_rate": 2.7223027267605307e-06, + "loss": 1.763, + "step": 15500 + }, + { + "epoch": 2.303703703703704, + "grad_norm": 4.18657922744751, + "learning_rate": 2.667987355026039e-06, + "loss": 1.7952, + "step": 15550 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 4.812895774841309, + "learning_rate": 2.614135778726965e-06, + "loss": 1.7109, + "step": 15600 + }, + { + "epoch": 2.3185185185185184, + "grad_norm": 5.158443450927734, + "learning_rate": 2.5607514043106997e-06, + "loss": 1.6805, + "step": 15650 + }, + { + "epoch": 2.325925925925926, + "grad_norm": 4.908586025238037, + "learning_rate": 2.507837608671194e-06, + "loss": 1.726, + "step": 15700 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 4.755005836486816, + "learning_rate": 2.4553977389353678e-06, + "loss": 1.7835, + "step": 15750 + }, + { + "epoch": 2.3407407407407406, + "grad_norm": 4.698461532592773, + "learning_rate": 2.4034351122513723e-06, + "loss": 1.7529, + "step": 15800 + }, + { + "epoch": 2.348148148148148, + "grad_norm": 5.384017467498779, + "learning_rate": 2.351953015578775e-06, + "loss": 1.8149, + "step": 15850 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 7.171658515930176, + "learning_rate": 2.3009547054806205e-06, + "loss": 1.7038, + "step": 15900 + }, + { + "epoch": 2.362962962962963, + "grad_norm": 5.168342590332031, + "learning_rate": 2.2504434079174465e-06, + "loss": 1.8115, + "step": 15950 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 4.522395610809326, + "learning_rate": 2.200422318043206e-06, + "loss": 1.7485, + "step": 16000 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 5.041557788848877, + "learning_rate": 2.150894600003182e-06, + "loss": 1.6743, + "step": 16050 + }, + { + "epoch": 2.3851851851851853, + "grad_norm": 5.59356164932251, + "learning_rate": 2.1018633867338055e-06, + "loss": 1.8181, + "step": 16100 + }, + { + "epoch": 2.3925925925925924, + "grad_norm": 5.429520130157471, + "learning_rate": 2.0533317797644947e-06, + "loss": 1.773, + "step": 16150 + }, + { + "epoch": 2.4, + "grad_norm": 4.348500728607178, + "learning_rate": 2.0053028490214555e-06, + "loss": 1.7281, + "step": 16200 + }, + { + "epoch": 2.4074074074074074, + "grad_norm": 5.83561897277832, + "learning_rate": 1.957779632633503e-06, + "loss": 1.6564, + "step": 16250 + }, + { + "epoch": 2.414814814814815, + "grad_norm": 5.049750804901123, + "learning_rate": 1.910765136739864e-06, + "loss": 1.7724, + "step": 16300 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 5.865388870239258, + "learning_rate": 1.8642623353000277e-06, + "loss": 1.7494, + "step": 16350 + }, + { + "epoch": 2.4296296296296296, + "grad_norm": 4.3326215744018555, + "learning_rate": 1.8182741699056273e-06, + "loss": 1.8173, + "step": 16400 + }, + { + "epoch": 2.437037037037037, + "grad_norm": 5.618774890899658, + "learning_rate": 1.7728035495943618e-06, + "loss": 1.7839, + "step": 16450 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 5.614139556884766, + "learning_rate": 1.7278533506659822e-06, + "loss": 1.6849, + "step": 16500 + }, + { + "epoch": 2.4518518518518517, + "grad_norm": 4.3281779289245605, + "learning_rate": 1.6834264165003478e-06, + "loss": 1.701, + "step": 16550 + }, + { + "epoch": 2.4592592592592593, + "grad_norm": 4.949063777923584, + "learning_rate": 1.6395255573775626e-06, + "loss": 1.7329, + "step": 16600 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 4.536187648773193, + "learning_rate": 1.5961535503002168e-06, + "loss": 1.7248, + "step": 16650 + }, + { + "epoch": 2.474074074074074, + "grad_norm": 4.2007293701171875, + "learning_rate": 1.5533131388177115e-06, + "loss": 1.6858, + "step": 16700 + }, + { + "epoch": 2.4814814814814814, + "grad_norm": 4.544723033905029, + "learning_rate": 1.511007032852716e-06, + "loss": 1.7631, + "step": 16750 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 4.876046180725098, + "learning_rate": 1.4692379085297549e-06, + "loss": 1.727, + "step": 16800 + }, + { + "epoch": 2.4962962962962965, + "grad_norm": 4.374514102935791, + "learning_rate": 1.4280084080059175e-06, + "loss": 1.6994, + "step": 16850 + }, + { + "epoch": 2.5037037037037035, + "grad_norm": 4.541500091552734, + "learning_rate": 1.3873211393037333e-06, + "loss": 1.7648, + "step": 16900 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 4.923274993896484, + "learning_rate": 1.347178676146188e-06, + "loss": 1.7582, + "step": 16950 + }, + { + "epoch": 2.5185185185185186, + "grad_norm": 4.922545433044434, + "learning_rate": 1.3075835577939332e-06, + "loss": 1.7092, + "step": 17000 + }, + { + "epoch": 2.525925925925926, + "grad_norm": 3.655978202819824, + "learning_rate": 1.268538288884651e-06, + "loss": 1.7205, + "step": 17050 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 4.191182613372803, + "learning_rate": 1.2300453392746226e-06, + "loss": 1.7521, + "step": 17100 + }, + { + "epoch": 2.5407407407407407, + "grad_norm": 4.058208465576172, + "learning_rate": 1.1921071438824971e-06, + "loss": 1.7354, + "step": 17150 + }, + { + "epoch": 2.5481481481481483, + "grad_norm": 4.33668327331543, + "learning_rate": 1.1547261025352674e-06, + "loss": 1.7775, + "step": 17200 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 4.856389999389648, + "learning_rate": 1.1179045798164634e-06, + "loss": 1.8189, + "step": 17250 + }, + { + "epoch": 2.562962962962963, + "grad_norm": 4.473984241485596, + "learning_rate": 1.08164490491658e-06, + "loss": 1.763, + "step": 17300 + }, + { + "epoch": 2.5703703703703704, + "grad_norm": 5.137580871582031, + "learning_rate": 1.0459493714857404e-06, + "loss": 1.7901, + "step": 17350 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 4.774714946746826, + "learning_rate": 1.0108202374886111e-06, + "loss": 1.6943, + "step": 17400 + }, + { + "epoch": 2.585185185185185, + "grad_norm": 4.900181293487549, + "learning_rate": 9.762597250615647e-07, + "loss": 1.8258, + "step": 17450 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 3.775563955307007, + "learning_rate": 9.422700203721235e-07, + "loss": 1.7091, + "step": 17500 + }, + { + "epoch": 2.6, + "grad_norm": 5.2524800300598145, + "learning_rate": 9.088532734806655e-07, + "loss": 1.7415, + "step": 17550 + }, + { + "epoch": 2.6074074074074076, + "grad_norm": 4.3708648681640625, + "learning_rate": 8.760115982044259e-07, + "loss": 1.8055, + "step": 17600 + }, + { + "epoch": 2.6148148148148147, + "grad_norm": 5.368456840515137, + "learning_rate": 8.437470719837737e-07, + "loss": 1.7102, + "step": 17650 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 4.581341743469238, + "learning_rate": 8.120617357508109e-07, + "loss": 1.7465, + "step": 17700 + }, + { + "epoch": 2.6296296296296298, + "grad_norm": 4.498773097991943, + "learning_rate": 7.809575938002744e-07, + "loss": 1.7582, + "step": 17750 + }, + { + "epoch": 2.637037037037037, + "grad_norm": 4.894174098968506, + "learning_rate": 7.504366136627372e-07, + "loss": 1.7584, + "step": 17800 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 5.387177467346191, + "learning_rate": 7.205007259801589e-07, + "loss": 1.7143, + "step": 17850 + }, + { + "epoch": 2.651851851851852, + "grad_norm": 4.475066661834717, + "learning_rate": 6.911518243837634e-07, + "loss": 1.6871, + "step": 17900 + }, + { + "epoch": 2.659259259259259, + "grad_norm": 5.426724910736084, + "learning_rate": 6.623917653742473e-07, + "loss": 1.7643, + "step": 17950 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 5.106094837188721, + "learning_rate": 6.342223682043536e-07, + "loss": 1.7472, + "step": 18000 + }, + { + "epoch": 2.674074074074074, + "grad_norm": 5.548438549041748, + "learning_rate": 6.066454147637791e-07, + "loss": 1.7487, + "step": 18050 + }, + { + "epoch": 2.6814814814814816, + "grad_norm": 4.959684371948242, + "learning_rate": 5.796626494664736e-07, + "loss": 1.7598, + "step": 18100 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 4.252431392669678, + "learning_rate": 5.53275779140291e-07, + "loss": 1.7861, + "step": 18150 + }, + { + "epoch": 2.696296296296296, + "grad_norm": 4.257000923156738, + "learning_rate": 5.274864729190121e-07, + "loss": 1.7652, + "step": 18200 + }, + { + "epoch": 2.7037037037037037, + "grad_norm": 5.807356357574463, + "learning_rate": 5.022963621367738e-07, + "loss": 1.7479, + "step": 18250 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 4.54324197769165, + "learning_rate": 4.777070402248674e-07, + "loss": 1.6874, + "step": 18300 + }, + { + "epoch": 2.7185185185185183, + "grad_norm": 4.591061592102051, + "learning_rate": 4.5372006261095616e-07, + "loss": 1.7495, + "step": 18350 + }, + { + "epoch": 2.725925925925926, + "grad_norm": 4.683126449584961, + "learning_rate": 4.3033694662067193e-07, + "loss": 1.8024, + "step": 18400 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 5.42063570022583, + "learning_rate": 4.075591713816396e-07, + "loss": 1.7193, + "step": 18450 + }, + { + "epoch": 2.7407407407407405, + "grad_norm": 4.868284225463867, + "learning_rate": 3.85388177729914e-07, + "loss": 1.7279, + "step": 18500 + }, + { + "epoch": 2.748148148148148, + "grad_norm": 4.900742053985596, + "learning_rate": 3.6382536811884304e-07, + "loss": 1.7472, + "step": 18550 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 6.0796308517456055, + "learning_rate": 3.428721065303442e-07, + "loss": 1.7336, + "step": 18600 + }, + { + "epoch": 2.762962962962963, + "grad_norm": 5.27460241317749, + "learning_rate": 3.225297183886289e-07, + "loss": 1.7737, + "step": 18650 + }, + { + "epoch": 2.7703703703703706, + "grad_norm": 5.652526378631592, + "learning_rate": 3.0279949047636094e-07, + "loss": 1.6492, + "step": 18700 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 4.943962574005127, + "learning_rate": 2.836826708532603e-07, + "loss": 1.7477, + "step": 18750 + }, + { + "epoch": 2.785185185185185, + "grad_norm": 4.6496429443359375, + "learning_rate": 2.6518046877715643e-07, + "loss": 1.7588, + "step": 18800 + }, + { + "epoch": 2.7925925925925927, + "grad_norm": 5.191527366638184, + "learning_rate": 2.472940546274871e-07, + "loss": 1.7494, + "step": 18850 + }, + { + "epoch": 2.8, + "grad_norm": 4.17832612991333, + "learning_rate": 2.300245598312778e-07, + "loss": 1.6849, + "step": 18900 + }, + { + "epoch": 2.8074074074074074, + "grad_norm": 3.590705633163452, + "learning_rate": 2.1337307679156206e-07, + "loss": 1.7048, + "step": 18950 + }, + { + "epoch": 2.814814814814815, + "grad_norm": 7.666831970214844, + "learning_rate": 1.9734065881828467e-07, + "loss": 1.7125, + "step": 19000 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 4.35758638381958, + "learning_rate": 1.8192832006166949e-07, + "loss": 1.7405, + "step": 19050 + }, + { + "epoch": 2.8296296296296295, + "grad_norm": 4.952466011047363, + "learning_rate": 1.6713703544807169e-07, + "loss": 1.7473, + "step": 19100 + }, + { + "epoch": 2.837037037037037, + "grad_norm": 3.8383476734161377, + "learning_rate": 1.5296774061830722e-07, + "loss": 1.7803, + "step": 19150 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 4.229979515075684, + "learning_rate": 1.3942133186846563e-07, + "loss": 1.6724, + "step": 19200 + }, + { + "epoch": 2.851851851851852, + "grad_norm": 4.510770797729492, + "learning_rate": 1.2649866609321548e-07, + "loss": 1.7898, + "step": 19250 + }, + { + "epoch": 2.859259259259259, + "grad_norm": 5.060910701751709, + "learning_rate": 1.1420056073159879e-07, + "loss": 1.632, + "step": 19300 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 4.019813537597656, + "learning_rate": 1.0252779371532795e-07, + "loss": 1.72, + "step": 19350 + }, + { + "epoch": 2.8740740740740742, + "grad_norm": 4.553711414337158, + "learning_rate": 9.148110341956618e-08, + "loss": 1.7606, + "step": 19400 + }, + { + "epoch": 2.8814814814814813, + "grad_norm": 4.837120056152344, + "learning_rate": 8.10611886162338e-08, + "loss": 1.7198, + "step": 19450 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 6.557154178619385, + "learning_rate": 7.126870842979695e-08, + "loss": 1.8004, + "step": 19500 + }, + { + "epoch": 2.8962962962962964, + "grad_norm": 6.096388816833496, + "learning_rate": 6.210428229557641e-08, + "loss": 1.7971, + "step": 19550 + }, + { + "epoch": 2.9037037037037035, + "grad_norm": 5.821315288543701, + "learning_rate": 5.356848992056574e-08, + "loss": 1.6286, + "step": 19600 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 6.1501336097717285, + "learning_rate": 4.566187124676269e-08, + "loss": 1.798, + "step": 19650 + }, + { + "epoch": 2.9185185185185185, + "grad_norm": 5.411888599395752, + "learning_rate": 3.8384926417008864e-08, + "loss": 1.7652, + "step": 19700 + }, + { + "epoch": 2.925925925925926, + "grad_norm": 4.397125720977783, + "learning_rate": 3.1738115743358275e-08, + "loss": 1.7398, + "step": 19750 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 4.768675327301025, + "learning_rate": 2.5721859677957374e-08, + "loss": 1.7477, + "step": 19800 + }, + { + "epoch": 2.9407407407407407, + "grad_norm": 4.665643215179443, + "learning_rate": 2.033653878644626e-08, + "loss": 1.7327, + "step": 19850 + }, + { + "epoch": 2.948148148148148, + "grad_norm": 4.826103687286377, + "learning_rate": 1.5582493723893533e-08, + "loss": 1.8006, + "step": 19900 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 5.414469242095947, + "learning_rate": 1.1460025213236858e-08, + "loss": 1.7529, + "step": 19950 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 3.83282732963562, + "learning_rate": 7.969394026269284e-09, + "loss": 1.6982, + "step": 20000 + }, + { + "epoch": 2.9703703703703703, + "grad_norm": 4.964536190032959, + "learning_rate": 5.110820967140217e-09, + "loss": 1.7715, + "step": 20050 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 4.930854320526123, + "learning_rate": 2.8844868583866036e-09, + "loss": 1.7523, + "step": 20100 + }, + { + "epoch": 2.985185185185185, + "grad_norm": 4.766547679901123, + "learning_rate": 1.2905325294987337e-09, + "loss": 1.7657, + "step": 20150 + }, + { + "epoch": 2.9925925925925925, + "grad_norm": 5.212747097015381, + "learning_rate": 3.290588080095969e-10, + "loss": 1.7505, + "step": 20200 + }, + { + "epoch": 3.0, + "grad_norm": 4.470175266265869, + "learning_rate": 1.265131144290166e-13, + "loss": 1.622, + "step": 20250 + }, + { + "epoch": 3.0074074074074075, + "grad_norm": 5.816748142242432, + "learning_rate": 2.9929275944909074e-06, + "loss": 1.716, + "step": 20300 + }, + { + "epoch": 3.0148148148148146, + "grad_norm": 5.761480331420898, + "learning_rate": 2.9507610183637545e-06, + "loss": 1.7195, + "step": 20350 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 4.432187557220459, + "learning_rate": 2.9088421211129593e-06, + "loss": 1.7226, + "step": 20400 + }, + { + "epoch": 3.0296296296296297, + "grad_norm": 5.013230323791504, + "learning_rate": 2.8671723755819604e-06, + "loss": 1.712, + "step": 20450 + }, + { + "epoch": 3.037037037037037, + "grad_norm": 5.5916266441345215, + "learning_rate": 2.8257532458601156e-06, + "loss": 1.7733, + "step": 20500 + }, + { + "epoch": 3.0444444444444443, + "grad_norm": 4.905713081359863, + "learning_rate": 2.7845861872312497e-06, + "loss": 1.7558, + "step": 20550 + }, + { + "epoch": 3.051851851851852, + "grad_norm": 4.5096306800842285, + "learning_rate": 2.743672646122539e-06, + "loss": 1.7287, + "step": 20600 + }, + { + "epoch": 3.0592592592592593, + "grad_norm": 3.8488352298736572, + "learning_rate": 2.703014060053688e-06, + "loss": 1.6764, + "step": 20650 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 5.562567234039307, + "learning_rate": 2.6626118575864003e-06, + "loss": 1.7835, + "step": 20700 + }, + { + "epoch": 3.074074074074074, + "grad_norm": 4.081836700439453, + "learning_rate": 2.622467458274216e-06, + "loss": 1.775, + "step": 20750 + }, + { + "epoch": 3.0814814814814815, + "grad_norm": 5.104362964630127, + "learning_rate": 2.5825822726126095e-06, + "loss": 1.6917, + "step": 20800 + }, + { + "epoch": 3.088888888888889, + "grad_norm": 3.979689359664917, + "learning_rate": 2.542957701989447e-06, + "loss": 1.6521, + "step": 20850 + }, + { + "epoch": 3.096296296296296, + "grad_norm": 4.8478779792785645, + "learning_rate": 2.503595138635747e-06, + "loss": 1.6938, + "step": 20900 + }, + { + "epoch": 3.1037037037037036, + "grad_norm": 4.569914817810059, + "learning_rate": 2.464495965576745e-06, + "loss": 1.6867, + "step": 20950 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 5.286581993103027, + "learning_rate": 2.4256615565833285e-06, + "loss": 1.712, + "step": 21000 + }, + { + "epoch": 3.1185185185185187, + "grad_norm": 3.896458387374878, + "learning_rate": 2.3870932761237487e-06, + "loss": 1.7861, + "step": 21050 + }, + { + "epoch": 3.1259259259259258, + "grad_norm": 3.608572244644165, + "learning_rate": 2.34879247931568e-06, + "loss": 1.7556, + "step": 21100 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 4.977129936218262, + "learning_rate": 2.310760511878619e-06, + "loss": 1.7147, + "step": 21150 + }, + { + "epoch": 3.140740740740741, + "grad_norm": 6.970212936401367, + "learning_rate": 2.2729987100865946e-06, + "loss": 1.7385, + "step": 21200 + }, + { + "epoch": 3.148148148148148, + "grad_norm": 5.554306507110596, + "learning_rate": 2.2355084007212126e-06, + "loss": 1.7916, + "step": 21250 + }, + { + "epoch": 3.1555555555555554, + "grad_norm": 4.7065558433532715, + "learning_rate": 2.1982909010250465e-06, + "loss": 1.7447, + "step": 21300 + }, + { + "epoch": 3.162962962962963, + "grad_norm": 3.5494189262390137, + "learning_rate": 2.161347518655358e-06, + "loss": 1.6721, + "step": 21350 + }, + { + "epoch": 3.1703703703703705, + "grad_norm": 5.571441173553467, + "learning_rate": 2.1246795516381324e-06, + "loss": 1.8119, + "step": 21400 + }, + { + "epoch": 3.1777777777777776, + "grad_norm": 4.489774227142334, + "learning_rate": 2.0882882883224996e-06, + "loss": 1.7543, + "step": 21450 + }, + { + "epoch": 3.185185185185185, + "grad_norm": 4.602801322937012, + "learning_rate": 2.0521750073354484e-06, + "loss": 1.6942, + "step": 21500 + }, + { + "epoch": 3.1925925925925926, + "grad_norm": 4.661532878875732, + "learning_rate": 2.0163409775369015e-06, + "loss": 1.7236, + "step": 21550 + }, + { + "epoch": 3.2, + "grad_norm": 5.045383930206299, + "learning_rate": 1.9807874579751427e-06, + "loss": 1.7122, + "step": 21600 + }, + { + "epoch": 3.2074074074074073, + "grad_norm": 3.879729986190796, + "learning_rate": 1.9455156978425783e-06, + "loss": 1.7256, + "step": 21650 + }, + { + "epoch": 3.214814814814815, + "grad_norm": 5.181421279907227, + "learning_rate": 1.9105269364318323e-06, + "loss": 1.7287, + "step": 21700 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 6.361110210418701, + "learning_rate": 1.8758224030922224e-06, + "loss": 1.7141, + "step": 21750 + }, + { + "epoch": 3.2296296296296294, + "grad_norm": 4.662287712097168, + "learning_rate": 1.8414033171865564e-06, + "loss": 1.7346, + "step": 21800 + }, + { + "epoch": 3.237037037037037, + "grad_norm": 5.042120933532715, + "learning_rate": 1.8072708880482825e-06, + "loss": 1.6893, + "step": 21850 + }, + { + "epoch": 3.2444444444444445, + "grad_norm": 5.092305660247803, + "learning_rate": 1.7734263149390141e-06, + "loss": 1.7447, + "step": 21900 + }, + { + "epoch": 3.251851851851852, + "grad_norm": 5.0466227531433105, + "learning_rate": 1.739870787006387e-06, + "loss": 1.687, + "step": 21950 + }, + { + "epoch": 3.259259259259259, + "grad_norm": 4.487703323364258, + "learning_rate": 1.7066054832422641e-06, + "loss": 1.7448, + "step": 22000 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 4.347760200500488, + "learning_rate": 1.6736315724413344e-06, + "loss": 1.7028, + "step": 22050 + }, + { + "epoch": 3.274074074074074, + "grad_norm": 4.359129905700684, + "learning_rate": 1.6409502131600352e-06, + "loss": 1.6988, + "step": 22100 + }, + { + "epoch": 3.2814814814814817, + "grad_norm": 5.131579399108887, + "learning_rate": 1.6085625536758376e-06, + "loss": 1.7299, + "step": 22150 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 5.537661552429199, + "learning_rate": 1.5764697319469147e-06, + "loss": 1.6983, + "step": 22200 + }, + { + "epoch": 3.2962962962962963, + "grad_norm": 5.458590030670166, + "learning_rate": 1.5446728755721563e-06, + "loss": 1.7139, + "step": 22250 + }, + { + "epoch": 3.303703703703704, + "grad_norm": 5.582429885864258, + "learning_rate": 1.5131731017515384e-06, + "loss": 1.7113, + "step": 22300 + }, + { + "epoch": 3.311111111111111, + "grad_norm": 5.524541854858398, + "learning_rate": 1.4819715172468873e-06, + "loss": 1.7508, + "step": 22350 + }, + { + "epoch": 3.3185185185185184, + "grad_norm": 5.851492404937744, + "learning_rate": 1.451069218342983e-06, + "loss": 1.7525, + "step": 22400 + }, + { + "epoch": 3.325925925925926, + "grad_norm": 5.018085479736328, + "learning_rate": 1.4204672908090345e-06, + "loss": 1.7677, + "step": 22450 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 4.522815704345703, + "learning_rate": 1.390166809860547e-06, + "loss": 1.7647, + "step": 22500 + }, + { + "epoch": 3.3407407407407406, + "grad_norm": 4.073155879974365, + "learning_rate": 1.3601688401215274e-06, + "loss": 1.6966, + "step": 22550 + }, + { + "epoch": 3.348148148148148, + "grad_norm": 4.451920509338379, + "learning_rate": 1.3304744355870914e-06, + "loss": 1.7378, + "step": 22600 + }, + { + "epoch": 3.3555555555555556, + "grad_norm": 4.887360572814941, + "learning_rate": 1.301084639586424e-06, + "loss": 1.7222, + "step": 22650 + }, + { + "epoch": 3.362962962962963, + "grad_norm": 4.85434627532959, + "learning_rate": 1.2720004847461165e-06, + "loss": 1.6973, + "step": 22700 + }, + { + "epoch": 3.3703703703703702, + "grad_norm": 5.109053134918213, + "learning_rate": 1.2432229929538952e-06, + "loss": 1.7477, + "step": 22750 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 4.921713352203369, + "learning_rate": 1.214753175322716e-06, + "loss": 1.6625, + "step": 22800 + }, + { + "epoch": 3.3851851851851853, + "grad_norm": 4.879732608795166, + "learning_rate": 1.1865920321552238e-06, + "loss": 1.6934, + "step": 22850 + }, + { + "epoch": 3.3925925925925924, + "grad_norm": 4.966546058654785, + "learning_rate": 1.158740552908627e-06, + "loss": 1.7571, + "step": 22900 + }, + { + "epoch": 3.4, + "grad_norm": 4.508844375610352, + "learning_rate": 1.1311997161599186e-06, + "loss": 1.7194, + "step": 22950 + }, + { + "epoch": 3.4074074074074074, + "grad_norm": 6.013401508331299, + "learning_rate": 1.1039704895714941e-06, + "loss": 1.7142, + "step": 23000 + }, + { + "epoch": 3.414814814814815, + "grad_norm": 5.535034656524658, + "learning_rate": 1.0770538298571598e-06, + "loss": 1.707, + "step": 23050 + }, + { + "epoch": 3.422222222222222, + "grad_norm": 3.6205241680145264, + "learning_rate": 1.0504506827485139e-06, + "loss": 1.7567, + "step": 23100 + }, + { + "epoch": 3.4296296296296296, + "grad_norm": 3.9489850997924805, + "learning_rate": 1.024161982961711e-06, + "loss": 1.7595, + "step": 23150 + }, + { + "epoch": 3.437037037037037, + "grad_norm": 5.946234703063965, + "learning_rate": 9.981886541646325e-07, + "loss": 1.7318, + "step": 23200 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 4.73652458190918, + "learning_rate": 9.725316089444291e-07, + "loss": 1.7689, + "step": 23250 + }, + { + "epoch": 3.4518518518518517, + "grad_norm": 4.716205596923828, + "learning_rate": 9.471917487754456e-07, + "loss": 1.71, + "step": 23300 + }, + { + "epoch": 3.4592592592592593, + "grad_norm": 5.690927028656006, + "learning_rate": 9.221699639875637e-07, + "loss": 1.7837, + "step": 23350 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 6.249578952789307, + "learning_rate": 8.974671337349128e-07, + "loss": 1.6703, + "step": 23400 + }, + { + "epoch": 3.474074074074074, + "grad_norm": 5.07996940612793, + "learning_rate": 8.730841259649725e-07, + "loss": 1.7051, + "step": 23450 + }, + { + "epoch": 3.4814814814814814, + "grad_norm": 5.213183403015137, + "learning_rate": 8.49021797388091e-07, + "loss": 1.7092, + "step": 23500 + }, + { + "epoch": 3.488888888888889, + "grad_norm": 5.148874759674072, + "learning_rate": 8.252809934473771e-07, + "loss": 1.715, + "step": 23550 + }, + { + "epoch": 3.4962962962962965, + "grad_norm": 4.876300811767578, + "learning_rate": 8.018625482889897e-07, + "loss": 1.6792, + "step": 23600 + }, + { + "epoch": 3.5037037037037035, + "grad_norm": 5.332327842712402, + "learning_rate": 7.787672847328387e-07, + "loss": 1.77, + "step": 23650 + }, + { + "epoch": 3.511111111111111, + "grad_norm": 4.203309535980225, + "learning_rate": 7.559960142436751e-07, + "loss": 1.7447, + "step": 23700 + }, + { + "epoch": 3.5185185185185186, + "grad_norm": 4.4003987312316895, + "learning_rate": 7.335495369025669e-07, + "loss": 1.6881, + "step": 23750 + }, + { + "epoch": 3.525925925925926, + "grad_norm": 4.845513343811035, + "learning_rate": 7.11428641378804e-07, + "loss": 1.7904, + "step": 23800 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 4.780256748199463, + "learning_rate": 6.896341049021804e-07, + "loss": 1.6839, + "step": 23850 + }, + { + "epoch": 3.5407407407407407, + "grad_norm": 5.716015815734863, + "learning_rate": 6.6816669323568e-07, + "loss": 1.727, + "step": 23900 + }, + { + "epoch": 3.5481481481481483, + "grad_norm": 4.932426929473877, + "learning_rate": 6.470271606485834e-07, + "loss": 1.7465, + "step": 23950 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 4.499339580535889, + "learning_rate": 6.262162498899593e-07, + "loss": 1.7221, + "step": 24000 + }, + { + "epoch": 3.562962962962963, + "grad_norm": 4.518272399902344, + "learning_rate": 6.057346921625628e-07, + "loss": 1.7454, + "step": 24050 + }, + { + "epoch": 3.5703703703703704, + "grad_norm": 4.595861434936523, + "learning_rate": 5.855832070971557e-07, + "loss": 1.7209, + "step": 24100 + }, + { + "epoch": 3.5777777777777775, + "grad_norm": 4.064270973205566, + "learning_rate": 5.657625027272162e-07, + "loss": 1.793, + "step": 24150 + }, + { + "epoch": 3.585185185185185, + "grad_norm": 5.750711441040039, + "learning_rate": 5.462732754640554e-07, + "loss": 1.7546, + "step": 24200 + }, + { + "epoch": 3.5925925925925926, + "grad_norm": 4.3929901123046875, + "learning_rate": 5.271162100723592e-07, + "loss": 1.716, + "step": 24250 + }, + { + "epoch": 3.6, + "grad_norm": 5.952783584594727, + "learning_rate": 5.08291979646125e-07, + "loss": 1.7265, + "step": 24300 + }, + { + "epoch": 3.6074074074074076, + "grad_norm": 4.167656898498535, + "learning_rate": 4.898012455850065e-07, + "loss": 1.6597, + "step": 24350 + }, + { + "epoch": 3.6148148148148147, + "grad_norm": 4.184138774871826, + "learning_rate": 4.7164465757108424e-07, + "loss": 1.8007, + "step": 24400 + }, + { + "epoch": 3.6222222222222222, + "grad_norm": 4.484316349029541, + "learning_rate": 4.5382285354602983e-07, + "loss": 1.764, + "step": 24450 + }, + { + "epoch": 3.6296296296296298, + "grad_norm": 4.9524126052856445, + "learning_rate": 4.363364596887021e-07, + "loss": 1.7239, + "step": 24500 + }, + { + "epoch": 3.637037037037037, + "grad_norm": 4.901634693145752, + "learning_rate": 4.191860903931344e-07, + "loss": 1.7435, + "step": 24550 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 4.495138645172119, + "learning_rate": 4.0237234824695327e-07, + "loss": 1.6682, + "step": 24600 + }, + { + "epoch": 3.651851851851852, + "grad_norm": 4.296281337738037, + "learning_rate": 3.858958240102084e-07, + "loss": 1.749, + "step": 24650 + }, + { + "epoch": 3.659259259259259, + "grad_norm": 5.341070652008057, + "learning_rate": 3.69757096594614e-07, + "loss": 1.7463, + "step": 24700 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 4.133769989013672, + "learning_rate": 3.5395673304320253e-07, + "loss": 1.6866, + "step": 24750 + }, + { + "epoch": 3.674074074074074, + "grad_norm": 6.342161178588867, + "learning_rate": 3.384952885104109e-07, + "loss": 1.7442, + "step": 24800 + }, + { + "epoch": 3.6814814814814816, + "grad_norm": 5.4333720207214355, + "learning_rate": 3.233733062425715e-07, + "loss": 1.7159, + "step": 24850 + }, + { + "epoch": 3.688888888888889, + "grad_norm": 4.437062740325928, + "learning_rate": 3.0859131755881956e-07, + "loss": 1.7579, + "step": 24900 + }, + { + "epoch": 3.696296296296296, + "grad_norm": 5.132778167724609, + "learning_rate": 2.9414984183243177e-07, + "loss": 1.6867, + "step": 24950 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 4.612050533294678, + "learning_rate": 2.800493864725784e-07, + "loss": 1.667, + "step": 25000 + }, + { + "epoch": 3.7111111111111112, + "grad_norm": 4.683866500854492, + "learning_rate": 2.662904469064842e-07, + "loss": 1.7585, + "step": 25050 + }, + { + "epoch": 3.7185185185185183, + "grad_norm": 5.033325672149658, + "learning_rate": 2.52873506562038e-07, + "loss": 1.7299, + "step": 25100 + }, + { + "epoch": 3.725925925925926, + "grad_norm": 5.117885589599609, + "learning_rate": 2.39799036850793e-07, + "loss": 1.7486, + "step": 25150 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 3.9167544841766357, + "learning_rate": 2.2706749715141085e-07, + "loss": 1.6908, + "step": 25200 + }, + { + "epoch": 3.7407407407407405, + "grad_norm": 5.12332010269165, + "learning_rate": 2.1467933479351942e-07, + "loss": 1.6907, + "step": 25250 + }, + { + "epoch": 3.748148148148148, + "grad_norm": 5.0879130363464355, + "learning_rate": 2.0263498504199397e-07, + "loss": 1.7745, + "step": 25300 + }, + { + "epoch": 3.7555555555555555, + "grad_norm": 4.076460838317871, + "learning_rate": 1.909348710816672e-07, + "loss": 1.7463, + "step": 25350 + }, + { + "epoch": 3.762962962962963, + "grad_norm": 4.88745641708374, + "learning_rate": 1.7957940400245677e-07, + "loss": 1.7632, + "step": 25400 + }, + { + "epoch": 3.7703703703703706, + "grad_norm": 5.026834964752197, + "learning_rate": 1.6856898278492573e-07, + "loss": 1.825, + "step": 25450 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 5.161101341247559, + "learning_rate": 1.5790399428625925e-07, + "loss": 1.7309, + "step": 25500 + }, + { + "epoch": 3.785185185185185, + "grad_norm": 4.700355529785156, + "learning_rate": 1.475848132266733e-07, + "loss": 1.716, + "step": 25550 + }, + { + "epoch": 3.7925925925925927, + "grad_norm": 4.75068998336792, + "learning_rate": 1.3761180217625514e-07, + "loss": 1.7438, + "step": 25600 + }, + { + "epoch": 3.8, + "grad_norm": 4.847548484802246, + "learning_rate": 1.2798531154221362e-07, + "loss": 1.6976, + "step": 25650 + }, + { + "epoch": 3.8074074074074074, + "grad_norm": 4.9904866218566895, + "learning_rate": 1.1870567955657552e-07, + "loss": 1.733, + "step": 25700 + }, + { + "epoch": 3.814814814814815, + "grad_norm": 5.078738689422607, + "learning_rate": 1.097732322642997e-07, + "loss": 1.7232, + "step": 25750 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 5.824419975280762, + "learning_rate": 1.0118828351181609e-07, + "loss": 1.7559, + "step": 25800 + }, + { + "epoch": 3.8296296296296295, + "grad_norm": 3.6938419342041016, + "learning_rate": 9.295113493600683e-08, + "loss": 1.6989, + "step": 25850 + }, + { + "epoch": 3.837037037037037, + "grad_norm": 6.135478973388672, + "learning_rate": 8.506207595360361e-08, + "loss": 1.6932, + "step": 25900 + }, + { + "epoch": 3.8444444444444446, + "grad_norm": 5.020415782928467, + "learning_rate": 7.752138375101914e-08, + "loss": 1.7119, + "step": 25950 + }, + { + "epoch": 3.851851851851852, + "grad_norm": 5.36544942855835, + "learning_rate": 7.032932327460828e-08, + "loss": 1.7197, + "step": 26000 + }, + { + "epoch": 3.859259259259259, + "grad_norm": 4.222412586212158, + "learning_rate": 6.348614722135771e-08, + "loss": 1.7421, + "step": 26050 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 4.589230537414551, + "learning_rate": 5.699209603001077e-08, + "loss": 1.7871, + "step": 26100 + }, + { + "epoch": 3.8740740740740742, + "grad_norm": 4.290050506591797, + "learning_rate": 5.0847397872617607e-08, + "loss": 1.7179, + "step": 26150 + }, + { + "epoch": 3.8814814814814813, + "grad_norm": 5.372164726257324, + "learning_rate": 4.50522686465138e-08, + "loss": 1.7356, + "step": 26200 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 4.672863006591797, + "learning_rate": 3.960691196674304e-08, + "loss": 1.7587, + "step": 26250 + }, + { + "epoch": 3.8962962962962964, + "grad_norm": 6.645358562469482, + "learning_rate": 3.451151915889961e-08, + "loss": 1.6881, + "step": 26300 + }, + { + "epoch": 3.9037037037037035, + "grad_norm": 5.1877336502075195, + "learning_rate": 2.9766269252401448e-08, + "loss": 1.6786, + "step": 26350 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 5.258729934692383, + "learning_rate": 2.5371328974206356e-08, + "loss": 1.7356, + "step": 26400 + }, + { + "epoch": 3.9185185185185185, + "grad_norm": 4.210945129394531, + "learning_rate": 2.1326852742949987e-08, + "loss": 1.7084, + "step": 26450 + }, + { + "epoch": 3.925925925925926, + "grad_norm": 5.776233196258545, + "learning_rate": 1.7632982663521314e-08, + "loss": 1.741, + "step": 26500 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 4.814140796661377, + "learning_rate": 1.4289848522073269e-08, + "loss": 1.7106, + "step": 26550 + }, + { + "epoch": 3.9407407407407407, + "grad_norm": 5.385549068450928, + "learning_rate": 1.1297567781454188e-08, + "loss": 1.7509, + "step": 26600 + }, + { + "epoch": 3.948148148148148, + "grad_norm": 5.47978401184082, + "learning_rate": 8.656245577089994e-09, + "loss": 1.6962, + "step": 26650 + }, + { + "epoch": 3.9555555555555557, + "grad_norm": 5.5808281898498535, + "learning_rate": 6.365974713283818e-09, + "loss": 1.6714, + "step": 26700 + }, + { + "epoch": 3.962962962962963, + "grad_norm": 5.073295593261719, + "learning_rate": 4.426835659958606e-09, + "loss": 1.7363, + "step": 26750 + }, + { + "epoch": 3.9703703703703703, + "grad_norm": 5.013734817504883, + "learning_rate": 2.838896549828274e-09, + "loss": 1.7764, + "step": 26800 + }, + { + "epoch": 3.977777777777778, + "grad_norm": 5.511692047119141, + "learning_rate": 1.6022131760018433e-09, + "loss": 1.6929, + "step": 26850 + }, + { + "epoch": 3.985185185185185, + "grad_norm": 4.73627233505249, + "learning_rate": 7.168289900305602e-10, + "loss": 1.6978, + "step": 26900 + }, + { + "epoch": 3.9925925925925925, + "grad_norm": 4.294295310974121, + "learning_rate": 1.827751003724565e-10, + "loss": 1.7181, + "step": 26950 + }, + { + "epoch": 4.0, + "grad_norm": 4.967982292175293, + "learning_rate": 7.027130211056943e-14, + "loss": 1.7233, + "step": 27000 + }, + { + "epoch": 4.007407407407407, + "grad_norm": 4.82836389541626, + "learning_rate": 1.9382473829432267e-06, + "loss": 1.7041, + "step": 27050 + }, + { + "epoch": 4.014814814814815, + "grad_norm": 5.167964935302734, + "learning_rate": 1.9103854553176194e-06, + "loss": 1.8133, + "step": 27100 + }, + { + "epoch": 4.022222222222222, + "grad_norm": 4.666854381561279, + "learning_rate": 1.8827040718648226e-06, + "loss": 1.6764, + "step": 27150 + }, + { + "epoch": 4.029629629629629, + "grad_norm": 5.357041835784912, + "learning_rate": 1.8552038503784874e-06, + "loss": 1.749, + "step": 27200 + }, + { + "epoch": 4.037037037037037, + "grad_norm": 6.967157363891602, + "learning_rate": 1.82788540460908e-06, + "loss": 1.7238, + "step": 27250 + }, + { + "epoch": 4.044444444444444, + "grad_norm": 5.682682037353516, + "learning_rate": 1.8007493442502034e-06, + "loss": 1.7133, + "step": 27300 + }, + { + "epoch": 4.051851851851852, + "grad_norm": 5.161554336547852, + "learning_rate": 1.7737962749249681e-06, + "loss": 1.6942, + "step": 27350 + }, + { + "epoch": 4.059259259259259, + "grad_norm": 4.5494232177734375, + "learning_rate": 1.7470267981724963e-06, + "loss": 1.7458, + "step": 27400 + }, + { + "epoch": 4.066666666666666, + "grad_norm": 4.907562732696533, + "learning_rate": 1.7204415114344875e-06, + "loss": 1.7069, + "step": 27450 + }, + { + "epoch": 4.074074074074074, + "grad_norm": 5.7244648933410645, + "learning_rate": 1.6940410080418723e-06, + "loss": 1.7731, + "step": 27500 + }, + { + "epoch": 4.0814814814814815, + "grad_norm": 5.122848987579346, + "learning_rate": 1.6678258772016043e-06, + "loss": 1.6889, + "step": 27550 + }, + { + "epoch": 4.088888888888889, + "grad_norm": 5.414388656616211, + "learning_rate": 1.6417967039834693e-06, + "loss": 1.7262, + "step": 27600 + }, + { + "epoch": 4.0962962962962965, + "grad_norm": 4.686498641967773, + "learning_rate": 1.6159540693070609e-06, + "loss": 1.7175, + "step": 27650 + }, + { + "epoch": 4.103703703703704, + "grad_norm": 3.999427556991577, + "learning_rate": 1.5902985499287894e-06, + "loss": 1.7779, + "step": 27700 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 5.516722202301025, + "learning_rate": 1.5648307184290335e-06, + "loss": 1.6821, + "step": 27750 + }, + { + "epoch": 4.118518518518519, + "grad_norm": 4.3700761795043945, + "learning_rate": 1.539551143199346e-06, + "loss": 1.774, + "step": 27800 + }, + { + "epoch": 4.125925925925926, + "grad_norm": 5.696080207824707, + "learning_rate": 1.5144603884297705e-06, + "loss": 1.7454, + "step": 27850 + }, + { + "epoch": 4.133333333333334, + "grad_norm": 4.253551006317139, + "learning_rate": 1.4895590140962546e-06, + "loss": 1.76, + "step": 27900 + }, + { + "epoch": 4.140740740740741, + "grad_norm": 4.988155364990234, + "learning_rate": 1.4648475759481518e-06, + "loss": 1.7696, + "step": 27950 + }, + { + "epoch": 4.148148148148148, + "grad_norm": 4.6464972496032715, + "learning_rate": 1.440326625495807e-06, + "loss": 1.7652, + "step": 28000 + }, + { + "epoch": 4.155555555555556, + "grad_norm": 5.924875736236572, + "learning_rate": 1.4159967099982708e-06, + "loss": 1.6888, + "step": 28050 + }, + { + "epoch": 4.162962962962963, + "grad_norm": 4.925395488739014, + "learning_rate": 1.3918583724510604e-06, + "loss": 1.7267, + "step": 28100 + }, + { + "epoch": 4.17037037037037, + "grad_norm": 4.7780046463012695, + "learning_rate": 1.367912151574059e-06, + "loss": 1.6646, + "step": 28150 + }, + { + "epoch": 4.177777777777778, + "grad_norm": 3.9888927936553955, + "learning_rate": 1.3441585817994818e-06, + "loss": 1.6915, + "step": 28200 + }, + { + "epoch": 4.185185185185185, + "grad_norm": 4.683606147766113, + "learning_rate": 1.3205981932599555e-06, + "loss": 1.7211, + "step": 28250 + }, + { + "epoch": 4.192592592592592, + "grad_norm": 5.409604072570801, + "learning_rate": 1.2972315117766876e-06, + "loss": 1.7155, + "step": 28300 + }, + { + "epoch": 4.2, + "grad_norm": 4.003017425537109, + "learning_rate": 1.2740590588477198e-06, + "loss": 1.7622, + "step": 28350 + }, + { + "epoch": 4.207407407407407, + "grad_norm": 5.602859973907471, + "learning_rate": 1.2510813516363064e-06, + "loss": 1.7029, + "step": 28400 + }, + { + "epoch": 4.214814814814815, + "grad_norm": 5.32771110534668, + "learning_rate": 1.228298902959353e-06, + "loss": 1.7684, + "step": 28450 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 5.030776500701904, + "learning_rate": 1.2057122212759887e-06, + "loss": 1.7379, + "step": 28500 + }, + { + "epoch": 4.229629629629629, + "grad_norm": 5.6066365242004395, + "learning_rate": 1.1833218106762113e-06, + "loss": 1.7228, + "step": 28550 + }, + { + "epoch": 4.237037037037037, + "grad_norm": 4.757691860198975, + "learning_rate": 1.1611281708696332e-06, + "loss": 1.6407, + "step": 28600 + }, + { + "epoch": 4.2444444444444445, + "grad_norm": 4.368016242980957, + "learning_rate": 1.1391317971743366e-06, + "loss": 1.7502, + "step": 28650 + }, + { + "epoch": 4.2518518518518515, + "grad_norm": 4.5439581871032715, + "learning_rate": 1.1173331805058074e-06, + "loss": 1.732, + "step": 28700 + }, + { + "epoch": 4.2592592592592595, + "grad_norm": 4.928348064422607, + "learning_rate": 1.0957328073659945e-06, + "loss": 1.7228, + "step": 28750 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 5.2048139572143555, + "learning_rate": 1.074331159832439e-06, + "loss": 1.7416, + "step": 28800 + }, + { + "epoch": 4.274074074074074, + "grad_norm": 4.4075446128845215, + "learning_rate": 1.0531287155475223e-06, + "loss": 1.7174, + "step": 28850 + }, + { + "epoch": 4.281481481481482, + "grad_norm": 4.489355564117432, + "learning_rate": 1.032125947707805e-06, + "loss": 1.6647, + "step": 28900 + }, + { + "epoch": 4.288888888888889, + "grad_norm": 4.746804237365723, + "learning_rate": 1.0113233250534594e-06, + "loss": 1.743, + "step": 28950 + }, + { + "epoch": 4.296296296296296, + "grad_norm": 6.516097068786621, + "learning_rate": 9.907213118578184e-07, + "loss": 1.7985, + "step": 29000 + }, + { + "epoch": 4.303703703703704, + "grad_norm": 4.8853678703308105, + "learning_rate": 9.703203679170116e-07, + "loss": 1.7554, + "step": 29050 + }, + { + "epoch": 4.311111111111111, + "grad_norm": 4.837742805480957, + "learning_rate": 9.501209485396968e-07, + "loss": 1.7171, + "step": 29100 + }, + { + "epoch": 4.318518518518519, + "grad_norm": 4.9490556716918945, + "learning_rate": 9.301235045368995e-07, + "loss": 1.6853, + "step": 29150 + }, + { + "epoch": 4.325925925925926, + "grad_norm": 4.251191139221191, + "learning_rate": 9.103284822119629e-07, + "loss": 1.6628, + "step": 29200 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 4.1913533210754395, + "learning_rate": 8.907363233505772e-07, + "loss": 1.638, + "step": 29250 + }, + { + "epoch": 4.340740740740741, + "grad_norm": 5.5985517501831055, + "learning_rate": 8.713474652109155e-07, + "loss": 1.6611, + "step": 29300 + }, + { + "epoch": 4.348148148148148, + "grad_norm": 5.221460819244385, + "learning_rate": 8.521623405138902e-07, + "loss": 1.7188, + "step": 29350 + }, + { + "epoch": 4.355555555555555, + "grad_norm": 5.904771327972412, + "learning_rate": 8.331813774334796e-07, + "loss": 1.7322, + "step": 29400 + }, + { + "epoch": 4.362962962962963, + "grad_norm": 7.180872917175293, + "learning_rate": 8.144049995871839e-07, + "loss": 1.726, + "step": 29450 + }, + { + "epoch": 4.37037037037037, + "grad_norm": 4.465069770812988, + "learning_rate": 7.958336260265654e-07, + "loss": 1.7343, + "step": 29500 + }, + { + "epoch": 4.377777777777778, + "grad_norm": 4.990593910217285, + "learning_rate": 7.774676712278939e-07, + "loss": 1.7124, + "step": 29550 + }, + { + "epoch": 4.385185185185185, + "grad_norm": 4.23641300201416, + "learning_rate": 7.593075450829046e-07, + "loss": 1.7329, + "step": 29600 + }, + { + "epoch": 4.392592592592592, + "grad_norm": 4.954015254974365, + "learning_rate": 7.413536528896381e-07, + "loss": 1.6599, + "step": 29650 + }, + { + "epoch": 4.4, + "grad_norm": 5.484986782073975, + "learning_rate": 7.236063953434091e-07, + "loss": 1.7504, + "step": 29700 + }, + { + "epoch": 4.407407407407407, + "grad_norm": 4.7733964920043945, + "learning_rate": 7.060661685278481e-07, + "loss": 1.7344, + "step": 29750 + }, + { + "epoch": 4.4148148148148145, + "grad_norm": 3.8824641704559326, + "learning_rate": 6.887333639060767e-07, + "loss": 1.6395, + "step": 29800 + }, + { + "epoch": 4.4222222222222225, + "grad_norm": 4.051513671875, + "learning_rate": 6.716083683119623e-07, + "loss": 1.6314, + "step": 29850 + }, + { + "epoch": 4.42962962962963, + "grad_norm": 5.516358852386475, + "learning_rate": 6.54691563941483e-07, + "loss": 1.7142, + "step": 29900 + }, + { + "epoch": 4.437037037037037, + "grad_norm": 5.0649027824401855, + "learning_rate": 6.379833283442061e-07, + "loss": 1.7769, + "step": 29950 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 4.46967077255249, + "learning_rate": 6.214840344148509e-07, + "loss": 1.6883, + "step": 30000 + }, + { + "epoch": 4.451851851851852, + "grad_norm": 5.248079299926758, + "learning_rate": 6.051940503849818e-07, + "loss": 1.6829, + "step": 30050 + }, + { + "epoch": 4.459259259259259, + "grad_norm": 4.6656293869018555, + "learning_rate": 5.891137398147706e-07, + "loss": 1.6501, + "step": 30100 + }, + { + "epoch": 4.466666666666667, + "grad_norm": 4.85382080078125, + "learning_rate": 5.732434615848992e-07, + "loss": 1.6991, + "step": 30150 + }, + { + "epoch": 4.474074074074074, + "grad_norm": 5.441938877105713, + "learning_rate": 5.575835698885445e-07, + "loss": 1.6589, + "step": 30200 + }, + { + "epoch": 4.481481481481482, + "grad_norm": 4.716953754425049, + "learning_rate": 5.421344142234653e-07, + "loss": 1.7254, + "step": 30250 + }, + { + "epoch": 4.488888888888889, + "grad_norm": 4.488583564758301, + "learning_rate": 5.268963393842185e-07, + "loss": 1.7533, + "step": 30300 + }, + { + "epoch": 4.496296296296296, + "grad_norm": 6.067137718200684, + "learning_rate": 5.118696854544491e-07, + "loss": 1.7206, + "step": 30350 + }, + { + "epoch": 4.503703703703704, + "grad_norm": 4.0403947830200195, + "learning_rate": 4.970547877993081e-07, + "loss": 1.7447, + "step": 30400 + }, + { + "epoch": 4.511111111111111, + "grad_norm": 5.8618998527526855, + "learning_rate": 4.824519770579672e-07, + "loss": 1.7343, + "step": 30450 + }, + { + "epoch": 4.518518518518518, + "grad_norm": 6.678895950317383, + "learning_rate": 4.6806157913623417e-07, + "loss": 1.6971, + "step": 30500 + }, + { + "epoch": 4.525925925925926, + "grad_norm": 4.533618450164795, + "learning_rate": 4.5388391519929134e-07, + "loss": 1.6467, + "step": 30550 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 4.837048053741455, + "learning_rate": 4.399193016645109e-07, + "loss": 1.7892, + "step": 30600 + }, + { + "epoch": 4.540740740740741, + "grad_norm": 4.566707611083984, + "learning_rate": 4.2616805019440653e-07, + "loss": 1.6981, + "step": 30650 + }, + { + "epoch": 4.548148148148148, + "grad_norm": 4.793781280517578, + "learning_rate": 4.126304676896753e-07, + "loss": 1.7256, + "step": 30700 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 5.338557243347168, + "learning_rate": 3.993068562823399e-07, + "loss": 1.7732, + "step": 30750 + }, + { + "epoch": 4.562962962962963, + "grad_norm": 5.471566200256348, + "learning_rate": 3.8619751332901744e-07, + "loss": 1.7244, + "step": 30800 + }, + { + "epoch": 4.57037037037037, + "grad_norm": 6.156429290771484, + "learning_rate": 3.7330273140427585e-07, + "loss": 1.7038, + "step": 30850 + }, + { + "epoch": 4.5777777777777775, + "grad_norm": 4.914401531219482, + "learning_rate": 3.606227982941046e-07, + "loss": 1.784, + "step": 30900 + }, + { + "epoch": 4.5851851851851855, + "grad_norm": 5.399247646331787, + "learning_rate": 3.481579969894977e-07, + "loss": 1.7026, + "step": 30950 + }, + { + "epoch": 4.592592592592593, + "grad_norm": 4.374510765075684, + "learning_rate": 3.359086056801253e-07, + "loss": 1.7585, + "step": 31000 + }, + { + "epoch": 4.6, + "grad_norm": 4.571732044219971, + "learning_rate": 3.238748977481421e-07, + "loss": 1.7257, + "step": 31050 + }, + { + "epoch": 4.607407407407408, + "grad_norm": 4.58890438079834, + "learning_rate": 3.1205714176207105e-07, + "loss": 1.7471, + "step": 31100 + }, + { + "epoch": 4.614814814814815, + "grad_norm": 4.49856424331665, + "learning_rate": 3.004556014708182e-07, + "loss": 1.6543, + "step": 31150 + }, + { + "epoch": 4.622222222222222, + "grad_norm": 4.499317169189453, + "learning_rate": 2.8907053579778075e-07, + "loss": 1.6906, + "step": 31200 + }, + { + "epoch": 4.62962962962963, + "grad_norm": 5.533344745635986, + "learning_rate": 2.7790219883507385e-07, + "loss": 1.6691, + "step": 31250 + }, + { + "epoch": 4.637037037037037, + "grad_norm": 5.220572471618652, + "learning_rate": 2.6695083983785843e-07, + "loss": 1.7941, + "step": 31300 + }, + { + "epoch": 4.644444444444445, + "grad_norm": 4.013186454772949, + "learning_rate": 2.5621670321877236e-07, + "loss": 1.672, + "step": 31350 + }, + { + "epoch": 4.651851851851852, + "grad_norm": 5.1433186531066895, + "learning_rate": 2.45700028542486e-07, + "loss": 1.712, + "step": 31400 + }, + { + "epoch": 4.659259259259259, + "grad_norm": 5.291711330413818, + "learning_rate": 2.354010505203419e-07, + "loss": 1.7306, + "step": 31450 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 4.273839473724365, + "learning_rate": 2.2531999900513358e-07, + "loss": 1.6904, + "step": 31500 + }, + { + "epoch": 4.674074074074074, + "grad_norm": 4.840414047241211, + "learning_rate": 2.1545709898596057e-07, + "loss": 1.7894, + "step": 31550 + }, + { + "epoch": 4.681481481481481, + "grad_norm": 4.572354316711426, + "learning_rate": 2.058125705832159e-07, + "loss": 1.7386, + "step": 31600 + }, + { + "epoch": 4.688888888888889, + "grad_norm": 4.349206924438477, + "learning_rate": 1.9638662904367e-07, + "loss": 1.7278, + "step": 31650 + }, + { + "epoch": 4.696296296296296, + "grad_norm": 4.914106369018555, + "learning_rate": 1.871794847356656e-07, + "loss": 1.7336, + "step": 31700 + }, + { + "epoch": 4.703703703703704, + "grad_norm": 5.19305419921875, + "learning_rate": 1.781913431444282e-07, + "loss": 1.8137, + "step": 31750 + }, + { + "epoch": 4.711111111111111, + "grad_norm": 4.804193019866943, + "learning_rate": 1.6942240486747196e-07, + "loss": 1.7152, + "step": 31800 + }, + { + "epoch": 4.718518518518518, + "grad_norm": 5.960775375366211, + "learning_rate": 1.6087286561013215e-07, + "loss": 1.7112, + "step": 31850 + }, + { + "epoch": 4.725925925925926, + "grad_norm": 4.539139747619629, + "learning_rate": 1.5254291618118978e-07, + "loss": 1.6712, + "step": 31900 + }, + { + "epoch": 4.733333333333333, + "grad_norm": 5.396848678588867, + "learning_rate": 1.4443274248861495e-07, + "loss": 1.7833, + "step": 31950 + }, + { + "epoch": 4.7407407407407405, + "grad_norm": 3.9417476654052734, + "learning_rate": 1.3654252553542025e-07, + "loss": 1.7082, + "step": 32000 + }, + { + "epoch": 4.7481481481481485, + "grad_norm": 4.94804573059082, + "learning_rate": 1.2887244141562062e-07, + "loss": 1.6837, + "step": 32050 + }, + { + "epoch": 4.7555555555555555, + "grad_norm": 5.223379135131836, + "learning_rate": 1.214226613103009e-07, + "loss": 1.7133, + "step": 32100 + }, + { + "epoch": 4.762962962962963, + "grad_norm": 4.776657581329346, + "learning_rate": 1.141933514837934e-07, + "loss": 1.6771, + "step": 32150 + }, + { + "epoch": 4.770370370370371, + "grad_norm": 4.386743545532227, + "learning_rate": 1.071846732799775e-07, + "loss": 1.7138, + "step": 32200 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 4.870341777801514, + "learning_rate": 1.0039678311866585e-07, + "loss": 1.705, + "step": 32250 + }, + { + "epoch": 4.785185185185185, + "grad_norm": 4.143662452697754, + "learning_rate": 9.382983249212163e-08, + "loss": 1.6861, + "step": 32300 + }, + { + "epoch": 4.792592592592593, + "grad_norm": 4.790794849395752, + "learning_rate": 8.748396796167568e-08, + "loss": 1.7338, + "step": 32350 + }, + { + "epoch": 4.8, + "grad_norm": 4.551774501800537, + "learning_rate": 8.135933115445471e-08, + "loss": 1.6955, + "step": 32400 + }, + { + "epoch": 4.807407407407408, + "grad_norm": 4.733094692230225, + "learning_rate": 7.545605876021933e-08, + "loss": 1.6594, + "step": 32450 + }, + { + "epoch": 4.814814814814815, + "grad_norm": 4.897435188293457, + "learning_rate": 6.977428252831764e-08, + "loss": 1.6837, + "step": 32500 + }, + { + "epoch": 4.822222222222222, + "grad_norm": 5.258769989013672, + "learning_rate": 6.431412926473978e-08, + "loss": 1.6683, + "step": 32550 + }, + { + "epoch": 4.82962962962963, + "grad_norm": 5.621219158172607, + "learning_rate": 5.907572082929247e-08, + "loss": 1.7212, + "step": 32600 + }, + { + "epoch": 4.837037037037037, + "grad_norm": 5.88584566116333, + "learning_rate": 5.4059174132873314e-08, + "loss": 1.7297, + "step": 32650 + }, + { + "epoch": 4.844444444444444, + "grad_norm": 4.229117393493652, + "learning_rate": 4.9264601134870747e-08, + "loss": 1.714, + "step": 32700 + }, + { + "epoch": 4.851851851851852, + "grad_norm": 4.282808303833008, + "learning_rate": 4.4692108840656e-08, + "loss": 1.7891, + "step": 32750 + }, + { + "epoch": 4.859259259259259, + "grad_norm": 4.613882541656494, + "learning_rate": 4.0341799299198345e-08, + "loss": 1.7311, + "step": 32800 + }, + { + "epoch": 4.866666666666667, + "grad_norm": 5.826322555541992, + "learning_rate": 3.621376960079248e-08, + "loss": 1.7449, + "step": 32850 + }, + { + "epoch": 4.874074074074074, + "grad_norm": 3.8570024967193604, + "learning_rate": 3.230811187488248e-08, + "loss": 1.7437, + "step": 32900 + }, + { + "epoch": 4.881481481481481, + "grad_norm": 5.855494022369385, + "learning_rate": 2.8624913288012314e-08, + "loss": 1.7327, + "step": 32950 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 4.846534252166748, + "learning_rate": 2.5164256041879663e-08, + "loss": 1.732, + "step": 33000 + }, + { + "epoch": 4.896296296296296, + "grad_norm": 5.538894176483154, + "learning_rate": 2.192621737150069e-08, + "loss": 1.7365, + "step": 33050 + }, + { + "epoch": 4.9037037037037035, + "grad_norm": 5.251986026763916, + "learning_rate": 1.8910869543482535e-08, + "loss": 1.6428, + "step": 33100 + }, + { + "epoch": 4.911111111111111, + "grad_norm": 6.140721797943115, + "learning_rate": 1.6118279854415718e-08, + "loss": 1.7507, + "step": 33150 + }, + { + "epoch": 4.9185185185185185, + "grad_norm": 5.360207557678223, + "learning_rate": 1.3548510629373125e-08, + "loss": 1.7524, + "step": 33200 + }, + { + "epoch": 4.925925925925926, + "grad_norm": 5.260952472686768, + "learning_rate": 1.1201619220511106e-08, + "loss": 1.7157, + "step": 33250 + }, + { + "epoch": 4.933333333333334, + "grad_norm": 4.921377658843994, + "learning_rate": 9.077658005799405e-09, + "loss": 1.7424, + "step": 33300 + }, + { + "epoch": 4.940740740740741, + "grad_norm": 5.121984004974365, + "learning_rate": 7.176674387842086e-09, + "loss": 1.6539, + "step": 33350 + }, + { + "epoch": 4.948148148148148, + "grad_norm": 4.907476425170898, + "learning_rate": 5.498710792831707e-09, + "loss": 1.7879, + "step": 33400 + }, + { + "epoch": 4.955555555555556, + "grad_norm": 5.7554030418396, + "learning_rate": 4.04380466958898e-09, + "loss": 1.7458, + "step": 33450 + }, + { + "epoch": 4.962962962962963, + "grad_norm": 4.102212905883789, + "learning_rate": 2.8119884887389814e-09, + "loss": 1.7358, + "step": 33500 + }, + { + "epoch": 4.97037037037037, + "grad_norm": 4.823195934295654, + "learning_rate": 1.8032897419772987e-09, + "loss": 1.6797, + "step": 33550 + }, + { + "epoch": 4.977777777777778, + "grad_norm": 5.787658214569092, + "learning_rate": 1.0177309414638458e-09, + "loss": 1.7365, + "step": 33600 + }, + { + "epoch": 4.985185185185185, + "grad_norm": 5.443394184112549, + "learning_rate": 4.5532961931216146e-10, + "loss": 1.6624, + "step": 33650 + }, + { + "epoch": 4.992592592592593, + "grad_norm": 5.065293312072754, + "learning_rate": 1.1609832720860248e-10, + "loss": 1.6947, + "step": 33700 + }, + { + "epoch": 5.0, + "grad_norm": 4.812777042388916, + "learning_rate": 4.4636121465657654e-14, + "loss": 1.6106, + "step": 33750 + }, + { + "epoch": 5.007407407407407, + "grad_norm": 4.977204322814941, + "learning_rate": 1.3532600998952505e-06, + "loss": 1.7371, + "step": 33800 + }, + { + "epoch": 5.014814814814815, + "grad_norm": 5.09653377532959, + "learning_rate": 1.3336002340133524e-06, + "loss": 1.7169, + "step": 33850 + }, + { + "epoch": 5.022222222222222, + "grad_norm": 4.8081560134887695, + "learning_rate": 1.3140740147366925e-06, + "loss": 1.7553, + "step": 33900 + }, + { + "epoch": 5.029629629629629, + "grad_norm": 3.804783582687378, + "learning_rate": 1.2946817431837289e-06, + "loss": 1.7236, + "step": 33950 + }, + { + "epoch": 5.037037037037037, + "grad_norm": 4.684085845947266, + "learning_rate": 1.2754237184072737e-06, + "loss": 1.6995, + "step": 34000 + }, + { + "epoch": 5.044444444444444, + "grad_norm": 5.6429877281188965, + "learning_rate": 1.2563002373898936e-06, + "loss": 1.7936, + "step": 34050 + }, + { + "epoch": 5.051851851851852, + "grad_norm": 5.1504807472229, + "learning_rate": 1.2373115950393254e-06, + "loss": 1.7631, + "step": 34100 + }, + { + "epoch": 5.059259259259259, + "grad_norm": 5.963595867156982, + "learning_rate": 1.2184580841839233e-06, + "loss": 1.682, + "step": 34150 + }, + { + "epoch": 5.066666666666666, + "grad_norm": 4.3478827476501465, + "learning_rate": 1.1997399955681444e-06, + "loss": 1.7337, + "step": 34200 + }, + { + "epoch": 5.074074074074074, + "grad_norm": 6.539457321166992, + "learning_rate": 1.1811576178480743e-06, + "loss": 1.7334, + "step": 34250 + }, + { + "epoch": 5.0814814814814815, + "grad_norm": 5.564241886138916, + "learning_rate": 1.162711237586961e-06, + "loss": 1.7236, + "step": 34300 + }, + { + "epoch": 5.088888888888889, + "grad_norm": 5.158255100250244, + "learning_rate": 1.1444011392508103e-06, + "loss": 1.7642, + "step": 34350 + }, + { + "epoch": 5.0962962962962965, + "grad_norm": 4.977708339691162, + "learning_rate": 1.1262276052039911e-06, + "loss": 1.7097, + "step": 34400 + }, + { + "epoch": 5.103703703703704, + "grad_norm": 6.154446125030518, + "learning_rate": 1.1081909157048775e-06, + "loss": 1.6729, + "step": 34450 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 5.063690662384033, + "learning_rate": 1.09029134890153e-06, + "loss": 1.7204, + "step": 34500 + }, + { + "epoch": 5.118518518518519, + "grad_norm": 4.185413360595703, + "learning_rate": 1.0725291808274196e-06, + "loss": 1.6558, + "step": 34550 + }, + { + "epoch": 5.125925925925926, + "grad_norm": 4.706033706665039, + "learning_rate": 1.054904685397148e-06, + "loss": 1.7625, + "step": 34600 + }, + { + "epoch": 5.133333333333334, + "grad_norm": 4.567237377166748, + "learning_rate": 1.0374181344022339e-06, + "loss": 1.7039, + "step": 34650 + }, + { + "epoch": 5.140740740740741, + "grad_norm": 4.0956807136535645, + "learning_rate": 1.0200697975069274e-06, + "loss": 1.6927, + "step": 34700 + }, + { + "epoch": 5.148148148148148, + "grad_norm": 4.75858211517334, + "learning_rate": 1.0028599422440466e-06, + "loss": 1.6918, + "step": 34750 + }, + { + "epoch": 5.155555555555556, + "grad_norm": 6.522222518920898, + "learning_rate": 9.857888340108478e-07, + "loss": 1.7537, + "step": 34800 + }, + { + "epoch": 5.162962962962963, + "grad_norm": 4.785046100616455, + "learning_rate": 9.68856736064936e-07, + "loss": 1.7179, + "step": 34850 + }, + { + "epoch": 5.17037037037037, + "grad_norm": 4.325683116912842, + "learning_rate": 9.5206390952021e-07, + "loss": 1.6709, + "step": 34900 + }, + { + "epoch": 5.177777777777778, + "grad_norm": 4.622459888458252, + "learning_rate": 9.354106133428287e-07, + "loss": 1.7491, + "step": 34950 + }, + { + "epoch": 5.185185185185185, + "grad_norm": 4.967048168182373, + "learning_rate": 9.188971043472172e-07, + "loss": 1.738, + "step": 35000 + }, + { + "epoch": 5.192592592592592, + "grad_norm": 4.760488986968994, + "learning_rate": 9.025236371921176e-07, + "loss": 1.6654, + "step": 35050 + }, + { + "epoch": 5.2, + "grad_norm": 5.821338653564453, + "learning_rate": 8.862904643766435e-07, + "loss": 1.6761, + "step": 35100 + }, + { + "epoch": 5.207407407407407, + "grad_norm": 5.513166427612305, + "learning_rate": 8.701978362364039e-07, + "loss": 1.7369, + "step": 35150 + }, + { + "epoch": 5.214814814814815, + "grad_norm": 4.389341354370117, + "learning_rate": 8.542460009396313e-07, + "loss": 1.7099, + "step": 35200 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 5.52158260345459, + "learning_rate": 8.38435204483361e-07, + "loss": 1.7589, + "step": 35250 + }, + { + "epoch": 5.229629629629629, + "grad_norm": 4.525510311126709, + "learning_rate": 8.227656906896297e-07, + "loss": 1.6873, + "step": 35300 + }, + { + "epoch": 5.237037037037037, + "grad_norm": 4.573180675506592, + "learning_rate": 8.072377012017262e-07, + "loss": 1.6743, + "step": 35350 + }, + { + "epoch": 5.2444444444444445, + "grad_norm": 4.818542957305908, + "learning_rate": 7.918514754804596e-07, + "loss": 1.6599, + "step": 35400 + }, + { + "epoch": 5.2518518518518515, + "grad_norm": 6.3008341789245605, + "learning_rate": 7.766072508004596e-07, + "loss": 1.7452, + "step": 35450 + }, + { + "epoch": 5.2592592592592595, + "grad_norm": 5.006597518920898, + "learning_rate": 7.615052622465336e-07, + "loss": 1.7196, + "step": 35500 + }, + { + "epoch": 5.266666666666667, + "grad_norm": 4.924106597900391, + "learning_rate": 7.465457427100231e-07, + "loss": 1.7712, + "step": 35550 + }, + { + "epoch": 5.274074074074074, + "grad_norm": 5.4804887771606445, + "learning_rate": 7.317289228852286e-07, + "loss": 1.715, + "step": 35600 + }, + { + "epoch": 5.281481481481482, + "grad_norm": 4.695647239685059, + "learning_rate": 7.170550312658375e-07, + "loss": 1.7124, + "step": 35650 + }, + { + "epoch": 5.288888888888889, + "grad_norm": 6.215033054351807, + "learning_rate": 7.025242941414146e-07, + "loss": 1.6316, + "step": 35700 + }, + { + "epoch": 5.296296296296296, + "grad_norm": 4.85810661315918, + "learning_rate": 6.881369355938971e-07, + "loss": 1.6962, + "step": 35750 + }, + { + "epoch": 5.303703703703704, + "grad_norm": 5.630098342895508, + "learning_rate": 6.73893177494156e-07, + "loss": 1.656, + "step": 35800 + }, + { + "epoch": 5.311111111111111, + "grad_norm": 5.91583776473999, + "learning_rate": 6.597932394985617e-07, + "loss": 1.756, + "step": 35850 + }, + { + "epoch": 5.318518518518519, + "grad_norm": 4.883135795593262, + "learning_rate": 6.458373390455996e-07, + "loss": 1.7179, + "step": 35900 + }, + { + "epoch": 5.325925925925926, + "grad_norm": 4.665558815002441, + "learning_rate": 6.320256913525146e-07, + "loss": 1.7424, + "step": 35950 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 5.224910259246826, + "learning_rate": 6.183585094120082e-07, + "loss": 1.7098, + "step": 36000 + }, + { + "epoch": 5.340740740740741, + "grad_norm": 4.836453437805176, + "learning_rate": 6.048360039889267e-07, + "loss": 1.7507, + "step": 36050 + }, + { + "epoch": 5.348148148148148, + "grad_norm": 4.658173084259033, + "learning_rate": 5.914583836170341e-07, + "loss": 1.7571, + "step": 36100 + }, + { + "epoch": 5.355555555555555, + "grad_norm": 6.138154029846191, + "learning_rate": 5.782258545957841e-07, + "loss": 1.7125, + "step": 36150 + }, + { + "epoch": 5.362962962962963, + "grad_norm": 5.144318580627441, + "learning_rate": 5.651386209871468e-07, + "loss": 1.759, + "step": 36200 + }, + { + "epoch": 5.37037037037037, + "grad_norm": 4.0739970207214355, + "learning_rate": 5.521968846124514e-07, + "loss": 1.7342, + "step": 36250 + }, + { + "epoch": 5.377777777777778, + "grad_norm": 4.604063987731934, + "learning_rate": 5.394008450492816e-07, + "loss": 1.7351, + "step": 36300 + }, + { + "epoch": 5.385185185185185, + "grad_norm": 4.261337757110596, + "learning_rate": 5.267506996283989e-07, + "loss": 1.7219, + "step": 36350 + }, + { + "epoch": 5.392592592592592, + "grad_norm": 4.81792688369751, + "learning_rate": 5.142466434306958e-07, + "loss": 1.6602, + "step": 36400 + }, + { + "epoch": 5.4, + "grad_norm": 5.473517894744873, + "learning_rate": 5.018888692841828e-07, + "loss": 1.7543, + "step": 36450 + }, + { + "epoch": 5.407407407407407, + "grad_norm": 5.177619457244873, + "learning_rate": 4.8967756776103e-07, + "loss": 1.7373, + "step": 36500 + }, + { + "epoch": 5.4148148148148145, + "grad_norm": 5.174300193786621, + "learning_rate": 4.776129271746078e-07, + "loss": 1.7004, + "step": 36550 + }, + { + "epoch": 5.4222222222222225, + "grad_norm": 4.85015344619751, + "learning_rate": 4.6569513357660245e-07, + "loss": 1.7679, + "step": 36600 + }, + { + "epoch": 5.42962962962963, + "grad_norm": 4.257620334625244, + "learning_rate": 4.5392437075413297e-07, + "loss": 1.7639, + "step": 36650 + }, + { + "epoch": 5.437037037037037, + "grad_norm": 5.19285249710083, + "learning_rate": 4.423008202269241e-07, + "loss": 1.6995, + "step": 36700 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 5.407149791717529, + "learning_rate": 4.3082466124450105e-07, + "loss": 1.6688, + "step": 36750 + }, + { + "epoch": 5.451851851851852, + "grad_norm": 7.0187225341796875, + "learning_rate": 4.194960707834339e-07, + "loss": 1.7256, + "step": 36800 + }, + { + "epoch": 5.459259259259259, + "grad_norm": 5.111575126647949, + "learning_rate": 4.083152235446031e-07, + "loss": 1.6883, + "step": 36850 + }, + { + "epoch": 5.466666666666667, + "grad_norm": 4.569883346557617, + "learning_rate": 3.972822919505026e-07, + "loss": 1.6723, + "step": 36900 + }, + { + "epoch": 5.474074074074074, + "grad_norm": 4.689021587371826, + "learning_rate": 3.863974461425868e-07, + "loss": 1.6803, + "step": 36950 + }, + { + "epoch": 5.481481481481482, + "grad_norm": 4.931379795074463, + "learning_rate": 3.7566085397864216e-07, + "loss": 1.7024, + "step": 37000 + }, + { + "epoch": 5.488888888888889, + "grad_norm": 4.577634811401367, + "learning_rate": 3.650726810302041e-07, + "loss": 1.7181, + "step": 37050 + }, + { + "epoch": 5.496296296296296, + "grad_norm": 5.35470724105835, + "learning_rate": 3.546330905799944e-07, + "loss": 1.778, + "step": 37100 + }, + { + "epoch": 5.503703703703704, + "grad_norm": 5.660891056060791, + "learning_rate": 3.443422436194155e-07, + "loss": 1.7464, + "step": 37150 + }, + { + "epoch": 5.511111111111111, + "grad_norm": 5.5450334548950195, + "learning_rate": 3.3420029884605466e-07, + "loss": 1.6753, + "step": 37200 + }, + { + "epoch": 5.518518518518518, + "grad_norm": 4.8132853507995605, + "learning_rate": 3.242074126612471e-07, + "loss": 1.7366, + "step": 37250 + }, + { + "epoch": 5.525925925925926, + "grad_norm": 4.943393230438232, + "learning_rate": 3.1436373916766236e-07, + "loss": 1.7129, + "step": 37300 + }, + { + "epoch": 5.533333333333333, + "grad_norm": 4.270570278167725, + "learning_rate": 3.0466943016692175e-07, + "loss": 1.7389, + "step": 37350 + }, + { + "epoch": 5.540740740740741, + "grad_norm": 6.0502095222473145, + "learning_rate": 2.9512463515725896e-07, + "loss": 1.6554, + "step": 37400 + }, + { + "epoch": 5.548148148148148, + "grad_norm": 5.404415607452393, + "learning_rate": 2.8572950133122556e-07, + "loss": 1.7489, + "step": 37450 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 5.563807010650635, + "learning_rate": 2.764841735734047e-07, + "loss": 1.7091, + "step": 37500 + }, + { + "epoch": 5.562962962962963, + "grad_norm": 6.912181377410889, + "learning_rate": 2.673887944581877e-07, + "loss": 1.7228, + "step": 37550 + }, + { + "epoch": 5.57037037037037, + "grad_norm": 5.763159275054932, + "learning_rate": 2.5844350424757194e-07, + "loss": 1.7131, + "step": 37600 + }, + { + "epoch": 5.5777777777777775, + "grad_norm": 5.707430362701416, + "learning_rate": 2.4964844088899985e-07, + "loss": 1.6694, + "step": 37650 + }, + { + "epoch": 5.5851851851851855, + "grad_norm": 5.327348232269287, + "learning_rate": 2.41003740013227e-07, + "loss": 1.7245, + "step": 37700 + }, + { + "epoch": 5.592592592592593, + "grad_norm": 4.956423759460449, + "learning_rate": 2.3250953493223484e-07, + "loss": 1.7274, + "step": 37750 + }, + { + "epoch": 5.6, + "grad_norm": 4.380049228668213, + "learning_rate": 2.2416595663717344e-07, + "loss": 1.7224, + "step": 37800 + }, + { + "epoch": 5.607407407407408, + "grad_norm": 5.581598281860352, + "learning_rate": 2.1597313379634332e-07, + "loss": 1.7526, + "step": 37850 + }, + { + "epoch": 5.614814814814815, + "grad_norm": 5.117220401763916, + "learning_rate": 2.079311927532046e-07, + "loss": 1.604, + "step": 37900 + }, + { + "epoch": 5.622222222222222, + "grad_norm": 4.668652057647705, + "learning_rate": 2.0004025752443978e-07, + "loss": 1.7551, + "step": 37950 + }, + { + "epoch": 5.62962962962963, + "grad_norm": 4.981814861297607, + "learning_rate": 1.9230044979803075e-07, + "loss": 1.7394, + "step": 38000 + }, + { + "epoch": 5.637037037037037, + "grad_norm": 5.624590873718262, + "learning_rate": 1.847118889313837e-07, + "loss": 1.7687, + "step": 38050 + }, + { + "epoch": 5.644444444444445, + "grad_norm": 4.681608200073242, + "learning_rate": 1.7727469194950053e-07, + "loss": 1.7569, + "step": 38100 + }, + { + "epoch": 5.651851851851852, + "grad_norm": 5.595433235168457, + "learning_rate": 1.6998897354315592e-07, + "loss": 1.7543, + "step": 38150 + }, + { + "epoch": 5.659259259259259, + "grad_norm": 5.625829219818115, + "learning_rate": 1.6285484606713976e-07, + "loss": 1.7192, + "step": 38200 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 3.95115327835083, + "learning_rate": 1.5587241953852416e-07, + "loss": 1.7095, + "step": 38250 + }, + { + "epoch": 5.674074074074074, + "grad_norm": 4.709456443786621, + "learning_rate": 1.4904180163496373e-07, + "loss": 1.7102, + "step": 38300 + }, + { + "epoch": 5.681481481481481, + "grad_norm": 5.535898208618164, + "learning_rate": 1.423630976930368e-07, + "loss": 1.7262, + "step": 38350 + }, + { + "epoch": 5.688888888888889, + "grad_norm": 5.115292549133301, + "learning_rate": 1.3583641070662011e-07, + "loss": 1.7252, + "step": 38400 + }, + { + "epoch": 5.696296296296296, + "grad_norm": 4.890398979187012, + "learning_rate": 1.2946184132530015e-07, + "loss": 1.7384, + "step": 38450 + }, + { + "epoch": 5.703703703703704, + "grad_norm": 4.627488136291504, + "learning_rate": 1.2323948785282313e-07, + "loss": 1.8393, + "step": 38500 + }, + { + "epoch": 5.711111111111111, + "grad_norm": 5.317155361175537, + "learning_rate": 1.1716944624557524e-07, + "loss": 1.7405, + "step": 38550 + }, + { + "epoch": 5.718518518518518, + "grad_norm": 4.53377628326416, + "learning_rate": 1.1125181011111042e-07, + "loss": 1.6679, + "step": 38600 + }, + { + "epoch": 5.725925925925926, + "grad_norm": 6.2958879470825195, + "learning_rate": 1.0548667070669594e-07, + "loss": 1.7211, + "step": 38650 + }, + { + "epoch": 5.733333333333333, + "grad_norm": 6.272239685058594, + "learning_rate": 9.98741169379125e-08, + "loss": 1.7378, + "step": 38700 + }, + { + "epoch": 5.7407407407407405, + "grad_norm": 4.6558732986450195, + "learning_rate": 9.441423535728523e-08, + "loss": 1.8154, + "step": 38750 + }, + { + "epoch": 5.7481481481481485, + "grad_norm": 6.190096378326416, + "learning_rate": 8.910711016294039e-08, + "loss": 1.7789, + "step": 38800 + }, + { + "epoch": 5.7555555555555555, + "grad_norm": 4.927070140838623, + "learning_rate": 8.395282319731302e-08, + "loss": 1.7477, + "step": 38850 + }, + { + "epoch": 5.762962962962963, + "grad_norm": 4.361870765686035, + "learning_rate": 7.895145394588577e-08, + "loss": 1.6863, + "step": 38900 + }, + { + "epoch": 5.770370370370371, + "grad_norm": 5.228439807891846, + "learning_rate": 7.410307953595874e-08, + "loss": 1.7682, + "step": 38950 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 4.051884174346924, + "learning_rate": 6.940777473546379e-08, + "loss": 1.6873, + "step": 39000 + }, + { + "epoch": 5.785185185185185, + "grad_norm": 4.61396598815918, + "learning_rate": 6.486561195180763e-08, + "loss": 1.6778, + "step": 39050 + }, + { + "epoch": 5.792592592592593, + "grad_norm": 4.127589225769043, + "learning_rate": 6.047666123076168e-08, + "loss": 1.6759, + "step": 39100 + }, + { + "epoch": 5.8, + "grad_norm": 5.580689907073975, + "learning_rate": 5.624099025537399e-08, + "loss": 1.6931, + "step": 39150 + }, + { + "epoch": 5.807407407407408, + "grad_norm": 4.461786270141602, + "learning_rate": 5.215866434493011e-08, + "loss": 1.7217, + "step": 39200 + }, + { + "epoch": 5.814814814814815, + "grad_norm": 5.180467128753662, + "learning_rate": 4.822974645394718e-08, + "loss": 1.7161, + "step": 39250 + }, + { + "epoch": 5.822222222222222, + "grad_norm": 5.350657939910889, + "learning_rate": 4.445429717119809e-08, + "loss": 1.761, + "step": 39300 + }, + { + "epoch": 5.82962962962963, + "grad_norm": 4.767494201660156, + "learning_rate": 4.083237471878221e-08, + "loss": 1.7417, + "step": 39350 + }, + { + "epoch": 5.837037037037037, + "grad_norm": 6.330611228942871, + "learning_rate": 3.736403495122498e-08, + "loss": 1.8127, + "step": 39400 + }, + { + "epoch": 5.844444444444444, + "grad_norm": 5.576878547668457, + "learning_rate": 3.404933135461419e-08, + "loss": 1.655, + "step": 39450 + }, + { + "epoch": 5.851851851851852, + "grad_norm": 4.449545383453369, + "learning_rate": 3.088831504577949e-08, + "loss": 1.6512, + "step": 39500 + }, + { + "epoch": 5.859259259259259, + "grad_norm": 5.951160430908203, + "learning_rate": 2.7881034771505277e-08, + "loss": 1.7151, + "step": 39550 + }, + { + "epoch": 5.866666666666667, + "grad_norm": 3.876145124435425, + "learning_rate": 2.5027536907772375e-08, + "loss": 1.7376, + "step": 39600 + }, + { + "epoch": 5.874074074074074, + "grad_norm": 4.853695392608643, + "learning_rate": 2.2327865459047527e-08, + "loss": 1.7232, + "step": 39650 + }, + { + "epoch": 5.881481481481481, + "grad_norm": 4.873416900634766, + "learning_rate": 1.9782062057603913e-08, + "loss": 1.7437, + "step": 39700 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 5.661427974700928, + "learning_rate": 1.7390165962879458e-08, + "loss": 1.6729, + "step": 39750 + }, + { + "epoch": 5.896296296296296, + "grad_norm": 5.479251861572266, + "learning_rate": 1.515221406087175e-08, + "loss": 1.758, + "step": 39800 + }, + { + "epoch": 5.9037037037037035, + "grad_norm": 4.6737823486328125, + "learning_rate": 1.3068240863566285e-08, + "loss": 1.6668, + "step": 39850 + }, + { + "epoch": 5.911111111111111, + "grad_norm": 6.463534355163574, + "learning_rate": 1.1138278508407985e-08, + "loss": 1.7779, + "step": 39900 + }, + { + "epoch": 5.9185185185185185, + "grad_norm": 5.896658897399902, + "learning_rate": 9.362356757804947e-09, + "loss": 1.703, + "step": 39950 + }, + { + "epoch": 5.925925925925926, + "grad_norm": 5.711425304412842, + "learning_rate": 7.740502998666577e-09, + "loss": 1.6643, + "step": 40000 + }, + { + "epoch": 5.933333333333334, + "grad_norm": 4.859104156494141, + "learning_rate": 6.272742241985042e-09, + "loss": 1.7116, + "step": 40050 + }, + { + "epoch": 5.940740740740741, + "grad_norm": 5.876493453979492, + "learning_rate": 4.9590971224444676e-09, + "loss": 1.6339, + "step": 40100 + }, + { + "epoch": 5.948148148148148, + "grad_norm": 4.462888717651367, + "learning_rate": 3.799587898080104e-09, + "loss": 1.6942, + "step": 40150 + }, + { + "epoch": 5.955555555555556, + "grad_norm": 5.941119194030762, + "learning_rate": 2.7942324499585782e-09, + "loss": 1.7655, + "step": 40200 + }, + { + "epoch": 5.962962962962963, + "grad_norm": 4.833111763000488, + "learning_rate": 1.943046281903671e-09, + "loss": 1.7224, + "step": 40250 + }, + { + "epoch": 5.97037037037037, + "grad_norm": 5.334336757659912, + "learning_rate": 1.2460425202587279e-09, + "loss": 1.684, + "step": 40300 + }, + { + "epoch": 5.977777777777778, + "grad_norm": 5.39022159576416, + "learning_rate": 7.032319136845989e-10, + "loss": 1.6564, + "step": 40350 + }, + { + "epoch": 5.985185185185185, + "grad_norm": 4.58676290512085, + "learning_rate": 3.1462283299199534e-10, + "loss": 1.7112, + "step": 40400 + }, + { + "epoch": 5.992592592592593, + "grad_norm": 3.8273935317993164, + "learning_rate": 8.022127101492416e-11, + "loss": 1.6573, + "step": 40450 + }, + { + "epoch": 6.0, + "grad_norm": 4.507394313812256, + "learning_rate": 3.0842514098239354e-14, + "loss": 1.704, + "step": 40500 + }, + { + "epoch": 6.007407407407407, + "grad_norm": 4.695806503295898, + "learning_rate": 9.968610323118833e-07, + "loss": 1.7468, + "step": 40550 + }, + { + "epoch": 6.014814814814815, + "grad_norm": 4.552427768707275, + "learning_rate": 9.822878109140143e-07, + "loss": 1.6836, + "step": 40600 + }, + { + "epoch": 6.022222222222222, + "grad_norm": 5.570436954498291, + "learning_rate": 9.678163953077624e-07, + "loss": 1.6861, + "step": 40650 + }, + { + "epoch": 6.029629629629629, + "grad_norm": 4.6333909034729, + "learning_rate": 9.53446948868707e-07, + "loss": 1.6624, + "step": 40700 + }, + { + "epoch": 6.037037037037037, + "grad_norm": 6.026545524597168, + "learning_rate": 9.391796338212356e-07, + "loss": 1.6884, + "step": 40750 + }, + { + "epoch": 6.044444444444444, + "grad_norm": 4.965294361114502, + "learning_rate": 9.25014611236723e-07, + "loss": 1.7189, + "step": 40800 + }, + { + "epoch": 6.051851851851852, + "grad_norm": 4.187284469604492, + "learning_rate": 9.109520410317107e-07, + "loss": 1.6792, + "step": 40850 + }, + { + "epoch": 6.059259259259259, + "grad_norm": 5.307417392730713, + "learning_rate": 8.969920819661016e-07, + "loss": 1.6886, + "step": 40900 + }, + { + "epoch": 6.066666666666666, + "grad_norm": 5.512707710266113, + "learning_rate": 8.831348916413606e-07, + "loss": 1.7772, + "step": 40950 + }, + { + "epoch": 6.074074074074074, + "grad_norm": 5.011601448059082, + "learning_rate": 8.693806264987482e-07, + "loss": 1.7525, + "step": 41000 + }, + { + "epoch": 6.0814814814814815, + "grad_norm": 4.537539005279541, + "learning_rate": 8.55729441817541e-07, + "loss": 1.6975, + "step": 41050 + }, + { + "epoch": 6.088888888888889, + "grad_norm": 4.727987766265869, + "learning_rate": 8.421814917132898e-07, + "loss": 1.67, + "step": 41100 + }, + { + "epoch": 6.0962962962962965, + "grad_norm": 4.8712382316589355, + "learning_rate": 8.287369291360736e-07, + "loss": 1.7992, + "step": 41150 + }, + { + "epoch": 6.103703703703704, + "grad_norm": 4.62169075012207, + "learning_rate": 8.1539590586877e-07, + "loss": 1.7066, + "step": 41200 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 6.729462146759033, + "learning_rate": 8.021585725253511e-07, + "loss": 1.6944, + "step": 41250 + }, + { + "epoch": 6.118518518518519, + "grad_norm": 4.7458577156066895, + "learning_rate": 7.890250785491771e-07, + "loss": 1.7662, + "step": 41300 + }, + { + "epoch": 6.125925925925926, + "grad_norm": 4.354922771453857, + "learning_rate": 7.759955722113077e-07, + "loss": 1.679, + "step": 41350 + }, + { + "epoch": 6.133333333333334, + "grad_norm": 5.127136707305908, + "learning_rate": 7.630702006088298e-07, + "loss": 1.758, + "step": 41400 + }, + { + "epoch": 6.140740740740741, + "grad_norm": 4.486684322357178, + "learning_rate": 7.502491096632003e-07, + "loss": 1.7028, + "step": 41450 + }, + { + "epoch": 6.148148148148148, + "grad_norm": 4.6135077476501465, + "learning_rate": 7.375324441185938e-07, + "loss": 1.7477, + "step": 41500 + }, + { + "epoch": 6.155555555555556, + "grad_norm": 4.534246921539307, + "learning_rate": 7.249203475402722e-07, + "loss": 1.6541, + "step": 41550 + }, + { + "epoch": 6.162962962962963, + "grad_norm": 4.888129711151123, + "learning_rate": 7.124129623129605e-07, + "loss": 1.6781, + "step": 41600 + }, + { + "epoch": 6.17037037037037, + "grad_norm": 4.807866096496582, + "learning_rate": 7.000104296392418e-07, + "loss": 1.7778, + "step": 41650 + }, + { + "epoch": 6.177777777777778, + "grad_norm": 5.536331653594971, + "learning_rate": 6.877128895379625e-07, + "loss": 1.7536, + "step": 41700 + }, + { + "epoch": 6.185185185185185, + "grad_norm": 4.815216541290283, + "learning_rate": 6.755204808426529e-07, + "loss": 1.7125, + "step": 41750 + }, + { + "epoch": 6.192592592592592, + "grad_norm": 5.147290229797363, + "learning_rate": 6.634333411999527e-07, + "loss": 1.7725, + "step": 41800 + }, + { + "epoch": 6.2, + "grad_norm": 5.061643600463867, + "learning_rate": 6.51451607068071e-07, + "loss": 1.7192, + "step": 41850 + }, + { + "epoch": 6.207407407407407, + "grad_norm": 4.440853595733643, + "learning_rate": 6.395754137152321e-07, + "loss": 1.6838, + "step": 41900 + }, + { + "epoch": 6.214814814814815, + "grad_norm": 4.56868314743042, + "learning_rate": 6.278048952181548e-07, + "loss": 1.7385, + "step": 41950 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 3.9864776134490967, + "learning_rate": 6.161401844605408e-07, + "loss": 1.7712, + "step": 42000 + }, + { + "epoch": 6.229629629629629, + "grad_norm": 4.387381553649902, + "learning_rate": 6.045814131315653e-07, + "loss": 1.6408, + "step": 42050 + }, + { + "epoch": 6.237037037037037, + "grad_norm": 4.279029369354248, + "learning_rate": 5.931287117244012e-07, + "loss": 1.7067, + "step": 42100 + }, + { + "epoch": 6.2444444444444445, + "grad_norm": 4.650752067565918, + "learning_rate": 5.817822095347414e-07, + "loss": 1.7025, + "step": 42150 + }, + { + "epoch": 6.2518518518518515, + "grad_norm": 4.82672643661499, + "learning_rate": 5.705420346593349e-07, + "loss": 1.638, + "step": 42200 + }, + { + "epoch": 6.2592592592592595, + "grad_norm": 4.590485095977783, + "learning_rate": 5.594083139945505e-07, + "loss": 1.6822, + "step": 42250 + }, + { + "epoch": 6.266666666666667, + "grad_norm": 4.995912551879883, + "learning_rate": 5.48381173234932e-07, + "loss": 1.6746, + "step": 42300 + }, + { + "epoch": 6.274074074074074, + "grad_norm": 5.399720668792725, + "learning_rate": 5.374607368717899e-07, + "loss": 1.7591, + "step": 42350 + }, + { + "epoch": 6.281481481481482, + "grad_norm": 4.2428693771362305, + "learning_rate": 5.266471281917906e-07, + "loss": 1.7012, + "step": 42400 + }, + { + "epoch": 6.288888888888889, + "grad_norm": 5.344005107879639, + "learning_rate": 5.159404692755621e-07, + "loss": 1.7282, + "step": 42450 + }, + { + "epoch": 6.296296296296296, + "grad_norm": 5.161723613739014, + "learning_rate": 5.053408809963234e-07, + "loss": 1.6308, + "step": 42500 + }, + { + "epoch": 6.303703703703704, + "grad_norm": 4.982937812805176, + "learning_rate": 4.948484830185152e-07, + "loss": 1.8198, + "step": 42550 + }, + { + "epoch": 6.311111111111111, + "grad_norm": 6.293619632720947, + "learning_rate": 4.844633937964471e-07, + "loss": 1.7127, + "step": 42600 + }, + { + "epoch": 6.318518518518519, + "grad_norm": 4.575809478759766, + "learning_rate": 4.741857305729636e-07, + "loss": 1.668, + "step": 42650 + }, + { + "epoch": 6.325925925925926, + "grad_norm": 4.730220794677734, + "learning_rate": 4.6401560937812006e-07, + "loss": 1.7546, + "step": 42700 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 5.089296340942383, + "learning_rate": 4.5395314502787224e-07, + "loss": 1.6804, + "step": 42750 + }, + { + "epoch": 6.340740740740741, + "grad_norm": 4.743657112121582, + "learning_rate": 4.4399845112277795e-07, + "loss": 1.7564, + "step": 42800 + }, + { + "epoch": 6.348148148148148, + "grad_norm": 3.8570306301116943, + "learning_rate": 4.341516400467194e-07, + "loss": 1.6802, + "step": 42850 + }, + { + "epoch": 6.355555555555555, + "grad_norm": 5.294703960418701, + "learning_rate": 4.244128229656297e-07, + "loss": 1.7917, + "step": 42900 + }, + { + "epoch": 6.362962962962963, + "grad_norm": 4.600700855255127, + "learning_rate": 4.1478210982624055e-07, + "loss": 1.6821, + "step": 42950 + }, + { + "epoch": 6.37037037037037, + "grad_norm": 5.0378642082214355, + "learning_rate": 4.052596093548433e-07, + "loss": 1.7137, + "step": 43000 + }, + { + "epoch": 6.377777777777778, + "grad_norm": 5.134553909301758, + "learning_rate": 3.9584542905604984e-07, + "loss": 1.7319, + "step": 43050 + }, + { + "epoch": 6.385185185185185, + "grad_norm": 4.90350866317749, + "learning_rate": 3.8653967521159683e-07, + "loss": 1.7284, + "step": 43100 + }, + { + "epoch": 6.392592592592592, + "grad_norm": 5.329576015472412, + "learning_rate": 3.773424528791314e-07, + "loss": 1.7192, + "step": 43150 + }, + { + "epoch": 6.4, + "grad_norm": 6.085256099700928, + "learning_rate": 3.682538658910317e-07, + "loss": 1.7121, + "step": 43200 + }, + { + "epoch": 6.407407407407407, + "grad_norm": 5.10631799697876, + "learning_rate": 3.5927401685323383e-07, + "loss": 1.7475, + "step": 43250 + }, + { + "epoch": 6.4148148148148145, + "grad_norm": 5.675637722015381, + "learning_rate": 3.5040300714407116e-07, + "loss": 1.7204, + "step": 43300 + }, + { + "epoch": 6.4222222222222225, + "grad_norm": 4.893547058105469, + "learning_rate": 3.4164093691313126e-07, + "loss": 1.7431, + "step": 43350 + }, + { + "epoch": 6.42962962962963, + "grad_norm": 5.115847110748291, + "learning_rate": 3.329879050801288e-07, + "loss": 1.688, + "step": 43400 + }, + { + "epoch": 6.437037037037037, + "grad_norm": 4.1967010498046875, + "learning_rate": 3.2444400933378085e-07, + "loss": 1.6674, + "step": 43450 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 5.261149883270264, + "learning_rate": 3.160093461307123e-07, + "loss": 1.7298, + "step": 43500 + }, + { + "epoch": 6.451851851851852, + "grad_norm": 5.664730072021484, + "learning_rate": 3.0768401069436235e-07, + "loss": 1.7269, + "step": 43550 + }, + { + "epoch": 6.459259259259259, + "grad_norm": 4.641995906829834, + "learning_rate": 2.9946809701390855e-07, + "loss": 1.7186, + "step": 43600 + }, + { + "epoch": 6.466666666666667, + "grad_norm": 5.614559650421143, + "learning_rate": 2.91361697843211e-07, + "loss": 1.6936, + "step": 43650 + }, + { + "epoch": 6.474074074074074, + "grad_norm": 4.712127208709717, + "learning_rate": 2.8336490469975665e-07, + "loss": 1.6855, + "step": 43700 + }, + { + "epoch": 6.481481481481482, + "grad_norm": 4.914728164672852, + "learning_rate": 2.7547780786363666e-07, + "loss": 1.7235, + "step": 43750 + }, + { + "epoch": 6.488888888888889, + "grad_norm": 3.8866348266601562, + "learning_rate": 2.677004963765184e-07, + "loss": 1.6946, + "step": 43800 + }, + { + "epoch": 6.496296296296296, + "grad_norm": 4.636266231536865, + "learning_rate": 2.6003305804064626e-07, + "loss": 1.7051, + "step": 43850 + }, + { + "epoch": 6.503703703703704, + "grad_norm": 4.9295501708984375, + "learning_rate": 2.524755794178413e-07, + "loss": 1.7064, + "step": 43900 + }, + { + "epoch": 6.511111111111111, + "grad_norm": 4.815357685089111, + "learning_rate": 2.4502814582853863e-07, + "loss": 1.7588, + "step": 43950 + }, + { + "epoch": 6.518518518518518, + "grad_norm": 5.123908519744873, + "learning_rate": 2.3769084135081165e-07, + "loss": 1.6893, + "step": 44000 + }, + { + "epoch": 6.525925925925926, + "grad_norm": 4.06140661239624, + "learning_rate": 2.3046374881942614e-07, + "loss": 1.7157, + "step": 44050 + }, + { + "epoch": 6.533333333333333, + "grad_norm": 4.8077569007873535, + "learning_rate": 2.2334694982490857e-07, + "loss": 1.7089, + "step": 44100 + }, + { + "epoch": 6.540740740740741, + "grad_norm": 4.5206828117370605, + "learning_rate": 2.1634052471262267e-07, + "loss": 1.6945, + "step": 44150 + }, + { + "epoch": 6.548148148148148, + "grad_norm": 4.904362678527832, + "learning_rate": 2.0944455258185893e-07, + "loss": 1.6603, + "step": 44200 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 4.935449123382568, + "learning_rate": 2.0265911128494852e-07, + "loss": 1.7091, + "step": 44250 + }, + { + "epoch": 6.562962962962963, + "grad_norm": 5.564135551452637, + "learning_rate": 1.9598427742637872e-07, + "loss": 1.7323, + "step": 44300 + }, + { + "epoch": 6.57037037037037, + "grad_norm": 5.302987575531006, + "learning_rate": 1.8942012636192997e-07, + "loss": 1.7145, + "step": 44350 + }, + { + "epoch": 6.5777777777777775, + "grad_norm": 4.711696147918701, + "learning_rate": 1.829667321978268e-07, + "loss": 1.6974, + "step": 44400 + }, + { + "epoch": 6.5851851851851855, + "grad_norm": 5.5999932289123535, + "learning_rate": 1.7662416778989722e-07, + "loss": 1.7, + "step": 44450 + }, + { + "epoch": 6.592592592592593, + "grad_norm": 4.745241641998291, + "learning_rate": 1.7039250474275682e-07, + "loss": 1.7128, + "step": 44500 + }, + { + "epoch": 6.6, + "grad_norm": 4.5061750411987305, + "learning_rate": 1.6427181340899045e-07, + "loss": 1.8055, + "step": 44550 + }, + { + "epoch": 6.607407407407408, + "grad_norm": 3.9571361541748047, + "learning_rate": 1.5826216288836738e-07, + "loss": 1.6653, + "step": 44600 + }, + { + "epoch": 6.614814814814815, + "grad_norm": 5.4514288902282715, + "learning_rate": 1.523636210270585e-07, + "loss": 1.7121, + "step": 44650 + }, + { + "epoch": 6.622222222222222, + "grad_norm": 4.900452613830566, + "learning_rate": 1.4657625441686697e-07, + "loss": 1.6662, + "step": 44700 + }, + { + "epoch": 6.62962962962963, + "grad_norm": 4.878017902374268, + "learning_rate": 1.4090012839447998e-07, + "loss": 1.6853, + "step": 44750 + }, + { + "epoch": 6.637037037037037, + "grad_norm": 6.244802951812744, + "learning_rate": 1.353353070407304e-07, + "loss": 1.7196, + "step": 44800 + }, + { + "epoch": 6.644444444444445, + "grad_norm": 4.747304439544678, + "learning_rate": 1.2988185317987178e-07, + "loss": 1.7786, + "step": 44850 + }, + { + "epoch": 6.651851851851852, + "grad_norm": 4.684929370880127, + "learning_rate": 1.2453982837887123e-07, + "loss": 1.7424, + "step": 44900 + }, + { + "epoch": 6.659259259259259, + "grad_norm": 5.081334114074707, + "learning_rate": 1.1930929294671324e-07, + "loss": 1.7229, + "step": 44950 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 4.667269706726074, + "learning_rate": 1.1419030593371794e-07, + "loss": 1.7856, + "step": 45000 + }, + { + "epoch": 6.674074074074074, + "grad_norm": 5.279778957366943, + "learning_rate": 1.0918292513087736e-07, + "loss": 1.7846, + "step": 45050 + }, + { + "epoch": 6.681481481481481, + "grad_norm": 5.6646270751953125, + "learning_rate": 1.0428720706920137e-07, + "loss": 1.7493, + "step": 45100 + }, + { + "epoch": 6.688888888888889, + "grad_norm": 4.250929832458496, + "learning_rate": 9.950320701907601e-08, + "loss": 1.7586, + "step": 45150 + }, + { + "epoch": 6.696296296296296, + "grad_norm": 5.012825012207031, + "learning_rate": 9.48309789896451e-08, + "loss": 1.7286, + "step": 45200 + }, + { + "epoch": 6.703703703703704, + "grad_norm": 4.255910396575928, + "learning_rate": 9.027057572819963e-08, + "loss": 1.774, + "step": 45250 + }, + { + "epoch": 6.711111111111111, + "grad_norm": 6.555685997009277, + "learning_rate": 8.582204871957711e-08, + "loss": 1.6969, + "step": 45300 + }, + { + "epoch": 6.718518518518518, + "grad_norm": 5.877486228942871, + "learning_rate": 8.148544818558646e-08, + "loss": 1.7386, + "step": 45350 + }, + { + "epoch": 6.725925925925926, + "grad_norm": 6.148101806640625, + "learning_rate": 7.726082308443849e-08, + "loss": 1.6688, + "step": 45400 + }, + { + "epoch": 6.733333333333333, + "grad_norm": 5.574037075042725, + "learning_rate": 7.314822111019304e-08, + "loss": 1.7214, + "step": 45450 + }, + { + "epoch": 6.7407407407407405, + "grad_norm": 4.646797180175781, + "learning_rate": 6.914768869221933e-08, + "loss": 1.7853, + "step": 45500 + }, + { + "epoch": 6.7481481481481485, + "grad_norm": 4.4932661056518555, + "learning_rate": 6.525927099467644e-08, + "loss": 1.6435, + "step": 45550 + }, + { + "epoch": 6.7555555555555555, + "grad_norm": 4.503779411315918, + "learning_rate": 6.148301191599593e-08, + "loss": 1.7112, + "step": 45600 + }, + { + "epoch": 6.762962962962963, + "grad_norm": 6.027365684509277, + "learning_rate": 5.781895408839333e-08, + "loss": 1.7374, + "step": 45650 + }, + { + "epoch": 6.770370370370371, + "grad_norm": 3.774756908416748, + "learning_rate": 5.426713887738522e-08, + "loss": 1.6818, + "step": 45700 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 4.556633949279785, + "learning_rate": 5.082760638131734e-08, + "loss": 1.6574, + "step": 45750 + }, + { + "epoch": 6.785185185185185, + "grad_norm": 4.793794631958008, + "learning_rate": 4.750039543091944e-08, + "loss": 1.7449, + "step": 45800 + }, + { + "epoch": 6.792592592592593, + "grad_norm": 4.510838508605957, + "learning_rate": 4.4285543588858946e-08, + "loss": 1.7321, + "step": 45850 + }, + { + "epoch": 6.8, + "grad_norm": 4.843527317047119, + "learning_rate": 4.118308714932462e-08, + "loss": 1.7609, + "step": 45900 + }, + { + "epoch": 6.807407407407408, + "grad_norm": 6.334366321563721, + "learning_rate": 3.819306113761245e-08, + "loss": 1.6737, + "step": 45950 + }, + { + "epoch": 6.814814814814815, + "grad_norm": 5.0371994972229, + "learning_rate": 3.531549930973044e-08, + "loss": 1.6988, + "step": 46000 + }, + { + "epoch": 6.822222222222222, + "grad_norm": 4.915505886077881, + "learning_rate": 3.255043415201664e-08, + "loss": 1.6995, + "step": 46050 + }, + { + "epoch": 6.82962962962963, + "grad_norm": 5.524471282958984, + "learning_rate": 2.989789688077615e-08, + "loss": 1.68, + "step": 46100 + }, + { + "epoch": 6.837037037037037, + "grad_norm": 7.363884449005127, + "learning_rate": 2.7357917441926952e-08, + "loss": 1.6951, + "step": 46150 + }, + { + "epoch": 6.844444444444444, + "grad_norm": 4.0302534103393555, + "learning_rate": 2.4930524510656805e-08, + "loss": 1.6722, + "step": 46200 + }, + { + "epoch": 6.851851851851852, + "grad_norm": 4.887310028076172, + "learning_rate": 2.261574549111023e-08, + "loss": 1.7051, + "step": 46250 + }, + { + "epoch": 6.859259259259259, + "grad_norm": 5.650010585784912, + "learning_rate": 2.0413606516066496e-08, + "loss": 1.6899, + "step": 46300 + }, + { + "epoch": 6.866666666666667, + "grad_norm": 4.32122802734375, + "learning_rate": 1.832413244665654e-08, + "loss": 1.7623, + "step": 46350 + }, + { + "epoch": 6.874074074074074, + "grad_norm": 4.738122940063477, + "learning_rate": 1.6347346872074287e-08, + "loss": 1.7246, + "step": 46400 + }, + { + "epoch": 6.881481481481481, + "grad_norm": 4.318696975708008, + "learning_rate": 1.4483272109310221e-08, + "loss": 1.7247, + "step": 46450 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 5.06472635269165, + "learning_rate": 1.2731929202907111e-08, + "loss": 1.694, + "step": 46500 + }, + { + "epoch": 6.896296296296296, + "grad_norm": 4.829148292541504, + "learning_rate": 1.1093337924714675e-08, + "loss": 1.7271, + "step": 46550 + }, + { + "epoch": 6.9037037037037035, + "grad_norm": 4.980674743652344, + "learning_rate": 9.567516773667517e-09, + "loss": 1.6858, + "step": 46600 + }, + { + "epoch": 6.911111111111111, + "grad_norm": 5.86638069152832, + "learning_rate": 8.154482975579747e-09, + "loss": 1.7079, + "step": 46650 + }, + { + "epoch": 6.9185185185185185, + "grad_norm": 4.498690605163574, + "learning_rate": 6.854252482949575e-09, + "loss": 1.7333, + "step": 46700 + }, + { + "epoch": 6.925925925925926, + "grad_norm": 4.175648212432861, + "learning_rate": 5.666839974775018e-09, + "loss": 1.6606, + "step": 46750 + }, + { + "epoch": 6.933333333333334, + "grad_norm": 4.801917552947998, + "learning_rate": 4.592258856394027e-09, + "loss": 1.7077, + "step": 46800 + }, + { + "epoch": 6.940740740740741, + "grad_norm": 3.6366991996765137, + "learning_rate": 3.6305212593290562e-09, + "loss": 1.702, + "step": 46850 + }, + { + "epoch": 6.948148148148148, + "grad_norm": 4.430602550506592, + "learning_rate": 2.781638041152723e-09, + "loss": 1.5927, + "step": 46900 + }, + { + "epoch": 6.955555555555556, + "grad_norm": 5.0436906814575195, + "learning_rate": 2.0456187853612476e-09, + "loss": 1.7092, + "step": 46950 + }, + { + "epoch": 6.962962962962963, + "grad_norm": 4.557282447814941, + "learning_rate": 1.422471801272307e-09, + "loss": 1.7299, + "step": 47000 + }, + { + "epoch": 6.97037037037037, + "grad_norm": 4.492140769958496, + "learning_rate": 9.122041239262302e-10, + "loss": 1.7693, + "step": 47050 + }, + { + "epoch": 6.977777777777778, + "grad_norm": 6.007195472717285, + "learning_rate": 5.148215140093893e-10, + "loss": 1.6889, + "step": 47100 + }, + { + "epoch": 6.985185185185185, + "grad_norm": 4.410074234008789, + "learning_rate": 2.3032845778647728e-10, + "loss": 1.7252, + "step": 47150 + }, + { + "epoch": 6.992592592592593, + "grad_norm": 4.711976528167725, + "learning_rate": 5.872816705165817e-11, + "loss": 1.7031, + "step": 47200 + }, + { + "epoch": 7.0, + "grad_norm": 5.362443923950195, + "learning_rate": 2.2579094149932645e-14, + "loss": 1.7029, + "step": 47250 + }, + { + "epoch": 7.007407407407407, + "grad_norm": 5.93522310256958, + "learning_rate": 7.642412558190549e-07, + "loss": 1.6308, + "step": 47300 + }, + { + "epoch": 7.014814814814815, + "grad_norm": 5.8510308265686035, + "learning_rate": 7.530237348772096e-07, + "loss": 1.7759, + "step": 47350 + }, + { + "epoch": 7.022222222222222, + "grad_norm": 4.586331367492676, + "learning_rate": 7.418859273881395e-07, + "loss": 1.7443, + "step": 47400 + }, + { + "epoch": 7.029629629629629, + "grad_norm": 4.656227111816406, + "learning_rate": 7.308279293651843e-07, + "loss": 1.6972, + "step": 47450 + }, + { + "epoch": 7.037037037037037, + "grad_norm": 5.146853446960449, + "learning_rate": 7.198498361336915e-07, + "loss": 1.7466, + "step": 47500 + }, + { + "epoch": 7.044444444444444, + "grad_norm": 6.802228927612305, + "learning_rate": 7.089517423301873e-07, + "loss": 1.6718, + "step": 47550 + }, + { + "epoch": 7.051851851851852, + "grad_norm": 5.65762186050415, + "learning_rate": 6.98133741901561e-07, + "loss": 1.6671, + "step": 47600 + }, + { + "epoch": 7.059259259259259, + "grad_norm": 4.821460247039795, + "learning_rate": 6.873959281042652e-07, + "loss": 1.7146, + "step": 47650 + }, + { + "epoch": 7.066666666666666, + "grad_norm": 5.252972602844238, + "learning_rate": 6.767383935034966e-07, + "loss": 1.7193, + "step": 47700 + }, + { + "epoch": 7.074074074074074, + "grad_norm": 4.320099353790283, + "learning_rate": 6.661612299724151e-07, + "loss": 1.6708, + "step": 47750 + }, + { + "epoch": 7.0814814814814815, + "grad_norm": 5.750055313110352, + "learning_rate": 6.556645286913321e-07, + "loss": 1.7479, + "step": 47800 + }, + { + "epoch": 7.088888888888889, + "grad_norm": 5.870389461517334, + "learning_rate": 6.452483801469445e-07, + "loss": 1.6698, + "step": 47850 + }, + { + "epoch": 7.0962962962962965, + "grad_norm": 4.311103820800781, + "learning_rate": 6.349128741315391e-07, + "loss": 1.6857, + "step": 47900 + }, + { + "epoch": 7.103703703703704, + "grad_norm": 4.5788421630859375, + "learning_rate": 6.24658099742228e-07, + "loss": 1.7202, + "step": 47950 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 5.257473468780518, + "learning_rate": 6.144841453801753e-07, + "loss": 1.6565, + "step": 48000 + }, + { + "epoch": 7.118518518518519, + "grad_norm": 5.473094463348389, + "learning_rate": 6.043910987498392e-07, + "loss": 1.6446, + "step": 48050 + }, + { + "epoch": 7.125925925925926, + "grad_norm": 4.250508785247803, + "learning_rate": 5.943790468582123e-07, + "loss": 1.6908, + "step": 48100 + }, + { + "epoch": 7.133333333333334, + "grad_norm": 4.74968957901001, + "learning_rate": 5.844480760140692e-07, + "loss": 1.6499, + "step": 48150 + }, + { + "epoch": 7.140740740740741, + "grad_norm": 4.537721157073975, + "learning_rate": 5.745982718272358e-07, + "loss": 1.6836, + "step": 48200 + }, + { + "epoch": 7.148148148148148, + "grad_norm": 5.611239433288574, + "learning_rate": 5.648297192078312e-07, + "loss": 1.7206, + "step": 48250 + }, + { + "epoch": 7.155555555555556, + "grad_norm": 6.5248613357543945, + "learning_rate": 5.551425023655555e-07, + "loss": 1.754, + "step": 48300 + }, + { + "epoch": 7.162962962962963, + "grad_norm": 5.197353839874268, + "learning_rate": 5.455367048089455e-07, + "loss": 1.7913, + "step": 48350 + }, + { + "epoch": 7.17037037037037, + "grad_norm": 5.016681671142578, + "learning_rate": 5.360124093446728e-07, + "loss": 1.7725, + "step": 48400 + }, + { + "epoch": 7.177777777777778, + "grad_norm": 4.837548732757568, + "learning_rate": 5.265696980768154e-07, + "loss": 1.6899, + "step": 48450 + }, + { + "epoch": 7.185185185185185, + "grad_norm": 4.5065436363220215, + "learning_rate": 5.172086524061582e-07, + "loss": 1.6665, + "step": 48500 + }, + { + "epoch": 7.192592592592592, + "grad_norm": 5.763499736785889, + "learning_rate": 5.079293530294882e-07, + "loss": 1.6199, + "step": 48550 + }, + { + "epoch": 7.2, + "grad_norm": 4.539221286773682, + "learning_rate": 4.987318799388962e-07, + "loss": 1.6389, + "step": 48600 + }, + { + "epoch": 7.207407407407407, + "grad_norm": 5.979331016540527, + "learning_rate": 4.896163124210984e-07, + "loss": 1.8119, + "step": 48650 + }, + { + "epoch": 7.214814814814815, + "grad_norm": 6.506845951080322, + "learning_rate": 4.805827290567344e-07, + "loss": 1.7298, + "step": 48700 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 4.519911766052246, + "learning_rate": 4.716312077197116e-07, + "loss": 1.7396, + "step": 48750 + }, + { + "epoch": 7.229629629629629, + "grad_norm": 6.238486289978027, + "learning_rate": 4.6276182557651206e-07, + "loss": 1.6573, + "step": 48800 + }, + { + "epoch": 7.237037037037037, + "grad_norm": 6.776463985443115, + "learning_rate": 4.539746590855465e-07, + "loss": 1.7192, + "step": 48850 + }, + { + "epoch": 7.2444444444444445, + "grad_norm": 4.782049179077148, + "learning_rate": 4.4526978399648034e-07, + "loss": 1.7457, + "step": 48900 + }, + { + "epoch": 7.2518518518518515, + "grad_norm": 5.603179931640625, + "learning_rate": 4.3664727534959093e-07, + "loss": 1.6802, + "step": 48950 + }, + { + "epoch": 7.2592592592592595, + "grad_norm": 5.583448886871338, + "learning_rate": 4.2810720747511356e-07, + "loss": 1.7304, + "step": 49000 + }, + { + "epoch": 7.266666666666667, + "grad_norm": 5.498423099517822, + "learning_rate": 4.196496539926065e-07, + "loss": 1.6843, + "step": 49050 + }, + { + "epoch": 7.274074074074074, + "grad_norm": 4.396875381469727, + "learning_rate": 4.1127468781031244e-07, + "loss": 1.7462, + "step": 49100 + }, + { + "epoch": 7.281481481481482, + "grad_norm": 4.723330020904541, + "learning_rate": 4.0298238112453034e-07, + "loss": 1.7431, + "step": 49150 + }, + { + "epoch": 7.288888888888889, + "grad_norm": 5.44240140914917, + "learning_rate": 3.9477280541899696e-07, + "loss": 1.6973, + "step": 49200 + }, + { + "epoch": 7.296296296296296, + "grad_norm": 3.720705270767212, + "learning_rate": 3.866460314642617e-07, + "loss": 1.6725, + "step": 49250 + }, + { + "epoch": 7.303703703703704, + "grad_norm": 5.060797691345215, + "learning_rate": 3.786021293170905e-07, + "loss": 1.7596, + "step": 49300 + }, + { + "epoch": 7.311111111111111, + "grad_norm": 5.072236061096191, + "learning_rate": 3.706411683198452e-07, + "loss": 1.6878, + "step": 49350 + }, + { + "epoch": 7.318518518518519, + "grad_norm": 6.246946334838867, + "learning_rate": 3.627632170999029e-07, + "loss": 1.7211, + "step": 49400 + }, + { + "epoch": 7.325925925925926, + "grad_norm": 5.087826728820801, + "learning_rate": 3.5496834356904873e-07, + "loss": 1.6149, + "step": 49450 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 4.0795392990112305, + "learning_rate": 3.4725661492290285e-07, + "loss": 1.6767, + "step": 49500 + }, + { + "epoch": 7.340740740740741, + "grad_norm": 5.544936180114746, + "learning_rate": 3.396280976403321e-07, + "loss": 1.6989, + "step": 49550 + }, + { + "epoch": 7.348148148148148, + "grad_norm": 4.768216609954834, + "learning_rate": 3.320828574828805e-07, + "loss": 1.7244, + "step": 49600 + }, + { + "epoch": 7.355555555555555, + "grad_norm": 5.588283538818359, + "learning_rate": 3.2462095949420734e-07, + "loss": 1.6634, + "step": 49650 + }, + { + "epoch": 7.362962962962963, + "grad_norm": 5.01304292678833, + "learning_rate": 3.172424679995167e-07, + "loss": 1.7597, + "step": 49700 + }, + { + "epoch": 7.37037037037037, + "grad_norm": 5.483970642089844, + "learning_rate": 3.0994744660501117e-07, + "loss": 1.631, + "step": 49750 + }, + { + "epoch": 7.377777777777778, + "grad_norm": 4.719277858734131, + "learning_rate": 3.027359581973377e-07, + "loss": 1.7333, + "step": 49800 + }, + { + "epoch": 7.385185185185185, + "grad_norm": 4.411671161651611, + "learning_rate": 2.956080649430504e-07, + "loss": 1.7788, + "step": 49850 + }, + { + "epoch": 7.392592592592592, + "grad_norm": 4.788469314575195, + "learning_rate": 2.885638282880698e-07, + "loss": 1.7177, + "step": 49900 + }, + { + "epoch": 7.4, + "grad_norm": 5.937678813934326, + "learning_rate": 2.8160330895715905e-07, + "loss": 1.728, + "step": 49950 + }, + { + "epoch": 7.407407407407407, + "grad_norm": 5.438296318054199, + "learning_rate": 2.747265669533938e-07, + "loss": 1.6454, + "step": 50000 + }, + { + "epoch": 7.4148148148148145, + "grad_norm": 4.224100112915039, + "learning_rate": 2.6793366155765e-07, + "loss": 1.6623, + "step": 50050 + }, + { + "epoch": 7.4222222222222225, + "grad_norm": 4.0681657791137695, + "learning_rate": 2.6122465132809026e-07, + "loss": 1.6685, + "step": 50100 + }, + { + "epoch": 7.42962962962963, + "grad_norm": 4.77517032623291, + "learning_rate": 2.545995940996604e-07, + "loss": 1.7017, + "step": 50150 + }, + { + "epoch": 7.437037037037037, + "grad_norm": 5.433816432952881, + "learning_rate": 2.480585469835917e-07, + "loss": 1.7676, + "step": 50200 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 4.526613712310791, + "learning_rate": 2.4160156636690355e-07, + "loss": 1.681, + "step": 50250 + }, + { + "epoch": 7.451851851851852, + "grad_norm": 5.165018081665039, + "learning_rate": 2.352287079119242e-07, + "loss": 1.7755, + "step": 50300 + }, + { + "epoch": 7.459259259259259, + "grad_norm": 4.912266254425049, + "learning_rate": 2.2894002655580727e-07, + "loss": 1.7047, + "step": 50350 + }, + { + "epoch": 7.466666666666667, + "grad_norm": 4.354547023773193, + "learning_rate": 2.2273557651005806e-07, + "loss": 1.6958, + "step": 50400 + }, + { + "epoch": 7.474074074074074, + "grad_norm": 4.184830188751221, + "learning_rate": 2.1661541126006603e-07, + "loss": 1.6962, + "step": 50450 + }, + { + "epoch": 7.481481481481482, + "grad_norm": 5.213493824005127, + "learning_rate": 2.1057958356464624e-07, + "loss": 1.6959, + "step": 50500 + }, + { + "epoch": 7.488888888888889, + "grad_norm": 4.918919086456299, + "learning_rate": 2.0462814545558318e-07, + "loss": 1.6993, + "step": 50550 + }, + { + "epoch": 7.496296296296296, + "grad_norm": 4.9153218269348145, + "learning_rate": 1.987611482371776e-07, + "loss": 1.6823, + "step": 50600 + }, + { + "epoch": 7.503703703703704, + "grad_norm": 4.119404315948486, + "learning_rate": 1.929786424858149e-07, + "loss": 1.6665, + "step": 50650 + }, + { + "epoch": 7.511111111111111, + "grad_norm": 4.445786476135254, + "learning_rate": 1.8728067804951844e-07, + "loss": 1.6962, + "step": 50700 + }, + { + "epoch": 7.518518518518518, + "grad_norm": 7.642633438110352, + "learning_rate": 1.8166730404752474e-07, + "loss": 1.6504, + "step": 50750 + }, + { + "epoch": 7.525925925925926, + "grad_norm": 4.436051845550537, + "learning_rate": 1.7613856886986025e-07, + "loss": 1.7457, + "step": 50800 + }, + { + "epoch": 7.533333333333333, + "grad_norm": 4.766321182250977, + "learning_rate": 1.706945201769239e-07, + "loss": 1.7546, + "step": 50850 + }, + { + "epoch": 7.540740740740741, + "grad_norm": 5.1242289543151855, + "learning_rate": 1.6533520489907305e-07, + "loss": 1.6541, + "step": 50900 + }, + { + "epoch": 7.548148148148148, + "grad_norm": 4.930501937866211, + "learning_rate": 1.600606692362261e-07, + "loss": 1.7488, + "step": 50950 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 6.121901988983154, + "learning_rate": 1.5487095865745593e-07, + "loss": 1.7194, + "step": 51000 + }, + { + "epoch": 7.562962962962963, + "grad_norm": 4.327436447143555, + "learning_rate": 1.497661179006027e-07, + "loss": 1.7288, + "step": 51050 + }, + { + "epoch": 7.57037037037037, + "grad_norm": 5.480533599853516, + "learning_rate": 1.4474619097188835e-07, + "loss": 1.6727, + "step": 51100 + }, + { + "epoch": 7.5777777777777775, + "grad_norm": 5.402645111083984, + "learning_rate": 1.398112211455338e-07, + "loss": 1.675, + "step": 51150 + }, + { + "epoch": 7.5851851851851855, + "grad_norm": 4.789156913757324, + "learning_rate": 1.3496125096339129e-07, + "loss": 1.6841, + "step": 51200 + }, + { + "epoch": 7.592592592592593, + "grad_norm": 5.137218952178955, + "learning_rate": 1.3019632223457258e-07, + "loss": 1.6567, + "step": 51250 + }, + { + "epoch": 7.6, + "grad_norm": 5.8797736167907715, + "learning_rate": 1.2551647603509242e-07, + "loss": 1.7339, + "step": 51300 + }, + { + "epoch": 7.607407407407408, + "grad_norm": 5.697160720825195, + "learning_rate": 1.2092175270750905e-07, + "loss": 1.6819, + "step": 51350 + }, + { + "epoch": 7.614814814814815, + "grad_norm": 4.531454086303711, + "learning_rate": 1.1641219186058317e-07, + "loss": 1.6752, + "step": 51400 + }, + { + "epoch": 7.622222222222222, + "grad_norm": 4.519044876098633, + "learning_rate": 1.1198783236893274e-07, + "loss": 1.7337, + "step": 51450 + }, + { + "epoch": 7.62962962962963, + "grad_norm": 4.373157501220703, + "learning_rate": 1.076487123726977e-07, + "loss": 1.7309, + "step": 51500 + }, + { + "epoch": 7.637037037037037, + "grad_norm": 5.711727142333984, + "learning_rate": 1.0339486927721131e-07, + "loss": 1.5929, + "step": 51550 + }, + { + "epoch": 7.644444444444445, + "grad_norm": 4.87087869644165, + "learning_rate": 9.922633975268048e-08, + "loss": 1.7241, + "step": 51600 + }, + { + "epoch": 7.651851851851852, + "grad_norm": 5.056915760040283, + "learning_rate": 9.514315973386589e-08, + "loss": 1.6532, + "step": 51650 + }, + { + "epoch": 7.659259259259259, + "grad_norm": 6.0363569259643555, + "learning_rate": 9.114536441977351e-08, + "loss": 1.8079, + "step": 51700 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 4.777937412261963, + "learning_rate": 8.723298827335247e-08, + "loss": 1.7689, + "step": 51750 + }, + { + "epoch": 7.674074074074074, + "grad_norm": 4.326016426086426, + "learning_rate": 8.340606502119542e-08, + "loss": 1.7673, + "step": 51800 + }, + { + "epoch": 7.681481481481481, + "grad_norm": 4.3759541511535645, + "learning_rate": 7.96646276532509e-08, + "loss": 1.7127, + "step": 51850 + }, + { + "epoch": 7.688888888888889, + "grad_norm": 4.051638603210449, + "learning_rate": 7.600870842253805e-08, + "loss": 1.7043, + "step": 51900 + }, + { + "epoch": 7.696296296296296, + "grad_norm": 6.273709774017334, + "learning_rate": 7.243833884486462e-08, + "loss": 1.6677, + "step": 51950 + }, + { + "epoch": 7.703703703703704, + "grad_norm": 4.785228729248047, + "learning_rate": 6.895354969856271e-08, + "loss": 1.6629, + "step": 52000 + }, + { + "epoch": 7.711111111111111, + "grad_norm": 4.696812152862549, + "learning_rate": 6.555437102421458e-08, + "loss": 1.6376, + "step": 52050 + }, + { + "epoch": 7.718518518518518, + "grad_norm": 5.83518123626709, + "learning_rate": 6.224083212440391e-08, + "loss": 1.7027, + "step": 52100 + }, + { + "epoch": 7.725925925925926, + "grad_norm": 4.885500431060791, + "learning_rate": 5.9012961563451595e-08, + "loss": 1.7116, + "step": 52150 + }, + { + "epoch": 7.733333333333333, + "grad_norm": 6.395565509796143, + "learning_rate": 5.5870787167181526e-08, + "loss": 1.643, + "step": 52200 + }, + { + "epoch": 7.7407407407407405, + "grad_norm": 6.444253444671631, + "learning_rate": 5.281433602267072e-08, + "loss": 1.7616, + "step": 52250 + }, + { + "epoch": 7.7481481481481485, + "grad_norm": 4.798886299133301, + "learning_rate": 4.984363447802065e-08, + "loss": 1.6833, + "step": 52300 + }, + { + "epoch": 7.7555555555555555, + "grad_norm": 5.61707067489624, + "learning_rate": 4.695870814213188e-08, + "loss": 1.7252, + "step": 52350 + }, + { + "epoch": 7.762962962962963, + "grad_norm": 4.683418273925781, + "learning_rate": 4.4159581884476444e-08, + "loss": 1.6788, + "step": 52400 + }, + { + "epoch": 7.770370370370371, + "grad_norm": 7.159529209136963, + "learning_rate": 4.144627983489136e-08, + "loss": 1.6954, + "step": 52450 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 3.812352418899536, + "learning_rate": 3.881882538336657e-08, + "loss": 1.6584, + "step": 52500 + }, + { + "epoch": 7.785185185185185, + "grad_norm": 4.499040126800537, + "learning_rate": 3.627724117984177e-08, + "loss": 1.6402, + "step": 52550 + }, + { + "epoch": 7.792592592592593, + "grad_norm": 4.413599014282227, + "learning_rate": 3.3821549134014365e-08, + "loss": 1.6671, + "step": 52600 + }, + { + "epoch": 7.8, + "grad_norm": 5.2130126953125, + "learning_rate": 3.145177041514958e-08, + "loss": 1.6463, + "step": 52650 + }, + { + "epoch": 7.807407407407408, + "grad_norm": 4.559216022491455, + "learning_rate": 2.9167925451898393e-08, + "loss": 1.6551, + "step": 52700 + }, + { + "epoch": 7.814814814814815, + "grad_norm": 5.212027072906494, + "learning_rate": 2.6970033932118833e-08, + "loss": 1.7194, + "step": 52750 + }, + { + "epoch": 7.822222222222222, + "grad_norm": 5.727282524108887, + "learning_rate": 2.4858114802711607e-08, + "loss": 1.7803, + "step": 52800 + }, + { + "epoch": 7.82962962962963, + "grad_norm": 4.719393253326416, + "learning_rate": 2.2832186269449163e-08, + "loss": 1.7053, + "step": 52850 + }, + { + "epoch": 7.837037037037037, + "grad_norm": 6.631552696228027, + "learning_rate": 2.0892265796825795e-08, + "loss": 1.6426, + "step": 52900 + }, + { + "epoch": 7.844444444444444, + "grad_norm": 5.424106121063232, + "learning_rate": 1.9038370107905546e-08, + "loss": 1.6795, + "step": 52950 + }, + { + "epoch": 7.851851851851852, + "grad_norm": 4.815089702606201, + "learning_rate": 1.7270515184172333e-08, + "loss": 1.6766, + "step": 53000 + }, + { + "epoch": 7.859259259259259, + "grad_norm": 5.34329891204834, + "learning_rate": 1.5588716265398928e-08, + "loss": 1.6708, + "step": 53050 + }, + { + "epoch": 7.866666666666667, + "grad_norm": 4.181158065795898, + "learning_rate": 1.399298784951708e-08, + "loss": 1.7145, + "step": 53100 + }, + { + "epoch": 7.874074074074074, + "grad_norm": 4.6021623611450195, + "learning_rate": 1.2483343692482054e-08, + "loss": 1.6803, + "step": 53150 + }, + { + "epoch": 7.881481481481481, + "grad_norm": 5.097445011138916, + "learning_rate": 1.1059796808164935e-08, + "loss": 1.6934, + "step": 53200 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 4.321223258972168, + "learning_rate": 9.722359468234966e-09, + "loss": 1.6973, + "step": 53250 + }, + { + "epoch": 7.896296296296296, + "grad_norm": 5.460144519805908, + "learning_rate": 8.471043202057383e-09, + "loss": 1.6736, + "step": 53300 + }, + { + "epoch": 7.9037037037037035, + "grad_norm": 5.165498733520508, + "learning_rate": 7.305858796586851e-09, + "loss": 1.7208, + "step": 53350 + }, + { + "epoch": 7.911111111111111, + "grad_norm": 5.325474262237549, + "learning_rate": 6.226816296283078e-09, + "loss": 1.6963, + "step": 53400 + }, + { + "epoch": 7.9185185185185185, + "grad_norm": 4.388497352600098, + "learning_rate": 5.23392500302089e-09, + "loss": 1.6946, + "step": 53450 + }, + { + "epoch": 7.925925925925926, + "grad_norm": 4.683349609375, + "learning_rate": 4.327193476006963e-09, + "loss": 1.7026, + "step": 53500 + }, + { + "epoch": 7.933333333333334, + "grad_norm": 4.730576515197754, + "learning_rate": 3.506629531710992e-09, + "loss": 1.6988, + "step": 53550 + }, + { + "epoch": 7.940740740740741, + "grad_norm": 4.461126804351807, + "learning_rate": 2.7722402437935224e-09, + "loss": 1.7147, + "step": 53600 + }, + { + "epoch": 7.948148148148148, + "grad_norm": 4.75923490524292, + "learning_rate": 2.124031943050442e-09, + "loss": 1.6821, + "step": 53650 + }, + { + "epoch": 7.955555555555556, + "grad_norm": 4.524296760559082, + "learning_rate": 1.5620102173530272e-09, + "loss": 1.6972, + "step": 53700 + }, + { + "epoch": 7.962962962962963, + "grad_norm": 4.123456954956055, + "learning_rate": 1.0861799116046457e-09, + "loss": 1.6129, + "step": 53750 + }, + { + "epoch": 7.97037037037037, + "grad_norm": 5.126282691955566, + "learning_rate": 6.965451276919055e-10, + "loss": 1.6673, + "step": 53800 + }, + { + "epoch": 7.977777777777778, + "grad_norm": 5.508212089538574, + "learning_rate": 3.931092244602308e-10, + "loss": 1.7623, + "step": 53850 + }, + { + "epoch": 7.985185185185185, + "grad_norm": 5.099086284637451, + "learning_rate": 1.7587481767389336e-10, + "loss": 1.7465, + "step": 53900 + }, + { + "epoch": 7.992592592592593, + "grad_norm": 4.575199127197266, + "learning_rate": 4.484377999935952e-11, + "loss": 1.7073, + "step": 53950 + }, + { + "epoch": 8.0, + "grad_norm": 4.952852249145508, + "learning_rate": 1.724098974698052e-14, + "loss": 1.765, + "step": 54000 + }, + { + "epoch": 8.007407407407408, + "grad_norm": 5.91780424118042, + "learning_rate": 6.042434474232695e-07, + "loss": 1.6681, + "step": 54050 + }, + { + "epoch": 8.014814814814814, + "grad_norm": 5.638923168182373, + "learning_rate": 5.953501031737729e-07, + "loss": 1.6993, + "step": 54100 + }, + { + "epoch": 8.022222222222222, + "grad_norm": 4.633487224578857, + "learning_rate": 5.865206835271154e-07, + "loss": 1.6813, + "step": 54150 + }, + { + "epoch": 8.02962962962963, + "grad_norm": 5.461434364318848, + "learning_rate": 5.777552484979865e-07, + "loss": 1.7147, + "step": 54200 + }, + { + "epoch": 8.037037037037036, + "grad_norm": 4.728063583374023, + "learning_rate": 5.69053857666163e-07, + "loss": 1.6808, + "step": 54250 + }, + { + "epoch": 8.044444444444444, + "grad_norm": 4.8988213539123535, + "learning_rate": 5.604165701761067e-07, + "loss": 1.7503, + "step": 54300 + }, + { + "epoch": 8.051851851851852, + "grad_norm": 5.497891902923584, + "learning_rate": 5.518434447365606e-07, + "loss": 1.677, + "step": 54350 + }, + { + "epoch": 8.059259259259258, + "grad_norm": 4.321437358856201, + "learning_rate": 5.433345396201506e-07, + "loss": 1.7058, + "step": 54400 + }, + { + "epoch": 8.066666666666666, + "grad_norm": 5.676205158233643, + "learning_rate": 5.34889912662988e-07, + "loss": 1.6936, + "step": 54450 + }, + { + "epoch": 8.074074074074074, + "grad_norm": 4.2474894523620605, + "learning_rate": 5.265096212642762e-07, + "loss": 1.6949, + "step": 54500 + }, + { + "epoch": 8.081481481481482, + "grad_norm": 6.3445143699646, + "learning_rate": 5.181937223859246e-07, + "loss": 1.7373, + "step": 54550 + }, + { + "epoch": 8.088888888888889, + "grad_norm": 4.5797038078308105, + "learning_rate": 5.099422725521553e-07, + "loss": 1.6622, + "step": 54600 + }, + { + "epoch": 8.096296296296297, + "grad_norm": 5.370152950286865, + "learning_rate": 5.017553278491238e-07, + "loss": 1.6767, + "step": 54650 + }, + { + "epoch": 8.103703703703705, + "grad_norm": 5.182304859161377, + "learning_rate": 4.936329439245369e-07, + "loss": 1.7094, + "step": 54700 + }, + { + "epoch": 8.11111111111111, + "grad_norm": 4.945371150970459, + "learning_rate": 4.855751759872707e-07, + "loss": 1.6705, + "step": 54750 + }, + { + "epoch": 8.118518518518519, + "grad_norm": 4.181769847869873, + "learning_rate": 4.775820788070018e-07, + "loss": 1.7182, + "step": 54800 + }, + { + "epoch": 8.125925925925927, + "grad_norm": 4.810791969299316, + "learning_rate": 4.6965370671382735e-07, + "loss": 1.7706, + "step": 54850 + }, + { + "epoch": 8.133333333333333, + "grad_norm": 4.857903480529785, + "learning_rate": 4.6179011359790414e-07, + "loss": 1.7079, + "step": 54900 + }, + { + "epoch": 8.14074074074074, + "grad_norm": 3.9374470710754395, + "learning_rate": 4.539913529090734e-07, + "loss": 1.7181, + "step": 54950 + }, + { + "epoch": 8.148148148148149, + "grad_norm": 4.6941237449646, + "learning_rate": 4.462574776565054e-07, + "loss": 1.6491, + "step": 55000 + }, + { + "epoch": 8.155555555555555, + "grad_norm": 4.580036640167236, + "learning_rate": 4.3858854040833564e-07, + "loss": 1.7548, + "step": 55050 + }, + { + "epoch": 8.162962962962963, + "grad_norm": 3.972926139831543, + "learning_rate": 4.3098459329130813e-07, + "loss": 1.718, + "step": 55100 + }, + { + "epoch": 8.170370370370371, + "grad_norm": 5.282564163208008, + "learning_rate": 4.2344568799041807e-07, + "loss": 1.7448, + "step": 55150 + }, + { + "epoch": 8.177777777777777, + "grad_norm": 5.370199680328369, + "learning_rate": 4.159718757485642e-07, + "loss": 1.7111, + "step": 55200 + }, + { + "epoch": 8.185185185185185, + "grad_norm": 6.447271347045898, + "learning_rate": 4.085632073662016e-07, + "loss": 1.7007, + "step": 55250 + }, + { + "epoch": 8.192592592592593, + "grad_norm": 4.709756851196289, + "learning_rate": 4.012197332009915e-07, + "loss": 1.6363, + "step": 55300 + }, + { + "epoch": 8.2, + "grad_norm": 4.706228256225586, + "learning_rate": 3.9394150316746317e-07, + "loss": 1.6756, + "step": 55350 + }, + { + "epoch": 8.207407407407407, + "grad_norm": 4.972978591918945, + "learning_rate": 3.867285667366727e-07, + "loss": 1.7409, + "step": 55400 + }, + { + "epoch": 8.214814814814815, + "grad_norm": 4.665986061096191, + "learning_rate": 3.7958097293586684e-07, + "loss": 1.7058, + "step": 55450 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 5.660872459411621, + "learning_rate": 3.7249877034815306e-07, + "loss": 1.7001, + "step": 55500 + }, + { + "epoch": 8.22962962962963, + "grad_norm": 5.380090713500977, + "learning_rate": 3.654820071121612e-07, + "loss": 1.7128, + "step": 55550 + }, + { + "epoch": 8.237037037037037, + "grad_norm": 7.617238998413086, + "learning_rate": 3.5853073092172566e-07, + "loss": 1.6826, + "step": 55600 + }, + { + "epoch": 8.244444444444444, + "grad_norm": 5.554752349853516, + "learning_rate": 3.5164498902555687e-07, + "loss": 1.7362, + "step": 55650 + }, + { + "epoch": 8.251851851851852, + "grad_norm": 4.808479309082031, + "learning_rate": 3.448248282269173e-07, + "loss": 1.65, + "step": 55700 + }, + { + "epoch": 8.25925925925926, + "grad_norm": 3.840391159057617, + "learning_rate": 3.380702948833103e-07, + "loss": 1.7487, + "step": 55750 + }, + { + "epoch": 8.266666666666667, + "grad_norm": 5.313462257385254, + "learning_rate": 3.313814349061573e-07, + "loss": 1.6453, + "step": 55800 + }, + { + "epoch": 8.274074074074074, + "grad_norm": 4.17125129699707, + "learning_rate": 3.247582937604921e-07, + "loss": 1.709, + "step": 55850 + }, + { + "epoch": 8.281481481481482, + "grad_norm": 3.761216163635254, + "learning_rate": 3.1820091646464825e-07, + "loss": 1.6329, + "step": 55900 + }, + { + "epoch": 8.28888888888889, + "grad_norm": 5.288180351257324, + "learning_rate": 3.117093475899546e-07, + "loss": 1.6381, + "step": 55950 + }, + { + "epoch": 8.296296296296296, + "grad_norm": 4.487936973571777, + "learning_rate": 3.0528363126043016e-07, + "loss": 1.7076, + "step": 56000 + }, + { + "epoch": 8.303703703703704, + "grad_norm": 4.927828788757324, + "learning_rate": 2.9892381115248836e-07, + "loss": 1.6595, + "step": 56050 + }, + { + "epoch": 8.311111111111112, + "grad_norm": 4.865903377532959, + "learning_rate": 2.9262993049463564e-07, + "loss": 1.6762, + "step": 56100 + }, + { + "epoch": 8.318518518518518, + "grad_norm": 5.675967216491699, + "learning_rate": 2.864020320671812e-07, + "loss": 1.6536, + "step": 56150 + }, + { + "epoch": 8.325925925925926, + "grad_norm": 6.040846347808838, + "learning_rate": 2.8024015820194093e-07, + "loss": 1.6822, + "step": 56200 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 5.212625026702881, + "learning_rate": 2.741443507819597e-07, + "loss": 1.731, + "step": 56250 + }, + { + "epoch": 8.34074074074074, + "grad_norm": 4.386477947235107, + "learning_rate": 2.681146512412136e-07, + "loss": 1.6979, + "step": 56300 + }, + { + "epoch": 8.348148148148148, + "grad_norm": 5.230815887451172, + "learning_rate": 2.621511005643407e-07, + "loss": 1.7063, + "step": 56350 + }, + { + "epoch": 8.355555555555556, + "grad_norm": 4.48565673828125, + "learning_rate": 2.5625373928635176e-07, + "loss": 1.6047, + "step": 56400 + }, + { + "epoch": 8.362962962962962, + "grad_norm": 4.869395732879639, + "learning_rate": 2.5042260749236434e-07, + "loss": 1.7181, + "step": 56450 + }, + { + "epoch": 8.37037037037037, + "grad_norm": 4.625112056732178, + "learning_rate": 2.446577448173215e-07, + "loss": 1.7443, + "step": 56500 + }, + { + "epoch": 8.377777777777778, + "grad_norm": 4.910737991333008, + "learning_rate": 2.3895919044573223e-07, + "loss": 1.667, + "step": 56550 + }, + { + "epoch": 8.385185185185184, + "grad_norm": 5.058411598205566, + "learning_rate": 2.3332698311139378e-07, + "loss": 1.7574, + "step": 56600 + }, + { + "epoch": 8.392592592592592, + "grad_norm": 4.682469844818115, + "learning_rate": 2.2776116109713753e-07, + "loss": 1.6996, + "step": 56650 + }, + { + "epoch": 8.4, + "grad_norm": 5.020571708679199, + "learning_rate": 2.2226176223456353e-07, + "loss": 1.6864, + "step": 56700 + }, + { + "epoch": 8.407407407407407, + "grad_norm": 4.348459720611572, + "learning_rate": 2.1682882390378633e-07, + "loss": 1.737, + "step": 56750 + }, + { + "epoch": 8.414814814814815, + "grad_norm": 4.722019672393799, + "learning_rate": 2.1146238303317858e-07, + "loss": 1.6447, + "step": 56800 + }, + { + "epoch": 8.422222222222222, + "grad_norm": 5.096248626708984, + "learning_rate": 2.0616247609912543e-07, + "loss": 1.7022, + "step": 56850 + }, + { + "epoch": 8.42962962962963, + "grad_norm": 6.641050338745117, + "learning_rate": 2.0092913912576617e-07, + "loss": 1.696, + "step": 56900 + }, + { + "epoch": 8.437037037037037, + "grad_norm": 5.072902202606201, + "learning_rate": 1.9576240768475975e-07, + "loss": 1.6881, + "step": 56950 + }, + { + "epoch": 8.444444444444445, + "grad_norm": 4.475610733032227, + "learning_rate": 1.9066231689503721e-07, + "loss": 1.7255, + "step": 57000 + }, + { + "epoch": 8.451851851851853, + "grad_norm": 5.1616621017456055, + "learning_rate": 1.856289014225654e-07, + "loss": 1.7362, + "step": 57050 + }, + { + "epoch": 8.459259259259259, + "grad_norm": 4.945127964019775, + "learning_rate": 1.806621954801091e-07, + "loss": 1.7219, + "step": 57100 + }, + { + "epoch": 8.466666666666667, + "grad_norm": 4.375204086303711, + "learning_rate": 1.7576223282700255e-07, + "loss": 1.6566, + "step": 57150 + }, + { + "epoch": 8.474074074074075, + "grad_norm": 4.772896766662598, + "learning_rate": 1.7092904676891509e-07, + "loss": 1.724, + "step": 57200 + }, + { + "epoch": 8.481481481481481, + "grad_norm": 6.058934211730957, + "learning_rate": 1.6616267015762799e-07, + "loss": 1.7495, + "step": 57250 + }, + { + "epoch": 8.488888888888889, + "grad_norm": 5.737463474273682, + "learning_rate": 1.6146313539081026e-07, + "loss": 1.76, + "step": 57300 + }, + { + "epoch": 8.496296296296297, + "grad_norm": 4.475915431976318, + "learning_rate": 1.5683047441179656e-07, + "loss": 1.6865, + "step": 57350 + }, + { + "epoch": 8.503703703703703, + "grad_norm": 4.296292781829834, + "learning_rate": 1.522647187093751e-07, + "loss": 1.7438, + "step": 57400 + }, + { + "epoch": 8.511111111111111, + "grad_norm": 5.862701416015625, + "learning_rate": 1.4776589931756902e-07, + "loss": 1.7516, + "step": 57450 + }, + { + "epoch": 8.518518518518519, + "grad_norm": 5.117536544799805, + "learning_rate": 1.4333404681542428e-07, + "loss": 1.7426, + "step": 57500 + }, + { + "epoch": 8.525925925925925, + "grad_norm": 4.911297798156738, + "learning_rate": 1.3896919132680875e-07, + "loss": 1.7246, + "step": 57550 + }, + { + "epoch": 8.533333333333333, + "grad_norm": 6.487166881561279, + "learning_rate": 1.346713625202001e-07, + "loss": 1.7129, + "step": 57600 + }, + { + "epoch": 8.540740740740741, + "grad_norm": 5.194836616516113, + "learning_rate": 1.3044058960848815e-07, + "loss": 1.6967, + "step": 57650 + }, + { + "epoch": 8.548148148148147, + "grad_norm": 4.895190238952637, + "learning_rate": 1.2627690134877524e-07, + "loss": 1.7175, + "step": 57700 + }, + { + "epoch": 8.555555555555555, + "grad_norm": 4.806921005249023, + "learning_rate": 1.2218032604218056e-07, + "loss": 1.6729, + "step": 57750 + }, + { + "epoch": 8.562962962962963, + "grad_norm": 5.2344865798950195, + "learning_rate": 1.1815089153364711e-07, + "loss": 1.7009, + "step": 57800 + }, + { + "epoch": 8.57037037037037, + "grad_norm": 5.767990589141846, + "learning_rate": 1.1418862521175634e-07, + "loss": 1.7572, + "step": 57850 + }, + { + "epoch": 8.577777777777778, + "grad_norm": 4.904351711273193, + "learning_rate": 1.1029355400853481e-07, + "loss": 1.6867, + "step": 57900 + }, + { + "epoch": 8.585185185185185, + "grad_norm": 5.978799343109131, + "learning_rate": 1.0646570439928006e-07, + "loss": 1.7102, + "step": 57950 + }, + { + "epoch": 8.592592592592592, + "grad_norm": 5.98206901550293, + "learning_rate": 1.0270510240236953e-07, + "loss": 1.7309, + "step": 58000 + }, + { + "epoch": 8.6, + "grad_norm": 5.000571250915527, + "learning_rate": 9.901177357909742e-08, + "loss": 1.7079, + "step": 58050 + }, + { + "epoch": 8.607407407407408, + "grad_norm": 5.031994342803955, + "learning_rate": 9.538574303348813e-08, + "loss": 1.6487, + "step": 58100 + }, + { + "epoch": 8.614814814814816, + "grad_norm": 7.2956671714782715, + "learning_rate": 9.182703541213423e-08, + "loss": 1.6381, + "step": 58150 + }, + { + "epoch": 8.622222222222222, + "grad_norm": 5.399662971496582, + "learning_rate": 8.833567490402206e-08, + "loss": 1.7005, + "step": 58200 + }, + { + "epoch": 8.62962962962963, + "grad_norm": 5.364224433898926, + "learning_rate": 8.49116852403764e-08, + "loss": 1.6661, + "step": 58250 + }, + { + "epoch": 8.637037037037038, + "grad_norm": 4.17866325378418, + "learning_rate": 8.155508969448944e-08, + "loss": 1.5835, + "step": 58300 + }, + { + "epoch": 8.644444444444444, + "grad_norm": 5.372631072998047, + "learning_rate": 7.826591108156867e-08, + "loss": 1.6723, + "step": 58350 + }, + { + "epoch": 8.651851851851852, + "grad_norm": 4.693414688110352, + "learning_rate": 7.504417175858036e-08, + "loss": 1.6853, + "step": 58400 + }, + { + "epoch": 8.65925925925926, + "grad_norm": 5.828296661376953, + "learning_rate": 7.188989362409638e-08, + "loss": 1.7392, + "step": 58450 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 4.067485809326172, + "learning_rate": 6.880309811814757e-08, + "loss": 1.6361, + "step": 58500 + }, + { + "epoch": 8.674074074074074, + "grad_norm": 4.196707248687744, + "learning_rate": 6.578380622207503e-08, + "loss": 1.6795, + "step": 58550 + }, + { + "epoch": 8.681481481481482, + "grad_norm": 5.865671634674072, + "learning_rate": 6.283203845839137e-08, + "loss": 1.6751, + "step": 58600 + }, + { + "epoch": 8.688888888888888, + "grad_norm": 4.712571620941162, + "learning_rate": 5.994781489063738e-08, + "loss": 1.6887, + "step": 58650 + }, + { + "epoch": 8.696296296296296, + "grad_norm": 4.615510940551758, + "learning_rate": 5.713115512324674e-08, + "loss": 1.6539, + "step": 58700 + }, + { + "epoch": 8.703703703703704, + "grad_norm": 4.818018913269043, + "learning_rate": 5.438207830141706e-08, + "loss": 1.6824, + "step": 58750 + }, + { + "epoch": 8.71111111111111, + "grad_norm": 5.266839981079102, + "learning_rate": 5.1700603110971246e-08, + "loss": 1.6686, + "step": 58800 + }, + { + "epoch": 8.718518518518518, + "grad_norm": 7.0362229347229, + "learning_rate": 4.908674777823863e-08, + "loss": 1.7219, + "step": 58850 + }, + { + "epoch": 8.725925925925926, + "grad_norm": 3.8932015895843506, + "learning_rate": 4.6540530069927317e-08, + "loss": 1.7637, + "step": 58900 + }, + { + "epoch": 8.733333333333333, + "grad_norm": 5.245306015014648, + "learning_rate": 4.406196729300094e-08, + "loss": 1.6668, + "step": 58950 + }, + { + "epoch": 8.74074074074074, + "grad_norm": 6.06035852432251, + "learning_rate": 4.165107629456877e-08, + "loss": 1.7053, + "step": 59000 + }, + { + "epoch": 8.748148148148148, + "grad_norm": 5.4172539710998535, + "learning_rate": 3.930787346176357e-08, + "loss": 1.7195, + "step": 59050 + }, + { + "epoch": 8.755555555555556, + "grad_norm": 4.596226215362549, + "learning_rate": 3.7032374721632794e-08, + "loss": 1.6544, + "step": 59100 + }, + { + "epoch": 8.762962962962963, + "grad_norm": 4.076667308807373, + "learning_rate": 3.482459554102979e-08, + "loss": 1.656, + "step": 59150 + }, + { + "epoch": 8.77037037037037, + "grad_norm": 6.138818264007568, + "learning_rate": 3.2684550926512795e-08, + "loss": 1.7229, + "step": 59200 + }, + { + "epoch": 8.777777777777779, + "grad_norm": 3.7312471866607666, + "learning_rate": 3.061225542423718e-08, + "loss": 1.7003, + "step": 59250 + }, + { + "epoch": 8.785185185185185, + "grad_norm": 4.663834571838379, + "learning_rate": 2.8607723119858932e-08, + "loss": 1.729, + "step": 59300 + }, + { + "epoch": 8.792592592592593, + "grad_norm": 4.302097797393799, + "learning_rate": 2.6670967638439127e-08, + "loss": 1.7085, + "step": 59350 + }, + { + "epoch": 8.8, + "grad_norm": 4.932514667510986, + "learning_rate": 2.48020021443518e-08, + "loss": 1.6708, + "step": 59400 + }, + { + "epoch": 8.807407407407407, + "grad_norm": 3.9513580799102783, + "learning_rate": 2.3000839341192905e-08, + "loss": 1.7047, + "step": 59450 + }, + { + "epoch": 8.814814814814815, + "grad_norm": 5.340576648712158, + "learning_rate": 2.1267491471697043e-08, + "loss": 1.6384, + "step": 59500 + }, + { + "epoch": 8.822222222222223, + "grad_norm": 4.683200359344482, + "learning_rate": 1.9601970317647546e-08, + "loss": 1.6958, + "step": 59550 + }, + { + "epoch": 8.829629629629629, + "grad_norm": 4.607886791229248, + "learning_rate": 1.8004287199805403e-08, + "loss": 1.7749, + "step": 59600 + }, + { + "epoch": 8.837037037037037, + "grad_norm": 5.207304000854492, + "learning_rate": 1.6474452977827127e-08, + "loss": 1.7023, + "step": 59650 + }, + { + "epoch": 8.844444444444445, + "grad_norm": 4.992751598358154, + "learning_rate": 1.501247805018924e-08, + "loss": 1.7879, + "step": 59700 + }, + { + "epoch": 8.851851851851851, + "grad_norm": 5.176036834716797, + "learning_rate": 1.3618372354121668e-08, + "loss": 1.7696, + "step": 59750 + }, + { + "epoch": 8.85925925925926, + "grad_norm": 6.48920202255249, + "learning_rate": 1.22921453655378e-08, + "loss": 1.7325, + "step": 59800 + }, + { + "epoch": 8.866666666666667, + "grad_norm": 5.500480651855469, + "learning_rate": 1.103380609897342e-08, + "loss": 1.7892, + "step": 59850 + }, + { + "epoch": 8.874074074074073, + "grad_norm": 5.276499271392822, + "learning_rate": 9.843363107518988e-09, + "loss": 1.7182, + "step": 59900 + }, + { + "epoch": 8.881481481481481, + "grad_norm": 5.408885955810547, + "learning_rate": 8.720824482767453e-09, + "loss": 1.6658, + "step": 59950 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 4.236413955688477, + "learning_rate": 7.66619785475653e-09, + "loss": 1.6929, + "step": 60000 + }, + { + "epoch": 8.896296296296295, + "grad_norm": 4.405470371246338, + "learning_rate": 6.6794903919187306e-09, + "loss": 1.6295, + "step": 60050 + }, + { + "epoch": 8.903703703703703, + "grad_norm": 4.904160022735596, + "learning_rate": 5.7607088010291914e-09, + "loss": 1.6921, + "step": 60100 + }, + { + "epoch": 8.911111111111111, + "grad_norm": 6.341813564300537, + "learning_rate": 4.90985932716459e-09, + "loss": 1.7889, + "step": 60150 + }, + { + "epoch": 8.918518518518518, + "grad_norm": 6.032997131347656, + "learning_rate": 4.126947753655408e-09, + "loss": 1.7337, + "step": 60200 + }, + { + "epoch": 8.925925925925926, + "grad_norm": 4.988321781158447, + "learning_rate": 3.4119794020526233e-09, + "loss": 1.6482, + "step": 60250 + }, + { + "epoch": 8.933333333333334, + "grad_norm": 4.9094672203063965, + "learning_rate": 2.764959132086631e-09, + "loss": 1.7243, + "step": 60300 + }, + { + "epoch": 8.940740740740742, + "grad_norm": 4.590478897094727, + "learning_rate": 2.1858913416372696e-09, + "loss": 1.7556, + "step": 60350 + }, + { + "epoch": 8.948148148148148, + "grad_norm": 6.892956256866455, + "learning_rate": 1.6747799667005128e-09, + "loss": 1.6658, + "step": 60400 + }, + { + "epoch": 8.955555555555556, + "grad_norm": 5.133352756500244, + "learning_rate": 1.2316284813673751e-09, + "loss": 1.6599, + "step": 60450 + }, + { + "epoch": 8.962962962962964, + "grad_norm": 4.256348609924316, + "learning_rate": 8.56439897793937e-10, + "loss": 1.6971, + "step": 60500 + }, + { + "epoch": 8.97037037037037, + "grad_norm": 4.697518825531006, + "learning_rate": 5.492167661846903e-10, + "loss": 1.6753, + "step": 60550 + }, + { + "epoch": 8.977777777777778, + "grad_norm": 7.553053855895996, + "learning_rate": 3.099611747747755e-10, + "loss": 1.706, + "step": 60600 + }, + { + "epoch": 8.985185185185186, + "grad_norm": 4.563235282897949, + "learning_rate": 1.3867474981443807e-10, + "loss": 1.6904, + "step": 60650 + }, + { + "epoch": 8.992592592592592, + "grad_norm": 5.773245334625244, + "learning_rate": 3.5358655559036834e-11, + "loss": 1.8243, + "step": 60700 + }, + { + "epoch": 9.0, + "grad_norm": 5.451295852661133, + "learning_rate": 1.359426238245476e-14, + "loss": 1.6434, + "step": 60750 + }, + { + "epoch": 9.007407407407408, + "grad_norm": 4.893211841583252, + "learning_rate": 4.895797438496442e-07, + "loss": 1.7101, + "step": 60800 + }, + { + "epoch": 9.014814814814814, + "grad_norm": 5.335631370544434, + "learning_rate": 4.823600249171412e-07, + "loss": 1.7203, + "step": 60850 + }, + { + "epoch": 9.022222222222222, + "grad_norm": 6.353170871734619, + "learning_rate": 4.7519262014055324e-07, + "loss": 1.6979, + "step": 60900 + }, + { + "epoch": 9.02962962962963, + "grad_norm": 6.264624118804932, + "learning_rate": 4.6807756891585677e-07, + "loss": 1.7308, + "step": 60950 + }, + { + "epoch": 9.037037037037036, + "grad_norm": 3.9906938076019287, + "learning_rate": 4.610149103512673e-07, + "loss": 1.728, + "step": 61000 + }, + { + "epoch": 9.044444444444444, + "grad_norm": 5.856201648712158, + "learning_rate": 4.540046832670175e-07, + "loss": 1.7142, + "step": 61050 + }, + { + "epoch": 9.051851851851852, + "grad_norm": 4.669765949249268, + "learning_rate": 4.4704692619515045e-07, + "loss": 1.6873, + "step": 61100 + }, + { + "epoch": 9.059259259259258, + "grad_norm": 4.824378967285156, + "learning_rate": 4.4014167737930656e-07, + "loss": 1.6526, + "step": 61150 + }, + { + "epoch": 9.066666666666666, + "grad_norm": 4.9388041496276855, + "learning_rate": 4.332889747745095e-07, + "loss": 1.6905, + "step": 61200 + }, + { + "epoch": 9.074074074074074, + "grad_norm": 4.846113204956055, + "learning_rate": 4.2648885604696267e-07, + "loss": 1.6202, + "step": 61250 + }, + { + "epoch": 9.081481481481482, + "grad_norm": 6.360806465148926, + "learning_rate": 4.197413585738408e-07, + "loss": 1.7341, + "step": 61300 + }, + { + "epoch": 9.088888888888889, + "grad_norm": 5.023980140686035, + "learning_rate": 4.130465194430766e-07, + "loss": 1.6094, + "step": 61350 + }, + { + "epoch": 9.096296296296297, + "grad_norm": 4.734802722930908, + "learning_rate": 4.064043754531699e-07, + "loss": 1.6848, + "step": 61400 + }, + { + "epoch": 9.103703703703705, + "grad_norm": 5.307347774505615, + "learning_rate": 3.998149631129788e-07, + "loss": 1.6636, + "step": 61450 + }, + { + "epoch": 9.11111111111111, + "grad_norm": 5.5121235847473145, + "learning_rate": 3.932783186415179e-07, + "loss": 1.7264, + "step": 61500 + }, + { + "epoch": 9.118518518518519, + "grad_norm": 4.755031585693359, + "learning_rate": 3.8679447796776016e-07, + "loss": 1.7101, + "step": 61550 + }, + { + "epoch": 9.125925925925927, + "grad_norm": 6.262842655181885, + "learning_rate": 3.8036347673044316e-07, + "loss": 1.695, + "step": 61600 + }, + { + "epoch": 9.133333333333333, + "grad_norm": 5.596631050109863, + "learning_rate": 3.7398535027786455e-07, + "loss": 1.7107, + "step": 61650 + }, + { + "epoch": 9.14074074074074, + "grad_norm": 4.791952610015869, + "learning_rate": 3.676601336676988e-07, + "loss": 1.7179, + "step": 61700 + }, + { + "epoch": 9.148148148148149, + "grad_norm": 5.435273170471191, + "learning_rate": 3.613878616667954e-07, + "loss": 1.7008, + "step": 61750 + }, + { + "epoch": 9.155555555555555, + "grad_norm": 4.244542121887207, + "learning_rate": 3.5516856875099314e-07, + "loss": 1.7337, + "step": 61800 + }, + { + "epoch": 9.162962962962963, + "grad_norm": 5.632518768310547, + "learning_rate": 3.490022891049283e-07, + "loss": 1.6774, + "step": 61850 + }, + { + "epoch": 9.170370370370371, + "grad_norm": 5.066909313201904, + "learning_rate": 3.428890566218457e-07, + "loss": 1.643, + "step": 61900 + }, + { + "epoch": 9.177777777777777, + "grad_norm": 5.419461250305176, + "learning_rate": 3.368289049034179e-07, + "loss": 1.6886, + "step": 61950 + }, + { + "epoch": 9.185185185185185, + "grad_norm": 4.569416522979736, + "learning_rate": 3.30821867259552e-07, + "loss": 1.6416, + "step": 62000 + }, + { + "epoch": 9.192592592592593, + "grad_norm": 5.561820983886719, + "learning_rate": 3.24867976708213e-07, + "loss": 1.7122, + "step": 62050 + }, + { + "epoch": 9.2, + "grad_norm": 5.220768928527832, + "learning_rate": 3.1896726597524074e-07, + "loss": 1.6168, + "step": 62100 + }, + { + "epoch": 9.207407407407407, + "grad_norm": 4.5533528327941895, + "learning_rate": 3.1311976749416997e-07, + "loss": 1.6806, + "step": 62150 + }, + { + "epoch": 9.214814814814815, + "grad_norm": 4.327883243560791, + "learning_rate": 3.0732551340605046e-07, + "loss": 1.6956, + "step": 62200 + }, + { + "epoch": 9.222222222222221, + "grad_norm": 5.154954433441162, + "learning_rate": 3.015845355592728e-07, + "loss": 1.7016, + "step": 62250 + }, + { + "epoch": 9.22962962962963, + "grad_norm": 4.6678571701049805, + "learning_rate": 2.958968655093919e-07, + "loss": 1.6852, + "step": 62300 + }, + { + "epoch": 9.237037037037037, + "grad_norm": 5.951033115386963, + "learning_rate": 2.9026253451895357e-07, + "loss": 1.6981, + "step": 62350 + }, + { + "epoch": 9.244444444444444, + "grad_norm": 4.83495569229126, + "learning_rate": 2.846815735573227e-07, + "loss": 1.7676, + "step": 62400 + }, + { + "epoch": 9.251851851851852, + "grad_norm": 4.016190052032471, + "learning_rate": 2.791540133005144e-07, + "loss": 1.6907, + "step": 62450 + }, + { + "epoch": 9.25925925925926, + "grad_norm": 5.682389736175537, + "learning_rate": 2.73679884131024e-07, + "loss": 1.7596, + "step": 62500 + }, + { + "epoch": 9.266666666666667, + "grad_norm": 5.448666095733643, + "learning_rate": 2.682592161376607e-07, + "loss": 1.7473, + "step": 62550 + }, + { + "epoch": 9.274074074074074, + "grad_norm": 6.466638565063477, + "learning_rate": 2.6289203911537884e-07, + "loss": 1.6856, + "step": 62600 + }, + { + "epoch": 9.281481481481482, + "grad_norm": 5.566060543060303, + "learning_rate": 2.575783825651201e-07, + "loss": 1.7308, + "step": 62650 + }, + { + "epoch": 9.28888888888889, + "grad_norm": 4.0370025634765625, + "learning_rate": 2.5231827569365044e-07, + "loss": 1.7389, + "step": 62700 + }, + { + "epoch": 9.296296296296296, + "grad_norm": 5.2997727394104, + "learning_rate": 2.4711174741338996e-07, + "loss": 1.7766, + "step": 62750 + }, + { + "epoch": 9.303703703703704, + "grad_norm": 4.835376739501953, + "learning_rate": 2.419588263422701e-07, + "loss": 1.6751, + "step": 62800 + }, + { + "epoch": 9.311111111111112, + "grad_norm": 4.635543346405029, + "learning_rate": 2.3685954080356345e-07, + "loss": 1.6458, + "step": 62850 + }, + { + "epoch": 9.318518518518518, + "grad_norm": 5.237940311431885, + "learning_rate": 2.318139188257318e-07, + "loss": 1.6964, + "step": 62900 + }, + { + "epoch": 9.325925925925926, + "grad_norm": 4.608352184295654, + "learning_rate": 2.2682198814227395e-07, + "loss": 1.7155, + "step": 62950 + }, + { + "epoch": 9.333333333333334, + "grad_norm": 6.122480392456055, + "learning_rate": 2.2188377619157374e-07, + "loss": 1.7642, + "step": 63000 + }, + { + "epoch": 9.34074074074074, + "grad_norm": 5.679895401000977, + "learning_rate": 2.1699931011674225e-07, + "loss": 1.7617, + "step": 63050 + }, + { + "epoch": 9.348148148148148, + "grad_norm": 6.1035003662109375, + "learning_rate": 2.1216861676547684e-07, + "loss": 1.7427, + "step": 63100 + }, + { + "epoch": 9.355555555555556, + "grad_norm": 5.812370300292969, + "learning_rate": 2.073917226899147e-07, + "loss": 1.7262, + "step": 63150 + }, + { + "epoch": 9.362962962962962, + "grad_norm": 5.102630615234375, + "learning_rate": 2.026686541464773e-07, + "loss": 1.7531, + "step": 63200 + }, + { + "epoch": 9.37037037037037, + "grad_norm": 5.473056793212891, + "learning_rate": 1.9799943709573166e-07, + "loss": 1.7032, + "step": 63250 + }, + { + "epoch": 9.377777777777778, + "grad_norm": 5.401584625244141, + "learning_rate": 1.9338409720224938e-07, + "loss": 1.7035, + "step": 63300 + }, + { + "epoch": 9.385185185185184, + "grad_norm": 4.778750419616699, + "learning_rate": 1.8882265983446558e-07, + "loss": 1.6414, + "step": 63350 + }, + { + "epoch": 9.392592592592592, + "grad_norm": 4.272122859954834, + "learning_rate": 1.8431515006453127e-07, + "loss": 1.7321, + "step": 63400 + }, + { + "epoch": 9.4, + "grad_norm": 4.675436496734619, + "learning_rate": 1.7986159266818904e-07, + "loss": 1.7496, + "step": 63450 + }, + { + "epoch": 9.407407407407407, + "grad_norm": 5.439012050628662, + "learning_rate": 1.7546201212462642e-07, + "loss": 1.7051, + "step": 63500 + }, + { + "epoch": 9.414814814814815, + "grad_norm": 4.923573970794678, + "learning_rate": 1.7111643261634502e-07, + "loss": 1.6861, + "step": 63550 + }, + { + "epoch": 9.422222222222222, + "grad_norm": 4.010578632354736, + "learning_rate": 1.6682487802902493e-07, + "loss": 1.6795, + "step": 63600 + }, + { + "epoch": 9.42962962962963, + "grad_norm": 5.064062118530273, + "learning_rate": 1.625873719514004e-07, + "loss": 1.6973, + "step": 63650 + }, + { + "epoch": 9.437037037037037, + "grad_norm": 4.9291887283325195, + "learning_rate": 1.5840393767512118e-07, + "loss": 1.6806, + "step": 63700 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 5.1407341957092285, + "learning_rate": 1.542745981946303e-07, + "loss": 1.7001, + "step": 63750 + }, + { + "epoch": 9.451851851851853, + "grad_norm": 5.599128723144531, + "learning_rate": 1.5019937620703862e-07, + "loss": 1.7244, + "step": 63800 + }, + { + "epoch": 9.459259259259259, + "grad_norm": 4.7101945877075195, + "learning_rate": 1.4617829411199492e-07, + "loss": 1.7128, + "step": 63850 + }, + { + "epoch": 9.466666666666667, + "grad_norm": 4.43619966506958, + "learning_rate": 1.4221137401156492e-07, + "loss": 1.6842, + "step": 63900 + }, + { + "epoch": 9.474074074074075, + "grad_norm": 4.620861053466797, + "learning_rate": 1.3829863771011253e-07, + "loss": 1.6377, + "step": 63950 + }, + { + "epoch": 9.481481481481481, + "grad_norm": 4.613444805145264, + "learning_rate": 1.344401067141754e-07, + "loss": 1.6818, + "step": 64000 + }, + { + "epoch": 9.488888888888889, + "grad_norm": 5.942145347595215, + "learning_rate": 1.3063580223235284e-07, + "loss": 1.6883, + "step": 64050 + }, + { + "epoch": 9.496296296296297, + "grad_norm": 4.702767848968506, + "learning_rate": 1.268857451751826e-07, + "loss": 1.7628, + "step": 64100 + }, + { + "epoch": 9.503703703703703, + "grad_norm": 4.619032382965088, + "learning_rate": 1.2318995615502983e-07, + "loss": 1.7036, + "step": 64150 + }, + { + "epoch": 9.511111111111111, + "grad_norm": 4.408839702606201, + "learning_rate": 1.1954845548597162e-07, + "loss": 1.7135, + "step": 64200 + }, + { + "epoch": 9.518518518518519, + "grad_norm": 6.090911388397217, + "learning_rate": 1.1596126318368928e-07, + "loss": 1.7363, + "step": 64250 + }, + { + "epoch": 9.525925925925925, + "grad_norm": 6.030341148376465, + "learning_rate": 1.1242839896535407e-07, + "loss": 1.6981, + "step": 64300 + }, + { + "epoch": 9.533333333333333, + "grad_norm": 6.468939304351807, + "learning_rate": 1.089498822495183e-07, + "loss": 1.6546, + "step": 64350 + }, + { + "epoch": 9.540740740740741, + "grad_norm": 4.483176231384277, + "learning_rate": 1.0552573215601436e-07, + "loss": 1.7018, + "step": 64400 + }, + { + "epoch": 9.548148148148147, + "grad_norm": 4.81044864654541, + "learning_rate": 1.0215596750584588e-07, + "loss": 1.7386, + "step": 64450 + }, + { + "epoch": 9.555555555555555, + "grad_norm": 5.380277633666992, + "learning_rate": 9.88406068210801e-08, + "loss": 1.634, + "step": 64500 + }, + { + "epoch": 9.562962962962963, + "grad_norm": 5.343459606170654, + "learning_rate": 9.557966832475341e-08, + "loss": 1.7023, + "step": 64550 + }, + { + "epoch": 9.57037037037037, + "grad_norm": 4.430337905883789, + "learning_rate": 9.237316994076929e-08, + "loss": 1.6313, + "step": 64600 + }, + { + "epoch": 9.577777777777778, + "grad_norm": 7.060146331787109, + "learning_rate": 8.922112929379501e-08, + "loss": 1.7565, + "step": 64650 + }, + { + "epoch": 9.585185185185185, + "grad_norm": 5.158222675323486, + "learning_rate": 8.612356370917174e-08, + "loss": 1.6618, + "step": 64700 + }, + { + "epoch": 9.592592592592592, + "grad_norm": 4.8023362159729, + "learning_rate": 8.308049021281461e-08, + "loss": 1.729, + "step": 64750 + }, + { + "epoch": 9.6, + "grad_norm": 5.179930686950684, + "learning_rate": 8.009192553111833e-08, + "loss": 1.7514, + "step": 64800 + }, + { + "epoch": 9.607407407407408, + "grad_norm": 4.108941078186035, + "learning_rate": 7.715788609087171e-08, + "loss": 1.7692, + "step": 64850 + }, + { + "epoch": 9.614814814814816, + "grad_norm": 4.883074760437012, + "learning_rate": 7.427838801915887e-08, + "loss": 1.6442, + "step": 64900 + }, + { + "epoch": 9.622222222222222, + "grad_norm": 4.569499969482422, + "learning_rate": 7.145344714327707e-08, + "loss": 1.6202, + "step": 64950 + }, + { + "epoch": 9.62962962962963, + "grad_norm": 4.791108131408691, + "learning_rate": 6.868307899064675e-08, + "loss": 1.7196, + "step": 65000 + }, + { + "epoch": 9.637037037037038, + "grad_norm": 6.292260646820068, + "learning_rate": 6.59672987887272e-08, + "loss": 1.7988, + "step": 65050 + }, + { + "epoch": 9.644444444444444, + "grad_norm": 5.572513103485107, + "learning_rate": 6.330612146492998e-08, + "loss": 1.7117, + "step": 65100 + }, + { + "epoch": 9.651851851851852, + "grad_norm": 6.622492790222168, + "learning_rate": 6.069956164654445e-08, + "loss": 1.7011, + "step": 65150 + }, + { + "epoch": 9.65925925925926, + "grad_norm": 5.106518268585205, + "learning_rate": 5.8147633660647904e-08, + "loss": 1.6777, + "step": 65200 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 4.711566925048828, + "learning_rate": 5.565035153403231e-08, + "loss": 1.7627, + "step": 65250 + }, + { + "epoch": 9.674074074074074, + "grad_norm": 5.837401866912842, + "learning_rate": 5.320772899312654e-08, + "loss": 1.7406, + "step": 65300 + }, + { + "epoch": 9.681481481481482, + "grad_norm": 5.084963798522949, + "learning_rate": 5.081977946392092e-08, + "loss": 1.6659, + "step": 65350 + }, + { + "epoch": 9.688888888888888, + "grad_norm": 5.511571884155273, + "learning_rate": 4.84865160718917e-08, + "loss": 1.7159, + "step": 65400 + }, + { + "epoch": 9.696296296296296, + "grad_norm": 6.268984317779541, + "learning_rate": 4.620795164193004e-08, + "loss": 1.7017, + "step": 65450 + }, + { + "epoch": 9.703703703703704, + "grad_norm": 4.758915901184082, + "learning_rate": 4.3984098698274245e-08, + "loss": 1.6664, + "step": 65500 + }, + { + "epoch": 9.71111111111111, + "grad_norm": 4.562999725341797, + "learning_rate": 4.181496946443653e-08, + "loss": 1.7089, + "step": 65550 + }, + { + "epoch": 9.718518518518518, + "grad_norm": 5.656510829925537, + "learning_rate": 3.970057586313747e-08, + "loss": 1.7346, + "step": 65600 + }, + { + "epoch": 9.725925925925926, + "grad_norm": 5.465713977813721, + "learning_rate": 3.764092951623943e-08, + "loss": 1.6746, + "step": 65650 + }, + { + "epoch": 9.733333333333333, + "grad_norm": 5.345441818237305, + "learning_rate": 3.563604174468771e-08, + "loss": 1.7312, + "step": 65700 + }, + { + "epoch": 9.74074074074074, + "grad_norm": 5.494285583496094, + "learning_rate": 3.368592356844058e-08, + "loss": 1.7035, + "step": 65750 + }, + { + "epoch": 9.748148148148148, + "grad_norm": 5.0170135498046875, + "learning_rate": 3.179058570641602e-08, + "loss": 1.6879, + "step": 65800 + }, + { + "epoch": 9.755555555555556, + "grad_norm": 6.04074239730835, + "learning_rate": 2.99500385764262e-08, + "loss": 1.6952, + "step": 65850 + }, + { + "epoch": 9.762962962962963, + "grad_norm": 4.81399393081665, + "learning_rate": 2.8164292295125294e-08, + "loss": 1.7135, + "step": 65900 + }, + { + "epoch": 9.77037037037037, + "grad_norm": 4.348941802978516, + "learning_rate": 2.6433356677952883e-08, + "loss": 1.6335, + "step": 65950 + }, + { + "epoch": 9.777777777777779, + "grad_norm": 4.942220211029053, + "learning_rate": 2.47572412390773e-08, + "loss": 1.7366, + "step": 66000 + }, + { + "epoch": 9.785185185185185, + "grad_norm": 4.79942512512207, + "learning_rate": 2.3135955191345704e-08, + "loss": 1.6299, + "step": 66050 + }, + { + "epoch": 9.792592592592593, + "grad_norm": 5.222789287567139, + "learning_rate": 2.1569507446232983e-08, + "loss": 1.7802, + "step": 66100 + }, + { + "epoch": 9.8, + "grad_norm": 5.1869940757751465, + "learning_rate": 2.0057906613792922e-08, + "loss": 1.7028, + "step": 66150 + }, + { + "epoch": 9.807407407407407, + "grad_norm": 4.29283332824707, + "learning_rate": 1.8601161002611555e-08, + "loss": 1.7188, + "step": 66200 + }, + { + "epoch": 9.814814814814815, + "grad_norm": 4.335317134857178, + "learning_rate": 1.719927861975834e-08, + "loss": 1.6448, + "step": 66250 + }, + { + "epoch": 9.822222222222223, + "grad_norm": 4.7093634605407715, + "learning_rate": 1.585226717074728e-08, + "loss": 1.7305, + "step": 66300 + }, + { + "epoch": 9.829629629629629, + "grad_norm": 5.174459934234619, + "learning_rate": 1.4560134059488084e-08, + "loss": 1.6758, + "step": 66350 + }, + { + "epoch": 9.837037037037037, + "grad_norm": 4.238924503326416, + "learning_rate": 1.3322886388252854e-08, + "loss": 1.6736, + "step": 66400 + }, + { + "epoch": 9.844444444444445, + "grad_norm": 5.224400043487549, + "learning_rate": 1.214053095763168e-08, + "loss": 1.8016, + "step": 66450 + }, + { + "epoch": 9.851851851851851, + "grad_norm": 4.289679527282715, + "learning_rate": 1.1013074266496005e-08, + "loss": 1.6809, + "step": 66500 + }, + { + "epoch": 9.85925925925926, + "grad_norm": 5.104315280914307, + "learning_rate": 9.940522511965311e-09, + "loss": 1.7685, + "step": 66550 + }, + { + "epoch": 9.866666666666667, + "grad_norm": 6.96699857711792, + "learning_rate": 8.922881589369381e-09, + "loss": 1.7857, + "step": 66600 + }, + { + "epoch": 9.874074074074073, + "grad_norm": 6.210925102233887, + "learning_rate": 7.960157092221644e-09, + "loss": 1.6882, + "step": 66650 + }, + { + "epoch": 9.881481481481481, + "grad_norm": 4.829061985015869, + "learning_rate": 7.052354312180321e-09, + "loss": 1.6723, + "step": 66700 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 5.419042587280273, + "learning_rate": 6.199478239027334e-09, + "loss": 1.719, + "step": 66750 + }, + { + "epoch": 9.896296296296295, + "grad_norm": 4.705636501312256, + "learning_rate": 5.401533560636107e-09, + "loss": 1.6663, + "step": 66800 + }, + { + "epoch": 9.903703703703703, + "grad_norm": 4.650430202484131, + "learning_rate": 4.658524662947139e-09, + "loss": 1.6806, + "step": 66850 + }, + { + "epoch": 9.911111111111111, + "grad_norm": 6.103776931762695, + "learning_rate": 3.970455629942471e-09, + "loss": 1.7272, + "step": 66900 + }, + { + "epoch": 9.918518518518518, + "grad_norm": 4.458265781402588, + "learning_rate": 3.337330243627923e-09, + "loss": 1.6886, + "step": 66950 + }, + { + "epoch": 9.925925925925926, + "grad_norm": 4.948592662811279, + "learning_rate": 2.7591519840064473e-09, + "loss": 1.6957, + "step": 67000 + }, + { + "epoch": 9.933333333333334, + "grad_norm": 4.904621601104736, + "learning_rate": 2.2359240290614757e-09, + "loss": 1.68, + "step": 67050 + }, + { + "epoch": 9.940740740740742, + "grad_norm": 4.690095901489258, + "learning_rate": 1.7676492547402668e-09, + "loss": 1.7341, + "step": 67100 + }, + { + "epoch": 9.948148148148148, + "grad_norm": 7.331205368041992, + "learning_rate": 1.354330234936141e-09, + "loss": 1.7303, + "step": 67150 + }, + { + "epoch": 9.955555555555556, + "grad_norm": 5.197513103485107, + "learning_rate": 9.959692414784893e-10, + "loss": 1.7282, + "step": 67200 + }, + { + "epoch": 9.962962962962964, + "grad_norm": 6.541003227233887, + "learning_rate": 6.925682441150106e-10, + "loss": 1.6193, + "step": 67250 + }, + { + "epoch": 9.97037037037037, + "grad_norm": 5.752736568450928, + "learning_rate": 4.441289105017177e-10, + "loss": 1.6806, + "step": 67300 + }, + { + "epoch": 9.977777777777778, + "grad_norm": 4.675209999084473, + "learning_rate": 2.5065260619960843e-10, + "loss": 1.6589, + "step": 67350 + }, + { + "epoch": 9.985185185185186, + "grad_norm": 4.999233722686768, + "learning_rate": 1.121403946580113e-10, + "loss": 1.7465, + "step": 67400 + }, + { + "epoch": 9.992592592592592, + "grad_norm": 5.814984321594238, + "learning_rate": 2.859303721791662e-11, + "loss": 1.6395, + "step": 67450 + }, + { + "epoch": 10.0, + "grad_norm": 5.604928016662598, + "learning_rate": 1.0993098653599987e-14, + "loss": 1.7599, + "step": 67500 + }, + { + "epoch": 10.007407407407408, + "grad_norm": 5.701760292053223, + "learning_rate": 4.046448759873345e-07, + "loss": 1.7425, + "step": 67550 + }, + { + "epoch": 10.014814814814814, + "grad_norm": 5.556209564208984, + "learning_rate": 3.9866912851546425e-07, + "loss": 1.6211, + "step": 67600 + }, + { + "epoch": 10.022222222222222, + "grad_norm": 6.1393914222717285, + "learning_rate": 3.9273693694206084e-07, + "loss": 1.7484, + "step": 67650 + }, + { + "epoch": 10.02962962962963, + "grad_norm": 6.207096099853516, + "learning_rate": 3.8684832817817986e-07, + "loss": 1.6913, + "step": 67700 + }, + { + "epoch": 10.037037037037036, + "grad_norm": 5.700395107269287, + "learning_rate": 3.8100332893716174e-07, + "loss": 1.7092, + "step": 67750 + }, + { + "epoch": 10.044444444444444, + "grad_norm": 4.307992458343506, + "learning_rate": 3.752019657345196e-07, + "loss": 1.6546, + "step": 67800 + }, + { + "epoch": 10.051851851851852, + "grad_norm": 5.123924255371094, + "learning_rate": 3.694442648878105e-07, + "loss": 1.6957, + "step": 67850 + }, + { + "epoch": 10.059259259259258, + "grad_norm": 5.588382244110107, + "learning_rate": 3.6373025251652096e-07, + "loss": 1.7285, + "step": 67900 + }, + { + "epoch": 10.066666666666666, + "grad_norm": 5.349360466003418, + "learning_rate": 3.580599545419483e-07, + "loss": 1.6671, + "step": 67950 + }, + { + "epoch": 10.074074074074074, + "grad_norm": 4.467996120452881, + "learning_rate": 3.5243339668708075e-07, + "loss": 1.7021, + "step": 68000 + }, + { + "epoch": 10.081481481481482, + "grad_norm": 4.691610813140869, + "learning_rate": 3.468506044764808e-07, + "loss": 1.7114, + "step": 68050 + }, + { + "epoch": 10.088888888888889, + "grad_norm": 4.518200397491455, + "learning_rate": 3.413116032361741e-07, + "loss": 1.7254, + "step": 68100 + }, + { + "epoch": 10.096296296296297, + "grad_norm": 5.5028510093688965, + "learning_rate": 3.358164180935275e-07, + "loss": 1.7031, + "step": 68150 + }, + { + "epoch": 10.103703703703705, + "grad_norm": 4.488427639007568, + "learning_rate": 3.3036507397713915e-07, + "loss": 1.7363, + "step": 68200 + }, + { + "epoch": 10.11111111111111, + "grad_norm": 4.6745381355285645, + "learning_rate": 3.2495759561672837e-07, + "loss": 1.7013, + "step": 68250 + }, + { + "epoch": 10.118518518518519, + "grad_norm": 6.56805419921875, + "learning_rate": 3.195940075430137e-07, + "loss": 1.7279, + "step": 68300 + }, + { + "epoch": 10.125925925925927, + "grad_norm": 6.056013584136963, + "learning_rate": 3.142743340876131e-07, + "loss": 1.6627, + "step": 68350 + }, + { + "epoch": 10.133333333333333, + "grad_norm": 4.359686374664307, + "learning_rate": 3.089985993829281e-07, + "loss": 1.6903, + "step": 68400 + }, + { + "epoch": 10.14074074074074, + "grad_norm": 4.175838470458984, + "learning_rate": 3.0376682736202866e-07, + "loss": 1.6757, + "step": 68450 + }, + { + "epoch": 10.148148148148149, + "grad_norm": 4.473140716552734, + "learning_rate": 2.985790417585588e-07, + "loss": 1.7066, + "step": 68500 + }, + { + "epoch": 10.155555555555555, + "grad_norm": 4.994370937347412, + "learning_rate": 2.9343526610661534e-07, + "loss": 1.6348, + "step": 68550 + }, + { + "epoch": 10.162962962962963, + "grad_norm": 4.70601224899292, + "learning_rate": 2.883355237406471e-07, + "loss": 1.7447, + "step": 68600 + }, + { + "epoch": 10.170370370370371, + "grad_norm": 4.815600395202637, + "learning_rate": 2.832798377953505e-07, + "loss": 1.6319, + "step": 68650 + }, + { + "epoch": 10.177777777777777, + "grad_norm": 5.1428937911987305, + "learning_rate": 2.7826823120555955e-07, + "loss": 1.7659, + "step": 68700 + }, + { + "epoch": 10.185185185185185, + "grad_norm": 5.623711109161377, + "learning_rate": 2.7330072670614604e-07, + "loss": 1.6637, + "step": 68750 + }, + { + "epoch": 10.192592592592593, + "grad_norm": 4.861164569854736, + "learning_rate": 2.683773468319173e-07, + "loss": 1.6388, + "step": 68800 + }, + { + "epoch": 10.2, + "grad_norm": 4.522395133972168, + "learning_rate": 2.6349811391750856e-07, + "loss": 1.6704, + "step": 68850 + }, + { + "epoch": 10.207407407407407, + "grad_norm": 4.430446147918701, + "learning_rate": 2.586630500972853e-07, + "loss": 1.6849, + "step": 68900 + }, + { + "epoch": 10.214814814814815, + "grad_norm": 4.729742527008057, + "learning_rate": 2.538721773052433e-07, + "loss": 1.6672, + "step": 68950 + }, + { + "epoch": 10.222222222222221, + "grad_norm": 6.109224319458008, + "learning_rate": 2.491255172749085e-07, + "loss": 1.6799, + "step": 69000 + }, + { + "epoch": 10.22962962962963, + "grad_norm": 4.983971118927002, + "learning_rate": 2.444230915392376e-07, + "loss": 1.6689, + "step": 69050 + }, + { + "epoch": 10.237037037037037, + "grad_norm": 5.701051235198975, + "learning_rate": 2.397649214305198e-07, + "loss": 1.6814, + "step": 69100 + }, + { + "epoch": 10.244444444444444, + "grad_norm": 5.279200553894043, + "learning_rate": 2.3515102808028378e-07, + "loss": 1.6893, + "step": 69150 + }, + { + "epoch": 10.251851851851852, + "grad_norm": 4.546329021453857, + "learning_rate": 2.3058143241919906e-07, + "loss": 1.7076, + "step": 69200 + }, + { + "epoch": 10.25925925925926, + "grad_norm": 5.384993553161621, + "learning_rate": 2.2605615517697576e-07, + "loss": 1.734, + "step": 69250 + }, + { + "epoch": 10.266666666666667, + "grad_norm": 4.743589878082275, + "learning_rate": 2.2157521688228488e-07, + "loss": 1.7385, + "step": 69300 + }, + { + "epoch": 10.274074074074074, + "grad_norm": 5.3543291091918945, + "learning_rate": 2.171386378626461e-07, + "loss": 1.6681, + "step": 69350 + }, + { + "epoch": 10.281481481481482, + "grad_norm": 4.380980491638184, + "learning_rate": 2.127464382443545e-07, + "loss": 1.6716, + "step": 69400 + }, + { + "epoch": 10.28888888888889, + "grad_norm": 7.292746067047119, + "learning_rate": 2.0839863795237724e-07, + "loss": 1.747, + "step": 69450 + }, + { + "epoch": 10.296296296296296, + "grad_norm": 4.844071388244629, + "learning_rate": 2.0409525671026498e-07, + "loss": 1.6245, + "step": 69500 + }, + { + "epoch": 10.303703703703704, + "grad_norm": 4.968372821807861, + "learning_rate": 1.99836314040065e-07, + "loss": 1.6881, + "step": 69550 + }, + { + "epoch": 10.311111111111112, + "grad_norm": 3.9518678188323975, + "learning_rate": 1.956218292622325e-07, + "loss": 1.6664, + "step": 69600 + }, + { + "epoch": 10.318518518518518, + "grad_norm": 4.433720111846924, + "learning_rate": 1.9145182149553966e-07, + "loss": 1.6951, + "step": 69650 + }, + { + "epoch": 10.325925925925926, + "grad_norm": 5.678008079528809, + "learning_rate": 1.873263096569955e-07, + "loss": 1.6644, + "step": 69700 + }, + { + "epoch": 10.333333333333334, + "grad_norm": 6.03004789352417, + "learning_rate": 1.832453124617495e-07, + "loss": 1.7189, + "step": 69750 + }, + { + "epoch": 10.34074074074074, + "grad_norm": 4.946279048919678, + "learning_rate": 1.7920884842301922e-07, + "loss": 1.6496, + "step": 69800 + }, + { + "epoch": 10.348148148148148, + "grad_norm": 5.097282409667969, + "learning_rate": 1.7521693585199505e-07, + "loss": 1.6875, + "step": 69850 + }, + { + "epoch": 10.355555555555556, + "grad_norm": 4.687585353851318, + "learning_rate": 1.7126959285776456e-07, + "loss": 1.6469, + "step": 69900 + }, + { + "epoch": 10.362962962962962, + "grad_norm": 5.8932929039001465, + "learning_rate": 1.6736683734722814e-07, + "loss": 1.7221, + "step": 69950 + }, + { + "epoch": 10.37037037037037, + "grad_norm": 4.6569743156433105, + "learning_rate": 1.6350868702501798e-07, + "loss": 1.6485, + "step": 70000 + }, + { + "epoch": 10.377777777777778, + "grad_norm": 5.339687347412109, + "learning_rate": 1.5969515939341485e-07, + "loss": 1.7525, + "step": 70050 + }, + { + "epoch": 10.385185185185184, + "grad_norm": 4.882029056549072, + "learning_rate": 1.5592627175227248e-07, + "loss": 1.7105, + "step": 70100 + }, + { + "epoch": 10.392592592592592, + "grad_norm": 5.221914768218994, + "learning_rate": 1.522020411989389e-07, + "loss": 1.7217, + "step": 70150 + }, + { + "epoch": 10.4, + "grad_norm": 5.761238098144531, + "learning_rate": 1.4852248462817741e-07, + "loss": 1.6958, + "step": 70200 + }, + { + "epoch": 10.407407407407407, + "grad_norm": 5.501303672790527, + "learning_rate": 1.4488761873209022e-07, + "loss": 1.6864, + "step": 70250 + }, + { + "epoch": 10.414814814814815, + "grad_norm": 4.387554168701172, + "learning_rate": 1.4129746000004052e-07, + "loss": 1.7137, + "step": 70300 + }, + { + "epoch": 10.422222222222222, + "grad_norm": 4.2735419273376465, + "learning_rate": 1.3775202471858263e-07, + "loss": 1.6479, + "step": 70350 + }, + { + "epoch": 10.42962962962963, + "grad_norm": 7.969890594482422, + "learning_rate": 1.342513289713865e-07, + "loss": 1.6789, + "step": 70400 + }, + { + "epoch": 10.437037037037037, + "grad_norm": 4.56361198425293, + "learning_rate": 1.3079538863915775e-07, + "loss": 1.6735, + "step": 70450 + }, + { + "epoch": 10.444444444444445, + "grad_norm": 5.747202396392822, + "learning_rate": 1.2738421939958e-07, + "loss": 1.7478, + "step": 70500 + }, + { + "epoch": 10.451851851851853, + "grad_norm": 5.894442081451416, + "learning_rate": 1.2401783672722822e-07, + "loss": 1.6335, + "step": 70550 + }, + { + "epoch": 10.459259259259259, + "grad_norm": 6.281641006469727, + "learning_rate": 1.2069625589350987e-07, + "loss": 1.7553, + "step": 70600 + }, + { + "epoch": 10.466666666666667, + "grad_norm": 4.168921947479248, + "learning_rate": 1.1741949196658942e-07, + "loss": 1.6481, + "step": 70650 + }, + { + "epoch": 10.474074074074075, + "grad_norm": 5.182560443878174, + "learning_rate": 1.1418755981132179e-07, + "loss": 1.7527, + "step": 70700 + }, + { + "epoch": 10.481481481481481, + "grad_norm": 5.2377190589904785, + "learning_rate": 1.1100047408918569e-07, + "loss": 1.7067, + "step": 70750 + }, + { + "epoch": 10.488888888888889, + "grad_norm": 5.3976263999938965, + "learning_rate": 1.078582492582192e-07, + "loss": 1.7029, + "step": 70800 + }, + { + "epoch": 10.496296296296297, + "grad_norm": 4.8626604080200195, + "learning_rate": 1.0476089957294545e-07, + "loss": 1.653, + "step": 70850 + }, + { + "epoch": 10.503703703703703, + "grad_norm": 4.868640422821045, + "learning_rate": 1.0170843908432037e-07, + "loss": 1.6797, + "step": 70900 + }, + { + "epoch": 10.511111111111111, + "grad_norm": 6.026078224182129, + "learning_rate": 9.87008816396573e-08, + "loss": 1.7255, + "step": 70950 + }, + { + "epoch": 10.518518518518519, + "grad_norm": 5.1598896980285645, + "learning_rate": 9.573824088257244e-08, + "loss": 1.7599, + "step": 71000 + }, + { + "epoch": 10.525925925925925, + "grad_norm": 7.120807647705078, + "learning_rate": 9.282053025291948e-08, + "loss": 1.6233, + "step": 71050 + }, + { + "epoch": 10.533333333333333, + "grad_norm": 4.827940940856934, + "learning_rate": 8.994776298672847e-08, + "loss": 1.7441, + "step": 71100 + }, + { + "epoch": 10.540740740740741, + "grad_norm": 4.910730838775635, + "learning_rate": 8.711995211614587e-08, + "loss": 1.6672, + "step": 71150 + }, + { + "epoch": 10.548148148148147, + "grad_norm": 4.025373458862305, + "learning_rate": 8.43371104693802e-08, + "loss": 1.641, + "step": 71200 + }, + { + "epoch": 10.555555555555555, + "grad_norm": 5.08291482925415, + "learning_rate": 8.159925067063423e-08, + "loss": 1.6622, + "step": 71250 + }, + { + "epoch": 10.562962962962963, + "grad_norm": 5.166038513183594, + "learning_rate": 7.890638514005511e-08, + "loss": 1.6466, + "step": 71300 + }, + { + "epoch": 10.57037037037037, + "grad_norm": 4.054682731628418, + "learning_rate": 7.625852609367546e-08, + "loss": 1.7034, + "step": 71350 + }, + { + "epoch": 10.577777777777778, + "grad_norm": 5.348453521728516, + "learning_rate": 7.365568554336122e-08, + "loss": 1.68, + "step": 71400 + }, + { + "epoch": 10.585185185185185, + "grad_norm": 6.874485969543457, + "learning_rate": 7.109787529675172e-08, + "loss": 1.6973, + "step": 71450 + }, + { + "epoch": 10.592592592592592, + "grad_norm": 3.6352336406707764, + "learning_rate": 6.858510695720744e-08, + "loss": 1.6651, + "step": 71500 + }, + { + "epoch": 10.6, + "grad_norm": 4.947807788848877, + "learning_rate": 6.611739192376344e-08, + "loss": 1.6939, + "step": 71550 + }, + { + "epoch": 10.607407407407408, + "grad_norm": 5.23864221572876, + "learning_rate": 6.369474139107046e-08, + "loss": 1.6455, + "step": 71600 + }, + { + "epoch": 10.614814814814816, + "grad_norm": 4.680927276611328, + "learning_rate": 6.131716634934504e-08, + "loss": 1.7969, + "step": 71650 + }, + { + "epoch": 10.622222222222222, + "grad_norm": 4.570713996887207, + "learning_rate": 5.89846775843228e-08, + "loss": 1.674, + "step": 71700 + }, + { + "epoch": 10.62962962962963, + "grad_norm": 5.311273574829102, + "learning_rate": 5.669728567720967e-08, + "loss": 1.6284, + "step": 71750 + }, + { + "epoch": 10.637037037037038, + "grad_norm": 5.45431661605835, + "learning_rate": 5.4455001004629634e-08, + "loss": 1.7311, + "step": 71800 + }, + { + "epoch": 10.644444444444444, + "grad_norm": 5.246209621429443, + "learning_rate": 5.225783373858151e-08, + "loss": 1.6305, + "step": 71850 + }, + { + "epoch": 10.651851851851852, + "grad_norm": 4.364826202392578, + "learning_rate": 5.010579384639114e-08, + "loss": 1.6551, + "step": 71900 + }, + { + "epoch": 10.65925925925926, + "grad_norm": 4.401340961456299, + "learning_rate": 4.799889109066591e-08, + "loss": 1.6878, + "step": 71950 + }, + { + "epoch": 10.666666666666666, + "grad_norm": 5.108217716217041, + "learning_rate": 4.5937135029253675e-08, + "loss": 1.6827, + "step": 72000 + }, + { + "epoch": 10.674074074074074, + "grad_norm": 5.362737655639648, + "learning_rate": 4.3920535015193886e-08, + "loss": 1.7257, + "step": 72050 + }, + { + "epoch": 10.681481481481482, + "grad_norm": 4.8335490226745605, + "learning_rate": 4.194910019667875e-08, + "loss": 1.7556, + "step": 72100 + }, + { + "epoch": 10.688888888888888, + "grad_norm": 4.182542324066162, + "learning_rate": 4.0022839517013246e-08, + "loss": 1.7039, + "step": 72150 + }, + { + "epoch": 10.696296296296296, + "grad_norm": 5.213995456695557, + "learning_rate": 3.8141761714568514e-08, + "loss": 1.7118, + "step": 72200 + }, + { + "epoch": 10.703703703703704, + "grad_norm": 5.52512788772583, + "learning_rate": 3.630587532275076e-08, + "loss": 1.6531, + "step": 72250 + }, + { + "epoch": 10.71111111111111, + "grad_norm": 4.814249515533447, + "learning_rate": 3.4515188669954624e-08, + "loss": 1.7214, + "step": 72300 + }, + { + "epoch": 10.718518518518518, + "grad_norm": 5.535644054412842, + "learning_rate": 3.276970987952877e-08, + "loss": 1.639, + "step": 72350 + }, + { + "epoch": 10.725925925925926, + "grad_norm": 4.050861358642578, + "learning_rate": 3.1069446869741446e-08, + "loss": 1.6789, + "step": 72400 + }, + { + "epoch": 10.733333333333333, + "grad_norm": 4.139535903930664, + "learning_rate": 2.941440735373835e-08, + "loss": 1.6347, + "step": 72450 + }, + { + "epoch": 10.74074074074074, + "grad_norm": 5.31736421585083, + "learning_rate": 2.7804598839514806e-08, + "loss": 1.7072, + "step": 72500 + }, + { + "epoch": 10.748148148148148, + "grad_norm": 4.640048503875732, + "learning_rate": 2.6240028629876958e-08, + "loss": 1.7176, + "step": 72550 + }, + { + "epoch": 10.755555555555556, + "grad_norm": 4.653814315795898, + "learning_rate": 2.472070382240843e-08, + "loss": 1.6947, + "step": 72600 + }, + { + "epoch": 10.762962962962963, + "grad_norm": 4.801093578338623, + "learning_rate": 2.3246631309441492e-08, + "loss": 1.72, + "step": 72650 + }, + { + "epoch": 10.77037037037037, + "grad_norm": 4.096927165985107, + "learning_rate": 2.181781777802261e-08, + "loss": 1.7342, + "step": 72700 + }, + { + "epoch": 10.777777777777779, + "grad_norm": 4.117656707763672, + "learning_rate": 2.0434269709885822e-08, + "loss": 1.7043, + "step": 72750 + }, + { + "epoch": 10.785185185185185, + "grad_norm": 5.932491302490234, + "learning_rate": 1.909599338141832e-08, + "loss": 1.7292, + "step": 72800 + }, + { + "epoch": 10.792592592592593, + "grad_norm": 6.006489276885986, + "learning_rate": 1.7802994863636013e-08, + "loss": 1.7046, + "step": 72850 + }, + { + "epoch": 10.8, + "grad_norm": 5.510446071624756, + "learning_rate": 1.6555280022152454e-08, + "loss": 1.7306, + "step": 72900 + }, + { + "epoch": 10.807407407407407, + "grad_norm": 7.160583019256592, + "learning_rate": 1.5352854517158843e-08, + "loss": 1.7025, + "step": 72950 + }, + { + "epoch": 10.814814814814815, + "grad_norm": 3.88580060005188, + "learning_rate": 1.4195723803387407e-08, + "loss": 1.7404, + "step": 73000 + }, + { + "epoch": 10.822222222222223, + "grad_norm": 4.92537260055542, + "learning_rate": 1.3083893130100278e-08, + "loss": 1.6605, + "step": 73050 + }, + { + "epoch": 10.829629629629629, + "grad_norm": 6.940608978271484, + "learning_rate": 1.201736754105176e-08, + "loss": 1.6139, + "step": 73100 + }, + { + "epoch": 10.837037037037037, + "grad_norm": 5.638848781585693, + "learning_rate": 1.0996151874478333e-08, + "loss": 1.6273, + "step": 73150 + }, + { + "epoch": 10.844444444444445, + "grad_norm": 4.141793251037598, + "learning_rate": 1.0020250763064232e-08, + "loss": 1.7212, + "step": 73200 + }, + { + "epoch": 10.851851851851851, + "grad_norm": 5.597693920135498, + "learning_rate": 9.08966863393257e-09, + "loss": 1.6653, + "step": 73250 + }, + { + "epoch": 10.85925925925926, + "grad_norm": 5.222171306610107, + "learning_rate": 8.204409708616468e-09, + "loss": 1.6919, + "step": 73300 + }, + { + "epoch": 10.866666666666667, + "grad_norm": 5.148636817932129, + "learning_rate": 7.3644780030424075e-09, + "loss": 1.7524, + "step": 73350 + }, + { + "epoch": 10.874074074074073, + "grad_norm": 5.547111988067627, + "learning_rate": 6.569877327514684e-09, + "loss": 1.6789, + "step": 73400 + }, + { + "epoch": 10.881481481481481, + "grad_norm": 4.137996673583984, + "learning_rate": 5.820611286693201e-09, + "loss": 1.7212, + "step": 73450 + }, + { + "epoch": 10.88888888888889, + "grad_norm": 5.037812232971191, + "learning_rate": 5.116683279582369e-09, + "loss": 1.6422, + "step": 73500 + }, + { + "epoch": 10.896296296296295, + "grad_norm": 5.004727363586426, + "learning_rate": 4.4580964995122325e-09, + "loss": 1.6445, + "step": 73550 + }, + { + "epoch": 10.903703703703703, + "grad_norm": 3.8220603466033936, + "learning_rate": 3.844853934124038e-09, + "loss": 1.6438, + "step": 73600 + }, + { + "epoch": 10.911111111111111, + "grad_norm": 5.583298683166504, + "learning_rate": 3.2769583653580185e-09, + "loss": 1.7172, + "step": 73650 + }, + { + "epoch": 10.918518518518518, + "grad_norm": 4.771687984466553, + "learning_rate": 2.754412369441184e-09, + "loss": 1.7834, + "step": 73700 + }, + { + "epoch": 10.925925925925926, + "grad_norm": 7.343170642852783, + "learning_rate": 2.277218316873997e-09, + "loss": 1.6955, + "step": 73750 + }, + { + "epoch": 10.933333333333334, + "grad_norm": 4.965358734130859, + "learning_rate": 1.8453783724214913e-09, + "loss": 1.7047, + "step": 73800 + }, + { + "epoch": 10.940740740740742, + "grad_norm": 5.137145519256592, + "learning_rate": 1.4588944950988394e-09, + "loss": 1.7098, + "step": 73850 + }, + { + "epoch": 10.948148148148148, + "grad_norm": 4.667434215545654, + "learning_rate": 1.1177684381702414e-09, + "loss": 1.7216, + "step": 73900 + }, + { + "epoch": 10.955555555555556, + "grad_norm": 4.412621021270752, + "learning_rate": 8.220017491344934e-10, + "loss": 1.6661, + "step": 73950 + }, + { + "epoch": 10.962962962962964, + "grad_norm": 4.227043151855469, + "learning_rate": 5.715957697216556e-10, + "loss": 1.7637, + "step": 74000 + }, + { + "epoch": 10.97037037037037, + "grad_norm": 5.481570720672607, + "learning_rate": 3.6655163588195097e-10, + "loss": 1.6849, + "step": 74050 + }, + { + "epoch": 10.977777777777778, + "grad_norm": 7.311110496520996, + "learning_rate": 2.0687027778909518e-10, + "loss": 1.6825, + "step": 74100 + }, + { + "epoch": 10.985185185185186, + "grad_norm": 4.955511569976807, + "learning_rate": 9.255241982697449e-11, + "loss": 1.7145, + "step": 74150 + }, + { + "epoch": 10.992592592592592, + "grad_norm": 5.094233989715576, + "learning_rate": 2.359858059186593e-11, + "loss": 1.7051, + "step": 74200 + }, + { + "epoch": 11.0, + "grad_norm": 5.1888346672058105, + "learning_rate": 9.07289132712208e-15, + "loss": 1.7045, + "step": 74250 + }, + { + "epoch": 11.007407407407408, + "grad_norm": 6.677313804626465, + "learning_rate": 3.4000141682162035e-07, + "loss": 1.7766, + "step": 74300 + }, + { + "epoch": 11.014814814814814, + "grad_norm": 5.349052429199219, + "learning_rate": 3.3497486947049575e-07, + "loss": 1.7002, + "step": 74350 + }, + { + "epoch": 11.022222222222222, + "grad_norm": 4.597198009490967, + "learning_rate": 3.2998512237565005e-07, + "loss": 1.6818, + "step": 74400 + }, + { + "epoch": 11.02962962962963, + "grad_norm": 5.520595073699951, + "learning_rate": 3.250321945358903e-07, + "loss": 1.6704, + "step": 74450 + }, + { + "epoch": 11.037037037037036, + "grad_norm": 6.03912353515625, + "learning_rate": 3.201161048098367e-07, + "loss": 1.6693, + "step": 74500 + }, + { + "epoch": 11.044444444444444, + "grad_norm": 5.658355236053467, + "learning_rate": 3.152368719158416e-07, + "loss": 1.7106, + "step": 74550 + }, + { + "epoch": 11.051851851851852, + "grad_norm": 5.297006130218506, + "learning_rate": 3.1039451443192537e-07, + "loss": 1.636, + "step": 74600 + }, + { + "epoch": 11.059259259259258, + "grad_norm": 5.458147048950195, + "learning_rate": 3.0558905079569933e-07, + "loss": 1.6864, + "step": 74650 + }, + { + "epoch": 11.066666666666666, + "grad_norm": 6.0984110832214355, + "learning_rate": 3.008204993043029e-07, + "loss": 1.6783, + "step": 74700 + }, + { + "epoch": 11.074074074074074, + "grad_norm": 5.887842655181885, + "learning_rate": 2.9608887811432674e-07, + "loss": 1.7436, + "step": 74750 + }, + { + "epoch": 11.081481481481482, + "grad_norm": 5.7084197998046875, + "learning_rate": 2.9139420524174953e-07, + "loss": 1.6846, + "step": 74800 + }, + { + "epoch": 11.088888888888889, + "grad_norm": 4.7401323318481445, + "learning_rate": 2.867364985618648e-07, + "loss": 1.7038, + "step": 74850 + }, + { + "epoch": 11.096296296296297, + "grad_norm": 6.002399444580078, + "learning_rate": 2.82115775809213e-07, + "loss": 1.7359, + "step": 74900 + }, + { + "epoch": 11.103703703703705, + "grad_norm": 7.190243721008301, + "learning_rate": 2.7753205457752174e-07, + "loss": 1.72, + "step": 74950 + }, + { + "epoch": 11.11111111111111, + "grad_norm": 5.5445146560668945, + "learning_rate": 2.7298535231962464e-07, + "loss": 1.6135, + "step": 75000 + }, + { + "epoch": 11.118518518518519, + "grad_norm": 4.865965366363525, + "learning_rate": 2.684756863474103e-07, + "loss": 1.7384, + "step": 75050 + }, + { + "epoch": 11.125925925925927, + "grad_norm": 5.077027320861816, + "learning_rate": 2.6400307383174227e-07, + "loss": 1.6512, + "step": 75100 + }, + { + "epoch": 11.133333333333333, + "grad_norm": 4.601258277893066, + "learning_rate": 2.595675318024093e-07, + "loss": 1.7225, + "step": 75150 + }, + { + "epoch": 11.14074074074074, + "grad_norm": 4.5170488357543945, + "learning_rate": 2.5516907714804306e-07, + "loss": 1.6548, + "step": 75200 + }, + { + "epoch": 11.148148148148149, + "grad_norm": 4.254936218261719, + "learning_rate": 2.508077266160669e-07, + "loss": 1.6346, + "step": 75250 + }, + { + "epoch": 11.155555555555555, + "grad_norm": 4.214375019073486, + "learning_rate": 2.464834968126251e-07, + "loss": 1.72, + "step": 75300 + }, + { + "epoch": 11.162962962962963, + "grad_norm": 4.431253910064697, + "learning_rate": 2.421964042025271e-07, + "loss": 1.7185, + "step": 75350 + }, + { + "epoch": 11.170370370370371, + "grad_norm": 6.110872745513916, + "learning_rate": 2.3794646510917564e-07, + "loss": 1.6903, + "step": 75400 + }, + { + "epoch": 11.177777777777777, + "grad_norm": 4.43080472946167, + "learning_rate": 2.3373369571450755e-07, + "loss": 1.6936, + "step": 75450 + }, + { + "epoch": 11.185185185185185, + "grad_norm": 5.446310520172119, + "learning_rate": 2.295581120589363e-07, + "loss": 1.7097, + "step": 75500 + }, + { + "epoch": 11.192592592592593, + "grad_norm": 5.263556957244873, + "learning_rate": 2.254197300412897e-07, + "loss": 1.696, + "step": 75550 + }, + { + "epoch": 11.2, + "grad_norm": 4.2185139656066895, + "learning_rate": 2.213185654187433e-07, + "loss": 1.6967, + "step": 75600 + }, + { + "epoch": 11.207407407407407, + "grad_norm": 5.42978572845459, + "learning_rate": 2.1725463380676824e-07, + "loss": 1.7041, + "step": 75650 + }, + { + "epoch": 11.214814814814815, + "grad_norm": 5.338979244232178, + "learning_rate": 2.1322795067906688e-07, + "loss": 1.6466, + "step": 75700 + }, + { + "epoch": 11.222222222222221, + "grad_norm": 4.058988571166992, + "learning_rate": 2.092385313675138e-07, + "loss": 1.6315, + "step": 75750 + }, + { + "epoch": 11.22962962962963, + "grad_norm": 5.591002464294434, + "learning_rate": 2.0528639106210392e-07, + "loss": 1.6745, + "step": 75800 + }, + { + "epoch": 11.237037037037037, + "grad_norm": 4.783778190612793, + "learning_rate": 2.013715448108855e-07, + "loss": 1.693, + "step": 75850 + }, + { + "epoch": 11.244444444444444, + "grad_norm": 4.906090259552002, + "learning_rate": 1.974940075199061e-07, + "loss": 1.7632, + "step": 75900 + }, + { + "epoch": 11.251851851851852, + "grad_norm": 6.792934417724609, + "learning_rate": 1.9365379395316243e-07, + "loss": 1.6931, + "step": 75950 + }, + { + "epoch": 11.25925925925926, + "grad_norm": 4.3067731857299805, + "learning_rate": 1.898509187325337e-07, + "loss": 1.6851, + "step": 76000 + }, + { + "epoch": 11.266666666666667, + "grad_norm": 4.990530014038086, + "learning_rate": 1.860853963377318e-07, + "loss": 1.6251, + "step": 76050 + }, + { + "epoch": 11.274074074074074, + "grad_norm": 5.153524875640869, + "learning_rate": 1.8235724110624575e-07, + "loss": 1.724, + "step": 76100 + }, + { + "epoch": 11.281481481481482, + "grad_norm": 6.641872406005859, + "learning_rate": 1.7866646723328608e-07, + "loss": 1.754, + "step": 76150 + }, + { + "epoch": 11.28888888888889, + "grad_norm": 4.482833385467529, + "learning_rate": 1.7501308877173162e-07, + "loss": 1.6947, + "step": 76200 + }, + { + "epoch": 11.296296296296296, + "grad_norm": 5.8048014640808105, + "learning_rate": 1.7139711963207517e-07, + "loss": 1.6623, + "step": 76250 + }, + { + "epoch": 11.303703703703704, + "grad_norm": 4.707399845123291, + "learning_rate": 1.678185735823712e-07, + "loss": 1.701, + "step": 76300 + }, + { + "epoch": 11.311111111111112, + "grad_norm": 4.89939022064209, + "learning_rate": 1.6427746424818258e-07, + "loss": 1.6908, + "step": 76350 + }, + { + "epoch": 11.318518518518518, + "grad_norm": 5.831413269042969, + "learning_rate": 1.607738051125296e-07, + "loss": 1.7078, + "step": 76400 + }, + { + "epoch": 11.325925925925926, + "grad_norm": 7.055150985717773, + "learning_rate": 1.573076095158399e-07, + "loss": 1.7405, + "step": 76450 + }, + { + "epoch": 11.333333333333334, + "grad_norm": 4.682511329650879, + "learning_rate": 1.538788906558919e-07, + "loss": 1.7782, + "step": 76500 + }, + { + "epoch": 11.34074074074074, + "grad_norm": 6.273463726043701, + "learning_rate": 1.5048766158777372e-07, + "loss": 1.6904, + "step": 76550 + }, + { + "epoch": 11.348148148148148, + "grad_norm": 5.013375759124756, + "learning_rate": 1.4713393522382547e-07, + "loss": 1.6795, + "step": 76600 + }, + { + "epoch": 11.355555555555556, + "grad_norm": 4.531692028045654, + "learning_rate": 1.4381772433359474e-07, + "loss": 1.7108, + "step": 76650 + }, + { + "epoch": 11.362962962962962, + "grad_norm": 5.173058032989502, + "learning_rate": 1.405390415437835e-07, + "loss": 1.7178, + "step": 76700 + }, + { + "epoch": 11.37037037037037, + "grad_norm": 4.779514789581299, + "learning_rate": 1.372978993382068e-07, + "loss": 1.6769, + "step": 76750 + }, + { + "epoch": 11.377777777777778, + "grad_norm": 5.0213117599487305, + "learning_rate": 1.3409431005773855e-07, + "loss": 1.808, + "step": 76800 + }, + { + "epoch": 11.385185185185184, + "grad_norm": 5.859137535095215, + "learning_rate": 1.3092828590026695e-07, + "loss": 1.6981, + "step": 76850 + }, + { + "epoch": 11.392592592592592, + "grad_norm": 5.248096942901611, + "learning_rate": 1.277998389206514e-07, + "loss": 1.6445, + "step": 76900 + }, + { + "epoch": 11.4, + "grad_norm": 6.535493850708008, + "learning_rate": 1.2470898103066896e-07, + "loss": 1.6978, + "step": 76950 + }, + { + "epoch": 11.407407407407407, + "grad_norm": 4.829598426818848, + "learning_rate": 1.2165572399897908e-07, + "loss": 1.7188, + "step": 77000 + }, + { + "epoch": 11.414814814814815, + "grad_norm": 4.927868843078613, + "learning_rate": 1.1864007945107004e-07, + "loss": 1.7097, + "step": 77050 + }, + { + "epoch": 11.422222222222222, + "grad_norm": 4.605199337005615, + "learning_rate": 1.15662058869217e-07, + "loss": 1.6289, + "step": 77100 + }, + { + "epoch": 11.42962962962963, + "grad_norm": 7.104889869689941, + "learning_rate": 1.1272167359244302e-07, + "loss": 1.7027, + "step": 77150 + }, + { + "epoch": 11.437037037037037, + "grad_norm": 4.649669170379639, + "learning_rate": 1.0981893481646689e-07, + "loss": 1.7473, + "step": 77200 + }, + { + "epoch": 11.444444444444445, + "grad_norm": 4.562942981719971, + "learning_rate": 1.0695385359367094e-07, + "loss": 1.6945, + "step": 77250 + }, + { + "epoch": 11.451851851851853, + "grad_norm": 5.604371547698975, + "learning_rate": 1.0412644083305112e-07, + "loss": 1.6964, + "step": 77300 + }, + { + "epoch": 11.459259259259259, + "grad_norm": 4.250510215759277, + "learning_rate": 1.0133670730017697e-07, + "loss": 1.6596, + "step": 77350 + }, + { + "epoch": 11.466666666666667, + "grad_norm": 4.702193737030029, + "learning_rate": 9.858466361715502e-08, + "loss": 1.6443, + "step": 77400 + }, + { + "epoch": 11.474074074074075, + "grad_norm": 4.870285511016846, + "learning_rate": 9.587032026258214e-08, + "loss": 1.7001, + "step": 77450 + }, + { + "epoch": 11.481481481481481, + "grad_norm": 4.658622741699219, + "learning_rate": 9.319368757151004e-08, + "loss": 1.7291, + "step": 77500 + }, + { + "epoch": 11.488888888888889, + "grad_norm": 4.985848903656006, + "learning_rate": 9.055477573540417e-08, + "loss": 1.6981, + "step": 77550 + }, + { + "epoch": 11.496296296296297, + "grad_norm": 4.855040550231934, + "learning_rate": 8.795359480210374e-08, + "loss": 1.6741, + "step": 77600 + }, + { + "epoch": 11.503703703703703, + "grad_norm": 4.5977983474731445, + "learning_rate": 8.53901546757896e-08, + "loss": 1.6, + "step": 77650 + }, + { + "epoch": 11.511111111111111, + "grad_norm": 5.066938400268555, + "learning_rate": 8.286446511693635e-08, + "loss": 1.7648, + "step": 77700 + }, + { + "epoch": 11.518518518518519, + "grad_norm": 5.341429233551025, + "learning_rate": 8.037653574228255e-08, + "loss": 1.7957, + "step": 77750 + }, + { + "epoch": 11.525925925925925, + "grad_norm": 4.630782127380371, + "learning_rate": 7.79263760247928e-08, + "loss": 1.6445, + "step": 77800 + }, + { + "epoch": 11.533333333333333, + "grad_norm": 5.8252129554748535, + "learning_rate": 7.551399529362125e-08, + "loss": 1.6807, + "step": 77850 + }, + { + "epoch": 11.540740740740741, + "grad_norm": 6.431312561035156, + "learning_rate": 7.313940273407482e-08, + "loss": 1.6229, + "step": 77900 + }, + { + "epoch": 11.548148148148147, + "grad_norm": 5.191285133361816, + "learning_rate": 7.080260738758227e-08, + "loss": 1.6838, + "step": 77950 + }, + { + "epoch": 11.555555555555555, + "grad_norm": 5.839999198913574, + "learning_rate": 6.850361815165185e-08, + "loss": 1.7031, + "step": 78000 + }, + { + "epoch": 11.562962962962963, + "grad_norm": 4.388585567474365, + "learning_rate": 6.62424437798459e-08, + "loss": 1.7339, + "step": 78050 + }, + { + "epoch": 11.57037037037037, + "grad_norm": 3.9459519386291504, + "learning_rate": 6.401909288174523e-08, + "loss": 1.7138, + "step": 78100 + }, + { + "epoch": 11.577777777777778, + "grad_norm": 5.464336395263672, + "learning_rate": 6.183357392291145e-08, + "loss": 1.6937, + "step": 78150 + }, + { + "epoch": 11.585185185185185, + "grad_norm": 5.290124893188477, + "learning_rate": 5.968589522485912e-08, + "loss": 1.6891, + "step": 78200 + }, + { + "epoch": 11.592592592592592, + "grad_norm": 4.703418731689453, + "learning_rate": 5.757606496502699e-08, + "loss": 1.7016, + "step": 78250 + }, + { + "epoch": 11.6, + "grad_norm": 6.593367099761963, + "learning_rate": 5.550409117674016e-08, + "loss": 1.6356, + "step": 78300 + }, + { + "epoch": 11.607407407407408, + "grad_norm": 4.0880608558654785, + "learning_rate": 5.346998174918128e-08, + "loss": 1.6753, + "step": 78350 + }, + { + "epoch": 11.614814814814816, + "grad_norm": 5.509574890136719, + "learning_rate": 5.147374442736497e-08, + "loss": 1.5876, + "step": 78400 + }, + { + "epoch": 11.622222222222222, + "grad_norm": 5.011131286621094, + "learning_rate": 4.951538681210455e-08, + "loss": 1.6904, + "step": 78450 + }, + { + "epoch": 11.62962962962963, + "grad_norm": 5.238678455352783, + "learning_rate": 4.759491635998204e-08, + "loss": 1.6404, + "step": 78500 + }, + { + "epoch": 11.637037037037038, + "grad_norm": 5.725277423858643, + "learning_rate": 4.571234038332262e-08, + "loss": 1.6658, + "step": 78550 + }, + { + "epoch": 11.644444444444444, + "grad_norm": 5.188413619995117, + "learning_rate": 4.386766605016468e-08, + "loss": 1.6906, + "step": 78600 + }, + { + "epoch": 11.651851851851852, + "grad_norm": 5.059532165527344, + "learning_rate": 4.206090038423649e-08, + "loss": 1.6779, + "step": 78650 + }, + { + "epoch": 11.65925925925926, + "grad_norm": 3.8585777282714844, + "learning_rate": 4.029205026492178e-08, + "loss": 1.6209, + "step": 78700 + }, + { + "epoch": 11.666666666666666, + "grad_norm": 5.420309543609619, + "learning_rate": 3.856112242724086e-08, + "loss": 1.712, + "step": 78750 + }, + { + "epoch": 11.674074074074074, + "grad_norm": 4.85215950012207, + "learning_rate": 3.6868123461824e-08, + "loss": 1.675, + "step": 78800 + }, + { + "epoch": 11.681481481481482, + "grad_norm": 5.239144325256348, + "learning_rate": 3.5213059814880326e-08, + "loss": 1.7549, + "step": 78850 + }, + { + "epoch": 11.688888888888888, + "grad_norm": 4.639777183532715, + "learning_rate": 3.359593778818115e-08, + "loss": 1.7411, + "step": 78900 + }, + { + "epoch": 11.696296296296296, + "grad_norm": 5.959284782409668, + "learning_rate": 3.201676353903005e-08, + "loss": 1.6628, + "step": 78950 + }, + { + "epoch": 11.703703703703704, + "grad_norm": 4.810410976409912, + "learning_rate": 3.047554308024503e-08, + "loss": 1.7053, + "step": 79000 + }, + { + "epoch": 11.71111111111111, + "grad_norm": 5.420835971832275, + "learning_rate": 2.8972282280128606e-08, + "loss": 1.7336, + "step": 79050 + }, + { + "epoch": 11.718518518518518, + "grad_norm": 4.595527172088623, + "learning_rate": 2.7506986862451122e-08, + "loss": 1.736, + "step": 79100 + }, + { + "epoch": 11.725925925925926, + "grad_norm": 5.292618274688721, + "learning_rate": 2.6079662406428564e-08, + "loss": 1.6576, + "step": 79150 + }, + { + "epoch": 11.733333333333333, + "grad_norm": 5.267721652984619, + "learning_rate": 2.4690314346695888e-08, + "loss": 1.6958, + "step": 79200 + }, + { + "epoch": 11.74074074074074, + "grad_norm": 4.758387565612793, + "learning_rate": 2.3338947973293724e-08, + "loss": 1.6424, + "step": 79250 + }, + { + "epoch": 11.748148148148148, + "grad_norm": 4.851783752441406, + "learning_rate": 2.202556843164283e-08, + "loss": 1.7366, + "step": 79300 + }, + { + "epoch": 11.755555555555556, + "grad_norm": 5.405657768249512, + "learning_rate": 2.0750180722529657e-08, + "loss": 1.6203, + "step": 79350 + }, + { + "epoch": 11.762962962962963, + "grad_norm": 4.255611896514893, + "learning_rate": 1.9512789702078594e-08, + "loss": 1.7053, + "step": 79400 + }, + { + "epoch": 11.77037037037037, + "grad_norm": 4.81674861907959, + "learning_rate": 1.8313400081744203e-08, + "loss": 1.7215, + "step": 79450 + }, + { + "epoch": 11.777777777777779, + "grad_norm": 4.983034610748291, + "learning_rate": 1.7152016428285678e-08, + "loss": 1.6665, + "step": 79500 + }, + { + "epoch": 11.785185185185185, + "grad_norm": 6.600327014923096, + "learning_rate": 1.60286431637513e-08, + "loss": 1.7343, + "step": 79550 + }, + { + "epoch": 11.792592592592593, + "grad_norm": 4.829182147979736, + "learning_rate": 1.494328456546401e-08, + "loss": 1.6987, + "step": 79600 + }, + { + "epoch": 11.8, + "grad_norm": 4.379661560058594, + "learning_rate": 1.3895944766001424e-08, + "loss": 1.733, + "step": 79650 + }, + { + "epoch": 11.807407407407407, + "grad_norm": 5.277597427368164, + "learning_rate": 1.2886627753183611e-08, + "loss": 1.5864, + "step": 79700 + }, + { + "epoch": 11.814814814814815, + "grad_norm": 4.692282199859619, + "learning_rate": 1.1915337370055347e-08, + "loss": 1.7295, + "step": 79750 + }, + { + "epoch": 11.822222222222223, + "grad_norm": 5.036397457122803, + "learning_rate": 1.0982077314871664e-08, + "loss": 1.6723, + "step": 79800 + }, + { + "epoch": 11.829629629629629, + "grad_norm": 6.376845359802246, + "learning_rate": 1.008685114108454e-08, + "loss": 1.7038, + "step": 79850 + }, + { + "epoch": 11.837037037037037, + "grad_norm": 4.599322319030762, + "learning_rate": 9.229662257331794e-09, + "loss": 1.7503, + "step": 79900 + }, + { + "epoch": 11.844444444444445, + "grad_norm": 4.890994548797607, + "learning_rate": 8.410513927419317e-09, + "loss": 1.6431, + "step": 79950 + }, + { + "epoch": 11.851851851851851, + "grad_norm": 4.89326810836792, + "learning_rate": 7.629409270311083e-09, + "loss": 1.744, + "step": 80000 + }, + { + "epoch": 11.85925925925926, + "grad_norm": 6.547264099121094, + "learning_rate": 6.8863512601169416e-09, + "loss": 1.6933, + "step": 80050 + }, + { + "epoch": 11.866666666666667, + "grad_norm": 4.861441135406494, + "learning_rate": 6.181342726082618e-09, + "loss": 1.7057, + "step": 80100 + }, + { + "epoch": 11.874074074074073, + "grad_norm": 7.09140157699585, + "learning_rate": 5.514386352577505e-09, + "loss": 1.6498, + "step": 80150 + }, + { + "epoch": 11.881481481481481, + "grad_norm": 5.269583225250244, + "learning_rate": 4.885484679084673e-09, + "loss": 1.694, + "step": 80200 + }, + { + "epoch": 11.88888888888889, + "grad_norm": 4.094651699066162, + "learning_rate": 4.294640100189762e-09, + "loss": 1.7106, + "step": 80250 + }, + { + "epoch": 11.896296296296295, + "grad_norm": 4.695838928222656, + "learning_rate": 3.741854865574323e-09, + "loss": 1.7345, + "step": 80300 + }, + { + "epoch": 11.903703703703703, + "grad_norm": 4.913790702819824, + "learning_rate": 3.2271310800091603e-09, + "loss": 1.7232, + "step": 80350 + }, + { + "epoch": 11.911111111111111, + "grad_norm": 6.038999557495117, + "learning_rate": 2.750470703338781e-09, + "loss": 1.6752, + "step": 80400 + }, + { + "epoch": 11.918518518518518, + "grad_norm": 4.919781684875488, + "learning_rate": 2.3118755504825117e-09, + "loss": 1.6748, + "step": 80450 + }, + { + "epoch": 11.925925925925926, + "grad_norm": 4.607057571411133, + "learning_rate": 1.9113472914189524e-09, + "loss": 1.6536, + "step": 80500 + }, + { + "epoch": 11.933333333333334, + "grad_norm": 4.645559310913086, + "learning_rate": 1.5488874511904174e-09, + "loss": 1.661, + "step": 80550 + }, + { + "epoch": 11.940740740740742, + "grad_norm": 4.680781841278076, + "learning_rate": 1.224497409887393e-09, + "loss": 1.7155, + "step": 80600 + }, + { + "epoch": 11.948148148148148, + "grad_norm": 5.476153373718262, + "learning_rate": 9.38178402646317e-10, + "loss": 1.7228, + "step": 80650 + }, + { + "epoch": 11.955555555555556, + "grad_norm": 4.937467575073242, + "learning_rate": 6.899315196473577e-10, + "loss": 1.6652, + "step": 80700 + }, + { + "epoch": 11.962962962962964, + "grad_norm": 4.530655384063721, + "learning_rate": 4.79757706107753e-10, + "loss": 1.696, + "step": 80750 + }, + { + "epoch": 11.97037037037037, + "grad_norm": 4.738536834716797, + "learning_rate": 3.0765776227847934e-10, + "loss": 1.6556, + "step": 80800 + }, + { + "epoch": 11.977777777777778, + "grad_norm": 4.434654712677002, + "learning_rate": 1.7363234344425217e-10, + "loss": 1.723, + "step": 80850 + }, + { + "epoch": 11.985185185185186, + "grad_norm": 4.7945475578308105, + "learning_rate": 7.768195991353366e-11, + "loss": 1.7508, + "step": 80900 + }, + { + "epoch": 11.992592592592592, + "grad_norm": 5.558487415313721, + "learning_rate": 1.9806977025194074e-11, + "loss": 1.743, + "step": 80950 + }, + { + "epoch": 12.0, + "grad_norm": 5.6761698722839355, + "learning_rate": 7.615141850436659e-15, + "loss": 1.6792, + "step": 81000 + }, + { + "epoch": 12.007407407407408, + "grad_norm": 4.084346771240234, + "learning_rate": 2.8967442665334466e-07, + "loss": 1.7158, + "step": 81050 + }, + { + "epoch": 12.014814814814814, + "grad_norm": 4.7874531745910645, + "learning_rate": 2.8538830419264973e-07, + "loss": 1.6393, + "step": 81100 + }, + { + "epoch": 12.022222222222222, + "grad_norm": 5.466848373413086, + "learning_rate": 2.8113366894419567e-07, + "loss": 1.674, + "step": 81150 + }, + { + "epoch": 12.02962962962963, + "grad_norm": 5.19240665435791, + "learning_rate": 2.769105346981982e-07, + "loss": 1.7633, + "step": 81200 + }, + { + "epoch": 12.037037037037036, + "grad_norm": 6.463864326477051, + "learning_rate": 2.727189151427723e-07, + "loss": 1.6806, + "step": 81250 + }, + { + "epoch": 12.044444444444444, + "grad_norm": 6.174072265625, + "learning_rate": 2.685588238638881e-07, + "loss": 1.7947, + "step": 81300 + }, + { + "epoch": 12.051851851851852, + "grad_norm": 5.756320476531982, + "learning_rate": 2.644302743453242e-07, + "loss": 1.711, + "step": 81350 + }, + { + "epoch": 12.059259259259258, + "grad_norm": 4.0832085609436035, + "learning_rate": 2.6033327996862425e-07, + "loss": 1.6688, + "step": 81400 + }, + { + "epoch": 12.066666666666666, + "grad_norm": 3.832885980606079, + "learning_rate": 2.562678540130592e-07, + "loss": 1.6276, + "step": 81450 + }, + { + "epoch": 12.074074074074074, + "grad_norm": 5.205352783203125, + "learning_rate": 2.522340096555742e-07, + "loss": 1.63, + "step": 81500 + }, + { + "epoch": 12.081481481481482, + "grad_norm": 5.11032772064209, + "learning_rate": 2.482317599707551e-07, + "loss": 1.6658, + "step": 81550 + }, + { + "epoch": 12.088888888888889, + "grad_norm": 5.3631672859191895, + "learning_rate": 2.4426111793078076e-07, + "loss": 1.7122, + "step": 81600 + }, + { + "epoch": 12.096296296296297, + "grad_norm": 4.74794340133667, + "learning_rate": 2.403220964053843e-07, + "loss": 1.6735, + "step": 81650 + }, + { + "epoch": 12.103703703703705, + "grad_norm": 5.673887252807617, + "learning_rate": 2.3641470816180623e-07, + "loss": 1.6518, + "step": 81700 + }, + { + "epoch": 12.11111111111111, + "grad_norm": 7.255314826965332, + "learning_rate": 2.3253896586476145e-07, + "loss": 1.6672, + "step": 81750 + }, + { + "epoch": 12.118518518518519, + "grad_norm": 4.749680995941162, + "learning_rate": 2.2869488207638902e-07, + "loss": 1.6897, + "step": 81800 + }, + { + "epoch": 12.125925925925927, + "grad_norm": 6.6680707931518555, + "learning_rate": 2.2488246925621682e-07, + "loss": 1.6805, + "step": 81850 + }, + { + "epoch": 12.133333333333333, + "grad_norm": 5.097541332244873, + "learning_rate": 2.211017397611237e-07, + "loss": 1.7271, + "step": 81900 + }, + { + "epoch": 12.14074074074074, + "grad_norm": 4.618227005004883, + "learning_rate": 2.1735270584529067e-07, + "loss": 1.6749, + "step": 81950 + }, + { + "epoch": 12.148148148148149, + "grad_norm": 4.820761203765869, + "learning_rate": 2.136353796601698e-07, + "loss": 1.6567, + "step": 82000 + }, + { + "epoch": 12.155555555555555, + "grad_norm": 5.2633843421936035, + "learning_rate": 2.0994977325443866e-07, + "loss": 1.6774, + "step": 82050 + }, + { + "epoch": 12.162962962962963, + "grad_norm": 4.625190734863281, + "learning_rate": 2.0629589857396714e-07, + "loss": 1.6732, + "step": 82100 + }, + { + "epoch": 12.170370370370371, + "grad_norm": 4.52297830581665, + "learning_rate": 2.026737674617729e-07, + "loss": 1.6959, + "step": 82150 + }, + { + "epoch": 12.177777777777777, + "grad_norm": 5.830942153930664, + "learning_rate": 1.990833916579882e-07, + "loss": 1.6977, + "step": 82200 + }, + { + "epoch": 12.185185185185185, + "grad_norm": 5.219923973083496, + "learning_rate": 1.9552478279981523e-07, + "loss": 1.7054, + "step": 82250 + }, + { + "epoch": 12.192592592592593, + "grad_norm": 4.906498432159424, + "learning_rate": 1.9199795242149543e-07, + "loss": 1.6458, + "step": 82300 + }, + { + "epoch": 12.2, + "grad_norm": 4.760297775268555, + "learning_rate": 1.8850291195427028e-07, + "loss": 1.6583, + "step": 82350 + }, + { + "epoch": 12.207407407407407, + "grad_norm": 5.414527416229248, + "learning_rate": 1.850396727263415e-07, + "loss": 1.7202, + "step": 82400 + }, + { + "epoch": 12.214814814814815, + "grad_norm": 7.676029205322266, + "learning_rate": 1.8160824596283432e-07, + "loss": 1.6914, + "step": 82450 + }, + { + "epoch": 12.222222222222221, + "grad_norm": 5.5357842445373535, + "learning_rate": 1.7820864278576544e-07, + "loss": 1.6447, + "step": 82500 + }, + { + "epoch": 12.22962962962963, + "grad_norm": 4.6864776611328125, + "learning_rate": 1.7484087421400175e-07, + "loss": 1.7594, + "step": 82550 + }, + { + "epoch": 12.237037037037037, + "grad_norm": 5.506605625152588, + "learning_rate": 1.7150495116323162e-07, + "loss": 1.6862, + "step": 82600 + }, + { + "epoch": 12.244444444444444, + "grad_norm": 5.496425628662109, + "learning_rate": 1.682008844459182e-07, + "loss": 1.7077, + "step": 82650 + }, + { + "epoch": 12.251851851851852, + "grad_norm": 4.794198989868164, + "learning_rate": 1.6492868477127832e-07, + "loss": 1.6838, + "step": 82700 + }, + { + "epoch": 12.25925925925926, + "grad_norm": 5.178542137145996, + "learning_rate": 1.616883627452348e-07, + "loss": 1.6919, + "step": 82750 + }, + { + "epoch": 12.266666666666667, + "grad_norm": 4.447923183441162, + "learning_rate": 1.5847992887039086e-07, + "loss": 1.6294, + "step": 82800 + }, + { + "epoch": 12.274074074074074, + "grad_norm": 5.297534942626953, + "learning_rate": 1.5530339354599354e-07, + "loss": 1.7025, + "step": 82850 + }, + { + "epoch": 12.281481481481482, + "grad_norm": 4.665591716766357, + "learning_rate": 1.52158767067897e-07, + "loss": 1.636, + "step": 82900 + }, + { + "epoch": 12.28888888888889, + "grad_norm": 5.7167253494262695, + "learning_rate": 1.490460596285348e-07, + "loss": 1.7184, + "step": 82950 + }, + { + "epoch": 12.296296296296296, + "grad_norm": 5.026512145996094, + "learning_rate": 1.4596528131688327e-07, + "loss": 1.6492, + "step": 83000 + }, + { + "epoch": 12.303703703703704, + "grad_norm": 5.1102986335754395, + "learning_rate": 1.4291644211842703e-07, + "loss": 1.7015, + "step": 83050 + }, + { + "epoch": 12.311111111111112, + "grad_norm": 6.56468391418457, + "learning_rate": 1.398995519151314e-07, + "loss": 1.6735, + "step": 83100 + }, + { + "epoch": 12.318518518518518, + "grad_norm": 5.295648097991943, + "learning_rate": 1.369146204854055e-07, + "loss": 1.6439, + "step": 83150 + }, + { + "epoch": 12.325925925925926, + "grad_norm": 4.968232154846191, + "learning_rate": 1.3396165750407698e-07, + "loss": 1.7418, + "step": 83200 + }, + { + "epoch": 12.333333333333334, + "grad_norm": 5.407866954803467, + "learning_rate": 1.3104067254235188e-07, + "loss": 1.6915, + "step": 83250 + }, + { + "epoch": 12.34074074074074, + "grad_norm": 4.8211588859558105, + "learning_rate": 1.2815167506779136e-07, + "loss": 1.6711, + "step": 83300 + }, + { + "epoch": 12.348148148148148, + "grad_norm": 5.250539779663086, + "learning_rate": 1.2529467444427846e-07, + "loss": 1.6842, + "step": 83350 + }, + { + "epoch": 12.355555555555556, + "grad_norm": 4.547904968261719, + "learning_rate": 1.2246967993198467e-07, + "loss": 1.684, + "step": 83400 + }, + { + "epoch": 12.362962962962962, + "grad_norm": 4.18063497543335, + "learning_rate": 1.196767006873445e-07, + "loss": 1.6088, + "step": 83450 + }, + { + "epoch": 12.37037037037037, + "grad_norm": 4.705265522003174, + "learning_rate": 1.1691574576302323e-07, + "loss": 1.7407, + "step": 83500 + }, + { + "epoch": 12.377777777777778, + "grad_norm": 5.691997051239014, + "learning_rate": 1.1418682410788917e-07, + "loss": 1.6649, + "step": 83550 + }, + { + "epoch": 12.385185185185184, + "grad_norm": 4.024935722351074, + "learning_rate": 1.1148994456698258e-07, + "loss": 1.6379, + "step": 83600 + }, + { + "epoch": 12.392592592592592, + "grad_norm": 4.250906467437744, + "learning_rate": 1.0882511588148792e-07, + "loss": 1.6544, + "step": 83650 + }, + { + "epoch": 12.4, + "grad_norm": 4.258973121643066, + "learning_rate": 1.0619234668870826e-07, + "loss": 1.6382, + "step": 83700 + }, + { + "epoch": 12.407407407407407, + "grad_norm": 4.957441329956055, + "learning_rate": 1.0359164552202982e-07, + "loss": 1.6946, + "step": 83750 + }, + { + "epoch": 12.414814814814815, + "grad_norm": 5.518395900726318, + "learning_rate": 1.0102302081090531e-07, + "loss": 1.7093, + "step": 83800 + }, + { + "epoch": 12.422222222222222, + "grad_norm": 5.175966739654541, + "learning_rate": 9.848648088081502e-08, + "loss": 1.7392, + "step": 83850 + }, + { + "epoch": 12.42962962962963, + "grad_norm": 4.846656322479248, + "learning_rate": 9.598203395324912e-08, + "loss": 1.752, + "step": 83900 + }, + { + "epoch": 12.437037037037037, + "grad_norm": 4.2871551513671875, + "learning_rate": 9.350968814567429e-08, + "loss": 1.6824, + "step": 83950 + }, + { + "epoch": 12.444444444444445, + "grad_norm": 4.269059181213379, + "learning_rate": 9.106945147151381e-08, + "loss": 1.6696, + "step": 84000 + }, + { + "epoch": 12.451851851851853, + "grad_norm": 4.492150783538818, + "learning_rate": 8.866133184011527e-08, + "loss": 1.6966, + "step": 84050 + }, + { + "epoch": 12.459259259259259, + "grad_norm": 5.761516094207764, + "learning_rate": 8.62853370567296e-08, + "loss": 1.6358, + "step": 84100 + }, + { + "epoch": 12.466666666666667, + "grad_norm": 5.793025493621826, + "learning_rate": 8.394147482248205e-08, + "loss": 1.692, + "step": 84150 + }, + { + "epoch": 12.474074074074075, + "grad_norm": 5.595602035522461, + "learning_rate": 8.162975273435014e-08, + "loss": 1.6476, + "step": 84200 + }, + { + "epoch": 12.481481481481481, + "grad_norm": 5.4267659187316895, + "learning_rate": 7.935017828513914e-08, + "loss": 1.7016, + "step": 84250 + }, + { + "epoch": 12.488888888888889, + "grad_norm": 4.711421966552734, + "learning_rate": 7.710275886345542e-08, + "loss": 1.6854, + "step": 84300 + }, + { + "epoch": 12.496296296296297, + "grad_norm": 5.19965124130249, + "learning_rate": 7.48875017536832e-08, + "loss": 1.6466, + "step": 84350 + }, + { + "epoch": 12.503703703703703, + "grad_norm": 4.186498641967773, + "learning_rate": 7.27044141359634e-08, + "loss": 1.7562, + "step": 84400 + }, + { + "epoch": 12.511111111111111, + "grad_norm": 5.019369602203369, + "learning_rate": 7.055350308616704e-08, + "loss": 1.6783, + "step": 84450 + }, + { + "epoch": 12.518518518518519, + "grad_norm": 5.182914733886719, + "learning_rate": 6.843477557587408e-08, + "loss": 1.7501, + "step": 84500 + }, + { + "epoch": 12.525925925925925, + "grad_norm": 4.619785785675049, + "learning_rate": 6.634823847235017e-08, + "loss": 1.6467, + "step": 84550 + }, + { + "epoch": 12.533333333333333, + "grad_norm": 4.456223011016846, + "learning_rate": 6.429389853852553e-08, + "loss": 1.71, + "step": 84600 + }, + { + "epoch": 12.540740740740741, + "grad_norm": 7.912137985229492, + "learning_rate": 6.227176243297272e-08, + "loss": 1.8024, + "step": 84650 + }, + { + "epoch": 12.548148148148147, + "grad_norm": 4.187272548675537, + "learning_rate": 6.02818367098812e-08, + "loss": 1.6879, + "step": 84700 + }, + { + "epoch": 12.555555555555555, + "grad_norm": 5.394083023071289, + "learning_rate": 5.832412781904384e-08, + "loss": 1.6566, + "step": 84750 + }, + { + "epoch": 12.562962962962963, + "grad_norm": 5.078476905822754, + "learning_rate": 5.639864210582935e-08, + "loss": 1.7164, + "step": 84800 + }, + { + "epoch": 12.57037037037037, + "grad_norm": 5.757031440734863, + "learning_rate": 5.450538581116327e-08, + "loss": 1.6773, + "step": 84850 + }, + { + "epoch": 12.577777777777778, + "grad_norm": 5.4857869148254395, + "learning_rate": 5.264436507150916e-08, + "loss": 1.7081, + "step": 84900 + }, + { + "epoch": 12.585185185185185, + "grad_norm": 5.091628074645996, + "learning_rate": 5.081558591884972e-08, + "loss": 1.6757, + "step": 84950 + }, + { + "epoch": 12.592592592592592, + "grad_norm": 5.476707458496094, + "learning_rate": 4.901905428066345e-08, + "loss": 1.6907, + "step": 85000 + }, + { + "epoch": 12.6, + "grad_norm": 5.704261779785156, + "learning_rate": 4.7254775979906906e-08, + "loss": 1.6711, + "step": 85050 + }, + { + "epoch": 12.607407407407408, + "grad_norm": 5.539083003997803, + "learning_rate": 4.5522756734999173e-08, + "loss": 1.708, + "step": 85100 + }, + { + "epoch": 12.614814814814816, + "grad_norm": 5.648597240447998, + "learning_rate": 4.382300215979851e-08, + "loss": 1.6932, + "step": 85150 + }, + { + "epoch": 12.622222222222222, + "grad_norm": 6.897541522979736, + "learning_rate": 4.215551776358573e-08, + "loss": 1.6701, + "step": 85200 + }, + { + "epoch": 12.62962962962963, + "grad_norm": 5.8567938804626465, + "learning_rate": 4.052030895104864e-08, + "loss": 1.6799, + "step": 85250 + }, + { + "epoch": 12.637037037037038, + "grad_norm": 4.133502006530762, + "learning_rate": 3.891738102226206e-08, + "loss": 1.7131, + "step": 85300 + }, + { + "epoch": 12.644444444444444, + "grad_norm": 5.222926616668701, + "learning_rate": 3.7346739172671177e-08, + "loss": 1.6594, + "step": 85350 + }, + { + "epoch": 12.651851851851852, + "grad_norm": 5.704329967498779, + "learning_rate": 3.5808388493072665e-08, + "loss": 1.692, + "step": 85400 + }, + { + "epoch": 12.65925925925926, + "grad_norm": 4.969203472137451, + "learning_rate": 3.43023339696058e-08, + "loss": 1.7122, + "step": 85450 + }, + { + "epoch": 12.666666666666666, + "grad_norm": 4.0226731300354, + "learning_rate": 3.282858048372583e-08, + "loss": 1.6186, + "step": 85500 + }, + { + "epoch": 12.674074074074074, + "grad_norm": 5.419471740722656, + "learning_rate": 3.1387132812193963e-08, + "loss": 1.682, + "step": 85550 + }, + { + "epoch": 12.681481481481482, + "grad_norm": 5.011806488037109, + "learning_rate": 2.997799562706294e-08, + "loss": 1.6846, + "step": 85600 + }, + { + "epoch": 12.688888888888888, + "grad_norm": 4.759771347045898, + "learning_rate": 2.8601173495659274e-08, + "loss": 1.7007, + "step": 85650 + }, + { + "epoch": 12.696296296296296, + "grad_norm": 5.667296886444092, + "learning_rate": 2.7256670880568826e-08, + "loss": 1.6965, + "step": 85700 + }, + { + "epoch": 12.703703703703704, + "grad_norm": 5.863560676574707, + "learning_rate": 2.5944492139623467e-08, + "loss": 1.7107, + "step": 85750 + }, + { + "epoch": 12.71111111111111, + "grad_norm": 5.254322052001953, + "learning_rate": 2.4664641525884436e-08, + "loss": 1.6653, + "step": 85800 + }, + { + "epoch": 12.718518518518518, + "grad_norm": 5.583574295043945, + "learning_rate": 2.341712318763123e-08, + "loss": 1.6571, + "step": 85850 + }, + { + "epoch": 12.725925925925926, + "grad_norm": 5.015530586242676, + "learning_rate": 2.2201941168349393e-08, + "loss": 1.7041, + "step": 85900 + }, + { + "epoch": 12.733333333333333, + "grad_norm": 6.795839309692383, + "learning_rate": 2.1019099406712762e-08, + "loss": 1.6937, + "step": 85950 + }, + { + "epoch": 12.74074074074074, + "grad_norm": 4.828438758850098, + "learning_rate": 1.9868601736573458e-08, + "loss": 1.6692, + "step": 86000 + }, + { + "epoch": 12.748148148148148, + "grad_norm": 4.375940322875977, + "learning_rate": 1.8750451886950795e-08, + "loss": 1.743, + "step": 86050 + }, + { + "epoch": 12.755555555555556, + "grad_norm": 5.71312141418457, + "learning_rate": 1.7664653482015737e-08, + "loss": 1.7376, + "step": 86100 + }, + { + "epoch": 12.762962962962963, + "grad_norm": 4.118297100067139, + "learning_rate": 1.6611210041080906e-08, + "loss": 1.6836, + "step": 86150 + }, + { + "epoch": 12.77037037037037, + "grad_norm": 4.494056224822998, + "learning_rate": 1.5590124978592803e-08, + "loss": 1.6745, + "step": 86200 + }, + { + "epoch": 12.777777777777779, + "grad_norm": 5.0475993156433105, + "learning_rate": 1.460140160411294e-08, + "loss": 1.697, + "step": 86250 + }, + { + "epoch": 12.785185185185185, + "grad_norm": 5.805638790130615, + "learning_rate": 1.3645043122313407e-08, + "loss": 1.6773, + "step": 86300 + }, + { + "epoch": 12.792592592592593, + "grad_norm": 4.78032112121582, + "learning_rate": 1.2721052632964637e-08, + "loss": 1.5882, + "step": 86350 + }, + { + "epoch": 12.8, + "grad_norm": 5.578061103820801, + "learning_rate": 1.1829433130924328e-08, + "loss": 1.6681, + "step": 86400 + }, + { + "epoch": 12.807407407407407, + "grad_norm": 6.236872673034668, + "learning_rate": 1.097018750612966e-08, + "loss": 1.7019, + "step": 86450 + }, + { + "epoch": 12.814814814814815, + "grad_norm": 5.2029571533203125, + "learning_rate": 1.0143318543585079e-08, + "loss": 1.7867, + "step": 86500 + }, + { + "epoch": 12.822222222222223, + "grad_norm": 4.312541961669922, + "learning_rate": 9.348828923358977e-09, + "loss": 1.7128, + "step": 86550 + }, + { + "epoch": 12.829629629629629, + "grad_norm": 4.430631637573242, + "learning_rate": 8.586721220565918e-09, + "loss": 1.6753, + "step": 86600 + }, + { + "epoch": 12.837037037037037, + "grad_norm": 4.498734951019287, + "learning_rate": 7.856997905367758e-09, + "loss": 1.6709, + "step": 86650 + }, + { + "epoch": 12.844444444444445, + "grad_norm": 5.665780067443848, + "learning_rate": 7.159661342958091e-09, + "loss": 1.6619, + "step": 86700 + }, + { + "epoch": 12.851851851851851, + "grad_norm": 5.194167613983154, + "learning_rate": 6.494713793561147e-09, + "loss": 1.6525, + "step": 86750 + }, + { + "epoch": 12.85925925925926, + "grad_norm": 4.512106418609619, + "learning_rate": 5.862157412419578e-09, + "loss": 1.7062, + "step": 86800 + }, + { + "epoch": 12.866666666666667, + "grad_norm": 4.46773624420166, + "learning_rate": 5.261994249786684e-09, + "loss": 1.6675, + "step": 86850 + }, + { + "epoch": 12.874074074074073, + "grad_norm": 5.507213592529297, + "learning_rate": 4.694226250926415e-09, + "loss": 1.6859, + "step": 86900 + }, + { + "epoch": 12.881481481481481, + "grad_norm": 7.564279079437256, + "learning_rate": 4.158855256101157e-09, + "loss": 1.6731, + "step": 86950 + }, + { + "epoch": 12.88888888888889, + "grad_norm": 4.482030868530273, + "learning_rate": 3.655883000565075e-09, + "loss": 1.711, + "step": 87000 + }, + { + "epoch": 12.896296296296295, + "grad_norm": 4.927270412445068, + "learning_rate": 3.185311114565215e-09, + "loss": 1.6582, + "step": 87050 + }, + { + "epoch": 12.903703703703703, + "grad_norm": 5.129358291625977, + "learning_rate": 2.74714112332819e-09, + "loss": 1.6187, + "step": 87100 + }, + { + "epoch": 12.911111111111111, + "grad_norm": 4.1775102615356445, + "learning_rate": 2.341374447060174e-09, + "loss": 1.7252, + "step": 87150 + }, + { + "epoch": 12.918518518518518, + "grad_norm": 4.9707159996032715, + "learning_rate": 1.9680124009413548e-09, + "loss": 1.6923, + "step": 87200 + }, + { + "epoch": 12.925925925925926, + "grad_norm": 5.600582122802734, + "learning_rate": 1.62705619512038e-09, + "loss": 1.6414, + "step": 87250 + }, + { + "epoch": 12.933333333333334, + "grad_norm": 4.928083896636963, + "learning_rate": 1.3185069347121382e-09, + "loss": 1.7132, + "step": 87300 + }, + { + "epoch": 12.940740740740742, + "grad_norm": 4.8590264320373535, + "learning_rate": 1.0423656197944275e-09, + "loss": 1.6717, + "step": 87350 + }, + { + "epoch": 12.948148148148148, + "grad_norm": 4.67088508605957, + "learning_rate": 7.986331454012952e-10, + "loss": 1.6741, + "step": 87400 + }, + { + "epoch": 12.955555555555556, + "grad_norm": 5.243936061859131, + "learning_rate": 5.873103015241466e-10, + "loss": 1.6741, + "step": 87450 + }, + { + "epoch": 12.962962962962964, + "grad_norm": 5.689966678619385, + "learning_rate": 4.083977731073052e-10, + "loss": 1.6958, + "step": 87500 + }, + { + "epoch": 12.97037037037037, + "grad_norm": 5.250327110290527, + "learning_rate": 2.61896140045792e-10, + "loss": 1.6868, + "step": 87550 + }, + { + "epoch": 12.977777777777778, + "grad_norm": 4.910762310028076, + "learning_rate": 1.4780587718421503e-10, + "loss": 1.6528, + "step": 87600 + }, + { + "epoch": 12.985185185185186, + "grad_norm": 3.781712532043457, + "learning_rate": 6.612735431343886e-11, + "loss": 1.6715, + "step": 87650 + }, + { + "epoch": 12.992592592592592, + "grad_norm": 5.627158164978027, + "learning_rate": 1.6860836172805095e-11, + "loss": 1.6577, + "step": 87700 + }, + { + "epoch": 13.0, + "grad_norm": 4.692312717437744, + "learning_rate": 6.4824456913470394e-15, + "loss": 1.7212, + "step": 87750 + }, + { + "epoch": 13.007407407407408, + "grad_norm": 4.713183879852295, + "learning_rate": 2.497337918370513e-07, + "loss": 1.7233, + "step": 87800 + }, + { + "epoch": 13.014814814814814, + "grad_norm": 4.382723331451416, + "learning_rate": 2.460361831871905e-07, + "loss": 1.6443, + "step": 87850 + }, + { + "epoch": 13.022222222222222, + "grad_norm": 5.862541675567627, + "learning_rate": 2.423658119020356e-07, + "loss": 1.7126, + "step": 87900 + }, + { + "epoch": 13.02962962962963, + "grad_norm": 4.680122375488281, + "learning_rate": 2.3872268823087907e-07, + "loss": 1.661, + "step": 87950 + }, + { + "epoch": 13.037037037037036, + "grad_norm": 5.052659511566162, + "learning_rate": 2.3510682234692773e-07, + "loss": 1.6396, + "step": 88000 + }, + { + "epoch": 13.044444444444444, + "grad_norm": 5.200165271759033, + "learning_rate": 2.3151822434727246e-07, + "loss": 1.6467, + "step": 88050 + }, + { + "epoch": 13.051851851851852, + "grad_norm": 6.37606143951416, + "learning_rate": 2.2795690425286065e-07, + "loss": 1.6887, + "step": 88100 + }, + { + "epoch": 13.059259259259258, + "grad_norm": 4.551187515258789, + "learning_rate": 2.2442287200846512e-07, + "loss": 1.6731, + "step": 88150 + }, + { + "epoch": 13.066666666666666, + "grad_norm": 4.3722052574157715, + "learning_rate": 2.2091613748266183e-07, + "loss": 1.7147, + "step": 88200 + }, + { + "epoch": 13.074074074074074, + "grad_norm": 4.7190775871276855, + "learning_rate": 2.174367104677999e-07, + "loss": 1.7001, + "step": 88250 + }, + { + "epoch": 13.081481481481482, + "grad_norm": 5.733537197113037, + "learning_rate": 2.1398460067997174e-07, + "loss": 1.6306, + "step": 88300 + }, + { + "epoch": 13.088888888888889, + "grad_norm": 4.682885646820068, + "learning_rate": 2.105598177589896e-07, + "loss": 1.7563, + "step": 88350 + }, + { + "epoch": 13.096296296296297, + "grad_norm": 5.364824295043945, + "learning_rate": 2.071623712683557e-07, + "loss": 1.7217, + "step": 88400 + }, + { + "epoch": 13.103703703703705, + "grad_norm": 5.048696517944336, + "learning_rate": 2.0379227069523776e-07, + "loss": 1.6839, + "step": 88450 + }, + { + "epoch": 13.11111111111111, + "grad_norm": 4.838836669921875, + "learning_rate": 2.0044952545044238e-07, + "loss": 1.7394, + "step": 88500 + }, + { + "epoch": 13.118518518518519, + "grad_norm": 4.3506340980529785, + "learning_rate": 1.97134144868385e-07, + "loss": 1.6381, + "step": 88550 + }, + { + "epoch": 13.125925925925927, + "grad_norm": 4.708609104156494, + "learning_rate": 1.9384613820707e-07, + "loss": 1.6736, + "step": 88600 + }, + { + "epoch": 13.133333333333333, + "grad_norm": 5.755831241607666, + "learning_rate": 1.9058551464806175e-07, + "loss": 1.7046, + "step": 88650 + }, + { + "epoch": 13.14074074074074, + "grad_norm": 5.2482686042785645, + "learning_rate": 1.8735228329645805e-07, + "loss": 1.6601, + "step": 88700 + }, + { + "epoch": 13.148148148148149, + "grad_norm": 5.522831916809082, + "learning_rate": 1.8414645318086344e-07, + "loss": 1.7117, + "step": 88750 + }, + { + "epoch": 13.155555555555555, + "grad_norm": 6.40302038192749, + "learning_rate": 1.809680332533692e-07, + "loss": 1.7129, + "step": 88800 + }, + { + "epoch": 13.162962962962963, + "grad_norm": 5.780543804168701, + "learning_rate": 1.7781703238952564e-07, + "loss": 1.7132, + "step": 88850 + }, + { + "epoch": 13.170370370370371, + "grad_norm": 4.71435022354126, + "learning_rate": 1.7469345938831316e-07, + "loss": 1.7371, + "step": 88900 + }, + { + "epoch": 13.177777777777777, + "grad_norm": 6.810309886932373, + "learning_rate": 1.715973229721246e-07, + "loss": 1.6313, + "step": 88950 + }, + { + "epoch": 13.185185185185185, + "grad_norm": 5.402350902557373, + "learning_rate": 1.685286317867374e-07, + "loss": 1.6334, + "step": 89000 + }, + { + "epoch": 13.192592592592593, + "grad_norm": 4.750962257385254, + "learning_rate": 1.6548739440129025e-07, + "loss": 1.7537, + "step": 89050 + }, + { + "epoch": 13.2, + "grad_norm": 5.475610256195068, + "learning_rate": 1.6247361930825546e-07, + "loss": 1.6997, + "step": 89100 + }, + { + "epoch": 13.207407407407407, + "grad_norm": 4.554515838623047, + "learning_rate": 1.5948731492342328e-07, + "loss": 1.6814, + "step": 89150 + }, + { + "epoch": 13.214814814814815, + "grad_norm": 4.694421768188477, + "learning_rate": 1.5652848958587097e-07, + "loss": 1.657, + "step": 89200 + }, + { + "epoch": 13.222222222222221, + "grad_norm": 5.7868547439575195, + "learning_rate": 1.5359715155794374e-07, + "loss": 1.7237, + "step": 89250 + }, + { + "epoch": 13.22962962962963, + "grad_norm": 4.124547958374023, + "learning_rate": 1.5069330902522495e-07, + "loss": 1.5919, + "step": 89300 + }, + { + "epoch": 13.237037037037037, + "grad_norm": 4.9132208824157715, + "learning_rate": 1.4781697009652595e-07, + "loss": 1.678, + "step": 89350 + }, + { + "epoch": 13.244444444444444, + "grad_norm": 5.06511926651001, + "learning_rate": 1.4496814280385297e-07, + "loss": 1.7352, + "step": 89400 + }, + { + "epoch": 13.251851851851852, + "grad_norm": 4.450133323669434, + "learning_rate": 1.421468351023836e-07, + "loss": 1.6572, + "step": 89450 + }, + { + "epoch": 13.25925925925926, + "grad_norm": 4.740837097167969, + "learning_rate": 1.393530548704536e-07, + "loss": 1.6888, + "step": 89500 + }, + { + "epoch": 13.266666666666667, + "grad_norm": 5.064001083374023, + "learning_rate": 1.3658680990953021e-07, + "loss": 1.713, + "step": 89550 + }, + { + "epoch": 13.274074074074074, + "grad_norm": 5.2124481201171875, + "learning_rate": 1.3384810794418669e-07, + "loss": 1.6592, + "step": 89600 + }, + { + "epoch": 13.281481481481482, + "grad_norm": 3.9894776344299316, + "learning_rate": 1.3113695662208436e-07, + "loss": 1.6943, + "step": 89650 + }, + { + "epoch": 13.28888888888889, + "grad_norm": 5.926461219787598, + "learning_rate": 1.2845336351395398e-07, + "loss": 1.718, + "step": 89700 + }, + { + "epoch": 13.296296296296296, + "grad_norm": 4.480943202972412, + "learning_rate": 1.2579733611357004e-07, + "loss": 1.6878, + "step": 89750 + }, + { + "epoch": 13.303703703703704, + "grad_norm": 5.713892936706543, + "learning_rate": 1.231688818377297e-07, + "loss": 1.6483, + "step": 89800 + }, + { + "epoch": 13.311111111111112, + "grad_norm": 5.857698440551758, + "learning_rate": 1.2056800802623724e-07, + "loss": 1.6905, + "step": 89850 + }, + { + "epoch": 13.318518518518518, + "grad_norm": 4.647960662841797, + "learning_rate": 1.1799472194187755e-07, + "loss": 1.6675, + "step": 89900 + }, + { + "epoch": 13.325925925925926, + "grad_norm": 6.600680828094482, + "learning_rate": 1.1544903077039926e-07, + "loss": 1.736, + "step": 89950 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 5.771788120269775, + "learning_rate": 1.1293094162049378e-07, + "loss": 1.6509, + "step": 90000 + }, + { + "epoch": 13.34074074074074, + "grad_norm": 4.439211845397949, + "learning_rate": 1.104404615237753e-07, + "loss": 1.7102, + "step": 90050 + }, + { + "epoch": 13.348148148148148, + "grad_norm": 5.149127960205078, + "learning_rate": 1.0797759743476299e-07, + "loss": 1.6438, + "step": 90100 + }, + { + "epoch": 13.355555555555556, + "grad_norm": 4.4529876708984375, + "learning_rate": 1.0554235623085662e-07, + "loss": 1.705, + "step": 90150 + }, + { + "epoch": 13.362962962962962, + "grad_norm": 5.989099979400635, + "learning_rate": 1.0313474471232321e-07, + "loss": 1.6944, + "step": 90200 + }, + { + "epoch": 13.37037037037037, + "grad_norm": 5.291862487792969, + "learning_rate": 1.0075476960227481e-07, + "loss": 1.6965, + "step": 90250 + }, + { + "epoch": 13.377777777777778, + "grad_norm": 5.628286838531494, + "learning_rate": 9.84024375466508e-08, + "loss": 1.691, + "step": 90300 + }, + { + "epoch": 13.385185185185184, + "grad_norm": 4.2257914543151855, + "learning_rate": 9.607775511420003e-08, + "loss": 1.6602, + "step": 90350 + }, + { + "epoch": 13.392592592592592, + "grad_norm": 5.1985087394714355, + "learning_rate": 9.37807287964576e-08, + "loss": 1.6339, + "step": 90400 + }, + { + "epoch": 13.4, + "grad_norm": 5.6320366859436035, + "learning_rate": 9.151136500773594e-08, + "loss": 1.7014, + "step": 90450 + }, + { + "epoch": 13.407407407407407, + "grad_norm": 4.394415378570557, + "learning_rate": 8.92696700850959e-08, + "loss": 1.6796, + "step": 90500 + }, + { + "epoch": 13.414814814814815, + "grad_norm": 5.783812046051025, + "learning_rate": 8.705565028833906e-08, + "loss": 1.7261, + "step": 90550 + }, + { + "epoch": 13.422222222222222, + "grad_norm": 5.535831451416016, + "learning_rate": 8.486931179998325e-08, + "loss": 1.6693, + "step": 90600 + }, + { + "epoch": 13.42962962962963, + "grad_norm": 5.076855659484863, + "learning_rate": 8.271066072525036e-08, + "loss": 1.6616, + "step": 90650 + }, + { + "epoch": 13.437037037037037, + "grad_norm": 4.915444850921631, + "learning_rate": 8.0579703092043e-08, + "loss": 1.7366, + "step": 90700 + }, + { + "epoch": 13.444444444444445, + "grad_norm": 4.743108749389648, + "learning_rate": 7.847644485093675e-08, + "loss": 1.651, + "step": 90750 + }, + { + "epoch": 13.451851851851853, + "grad_norm": 3.879176616668701, + "learning_rate": 7.640089187515465e-08, + "loss": 1.6693, + "step": 90800 + }, + { + "epoch": 13.459259259259259, + "grad_norm": 5.211658954620361, + "learning_rate": 7.435304996055271e-08, + "loss": 1.6643, + "step": 90850 + }, + { + "epoch": 13.466666666666667, + "grad_norm": 5.480631351470947, + "learning_rate": 7.233292482560883e-08, + "loss": 1.687, + "step": 90900 + }, + { + "epoch": 13.474074074074075, + "grad_norm": 5.0310797691345215, + "learning_rate": 7.034052211140396e-08, + "loss": 1.7151, + "step": 90950 + }, + { + "epoch": 13.481481481481481, + "grad_norm": 4.684755325317383, + "learning_rate": 6.83758473816043e-08, + "loss": 1.6381, + "step": 91000 + }, + { + "epoch": 13.488888888888889, + "grad_norm": 5.320138454437256, + "learning_rate": 6.643890612244574e-08, + "loss": 1.6996, + "step": 91050 + }, + { + "epoch": 13.496296296296297, + "grad_norm": 4.682376861572266, + "learning_rate": 6.452970374272283e-08, + "loss": 1.7423, + "step": 91100 + }, + { + "epoch": 13.503703703703703, + "grad_norm": 5.152847766876221, + "learning_rate": 6.26482455737698e-08, + "loss": 1.6966, + "step": 91150 + }, + { + "epoch": 13.511111111111111, + "grad_norm": 6.191291809082031, + "learning_rate": 6.079453686944514e-08, + "loss": 1.6915, + "step": 91200 + }, + { + "epoch": 13.518518518518519, + "grad_norm": 4.434044361114502, + "learning_rate": 5.896858280612261e-08, + "loss": 1.6644, + "step": 91250 + }, + { + "epoch": 13.525925925925925, + "grad_norm": 4.534839630126953, + "learning_rate": 5.7170388482670204e-08, + "loss": 1.7303, + "step": 91300 + }, + { + "epoch": 13.533333333333333, + "grad_norm": 4.041924953460693, + "learning_rate": 5.539995892043793e-08, + "loss": 1.6277, + "step": 91350 + }, + { + "epoch": 13.540740740740741, + "grad_norm": 5.528424263000488, + "learning_rate": 5.3657299063244464e-08, + "loss": 1.6396, + "step": 91400 + }, + { + "epoch": 13.548148148148147, + "grad_norm": 4.683634281158447, + "learning_rate": 5.194241377736609e-08, + "loss": 1.6793, + "step": 91450 + }, + { + "epoch": 13.555555555555555, + "grad_norm": 5.272689342498779, + "learning_rate": 5.025530785151778e-08, + "loss": 1.6472, + "step": 91500 + }, + { + "epoch": 13.562962962962963, + "grad_norm": 5.290887832641602, + "learning_rate": 4.859598599684101e-08, + "loss": 1.7151, + "step": 91550 + }, + { + "epoch": 13.57037037037037, + "grad_norm": 4.272365570068359, + "learning_rate": 4.696445284689377e-08, + "loss": 1.7064, + "step": 91600 + }, + { + "epoch": 13.577777777777778, + "grad_norm": 4.948604106903076, + "learning_rate": 4.536071295763722e-08, + "loss": 1.6692, + "step": 91650 + }, + { + "epoch": 13.585185185185185, + "grad_norm": 4.477553367614746, + "learning_rate": 4.378477080741794e-08, + "loss": 1.6854, + "step": 91700 + }, + { + "epoch": 13.592592592592592, + "grad_norm": 5.140685558319092, + "learning_rate": 4.2236630796961274e-08, + "loss": 1.6542, + "step": 91750 + }, + { + "epoch": 13.6, + "grad_norm": 4.887373924255371, + "learning_rate": 4.0716297249357994e-08, + "loss": 1.7003, + "step": 91800 + }, + { + "epoch": 13.607407407407408, + "grad_norm": 5.651754856109619, + "learning_rate": 3.922377441004655e-08, + "loss": 1.6769, + "step": 91850 + }, + { + "epoch": 13.614814814814816, + "grad_norm": 4.647298336029053, + "learning_rate": 3.775906644680971e-08, + "loss": 1.6149, + "step": 91900 + }, + { + "epoch": 13.622222222222222, + "grad_norm": 4.287783622741699, + "learning_rate": 3.6322177449757964e-08, + "loss": 1.7111, + "step": 91950 + }, + { + "epoch": 13.62962962962963, + "grad_norm": 4.785926342010498, + "learning_rate": 3.491311143131726e-08, + "loss": 1.624, + "step": 92000 + }, + { + "epoch": 13.637037037037038, + "grad_norm": 4.703386306762695, + "learning_rate": 3.353187232622124e-08, + "loss": 1.7183, + "step": 92050 + }, + { + "epoch": 13.644444444444444, + "grad_norm": 4.661455154418945, + "learning_rate": 3.2178463991497934e-08, + "loss": 1.693, + "step": 92100 + }, + { + "epoch": 13.651851851851852, + "grad_norm": 5.412303924560547, + "learning_rate": 3.085289020646087e-08, + "loss": 1.7245, + "step": 92150 + }, + { + "epoch": 13.65925925925926, + "grad_norm": 4.2518205642700195, + "learning_rate": 2.9555154672694652e-08, + "loss": 1.6777, + "step": 92200 + }, + { + "epoch": 13.666666666666666, + "grad_norm": 4.5041351318359375, + "learning_rate": 2.8285261014050492e-08, + "loss": 1.6368, + "step": 92250 + }, + { + "epoch": 13.674074074074074, + "grad_norm": 5.045154571533203, + "learning_rate": 2.704321277662847e-08, + "loss": 1.6564, + "step": 92300 + }, + { + "epoch": 13.681481481481482, + "grad_norm": 4.422163486480713, + "learning_rate": 2.5829013428776416e-08, + "loss": 1.6091, + "step": 92350 + }, + { + "epoch": 13.688888888888888, + "grad_norm": 6.019820213317871, + "learning_rate": 2.4642666361073265e-08, + "loss": 1.7035, + "step": 92400 + }, + { + "epoch": 13.696296296296296, + "grad_norm": 5.835787773132324, + "learning_rate": 2.3484174886322375e-08, + "loss": 1.6213, + "step": 92450 + }, + { + "epoch": 13.703703703703704, + "grad_norm": 5.5061798095703125, + "learning_rate": 2.235354223954378e-08, + "loss": 1.6693, + "step": 92500 + }, + { + "epoch": 13.71111111111111, + "grad_norm": 5.730589866638184, + "learning_rate": 2.125077157796085e-08, + "loss": 1.7064, + "step": 92550 + }, + { + "epoch": 13.718518518518518, + "grad_norm": 4.635679244995117, + "learning_rate": 2.0175865980995858e-08, + "loss": 1.6796, + "step": 92600 + }, + { + "epoch": 13.725925925925926, + "grad_norm": 6.5054121017456055, + "learning_rate": 1.9128828450257764e-08, + "loss": 1.6115, + "step": 92650 + }, + { + "epoch": 13.733333333333333, + "grad_norm": 5.930258274078369, + "learning_rate": 1.8109661909537777e-08, + "loss": 1.6923, + "step": 92700 + }, + { + "epoch": 13.74074074074074, + "grad_norm": 6.16044807434082, + "learning_rate": 1.7118369204797148e-08, + "loss": 1.7119, + "step": 92750 + }, + { + "epoch": 13.748148148148148, + "grad_norm": 5.259925365447998, + "learning_rate": 1.6154953104161597e-08, + "loss": 1.6455, + "step": 92800 + }, + { + "epoch": 13.755555555555556, + "grad_norm": 4.732719898223877, + "learning_rate": 1.521941629791468e-08, + "loss": 1.6123, + "step": 92850 + }, + { + "epoch": 13.762962962962963, + "grad_norm": 3.6326496601104736, + "learning_rate": 1.4311761398486668e-08, + "loss": 1.6245, + "step": 92900 + }, + { + "epoch": 13.77037037037037, + "grad_norm": 4.96213960647583, + "learning_rate": 1.3431990940450113e-08, + "loss": 1.6823, + "step": 92950 + }, + { + "epoch": 13.777777777777779, + "grad_norm": 6.506005764007568, + "learning_rate": 1.2580107380512075e-08, + "loss": 1.6541, + "step": 93000 + }, + { + "epoch": 13.785185185185185, + "grad_norm": 5.4200921058654785, + "learning_rate": 1.1756113097506349e-08, + "loss": 1.7077, + "step": 93050 + }, + { + "epoch": 13.792592592592593, + "grad_norm": 4.497105598449707, + "learning_rate": 1.096001039238681e-08, + "loss": 1.6276, + "step": 93100 + }, + { + "epoch": 13.8, + "grad_norm": 4.131155967712402, + "learning_rate": 1.0191801488225184e-08, + "loss": 1.7466, + "step": 93150 + }, + { + "epoch": 13.807407407407407, + "grad_norm": 4.873898029327393, + "learning_rate": 9.451488530198837e-09, + "loss": 1.682, + "step": 93200 + }, + { + "epoch": 13.814814814814815, + "grad_norm": 5.858630657196045, + "learning_rate": 8.739073585586345e-09, + "loss": 1.745, + "step": 93250 + }, + { + "epoch": 13.822222222222223, + "grad_norm": 4.94985294342041, + "learning_rate": 8.054558643765253e-09, + "loss": 1.7271, + "step": 93300 + }, + { + "epoch": 13.829629629629629, + "grad_norm": 4.77255916595459, + "learning_rate": 7.397945616202107e-09, + "loss": 1.6015, + "step": 93350 + }, + { + "epoch": 13.837037037037037, + "grad_norm": 4.5871992111206055, + "learning_rate": 6.7692363364513235e-09, + "loss": 1.6554, + "step": 93400 + }, + { + "epoch": 13.844444444444445, + "grad_norm": 5.18468713760376, + "learning_rate": 6.1684325601441e-09, + "loss": 1.7069, + "step": 93450 + }, + { + "epoch": 13.851851851851851, + "grad_norm": 4.543237209320068, + "learning_rate": 5.595535964990628e-09, + "loss": 1.6495, + "step": 93500 + }, + { + "epoch": 13.85925925925926, + "grad_norm": 6.7150983810424805, + "learning_rate": 5.050548150771218e-09, + "loss": 1.6884, + "step": 93550 + }, + { + "epoch": 13.866666666666667, + "grad_norm": 4.884583950042725, + "learning_rate": 4.533470639330739e-09, + "loss": 1.735, + "step": 93600 + }, + { + "epoch": 13.874074074074073, + "grad_norm": 4.831175327301025, + "learning_rate": 4.0443048745775205e-09, + "loss": 1.6228, + "step": 93650 + }, + { + "epoch": 13.881481481481481, + "grad_norm": 4.210744857788086, + "learning_rate": 3.5830522224777897e-09, + "loss": 1.6364, + "step": 93700 + }, + { + "epoch": 13.88888888888889, + "grad_norm": 6.04276180267334, + "learning_rate": 3.1497139710534587e-09, + "loss": 1.6024, + "step": 93750 + }, + { + "epoch": 13.896296296296295, + "grad_norm": 4.816607475280762, + "learning_rate": 2.744291330375459e-09, + "loss": 1.6544, + "step": 93800 + }, + { + "epoch": 13.903703703703703, + "grad_norm": 5.6914286613464355, + "learning_rate": 2.3667854325604143e-09, + "loss": 1.6407, + "step": 93850 + }, + { + "epoch": 13.911111111111111, + "grad_norm": 4.977962970733643, + "learning_rate": 2.0171973317739677e-09, + "loss": 1.6743, + "step": 93900 + }, + { + "epoch": 13.918518518518518, + "grad_norm": 6.445103645324707, + "learning_rate": 1.6955280042185717e-09, + "loss": 1.707, + "step": 93950 + }, + { + "epoch": 13.925925925925926, + "grad_norm": 7.13929557800293, + "learning_rate": 1.4017783481379277e-09, + "loss": 1.7058, + "step": 94000 + }, + { + "epoch": 13.933333333333334, + "grad_norm": 3.9623467922210693, + "learning_rate": 1.1359491838081049e-09, + "loss": 1.5874, + "step": 94050 + }, + { + "epoch": 13.940740740740742, + "grad_norm": 4.890029430389404, + "learning_rate": 8.980412535442018e-10, + "loss": 1.7015, + "step": 94100 + }, + { + "epoch": 13.948148148148148, + "grad_norm": 5.10414981842041, + "learning_rate": 6.880552216870229e-10, + "loss": 1.7628, + "step": 94150 + }, + { + "epoch": 13.955555555555556, + "grad_norm": 6.242023944854736, + "learning_rate": 5.059916746130711e-10, + "loss": 1.679, + "step": 94200 + }, + { + "epoch": 13.962962962962964, + "grad_norm": 5.566348552703857, + "learning_rate": 3.5185112072122493e-10, + "loss": 1.6291, + "step": 94250 + }, + { + "epoch": 13.97037037037037, + "grad_norm": 4.626772880554199, + "learning_rate": 2.256339904427307e-10, + "loss": 1.7245, + "step": 94300 + }, + { + "epoch": 13.977777777777778, + "grad_norm": 5.087474346160889, + "learning_rate": 1.2734063622898973e-10, + "loss": 1.6368, + "step": 94350 + }, + { + "epoch": 13.985185185185186, + "grad_norm": 4.8912458419799805, + "learning_rate": 5.697133255821996e-11, + "loss": 1.6854, + "step": 94400 + }, + { + "epoch": 13.992592592592592, + "grad_norm": 4.6201982498168945, + "learning_rate": 1.4526275935455857e-11, + "loss": 1.6349, + "step": 94450 + }, + { + "epoch": 14.0, + "grad_norm": 4.993613243103027, + "learning_rate": 5.584882556419758e-15, + "loss": 1.6955, + "step": 94500 + }, + { + "epoch": 14.007407407407408, + "grad_norm": 4.081660747528076, + "learning_rate": 2.1750947185748995e-07, + "loss": 1.7098, + "step": 94550 + }, + { + "epoch": 14.014814814814814, + "grad_norm": 5.262495994567871, + "learning_rate": 2.1428725575616372e-07, + "loss": 1.6867, + "step": 94600 + }, + { + "epoch": 14.022222222222222, + "grad_norm": 4.7010698318481445, + "learning_rate": 2.1108882678338548e-07, + "loss": 1.5879, + "step": 94650 + }, + { + "epoch": 14.02962962962963, + "grad_norm": 5.260746002197266, + "learning_rate": 2.0791419271390057e-07, + "loss": 1.6757, + "step": 94700 + }, + { + "epoch": 14.037037037037036, + "grad_norm": 5.294519424438477, + "learning_rate": 2.0476336126461492e-07, + "loss": 1.6988, + "step": 94750 + }, + { + "epoch": 14.044444444444444, + "grad_norm": 5.61260461807251, + "learning_rate": 2.0163634009457756e-07, + "loss": 1.6908, + "step": 94800 + }, + { + "epoch": 14.051851851851852, + "grad_norm": 4.751020431518555, + "learning_rate": 1.9853313680495588e-07, + "loss": 1.7395, + "step": 94850 + }, + { + "epoch": 14.059259259259258, + "grad_norm": 3.8591971397399902, + "learning_rate": 1.9545375893902262e-07, + "loss": 1.6961, + "step": 94900 + }, + { + "epoch": 14.066666666666666, + "grad_norm": 5.062747478485107, + "learning_rate": 1.9239821398213455e-07, + "loss": 1.6664, + "step": 94950 + }, + { + "epoch": 14.074074074074074, + "grad_norm": 5.873329162597656, + "learning_rate": 1.8936650936171586e-07, + "loss": 1.6769, + "step": 95000 + }, + { + "epoch": 14.081481481481482, + "grad_norm": 4.468142986297607, + "learning_rate": 1.8635865244724162e-07, + "loss": 1.6555, + "step": 95050 + }, + { + "epoch": 14.088888888888889, + "grad_norm": 5.790612697601318, + "learning_rate": 1.833746505502143e-07, + "loss": 1.6727, + "step": 95100 + }, + { + "epoch": 14.096296296296297, + "grad_norm": 4.851980686187744, + "learning_rate": 1.8041451092415063e-07, + "loss": 1.7384, + "step": 95150 + }, + { + "epoch": 14.103703703703705, + "grad_norm": 5.899025917053223, + "learning_rate": 1.7747824076456478e-07, + "loss": 1.6937, + "step": 95200 + }, + { + "epoch": 14.11111111111111, + "grad_norm": 4.192124366760254, + "learning_rate": 1.7456584720894842e-07, + "loss": 1.673, + "step": 95250 + }, + { + "epoch": 14.118518518518519, + "grad_norm": 6.0272603034973145, + "learning_rate": 1.7167733733674974e-07, + "loss": 1.6529, + "step": 95300 + }, + { + "epoch": 14.125925925925927, + "grad_norm": 4.812941074371338, + "learning_rate": 1.6881271816936662e-07, + "loss": 1.7446, + "step": 95350 + }, + { + "epoch": 14.133333333333333, + "grad_norm": 4.837795734405518, + "learning_rate": 1.659719966701201e-07, + "loss": 1.7196, + "step": 95400 + }, + { + "epoch": 14.14074074074074, + "grad_norm": 3.2878236770629883, + "learning_rate": 1.631551797442421e-07, + "loss": 1.7074, + "step": 95450 + }, + { + "epoch": 14.148148148148149, + "grad_norm": 4.426816940307617, + "learning_rate": 1.6036227423885443e-07, + "loss": 1.6393, + "step": 95500 + }, + { + "epoch": 14.155555555555555, + "grad_norm": 5.182004451751709, + "learning_rate": 1.5759328694295861e-07, + "loss": 1.6379, + "step": 95550 + }, + { + "epoch": 14.162962962962963, + "grad_norm": 5.603666305541992, + "learning_rate": 1.5484822458741388e-07, + "loss": 1.7089, + "step": 95600 + }, + { + "epoch": 14.170370370370371, + "grad_norm": 6.255915641784668, + "learning_rate": 1.5212709384492152e-07, + "loss": 1.7131, + "step": 95650 + }, + { + "epoch": 14.177777777777777, + "grad_norm": 5.393916606903076, + "learning_rate": 1.4942990133001266e-07, + "loss": 1.6868, + "step": 95700 + }, + { + "epoch": 14.185185185185185, + "grad_norm": 4.44481086730957, + "learning_rate": 1.4675665359902502e-07, + "loss": 1.663, + "step": 95750 + }, + { + "epoch": 14.192592592592593, + "grad_norm": 5.424983024597168, + "learning_rate": 1.4410735715009837e-07, + "loss": 1.6858, + "step": 95800 + }, + { + "epoch": 14.2, + "grad_norm": 4.479785442352295, + "learning_rate": 1.414820184231447e-07, + "loss": 1.6969, + "step": 95850 + }, + { + "epoch": 14.207407407407407, + "grad_norm": 5.457512855529785, + "learning_rate": 1.3888064379984356e-07, + "loss": 1.643, + "step": 95900 + }, + { + "epoch": 14.214814814814815, + "grad_norm": 4.97783899307251, + "learning_rate": 1.3630323960361902e-07, + "loss": 1.7059, + "step": 95950 + }, + { + "epoch": 14.222222222222221, + "grad_norm": 5.594468116760254, + "learning_rate": 1.3374981209963166e-07, + "loss": 1.6873, + "step": 96000 + }, + { + "epoch": 14.22962962962963, + "grad_norm": 5.437595367431641, + "learning_rate": 1.3122036749475765e-07, + "loss": 1.6673, + "step": 96050 + }, + { + "epoch": 14.237037037037037, + "grad_norm": 5.472507953643799, + "learning_rate": 1.2871491193757634e-07, + "loss": 1.6547, + "step": 96100 + }, + { + "epoch": 14.244444444444444, + "grad_norm": 4.9965925216674805, + "learning_rate": 1.2623345151835164e-07, + "loss": 1.6488, + "step": 96150 + }, + { + "epoch": 14.251851851851852, + "grad_norm": 3.97997784614563, + "learning_rate": 1.2377599226902402e-07, + "loss": 1.7462, + "step": 96200 + }, + { + "epoch": 14.25925925925926, + "grad_norm": 4.046382904052734, + "learning_rate": 1.2134254016318847e-07, + "loss": 1.5989, + "step": 96250 + }, + { + "epoch": 14.266666666666667, + "grad_norm": 4.786545276641846, + "learning_rate": 1.1893310111608547e-07, + "loss": 1.6232, + "step": 96300 + }, + { + "epoch": 14.274074074074074, + "grad_norm": 4.758726119995117, + "learning_rate": 1.1654768098458446e-07, + "loss": 1.6304, + "step": 96350 + }, + { + "epoch": 14.281481481481482, + "grad_norm": 5.127438545227051, + "learning_rate": 1.1418628556716715e-07, + "loss": 1.6618, + "step": 96400 + }, + { + "epoch": 14.28888888888889, + "grad_norm": 4.724511623382568, + "learning_rate": 1.1184892060392083e-07, + "loss": 1.7565, + "step": 96450 + }, + { + "epoch": 14.296296296296296, + "grad_norm": 5.429440021514893, + "learning_rate": 1.0953559177651395e-07, + "loss": 1.5518, + "step": 96500 + }, + { + "epoch": 14.303703703703704, + "grad_norm": 3.7063474655151367, + "learning_rate": 1.0724630470819286e-07, + "loss": 1.6977, + "step": 96550 + }, + { + "epoch": 14.311111111111112, + "grad_norm": 6.464829444885254, + "learning_rate": 1.0498106496375837e-07, + "loss": 1.6884, + "step": 96600 + }, + { + "epoch": 14.318518518518518, + "grad_norm": 5.439199447631836, + "learning_rate": 1.0273987804956032e-07, + "loss": 1.6838, + "step": 96650 + }, + { + "epoch": 14.325925925925926, + "grad_norm": 4.898883819580078, + "learning_rate": 1.0052274941348084e-07, + "loss": 1.7748, + "step": 96700 + }, + { + "epoch": 14.333333333333334, + "grad_norm": 4.992605686187744, + "learning_rate": 9.832968444491886e-08, + "loss": 1.6865, + "step": 96750 + }, + { + "epoch": 14.34074074074074, + "grad_norm": 5.274005889892578, + "learning_rate": 9.616068847478233e-08, + "loss": 1.7458, + "step": 96800 + }, + { + "epoch": 14.348148148148148, + "grad_norm": 4.971953392028809, + "learning_rate": 9.40157667754693e-08, + "loss": 1.6809, + "step": 96850 + }, + { + "epoch": 14.355555555555556, + "grad_norm": 5.416146755218506, + "learning_rate": 9.189492456086024e-08, + "loss": 1.6902, + "step": 96900 + }, + { + "epoch": 14.362962962962962, + "grad_norm": 5.203887462615967, + "learning_rate": 8.979816698630128e-08, + "loss": 1.651, + "step": 96950 + }, + { + "epoch": 14.37037037037037, + "grad_norm": 6.0059590339660645, + "learning_rate": 8.772549914859652e-08, + "loss": 1.7222, + "step": 97000 + }, + { + "epoch": 14.377777777777778, + "grad_norm": 5.621152400970459, + "learning_rate": 8.567692608598798e-08, + "loss": 1.6189, + "step": 97050 + }, + { + "epoch": 14.385185185185184, + "grad_norm": 5.870107173919678, + "learning_rate": 8.365245277815348e-08, + "loss": 1.7122, + "step": 97100 + }, + { + "epoch": 14.392592592592592, + "grad_norm": 4.718721866607666, + "learning_rate": 8.165208414618542e-08, + "loss": 1.7046, + "step": 97150 + }, + { + "epoch": 14.4, + "grad_norm": 5.118833065032959, + "learning_rate": 7.967582505258308e-08, + "loss": 1.7248, + "step": 97200 + }, + { + "epoch": 14.407407407407407, + "grad_norm": 5.250682353973389, + "learning_rate": 7.772368030124044e-08, + "loss": 1.6675, + "step": 97250 + }, + { + "epoch": 14.414814814814815, + "grad_norm": 4.605399131774902, + "learning_rate": 7.57956546374361e-08, + "loss": 1.6864, + "step": 97300 + }, + { + "epoch": 14.422222222222222, + "grad_norm": 5.241720676422119, + "learning_rate": 7.389175274781668e-08, + "loss": 1.7509, + "step": 97350 + }, + { + "epoch": 14.42962962962963, + "grad_norm": 4.5354323387146, + "learning_rate": 7.201197926039238e-08, + "loss": 1.7077, + "step": 97400 + }, + { + "epoch": 14.437037037037037, + "grad_norm": 5.364243030548096, + "learning_rate": 7.01563387445181e-08, + "loss": 1.6977, + "step": 97450 + }, + { + "epoch": 14.444444444444445, + "grad_norm": 4.9642486572265625, + "learning_rate": 6.832483571088899e-08, + "loss": 1.6097, + "step": 97500 + }, + { + "epoch": 14.451851851851853, + "grad_norm": 5.477876663208008, + "learning_rate": 6.651747461152602e-08, + "loss": 1.6578, + "step": 97550 + }, + { + "epoch": 14.459259259259259, + "grad_norm": 4.5182929039001465, + "learning_rate": 6.4734259839766e-08, + "loss": 1.6748, + "step": 97600 + }, + { + "epoch": 14.466666666666667, + "grad_norm": 4.248973846435547, + "learning_rate": 6.297519573025046e-08, + "loss": 1.6873, + "step": 97650 + }, + { + "epoch": 14.474074074074075, + "grad_norm": 5.4366230964660645, + "learning_rate": 6.124028655891567e-08, + "loss": 1.6764, + "step": 97700 + }, + { + "epoch": 14.481481481481481, + "grad_norm": 5.3513336181640625, + "learning_rate": 5.952953654298266e-08, + "loss": 1.6915, + "step": 97750 + }, + { + "epoch": 14.488888888888889, + "grad_norm": 6.808749675750732, + "learning_rate": 5.784294984094496e-08, + "loss": 1.6613, + "step": 97800 + }, + { + "epoch": 14.496296296296297, + "grad_norm": 4.615975379943848, + "learning_rate": 5.61805305525609e-08, + "loss": 1.7153, + "step": 97850 + }, + { + "epoch": 14.503703703703703, + "grad_norm": 6.45193338394165, + "learning_rate": 5.4542282718841324e-08, + "loss": 1.713, + "step": 97900 + }, + { + "epoch": 14.511111111111111, + "grad_norm": 6.218621253967285, + "learning_rate": 5.2928210322044094e-08, + "loss": 1.7161, + "step": 97950 + }, + { + "epoch": 14.518518518518519, + "grad_norm": 3.8265745639801025, + "learning_rate": 5.1338317285658524e-08, + "loss": 1.6358, + "step": 98000 + }, + { + "epoch": 14.525925925925925, + "grad_norm": 6.531336784362793, + "learning_rate": 4.977260747439872e-08, + "loss": 1.6534, + "step": 98050 + }, + { + "epoch": 14.533333333333333, + "grad_norm": 4.259779930114746, + "learning_rate": 4.8231084694195795e-08, + "loss": 1.6495, + "step": 98100 + }, + { + "epoch": 14.540740740740741, + "grad_norm": 5.932340145111084, + "learning_rate": 4.67137526921857e-08, + "loss": 1.7128, + "step": 98150 + }, + { + "epoch": 14.548148148148147, + "grad_norm": 5.576129913330078, + "learning_rate": 4.522061515670251e-08, + "loss": 1.7082, + "step": 98200 + }, + { + "epoch": 14.555555555555555, + "grad_norm": 5.470938682556152, + "learning_rate": 4.375167571726735e-08, + "loss": 1.7498, + "step": 98250 + }, + { + "epoch": 14.562962962962963, + "grad_norm": 4.535619258880615, + "learning_rate": 4.230693794458063e-08, + "loss": 1.7499, + "step": 98300 + }, + { + "epoch": 14.57037037037037, + "grad_norm": 4.099649429321289, + "learning_rate": 4.0886405350514244e-08, + "loss": 1.8051, + "step": 98350 + }, + { + "epoch": 14.577777777777778, + "grad_norm": 5.937483310699463, + "learning_rate": 3.949008138810051e-08, + "loss": 1.7008, + "step": 98400 + }, + { + "epoch": 14.585185185185185, + "grad_norm": 4.595200061798096, + "learning_rate": 3.8117969451526574e-08, + "loss": 1.6601, + "step": 98450 + }, + { + "epoch": 14.592592592592592, + "grad_norm": 4.622509002685547, + "learning_rate": 3.677007287612444e-08, + "loss": 1.6999, + "step": 98500 + }, + { + "epoch": 14.6, + "grad_norm": 4.620297908782959, + "learning_rate": 3.544639493836544e-08, + "loss": 1.6864, + "step": 98550 + }, + { + "epoch": 14.607407407407408, + "grad_norm": 6.888701438903809, + "learning_rate": 3.4146938855845744e-08, + "loss": 1.6404, + "step": 98600 + }, + { + "epoch": 14.614814814814816, + "grad_norm": 6.738227367401123, + "learning_rate": 3.2871707787287545e-08, + "loss": 1.6618, + "step": 98650 + }, + { + "epoch": 14.622222222222222, + "grad_norm": 4.549380302429199, + "learning_rate": 3.162070483252344e-08, + "loss": 1.703, + "step": 98700 + }, + { + "epoch": 14.62962962962963, + "grad_norm": 5.150018215179443, + "learning_rate": 3.039393303249538e-08, + "loss": 1.6822, + "step": 98750 + }, + { + "epoch": 14.637037037037038, + "grad_norm": 5.686187744140625, + "learning_rate": 2.9191395369240204e-08, + "loss": 1.7022, + "step": 98800 + }, + { + "epoch": 14.644444444444444, + "grad_norm": 5.78209114074707, + "learning_rate": 2.801309476589076e-08, + "loss": 1.7199, + "step": 98850 + }, + { + "epoch": 14.651851851851852, + "grad_norm": 3.936169147491455, + "learning_rate": 2.685903408666035e-08, + "loss": 1.7143, + "step": 98900 + }, + { + "epoch": 14.65925925925926, + "grad_norm": 5.642170429229736, + "learning_rate": 2.572921613684498e-08, + "loss": 1.7003, + "step": 98950 + }, + { + "epoch": 14.666666666666666, + "grad_norm": 4.85971212387085, + "learning_rate": 2.4623643662804454e-08, + "loss": 1.6994, + "step": 99000 + }, + { + "epoch": 14.674074074074074, + "grad_norm": 5.203510284423828, + "learning_rate": 2.3542319351969046e-08, + "loss": 1.6054, + "step": 99050 + }, + { + "epoch": 14.681481481481482, + "grad_norm": 5.952207088470459, + "learning_rate": 2.2485245832822856e-08, + "loss": 1.6189, + "step": 99100 + }, + { + "epoch": 14.688888888888888, + "grad_norm": 5.479672908782959, + "learning_rate": 2.1452425674901577e-08, + "loss": 1.6995, + "step": 99150 + }, + { + "epoch": 14.696296296296296, + "grad_norm": 4.928932189941406, + "learning_rate": 2.0443861388788066e-08, + "loss": 1.6801, + "step": 99200 + }, + { + "epoch": 14.703703703703704, + "grad_norm": 4.984295845031738, + "learning_rate": 1.945955542610012e-08, + "loss": 1.6402, + "step": 99250 + }, + { + "epoch": 14.71111111111111, + "grad_norm": 5.150063514709473, + "learning_rate": 1.8499510179491585e-08, + "loss": 1.6855, + "step": 99300 + }, + { + "epoch": 14.718518518518518, + "grad_norm": 6.898922920227051, + "learning_rate": 1.7563727982642386e-08, + "loss": 1.7513, + "step": 99350 + }, + { + "epoch": 14.725925925925926, + "grad_norm": 23.222375869750977, + "learning_rate": 1.6652211110254057e-08, + "loss": 1.6884, + "step": 99400 + }, + { + "epoch": 14.733333333333333, + "grad_norm": 4.689803600311279, + "learning_rate": 1.5764961778041988e-08, + "loss": 1.6416, + "step": 99450 + }, + { + "epoch": 14.74074074074074, + "grad_norm": 5.273927211761475, + "learning_rate": 1.4901982142735415e-08, + "loss": 1.6945, + "step": 99500 + }, + { + "epoch": 14.748148148148148, + "grad_norm": 4.713109493255615, + "learning_rate": 1.4063274302065222e-08, + "loss": 1.6675, + "step": 99550 + }, + { + "epoch": 14.755555555555556, + "grad_norm": 4.455766201019287, + "learning_rate": 1.324884029476392e-08, + "loss": 1.7309, + "step": 99600 + }, + { + "epoch": 14.762962962962963, + "grad_norm": 4.254674911499023, + "learning_rate": 1.2458682100560116e-08, + "loss": 1.7088, + "step": 99650 + }, + { + "epoch": 14.77037037037037, + "grad_norm": 5.807867527008057, + "learning_rate": 1.1692801640171835e-08, + "loss": 1.697, + "step": 99700 + }, + { + "epoch": 14.777777777777779, + "grad_norm": 4.785783767700195, + "learning_rate": 1.0951200775302095e-08, + "loss": 1.7298, + "step": 99750 + }, + { + "epoch": 14.785185185185185, + "grad_norm": 5.500148296356201, + "learning_rate": 1.0233881308635563e-08, + "loss": 1.6814, + "step": 99800 + }, + { + "epoch": 14.792592592592593, + "grad_norm": 6.528011798858643, + "learning_rate": 9.54084498383412e-09, + "loss": 1.7591, + "step": 99850 + }, + { + "epoch": 14.8, + "grad_norm": 5.078983783721924, + "learning_rate": 8.872093485531307e-09, + "loss": 1.6907, + "step": 99900 + }, + { + "epoch": 14.807407407407407, + "grad_norm": 5.844426155090332, + "learning_rate": 8.227628439330115e-09, + "loss": 1.7544, + "step": 99950 + }, + { + "epoch": 14.814814814814815, + "grad_norm": 4.08227014541626, + "learning_rate": 7.607451411797417e-09, + "loss": 1.735, + "step": 100000 + }, + { + "epoch": 14.822222222222223, + "grad_norm": 4.534005641937256, + "learning_rate": 7.01156391046065e-09, + "loss": 1.6321, + "step": 100050 + }, + { + "epoch": 14.829629629629629, + "grad_norm": 4.907926082611084, + "learning_rate": 6.43996738380337e-09, + "loss": 1.6915, + "step": 100100 + }, + { + "epoch": 14.837037037037037, + "grad_norm": 4.026651382446289, + "learning_rate": 5.892663221264139e-09, + "loss": 1.6458, + "step": 100150 + }, + { + "epoch": 14.844444444444445, + "grad_norm": 5.169808387756348, + "learning_rate": 5.36965275323098e-09, + "loss": 1.663, + "step": 100200 + }, + { + "epoch": 14.851851851851851, + "grad_norm": 4.850079536437988, + "learning_rate": 4.870937251038044e-09, + "loss": 1.6436, + "step": 100250 + }, + { + "epoch": 14.85925925925926, + "grad_norm": 4.930896282196045, + "learning_rate": 4.396517926964495e-09, + "loss": 1.7308, + "step": 100300 + }, + { + "epoch": 14.866666666666667, + "grad_norm": 4.7086076736450195, + "learning_rate": 3.946395934230074e-09, + "loss": 1.716, + "step": 100350 + }, + { + "epoch": 14.874074074074073, + "grad_norm": 4.457938194274902, + "learning_rate": 3.5205723669917703e-09, + "loss": 1.6974, + "step": 100400 + }, + { + "epoch": 14.881481481481481, + "grad_norm": 4.839356422424316, + "learning_rate": 3.119048260341595e-09, + "loss": 1.6968, + "step": 100450 + }, + { + "epoch": 14.88888888888889, + "grad_norm": 5.151297569274902, + "learning_rate": 2.7418245903054752e-09, + "loss": 1.6483, + "step": 100500 + }, + { + "epoch": 14.896296296296295, + "grad_norm": 5.776646614074707, + "learning_rate": 2.3889022738399216e-09, + "loss": 1.6593, + "step": 100550 + }, + { + "epoch": 14.903703703703703, + "grad_norm": 4.784289836883545, + "learning_rate": 2.060282168829808e-09, + "loss": 1.6998, + "step": 100600 + }, + { + "epoch": 14.911111111111111, + "grad_norm": 4.684325695037842, + "learning_rate": 1.7559650740828215e-09, + "loss": 1.7249, + "step": 100650 + }, + { + "epoch": 14.918518518518518, + "grad_norm": 5.666114330291748, + "learning_rate": 1.4759517293361225e-09, + "loss": 1.7104, + "step": 100700 + }, + { + "epoch": 14.925925925925926, + "grad_norm": 5.29168701171875, + "learning_rate": 1.220242815246353e-09, + "loss": 1.6885, + "step": 100750 + }, + { + "epoch": 14.933333333333334, + "grad_norm": 4.432060718536377, + "learning_rate": 9.88838953389637e-10, + "loss": 1.7025, + "step": 100800 + }, + { + "epoch": 14.940740740740742, + "grad_norm": 4.997260570526123, + "learning_rate": 7.817407062638005e-10, + "loss": 1.7068, + "step": 100850 + }, + { + "epoch": 14.948148148148148, + "grad_norm": 5.093183994293213, + "learning_rate": 5.989485772850412e-10, + "loss": 1.6982, + "step": 100900 + }, + { + "epoch": 14.955555555555556, + "grad_norm": 5.697869777679443, + "learning_rate": 4.4046301078237706e-10, + "loss": 1.6626, + "step": 100950 + }, + { + "epoch": 14.962962962962964, + "grad_norm": 5.6799750328063965, + "learning_rate": 3.062843920043079e-10, + "loss": 1.7029, + "step": 101000 + }, + { + "epoch": 14.97037037037037, + "grad_norm": 5.68807315826416, + "learning_rate": 1.964130471110437e-10, + "loss": 1.6871, + "step": 101050 + }, + { + "epoch": 14.977777777777778, + "grad_norm": 4.425951957702637, + "learning_rate": 1.1084924318005563e-10, + "loss": 1.6629, + "step": 101100 + }, + { + "epoch": 14.985185185185186, + "grad_norm": 4.260661602020264, + "learning_rate": 4.959318819941494e-11, + "loss": 1.6988, + "step": 101150 + }, + { + "epoch": 14.992592592592592, + "grad_norm": 4.999781608581543, + "learning_rate": 1.2645031070013248e-11, + "loss": 1.7212, + "step": 101200 + }, + { + "epoch": 15.0, + "grad_norm": 6.509689807891846, + "learning_rate": 4.861604452344182e-15, + "loss": 1.6589, + "step": 101250 + }, + { + "epoch": 15.007407407407408, + "grad_norm": 4.362274169921875, + "learning_rate": 1.9113652725600174e-07, + "loss": 1.7051, + "step": 101300 + }, + { + "epoch": 15.014814814814814, + "grad_norm": 4.904382705688477, + "learning_rate": 1.8830376298905716e-07, + "loss": 1.6349, + "step": 101350 + }, + { + "epoch": 15.022222222222222, + "grad_norm": 5.014002323150635, + "learning_rate": 1.8549194789315384e-07, + "loss": 1.6786, + "step": 101400 + }, + { + "epoch": 15.02962962962963, + "grad_norm": 5.237969875335693, + "learning_rate": 1.8270108797185936e-07, + "loss": 1.7113, + "step": 101450 + }, + { + "epoch": 15.037037037037036, + "grad_norm": 4.123836517333984, + "learning_rate": 1.7993118918400054e-07, + "loss": 1.7214, + "step": 101500 + }, + { + "epoch": 15.044444444444444, + "grad_norm": 5.174574375152588, + "learning_rate": 1.7718225744364993e-07, + "loss": 1.7668, + "step": 101550 + }, + { + "epoch": 15.051851851851852, + "grad_norm": 4.138543605804443, + "learning_rate": 1.7445429862011476e-07, + "loss": 1.6625, + "step": 101600 + }, + { + "epoch": 15.059259259259258, + "grad_norm": 5.6647539138793945, + "learning_rate": 1.7174731853791814e-07, + "loss": 1.7204, + "step": 101650 + }, + { + "epoch": 15.066666666666666, + "grad_norm": 5.137762069702148, + "learning_rate": 1.6906132297679568e-07, + "loss": 1.6656, + "step": 101700 + }, + { + "epoch": 15.074074074074074, + "grad_norm": 5.82663631439209, + "learning_rate": 1.663963176716743e-07, + "loss": 1.6957, + "step": 101750 + }, + { + "epoch": 15.081481481481482, + "grad_norm": 4.870634078979492, + "learning_rate": 1.6375230831266909e-07, + "loss": 1.7525, + "step": 101800 + }, + { + "epoch": 15.088888888888889, + "grad_norm": 5.1896071434021, + "learning_rate": 1.6112930054505981e-07, + "loss": 1.6674, + "step": 101850 + }, + { + "epoch": 15.096296296296297, + "grad_norm": 4.642868518829346, + "learning_rate": 1.5852729996929106e-07, + "loss": 1.6844, + "step": 101900 + }, + { + "epoch": 15.103703703703705, + "grad_norm": 7.358834266662598, + "learning_rate": 1.5594631214095103e-07, + "loss": 1.6857, + "step": 101950 + }, + { + "epoch": 15.11111111111111, + "grad_norm": 4.920676231384277, + "learning_rate": 1.533863425707649e-07, + "loss": 1.7326, + "step": 102000 + }, + { + "epoch": 15.118518518518519, + "grad_norm": 5.6548380851745605, + "learning_rate": 1.508473967245794e-07, + "loss": 1.6935, + "step": 102050 + }, + { + "epoch": 15.125925925925927, + "grad_norm": 5.018331527709961, + "learning_rate": 1.48329480023357e-07, + "loss": 1.712, + "step": 102100 + }, + { + "epoch": 15.133333333333333, + "grad_norm": 5.96315336227417, + "learning_rate": 1.4583259784315518e-07, + "loss": 1.7581, + "step": 102150 + }, + { + "epoch": 15.14074074074074, + "grad_norm": 4.610811710357666, + "learning_rate": 1.4335675551512384e-07, + "loss": 1.7412, + "step": 102200 + }, + { + "epoch": 15.148148148148149, + "grad_norm": 4.799295902252197, + "learning_rate": 1.409019583254889e-07, + "loss": 1.6462, + "step": 102250 + }, + { + "epoch": 15.155555555555555, + "grad_norm": 6.016835689544678, + "learning_rate": 1.3846821151554223e-07, + "loss": 1.7501, + "step": 102300 + }, + { + "epoch": 15.162962962962963, + "grad_norm": 5.653822898864746, + "learning_rate": 1.3605552028163162e-07, + "loss": 1.6666, + "step": 102350 + }, + { + "epoch": 15.170370370370371, + "grad_norm": 5.209936141967773, + "learning_rate": 1.3366388977514634e-07, + "loss": 1.7113, + "step": 102400 + }, + { + "epoch": 15.177777777777777, + "grad_norm": 4.76084566116333, + "learning_rate": 1.3129332510251057e-07, + "loss": 1.6819, + "step": 102450 + }, + { + "epoch": 15.185185185185185, + "grad_norm": 4.614091873168945, + "learning_rate": 1.289438313251701e-07, + "loss": 1.7483, + "step": 102500 + }, + { + "epoch": 15.192592592592593, + "grad_norm": 5.6824951171875, + "learning_rate": 1.2661541345958095e-07, + "loss": 1.6486, + "step": 102550 + }, + { + "epoch": 15.2, + "grad_norm": 5.0764641761779785, + "learning_rate": 1.2430807647720088e-07, + "loss": 1.616, + "step": 102600 + }, + { + "epoch": 15.207407407407407, + "grad_norm": 5.760731220245361, + "learning_rate": 1.220218253044747e-07, + "loss": 1.6611, + "step": 102650 + }, + { + "epoch": 15.214814814814815, + "grad_norm": 4.543821811676025, + "learning_rate": 1.1975666482282988e-07, + "loss": 1.6207, + "step": 102700 + }, + { + "epoch": 15.222222222222221, + "grad_norm": 6.1336894035339355, + "learning_rate": 1.1751259986866104e-07, + "loss": 1.6834, + "step": 102750 + }, + { + "epoch": 15.22962962962963, + "grad_norm": 5.639930725097656, + "learning_rate": 1.1528963523331993e-07, + "loss": 1.659, + "step": 102800 + }, + { + "epoch": 15.237037037037037, + "grad_norm": 5.458514213562012, + "learning_rate": 1.1308777566310769e-07, + "loss": 1.7154, + "step": 102850 + }, + { + "epoch": 15.244444444444444, + "grad_norm": 5.286754608154297, + "learning_rate": 1.1090702585926483e-07, + "loss": 1.6816, + "step": 102900 + }, + { + "epoch": 15.251851851851852, + "grad_norm": 6.629901885986328, + "learning_rate": 1.087473904779579e-07, + "loss": 1.6801, + "step": 102950 + }, + { + "epoch": 15.25925925925926, + "grad_norm": 4.6929121017456055, + "learning_rate": 1.0660887413027399e-07, + "loss": 1.6725, + "step": 103000 + }, + { + "epoch": 15.266666666666667, + "grad_norm": 6.1419878005981445, + "learning_rate": 1.044914813822051e-07, + "loss": 1.7276, + "step": 103050 + }, + { + "epoch": 15.274074074074074, + "grad_norm": 5.224550247192383, + "learning_rate": 1.0239521675464492e-07, + "loss": 1.6544, + "step": 103100 + }, + { + "epoch": 15.281481481481482, + "grad_norm": 5.665772438049316, + "learning_rate": 1.0032008472337318e-07, + "loss": 1.7196, + "step": 103150 + }, + { + "epoch": 15.28888888888889, + "grad_norm": 5.348624229431152, + "learning_rate": 9.826608971905238e-08, + "loss": 1.6823, + "step": 103200 + }, + { + "epoch": 15.296296296296296, + "grad_norm": 5.742287635803223, + "learning_rate": 9.623323612721225e-08, + "loss": 1.6856, + "step": 103250 + }, + { + "epoch": 15.303703703703704, + "grad_norm": 4.182469844818115, + "learning_rate": 9.422152828824305e-08, + "loss": 1.7428, + "step": 103300 + }, + { + "epoch": 15.311111111111112, + "grad_norm": 6.233865737915039, + "learning_rate": 9.223097049738783e-08, + "loss": 1.6919, + "step": 103350 + }, + { + "epoch": 15.318518518518518, + "grad_norm": 5.403355121612549, + "learning_rate": 9.026156700473021e-08, + "loss": 1.604, + "step": 103400 + }, + { + "epoch": 15.325925925925926, + "grad_norm": 4.667754173278809, + "learning_rate": 8.831332201518883e-08, + "loss": 1.7329, + "step": 103450 + }, + { + "epoch": 15.333333333333334, + "grad_norm": 5.18316125869751, + "learning_rate": 8.63862396885018e-08, + "loss": 1.657, + "step": 103500 + }, + { + "epoch": 15.34074074074074, + "grad_norm": 6.3779706954956055, + "learning_rate": 8.448032413922891e-08, + "loss": 1.6772, + "step": 103550 + }, + { + "epoch": 15.348148148148148, + "grad_norm": 5.585716724395752, + "learning_rate": 8.259557943673169e-08, + "loss": 1.6485, + "step": 103600 + }, + { + "epoch": 15.355555555555556, + "grad_norm": 5.607356548309326, + "learning_rate": 8.073200960517003e-08, + "loss": 1.7131, + "step": 103650 + }, + { + "epoch": 15.362962962962962, + "grad_norm": 5.320977210998535, + "learning_rate": 7.888961862349332e-08, + "loss": 1.6469, + "step": 103700 + }, + { + "epoch": 15.37037037037037, + "grad_norm": 4.2483296394348145, + "learning_rate": 7.706841042543268e-08, + "loss": 1.665, + "step": 103750 + }, + { + "epoch": 15.377777777777778, + "grad_norm": 6.139911651611328, + "learning_rate": 7.526838889948873e-08, + "loss": 1.7047, + "step": 103800 + }, + { + "epoch": 15.385185185185184, + "grad_norm": 5.27344274520874, + "learning_rate": 7.348955788892831e-08, + "loss": 1.7626, + "step": 103850 + }, + { + "epoch": 15.392592592592592, + "grad_norm": 3.9390082359313965, + "learning_rate": 7.17319211917733e-08, + "loss": 1.6708, + "step": 103900 + }, + { + "epoch": 15.4, + "grad_norm": 5.188051700592041, + "learning_rate": 6.999548256079181e-08, + "loss": 1.7132, + "step": 103950 + }, + { + "epoch": 15.407407407407407, + "grad_norm": 5.408111572265625, + "learning_rate": 6.828024570349479e-08, + "loss": 1.7483, + "step": 104000 + }, + { + "epoch": 15.414814814814815, + "grad_norm": 5.115500450134277, + "learning_rate": 6.658621428212053e-08, + "loss": 1.633, + "step": 104050 + }, + { + "epoch": 15.422222222222222, + "grad_norm": 4.845335483551025, + "learning_rate": 6.491339191363465e-08, + "loss": 1.6942, + "step": 104100 + }, + { + "epoch": 15.42962962962963, + "grad_norm": 4.13123083114624, + "learning_rate": 6.326178216971568e-08, + "loss": 1.6817, + "step": 104150 + }, + { + "epoch": 15.437037037037037, + "grad_norm": 6.038280010223389, + "learning_rate": 6.163138857675499e-08, + "loss": 1.7143, + "step": 104200 + }, + { + "epoch": 15.444444444444445, + "grad_norm": 4.800933837890625, + "learning_rate": 6.002221461583913e-08, + "loss": 1.6918, + "step": 104250 + }, + { + "epoch": 15.451851851851853, + "grad_norm": 5.348603248596191, + "learning_rate": 5.843426372275307e-08, + "loss": 1.6809, + "step": 104300 + }, + { + "epoch": 15.459259259259259, + "grad_norm": 5.338648796081543, + "learning_rate": 5.6867539287966954e-08, + "loss": 1.6494, + "step": 104350 + }, + { + "epoch": 15.466666666666667, + "grad_norm": 6.060091495513916, + "learning_rate": 5.532204465662716e-08, + "loss": 1.7146, + "step": 104400 + }, + { + "epoch": 15.474074074074075, + "grad_norm": 4.730654239654541, + "learning_rate": 5.37977831285541e-08, + "loss": 1.6168, + "step": 104450 + }, + { + "epoch": 15.481481481481481, + "grad_norm": 5.17317533493042, + "learning_rate": 5.2294757958233356e-08, + "loss": 1.7384, + "step": 104500 + }, + { + "epoch": 15.488888888888889, + "grad_norm": 5.633426666259766, + "learning_rate": 5.081297235480675e-08, + "loss": 1.7138, + "step": 104550 + }, + { + "epoch": 15.496296296296297, + "grad_norm": 4.512260437011719, + "learning_rate": 4.9352429482067975e-08, + "loss": 1.7167, + "step": 104600 + }, + { + "epoch": 15.503703703703703, + "grad_norm": 5.672184467315674, + "learning_rate": 4.7913132458454746e-08, + "loss": 1.7112, + "step": 104650 + }, + { + "epoch": 15.511111111111111, + "grad_norm": 5.912533283233643, + "learning_rate": 4.6495084357041084e-08, + "loss": 1.666, + "step": 104700 + }, + { + "epoch": 15.518518518518519, + "grad_norm": 4.872348785400391, + "learning_rate": 4.509828820553397e-08, + "loss": 1.7347, + "step": 104750 + }, + { + "epoch": 15.525925925925925, + "grad_norm": 4.933444023132324, + "learning_rate": 4.3722746986264443e-08, + "loss": 1.6805, + "step": 104800 + }, + { + "epoch": 15.533333333333333, + "grad_norm": 5.971294403076172, + "learning_rate": 4.236846363618097e-08, + "loss": 1.6679, + "step": 104850 + }, + { + "epoch": 15.540740740740741, + "grad_norm": 4.82595157623291, + "learning_rate": 4.103544104684276e-08, + "loss": 1.5771, + "step": 104900 + }, + { + "epoch": 15.548148148148147, + "grad_norm": 5.648787975311279, + "learning_rate": 3.972368206441757e-08, + "loss": 1.7697, + "step": 104950 + }, + { + "epoch": 15.555555555555555, + "grad_norm": 5.242377281188965, + "learning_rate": 3.843318948967056e-08, + "loss": 1.7878, + "step": 105000 + }, + { + "epoch": 15.562962962962963, + "grad_norm": 6.494607448577881, + "learning_rate": 3.716396607796102e-08, + "loss": 1.6129, + "step": 105050 + }, + { + "epoch": 15.57037037037037, + "grad_norm": 4.520471096038818, + "learning_rate": 3.5916014539236765e-08, + "loss": 1.6146, + "step": 105100 + }, + { + "epoch": 15.577777777777778, + "grad_norm": 5.042867183685303, + "learning_rate": 3.468933753802528e-08, + "loss": 1.7067, + "step": 105150 + }, + { + "epoch": 15.585185185185185, + "grad_norm": 5.183259963989258, + "learning_rate": 3.348393769343372e-08, + "loss": 1.7039, + "step": 105200 + }, + { + "epoch": 15.592592592592592, + "grad_norm": 5.700682640075684, + "learning_rate": 3.22998175791378e-08, + "loss": 1.686, + "step": 105250 + }, + { + "epoch": 15.6, + "grad_norm": 6.726398944854736, + "learning_rate": 3.113697972337848e-08, + "loss": 1.7659, + "step": 105300 + }, + { + "epoch": 15.607407407407408, + "grad_norm": 5.453873157501221, + "learning_rate": 2.999542660895638e-08, + "loss": 1.7263, + "step": 105350 + }, + { + "epoch": 15.614814814814816, + "grad_norm": 8.06754207611084, + "learning_rate": 2.8875160673227375e-08, + "loss": 1.6145, + "step": 105400 + }, + { + "epoch": 15.622222222222222, + "grad_norm": 5.7392988204956055, + "learning_rate": 2.7776184308095922e-08, + "loss": 1.7672, + "step": 105450 + }, + { + "epoch": 15.62962962962963, + "grad_norm": 6.453995227813721, + "learning_rate": 2.6698499860011718e-08, + "loss": 1.6994, + "step": 105500 + }, + { + "epoch": 15.637037037037038, + "grad_norm": 7.151883602142334, + "learning_rate": 2.5642109629961942e-08, + "loss": 1.6469, + "step": 105550 + }, + { + "epoch": 15.644444444444444, + "grad_norm": 6.86553430557251, + "learning_rate": 2.4607015873469032e-08, + "loss": 1.7241, + "step": 105600 + }, + { + "epoch": 15.651851851851852, + "grad_norm": 4.4429850578308105, + "learning_rate": 2.3593220800584015e-08, + "loss": 1.707, + "step": 105650 + }, + { + "epoch": 15.65925925925926, + "grad_norm": 5.352493762969971, + "learning_rate": 2.2600726575885413e-08, + "loss": 1.6433, + "step": 105700 + }, + { + "epoch": 15.666666666666666, + "grad_norm": 5.694179058074951, + "learning_rate": 2.162953531846812e-08, + "loss": 1.6451, + "step": 105750 + }, + { + "epoch": 15.674074074074074, + "grad_norm": 4.886950969696045, + "learning_rate": 2.0679649101944532e-08, + "loss": 1.6637, + "step": 105800 + }, + { + "epoch": 15.681481481481482, + "grad_norm": 3.8822832107543945, + "learning_rate": 1.9751069954436763e-08, + "loss": 1.6385, + "step": 105850 + }, + { + "epoch": 15.688888888888888, + "grad_norm": 6.058652400970459, + "learning_rate": 1.884379985857776e-08, + "loss": 1.6387, + "step": 105900 + }, + { + "epoch": 15.696296296296296, + "grad_norm": 6.563283443450928, + "learning_rate": 1.795784075149687e-08, + "loss": 1.6333, + "step": 105950 + }, + { + "epoch": 15.703703703703704, + "grad_norm": 5.563347339630127, + "learning_rate": 1.7093194524827605e-08, + "loss": 1.6209, + "step": 106000 + }, + { + "epoch": 15.71111111111111, + "grad_norm": 5.790256977081299, + "learning_rate": 1.6249863024693223e-08, + "loss": 1.671, + "step": 106050 + }, + { + "epoch": 15.718518518518518, + "grad_norm": 5.468883514404297, + "learning_rate": 1.542784805171116e-08, + "loss": 1.6543, + "step": 106100 + }, + { + "epoch": 15.725925925925926, + "grad_norm": 5.149229526519775, + "learning_rate": 1.4627151360983027e-08, + "loss": 1.7003, + "step": 106150 + }, + { + "epoch": 15.733333333333333, + "grad_norm": 5.396352767944336, + "learning_rate": 1.3847774662094637e-08, + "loss": 1.6859, + "step": 106200 + }, + { + "epoch": 15.74074074074074, + "grad_norm": 5.088778972625732, + "learning_rate": 1.308971961911154e-08, + "loss": 1.6757, + "step": 106250 + }, + { + "epoch": 15.748148148148148, + "grad_norm": 4.770002365112305, + "learning_rate": 1.2352987850571263e-08, + "loss": 1.615, + "step": 106300 + }, + { + "epoch": 15.755555555555556, + "grad_norm": 5.888800621032715, + "learning_rate": 1.1637580929487747e-08, + "loss": 1.6846, + "step": 106350 + }, + { + "epoch": 15.762962962962963, + "grad_norm": 4.9219136238098145, + "learning_rate": 1.0943500383342465e-08, + "loss": 1.7002, + "step": 106400 + }, + { + "epoch": 15.77037037037037, + "grad_norm": 4.826274871826172, + "learning_rate": 1.0270747694082206e-08, + "loss": 1.7268, + "step": 106450 + }, + { + "epoch": 15.777777777777779, + "grad_norm": 5.474734783172607, + "learning_rate": 9.61932429811574e-09, + "loss": 1.6723, + "step": 106500 + }, + { + "epoch": 15.785185185185185, + "grad_norm": 4.7281694412231445, + "learning_rate": 8.989231586311598e-09, + "loss": 1.7157, + "step": 106550 + }, + { + "epoch": 15.792592592592593, + "grad_norm": 5.0702314376831055, + "learning_rate": 8.380470903995852e-09, + "loss": 1.6829, + "step": 106600 + }, + { + "epoch": 15.8, + "grad_norm": 6.055559158325195, + "learning_rate": 7.793043550945456e-09, + "loss": 1.7089, + "step": 106650 + }, + { + "epoch": 15.807407407407407, + "grad_norm": 4.831183910369873, + "learning_rate": 7.226950781390463e-09, + "loss": 1.7252, + "step": 106700 + }, + { + "epoch": 15.814814814814815, + "grad_norm": 7.790440559387207, + "learning_rate": 6.682193804008475e-09, + "loss": 1.6727, + "step": 106750 + }, + { + "epoch": 15.822222222222223, + "grad_norm": 5.7263031005859375, + "learning_rate": 6.1587737819224224e-09, + "loss": 1.6554, + "step": 106800 + }, + { + "epoch": 15.829629629629629, + "grad_norm": 5.707858562469482, + "learning_rate": 5.656691832696126e-09, + "loss": 1.6258, + "step": 106850 + }, + { + "epoch": 15.837037037037037, + "grad_norm": 4.361595153808594, + "learning_rate": 5.1759490283376235e-09, + "loss": 1.6215, + "step": 106900 + }, + { + "epoch": 15.844444444444445, + "grad_norm": 4.160728931427002, + "learning_rate": 4.7165463952913996e-09, + "loss": 1.7259, + "step": 106950 + }, + { + "epoch": 15.851851851851851, + "grad_norm": 6.267629146575928, + "learning_rate": 4.278484914437276e-09, + "loss": 1.675, + "step": 107000 + }, + { + "epoch": 15.85925925925926, + "grad_norm": 4.6345014572143555, + "learning_rate": 3.8617655210915205e-09, + "loss": 1.7014, + "step": 107050 + }, + { + "epoch": 15.866666666666667, + "grad_norm": 5.403736591339111, + "learning_rate": 3.4663891050001895e-09, + "loss": 1.6819, + "step": 107100 + }, + { + "epoch": 15.874074074074073, + "grad_norm": 4.492266654968262, + "learning_rate": 3.0923565103402333e-09, + "loss": 1.6867, + "step": 107150 + }, + { + "epoch": 15.881481481481481, + "grad_norm": 5.615554332733154, + "learning_rate": 2.739668535717277e-09, + "loss": 1.6419, + "step": 107200 + }, + { + "epoch": 15.88888888888889, + "grad_norm": 5.3445000648498535, + "learning_rate": 2.408325934162292e-09, + "loss": 1.7122, + "step": 107250 + }, + { + "epoch": 15.896296296296295, + "grad_norm": 4.250411510467529, + "learning_rate": 2.098329413133815e-09, + "loss": 1.6285, + "step": 107300 + }, + { + "epoch": 15.903703703703703, + "grad_norm": 5.699678421020508, + "learning_rate": 1.8096796345112854e-09, + "loss": 1.722, + "step": 107350 + }, + { + "epoch": 15.911111111111111, + "grad_norm": 4.965419292449951, + "learning_rate": 1.5423772145983785e-09, + "loss": 1.6915, + "step": 107400 + }, + { + "epoch": 15.918518518518518, + "grad_norm": 5.15787410736084, + "learning_rate": 1.2964227241163418e-09, + "loss": 1.7422, + "step": 107450 + }, + { + "epoch": 15.925925925925926, + "grad_norm": 4.798497200012207, + "learning_rate": 1.0718166882106585e-09, + "loss": 1.6678, + "step": 107500 + }, + { + "epoch": 15.933333333333334, + "grad_norm": 6.525084495544434, + "learning_rate": 8.685595864399433e-10, + "loss": 1.6567, + "step": 107550 + }, + { + "epoch": 15.940740740740742, + "grad_norm": 4.747183799743652, + "learning_rate": 6.866518527848254e-10, + "loss": 1.6978, + "step": 107600 + }, + { + "epoch": 15.948148148148148, + "grad_norm": 4.648836612701416, + "learning_rate": 5.260938756401768e-10, + "loss": 1.6565, + "step": 107650 + }, + { + "epoch": 15.955555555555556, + "grad_norm": 4.3354268074035645, + "learning_rate": 3.868859978173323e-10, + "loss": 1.6512, + "step": 107700 + }, + { + "epoch": 15.962962962962964, + "grad_norm": 8.2989501953125, + "learning_rate": 2.6902851654075914e-10, + "loss": 1.69, + "step": 107750 + }, + { + "epoch": 15.97037037037037, + "grad_norm": 5.1592230796813965, + "learning_rate": 1.7252168345249787e-10, + "loss": 1.714, + "step": 107800 + }, + { + "epoch": 15.977777777777778, + "grad_norm": 4.358281135559082, + "learning_rate": 9.736570460439077e-11, + "loss": 1.6542, + "step": 107850 + }, + { + "epoch": 15.985185185185186, + "grad_norm": 5.035132884979248, + "learning_rate": 4.3560740465853345e-11, + "loss": 1.7706, + "step": 107900 + }, + { + "epoch": 15.992592592592592, + "grad_norm": 4.129238128662109, + "learning_rate": 1.1106905914992639e-11, + "loss": 1.6808, + "step": 107950 + }, + { + "epoch": 16.0, + "grad_norm": 5.48187255859375, + "learning_rate": 4.270245268500617e-15, + "loss": 1.7222, + "step": 108000 + } + ], + "logging_steps": 50, + "max_steps": 108000, + "num_input_tokens_seen": 0, + "num_train_epochs": 16, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3104265682349158e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}