{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 16.0, "eval_steps": 100, "global_step": 108000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007407407407407408, "grad_norm": 22.549795150756836, "learning_rate": 1.9600000000000003e-06, "loss": 10.1124, "step": 50 }, { "epoch": 0.014814814814814815, "grad_norm": 22.49587631225586, "learning_rate": 3.96e-06, "loss": 7.7186, "step": 100 }, { "epoch": 0.022222222222222223, "grad_norm": 8.133307456970215, "learning_rate": 5.9600000000000005e-06, "loss": 4.0014, "step": 150 }, { "epoch": 0.02962962962962963, "grad_norm": 5.439042091369629, "learning_rate": 7.960000000000002e-06, "loss": 2.5781, "step": 200 }, { "epoch": 0.037037037037037035, "grad_norm": 6.463360786437988, "learning_rate": 9.960000000000001e-06, "loss": 2.1167, "step": 250 }, { "epoch": 0.044444444444444446, "grad_norm": 5.057401180267334, "learning_rate": 1.196e-05, "loss": 2.0461, "step": 300 }, { "epoch": 0.05185185185185185, "grad_norm": 5.143754482269287, "learning_rate": 1.396e-05, "loss": 2.0597, "step": 350 }, { "epoch": 0.05925925925925926, "grad_norm": 5.2624993324279785, "learning_rate": 1.5960000000000003e-05, "loss": 1.9737, "step": 400 }, { "epoch": 0.06666666666666667, "grad_norm": 5.019354820251465, "learning_rate": 1.796e-05, "loss": 1.9023, "step": 450 }, { "epoch": 0.07407407407407407, "grad_norm": 4.731785297393799, "learning_rate": 1.9960000000000002e-05, "loss": 1.9953, "step": 500 }, { "epoch": 0.08148148148148149, "grad_norm": 4.751569747924805, "learning_rate": 1.9996966947554476e-05, "loss": 1.9253, "step": 550 }, { "epoch": 0.08888888888888889, "grad_norm": 3.835200071334839, "learning_rate": 1.9987620859825225e-05, "loss": 1.8835, "step": 600 }, { "epoch": 0.0962962962962963, "grad_norm": 4.402861595153809, "learning_rate": 1.997196637669223e-05, "loss": 1.8831, "step": 650 }, { "epoch": 0.1037037037037037, "grad_norm": 3.8608696460723877, "learning_rate": 1.9950013385862575e-05, "loss": 1.92, "step": 700 }, { "epoch": 0.1111111111111111, "grad_norm": 3.7692012786865234, "learning_rate": 1.9921775753315793e-05, "loss": 1.8961, "step": 750 }, { "epoch": 0.11851851851851852, "grad_norm": 3.9636597633361816, "learning_rate": 1.9887271314545823e-05, "loss": 1.9051, "step": 800 }, { "epoch": 0.1259259259259259, "grad_norm": 4.725071430206299, "learning_rate": 1.984652186329575e-05, "loss": 1.9338, "step": 850 }, { "epoch": 0.13333333333333333, "grad_norm": 4.309209823608398, "learning_rate": 1.9799553137792373e-05, "loss": 1.9735, "step": 900 }, { "epoch": 0.14074074074074075, "grad_norm": 4.000847816467285, "learning_rate": 1.9746394804489425e-05, "loss": 1.8845, "step": 950 }, { "epoch": 0.14814814814814814, "grad_norm": 4.6890058517456055, "learning_rate": 1.9687080439329585e-05, "loss": 1.9339, "step": 1000 }, { "epoch": 0.15555555555555556, "grad_norm": 4.7135701179504395, "learning_rate": 1.9621647506537192e-05, "loss": 1.8919, "step": 1050 }, { "epoch": 0.16296296296296298, "grad_norm": 4.486835956573486, "learning_rate": 1.955013733495505e-05, "loss": 1.7963, "step": 1100 }, { "epoch": 0.17037037037037037, "grad_norm": 4.228176593780518, "learning_rate": 1.947259509194024e-05, "loss": 1.8209, "step": 1150 }, { "epoch": 0.17777777777777778, "grad_norm": 4.834776878356934, "learning_rate": 1.9389069754835436e-05, "loss": 1.8378, "step": 1200 }, { "epoch": 0.18518518518518517, "grad_norm": 5.123416423797607, "learning_rate": 1.9299614080033794e-05, "loss": 1.96, "step": 1250 }, { "epoch": 0.1925925925925926, "grad_norm": 4.351609230041504, "learning_rate": 1.9204284569656848e-05, "loss": 1.8308, "step": 1300 }, { "epoch": 0.2, "grad_norm": 3.7220265865325928, "learning_rate": 1.91031414358666e-05, "loss": 1.8498, "step": 1350 }, { "epoch": 0.2074074074074074, "grad_norm": 3.787611484527588, "learning_rate": 1.8996248562834184e-05, "loss": 1.7831, "step": 1400 }, { "epoch": 0.21481481481481482, "grad_norm": 3.5662522315979004, "learning_rate": 1.8883673466389286e-05, "loss": 1.7446, "step": 1450 }, { "epoch": 0.2222222222222222, "grad_norm": 3.7757928371429443, "learning_rate": 1.876548725137569e-05, "loss": 1.8315, "step": 1500 }, { "epoch": 0.22962962962962963, "grad_norm": 4.004444122314453, "learning_rate": 1.8641764566739933e-05, "loss": 1.8419, "step": 1550 }, { "epoch": 0.23703703703703705, "grad_norm": 3.936206817626953, "learning_rate": 1.8512583558381422e-05, "loss": 1.8409, "step": 1600 }, { "epoch": 0.24444444444444444, "grad_norm": 3.8752267360687256, "learning_rate": 1.8378025819793832e-05, "loss": 1.8383, "step": 1650 }, { "epoch": 0.2518518518518518, "grad_norm": 3.54079008102417, "learning_rate": 1.823817634052888e-05, "loss": 1.8094, "step": 1700 }, { "epoch": 0.25925925925925924, "grad_norm": 4.669138431549072, "learning_rate": 1.8093123452515122e-05, "loss": 1.8002, "step": 1750 }, { "epoch": 0.26666666666666666, "grad_norm": 4.1702704429626465, "learning_rate": 1.7942958774265603e-05, "loss": 1.8473, "step": 1800 }, { "epoch": 0.2740740740740741, "grad_norm": 3.8928747177124023, "learning_rate": 1.778777715300964e-05, "loss": 1.8624, "step": 1850 }, { "epoch": 0.2814814814814815, "grad_norm": 3.1108946800231934, "learning_rate": 1.76276766047853e-05, "loss": 1.8579, "step": 1900 }, { "epoch": 0.28888888888888886, "grad_norm": 4.28292989730835, "learning_rate": 1.746275825253033e-05, "loss": 1.8081, "step": 1950 }, { "epoch": 0.2962962962962963, "grad_norm": 3.518152952194214, "learning_rate": 1.729312626221078e-05, "loss": 1.7689, "step": 2000 }, { "epoch": 0.3037037037037037, "grad_norm": 3.84735369682312, "learning_rate": 1.7118887777027525e-05, "loss": 1.7576, "step": 2050 }, { "epoch": 0.3111111111111111, "grad_norm": 4.495599269866943, "learning_rate": 1.694015284974233e-05, "loss": 1.753, "step": 2100 }, { "epoch": 0.31851851851851853, "grad_norm": 4.794450759887695, "learning_rate": 1.6757034373166164e-05, "loss": 1.7982, "step": 2150 }, { "epoch": 0.32592592592592595, "grad_norm": 3.8519253730773926, "learning_rate": 1.6569648008853686e-05, "loss": 1.831, "step": 2200 }, { "epoch": 0.3333333333333333, "grad_norm": 4.328887462615967, "learning_rate": 1.6378112114048925e-05, "loss": 1.8093, "step": 2250 }, { "epoch": 0.34074074074074073, "grad_norm": 4.722980976104736, "learning_rate": 1.61825476669283e-05, "loss": 1.7542, "step": 2300 }, { "epoch": 0.34814814814814815, "grad_norm": 3.8855786323547363, "learning_rate": 1.5983078190188224e-05, "loss": 1.78, "step": 2350 }, { "epoch": 0.35555555555555557, "grad_norm": 3.9583353996276855, "learning_rate": 1.577982967302552e-05, "loss": 1.811, "step": 2400 }, { "epoch": 0.362962962962963, "grad_norm": 3.5661697387695312, "learning_rate": 1.5572930491559928e-05, "loss": 1.7345, "step": 2450 }, { "epoch": 0.37037037037037035, "grad_norm": 3.3534741401672363, "learning_rate": 1.536251132774902e-05, "loss": 1.8426, "step": 2500 }, { "epoch": 0.37777777777777777, "grad_norm": 4.127283096313477, "learning_rate": 1.5148705086846656e-05, "loss": 1.8462, "step": 2550 }, { "epoch": 0.3851851851851852, "grad_norm": 3.9793572425842285, "learning_rate": 1.4931646813457183e-05, "loss": 1.824, "step": 2600 }, { "epoch": 0.3925925925925926, "grad_norm": 3.6798741817474365, "learning_rate": 1.4711473606238373e-05, "loss": 1.8243, "step": 2650 }, { "epoch": 0.4, "grad_norm": 4.0039520263671875, "learning_rate": 1.4488324531306963e-05, "loss": 1.7302, "step": 2700 }, { "epoch": 0.4074074074074074, "grad_norm": 4.815029144287109, "learning_rate": 1.4262340534401525e-05, "loss": 1.7768, "step": 2750 }, { "epoch": 0.4148148148148148, "grad_norm": 4.897470474243164, "learning_rate": 1.4033664351858107e-05, "loss": 1.8168, "step": 2800 }, { "epoch": 0.4222222222222222, "grad_norm": 4.236404895782471, "learning_rate": 1.3802440420454915e-05, "loss": 1.7363, "step": 2850 }, { "epoch": 0.42962962962962964, "grad_norm": 3.841942548751831, "learning_rate": 1.3568814786182938e-05, "loss": 1.7471, "step": 2900 }, { "epoch": 0.43703703703703706, "grad_norm": 4.591772556304932, "learning_rate": 1.3332935012000171e-05, "loss": 1.8046, "step": 2950 }, { "epoch": 0.4444444444444444, "grad_norm": 4.35361385345459, "learning_rate": 1.3094950084627698e-05, "loss": 1.8287, "step": 3000 }, { "epoch": 0.45185185185185184, "grad_norm": 3.7746520042419434, "learning_rate": 1.2855010320446471e-05, "loss": 1.838, "step": 3050 }, { "epoch": 0.45925925925925926, "grad_norm": 6.708522796630859, "learning_rate": 1.261326727055427e-05, "loss": 1.7481, "step": 3100 }, { "epoch": 0.4666666666666667, "grad_norm": 4.1498847007751465, "learning_rate": 1.2369873625042784e-05, "loss": 1.7127, "step": 3150 }, { "epoch": 0.4740740740740741, "grad_norm": 4.49635124206543, "learning_rate": 1.2124983116555271e-05, "loss": 1.7455, "step": 3200 }, { "epoch": 0.48148148148148145, "grad_norm": 3.5679614543914795, "learning_rate": 1.187875042318573e-05, "loss": 1.7606, "step": 3250 }, { "epoch": 0.4888888888888889, "grad_norm": 3.869044780731201, "learning_rate": 1.1631331070780895e-05, "loss": 1.7529, "step": 3300 }, { "epoch": 0.4962962962962963, "grad_norm": 3.826565980911255, "learning_rate": 1.138288133470678e-05, "loss": 1.7686, "step": 3350 }, { "epoch": 0.5037037037037037, "grad_norm": 3.7426226139068604, "learning_rate": 1.1133558141141823e-05, "loss": 1.7185, "step": 3400 }, { "epoch": 0.5111111111111111, "grad_norm": 3.6974334716796875, "learning_rate": 1.088351896795891e-05, "loss": 1.7201, "step": 3450 }, { "epoch": 0.5185185185185185, "grad_norm": 4.5774054527282715, "learning_rate": 1.0632921745259022e-05, "loss": 1.7731, "step": 3500 }, { "epoch": 0.5259259259259259, "grad_norm": 4.392045974731445, "learning_rate": 1.0381924755619161e-05, "loss": 1.7597, "step": 3550 }, { "epoch": 0.5333333333333333, "grad_norm": 4.4499993324279785, "learning_rate": 1.0130686534117704e-05, "loss": 1.8088, "step": 3600 }, { "epoch": 0.5407407407407407, "grad_norm": 4.234902858734131, "learning_rate": 9.879365768200245e-06, "loss": 1.7659, "step": 3650 }, { "epoch": 0.5481481481481482, "grad_norm": 3.981933116912842, "learning_rate": 9.62812119744919e-06, "loss": 1.7617, "step": 3700 }, { "epoch": 0.5555555555555556, "grad_norm": 3.9301373958587646, "learning_rate": 9.377111513320452e-06, "loss": 1.7294, "step": 3750 }, { "epoch": 0.562962962962963, "grad_norm": 4.41324520111084, "learning_rate": 9.12649525891052e-06, "loss": 1.7386, "step": 3800 }, { "epoch": 0.5703703703703704, "grad_norm": 4.823089599609375, "learning_rate": 8.876430728817238e-06, "loss": 1.7384, "step": 3850 }, { "epoch": 0.5777777777777777, "grad_norm": 4.53611421585083, "learning_rate": 8.627075869157543e-06, "loss": 1.7353, "step": 3900 }, { "epoch": 0.5851851851851851, "grad_norm": 3.9253506660461426, "learning_rate": 8.37858817780532e-06, "loss": 1.7514, "step": 3950 }, { "epoch": 0.5925925925925926, "grad_norm": 4.049194812774658, "learning_rate": 8.131124604912365e-06, "loss": 1.7986, "step": 4000 }, { "epoch": 0.6, "grad_norm": 3.7836596965789795, "learning_rate": 7.884841453775301e-06, "loss": 1.7423, "step": 4050 }, { "epoch": 0.6074074074074074, "grad_norm": 4.229864120483398, "learning_rate": 7.63989428211107e-06, "loss": 1.772, "step": 4100 }, { "epoch": 0.6148148148148148, "grad_norm": 3.277118444442749, "learning_rate": 7.3964378038033515e-06, "loss": 1.7633, "step": 4150 }, { "epoch": 0.6222222222222222, "grad_norm": 3.6668293476104736, "learning_rate": 7.154625791181937e-06, "loss": 1.7836, "step": 4200 }, { "epoch": 0.6296296296296297, "grad_norm": 4.401381015777588, "learning_rate": 6.914610977896858e-06, "loss": 1.8098, "step": 4250 }, { "epoch": 0.6370370370370371, "grad_norm": 5.0834808349609375, "learning_rate": 6.676544962448514e-06, "loss": 1.8092, "step": 4300 }, { "epoch": 0.6444444444444445, "grad_norm": 4.588042736053467, "learning_rate": 6.44057811243483e-06, "loss": 1.7334, "step": 4350 }, { "epoch": 0.6518518518518519, "grad_norm": 4.436654567718506, "learning_rate": 6.20685946957585e-06, "loss": 1.8513, "step": 4400 }, { "epoch": 0.6592592592592592, "grad_norm": 3.1908841133117676, "learning_rate": 5.97553665557578e-06, "loss": 1.7538, "step": 4450 }, { "epoch": 0.6666666666666666, "grad_norm": 3.9574151039123535, "learning_rate": 5.746755778881979e-06, "loss": 1.738, "step": 4500 }, { "epoch": 0.674074074074074, "grad_norm": 3.3809914588928223, "learning_rate": 5.520661342399726e-06, "loss": 1.7949, "step": 4550 }, { "epoch": 0.6814814814814815, "grad_norm": 5.055593967437744, "learning_rate": 5.297396152221066e-06, "loss": 1.7811, "step": 4600 }, { "epoch": 0.6888888888888889, "grad_norm": 4.099941253662109, "learning_rate": 5.0771012274254515e-06, "loss": 1.7209, "step": 4650 }, { "epoch": 0.6962962962962963, "grad_norm": 4.286564826965332, "learning_rate": 4.85991571100906e-06, "loss": 1.7683, "step": 4700 }, { "epoch": 0.7037037037037037, "grad_norm": 4.559092998504639, "learning_rate": 4.645976781999073e-06, "loss": 1.8504, "step": 4750 }, { "epoch": 0.7111111111111111, "grad_norm": 5.575582981109619, "learning_rate": 4.4354195688085e-06, "loss": 1.7454, "step": 4800 }, { "epoch": 0.7185185185185186, "grad_norm": 4.1070780754089355, "learning_rate": 4.228377063886143e-06, "loss": 1.7724, "step": 4850 }, { "epoch": 0.725925925925926, "grad_norm": 4.384026527404785, "learning_rate": 4.0249800397157425e-06, "loss": 1.7621, "step": 4900 }, { "epoch": 0.7333333333333333, "grad_norm": 3.8214433193206787, "learning_rate": 3.825356966217246e-06, "loss": 1.6777, "step": 4950 }, { "epoch": 0.7407407407407407, "grad_norm": 5.473362445831299, "learning_rate": 3.6296339296024853e-06, "loss": 1.7927, "step": 5000 }, { "epoch": 0.7481481481481481, "grad_norm": 5.610530853271484, "learning_rate": 3.437934552736388e-06, "loss": 1.7948, "step": 5050 }, { "epoch": 0.7555555555555555, "grad_norm": 4.789555072784424, "learning_rate": 3.2503799170541573e-06, "loss": 1.7594, "step": 5100 }, { "epoch": 0.762962962962963, "grad_norm": 4.272578239440918, "learning_rate": 3.067088486083628e-06, "loss": 1.7519, "step": 5150 }, { "epoch": 0.7703703703703704, "grad_norm": 4.48111629486084, "learning_rate": 2.888176030621148e-06, "loss": 1.7246, "step": 5200 }, { "epoch": 0.7777777777777778, "grad_norm": 3.5455284118652344, "learning_rate": 2.713755555608295e-06, "loss": 1.7379, "step": 5250 }, { "epoch": 0.7851851851851852, "grad_norm": 4.136303901672363, "learning_rate": 2.5439372287555164e-06, "loss": 1.8543, "step": 5300 }, { "epoch": 0.7925925925925926, "grad_norm": 3.6334547996520996, "learning_rate": 2.3788283109578282e-06, "loss": 1.7957, "step": 5350 }, { "epoch": 0.8, "grad_norm": 3.3337242603302, "learning_rate": 2.2185330885465626e-06, "loss": 1.7577, "step": 5400 }, { "epoch": 0.8074074074074075, "grad_norm": 4.086159706115723, "learning_rate": 2.0631528074198624e-06, "loss": 1.7125, "step": 5450 }, { "epoch": 0.8148148148148148, "grad_norm": 4.198705673217773, "learning_rate": 1.912785609093619e-06, "loss": 1.8493, "step": 5500 }, { "epoch": 0.8222222222222222, "grad_norm": 3.8114004135131836, "learning_rate": 1.7675264687131699e-06, "loss": 1.736, "step": 5550 }, { "epoch": 0.8296296296296296, "grad_norm": 4.404809474945068, "learning_rate": 1.6274671350649818e-06, "loss": 1.6118, "step": 5600 }, { "epoch": 0.837037037037037, "grad_norm": 5.045413017272949, "learning_rate": 1.4926960726261342e-06, "loss": 1.8435, "step": 5650 }, { "epoch": 0.8444444444444444, "grad_norm": 3.8105881214141846, "learning_rate": 1.3632984056882615e-06, "loss": 1.8167, "step": 5700 }, { "epoch": 0.8518518518518519, "grad_norm": 3.879211902618408, "learning_rate": 1.2393558645912395e-06, "loss": 1.8421, "step": 5750 }, { "epoch": 0.8592592592592593, "grad_norm": 4.706577301025391, "learning_rate": 1.1209467341005297e-06, "loss": 1.7424, "step": 5800 }, { "epoch": 0.8666666666666667, "grad_norm": 4.003446578979492, "learning_rate": 1.0081458039608638e-06, "loss": 1.7016, "step": 5850 }, { "epoch": 0.8740740740740741, "grad_norm": 4.579277515411377, "learning_rate": 9.010243216574233e-07, "loss": 1.8125, "step": 5900 }, { "epoch": 0.8814814814814815, "grad_norm": 3.8693718910217285, "learning_rate": 7.996499474144115e-07, "loss": 1.7651, "step": 5950 }, { "epoch": 0.8888888888888888, "grad_norm": 4.174098014831543, "learning_rate": 7.040867114593952e-07, "loss": 1.741, "step": 6000 }, { "epoch": 0.8962962962962963, "grad_norm": 4.958495616912842, "learning_rate": 6.143949735804477e-07, "loss": 1.7791, "step": 6050 }, { "epoch": 0.9037037037037037, "grad_norm": 4.0520548820495605, "learning_rate": 5.306313850016154e-07, "loss": 1.7638, "step": 6100 }, { "epoch": 0.9111111111111111, "grad_norm": 5.820758819580078, "learning_rate": 4.5284885260078014e-07, "loss": 1.7263, "step": 6150 }, { "epoch": 0.9185185185185185, "grad_norm": 3.6252601146698, "learning_rate": 3.8109650549255195e-07, "loss": 1.736, "step": 6200 }, { "epoch": 0.9259259259259259, "grad_norm": 4.056784152984619, "learning_rate": 3.1541966399726287e-07, "loss": 1.754, "step": 6250 }, { "epoch": 0.9333333333333333, "grad_norm": 3.9577994346618652, "learning_rate": 2.5585981101567627e-07, "loss": 1.8165, "step": 6300 }, { "epoch": 0.9407407407407408, "grad_norm": 4.5424089431762695, "learning_rate": 2.024545658275079e-07, "loss": 1.8013, "step": 6350 }, { "epoch": 0.9481481481481482, "grad_norm": 3.9447548389434814, "learning_rate": 1.5523766033027298e-07, "loss": 1.7521, "step": 6400 }, { "epoch": 0.9555555555555556, "grad_norm": 5.120405673980713, "learning_rate": 1.1423891773350238e-07, "loss": 1.7422, "step": 6450 }, { "epoch": 0.9629629629629629, "grad_norm": 4.158134937286377, "learning_rate": 7.948423372176384e-08, "loss": 1.7485, "step": 6500 }, { "epoch": 0.9703703703703703, "grad_norm": 4.426783561706543, "learning_rate": 5.099556009838913e-08, "loss": 1.8421, "step": 6550 }, { "epoch": 0.9777777777777777, "grad_norm": 4.222548961639404, "learning_rate": 2.8790890920249447e-08, "loss": 1.7025, "step": 6600 }, { "epoch": 0.9851851851851852, "grad_norm": 4.41562032699585, "learning_rate": 1.2884251132316839e-08, "loss": 1.79, "step": 6650 }, { "epoch": 0.9925925925925926, "grad_norm": 4.172543048858643, "learning_rate": 3.2856877092168895e-09, "loss": 1.7572, "step": 6700 }, { "epoch": 1.0, "grad_norm": 4.530770301818848, "learning_rate": 1.2633093371405836e-12, "loss": 1.7305, "step": 6750 }, { "epoch": 1.0074074074074073, "grad_norm": 4.5686211585998535, "learning_rate": 1.0485547568805592e-05, "loss": 1.7918, "step": 6800 }, { "epoch": 1.0148148148148148, "grad_norm": 4.193668842315674, "learning_rate": 1.0364827091168057e-05, "loss": 1.818, "step": 6850 }, { "epoch": 1.0222222222222221, "grad_norm": 3.926337957382202, "learning_rate": 1.0244053349399506e-05, "loss": 1.7526, "step": 6900 }, { "epoch": 1.0296296296296297, "grad_norm": 3.8789143562316895, "learning_rate": 1.0123243976259578e-05, "loss": 1.733, "step": 6950 }, { "epoch": 1.037037037037037, "grad_norm": 4.395325183868408, "learning_rate": 1.000241660971001e-05, "loss": 1.6787, "step": 7000 }, { "epoch": 1.0444444444444445, "grad_norm": 4.981533527374268, "learning_rate": 9.881588890339562e-06, "loss": 1.7642, "step": 7050 }, { "epoch": 1.0518518518518518, "grad_norm": 3.6582109928131104, "learning_rate": 9.760778458788497e-06, "loss": 1.7935, "step": 7100 }, { "epoch": 1.0592592592592593, "grad_norm": 4.174370288848877, "learning_rate": 9.640002953173087e-06, "loss": 1.7471, "step": 7150 }, { "epoch": 1.0666666666666667, "grad_norm": 4.729079246520996, "learning_rate": 9.519280006510476e-06, "loss": 1.7822, "step": 7200 }, { "epoch": 1.074074074074074, "grad_norm": 3.563659429550171, "learning_rate": 9.398627244144298e-06, "loss": 1.7268, "step": 7250 }, { "epoch": 1.0814814814814815, "grad_norm": 4.399823188781738, "learning_rate": 9.278062281171394e-06, "loss": 1.7783, "step": 7300 }, { "epoch": 1.0888888888888888, "grad_norm": 3.1910765171051025, "learning_rate": 9.157602719870045e-06, "loss": 1.8013, "step": 7350 }, { "epoch": 1.0962962962962963, "grad_norm": 4.391103267669678, "learning_rate": 9.037266147130064e-06, "loss": 1.7556, "step": 7400 }, { "epoch": 1.1037037037037036, "grad_norm": 4.619572162628174, "learning_rate": 8.917070131885155e-06, "loss": 1.8024, "step": 7450 }, { "epoch": 1.1111111111111112, "grad_norm": 4.849502086639404, "learning_rate": 8.797032222547856e-06, "loss": 1.7657, "step": 7500 }, { "epoch": 1.1185185185185185, "grad_norm": 3.781859874725342, "learning_rate": 8.67716994444752e-06, "loss": 1.7622, "step": 7550 }, { "epoch": 1.125925925925926, "grad_norm": 3.9502782821655273, "learning_rate": 8.557500797271638e-06, "loss": 1.7528, "step": 7600 }, { "epoch": 1.1333333333333333, "grad_norm": 4.41927433013916, "learning_rate": 8.438042252510919e-06, "loss": 1.7763, "step": 7650 }, { "epoch": 1.1407407407407408, "grad_norm": 4.123870372772217, "learning_rate": 8.318811750908481e-06, "loss": 1.7615, "step": 7700 }, { "epoch": 1.1481481481481481, "grad_norm": 4.480880260467529, "learning_rate": 8.199826699913524e-06, "loss": 1.7523, "step": 7750 }, { "epoch": 1.1555555555555554, "grad_norm": 4.082483291625977, "learning_rate": 8.081104471139885e-06, "loss": 1.7517, "step": 7800 }, { "epoch": 1.162962962962963, "grad_norm": 4.776106357574463, "learning_rate": 7.962662397829805e-06, "loss": 1.6679, "step": 7850 }, { "epoch": 1.1703703703703703, "grad_norm": 5.85430908203125, "learning_rate": 7.844517772323305e-06, "loss": 1.6882, "step": 7900 }, { "epoch": 1.1777777777777778, "grad_norm": 4.3917012214660645, "learning_rate": 7.726687843533539e-06, "loss": 1.7661, "step": 7950 }, { "epoch": 1.1851851851851851, "grad_norm": 3.9886467456817627, "learning_rate": 7.609189814428473e-06, "loss": 1.7336, "step": 8000 }, { "epoch": 1.1925925925925926, "grad_norm": 4.853082180023193, "learning_rate": 7.492040839519299e-06, "loss": 1.8087, "step": 8050 }, { "epoch": 1.2, "grad_norm": 4.096999645233154, "learning_rate": 7.37525802235588e-06, "loss": 1.7323, "step": 8100 }, { "epoch": 1.2074074074074075, "grad_norm": 4.180538177490234, "learning_rate": 7.258858413029683e-06, "loss": 1.8245, "step": 8150 }, { "epoch": 1.2148148148148148, "grad_norm": 4.450080871582031, "learning_rate": 7.142859005684486e-06, "loss": 1.7485, "step": 8200 }, { "epoch": 1.2222222222222223, "grad_norm": 5.08303689956665, "learning_rate": 7.027276736035256e-06, "loss": 1.7721, "step": 8250 }, { "epoch": 1.2296296296296296, "grad_norm": 3.8852577209472656, "learning_rate": 6.912128478895575e-06, "loss": 1.7258, "step": 8300 }, { "epoch": 1.237037037037037, "grad_norm": 4.540018081665039, "learning_rate": 6.797431045713948e-06, "loss": 1.8447, "step": 8350 }, { "epoch": 1.2444444444444445, "grad_norm": 4.3245954513549805, "learning_rate": 6.683201182119334e-06, "loss": 1.7964, "step": 8400 }, { "epoch": 1.2518518518518518, "grad_norm": 6.528281211853027, "learning_rate": 6.569455565476361e-06, "loss": 1.7047, "step": 8450 }, { "epoch": 1.2592592592592593, "grad_norm": 4.52567195892334, "learning_rate": 6.4562108024504065e-06, "loss": 1.7325, "step": 8500 }, { "epoch": 1.2666666666666666, "grad_norm": 3.1090915203094482, "learning_rate": 6.343483426583085e-06, "loss": 1.7662, "step": 8550 }, { "epoch": 1.2740740740740741, "grad_norm": 4.740061283111572, "learning_rate": 6.231289895878375e-06, "loss": 1.7386, "step": 8600 }, { "epoch": 1.2814814814814814, "grad_norm": 4.207073211669922, "learning_rate": 6.119646590399768e-06, "loss": 1.7749, "step": 8650 }, { "epoch": 1.2888888888888888, "grad_norm": 3.976893901824951, "learning_rate": 6.008569809878817e-06, "loss": 1.7334, "step": 8700 }, { "epoch": 1.2962962962962963, "grad_norm": 5.906888961791992, "learning_rate": 5.898075771335408e-06, "loss": 1.739, "step": 8750 }, { "epoch": 1.3037037037037038, "grad_norm": 4.902368068695068, "learning_rate": 5.788180606710076e-06, "loss": 1.7981, "step": 8800 }, { "epoch": 1.3111111111111111, "grad_norm": 5.860922813415527, "learning_rate": 5.678900360508813e-06, "loss": 1.7231, "step": 8850 }, { "epoch": 1.3185185185185184, "grad_norm": 4.610461711883545, "learning_rate": 5.570250987460557e-06, "loss": 1.814, "step": 8900 }, { "epoch": 1.325925925925926, "grad_norm": 4.3659844398498535, "learning_rate": 5.462248350187851e-06, "loss": 1.7117, "step": 8950 }, { "epoch": 1.3333333333333333, "grad_norm": 5.581747531890869, "learning_rate": 5.35490821689092e-06, "loss": 1.7883, "step": 9000 }, { "epoch": 1.3407407407407408, "grad_norm": 4.3234333992004395, "learning_rate": 5.248246259045545e-06, "loss": 1.7519, "step": 9050 }, { "epoch": 1.348148148148148, "grad_norm": 4.9023308753967285, "learning_rate": 5.142278049115043e-06, "loss": 1.7388, "step": 9100 }, { "epoch": 1.3555555555555556, "grad_norm": 5.058767318725586, "learning_rate": 5.037019058276733e-06, "loss": 1.7572, "step": 9150 }, { "epoch": 1.362962962962963, "grad_norm": 4.163683891296387, "learning_rate": 4.932484654163156e-06, "loss": 1.8142, "step": 9200 }, { "epoch": 1.3703703703703702, "grad_norm": 4.509832859039307, "learning_rate": 4.828690098618429e-06, "loss": 1.8259, "step": 9250 }, { "epoch": 1.3777777777777778, "grad_norm": 4.533676624298096, "learning_rate": 4.725650545470048e-06, "loss": 1.7511, "step": 9300 }, { "epoch": 1.3851851851851853, "grad_norm": 5.931670665740967, "learning_rate": 4.62338103831645e-06, "loss": 1.707, "step": 9350 }, { "epoch": 1.3925925925925926, "grad_norm": 5.659622669219971, "learning_rate": 4.521896508330672e-06, "loss": 1.6732, "step": 9400 }, { "epoch": 1.4, "grad_norm": 5.572442054748535, "learning_rate": 4.421211772080429e-06, "loss": 1.774, "step": 9450 }, { "epoch": 1.4074074074074074, "grad_norm": 4.983119487762451, "learning_rate": 4.321341529364921e-06, "loss": 1.8113, "step": 9500 }, { "epoch": 1.4148148148148147, "grad_norm": 4.496840953826904, "learning_rate": 4.222300361068686e-06, "loss": 1.8397, "step": 9550 }, { "epoch": 1.4222222222222223, "grad_norm": 4.4754204750061035, "learning_rate": 4.12410272703281e-06, "loss": 1.7113, "step": 9600 }, { "epoch": 1.4296296296296296, "grad_norm": 4.279294490814209, "learning_rate": 4.026762963943822e-06, "loss": 1.7018, "step": 9650 }, { "epoch": 1.4370370370370371, "grad_norm": 4.388822078704834, "learning_rate": 3.93029528324057e-06, "loss": 1.6982, "step": 9700 }, { "epoch": 1.4444444444444444, "grad_norm": 4.000326156616211, "learning_rate": 3.83471376903936e-06, "loss": 1.696, "step": 9750 }, { "epoch": 1.4518518518518517, "grad_norm": 4.983404636383057, "learning_rate": 3.740032376077698e-06, "loss": 1.6858, "step": 9800 }, { "epoch": 1.4592592592592593, "grad_norm": 3.823800563812256, "learning_rate": 3.646264927676937e-06, "loss": 1.7345, "step": 9850 }, { "epoch": 1.4666666666666668, "grad_norm": 5.286291599273682, "learning_rate": 3.5534251137240883e-06, "loss": 1.6576, "step": 9900 }, { "epoch": 1.474074074074074, "grad_norm": 4.020094394683838, "learning_rate": 3.461526488673118e-06, "loss": 1.7326, "step": 9950 }, { "epoch": 1.4814814814814814, "grad_norm": 3.9249887466430664, "learning_rate": 3.370582469566027e-06, "loss": 1.7345, "step": 10000 }, { "epoch": 1.488888888888889, "grad_norm": 4.402612686157227, "learning_rate": 3.2806063340739768e-06, "loss": 1.7224, "step": 10050 }, { "epoch": 1.4962962962962962, "grad_norm": 4.24970817565918, "learning_rate": 3.1916112185587833e-06, "loss": 1.761, "step": 10100 }, { "epoch": 1.5037037037037035, "grad_norm": 5.631906509399414, "learning_rate": 3.103610116155018e-06, "loss": 1.7461, "step": 10150 }, { "epoch": 1.511111111111111, "grad_norm": 3.5104784965515137, "learning_rate": 3.0166158748730456e-06, "loss": 1.747, "step": 10200 }, { "epoch": 1.5185185185185186, "grad_norm": 5.422842979431152, "learning_rate": 2.930641195723224e-06, "loss": 1.7131, "step": 10250 }, { "epoch": 1.525925925925926, "grad_norm": 4.995806694030762, "learning_rate": 2.845698630861593e-06, "loss": 1.8228, "step": 10300 }, { "epoch": 1.5333333333333332, "grad_norm": 4.923055171966553, "learning_rate": 2.761800581757258e-06, "loss": 1.7489, "step": 10350 }, { "epoch": 1.5407407407407407, "grad_norm": 3.771245241165161, "learning_rate": 2.6789592973818257e-06, "loss": 1.7353, "step": 10400 }, { "epoch": 1.5481481481481483, "grad_norm": 3.624614715576172, "learning_rate": 2.5971868724210513e-06, "loss": 1.7126, "step": 10450 }, { "epoch": 1.5555555555555556, "grad_norm": 7.3490471839904785, "learning_rate": 2.5164952455090575e-06, "loss": 1.6574, "step": 10500 }, { "epoch": 1.5629629629629629, "grad_norm": 3.961510419845581, "learning_rate": 2.436896197485282e-06, "loss": 1.8217, "step": 10550 }, { "epoch": 1.5703703703703704, "grad_norm": 4.2067975997924805, "learning_rate": 2.358401349674528e-06, "loss": 1.7681, "step": 10600 }, { "epoch": 1.5777777777777777, "grad_norm": 4.517464637756348, "learning_rate": 2.2810221621902563e-06, "loss": 1.7826, "step": 10650 }, { "epoch": 1.585185185185185, "grad_norm": 4.264112949371338, "learning_rate": 2.2047699322614234e-06, "loss": 1.7051, "step": 10700 }, { "epoch": 1.5925925925925926, "grad_norm": 4.641966819763184, "learning_rate": 2.1296557925831164e-06, "loss": 1.7346, "step": 10750 }, { "epoch": 1.6, "grad_norm": 4.101382732391357, "learning_rate": 2.0556907096911926e-06, "loss": 1.7784, "step": 10800 }, { "epoch": 1.6074074074074074, "grad_norm": 3.9751815795898438, "learning_rate": 1.9828854823611776e-06, "loss": 1.8022, "step": 10850 }, { "epoch": 1.6148148148148147, "grad_norm": 4.810856819152832, "learning_rate": 1.9112507400316814e-06, "loss": 1.6955, "step": 10900 }, { "epoch": 1.6222222222222222, "grad_norm": 4.649193286895752, "learning_rate": 1.8407969412525006e-06, "loss": 1.7858, "step": 10950 }, { "epoch": 1.6296296296296298, "grad_norm": 4.422476291656494, "learning_rate": 1.7715343721576973e-06, "loss": 1.6827, "step": 11000 }, { "epoch": 1.637037037037037, "grad_norm": 4.0928473472595215, "learning_rate": 1.7034731449638287e-06, "loss": 1.7319, "step": 11050 }, { "epoch": 1.6444444444444444, "grad_norm": 4.212646007537842, "learning_rate": 1.6366231964936019e-06, "loss": 1.6627, "step": 11100 }, { "epoch": 1.651851851851852, "grad_norm": 5.1110663414001465, "learning_rate": 1.5709942867250972e-06, "loss": 1.7207, "step": 11150 }, { "epoch": 1.6592592592592592, "grad_norm": 4.896889686584473, "learning_rate": 1.5065959973668355e-06, "loss": 1.6812, "step": 11200 }, { "epoch": 1.6666666666666665, "grad_norm": 5.54286003112793, "learning_rate": 1.4434377304588643e-06, "loss": 1.8249, "step": 11250 }, { "epoch": 1.674074074074074, "grad_norm": 6.0234575271606445, "learning_rate": 1.3815287070000727e-06, "loss": 1.7191, "step": 11300 }, { "epoch": 1.6814814814814816, "grad_norm": 5.337372779846191, "learning_rate": 1.3208779656019466e-06, "loss": 1.7623, "step": 11350 }, { "epoch": 1.6888888888888889, "grad_norm": 4.075193405151367, "learning_rate": 1.2614943611689446e-06, "loss": 1.7792, "step": 11400 }, { "epoch": 1.6962962962962962, "grad_norm": 3.773776054382324, "learning_rate": 1.203386563605693e-06, "loss": 1.7327, "step": 11450 }, { "epoch": 1.7037037037037037, "grad_norm": 3.25433349609375, "learning_rate": 1.146563056551202e-06, "loss": 1.6928, "step": 11500 }, { "epoch": 1.7111111111111112, "grad_norm": 3.913914442062378, "learning_rate": 1.0910321361402654e-06, "loss": 1.6769, "step": 11550 }, { "epoch": 1.7185185185185186, "grad_norm": 3.303250551223755, "learning_rate": 1.0368019097922344e-06, "loss": 1.713, "step": 11600 }, { "epoch": 1.7259259259259259, "grad_norm": 5.97332239151001, "learning_rate": 9.838802950273551e-07, "loss": 1.7482, "step": 11650 }, { "epoch": 1.7333333333333334, "grad_norm": 4.753826141357422, "learning_rate": 9.322750183108264e-07, "loss": 1.7199, "step": 11700 }, { "epoch": 1.7407407407407407, "grad_norm": 4.7902092933654785, "learning_rate": 8.819936139247421e-07, "loss": 1.6939, "step": 11750 }, { "epoch": 1.748148148148148, "grad_norm": 4.5352630615234375, "learning_rate": 8.33043422868095e-07, "loss": 1.7428, "step": 11800 }, { "epoch": 1.7555555555555555, "grad_norm": 3.912677526473999, "learning_rate": 7.854315917850163e-07, "loss": 1.6898, "step": 11850 }, { "epoch": 1.762962962962963, "grad_norm": 5.150155067443848, "learning_rate": 7.391650719213706e-07, "loss": 1.6438, "step": 11900 }, { "epoch": 1.7703703703703704, "grad_norm": 6.563386917114258, "learning_rate": 6.942506181098851e-07, "loss": 1.7661, "step": 11950 }, { "epoch": 1.7777777777777777, "grad_norm": 3.9827239513397217, "learning_rate": 6.506947877839587e-07, "loss": 1.8036, "step": 12000 }, { "epoch": 1.7851851851851852, "grad_norm": 4.162949562072754, "learning_rate": 6.085039400202852e-07, "loss": 1.772, "step": 12050 }, { "epoch": 1.7925925925925927, "grad_norm": 4.235226154327393, "learning_rate": 5.676842346104383e-07, "loss": 1.6865, "step": 12100 }, { "epoch": 1.8, "grad_norm": 4.130675315856934, "learning_rate": 5.28241631161559e-07, "loss": 1.6688, "step": 12150 }, { "epoch": 1.8074074074074074, "grad_norm": 4.83511209487915, "learning_rate": 4.901818882262532e-07, "loss": 1.6613, "step": 12200 }, { "epoch": 1.8148148148148149, "grad_norm": 4.381454944610596, "learning_rate": 4.53510562461863e-07, "loss": 1.7773, "step": 12250 }, { "epoch": 1.8222222222222222, "grad_norm": 5.143768787384033, "learning_rate": 4.182330078191976e-07, "loss": 1.8461, "step": 12300 }, { "epoch": 1.8296296296296295, "grad_norm": 4.87855339050293, "learning_rate": 3.8435437476086466e-07, "loss": 1.7111, "step": 12350 }, { "epoch": 1.837037037037037, "grad_norm": 4.4321441650390625, "learning_rate": 3.51879609509318e-07, "loss": 1.7595, "step": 12400 }, { "epoch": 1.8444444444444446, "grad_norm": 4.462729454040527, "learning_rate": 3.2081345332471204e-07, "loss": 1.8522, "step": 12450 }, { "epoch": 1.8518518518518519, "grad_norm": 5.265483379364014, "learning_rate": 2.911604418126901e-07, "loss": 1.7855, "step": 12500 }, { "epoch": 1.8592592592592592, "grad_norm": 6.335628509521484, "learning_rate": 2.6292490426218955e-07, "loss": 1.6626, "step": 12550 }, { "epoch": 1.8666666666666667, "grad_norm": 5.781252384185791, "learning_rate": 2.3611096301337623e-07, "loss": 1.8229, "step": 12600 }, { "epoch": 1.8740740740740742, "grad_norm": 4.649851322174072, "learning_rate": 2.1072253285578602e-07, "loss": 1.7732, "step": 12650 }, { "epoch": 1.8814814814814815, "grad_norm": 4.953747749328613, "learning_rate": 1.867633204567776e-07, "loss": 1.6664, "step": 12700 }, { "epoch": 1.8888888888888888, "grad_norm": 3.763566493988037, "learning_rate": 1.6423682382036288e-07, "loss": 1.764, "step": 12750 }, { "epoch": 1.8962962962962964, "grad_norm": 5.626904487609863, "learning_rate": 1.431463317765025e-07, "loss": 1.7337, "step": 12800 }, { "epoch": 1.9037037037037037, "grad_norm": 5.060257434844971, "learning_rate": 1.2349492350094195e-07, "loss": 1.8271, "step": 12850 }, { "epoch": 1.911111111111111, "grad_norm": 4.137857913970947, "learning_rate": 1.0528546806566342e-07, "loss": 1.7984, "step": 12900 }, { "epoch": 1.9185185185185185, "grad_norm": 4.870020389556885, "learning_rate": 8.852062402000095e-08, "loss": 1.7068, "step": 12950 }, { "epoch": 1.925925925925926, "grad_norm": 4.509660243988037, "learning_rate": 7.320283900249636e-08, "loss": 1.7385, "step": 13000 }, { "epoch": 1.9333333333333333, "grad_norm": 4.607212543487549, "learning_rate": 5.933434938354965e-08, "loss": 1.7423, "step": 13050 }, { "epoch": 1.9407407407407407, "grad_norm": 4.2224016189575195, "learning_rate": 4.6917179938912315e-08, "loss": 1.7218, "step": 13100 }, { "epoch": 1.9481481481481482, "grad_norm": 4.699623107910156, "learning_rate": 3.595314355407609e-08, "loss": 1.7581, "step": 13150 }, { "epoch": 1.9555555555555557, "grad_norm": 5.265933990478516, "learning_rate": 2.6443840959590183e-08, "loss": 1.7206, "step": 13200 }, { "epoch": 1.9629629629629628, "grad_norm": 4.173360347747803, "learning_rate": 1.839066049736271e-08, "loss": 1.7231, "step": 13250 }, { "epoch": 1.9703703703703703, "grad_norm": 4.037991523742676, "learning_rate": 1.1794777917957245e-08, "loss": 1.6786, "step": 13300 }, { "epoch": 1.9777777777777779, "grad_norm": 3.815904140472412, "learning_rate": 6.657156208946802e-09, "loss": 1.7763, "step": 13350 }, { "epoch": 1.9851851851851852, "grad_norm": 4.844634532928467, "learning_rate": 2.9785454543074244e-09, "loss": 1.7412, "step": 13400 }, { "epoch": 1.9925925925925925, "grad_norm": 4.531811237335205, "learning_rate": 7.594827249135517e-10, "loss": 1.7515, "step": 13450 }, { "epoch": 2.0, "grad_norm": 5.7891435623168945, "learning_rate": 2.9200012852115266e-13, "loss": 1.7226, "step": 13500 }, { "epoch": 2.0074074074074075, "grad_norm": 5.030249118804932, "learning_rate": 5.162960293961459e-06, "loss": 1.702, "step": 13550 }, { "epoch": 2.0148148148148146, "grad_norm": 4.552247047424316, "learning_rate": 5.093503346649434e-06, "loss": 1.6877, "step": 13600 }, { "epoch": 2.022222222222222, "grad_norm": 4.661364555358887, "learning_rate": 5.024356765789562e-06, "loss": 1.6621, "step": 13650 }, { "epoch": 2.0296296296296297, "grad_norm": 4.184024333953857, "learning_rate": 4.955524925333455e-06, "loss": 1.811, "step": 13700 }, { "epoch": 2.037037037037037, "grad_norm": 4.5465898513793945, "learning_rate": 4.887012179323451e-06, "loss": 1.7626, "step": 13750 }, { "epoch": 2.0444444444444443, "grad_norm": 5.515983581542969, "learning_rate": 4.818822861617165e-06, "loss": 1.761, "step": 13800 }, { "epoch": 2.051851851851852, "grad_norm": 4.300454616546631, "learning_rate": 4.7509612856133645e-06, "loss": 1.7684, "step": 13850 }, { "epoch": 2.0592592592592593, "grad_norm": 4.138401508331299, "learning_rate": 4.683431743979113e-06, "loss": 1.6814, "step": 13900 }, { "epoch": 2.066666666666667, "grad_norm": 4.315591335296631, "learning_rate": 4.616238508378233e-06, "loss": 1.7425, "step": 13950 }, { "epoch": 2.074074074074074, "grad_norm": 5.489791393280029, "learning_rate": 4.549385829201098e-06, "loss": 1.7289, "step": 14000 }, { "epoch": 2.0814814814814815, "grad_norm": 3.5941522121429443, "learning_rate": 4.482877935295768e-06, "loss": 1.7781, "step": 14050 }, { "epoch": 2.088888888888889, "grad_norm": 4.438997268676758, "learning_rate": 4.416719033700483e-06, "loss": 1.7008, "step": 14100 }, { "epoch": 2.096296296296296, "grad_norm": 4.225255966186523, "learning_rate": 4.350913309377562e-06, "loss": 1.7366, "step": 14150 }, { "epoch": 2.1037037037037036, "grad_norm": 5.48150110244751, "learning_rate": 4.28546492494865e-06, "loss": 1.784, "step": 14200 }, { "epoch": 2.111111111111111, "grad_norm": 4.1811299324035645, "learning_rate": 4.220378020431424e-06, "loss": 1.7335, "step": 14250 }, { "epoch": 2.1185185185185187, "grad_norm": 4.268235683441162, "learning_rate": 4.155656712977703e-06, "loss": 1.7866, "step": 14300 }, { "epoch": 2.1259259259259258, "grad_norm": 4.499932765960693, "learning_rate": 4.091305096613023e-06, "loss": 1.7826, "step": 14350 }, { "epoch": 2.1333333333333333, "grad_norm": 5.271858215332031, "learning_rate": 4.027327241977652e-06, "loss": 1.7491, "step": 14400 }, { "epoch": 2.140740740740741, "grad_norm": 5.147281169891357, "learning_rate": 3.963727196069101e-06, "loss": 1.8014, "step": 14450 }, { "epoch": 2.148148148148148, "grad_norm": 4.847378730773926, "learning_rate": 3.900508981986137e-06, "loss": 1.6866, "step": 14500 }, { "epoch": 2.1555555555555554, "grad_norm": 4.832731246948242, "learning_rate": 3.8376765986742795e-06, "loss": 1.811, "step": 14550 }, { "epoch": 2.162962962962963, "grad_norm": 4.3363800048828125, "learning_rate": 3.77523402067285e-06, "loss": 1.7513, "step": 14600 }, { "epoch": 2.1703703703703705, "grad_norm": 5.211742877960205, "learning_rate": 3.71318519786356e-06, "loss": 1.6949, "step": 14650 }, { "epoch": 2.1777777777777776, "grad_norm": 5.9687113761901855, "learning_rate": 3.6515340552206547e-06, "loss": 1.6632, "step": 14700 }, { "epoch": 2.185185185185185, "grad_norm": 4.977273464202881, "learning_rate": 3.5902844925626334e-06, "loss": 1.7158, "step": 14750 }, { "epoch": 2.1925925925925926, "grad_norm": 4.584545612335205, "learning_rate": 3.5294403843055604e-06, "loss": 1.6568, "step": 14800 }, { "epoch": 2.2, "grad_norm": 5.935274600982666, "learning_rate": 3.4690055792179824e-06, "loss": 1.6968, "step": 14850 }, { "epoch": 2.2074074074074073, "grad_norm": 5.389420509338379, "learning_rate": 3.408983900177486e-06, "loss": 1.763, "step": 14900 }, { "epoch": 2.214814814814815, "grad_norm": 5.381558895111084, "learning_rate": 3.3493791439288503e-06, "loss": 1.7165, "step": 14950 }, { "epoch": 2.2222222222222223, "grad_norm": 4.326274871826172, "learning_rate": 3.2901950808438975e-06, "loss": 1.7434, "step": 15000 }, { "epoch": 2.2296296296296294, "grad_norm": 5.2910919189453125, "learning_rate": 3.2314354546829874e-06, "loss": 1.7093, "step": 15050 }, { "epoch": 2.237037037037037, "grad_norm": 3.7212393283843994, "learning_rate": 3.173103982358211e-06, "loss": 1.6989, "step": 15100 }, { "epoch": 2.2444444444444445, "grad_norm": 5.0373759269714355, "learning_rate": 3.11520435369825e-06, "loss": 1.798, "step": 15150 }, { "epoch": 2.251851851851852, "grad_norm": 5.25777006149292, "learning_rate": 3.0577402312149963e-06, "loss": 1.7995, "step": 15200 }, { "epoch": 2.259259259259259, "grad_norm": 4.734623432159424, "learning_rate": 3.0007152498718596e-06, "loss": 1.7205, "step": 15250 }, { "epoch": 2.2666666666666666, "grad_norm": 5.25666618347168, "learning_rate": 2.9441330168538484e-06, "loss": 1.8312, "step": 15300 }, { "epoch": 2.274074074074074, "grad_norm": 4.524062156677246, "learning_rate": 2.8879971113393755e-06, "loss": 1.7134, "step": 15350 }, { "epoch": 2.2814814814814817, "grad_norm": 5.685743808746338, "learning_rate": 2.832311084273863e-06, "loss": 1.7821, "step": 15400 }, { "epoch": 2.2888888888888888, "grad_norm": 4.252753734588623, "learning_rate": 2.7770784581451205e-06, "loss": 1.7991, "step": 15450 }, { "epoch": 2.2962962962962963, "grad_norm": 5.265336990356445, "learning_rate": 2.7223027267605307e-06, "loss": 1.763, "step": 15500 }, { "epoch": 2.303703703703704, "grad_norm": 4.18657922744751, "learning_rate": 2.667987355026039e-06, "loss": 1.7952, "step": 15550 }, { "epoch": 2.311111111111111, "grad_norm": 4.812895774841309, "learning_rate": 2.614135778726965e-06, "loss": 1.7109, "step": 15600 }, { "epoch": 2.3185185185185184, "grad_norm": 5.158443450927734, "learning_rate": 2.5607514043106997e-06, "loss": 1.6805, "step": 15650 }, { "epoch": 2.325925925925926, "grad_norm": 4.908586025238037, "learning_rate": 2.507837608671194e-06, "loss": 1.726, "step": 15700 }, { "epoch": 2.3333333333333335, "grad_norm": 4.755005836486816, "learning_rate": 2.4553977389353678e-06, "loss": 1.7835, "step": 15750 }, { "epoch": 2.3407407407407406, "grad_norm": 4.698461532592773, "learning_rate": 2.4034351122513723e-06, "loss": 1.7529, "step": 15800 }, { "epoch": 2.348148148148148, "grad_norm": 5.384017467498779, "learning_rate": 2.351953015578775e-06, "loss": 1.8149, "step": 15850 }, { "epoch": 2.3555555555555556, "grad_norm": 7.171658515930176, "learning_rate": 2.3009547054806205e-06, "loss": 1.7038, "step": 15900 }, { "epoch": 2.362962962962963, "grad_norm": 5.168342590332031, "learning_rate": 2.2504434079174465e-06, "loss": 1.8115, "step": 15950 }, { "epoch": 2.3703703703703702, "grad_norm": 4.522395610809326, "learning_rate": 2.200422318043206e-06, "loss": 1.7485, "step": 16000 }, { "epoch": 2.3777777777777778, "grad_norm": 5.041557788848877, "learning_rate": 2.150894600003182e-06, "loss": 1.6743, "step": 16050 }, { "epoch": 2.3851851851851853, "grad_norm": 5.59356164932251, "learning_rate": 2.1018633867338055e-06, "loss": 1.8181, "step": 16100 }, { "epoch": 2.3925925925925924, "grad_norm": 5.429520130157471, "learning_rate": 2.0533317797644947e-06, "loss": 1.773, "step": 16150 }, { "epoch": 2.4, "grad_norm": 4.348500728607178, "learning_rate": 2.0053028490214555e-06, "loss": 1.7281, "step": 16200 }, { "epoch": 2.4074074074074074, "grad_norm": 5.83561897277832, "learning_rate": 1.957779632633503e-06, "loss": 1.6564, "step": 16250 }, { "epoch": 2.414814814814815, "grad_norm": 5.049750804901123, "learning_rate": 1.910765136739864e-06, "loss": 1.7724, "step": 16300 }, { "epoch": 2.422222222222222, "grad_norm": 5.865388870239258, "learning_rate": 1.8642623353000277e-06, "loss": 1.7494, "step": 16350 }, { "epoch": 2.4296296296296296, "grad_norm": 4.3326215744018555, "learning_rate": 1.8182741699056273e-06, "loss": 1.8173, "step": 16400 }, { "epoch": 2.437037037037037, "grad_norm": 5.618774890899658, "learning_rate": 1.7728035495943618e-06, "loss": 1.7839, "step": 16450 }, { "epoch": 2.4444444444444446, "grad_norm": 5.614139556884766, "learning_rate": 1.7278533506659822e-06, "loss": 1.6849, "step": 16500 }, { "epoch": 2.4518518518518517, "grad_norm": 4.3281779289245605, "learning_rate": 1.6834264165003478e-06, "loss": 1.701, "step": 16550 }, { "epoch": 2.4592592592592593, "grad_norm": 4.949063777923584, "learning_rate": 1.6395255573775626e-06, "loss": 1.7329, "step": 16600 }, { "epoch": 2.466666666666667, "grad_norm": 4.536187648773193, "learning_rate": 1.5961535503002168e-06, "loss": 1.7248, "step": 16650 }, { "epoch": 2.474074074074074, "grad_norm": 4.2007293701171875, "learning_rate": 1.5533131388177115e-06, "loss": 1.6858, "step": 16700 }, { "epoch": 2.4814814814814814, "grad_norm": 4.544723033905029, "learning_rate": 1.511007032852716e-06, "loss": 1.7631, "step": 16750 }, { "epoch": 2.488888888888889, "grad_norm": 4.876046180725098, "learning_rate": 1.4692379085297549e-06, "loss": 1.727, "step": 16800 }, { "epoch": 2.4962962962962965, "grad_norm": 4.374514102935791, "learning_rate": 1.4280084080059175e-06, "loss": 1.6994, "step": 16850 }, { "epoch": 2.5037037037037035, "grad_norm": 4.541500091552734, "learning_rate": 1.3873211393037333e-06, "loss": 1.7648, "step": 16900 }, { "epoch": 2.511111111111111, "grad_norm": 4.923274993896484, "learning_rate": 1.347178676146188e-06, "loss": 1.7582, "step": 16950 }, { "epoch": 2.5185185185185186, "grad_norm": 4.922545433044434, "learning_rate": 1.3075835577939332e-06, "loss": 1.7092, "step": 17000 }, { "epoch": 2.525925925925926, "grad_norm": 3.655978202819824, "learning_rate": 1.268538288884651e-06, "loss": 1.7205, "step": 17050 }, { "epoch": 2.533333333333333, "grad_norm": 4.191182613372803, "learning_rate": 1.2300453392746226e-06, "loss": 1.7521, "step": 17100 }, { "epoch": 2.5407407407407407, "grad_norm": 4.058208465576172, "learning_rate": 1.1921071438824971e-06, "loss": 1.7354, "step": 17150 }, { "epoch": 2.5481481481481483, "grad_norm": 4.33668327331543, "learning_rate": 1.1547261025352674e-06, "loss": 1.7775, "step": 17200 }, { "epoch": 2.5555555555555554, "grad_norm": 4.856389999389648, "learning_rate": 1.1179045798164634e-06, "loss": 1.8189, "step": 17250 }, { "epoch": 2.562962962962963, "grad_norm": 4.473984241485596, "learning_rate": 1.08164490491658e-06, "loss": 1.763, "step": 17300 }, { "epoch": 2.5703703703703704, "grad_norm": 5.137580871582031, "learning_rate": 1.0459493714857404e-06, "loss": 1.7901, "step": 17350 }, { "epoch": 2.5777777777777775, "grad_norm": 4.774714946746826, "learning_rate": 1.0108202374886111e-06, "loss": 1.6943, "step": 17400 }, { "epoch": 2.585185185185185, "grad_norm": 4.900181293487549, "learning_rate": 9.762597250615647e-07, "loss": 1.8258, "step": 17450 }, { "epoch": 2.5925925925925926, "grad_norm": 3.775563955307007, "learning_rate": 9.422700203721235e-07, "loss": 1.7091, "step": 17500 }, { "epoch": 2.6, "grad_norm": 5.2524800300598145, "learning_rate": 9.088532734806655e-07, "loss": 1.7415, "step": 17550 }, { "epoch": 2.6074074074074076, "grad_norm": 4.3708648681640625, "learning_rate": 8.760115982044259e-07, "loss": 1.8055, "step": 17600 }, { "epoch": 2.6148148148148147, "grad_norm": 5.368456840515137, "learning_rate": 8.437470719837737e-07, "loss": 1.7102, "step": 17650 }, { "epoch": 2.6222222222222222, "grad_norm": 4.581341743469238, "learning_rate": 8.120617357508109e-07, "loss": 1.7465, "step": 17700 }, { "epoch": 2.6296296296296298, "grad_norm": 4.498773097991943, "learning_rate": 7.809575938002744e-07, "loss": 1.7582, "step": 17750 }, { "epoch": 2.637037037037037, "grad_norm": 4.894174098968506, "learning_rate": 7.504366136627372e-07, "loss": 1.7584, "step": 17800 }, { "epoch": 2.6444444444444444, "grad_norm": 5.387177467346191, "learning_rate": 7.205007259801589e-07, "loss": 1.7143, "step": 17850 }, { "epoch": 2.651851851851852, "grad_norm": 4.475066661834717, "learning_rate": 6.911518243837634e-07, "loss": 1.6871, "step": 17900 }, { "epoch": 2.659259259259259, "grad_norm": 5.426724910736084, "learning_rate": 6.623917653742473e-07, "loss": 1.7643, "step": 17950 }, { "epoch": 2.6666666666666665, "grad_norm": 5.106094837188721, "learning_rate": 6.342223682043536e-07, "loss": 1.7472, "step": 18000 }, { "epoch": 2.674074074074074, "grad_norm": 5.548438549041748, "learning_rate": 6.066454147637791e-07, "loss": 1.7487, "step": 18050 }, { "epoch": 2.6814814814814816, "grad_norm": 4.959684371948242, "learning_rate": 5.796626494664736e-07, "loss": 1.7598, "step": 18100 }, { "epoch": 2.688888888888889, "grad_norm": 4.252431392669678, "learning_rate": 5.53275779140291e-07, "loss": 1.7861, "step": 18150 }, { "epoch": 2.696296296296296, "grad_norm": 4.257000923156738, "learning_rate": 5.274864729190121e-07, "loss": 1.7652, "step": 18200 }, { "epoch": 2.7037037037037037, "grad_norm": 5.807356357574463, "learning_rate": 5.022963621367738e-07, "loss": 1.7479, "step": 18250 }, { "epoch": 2.7111111111111112, "grad_norm": 4.54324197769165, "learning_rate": 4.777070402248674e-07, "loss": 1.6874, "step": 18300 }, { "epoch": 2.7185185185185183, "grad_norm": 4.591061592102051, "learning_rate": 4.5372006261095616e-07, "loss": 1.7495, "step": 18350 }, { "epoch": 2.725925925925926, "grad_norm": 4.683126449584961, "learning_rate": 4.3033694662067193e-07, "loss": 1.8024, "step": 18400 }, { "epoch": 2.7333333333333334, "grad_norm": 5.42063570022583, "learning_rate": 4.075591713816396e-07, "loss": 1.7193, "step": 18450 }, { "epoch": 2.7407407407407405, "grad_norm": 4.868284225463867, "learning_rate": 3.85388177729914e-07, "loss": 1.7279, "step": 18500 }, { "epoch": 2.748148148148148, "grad_norm": 4.900742053985596, "learning_rate": 3.6382536811884304e-07, "loss": 1.7472, "step": 18550 }, { "epoch": 2.7555555555555555, "grad_norm": 6.0796308517456055, "learning_rate": 3.428721065303442e-07, "loss": 1.7336, "step": 18600 }, { "epoch": 2.762962962962963, "grad_norm": 5.27460241317749, "learning_rate": 3.225297183886289e-07, "loss": 1.7737, "step": 18650 }, { "epoch": 2.7703703703703706, "grad_norm": 5.652526378631592, "learning_rate": 3.0279949047636094e-07, "loss": 1.6492, "step": 18700 }, { "epoch": 2.7777777777777777, "grad_norm": 4.943962574005127, "learning_rate": 2.836826708532603e-07, "loss": 1.7477, "step": 18750 }, { "epoch": 2.785185185185185, "grad_norm": 4.6496429443359375, "learning_rate": 2.6518046877715643e-07, "loss": 1.7588, "step": 18800 }, { "epoch": 2.7925925925925927, "grad_norm": 5.191527366638184, "learning_rate": 2.472940546274871e-07, "loss": 1.7494, "step": 18850 }, { "epoch": 2.8, "grad_norm": 4.17832612991333, "learning_rate": 2.300245598312778e-07, "loss": 1.6849, "step": 18900 }, { "epoch": 2.8074074074074074, "grad_norm": 3.590705633163452, "learning_rate": 2.1337307679156206e-07, "loss": 1.7048, "step": 18950 }, { "epoch": 2.814814814814815, "grad_norm": 7.666831970214844, "learning_rate": 1.9734065881828467e-07, "loss": 1.7125, "step": 19000 }, { "epoch": 2.822222222222222, "grad_norm": 4.35758638381958, "learning_rate": 1.8192832006166949e-07, "loss": 1.7405, "step": 19050 }, { "epoch": 2.8296296296296295, "grad_norm": 4.952466011047363, "learning_rate": 1.6713703544807169e-07, "loss": 1.7473, "step": 19100 }, { "epoch": 2.837037037037037, "grad_norm": 3.8383476734161377, "learning_rate": 1.5296774061830722e-07, "loss": 1.7803, "step": 19150 }, { "epoch": 2.8444444444444446, "grad_norm": 4.229979515075684, "learning_rate": 1.3942133186846563e-07, "loss": 1.6724, "step": 19200 }, { "epoch": 2.851851851851852, "grad_norm": 4.510770797729492, "learning_rate": 1.2649866609321548e-07, "loss": 1.7898, "step": 19250 }, { "epoch": 2.859259259259259, "grad_norm": 5.060910701751709, "learning_rate": 1.1420056073159879e-07, "loss": 1.632, "step": 19300 }, { "epoch": 2.8666666666666667, "grad_norm": 4.019813537597656, "learning_rate": 1.0252779371532795e-07, "loss": 1.72, "step": 19350 }, { "epoch": 2.8740740740740742, "grad_norm": 4.553711414337158, "learning_rate": 9.148110341956618e-08, "loss": 1.7606, "step": 19400 }, { "epoch": 2.8814814814814813, "grad_norm": 4.837120056152344, "learning_rate": 8.10611886162338e-08, "loss": 1.7198, "step": 19450 }, { "epoch": 2.888888888888889, "grad_norm": 6.557154178619385, "learning_rate": 7.126870842979695e-08, "loss": 1.8004, "step": 19500 }, { "epoch": 2.8962962962962964, "grad_norm": 6.096388816833496, "learning_rate": 6.210428229557641e-08, "loss": 1.7971, "step": 19550 }, { "epoch": 2.9037037037037035, "grad_norm": 5.821315288543701, "learning_rate": 5.356848992056574e-08, "loss": 1.6286, "step": 19600 }, { "epoch": 2.911111111111111, "grad_norm": 6.1501336097717285, "learning_rate": 4.566187124676269e-08, "loss": 1.798, "step": 19650 }, { "epoch": 2.9185185185185185, "grad_norm": 5.411888599395752, "learning_rate": 3.8384926417008864e-08, "loss": 1.7652, "step": 19700 }, { "epoch": 2.925925925925926, "grad_norm": 4.397125720977783, "learning_rate": 3.1738115743358275e-08, "loss": 1.7398, "step": 19750 }, { "epoch": 2.9333333333333336, "grad_norm": 4.768675327301025, "learning_rate": 2.5721859677957374e-08, "loss": 1.7477, "step": 19800 }, { "epoch": 2.9407407407407407, "grad_norm": 4.665643215179443, "learning_rate": 2.033653878644626e-08, "loss": 1.7327, "step": 19850 }, { "epoch": 2.948148148148148, "grad_norm": 4.826103687286377, "learning_rate": 1.5582493723893533e-08, "loss": 1.8006, "step": 19900 }, { "epoch": 2.9555555555555557, "grad_norm": 5.414469242095947, "learning_rate": 1.1460025213236858e-08, "loss": 1.7529, "step": 19950 }, { "epoch": 2.962962962962963, "grad_norm": 3.83282732963562, "learning_rate": 7.969394026269284e-09, "loss": 1.6982, "step": 20000 }, { "epoch": 2.9703703703703703, "grad_norm": 4.964536190032959, "learning_rate": 5.110820967140217e-09, "loss": 1.7715, "step": 20050 }, { "epoch": 2.977777777777778, "grad_norm": 4.930854320526123, "learning_rate": 2.8844868583866036e-09, "loss": 1.7523, "step": 20100 }, { "epoch": 2.985185185185185, "grad_norm": 4.766547679901123, "learning_rate": 1.2905325294987337e-09, "loss": 1.7657, "step": 20150 }, { "epoch": 2.9925925925925925, "grad_norm": 5.212747097015381, "learning_rate": 3.290588080095969e-10, "loss": 1.7505, "step": 20200 }, { "epoch": 3.0, "grad_norm": 4.470175266265869, "learning_rate": 1.265131144290166e-13, "loss": 1.622, "step": 20250 }, { "epoch": 3.0074074074074075, "grad_norm": 5.816748142242432, "learning_rate": 2.9929275944909074e-06, "loss": 1.716, "step": 20300 }, { "epoch": 3.0148148148148146, "grad_norm": 5.761480331420898, "learning_rate": 2.9507610183637545e-06, "loss": 1.7195, "step": 20350 }, { "epoch": 3.022222222222222, "grad_norm": 4.432187557220459, "learning_rate": 2.9088421211129593e-06, "loss": 1.7226, "step": 20400 }, { "epoch": 3.0296296296296297, "grad_norm": 5.013230323791504, "learning_rate": 2.8671723755819604e-06, "loss": 1.712, "step": 20450 }, { "epoch": 3.037037037037037, "grad_norm": 5.5916266441345215, "learning_rate": 2.8257532458601156e-06, "loss": 1.7733, "step": 20500 }, { "epoch": 3.0444444444444443, "grad_norm": 4.905713081359863, "learning_rate": 2.7845861872312497e-06, "loss": 1.7558, "step": 20550 }, { "epoch": 3.051851851851852, "grad_norm": 4.5096306800842285, "learning_rate": 2.743672646122539e-06, "loss": 1.7287, "step": 20600 }, { "epoch": 3.0592592592592593, "grad_norm": 3.8488352298736572, "learning_rate": 2.703014060053688e-06, "loss": 1.6764, "step": 20650 }, { "epoch": 3.066666666666667, "grad_norm": 5.562567234039307, "learning_rate": 2.6626118575864003e-06, "loss": 1.7835, "step": 20700 }, { "epoch": 3.074074074074074, "grad_norm": 4.081836700439453, "learning_rate": 2.622467458274216e-06, "loss": 1.775, "step": 20750 }, { "epoch": 3.0814814814814815, "grad_norm": 5.104362964630127, "learning_rate": 2.5825822726126095e-06, "loss": 1.6917, "step": 20800 }, { "epoch": 3.088888888888889, "grad_norm": 3.979689359664917, "learning_rate": 2.542957701989447e-06, "loss": 1.6521, "step": 20850 }, { "epoch": 3.096296296296296, "grad_norm": 4.8478779792785645, "learning_rate": 2.503595138635747e-06, "loss": 1.6938, "step": 20900 }, { "epoch": 3.1037037037037036, "grad_norm": 4.569914817810059, "learning_rate": 2.464495965576745e-06, "loss": 1.6867, "step": 20950 }, { "epoch": 3.111111111111111, "grad_norm": 5.286581993103027, "learning_rate": 2.4256615565833285e-06, "loss": 1.712, "step": 21000 }, { "epoch": 3.1185185185185187, "grad_norm": 3.896458387374878, "learning_rate": 2.3870932761237487e-06, "loss": 1.7861, "step": 21050 }, { "epoch": 3.1259259259259258, "grad_norm": 3.608572244644165, "learning_rate": 2.34879247931568e-06, "loss": 1.7556, "step": 21100 }, { "epoch": 3.1333333333333333, "grad_norm": 4.977129936218262, "learning_rate": 2.310760511878619e-06, "loss": 1.7147, "step": 21150 }, { "epoch": 3.140740740740741, "grad_norm": 6.970212936401367, "learning_rate": 2.2729987100865946e-06, "loss": 1.7385, "step": 21200 }, { "epoch": 3.148148148148148, "grad_norm": 5.554306507110596, "learning_rate": 2.2355084007212126e-06, "loss": 1.7916, "step": 21250 }, { "epoch": 3.1555555555555554, "grad_norm": 4.7065558433532715, "learning_rate": 2.1982909010250465e-06, "loss": 1.7447, "step": 21300 }, { "epoch": 3.162962962962963, "grad_norm": 3.5494189262390137, "learning_rate": 2.161347518655358e-06, "loss": 1.6721, "step": 21350 }, { "epoch": 3.1703703703703705, "grad_norm": 5.571441173553467, "learning_rate": 2.1246795516381324e-06, "loss": 1.8119, "step": 21400 }, { "epoch": 3.1777777777777776, "grad_norm": 4.489774227142334, "learning_rate": 2.0882882883224996e-06, "loss": 1.7543, "step": 21450 }, { "epoch": 3.185185185185185, "grad_norm": 4.602801322937012, "learning_rate": 2.0521750073354484e-06, "loss": 1.6942, "step": 21500 }, { "epoch": 3.1925925925925926, "grad_norm": 4.661532878875732, "learning_rate": 2.0163409775369015e-06, "loss": 1.7236, "step": 21550 }, { "epoch": 3.2, "grad_norm": 5.045383930206299, "learning_rate": 1.9807874579751427e-06, "loss": 1.7122, "step": 21600 }, { "epoch": 3.2074074074074073, "grad_norm": 3.879729986190796, "learning_rate": 1.9455156978425783e-06, "loss": 1.7256, "step": 21650 }, { "epoch": 3.214814814814815, "grad_norm": 5.181421279907227, "learning_rate": 1.9105269364318323e-06, "loss": 1.7287, "step": 21700 }, { "epoch": 3.2222222222222223, "grad_norm": 6.361110210418701, "learning_rate": 1.8758224030922224e-06, "loss": 1.7141, "step": 21750 }, { "epoch": 3.2296296296296294, "grad_norm": 4.662287712097168, "learning_rate": 1.8414033171865564e-06, "loss": 1.7346, "step": 21800 }, { "epoch": 3.237037037037037, "grad_norm": 5.042120933532715, "learning_rate": 1.8072708880482825e-06, "loss": 1.6893, "step": 21850 }, { "epoch": 3.2444444444444445, "grad_norm": 5.092305660247803, "learning_rate": 1.7734263149390141e-06, "loss": 1.7447, "step": 21900 }, { "epoch": 3.251851851851852, "grad_norm": 5.0466227531433105, "learning_rate": 1.739870787006387e-06, "loss": 1.687, "step": 21950 }, { "epoch": 3.259259259259259, "grad_norm": 4.487703323364258, "learning_rate": 1.7066054832422641e-06, "loss": 1.7448, "step": 22000 }, { "epoch": 3.2666666666666666, "grad_norm": 4.347760200500488, "learning_rate": 1.6736315724413344e-06, "loss": 1.7028, "step": 22050 }, { "epoch": 3.274074074074074, "grad_norm": 4.359129905700684, "learning_rate": 1.6409502131600352e-06, "loss": 1.6988, "step": 22100 }, { "epoch": 3.2814814814814817, "grad_norm": 5.131579399108887, "learning_rate": 1.6085625536758376e-06, "loss": 1.7299, "step": 22150 }, { "epoch": 3.2888888888888888, "grad_norm": 5.537661552429199, "learning_rate": 1.5764697319469147e-06, "loss": 1.6983, "step": 22200 }, { "epoch": 3.2962962962962963, "grad_norm": 5.458590030670166, "learning_rate": 1.5446728755721563e-06, "loss": 1.7139, "step": 22250 }, { "epoch": 3.303703703703704, "grad_norm": 5.582429885864258, "learning_rate": 1.5131731017515384e-06, "loss": 1.7113, "step": 22300 }, { "epoch": 3.311111111111111, "grad_norm": 5.524541854858398, "learning_rate": 1.4819715172468873e-06, "loss": 1.7508, "step": 22350 }, { "epoch": 3.3185185185185184, "grad_norm": 5.851492404937744, "learning_rate": 1.451069218342983e-06, "loss": 1.7525, "step": 22400 }, { "epoch": 3.325925925925926, "grad_norm": 5.018085479736328, "learning_rate": 1.4204672908090345e-06, "loss": 1.7677, "step": 22450 }, { "epoch": 3.3333333333333335, "grad_norm": 4.522815704345703, "learning_rate": 1.390166809860547e-06, "loss": 1.7647, "step": 22500 }, { "epoch": 3.3407407407407406, "grad_norm": 4.073155879974365, "learning_rate": 1.3601688401215274e-06, "loss": 1.6966, "step": 22550 }, { "epoch": 3.348148148148148, "grad_norm": 4.451920509338379, "learning_rate": 1.3304744355870914e-06, "loss": 1.7378, "step": 22600 }, { "epoch": 3.3555555555555556, "grad_norm": 4.887360572814941, "learning_rate": 1.301084639586424e-06, "loss": 1.7222, "step": 22650 }, { "epoch": 3.362962962962963, "grad_norm": 4.85434627532959, "learning_rate": 1.2720004847461165e-06, "loss": 1.6973, "step": 22700 }, { "epoch": 3.3703703703703702, "grad_norm": 5.109053134918213, "learning_rate": 1.2432229929538952e-06, "loss": 1.7477, "step": 22750 }, { "epoch": 3.3777777777777778, "grad_norm": 4.921713352203369, "learning_rate": 1.214753175322716e-06, "loss": 1.6625, "step": 22800 }, { "epoch": 3.3851851851851853, "grad_norm": 4.879732608795166, "learning_rate": 1.1865920321552238e-06, "loss": 1.6934, "step": 22850 }, { "epoch": 3.3925925925925924, "grad_norm": 4.966546058654785, "learning_rate": 1.158740552908627e-06, "loss": 1.7571, "step": 22900 }, { "epoch": 3.4, "grad_norm": 4.508844375610352, "learning_rate": 1.1311997161599186e-06, "loss": 1.7194, "step": 22950 }, { "epoch": 3.4074074074074074, "grad_norm": 6.013401508331299, "learning_rate": 1.1039704895714941e-06, "loss": 1.7142, "step": 23000 }, { "epoch": 3.414814814814815, "grad_norm": 5.535034656524658, "learning_rate": 1.0770538298571598e-06, "loss": 1.707, "step": 23050 }, { "epoch": 3.422222222222222, "grad_norm": 3.6205241680145264, "learning_rate": 1.0504506827485139e-06, "loss": 1.7567, "step": 23100 }, { "epoch": 3.4296296296296296, "grad_norm": 3.9489850997924805, "learning_rate": 1.024161982961711e-06, "loss": 1.7595, "step": 23150 }, { "epoch": 3.437037037037037, "grad_norm": 5.946234703063965, "learning_rate": 9.981886541646325e-07, "loss": 1.7318, "step": 23200 }, { "epoch": 3.4444444444444446, "grad_norm": 4.73652458190918, "learning_rate": 9.725316089444291e-07, "loss": 1.7689, "step": 23250 }, { "epoch": 3.4518518518518517, "grad_norm": 4.716205596923828, "learning_rate": 9.471917487754456e-07, "loss": 1.71, "step": 23300 }, { "epoch": 3.4592592592592593, "grad_norm": 5.690927028656006, "learning_rate": 9.221699639875637e-07, "loss": 1.7837, "step": 23350 }, { "epoch": 3.466666666666667, "grad_norm": 6.249578952789307, "learning_rate": 8.974671337349128e-07, "loss": 1.6703, "step": 23400 }, { "epoch": 3.474074074074074, "grad_norm": 5.07996940612793, "learning_rate": 8.730841259649725e-07, "loss": 1.7051, "step": 23450 }, { "epoch": 3.4814814814814814, "grad_norm": 5.213183403015137, "learning_rate": 8.49021797388091e-07, "loss": 1.7092, "step": 23500 }, { "epoch": 3.488888888888889, "grad_norm": 5.148874759674072, "learning_rate": 8.252809934473771e-07, "loss": 1.715, "step": 23550 }, { "epoch": 3.4962962962962965, "grad_norm": 4.876300811767578, "learning_rate": 8.018625482889897e-07, "loss": 1.6792, "step": 23600 }, { "epoch": 3.5037037037037035, "grad_norm": 5.332327842712402, "learning_rate": 7.787672847328387e-07, "loss": 1.77, "step": 23650 }, { "epoch": 3.511111111111111, "grad_norm": 4.203309535980225, "learning_rate": 7.559960142436751e-07, "loss": 1.7447, "step": 23700 }, { "epoch": 3.5185185185185186, "grad_norm": 4.4003987312316895, "learning_rate": 7.335495369025669e-07, "loss": 1.6881, "step": 23750 }, { "epoch": 3.525925925925926, "grad_norm": 4.845513343811035, "learning_rate": 7.11428641378804e-07, "loss": 1.7904, "step": 23800 }, { "epoch": 3.533333333333333, "grad_norm": 4.780256748199463, "learning_rate": 6.896341049021804e-07, "loss": 1.6839, "step": 23850 }, { "epoch": 3.5407407407407407, "grad_norm": 5.716015815734863, "learning_rate": 6.6816669323568e-07, "loss": 1.727, "step": 23900 }, { "epoch": 3.5481481481481483, "grad_norm": 4.932426929473877, "learning_rate": 6.470271606485834e-07, "loss": 1.7465, "step": 23950 }, { "epoch": 3.5555555555555554, "grad_norm": 4.499339580535889, "learning_rate": 6.262162498899593e-07, "loss": 1.7221, "step": 24000 }, { "epoch": 3.562962962962963, "grad_norm": 4.518272399902344, "learning_rate": 6.057346921625628e-07, "loss": 1.7454, "step": 24050 }, { "epoch": 3.5703703703703704, "grad_norm": 4.595861434936523, "learning_rate": 5.855832070971557e-07, "loss": 1.7209, "step": 24100 }, { "epoch": 3.5777777777777775, "grad_norm": 4.064270973205566, "learning_rate": 5.657625027272162e-07, "loss": 1.793, "step": 24150 }, { "epoch": 3.585185185185185, "grad_norm": 5.750711441040039, "learning_rate": 5.462732754640554e-07, "loss": 1.7546, "step": 24200 }, { "epoch": 3.5925925925925926, "grad_norm": 4.3929901123046875, "learning_rate": 5.271162100723592e-07, "loss": 1.716, "step": 24250 }, { "epoch": 3.6, "grad_norm": 5.952783584594727, "learning_rate": 5.08291979646125e-07, "loss": 1.7265, "step": 24300 }, { "epoch": 3.6074074074074076, "grad_norm": 4.167656898498535, "learning_rate": 4.898012455850065e-07, "loss": 1.6597, "step": 24350 }, { "epoch": 3.6148148148148147, "grad_norm": 4.184138774871826, "learning_rate": 4.7164465757108424e-07, "loss": 1.8007, "step": 24400 }, { "epoch": 3.6222222222222222, "grad_norm": 4.484316349029541, "learning_rate": 4.5382285354602983e-07, "loss": 1.764, "step": 24450 }, { "epoch": 3.6296296296296298, "grad_norm": 4.9524126052856445, "learning_rate": 4.363364596887021e-07, "loss": 1.7239, "step": 24500 }, { "epoch": 3.637037037037037, "grad_norm": 4.901634693145752, "learning_rate": 4.191860903931344e-07, "loss": 1.7435, "step": 24550 }, { "epoch": 3.6444444444444444, "grad_norm": 4.495138645172119, "learning_rate": 4.0237234824695327e-07, "loss": 1.6682, "step": 24600 }, { "epoch": 3.651851851851852, "grad_norm": 4.296281337738037, "learning_rate": 3.858958240102084e-07, "loss": 1.749, "step": 24650 }, { "epoch": 3.659259259259259, "grad_norm": 5.341070652008057, "learning_rate": 3.69757096594614e-07, "loss": 1.7463, "step": 24700 }, { "epoch": 3.6666666666666665, "grad_norm": 4.133769989013672, "learning_rate": 3.5395673304320253e-07, "loss": 1.6866, "step": 24750 }, { "epoch": 3.674074074074074, "grad_norm": 6.342161178588867, "learning_rate": 3.384952885104109e-07, "loss": 1.7442, "step": 24800 }, { "epoch": 3.6814814814814816, "grad_norm": 5.4333720207214355, "learning_rate": 3.233733062425715e-07, "loss": 1.7159, "step": 24850 }, { "epoch": 3.688888888888889, "grad_norm": 4.437062740325928, "learning_rate": 3.0859131755881956e-07, "loss": 1.7579, "step": 24900 }, { "epoch": 3.696296296296296, "grad_norm": 5.132778167724609, "learning_rate": 2.9414984183243177e-07, "loss": 1.6867, "step": 24950 }, { "epoch": 3.7037037037037037, "grad_norm": 4.612050533294678, "learning_rate": 2.800493864725784e-07, "loss": 1.667, "step": 25000 }, { "epoch": 3.7111111111111112, "grad_norm": 4.683866500854492, "learning_rate": 2.662904469064842e-07, "loss": 1.7585, "step": 25050 }, { "epoch": 3.7185185185185183, "grad_norm": 5.033325672149658, "learning_rate": 2.52873506562038e-07, "loss": 1.7299, "step": 25100 }, { "epoch": 3.725925925925926, "grad_norm": 5.117885589599609, "learning_rate": 2.39799036850793e-07, "loss": 1.7486, "step": 25150 }, { "epoch": 3.7333333333333334, "grad_norm": 3.9167544841766357, "learning_rate": 2.2706749715141085e-07, "loss": 1.6908, "step": 25200 }, { "epoch": 3.7407407407407405, "grad_norm": 5.12332010269165, "learning_rate": 2.1467933479351942e-07, "loss": 1.6907, "step": 25250 }, { "epoch": 3.748148148148148, "grad_norm": 5.0879130363464355, "learning_rate": 2.0263498504199397e-07, "loss": 1.7745, "step": 25300 }, { "epoch": 3.7555555555555555, "grad_norm": 4.076460838317871, "learning_rate": 1.909348710816672e-07, "loss": 1.7463, "step": 25350 }, { "epoch": 3.762962962962963, "grad_norm": 4.88745641708374, "learning_rate": 1.7957940400245677e-07, "loss": 1.7632, "step": 25400 }, { "epoch": 3.7703703703703706, "grad_norm": 5.026834964752197, "learning_rate": 1.6856898278492573e-07, "loss": 1.825, "step": 25450 }, { "epoch": 3.7777777777777777, "grad_norm": 5.161101341247559, "learning_rate": 1.5790399428625925e-07, "loss": 1.7309, "step": 25500 }, { "epoch": 3.785185185185185, "grad_norm": 4.700355529785156, "learning_rate": 1.475848132266733e-07, "loss": 1.716, "step": 25550 }, { "epoch": 3.7925925925925927, "grad_norm": 4.75068998336792, "learning_rate": 1.3761180217625514e-07, "loss": 1.7438, "step": 25600 }, { "epoch": 3.8, "grad_norm": 4.847548484802246, "learning_rate": 1.2798531154221362e-07, "loss": 1.6976, "step": 25650 }, { "epoch": 3.8074074074074074, "grad_norm": 4.9904866218566895, "learning_rate": 1.1870567955657552e-07, "loss": 1.733, "step": 25700 }, { "epoch": 3.814814814814815, "grad_norm": 5.078738689422607, "learning_rate": 1.097732322642997e-07, "loss": 1.7232, "step": 25750 }, { "epoch": 3.822222222222222, "grad_norm": 5.824419975280762, "learning_rate": 1.0118828351181609e-07, "loss": 1.7559, "step": 25800 }, { "epoch": 3.8296296296296295, "grad_norm": 3.6938419342041016, "learning_rate": 9.295113493600683e-08, "loss": 1.6989, "step": 25850 }, { "epoch": 3.837037037037037, "grad_norm": 6.135478973388672, "learning_rate": 8.506207595360361e-08, "loss": 1.6932, "step": 25900 }, { "epoch": 3.8444444444444446, "grad_norm": 5.020415782928467, "learning_rate": 7.752138375101914e-08, "loss": 1.7119, "step": 25950 }, { "epoch": 3.851851851851852, "grad_norm": 5.36544942855835, "learning_rate": 7.032932327460828e-08, "loss": 1.7197, "step": 26000 }, { "epoch": 3.859259259259259, "grad_norm": 4.222412586212158, "learning_rate": 6.348614722135771e-08, "loss": 1.7421, "step": 26050 }, { "epoch": 3.8666666666666667, "grad_norm": 4.589230537414551, "learning_rate": 5.699209603001077e-08, "loss": 1.7871, "step": 26100 }, { "epoch": 3.8740740740740742, "grad_norm": 4.290050506591797, "learning_rate": 5.0847397872617607e-08, "loss": 1.7179, "step": 26150 }, { "epoch": 3.8814814814814813, "grad_norm": 5.372164726257324, "learning_rate": 4.50522686465138e-08, "loss": 1.7356, "step": 26200 }, { "epoch": 3.888888888888889, "grad_norm": 4.672863006591797, "learning_rate": 3.960691196674304e-08, "loss": 1.7587, "step": 26250 }, { "epoch": 3.8962962962962964, "grad_norm": 6.645358562469482, "learning_rate": 3.451151915889961e-08, "loss": 1.6881, "step": 26300 }, { "epoch": 3.9037037037037035, "grad_norm": 5.1877336502075195, "learning_rate": 2.9766269252401448e-08, "loss": 1.6786, "step": 26350 }, { "epoch": 3.911111111111111, "grad_norm": 5.258729934692383, "learning_rate": 2.5371328974206356e-08, "loss": 1.7356, "step": 26400 }, { "epoch": 3.9185185185185185, "grad_norm": 4.210945129394531, "learning_rate": 2.1326852742949987e-08, "loss": 1.7084, "step": 26450 }, { "epoch": 3.925925925925926, "grad_norm": 5.776233196258545, "learning_rate": 1.7632982663521314e-08, "loss": 1.741, "step": 26500 }, { "epoch": 3.9333333333333336, "grad_norm": 4.814140796661377, "learning_rate": 1.4289848522073269e-08, "loss": 1.7106, "step": 26550 }, { "epoch": 3.9407407407407407, "grad_norm": 5.385549068450928, "learning_rate": 1.1297567781454188e-08, "loss": 1.7509, "step": 26600 }, { "epoch": 3.948148148148148, "grad_norm": 5.47978401184082, "learning_rate": 8.656245577089994e-09, "loss": 1.6962, "step": 26650 }, { "epoch": 3.9555555555555557, "grad_norm": 5.5808281898498535, "learning_rate": 6.365974713283818e-09, "loss": 1.6714, "step": 26700 }, { "epoch": 3.962962962962963, "grad_norm": 5.073295593261719, "learning_rate": 4.426835659958606e-09, "loss": 1.7363, "step": 26750 }, { "epoch": 3.9703703703703703, "grad_norm": 5.013734817504883, "learning_rate": 2.838896549828274e-09, "loss": 1.7764, "step": 26800 }, { "epoch": 3.977777777777778, "grad_norm": 5.511692047119141, "learning_rate": 1.6022131760018433e-09, "loss": 1.6929, "step": 26850 }, { "epoch": 3.985185185185185, "grad_norm": 4.73627233505249, "learning_rate": 7.168289900305602e-10, "loss": 1.6978, "step": 26900 }, { "epoch": 3.9925925925925925, "grad_norm": 4.294295310974121, "learning_rate": 1.827751003724565e-10, "loss": 1.7181, "step": 26950 }, { "epoch": 4.0, "grad_norm": 4.967982292175293, "learning_rate": 7.027130211056943e-14, "loss": 1.7233, "step": 27000 }, { "epoch": 4.007407407407407, "grad_norm": 4.82836389541626, "learning_rate": 1.9382473829432267e-06, "loss": 1.7041, "step": 27050 }, { "epoch": 4.014814814814815, "grad_norm": 5.167964935302734, "learning_rate": 1.9103854553176194e-06, "loss": 1.8133, "step": 27100 }, { "epoch": 4.022222222222222, "grad_norm": 4.666854381561279, "learning_rate": 1.8827040718648226e-06, "loss": 1.6764, "step": 27150 }, { "epoch": 4.029629629629629, "grad_norm": 5.357041835784912, "learning_rate": 1.8552038503784874e-06, "loss": 1.749, "step": 27200 }, { "epoch": 4.037037037037037, "grad_norm": 6.967157363891602, "learning_rate": 1.82788540460908e-06, "loss": 1.7238, "step": 27250 }, { "epoch": 4.044444444444444, "grad_norm": 5.682682037353516, "learning_rate": 1.8007493442502034e-06, "loss": 1.7133, "step": 27300 }, { "epoch": 4.051851851851852, "grad_norm": 5.161554336547852, "learning_rate": 1.7737962749249681e-06, "loss": 1.6942, "step": 27350 }, { "epoch": 4.059259259259259, "grad_norm": 4.5494232177734375, "learning_rate": 1.7470267981724963e-06, "loss": 1.7458, "step": 27400 }, { "epoch": 4.066666666666666, "grad_norm": 4.907562732696533, "learning_rate": 1.7204415114344875e-06, "loss": 1.7069, "step": 27450 }, { "epoch": 4.074074074074074, "grad_norm": 5.7244648933410645, "learning_rate": 1.6940410080418723e-06, "loss": 1.7731, "step": 27500 }, { "epoch": 4.0814814814814815, "grad_norm": 5.122848987579346, "learning_rate": 1.6678258772016043e-06, "loss": 1.6889, "step": 27550 }, { "epoch": 4.088888888888889, "grad_norm": 5.414388656616211, "learning_rate": 1.6417967039834693e-06, "loss": 1.7262, "step": 27600 }, { "epoch": 4.0962962962962965, "grad_norm": 4.686498641967773, "learning_rate": 1.6159540693070609e-06, "loss": 1.7175, "step": 27650 }, { "epoch": 4.103703703703704, "grad_norm": 3.999427556991577, "learning_rate": 1.5902985499287894e-06, "loss": 1.7779, "step": 27700 }, { "epoch": 4.111111111111111, "grad_norm": 5.516722202301025, "learning_rate": 1.5648307184290335e-06, "loss": 1.6821, "step": 27750 }, { "epoch": 4.118518518518519, "grad_norm": 4.3700761795043945, "learning_rate": 1.539551143199346e-06, "loss": 1.774, "step": 27800 }, { "epoch": 4.125925925925926, "grad_norm": 5.696080207824707, "learning_rate": 1.5144603884297705e-06, "loss": 1.7454, "step": 27850 }, { "epoch": 4.133333333333334, "grad_norm": 4.253551006317139, "learning_rate": 1.4895590140962546e-06, "loss": 1.76, "step": 27900 }, { "epoch": 4.140740740740741, "grad_norm": 4.988155364990234, "learning_rate": 1.4648475759481518e-06, "loss": 1.7696, "step": 27950 }, { "epoch": 4.148148148148148, "grad_norm": 4.6464972496032715, "learning_rate": 1.440326625495807e-06, "loss": 1.7652, "step": 28000 }, { "epoch": 4.155555555555556, "grad_norm": 5.924875736236572, "learning_rate": 1.4159967099982708e-06, "loss": 1.6888, "step": 28050 }, { "epoch": 4.162962962962963, "grad_norm": 4.925395488739014, "learning_rate": 1.3918583724510604e-06, "loss": 1.7267, "step": 28100 }, { "epoch": 4.17037037037037, "grad_norm": 4.7780046463012695, "learning_rate": 1.367912151574059e-06, "loss": 1.6646, "step": 28150 }, { "epoch": 4.177777777777778, "grad_norm": 3.9888927936553955, "learning_rate": 1.3441585817994818e-06, "loss": 1.6915, "step": 28200 }, { "epoch": 4.185185185185185, "grad_norm": 4.683606147766113, "learning_rate": 1.3205981932599555e-06, "loss": 1.7211, "step": 28250 }, { "epoch": 4.192592592592592, "grad_norm": 5.409604072570801, "learning_rate": 1.2972315117766876e-06, "loss": 1.7155, "step": 28300 }, { "epoch": 4.2, "grad_norm": 4.003017425537109, "learning_rate": 1.2740590588477198e-06, "loss": 1.7622, "step": 28350 }, { "epoch": 4.207407407407407, "grad_norm": 5.602859973907471, "learning_rate": 1.2510813516363064e-06, "loss": 1.7029, "step": 28400 }, { "epoch": 4.214814814814815, "grad_norm": 5.32771110534668, "learning_rate": 1.228298902959353e-06, "loss": 1.7684, "step": 28450 }, { "epoch": 4.222222222222222, "grad_norm": 5.030776500701904, "learning_rate": 1.2057122212759887e-06, "loss": 1.7379, "step": 28500 }, { "epoch": 4.229629629629629, "grad_norm": 5.6066365242004395, "learning_rate": 1.1833218106762113e-06, "loss": 1.7228, "step": 28550 }, { "epoch": 4.237037037037037, "grad_norm": 4.757691860198975, "learning_rate": 1.1611281708696332e-06, "loss": 1.6407, "step": 28600 }, { "epoch": 4.2444444444444445, "grad_norm": 4.368016242980957, "learning_rate": 1.1391317971743366e-06, "loss": 1.7502, "step": 28650 }, { "epoch": 4.2518518518518515, "grad_norm": 4.5439581871032715, "learning_rate": 1.1173331805058074e-06, "loss": 1.732, "step": 28700 }, { "epoch": 4.2592592592592595, "grad_norm": 4.928348064422607, "learning_rate": 1.0957328073659945e-06, "loss": 1.7228, "step": 28750 }, { "epoch": 4.266666666666667, "grad_norm": 5.2048139572143555, "learning_rate": 1.074331159832439e-06, "loss": 1.7416, "step": 28800 }, { "epoch": 4.274074074074074, "grad_norm": 4.4075446128845215, "learning_rate": 1.0531287155475223e-06, "loss": 1.7174, "step": 28850 }, { "epoch": 4.281481481481482, "grad_norm": 4.489355564117432, "learning_rate": 1.032125947707805e-06, "loss": 1.6647, "step": 28900 }, { "epoch": 4.288888888888889, "grad_norm": 4.746804237365723, "learning_rate": 1.0113233250534594e-06, "loss": 1.743, "step": 28950 }, { "epoch": 4.296296296296296, "grad_norm": 6.516097068786621, "learning_rate": 9.907213118578184e-07, "loss": 1.7985, "step": 29000 }, { "epoch": 4.303703703703704, "grad_norm": 4.8853678703308105, "learning_rate": 9.703203679170116e-07, "loss": 1.7554, "step": 29050 }, { "epoch": 4.311111111111111, "grad_norm": 4.837742805480957, "learning_rate": 9.501209485396968e-07, "loss": 1.7171, "step": 29100 }, { "epoch": 4.318518518518519, "grad_norm": 4.9490556716918945, "learning_rate": 9.301235045368995e-07, "loss": 1.6853, "step": 29150 }, { "epoch": 4.325925925925926, "grad_norm": 4.251191139221191, "learning_rate": 9.103284822119629e-07, "loss": 1.6628, "step": 29200 }, { "epoch": 4.333333333333333, "grad_norm": 4.1913533210754395, "learning_rate": 8.907363233505772e-07, "loss": 1.638, "step": 29250 }, { "epoch": 4.340740740740741, "grad_norm": 5.5985517501831055, "learning_rate": 8.713474652109155e-07, "loss": 1.6611, "step": 29300 }, { "epoch": 4.348148148148148, "grad_norm": 5.221460819244385, "learning_rate": 8.521623405138902e-07, "loss": 1.7188, "step": 29350 }, { "epoch": 4.355555555555555, "grad_norm": 5.904771327972412, "learning_rate": 8.331813774334796e-07, "loss": 1.7322, "step": 29400 }, { "epoch": 4.362962962962963, "grad_norm": 7.180872917175293, "learning_rate": 8.144049995871839e-07, "loss": 1.726, "step": 29450 }, { "epoch": 4.37037037037037, "grad_norm": 4.465069770812988, "learning_rate": 7.958336260265654e-07, "loss": 1.7343, "step": 29500 }, { "epoch": 4.377777777777778, "grad_norm": 4.990593910217285, "learning_rate": 7.774676712278939e-07, "loss": 1.7124, "step": 29550 }, { "epoch": 4.385185185185185, "grad_norm": 4.23641300201416, "learning_rate": 7.593075450829046e-07, "loss": 1.7329, "step": 29600 }, { "epoch": 4.392592592592592, "grad_norm": 4.954015254974365, "learning_rate": 7.413536528896381e-07, "loss": 1.6599, "step": 29650 }, { "epoch": 4.4, "grad_norm": 5.484986782073975, "learning_rate": 7.236063953434091e-07, "loss": 1.7504, "step": 29700 }, { "epoch": 4.407407407407407, "grad_norm": 4.7733964920043945, "learning_rate": 7.060661685278481e-07, "loss": 1.7344, "step": 29750 }, { "epoch": 4.4148148148148145, "grad_norm": 3.8824641704559326, "learning_rate": 6.887333639060767e-07, "loss": 1.6395, "step": 29800 }, { "epoch": 4.4222222222222225, "grad_norm": 4.051513671875, "learning_rate": 6.716083683119623e-07, "loss": 1.6314, "step": 29850 }, { "epoch": 4.42962962962963, "grad_norm": 5.516358852386475, "learning_rate": 6.54691563941483e-07, "loss": 1.7142, "step": 29900 }, { "epoch": 4.437037037037037, "grad_norm": 5.0649027824401855, "learning_rate": 6.379833283442061e-07, "loss": 1.7769, "step": 29950 }, { "epoch": 4.444444444444445, "grad_norm": 4.46967077255249, "learning_rate": 6.214840344148509e-07, "loss": 1.6883, "step": 30000 }, { "epoch": 4.451851851851852, "grad_norm": 5.248079299926758, "learning_rate": 6.051940503849818e-07, "loss": 1.6829, "step": 30050 }, { "epoch": 4.459259259259259, "grad_norm": 4.6656293869018555, "learning_rate": 5.891137398147706e-07, "loss": 1.6501, "step": 30100 }, { "epoch": 4.466666666666667, "grad_norm": 4.85382080078125, "learning_rate": 5.732434615848992e-07, "loss": 1.6991, "step": 30150 }, { "epoch": 4.474074074074074, "grad_norm": 5.441938877105713, "learning_rate": 5.575835698885445e-07, "loss": 1.6589, "step": 30200 }, { "epoch": 4.481481481481482, "grad_norm": 4.716953754425049, "learning_rate": 5.421344142234653e-07, "loss": 1.7254, "step": 30250 }, { "epoch": 4.488888888888889, "grad_norm": 4.488583564758301, "learning_rate": 5.268963393842185e-07, "loss": 1.7533, "step": 30300 }, { "epoch": 4.496296296296296, "grad_norm": 6.067137718200684, "learning_rate": 5.118696854544491e-07, "loss": 1.7206, "step": 30350 }, { "epoch": 4.503703703703704, "grad_norm": 4.0403947830200195, "learning_rate": 4.970547877993081e-07, "loss": 1.7447, "step": 30400 }, { "epoch": 4.511111111111111, "grad_norm": 5.8618998527526855, "learning_rate": 4.824519770579672e-07, "loss": 1.7343, "step": 30450 }, { "epoch": 4.518518518518518, "grad_norm": 6.678895950317383, "learning_rate": 4.6806157913623417e-07, "loss": 1.6971, "step": 30500 }, { "epoch": 4.525925925925926, "grad_norm": 4.533618450164795, "learning_rate": 4.5388391519929134e-07, "loss": 1.6467, "step": 30550 }, { "epoch": 4.533333333333333, "grad_norm": 4.837048053741455, "learning_rate": 4.399193016645109e-07, "loss": 1.7892, "step": 30600 }, { "epoch": 4.540740740740741, "grad_norm": 4.566707611083984, "learning_rate": 4.2616805019440653e-07, "loss": 1.6981, "step": 30650 }, { "epoch": 4.548148148148148, "grad_norm": 4.793781280517578, "learning_rate": 4.126304676896753e-07, "loss": 1.7256, "step": 30700 }, { "epoch": 4.555555555555555, "grad_norm": 5.338557243347168, "learning_rate": 3.993068562823399e-07, "loss": 1.7732, "step": 30750 }, { "epoch": 4.562962962962963, "grad_norm": 5.471566200256348, "learning_rate": 3.8619751332901744e-07, "loss": 1.7244, "step": 30800 }, { "epoch": 4.57037037037037, "grad_norm": 6.156429290771484, "learning_rate": 3.7330273140427585e-07, "loss": 1.7038, "step": 30850 }, { "epoch": 4.5777777777777775, "grad_norm": 4.914401531219482, "learning_rate": 3.606227982941046e-07, "loss": 1.784, "step": 30900 }, { "epoch": 4.5851851851851855, "grad_norm": 5.399247646331787, "learning_rate": 3.481579969894977e-07, "loss": 1.7026, "step": 30950 }, { "epoch": 4.592592592592593, "grad_norm": 4.374510765075684, "learning_rate": 3.359086056801253e-07, "loss": 1.7585, "step": 31000 }, { "epoch": 4.6, "grad_norm": 4.571732044219971, "learning_rate": 3.238748977481421e-07, "loss": 1.7257, "step": 31050 }, { "epoch": 4.607407407407408, "grad_norm": 4.58890438079834, "learning_rate": 3.1205714176207105e-07, "loss": 1.7471, "step": 31100 }, { "epoch": 4.614814814814815, "grad_norm": 4.49856424331665, "learning_rate": 3.004556014708182e-07, "loss": 1.6543, "step": 31150 }, { "epoch": 4.622222222222222, "grad_norm": 4.499317169189453, "learning_rate": 2.8907053579778075e-07, "loss": 1.6906, "step": 31200 }, { "epoch": 4.62962962962963, "grad_norm": 5.533344745635986, "learning_rate": 2.7790219883507385e-07, "loss": 1.6691, "step": 31250 }, { "epoch": 4.637037037037037, "grad_norm": 5.220572471618652, "learning_rate": 2.6695083983785843e-07, "loss": 1.7941, "step": 31300 }, { "epoch": 4.644444444444445, "grad_norm": 4.013186454772949, "learning_rate": 2.5621670321877236e-07, "loss": 1.672, "step": 31350 }, { "epoch": 4.651851851851852, "grad_norm": 5.1433186531066895, "learning_rate": 2.45700028542486e-07, "loss": 1.712, "step": 31400 }, { "epoch": 4.659259259259259, "grad_norm": 5.291711330413818, "learning_rate": 2.354010505203419e-07, "loss": 1.7306, "step": 31450 }, { "epoch": 4.666666666666667, "grad_norm": 4.273839473724365, "learning_rate": 2.2531999900513358e-07, "loss": 1.6904, "step": 31500 }, { "epoch": 4.674074074074074, "grad_norm": 4.840414047241211, "learning_rate": 2.1545709898596057e-07, "loss": 1.7894, "step": 31550 }, { "epoch": 4.681481481481481, "grad_norm": 4.572354316711426, "learning_rate": 2.058125705832159e-07, "loss": 1.7386, "step": 31600 }, { "epoch": 4.688888888888889, "grad_norm": 4.349206924438477, "learning_rate": 1.9638662904367e-07, "loss": 1.7278, "step": 31650 }, { "epoch": 4.696296296296296, "grad_norm": 4.914106369018555, "learning_rate": 1.871794847356656e-07, "loss": 1.7336, "step": 31700 }, { "epoch": 4.703703703703704, "grad_norm": 5.19305419921875, "learning_rate": 1.781913431444282e-07, "loss": 1.8137, "step": 31750 }, { "epoch": 4.711111111111111, "grad_norm": 4.804193019866943, "learning_rate": 1.6942240486747196e-07, "loss": 1.7152, "step": 31800 }, { "epoch": 4.718518518518518, "grad_norm": 5.960775375366211, "learning_rate": 1.6087286561013215e-07, "loss": 1.7112, "step": 31850 }, { "epoch": 4.725925925925926, "grad_norm": 4.539139747619629, "learning_rate": 1.5254291618118978e-07, "loss": 1.6712, "step": 31900 }, { "epoch": 4.733333333333333, "grad_norm": 5.396848678588867, "learning_rate": 1.4443274248861495e-07, "loss": 1.7833, "step": 31950 }, { "epoch": 4.7407407407407405, "grad_norm": 3.9417476654052734, "learning_rate": 1.3654252553542025e-07, "loss": 1.7082, "step": 32000 }, { "epoch": 4.7481481481481485, "grad_norm": 4.94804573059082, "learning_rate": 1.2887244141562062e-07, "loss": 1.6837, "step": 32050 }, { "epoch": 4.7555555555555555, "grad_norm": 5.223379135131836, "learning_rate": 1.214226613103009e-07, "loss": 1.7133, "step": 32100 }, { "epoch": 4.762962962962963, "grad_norm": 4.776657581329346, "learning_rate": 1.141933514837934e-07, "loss": 1.6771, "step": 32150 }, { "epoch": 4.770370370370371, "grad_norm": 4.386743545532227, "learning_rate": 1.071846732799775e-07, "loss": 1.7138, "step": 32200 }, { "epoch": 4.777777777777778, "grad_norm": 4.870341777801514, "learning_rate": 1.0039678311866585e-07, "loss": 1.705, "step": 32250 }, { "epoch": 4.785185185185185, "grad_norm": 4.143662452697754, "learning_rate": 9.382983249212163e-08, "loss": 1.6861, "step": 32300 }, { "epoch": 4.792592592592593, "grad_norm": 4.790794849395752, "learning_rate": 8.748396796167568e-08, "loss": 1.7338, "step": 32350 }, { "epoch": 4.8, "grad_norm": 4.551774501800537, "learning_rate": 8.135933115445471e-08, "loss": 1.6955, "step": 32400 }, { "epoch": 4.807407407407408, "grad_norm": 4.733094692230225, "learning_rate": 7.545605876021933e-08, "loss": 1.6594, "step": 32450 }, { "epoch": 4.814814814814815, "grad_norm": 4.897435188293457, "learning_rate": 6.977428252831764e-08, "loss": 1.6837, "step": 32500 }, { "epoch": 4.822222222222222, "grad_norm": 5.258769989013672, "learning_rate": 6.431412926473978e-08, "loss": 1.6683, "step": 32550 }, { "epoch": 4.82962962962963, "grad_norm": 5.621219158172607, "learning_rate": 5.907572082929247e-08, "loss": 1.7212, "step": 32600 }, { "epoch": 4.837037037037037, "grad_norm": 5.88584566116333, "learning_rate": 5.4059174132873314e-08, "loss": 1.7297, "step": 32650 }, { "epoch": 4.844444444444444, "grad_norm": 4.229117393493652, "learning_rate": 4.9264601134870747e-08, "loss": 1.714, "step": 32700 }, { "epoch": 4.851851851851852, "grad_norm": 4.282808303833008, "learning_rate": 4.4692108840656e-08, "loss": 1.7891, "step": 32750 }, { "epoch": 4.859259259259259, "grad_norm": 4.613882541656494, "learning_rate": 4.0341799299198345e-08, "loss": 1.7311, "step": 32800 }, { "epoch": 4.866666666666667, "grad_norm": 5.826322555541992, "learning_rate": 3.621376960079248e-08, "loss": 1.7449, "step": 32850 }, { "epoch": 4.874074074074074, "grad_norm": 3.8570024967193604, "learning_rate": 3.230811187488248e-08, "loss": 1.7437, "step": 32900 }, { "epoch": 4.881481481481481, "grad_norm": 5.855494022369385, "learning_rate": 2.8624913288012314e-08, "loss": 1.7327, "step": 32950 }, { "epoch": 4.888888888888889, "grad_norm": 4.846534252166748, "learning_rate": 2.5164256041879663e-08, "loss": 1.732, "step": 33000 }, { "epoch": 4.896296296296296, "grad_norm": 5.538894176483154, "learning_rate": 2.192621737150069e-08, "loss": 1.7365, "step": 33050 }, { "epoch": 4.9037037037037035, "grad_norm": 5.251986026763916, "learning_rate": 1.8910869543482535e-08, "loss": 1.6428, "step": 33100 }, { "epoch": 4.911111111111111, "grad_norm": 6.140721797943115, "learning_rate": 1.6118279854415718e-08, "loss": 1.7507, "step": 33150 }, { "epoch": 4.9185185185185185, "grad_norm": 5.360207557678223, "learning_rate": 1.3548510629373125e-08, "loss": 1.7524, "step": 33200 }, { "epoch": 4.925925925925926, "grad_norm": 5.260952472686768, "learning_rate": 1.1201619220511106e-08, "loss": 1.7157, "step": 33250 }, { "epoch": 4.933333333333334, "grad_norm": 4.921377658843994, "learning_rate": 9.077658005799405e-09, "loss": 1.7424, "step": 33300 }, { "epoch": 4.940740740740741, "grad_norm": 5.121984004974365, "learning_rate": 7.176674387842086e-09, "loss": 1.6539, "step": 33350 }, { "epoch": 4.948148148148148, "grad_norm": 4.907476425170898, "learning_rate": 5.498710792831707e-09, "loss": 1.7879, "step": 33400 }, { "epoch": 4.955555555555556, "grad_norm": 5.7554030418396, "learning_rate": 4.04380466958898e-09, "loss": 1.7458, "step": 33450 }, { "epoch": 4.962962962962963, "grad_norm": 4.102212905883789, "learning_rate": 2.8119884887389814e-09, "loss": 1.7358, "step": 33500 }, { "epoch": 4.97037037037037, "grad_norm": 4.823195934295654, "learning_rate": 1.8032897419772987e-09, "loss": 1.6797, "step": 33550 }, { "epoch": 4.977777777777778, "grad_norm": 5.787658214569092, "learning_rate": 1.0177309414638458e-09, "loss": 1.7365, "step": 33600 }, { "epoch": 4.985185185185185, "grad_norm": 5.443394184112549, "learning_rate": 4.5532961931216146e-10, "loss": 1.6624, "step": 33650 }, { "epoch": 4.992592592592593, "grad_norm": 5.065293312072754, "learning_rate": 1.1609832720860248e-10, "loss": 1.6947, "step": 33700 }, { "epoch": 5.0, "grad_norm": 4.812777042388916, "learning_rate": 4.4636121465657654e-14, "loss": 1.6106, "step": 33750 }, { "epoch": 5.007407407407407, "grad_norm": 4.977204322814941, "learning_rate": 1.3532600998952505e-06, "loss": 1.7371, "step": 33800 }, { "epoch": 5.014814814814815, "grad_norm": 5.09653377532959, "learning_rate": 1.3336002340133524e-06, "loss": 1.7169, "step": 33850 }, { "epoch": 5.022222222222222, "grad_norm": 4.8081560134887695, "learning_rate": 1.3140740147366925e-06, "loss": 1.7553, "step": 33900 }, { "epoch": 5.029629629629629, "grad_norm": 3.804783582687378, "learning_rate": 1.2946817431837289e-06, "loss": 1.7236, "step": 33950 }, { "epoch": 5.037037037037037, "grad_norm": 4.684085845947266, "learning_rate": 1.2754237184072737e-06, "loss": 1.6995, "step": 34000 }, { "epoch": 5.044444444444444, "grad_norm": 5.6429877281188965, "learning_rate": 1.2563002373898936e-06, "loss": 1.7936, "step": 34050 }, { "epoch": 5.051851851851852, "grad_norm": 5.1504807472229, "learning_rate": 1.2373115950393254e-06, "loss": 1.7631, "step": 34100 }, { "epoch": 5.059259259259259, "grad_norm": 5.963595867156982, "learning_rate": 1.2184580841839233e-06, "loss": 1.682, "step": 34150 }, { "epoch": 5.066666666666666, "grad_norm": 4.3478827476501465, "learning_rate": 1.1997399955681444e-06, "loss": 1.7337, "step": 34200 }, { "epoch": 5.074074074074074, "grad_norm": 6.539457321166992, "learning_rate": 1.1811576178480743e-06, "loss": 1.7334, "step": 34250 }, { "epoch": 5.0814814814814815, "grad_norm": 5.564241886138916, "learning_rate": 1.162711237586961e-06, "loss": 1.7236, "step": 34300 }, { "epoch": 5.088888888888889, "grad_norm": 5.158255100250244, "learning_rate": 1.1444011392508103e-06, "loss": 1.7642, "step": 34350 }, { "epoch": 5.0962962962962965, "grad_norm": 4.977708339691162, "learning_rate": 1.1262276052039911e-06, "loss": 1.7097, "step": 34400 }, { "epoch": 5.103703703703704, "grad_norm": 6.154446125030518, "learning_rate": 1.1081909157048775e-06, "loss": 1.6729, "step": 34450 }, { "epoch": 5.111111111111111, "grad_norm": 5.063690662384033, "learning_rate": 1.09029134890153e-06, "loss": 1.7204, "step": 34500 }, { "epoch": 5.118518518518519, "grad_norm": 4.185413360595703, "learning_rate": 1.0725291808274196e-06, "loss": 1.6558, "step": 34550 }, { "epoch": 5.125925925925926, "grad_norm": 4.706033706665039, "learning_rate": 1.054904685397148e-06, "loss": 1.7625, "step": 34600 }, { "epoch": 5.133333333333334, "grad_norm": 4.567237377166748, "learning_rate": 1.0374181344022339e-06, "loss": 1.7039, "step": 34650 }, { "epoch": 5.140740740740741, "grad_norm": 4.0956807136535645, "learning_rate": 1.0200697975069274e-06, "loss": 1.6927, "step": 34700 }, { "epoch": 5.148148148148148, "grad_norm": 4.75858211517334, "learning_rate": 1.0028599422440466e-06, "loss": 1.6918, "step": 34750 }, { "epoch": 5.155555555555556, "grad_norm": 6.522222518920898, "learning_rate": 9.857888340108478e-07, "loss": 1.7537, "step": 34800 }, { "epoch": 5.162962962962963, "grad_norm": 4.785046100616455, "learning_rate": 9.68856736064936e-07, "loss": 1.7179, "step": 34850 }, { "epoch": 5.17037037037037, "grad_norm": 4.325683116912842, "learning_rate": 9.5206390952021e-07, "loss": 1.6709, "step": 34900 }, { "epoch": 5.177777777777778, "grad_norm": 4.622459888458252, "learning_rate": 9.354106133428287e-07, "loss": 1.7491, "step": 34950 }, { "epoch": 5.185185185185185, "grad_norm": 4.967048168182373, "learning_rate": 9.188971043472172e-07, "loss": 1.738, "step": 35000 }, { "epoch": 5.192592592592592, "grad_norm": 4.760488986968994, "learning_rate": 9.025236371921176e-07, "loss": 1.6654, "step": 35050 }, { "epoch": 5.2, "grad_norm": 5.821338653564453, "learning_rate": 8.862904643766435e-07, "loss": 1.6761, "step": 35100 }, { "epoch": 5.207407407407407, "grad_norm": 5.513166427612305, "learning_rate": 8.701978362364039e-07, "loss": 1.7369, "step": 35150 }, { "epoch": 5.214814814814815, "grad_norm": 4.389341354370117, "learning_rate": 8.542460009396313e-07, "loss": 1.7099, "step": 35200 }, { "epoch": 5.222222222222222, "grad_norm": 5.52158260345459, "learning_rate": 8.38435204483361e-07, "loss": 1.7589, "step": 35250 }, { "epoch": 5.229629629629629, "grad_norm": 4.525510311126709, "learning_rate": 8.227656906896297e-07, "loss": 1.6873, "step": 35300 }, { "epoch": 5.237037037037037, "grad_norm": 4.573180675506592, "learning_rate": 8.072377012017262e-07, "loss": 1.6743, "step": 35350 }, { "epoch": 5.2444444444444445, "grad_norm": 4.818542957305908, "learning_rate": 7.918514754804596e-07, "loss": 1.6599, "step": 35400 }, { "epoch": 5.2518518518518515, "grad_norm": 6.3008341789245605, "learning_rate": 7.766072508004596e-07, "loss": 1.7452, "step": 35450 }, { "epoch": 5.2592592592592595, "grad_norm": 5.006597518920898, "learning_rate": 7.615052622465336e-07, "loss": 1.7196, "step": 35500 }, { "epoch": 5.266666666666667, "grad_norm": 4.924106597900391, "learning_rate": 7.465457427100231e-07, "loss": 1.7712, "step": 35550 }, { "epoch": 5.274074074074074, "grad_norm": 5.4804887771606445, "learning_rate": 7.317289228852286e-07, "loss": 1.715, "step": 35600 }, { "epoch": 5.281481481481482, "grad_norm": 4.695647239685059, "learning_rate": 7.170550312658375e-07, "loss": 1.7124, "step": 35650 }, { "epoch": 5.288888888888889, "grad_norm": 6.215033054351807, "learning_rate": 7.025242941414146e-07, "loss": 1.6316, "step": 35700 }, { "epoch": 5.296296296296296, "grad_norm": 4.85810661315918, "learning_rate": 6.881369355938971e-07, "loss": 1.6962, "step": 35750 }, { "epoch": 5.303703703703704, "grad_norm": 5.630098342895508, "learning_rate": 6.73893177494156e-07, "loss": 1.656, "step": 35800 }, { "epoch": 5.311111111111111, "grad_norm": 5.91583776473999, "learning_rate": 6.597932394985617e-07, "loss": 1.756, "step": 35850 }, { "epoch": 5.318518518518519, "grad_norm": 4.883135795593262, "learning_rate": 6.458373390455996e-07, "loss": 1.7179, "step": 35900 }, { "epoch": 5.325925925925926, "grad_norm": 4.665558815002441, "learning_rate": 6.320256913525146e-07, "loss": 1.7424, "step": 35950 }, { "epoch": 5.333333333333333, "grad_norm": 5.224910259246826, "learning_rate": 6.183585094120082e-07, "loss": 1.7098, "step": 36000 }, { "epoch": 5.340740740740741, "grad_norm": 4.836453437805176, "learning_rate": 6.048360039889267e-07, "loss": 1.7507, "step": 36050 }, { "epoch": 5.348148148148148, "grad_norm": 4.658173084259033, "learning_rate": 5.914583836170341e-07, "loss": 1.7571, "step": 36100 }, { "epoch": 5.355555555555555, "grad_norm": 6.138154029846191, "learning_rate": 5.782258545957841e-07, "loss": 1.7125, "step": 36150 }, { "epoch": 5.362962962962963, "grad_norm": 5.144318580627441, "learning_rate": 5.651386209871468e-07, "loss": 1.759, "step": 36200 }, { "epoch": 5.37037037037037, "grad_norm": 4.0739970207214355, "learning_rate": 5.521968846124514e-07, "loss": 1.7342, "step": 36250 }, { "epoch": 5.377777777777778, "grad_norm": 4.604063987731934, "learning_rate": 5.394008450492816e-07, "loss": 1.7351, "step": 36300 }, { "epoch": 5.385185185185185, "grad_norm": 4.261337757110596, "learning_rate": 5.267506996283989e-07, "loss": 1.7219, "step": 36350 }, { "epoch": 5.392592592592592, "grad_norm": 4.81792688369751, "learning_rate": 5.142466434306958e-07, "loss": 1.6602, "step": 36400 }, { "epoch": 5.4, "grad_norm": 5.473517894744873, "learning_rate": 5.018888692841828e-07, "loss": 1.7543, "step": 36450 }, { "epoch": 5.407407407407407, "grad_norm": 5.177619457244873, "learning_rate": 4.8967756776103e-07, "loss": 1.7373, "step": 36500 }, { "epoch": 5.4148148148148145, "grad_norm": 5.174300193786621, "learning_rate": 4.776129271746078e-07, "loss": 1.7004, "step": 36550 }, { "epoch": 5.4222222222222225, "grad_norm": 4.85015344619751, "learning_rate": 4.6569513357660245e-07, "loss": 1.7679, "step": 36600 }, { "epoch": 5.42962962962963, "grad_norm": 4.257620334625244, "learning_rate": 4.5392437075413297e-07, "loss": 1.7639, "step": 36650 }, { "epoch": 5.437037037037037, "grad_norm": 5.19285249710083, "learning_rate": 4.423008202269241e-07, "loss": 1.6995, "step": 36700 }, { "epoch": 5.444444444444445, "grad_norm": 5.407149791717529, "learning_rate": 4.3082466124450105e-07, "loss": 1.6688, "step": 36750 }, { "epoch": 5.451851851851852, "grad_norm": 7.0187225341796875, "learning_rate": 4.194960707834339e-07, "loss": 1.7256, "step": 36800 }, { "epoch": 5.459259259259259, "grad_norm": 5.111575126647949, "learning_rate": 4.083152235446031e-07, "loss": 1.6883, "step": 36850 }, { "epoch": 5.466666666666667, "grad_norm": 4.569883346557617, "learning_rate": 3.972822919505026e-07, "loss": 1.6723, "step": 36900 }, { "epoch": 5.474074074074074, "grad_norm": 4.689021587371826, "learning_rate": 3.863974461425868e-07, "loss": 1.6803, "step": 36950 }, { "epoch": 5.481481481481482, "grad_norm": 4.931379795074463, "learning_rate": 3.7566085397864216e-07, "loss": 1.7024, "step": 37000 }, { "epoch": 5.488888888888889, "grad_norm": 4.577634811401367, "learning_rate": 3.650726810302041e-07, "loss": 1.7181, "step": 37050 }, { "epoch": 5.496296296296296, "grad_norm": 5.35470724105835, "learning_rate": 3.546330905799944e-07, "loss": 1.778, "step": 37100 }, { "epoch": 5.503703703703704, "grad_norm": 5.660891056060791, "learning_rate": 3.443422436194155e-07, "loss": 1.7464, "step": 37150 }, { "epoch": 5.511111111111111, "grad_norm": 5.5450334548950195, "learning_rate": 3.3420029884605466e-07, "loss": 1.6753, "step": 37200 }, { "epoch": 5.518518518518518, "grad_norm": 4.8132853507995605, "learning_rate": 3.242074126612471e-07, "loss": 1.7366, "step": 37250 }, { "epoch": 5.525925925925926, "grad_norm": 4.943393230438232, "learning_rate": 3.1436373916766236e-07, "loss": 1.7129, "step": 37300 }, { "epoch": 5.533333333333333, "grad_norm": 4.270570278167725, "learning_rate": 3.0466943016692175e-07, "loss": 1.7389, "step": 37350 }, { "epoch": 5.540740740740741, "grad_norm": 6.0502095222473145, "learning_rate": 2.9512463515725896e-07, "loss": 1.6554, "step": 37400 }, { "epoch": 5.548148148148148, "grad_norm": 5.404415607452393, "learning_rate": 2.8572950133122556e-07, "loss": 1.7489, "step": 37450 }, { "epoch": 5.555555555555555, "grad_norm": 5.563807010650635, "learning_rate": 2.764841735734047e-07, "loss": 1.7091, "step": 37500 }, { "epoch": 5.562962962962963, "grad_norm": 6.912181377410889, "learning_rate": 2.673887944581877e-07, "loss": 1.7228, "step": 37550 }, { "epoch": 5.57037037037037, "grad_norm": 5.763159275054932, "learning_rate": 2.5844350424757194e-07, "loss": 1.7131, "step": 37600 }, { "epoch": 5.5777777777777775, "grad_norm": 5.707430362701416, "learning_rate": 2.4964844088899985e-07, "loss": 1.6694, "step": 37650 }, { "epoch": 5.5851851851851855, "grad_norm": 5.327348232269287, "learning_rate": 2.41003740013227e-07, "loss": 1.7245, "step": 37700 }, { "epoch": 5.592592592592593, "grad_norm": 4.956423759460449, "learning_rate": 2.3250953493223484e-07, "loss": 1.7274, "step": 37750 }, { "epoch": 5.6, "grad_norm": 4.380049228668213, "learning_rate": 2.2416595663717344e-07, "loss": 1.7224, "step": 37800 }, { "epoch": 5.607407407407408, "grad_norm": 5.581598281860352, "learning_rate": 2.1597313379634332e-07, "loss": 1.7526, "step": 37850 }, { "epoch": 5.614814814814815, "grad_norm": 5.117220401763916, "learning_rate": 2.079311927532046e-07, "loss": 1.604, "step": 37900 }, { "epoch": 5.622222222222222, "grad_norm": 4.668652057647705, "learning_rate": 2.0004025752443978e-07, "loss": 1.7551, "step": 37950 }, { "epoch": 5.62962962962963, "grad_norm": 4.981814861297607, "learning_rate": 1.9230044979803075e-07, "loss": 1.7394, "step": 38000 }, { "epoch": 5.637037037037037, "grad_norm": 5.624590873718262, "learning_rate": 1.847118889313837e-07, "loss": 1.7687, "step": 38050 }, { "epoch": 5.644444444444445, "grad_norm": 4.681608200073242, "learning_rate": 1.7727469194950053e-07, "loss": 1.7569, "step": 38100 }, { "epoch": 5.651851851851852, "grad_norm": 5.595433235168457, "learning_rate": 1.6998897354315592e-07, "loss": 1.7543, "step": 38150 }, { "epoch": 5.659259259259259, "grad_norm": 5.625829219818115, "learning_rate": 1.6285484606713976e-07, "loss": 1.7192, "step": 38200 }, { "epoch": 5.666666666666667, "grad_norm": 3.95115327835083, "learning_rate": 1.5587241953852416e-07, "loss": 1.7095, "step": 38250 }, { "epoch": 5.674074074074074, "grad_norm": 4.709456443786621, "learning_rate": 1.4904180163496373e-07, "loss": 1.7102, "step": 38300 }, { "epoch": 5.681481481481481, "grad_norm": 5.535898208618164, "learning_rate": 1.423630976930368e-07, "loss": 1.7262, "step": 38350 }, { "epoch": 5.688888888888889, "grad_norm": 5.115292549133301, "learning_rate": 1.3583641070662011e-07, "loss": 1.7252, "step": 38400 }, { "epoch": 5.696296296296296, "grad_norm": 4.890398979187012, "learning_rate": 1.2946184132530015e-07, "loss": 1.7384, "step": 38450 }, { "epoch": 5.703703703703704, "grad_norm": 4.627488136291504, "learning_rate": 1.2323948785282313e-07, "loss": 1.8393, "step": 38500 }, { "epoch": 5.711111111111111, "grad_norm": 5.317155361175537, "learning_rate": 1.1716944624557524e-07, "loss": 1.7405, "step": 38550 }, { "epoch": 5.718518518518518, "grad_norm": 4.53377628326416, "learning_rate": 1.1125181011111042e-07, "loss": 1.6679, "step": 38600 }, { "epoch": 5.725925925925926, "grad_norm": 6.2958879470825195, "learning_rate": 1.0548667070669594e-07, "loss": 1.7211, "step": 38650 }, { "epoch": 5.733333333333333, "grad_norm": 6.272239685058594, "learning_rate": 9.98741169379125e-08, "loss": 1.7378, "step": 38700 }, { "epoch": 5.7407407407407405, "grad_norm": 4.6558732986450195, "learning_rate": 9.441423535728523e-08, "loss": 1.8154, "step": 38750 }, { "epoch": 5.7481481481481485, "grad_norm": 6.190096378326416, "learning_rate": 8.910711016294039e-08, "loss": 1.7789, "step": 38800 }, { "epoch": 5.7555555555555555, "grad_norm": 4.927070140838623, "learning_rate": 8.395282319731302e-08, "loss": 1.7477, "step": 38850 }, { "epoch": 5.762962962962963, "grad_norm": 4.361870765686035, "learning_rate": 7.895145394588577e-08, "loss": 1.6863, "step": 38900 }, { "epoch": 5.770370370370371, "grad_norm": 5.228439807891846, "learning_rate": 7.410307953595874e-08, "loss": 1.7682, "step": 38950 }, { "epoch": 5.777777777777778, "grad_norm": 4.051884174346924, "learning_rate": 6.940777473546379e-08, "loss": 1.6873, "step": 39000 }, { "epoch": 5.785185185185185, "grad_norm": 4.61396598815918, "learning_rate": 6.486561195180763e-08, "loss": 1.6778, "step": 39050 }, { "epoch": 5.792592592592593, "grad_norm": 4.127589225769043, "learning_rate": 6.047666123076168e-08, "loss": 1.6759, "step": 39100 }, { "epoch": 5.8, "grad_norm": 5.580689907073975, "learning_rate": 5.624099025537399e-08, "loss": 1.6931, "step": 39150 }, { "epoch": 5.807407407407408, "grad_norm": 4.461786270141602, "learning_rate": 5.215866434493011e-08, "loss": 1.7217, "step": 39200 }, { "epoch": 5.814814814814815, "grad_norm": 5.180467128753662, "learning_rate": 4.822974645394718e-08, "loss": 1.7161, "step": 39250 }, { "epoch": 5.822222222222222, "grad_norm": 5.350657939910889, "learning_rate": 4.445429717119809e-08, "loss": 1.761, "step": 39300 }, { "epoch": 5.82962962962963, "grad_norm": 4.767494201660156, "learning_rate": 4.083237471878221e-08, "loss": 1.7417, "step": 39350 }, { "epoch": 5.837037037037037, "grad_norm": 6.330611228942871, "learning_rate": 3.736403495122498e-08, "loss": 1.8127, "step": 39400 }, { "epoch": 5.844444444444444, "grad_norm": 5.576878547668457, "learning_rate": 3.404933135461419e-08, "loss": 1.655, "step": 39450 }, { "epoch": 5.851851851851852, "grad_norm": 4.449545383453369, "learning_rate": 3.088831504577949e-08, "loss": 1.6512, "step": 39500 }, { "epoch": 5.859259259259259, "grad_norm": 5.951160430908203, "learning_rate": 2.7881034771505277e-08, "loss": 1.7151, "step": 39550 }, { "epoch": 5.866666666666667, "grad_norm": 3.876145124435425, "learning_rate": 2.5027536907772375e-08, "loss": 1.7376, "step": 39600 }, { "epoch": 5.874074074074074, "grad_norm": 4.853695392608643, "learning_rate": 2.2327865459047527e-08, "loss": 1.7232, "step": 39650 }, { "epoch": 5.881481481481481, "grad_norm": 4.873416900634766, "learning_rate": 1.9782062057603913e-08, "loss": 1.7437, "step": 39700 }, { "epoch": 5.888888888888889, "grad_norm": 5.661427974700928, "learning_rate": 1.7390165962879458e-08, "loss": 1.6729, "step": 39750 }, { "epoch": 5.896296296296296, "grad_norm": 5.479251861572266, "learning_rate": 1.515221406087175e-08, "loss": 1.758, "step": 39800 }, { "epoch": 5.9037037037037035, "grad_norm": 4.6737823486328125, "learning_rate": 1.3068240863566285e-08, "loss": 1.6668, "step": 39850 }, { "epoch": 5.911111111111111, "grad_norm": 6.463534355163574, "learning_rate": 1.1138278508407985e-08, "loss": 1.7779, "step": 39900 }, { "epoch": 5.9185185185185185, "grad_norm": 5.896658897399902, "learning_rate": 9.362356757804947e-09, "loss": 1.703, "step": 39950 }, { "epoch": 5.925925925925926, "grad_norm": 5.711425304412842, "learning_rate": 7.740502998666577e-09, "loss": 1.6643, "step": 40000 }, { "epoch": 5.933333333333334, "grad_norm": 4.859104156494141, "learning_rate": 6.272742241985042e-09, "loss": 1.7116, "step": 40050 }, { "epoch": 5.940740740740741, "grad_norm": 5.876493453979492, "learning_rate": 4.9590971224444676e-09, "loss": 1.6339, "step": 40100 }, { "epoch": 5.948148148148148, "grad_norm": 4.462888717651367, "learning_rate": 3.799587898080104e-09, "loss": 1.6942, "step": 40150 }, { "epoch": 5.955555555555556, "grad_norm": 5.941119194030762, "learning_rate": 2.7942324499585782e-09, "loss": 1.7655, "step": 40200 }, { "epoch": 5.962962962962963, "grad_norm": 4.833111763000488, "learning_rate": 1.943046281903671e-09, "loss": 1.7224, "step": 40250 }, { "epoch": 5.97037037037037, "grad_norm": 5.334336757659912, "learning_rate": 1.2460425202587279e-09, "loss": 1.684, "step": 40300 }, { "epoch": 5.977777777777778, "grad_norm": 5.39022159576416, "learning_rate": 7.032319136845989e-10, "loss": 1.6564, "step": 40350 }, { "epoch": 5.985185185185185, "grad_norm": 4.58676290512085, "learning_rate": 3.1462283299199534e-10, "loss": 1.7112, "step": 40400 }, { "epoch": 5.992592592592593, "grad_norm": 3.8273935317993164, "learning_rate": 8.022127101492416e-11, "loss": 1.6573, "step": 40450 }, { "epoch": 6.0, "grad_norm": 4.507394313812256, "learning_rate": 3.0842514098239354e-14, "loss": 1.704, "step": 40500 }, { "epoch": 6.007407407407407, "grad_norm": 4.695806503295898, "learning_rate": 9.968610323118833e-07, "loss": 1.7468, "step": 40550 }, { "epoch": 6.014814814814815, "grad_norm": 4.552427768707275, "learning_rate": 9.822878109140143e-07, "loss": 1.6836, "step": 40600 }, { "epoch": 6.022222222222222, "grad_norm": 5.570436954498291, "learning_rate": 9.678163953077624e-07, "loss": 1.6861, "step": 40650 }, { "epoch": 6.029629629629629, "grad_norm": 4.6333909034729, "learning_rate": 9.53446948868707e-07, "loss": 1.6624, "step": 40700 }, { "epoch": 6.037037037037037, "grad_norm": 6.026545524597168, "learning_rate": 9.391796338212356e-07, "loss": 1.6884, "step": 40750 }, { "epoch": 6.044444444444444, "grad_norm": 4.965294361114502, "learning_rate": 9.25014611236723e-07, "loss": 1.7189, "step": 40800 }, { "epoch": 6.051851851851852, "grad_norm": 4.187284469604492, "learning_rate": 9.109520410317107e-07, "loss": 1.6792, "step": 40850 }, { "epoch": 6.059259259259259, "grad_norm": 5.307417392730713, "learning_rate": 8.969920819661016e-07, "loss": 1.6886, "step": 40900 }, { "epoch": 6.066666666666666, "grad_norm": 5.512707710266113, "learning_rate": 8.831348916413606e-07, "loss": 1.7772, "step": 40950 }, { "epoch": 6.074074074074074, "grad_norm": 5.011601448059082, "learning_rate": 8.693806264987482e-07, "loss": 1.7525, "step": 41000 }, { "epoch": 6.0814814814814815, "grad_norm": 4.537539005279541, "learning_rate": 8.55729441817541e-07, "loss": 1.6975, "step": 41050 }, { "epoch": 6.088888888888889, "grad_norm": 4.727987766265869, "learning_rate": 8.421814917132898e-07, "loss": 1.67, "step": 41100 }, { "epoch": 6.0962962962962965, "grad_norm": 4.8712382316589355, "learning_rate": 8.287369291360736e-07, "loss": 1.7992, "step": 41150 }, { "epoch": 6.103703703703704, "grad_norm": 4.62169075012207, "learning_rate": 8.1539590586877e-07, "loss": 1.7066, "step": 41200 }, { "epoch": 6.111111111111111, "grad_norm": 6.729462146759033, "learning_rate": 8.021585725253511e-07, "loss": 1.6944, "step": 41250 }, { "epoch": 6.118518518518519, "grad_norm": 4.7458577156066895, "learning_rate": 7.890250785491771e-07, "loss": 1.7662, "step": 41300 }, { "epoch": 6.125925925925926, "grad_norm": 4.354922771453857, "learning_rate": 7.759955722113077e-07, "loss": 1.679, "step": 41350 }, { "epoch": 6.133333333333334, "grad_norm": 5.127136707305908, "learning_rate": 7.630702006088298e-07, "loss": 1.758, "step": 41400 }, { "epoch": 6.140740740740741, "grad_norm": 4.486684322357178, "learning_rate": 7.502491096632003e-07, "loss": 1.7028, "step": 41450 }, { "epoch": 6.148148148148148, "grad_norm": 4.6135077476501465, "learning_rate": 7.375324441185938e-07, "loss": 1.7477, "step": 41500 }, { "epoch": 6.155555555555556, "grad_norm": 4.534246921539307, "learning_rate": 7.249203475402722e-07, "loss": 1.6541, "step": 41550 }, { "epoch": 6.162962962962963, "grad_norm": 4.888129711151123, "learning_rate": 7.124129623129605e-07, "loss": 1.6781, "step": 41600 }, { "epoch": 6.17037037037037, "grad_norm": 4.807866096496582, "learning_rate": 7.000104296392418e-07, "loss": 1.7778, "step": 41650 }, { "epoch": 6.177777777777778, "grad_norm": 5.536331653594971, "learning_rate": 6.877128895379625e-07, "loss": 1.7536, "step": 41700 }, { "epoch": 6.185185185185185, "grad_norm": 4.815216541290283, "learning_rate": 6.755204808426529e-07, "loss": 1.7125, "step": 41750 }, { "epoch": 6.192592592592592, "grad_norm": 5.147290229797363, "learning_rate": 6.634333411999527e-07, "loss": 1.7725, "step": 41800 }, { "epoch": 6.2, "grad_norm": 5.061643600463867, "learning_rate": 6.51451607068071e-07, "loss": 1.7192, "step": 41850 }, { "epoch": 6.207407407407407, "grad_norm": 4.440853595733643, "learning_rate": 6.395754137152321e-07, "loss": 1.6838, "step": 41900 }, { "epoch": 6.214814814814815, "grad_norm": 4.56868314743042, "learning_rate": 6.278048952181548e-07, "loss": 1.7385, "step": 41950 }, { "epoch": 6.222222222222222, "grad_norm": 3.9864776134490967, "learning_rate": 6.161401844605408e-07, "loss": 1.7712, "step": 42000 }, { "epoch": 6.229629629629629, "grad_norm": 4.387381553649902, "learning_rate": 6.045814131315653e-07, "loss": 1.6408, "step": 42050 }, { "epoch": 6.237037037037037, "grad_norm": 4.279029369354248, "learning_rate": 5.931287117244012e-07, "loss": 1.7067, "step": 42100 }, { "epoch": 6.2444444444444445, "grad_norm": 4.650752067565918, "learning_rate": 5.817822095347414e-07, "loss": 1.7025, "step": 42150 }, { "epoch": 6.2518518518518515, "grad_norm": 4.82672643661499, "learning_rate": 5.705420346593349e-07, "loss": 1.638, "step": 42200 }, { "epoch": 6.2592592592592595, "grad_norm": 4.590485095977783, "learning_rate": 5.594083139945505e-07, "loss": 1.6822, "step": 42250 }, { "epoch": 6.266666666666667, "grad_norm": 4.995912551879883, "learning_rate": 5.48381173234932e-07, "loss": 1.6746, "step": 42300 }, { "epoch": 6.274074074074074, "grad_norm": 5.399720668792725, "learning_rate": 5.374607368717899e-07, "loss": 1.7591, "step": 42350 }, { "epoch": 6.281481481481482, "grad_norm": 4.2428693771362305, "learning_rate": 5.266471281917906e-07, "loss": 1.7012, "step": 42400 }, { "epoch": 6.288888888888889, "grad_norm": 5.344005107879639, "learning_rate": 5.159404692755621e-07, "loss": 1.7282, "step": 42450 }, { "epoch": 6.296296296296296, "grad_norm": 5.161723613739014, "learning_rate": 5.053408809963234e-07, "loss": 1.6308, "step": 42500 }, { "epoch": 6.303703703703704, "grad_norm": 4.982937812805176, "learning_rate": 4.948484830185152e-07, "loss": 1.8198, "step": 42550 }, { "epoch": 6.311111111111111, "grad_norm": 6.293619632720947, "learning_rate": 4.844633937964471e-07, "loss": 1.7127, "step": 42600 }, { "epoch": 6.318518518518519, "grad_norm": 4.575809478759766, "learning_rate": 4.741857305729636e-07, "loss": 1.668, "step": 42650 }, { "epoch": 6.325925925925926, "grad_norm": 4.730220794677734, "learning_rate": 4.6401560937812006e-07, "loss": 1.7546, "step": 42700 }, { "epoch": 6.333333333333333, "grad_norm": 5.089296340942383, "learning_rate": 4.5395314502787224e-07, "loss": 1.6804, "step": 42750 }, { "epoch": 6.340740740740741, "grad_norm": 4.743657112121582, "learning_rate": 4.4399845112277795e-07, "loss": 1.7564, "step": 42800 }, { "epoch": 6.348148148148148, "grad_norm": 3.8570306301116943, "learning_rate": 4.341516400467194e-07, "loss": 1.6802, "step": 42850 }, { "epoch": 6.355555555555555, "grad_norm": 5.294703960418701, "learning_rate": 4.244128229656297e-07, "loss": 1.7917, "step": 42900 }, { "epoch": 6.362962962962963, "grad_norm": 4.600700855255127, "learning_rate": 4.1478210982624055e-07, "loss": 1.6821, "step": 42950 }, { "epoch": 6.37037037037037, "grad_norm": 5.0378642082214355, "learning_rate": 4.052596093548433e-07, "loss": 1.7137, "step": 43000 }, { "epoch": 6.377777777777778, "grad_norm": 5.134553909301758, "learning_rate": 3.9584542905604984e-07, "loss": 1.7319, "step": 43050 }, { "epoch": 6.385185185185185, "grad_norm": 4.90350866317749, "learning_rate": 3.8653967521159683e-07, "loss": 1.7284, "step": 43100 }, { "epoch": 6.392592592592592, "grad_norm": 5.329576015472412, "learning_rate": 3.773424528791314e-07, "loss": 1.7192, "step": 43150 }, { "epoch": 6.4, "grad_norm": 6.085256099700928, "learning_rate": 3.682538658910317e-07, "loss": 1.7121, "step": 43200 }, { "epoch": 6.407407407407407, "grad_norm": 5.10631799697876, "learning_rate": 3.5927401685323383e-07, "loss": 1.7475, "step": 43250 }, { "epoch": 6.4148148148148145, "grad_norm": 5.675637722015381, "learning_rate": 3.5040300714407116e-07, "loss": 1.7204, "step": 43300 }, { "epoch": 6.4222222222222225, "grad_norm": 4.893547058105469, "learning_rate": 3.4164093691313126e-07, "loss": 1.7431, "step": 43350 }, { "epoch": 6.42962962962963, "grad_norm": 5.115847110748291, "learning_rate": 3.329879050801288e-07, "loss": 1.688, "step": 43400 }, { "epoch": 6.437037037037037, "grad_norm": 4.1967010498046875, "learning_rate": 3.2444400933378085e-07, "loss": 1.6674, "step": 43450 }, { "epoch": 6.444444444444445, "grad_norm": 5.261149883270264, "learning_rate": 3.160093461307123e-07, "loss": 1.7298, "step": 43500 }, { "epoch": 6.451851851851852, "grad_norm": 5.664730072021484, "learning_rate": 3.0768401069436235e-07, "loss": 1.7269, "step": 43550 }, { "epoch": 6.459259259259259, "grad_norm": 4.641995906829834, "learning_rate": 2.9946809701390855e-07, "loss": 1.7186, "step": 43600 }, { "epoch": 6.466666666666667, "grad_norm": 5.614559650421143, "learning_rate": 2.91361697843211e-07, "loss": 1.6936, "step": 43650 }, { "epoch": 6.474074074074074, "grad_norm": 4.712127208709717, "learning_rate": 2.8336490469975665e-07, "loss": 1.6855, "step": 43700 }, { "epoch": 6.481481481481482, "grad_norm": 4.914728164672852, "learning_rate": 2.7547780786363666e-07, "loss": 1.7235, "step": 43750 }, { "epoch": 6.488888888888889, "grad_norm": 3.8866348266601562, "learning_rate": 2.677004963765184e-07, "loss": 1.6946, "step": 43800 }, { "epoch": 6.496296296296296, "grad_norm": 4.636266231536865, "learning_rate": 2.6003305804064626e-07, "loss": 1.7051, "step": 43850 }, { "epoch": 6.503703703703704, "grad_norm": 4.9295501708984375, "learning_rate": 2.524755794178413e-07, "loss": 1.7064, "step": 43900 }, { "epoch": 6.511111111111111, "grad_norm": 4.815357685089111, "learning_rate": 2.4502814582853863e-07, "loss": 1.7588, "step": 43950 }, { "epoch": 6.518518518518518, "grad_norm": 5.123908519744873, "learning_rate": 2.3769084135081165e-07, "loss": 1.6893, "step": 44000 }, { "epoch": 6.525925925925926, "grad_norm": 4.06140661239624, "learning_rate": 2.3046374881942614e-07, "loss": 1.7157, "step": 44050 }, { "epoch": 6.533333333333333, "grad_norm": 4.8077569007873535, "learning_rate": 2.2334694982490857e-07, "loss": 1.7089, "step": 44100 }, { "epoch": 6.540740740740741, "grad_norm": 4.5206828117370605, "learning_rate": 2.1634052471262267e-07, "loss": 1.6945, "step": 44150 }, { "epoch": 6.548148148148148, "grad_norm": 4.904362678527832, "learning_rate": 2.0944455258185893e-07, "loss": 1.6603, "step": 44200 }, { "epoch": 6.555555555555555, "grad_norm": 4.935449123382568, "learning_rate": 2.0265911128494852e-07, "loss": 1.7091, "step": 44250 }, { "epoch": 6.562962962962963, "grad_norm": 5.564135551452637, "learning_rate": 1.9598427742637872e-07, "loss": 1.7323, "step": 44300 }, { "epoch": 6.57037037037037, "grad_norm": 5.302987575531006, "learning_rate": 1.8942012636192997e-07, "loss": 1.7145, "step": 44350 }, { "epoch": 6.5777777777777775, "grad_norm": 4.711696147918701, "learning_rate": 1.829667321978268e-07, "loss": 1.6974, "step": 44400 }, { "epoch": 6.5851851851851855, "grad_norm": 5.5999932289123535, "learning_rate": 1.7662416778989722e-07, "loss": 1.7, "step": 44450 }, { "epoch": 6.592592592592593, "grad_norm": 4.745241641998291, "learning_rate": 1.7039250474275682e-07, "loss": 1.7128, "step": 44500 }, { "epoch": 6.6, "grad_norm": 4.5061750411987305, "learning_rate": 1.6427181340899045e-07, "loss": 1.8055, "step": 44550 }, { "epoch": 6.607407407407408, "grad_norm": 3.9571361541748047, "learning_rate": 1.5826216288836738e-07, "loss": 1.6653, "step": 44600 }, { "epoch": 6.614814814814815, "grad_norm": 5.4514288902282715, "learning_rate": 1.523636210270585e-07, "loss": 1.7121, "step": 44650 }, { "epoch": 6.622222222222222, "grad_norm": 4.900452613830566, "learning_rate": 1.4657625441686697e-07, "loss": 1.6662, "step": 44700 }, { "epoch": 6.62962962962963, "grad_norm": 4.878017902374268, "learning_rate": 1.4090012839447998e-07, "loss": 1.6853, "step": 44750 }, { "epoch": 6.637037037037037, "grad_norm": 6.244802951812744, "learning_rate": 1.353353070407304e-07, "loss": 1.7196, "step": 44800 }, { "epoch": 6.644444444444445, "grad_norm": 4.747304439544678, "learning_rate": 1.2988185317987178e-07, "loss": 1.7786, "step": 44850 }, { "epoch": 6.651851851851852, "grad_norm": 4.684929370880127, "learning_rate": 1.2453982837887123e-07, "loss": 1.7424, "step": 44900 }, { "epoch": 6.659259259259259, "grad_norm": 5.081334114074707, "learning_rate": 1.1930929294671324e-07, "loss": 1.7229, "step": 44950 }, { "epoch": 6.666666666666667, "grad_norm": 4.667269706726074, "learning_rate": 1.1419030593371794e-07, "loss": 1.7856, "step": 45000 }, { "epoch": 6.674074074074074, "grad_norm": 5.279778957366943, "learning_rate": 1.0918292513087736e-07, "loss": 1.7846, "step": 45050 }, { "epoch": 6.681481481481481, "grad_norm": 5.6646270751953125, "learning_rate": 1.0428720706920137e-07, "loss": 1.7493, "step": 45100 }, { "epoch": 6.688888888888889, "grad_norm": 4.250929832458496, "learning_rate": 9.950320701907601e-08, "loss": 1.7586, "step": 45150 }, { "epoch": 6.696296296296296, "grad_norm": 5.012825012207031, "learning_rate": 9.48309789896451e-08, "loss": 1.7286, "step": 45200 }, { "epoch": 6.703703703703704, "grad_norm": 4.255910396575928, "learning_rate": 9.027057572819963e-08, "loss": 1.774, "step": 45250 }, { "epoch": 6.711111111111111, "grad_norm": 6.555685997009277, "learning_rate": 8.582204871957711e-08, "loss": 1.6969, "step": 45300 }, { "epoch": 6.718518518518518, "grad_norm": 5.877486228942871, "learning_rate": 8.148544818558646e-08, "loss": 1.7386, "step": 45350 }, { "epoch": 6.725925925925926, "grad_norm": 6.148101806640625, "learning_rate": 7.726082308443849e-08, "loss": 1.6688, "step": 45400 }, { "epoch": 6.733333333333333, "grad_norm": 5.574037075042725, "learning_rate": 7.314822111019304e-08, "loss": 1.7214, "step": 45450 }, { "epoch": 6.7407407407407405, "grad_norm": 4.646797180175781, "learning_rate": 6.914768869221933e-08, "loss": 1.7853, "step": 45500 }, { "epoch": 6.7481481481481485, "grad_norm": 4.4932661056518555, "learning_rate": 6.525927099467644e-08, "loss": 1.6435, "step": 45550 }, { "epoch": 6.7555555555555555, "grad_norm": 4.503779411315918, "learning_rate": 6.148301191599593e-08, "loss": 1.7112, "step": 45600 }, { "epoch": 6.762962962962963, "grad_norm": 6.027365684509277, "learning_rate": 5.781895408839333e-08, "loss": 1.7374, "step": 45650 }, { "epoch": 6.770370370370371, "grad_norm": 3.774756908416748, "learning_rate": 5.426713887738522e-08, "loss": 1.6818, "step": 45700 }, { "epoch": 6.777777777777778, "grad_norm": 4.556633949279785, "learning_rate": 5.082760638131734e-08, "loss": 1.6574, "step": 45750 }, { "epoch": 6.785185185185185, "grad_norm": 4.793794631958008, "learning_rate": 4.750039543091944e-08, "loss": 1.7449, "step": 45800 }, { "epoch": 6.792592592592593, "grad_norm": 4.510838508605957, "learning_rate": 4.4285543588858946e-08, "loss": 1.7321, "step": 45850 }, { "epoch": 6.8, "grad_norm": 4.843527317047119, "learning_rate": 4.118308714932462e-08, "loss": 1.7609, "step": 45900 }, { "epoch": 6.807407407407408, "grad_norm": 6.334366321563721, "learning_rate": 3.819306113761245e-08, "loss": 1.6737, "step": 45950 }, { "epoch": 6.814814814814815, "grad_norm": 5.0371994972229, "learning_rate": 3.531549930973044e-08, "loss": 1.6988, "step": 46000 }, { "epoch": 6.822222222222222, "grad_norm": 4.915505886077881, "learning_rate": 3.255043415201664e-08, "loss": 1.6995, "step": 46050 }, { "epoch": 6.82962962962963, "grad_norm": 5.524471282958984, "learning_rate": 2.989789688077615e-08, "loss": 1.68, "step": 46100 }, { "epoch": 6.837037037037037, "grad_norm": 7.363884449005127, "learning_rate": 2.7357917441926952e-08, "loss": 1.6951, "step": 46150 }, { "epoch": 6.844444444444444, "grad_norm": 4.0302534103393555, "learning_rate": 2.4930524510656805e-08, "loss": 1.6722, "step": 46200 }, { "epoch": 6.851851851851852, "grad_norm": 4.887310028076172, "learning_rate": 2.261574549111023e-08, "loss": 1.7051, "step": 46250 }, { "epoch": 6.859259259259259, "grad_norm": 5.650010585784912, "learning_rate": 2.0413606516066496e-08, "loss": 1.6899, "step": 46300 }, { "epoch": 6.866666666666667, "grad_norm": 4.32122802734375, "learning_rate": 1.832413244665654e-08, "loss": 1.7623, "step": 46350 }, { "epoch": 6.874074074074074, "grad_norm": 4.738122940063477, "learning_rate": 1.6347346872074287e-08, "loss": 1.7246, "step": 46400 }, { "epoch": 6.881481481481481, "grad_norm": 4.318696975708008, "learning_rate": 1.4483272109310221e-08, "loss": 1.7247, "step": 46450 }, { "epoch": 6.888888888888889, "grad_norm": 5.06472635269165, "learning_rate": 1.2731929202907111e-08, "loss": 1.694, "step": 46500 }, { "epoch": 6.896296296296296, "grad_norm": 4.829148292541504, "learning_rate": 1.1093337924714675e-08, "loss": 1.7271, "step": 46550 }, { "epoch": 6.9037037037037035, "grad_norm": 4.980674743652344, "learning_rate": 9.567516773667517e-09, "loss": 1.6858, "step": 46600 }, { "epoch": 6.911111111111111, "grad_norm": 5.86638069152832, "learning_rate": 8.154482975579747e-09, "loss": 1.7079, "step": 46650 }, { "epoch": 6.9185185185185185, "grad_norm": 4.498690605163574, "learning_rate": 6.854252482949575e-09, "loss": 1.7333, "step": 46700 }, { "epoch": 6.925925925925926, "grad_norm": 4.175648212432861, "learning_rate": 5.666839974775018e-09, "loss": 1.6606, "step": 46750 }, { "epoch": 6.933333333333334, "grad_norm": 4.801917552947998, "learning_rate": 4.592258856394027e-09, "loss": 1.7077, "step": 46800 }, { "epoch": 6.940740740740741, "grad_norm": 3.6366991996765137, "learning_rate": 3.6305212593290562e-09, "loss": 1.702, "step": 46850 }, { "epoch": 6.948148148148148, "grad_norm": 4.430602550506592, "learning_rate": 2.781638041152723e-09, "loss": 1.5927, "step": 46900 }, { "epoch": 6.955555555555556, "grad_norm": 5.0436906814575195, "learning_rate": 2.0456187853612476e-09, "loss": 1.7092, "step": 46950 }, { "epoch": 6.962962962962963, "grad_norm": 4.557282447814941, "learning_rate": 1.422471801272307e-09, "loss": 1.7299, "step": 47000 }, { "epoch": 6.97037037037037, "grad_norm": 4.492140769958496, "learning_rate": 9.122041239262302e-10, "loss": 1.7693, "step": 47050 }, { "epoch": 6.977777777777778, "grad_norm": 6.007195472717285, "learning_rate": 5.148215140093893e-10, "loss": 1.6889, "step": 47100 }, { "epoch": 6.985185185185185, "grad_norm": 4.410074234008789, "learning_rate": 2.3032845778647728e-10, "loss": 1.7252, "step": 47150 }, { "epoch": 6.992592592592593, "grad_norm": 4.711976528167725, "learning_rate": 5.872816705165817e-11, "loss": 1.7031, "step": 47200 }, { "epoch": 7.0, "grad_norm": 5.362443923950195, "learning_rate": 2.2579094149932645e-14, "loss": 1.7029, "step": 47250 }, { "epoch": 7.007407407407407, "grad_norm": 5.93522310256958, "learning_rate": 7.642412558190549e-07, "loss": 1.6308, "step": 47300 }, { "epoch": 7.014814814814815, "grad_norm": 5.8510308265686035, "learning_rate": 7.530237348772096e-07, "loss": 1.7759, "step": 47350 }, { "epoch": 7.022222222222222, "grad_norm": 4.586331367492676, "learning_rate": 7.418859273881395e-07, "loss": 1.7443, "step": 47400 }, { "epoch": 7.029629629629629, "grad_norm": 4.656227111816406, "learning_rate": 7.308279293651843e-07, "loss": 1.6972, "step": 47450 }, { "epoch": 7.037037037037037, "grad_norm": 5.146853446960449, "learning_rate": 7.198498361336915e-07, "loss": 1.7466, "step": 47500 }, { "epoch": 7.044444444444444, "grad_norm": 6.802228927612305, "learning_rate": 7.089517423301873e-07, "loss": 1.6718, "step": 47550 }, { "epoch": 7.051851851851852, "grad_norm": 5.65762186050415, "learning_rate": 6.98133741901561e-07, "loss": 1.6671, "step": 47600 }, { "epoch": 7.059259259259259, "grad_norm": 4.821460247039795, "learning_rate": 6.873959281042652e-07, "loss": 1.7146, "step": 47650 }, { "epoch": 7.066666666666666, "grad_norm": 5.252972602844238, "learning_rate": 6.767383935034966e-07, "loss": 1.7193, "step": 47700 }, { "epoch": 7.074074074074074, "grad_norm": 4.320099353790283, "learning_rate": 6.661612299724151e-07, "loss": 1.6708, "step": 47750 }, { "epoch": 7.0814814814814815, "grad_norm": 5.750055313110352, "learning_rate": 6.556645286913321e-07, "loss": 1.7479, "step": 47800 }, { "epoch": 7.088888888888889, "grad_norm": 5.870389461517334, "learning_rate": 6.452483801469445e-07, "loss": 1.6698, "step": 47850 }, { "epoch": 7.0962962962962965, "grad_norm": 4.311103820800781, "learning_rate": 6.349128741315391e-07, "loss": 1.6857, "step": 47900 }, { "epoch": 7.103703703703704, "grad_norm": 4.5788421630859375, "learning_rate": 6.24658099742228e-07, "loss": 1.7202, "step": 47950 }, { "epoch": 7.111111111111111, "grad_norm": 5.257473468780518, "learning_rate": 6.144841453801753e-07, "loss": 1.6565, "step": 48000 }, { "epoch": 7.118518518518519, "grad_norm": 5.473094463348389, "learning_rate": 6.043910987498392e-07, "loss": 1.6446, "step": 48050 }, { "epoch": 7.125925925925926, "grad_norm": 4.250508785247803, "learning_rate": 5.943790468582123e-07, "loss": 1.6908, "step": 48100 }, { "epoch": 7.133333333333334, "grad_norm": 4.74968957901001, "learning_rate": 5.844480760140692e-07, "loss": 1.6499, "step": 48150 }, { "epoch": 7.140740740740741, "grad_norm": 4.537721157073975, "learning_rate": 5.745982718272358e-07, "loss": 1.6836, "step": 48200 }, { "epoch": 7.148148148148148, "grad_norm": 5.611239433288574, "learning_rate": 5.648297192078312e-07, "loss": 1.7206, "step": 48250 }, { "epoch": 7.155555555555556, "grad_norm": 6.5248613357543945, "learning_rate": 5.551425023655555e-07, "loss": 1.754, "step": 48300 }, { "epoch": 7.162962962962963, "grad_norm": 5.197353839874268, "learning_rate": 5.455367048089455e-07, "loss": 1.7913, "step": 48350 }, { "epoch": 7.17037037037037, "grad_norm": 5.016681671142578, "learning_rate": 5.360124093446728e-07, "loss": 1.7725, "step": 48400 }, { "epoch": 7.177777777777778, "grad_norm": 4.837548732757568, "learning_rate": 5.265696980768154e-07, "loss": 1.6899, "step": 48450 }, { "epoch": 7.185185185185185, "grad_norm": 4.5065436363220215, "learning_rate": 5.172086524061582e-07, "loss": 1.6665, "step": 48500 }, { "epoch": 7.192592592592592, "grad_norm": 5.763499736785889, "learning_rate": 5.079293530294882e-07, "loss": 1.6199, "step": 48550 }, { "epoch": 7.2, "grad_norm": 4.539221286773682, "learning_rate": 4.987318799388962e-07, "loss": 1.6389, "step": 48600 }, { "epoch": 7.207407407407407, "grad_norm": 5.979331016540527, "learning_rate": 4.896163124210984e-07, "loss": 1.8119, "step": 48650 }, { "epoch": 7.214814814814815, "grad_norm": 6.506845951080322, "learning_rate": 4.805827290567344e-07, "loss": 1.7298, "step": 48700 }, { "epoch": 7.222222222222222, "grad_norm": 4.519911766052246, "learning_rate": 4.716312077197116e-07, "loss": 1.7396, "step": 48750 }, { "epoch": 7.229629629629629, "grad_norm": 6.238486289978027, "learning_rate": 4.6276182557651206e-07, "loss": 1.6573, "step": 48800 }, { "epoch": 7.237037037037037, "grad_norm": 6.776463985443115, "learning_rate": 4.539746590855465e-07, "loss": 1.7192, "step": 48850 }, { "epoch": 7.2444444444444445, "grad_norm": 4.782049179077148, "learning_rate": 4.4526978399648034e-07, "loss": 1.7457, "step": 48900 }, { "epoch": 7.2518518518518515, "grad_norm": 5.603179931640625, "learning_rate": 4.3664727534959093e-07, "loss": 1.6802, "step": 48950 }, { "epoch": 7.2592592592592595, "grad_norm": 5.583448886871338, "learning_rate": 4.2810720747511356e-07, "loss": 1.7304, "step": 49000 }, { "epoch": 7.266666666666667, "grad_norm": 5.498423099517822, "learning_rate": 4.196496539926065e-07, "loss": 1.6843, "step": 49050 }, { "epoch": 7.274074074074074, "grad_norm": 4.396875381469727, "learning_rate": 4.1127468781031244e-07, "loss": 1.7462, "step": 49100 }, { "epoch": 7.281481481481482, "grad_norm": 4.723330020904541, "learning_rate": 4.0298238112453034e-07, "loss": 1.7431, "step": 49150 }, { "epoch": 7.288888888888889, "grad_norm": 5.44240140914917, "learning_rate": 3.9477280541899696e-07, "loss": 1.6973, "step": 49200 }, { "epoch": 7.296296296296296, "grad_norm": 3.720705270767212, "learning_rate": 3.866460314642617e-07, "loss": 1.6725, "step": 49250 }, { "epoch": 7.303703703703704, "grad_norm": 5.060797691345215, "learning_rate": 3.786021293170905e-07, "loss": 1.7596, "step": 49300 }, { "epoch": 7.311111111111111, "grad_norm": 5.072236061096191, "learning_rate": 3.706411683198452e-07, "loss": 1.6878, "step": 49350 }, { "epoch": 7.318518518518519, "grad_norm": 6.246946334838867, "learning_rate": 3.627632170999029e-07, "loss": 1.7211, "step": 49400 }, { "epoch": 7.325925925925926, "grad_norm": 5.087826728820801, "learning_rate": 3.5496834356904873e-07, "loss": 1.6149, "step": 49450 }, { "epoch": 7.333333333333333, "grad_norm": 4.0795392990112305, "learning_rate": 3.4725661492290285e-07, "loss": 1.6767, "step": 49500 }, { "epoch": 7.340740740740741, "grad_norm": 5.544936180114746, "learning_rate": 3.396280976403321e-07, "loss": 1.6989, "step": 49550 }, { "epoch": 7.348148148148148, "grad_norm": 4.768216609954834, "learning_rate": 3.320828574828805e-07, "loss": 1.7244, "step": 49600 }, { "epoch": 7.355555555555555, "grad_norm": 5.588283538818359, "learning_rate": 3.2462095949420734e-07, "loss": 1.6634, "step": 49650 }, { "epoch": 7.362962962962963, "grad_norm": 5.01304292678833, "learning_rate": 3.172424679995167e-07, "loss": 1.7597, "step": 49700 }, { "epoch": 7.37037037037037, "grad_norm": 5.483970642089844, "learning_rate": 3.0994744660501117e-07, "loss": 1.631, "step": 49750 }, { "epoch": 7.377777777777778, "grad_norm": 4.719277858734131, "learning_rate": 3.027359581973377e-07, "loss": 1.7333, "step": 49800 }, { "epoch": 7.385185185185185, "grad_norm": 4.411671161651611, "learning_rate": 2.956080649430504e-07, "loss": 1.7788, "step": 49850 }, { "epoch": 7.392592592592592, "grad_norm": 4.788469314575195, "learning_rate": 2.885638282880698e-07, "loss": 1.7177, "step": 49900 }, { "epoch": 7.4, "grad_norm": 5.937678813934326, "learning_rate": 2.8160330895715905e-07, "loss": 1.728, "step": 49950 }, { "epoch": 7.407407407407407, "grad_norm": 5.438296318054199, "learning_rate": 2.747265669533938e-07, "loss": 1.6454, "step": 50000 }, { "epoch": 7.4148148148148145, "grad_norm": 4.224100112915039, "learning_rate": 2.6793366155765e-07, "loss": 1.6623, "step": 50050 }, { "epoch": 7.4222222222222225, "grad_norm": 4.0681657791137695, "learning_rate": 2.6122465132809026e-07, "loss": 1.6685, "step": 50100 }, { "epoch": 7.42962962962963, "grad_norm": 4.77517032623291, "learning_rate": 2.545995940996604e-07, "loss": 1.7017, "step": 50150 }, { "epoch": 7.437037037037037, "grad_norm": 5.433816432952881, "learning_rate": 2.480585469835917e-07, "loss": 1.7676, "step": 50200 }, { "epoch": 7.444444444444445, "grad_norm": 4.526613712310791, "learning_rate": 2.4160156636690355e-07, "loss": 1.681, "step": 50250 }, { "epoch": 7.451851851851852, "grad_norm": 5.165018081665039, "learning_rate": 2.352287079119242e-07, "loss": 1.7755, "step": 50300 }, { "epoch": 7.459259259259259, "grad_norm": 4.912266254425049, "learning_rate": 2.2894002655580727e-07, "loss": 1.7047, "step": 50350 }, { "epoch": 7.466666666666667, "grad_norm": 4.354547023773193, "learning_rate": 2.2273557651005806e-07, "loss": 1.6958, "step": 50400 }, { "epoch": 7.474074074074074, "grad_norm": 4.184830188751221, "learning_rate": 2.1661541126006603e-07, "loss": 1.6962, "step": 50450 }, { "epoch": 7.481481481481482, "grad_norm": 5.213493824005127, "learning_rate": 2.1057958356464624e-07, "loss": 1.6959, "step": 50500 }, { "epoch": 7.488888888888889, "grad_norm": 4.918919086456299, "learning_rate": 2.0462814545558318e-07, "loss": 1.6993, "step": 50550 }, { "epoch": 7.496296296296296, "grad_norm": 4.9153218269348145, "learning_rate": 1.987611482371776e-07, "loss": 1.6823, "step": 50600 }, { "epoch": 7.503703703703704, "grad_norm": 4.119404315948486, "learning_rate": 1.929786424858149e-07, "loss": 1.6665, "step": 50650 }, { "epoch": 7.511111111111111, "grad_norm": 4.445786476135254, "learning_rate": 1.8728067804951844e-07, "loss": 1.6962, "step": 50700 }, { "epoch": 7.518518518518518, "grad_norm": 7.642633438110352, "learning_rate": 1.8166730404752474e-07, "loss": 1.6504, "step": 50750 }, { "epoch": 7.525925925925926, "grad_norm": 4.436051845550537, "learning_rate": 1.7613856886986025e-07, "loss": 1.7457, "step": 50800 }, { "epoch": 7.533333333333333, "grad_norm": 4.766321182250977, "learning_rate": 1.706945201769239e-07, "loss": 1.7546, "step": 50850 }, { "epoch": 7.540740740740741, "grad_norm": 5.1242289543151855, "learning_rate": 1.6533520489907305e-07, "loss": 1.6541, "step": 50900 }, { "epoch": 7.548148148148148, "grad_norm": 4.930501937866211, "learning_rate": 1.600606692362261e-07, "loss": 1.7488, "step": 50950 }, { "epoch": 7.555555555555555, "grad_norm": 6.121901988983154, "learning_rate": 1.5487095865745593e-07, "loss": 1.7194, "step": 51000 }, { "epoch": 7.562962962962963, "grad_norm": 4.327436447143555, "learning_rate": 1.497661179006027e-07, "loss": 1.7288, "step": 51050 }, { "epoch": 7.57037037037037, "grad_norm": 5.480533599853516, "learning_rate": 1.4474619097188835e-07, "loss": 1.6727, "step": 51100 }, { "epoch": 7.5777777777777775, "grad_norm": 5.402645111083984, "learning_rate": 1.398112211455338e-07, "loss": 1.675, "step": 51150 }, { "epoch": 7.5851851851851855, "grad_norm": 4.789156913757324, "learning_rate": 1.3496125096339129e-07, "loss": 1.6841, "step": 51200 }, { "epoch": 7.592592592592593, "grad_norm": 5.137218952178955, "learning_rate": 1.3019632223457258e-07, "loss": 1.6567, "step": 51250 }, { "epoch": 7.6, "grad_norm": 5.8797736167907715, "learning_rate": 1.2551647603509242e-07, "loss": 1.7339, "step": 51300 }, { "epoch": 7.607407407407408, "grad_norm": 5.697160720825195, "learning_rate": 1.2092175270750905e-07, "loss": 1.6819, "step": 51350 }, { "epoch": 7.614814814814815, "grad_norm": 4.531454086303711, "learning_rate": 1.1641219186058317e-07, "loss": 1.6752, "step": 51400 }, { "epoch": 7.622222222222222, "grad_norm": 4.519044876098633, "learning_rate": 1.1198783236893274e-07, "loss": 1.7337, "step": 51450 }, { "epoch": 7.62962962962963, "grad_norm": 4.373157501220703, "learning_rate": 1.076487123726977e-07, "loss": 1.7309, "step": 51500 }, { "epoch": 7.637037037037037, "grad_norm": 5.711727142333984, "learning_rate": 1.0339486927721131e-07, "loss": 1.5929, "step": 51550 }, { "epoch": 7.644444444444445, "grad_norm": 4.87087869644165, "learning_rate": 9.922633975268048e-08, "loss": 1.7241, "step": 51600 }, { "epoch": 7.651851851851852, "grad_norm": 5.056915760040283, "learning_rate": 9.514315973386589e-08, "loss": 1.6532, "step": 51650 }, { "epoch": 7.659259259259259, "grad_norm": 6.0363569259643555, "learning_rate": 9.114536441977351e-08, "loss": 1.8079, "step": 51700 }, { "epoch": 7.666666666666667, "grad_norm": 4.777937412261963, "learning_rate": 8.723298827335247e-08, "loss": 1.7689, "step": 51750 }, { "epoch": 7.674074074074074, "grad_norm": 4.326016426086426, "learning_rate": 8.340606502119542e-08, "loss": 1.7673, "step": 51800 }, { "epoch": 7.681481481481481, "grad_norm": 4.3759541511535645, "learning_rate": 7.96646276532509e-08, "loss": 1.7127, "step": 51850 }, { "epoch": 7.688888888888889, "grad_norm": 4.051638603210449, "learning_rate": 7.600870842253805e-08, "loss": 1.7043, "step": 51900 }, { "epoch": 7.696296296296296, "grad_norm": 6.273709774017334, "learning_rate": 7.243833884486462e-08, "loss": 1.6677, "step": 51950 }, { "epoch": 7.703703703703704, "grad_norm": 4.785228729248047, "learning_rate": 6.895354969856271e-08, "loss": 1.6629, "step": 52000 }, { "epoch": 7.711111111111111, "grad_norm": 4.696812152862549, "learning_rate": 6.555437102421458e-08, "loss": 1.6376, "step": 52050 }, { "epoch": 7.718518518518518, "grad_norm": 5.83518123626709, "learning_rate": 6.224083212440391e-08, "loss": 1.7027, "step": 52100 }, { "epoch": 7.725925925925926, "grad_norm": 4.885500431060791, "learning_rate": 5.9012961563451595e-08, "loss": 1.7116, "step": 52150 }, { "epoch": 7.733333333333333, "grad_norm": 6.395565509796143, "learning_rate": 5.5870787167181526e-08, "loss": 1.643, "step": 52200 }, { "epoch": 7.7407407407407405, "grad_norm": 6.444253444671631, "learning_rate": 5.281433602267072e-08, "loss": 1.7616, "step": 52250 }, { "epoch": 7.7481481481481485, "grad_norm": 4.798886299133301, "learning_rate": 4.984363447802065e-08, "loss": 1.6833, "step": 52300 }, { "epoch": 7.7555555555555555, "grad_norm": 5.61707067489624, "learning_rate": 4.695870814213188e-08, "loss": 1.7252, "step": 52350 }, { "epoch": 7.762962962962963, "grad_norm": 4.683418273925781, "learning_rate": 4.4159581884476444e-08, "loss": 1.6788, "step": 52400 }, { "epoch": 7.770370370370371, "grad_norm": 7.159529209136963, "learning_rate": 4.144627983489136e-08, "loss": 1.6954, "step": 52450 }, { "epoch": 7.777777777777778, "grad_norm": 3.812352418899536, "learning_rate": 3.881882538336657e-08, "loss": 1.6584, "step": 52500 }, { "epoch": 7.785185185185185, "grad_norm": 4.499040126800537, "learning_rate": 3.627724117984177e-08, "loss": 1.6402, "step": 52550 }, { "epoch": 7.792592592592593, "grad_norm": 4.413599014282227, "learning_rate": 3.3821549134014365e-08, "loss": 1.6671, "step": 52600 }, { "epoch": 7.8, "grad_norm": 5.2130126953125, "learning_rate": 3.145177041514958e-08, "loss": 1.6463, "step": 52650 }, { "epoch": 7.807407407407408, "grad_norm": 4.559216022491455, "learning_rate": 2.9167925451898393e-08, "loss": 1.6551, "step": 52700 }, { "epoch": 7.814814814814815, "grad_norm": 5.212027072906494, "learning_rate": 2.6970033932118833e-08, "loss": 1.7194, "step": 52750 }, { "epoch": 7.822222222222222, "grad_norm": 5.727282524108887, "learning_rate": 2.4858114802711607e-08, "loss": 1.7803, "step": 52800 }, { "epoch": 7.82962962962963, "grad_norm": 4.719393253326416, "learning_rate": 2.2832186269449163e-08, "loss": 1.7053, "step": 52850 }, { "epoch": 7.837037037037037, "grad_norm": 6.631552696228027, "learning_rate": 2.0892265796825795e-08, "loss": 1.6426, "step": 52900 }, { "epoch": 7.844444444444444, "grad_norm": 5.424106121063232, "learning_rate": 1.9038370107905546e-08, "loss": 1.6795, "step": 52950 }, { "epoch": 7.851851851851852, "grad_norm": 4.815089702606201, "learning_rate": 1.7270515184172333e-08, "loss": 1.6766, "step": 53000 }, { "epoch": 7.859259259259259, "grad_norm": 5.34329891204834, "learning_rate": 1.5588716265398928e-08, "loss": 1.6708, "step": 53050 }, { "epoch": 7.866666666666667, "grad_norm": 4.181158065795898, "learning_rate": 1.399298784951708e-08, "loss": 1.7145, "step": 53100 }, { "epoch": 7.874074074074074, "grad_norm": 4.6021623611450195, "learning_rate": 1.2483343692482054e-08, "loss": 1.6803, "step": 53150 }, { "epoch": 7.881481481481481, "grad_norm": 5.097445011138916, "learning_rate": 1.1059796808164935e-08, "loss": 1.6934, "step": 53200 }, { "epoch": 7.888888888888889, "grad_norm": 4.321223258972168, "learning_rate": 9.722359468234966e-09, "loss": 1.6973, "step": 53250 }, { "epoch": 7.896296296296296, "grad_norm": 5.460144519805908, "learning_rate": 8.471043202057383e-09, "loss": 1.6736, "step": 53300 }, { "epoch": 7.9037037037037035, "grad_norm": 5.165498733520508, "learning_rate": 7.305858796586851e-09, "loss": 1.7208, "step": 53350 }, { "epoch": 7.911111111111111, "grad_norm": 5.325474262237549, "learning_rate": 6.226816296283078e-09, "loss": 1.6963, "step": 53400 }, { "epoch": 7.9185185185185185, "grad_norm": 4.388497352600098, "learning_rate": 5.23392500302089e-09, "loss": 1.6946, "step": 53450 }, { "epoch": 7.925925925925926, "grad_norm": 4.683349609375, "learning_rate": 4.327193476006963e-09, "loss": 1.7026, "step": 53500 }, { "epoch": 7.933333333333334, "grad_norm": 4.730576515197754, "learning_rate": 3.506629531710992e-09, "loss": 1.6988, "step": 53550 }, { "epoch": 7.940740740740741, "grad_norm": 4.461126804351807, "learning_rate": 2.7722402437935224e-09, "loss": 1.7147, "step": 53600 }, { "epoch": 7.948148148148148, "grad_norm": 4.75923490524292, "learning_rate": 2.124031943050442e-09, "loss": 1.6821, "step": 53650 }, { "epoch": 7.955555555555556, "grad_norm": 4.524296760559082, "learning_rate": 1.5620102173530272e-09, "loss": 1.6972, "step": 53700 }, { "epoch": 7.962962962962963, "grad_norm": 4.123456954956055, "learning_rate": 1.0861799116046457e-09, "loss": 1.6129, "step": 53750 }, { "epoch": 7.97037037037037, "grad_norm": 5.126282691955566, "learning_rate": 6.965451276919055e-10, "loss": 1.6673, "step": 53800 }, { "epoch": 7.977777777777778, "grad_norm": 5.508212089538574, "learning_rate": 3.931092244602308e-10, "loss": 1.7623, "step": 53850 }, { "epoch": 7.985185185185185, "grad_norm": 5.099086284637451, "learning_rate": 1.7587481767389336e-10, "loss": 1.7465, "step": 53900 }, { "epoch": 7.992592592592593, "grad_norm": 4.575199127197266, "learning_rate": 4.484377999935952e-11, "loss": 1.7073, "step": 53950 }, { "epoch": 8.0, "grad_norm": 4.952852249145508, "learning_rate": 1.724098974698052e-14, "loss": 1.765, "step": 54000 }, { "epoch": 8.007407407407408, "grad_norm": 5.91780424118042, "learning_rate": 6.042434474232695e-07, "loss": 1.6681, "step": 54050 }, { "epoch": 8.014814814814814, "grad_norm": 5.638923168182373, "learning_rate": 5.953501031737729e-07, "loss": 1.6993, "step": 54100 }, { "epoch": 8.022222222222222, "grad_norm": 4.633487224578857, "learning_rate": 5.865206835271154e-07, "loss": 1.6813, "step": 54150 }, { "epoch": 8.02962962962963, "grad_norm": 5.461434364318848, "learning_rate": 5.777552484979865e-07, "loss": 1.7147, "step": 54200 }, { "epoch": 8.037037037037036, "grad_norm": 4.728063583374023, "learning_rate": 5.69053857666163e-07, "loss": 1.6808, "step": 54250 }, { "epoch": 8.044444444444444, "grad_norm": 4.8988213539123535, "learning_rate": 5.604165701761067e-07, "loss": 1.7503, "step": 54300 }, { "epoch": 8.051851851851852, "grad_norm": 5.497891902923584, "learning_rate": 5.518434447365606e-07, "loss": 1.677, "step": 54350 }, { "epoch": 8.059259259259258, "grad_norm": 4.321437358856201, "learning_rate": 5.433345396201506e-07, "loss": 1.7058, "step": 54400 }, { "epoch": 8.066666666666666, "grad_norm": 5.676205158233643, "learning_rate": 5.34889912662988e-07, "loss": 1.6936, "step": 54450 }, { "epoch": 8.074074074074074, "grad_norm": 4.2474894523620605, "learning_rate": 5.265096212642762e-07, "loss": 1.6949, "step": 54500 }, { "epoch": 8.081481481481482, "grad_norm": 6.3445143699646, "learning_rate": 5.181937223859246e-07, "loss": 1.7373, "step": 54550 }, { "epoch": 8.088888888888889, "grad_norm": 4.5797038078308105, "learning_rate": 5.099422725521553e-07, "loss": 1.6622, "step": 54600 }, { "epoch": 8.096296296296297, "grad_norm": 5.370152950286865, "learning_rate": 5.017553278491238e-07, "loss": 1.6767, "step": 54650 }, { "epoch": 8.103703703703705, "grad_norm": 5.182304859161377, "learning_rate": 4.936329439245369e-07, "loss": 1.7094, "step": 54700 }, { "epoch": 8.11111111111111, "grad_norm": 4.945371150970459, "learning_rate": 4.855751759872707e-07, "loss": 1.6705, "step": 54750 }, { "epoch": 8.118518518518519, "grad_norm": 4.181769847869873, "learning_rate": 4.775820788070018e-07, "loss": 1.7182, "step": 54800 }, { "epoch": 8.125925925925927, "grad_norm": 4.810791969299316, "learning_rate": 4.6965370671382735e-07, "loss": 1.7706, "step": 54850 }, { "epoch": 8.133333333333333, "grad_norm": 4.857903480529785, "learning_rate": 4.6179011359790414e-07, "loss": 1.7079, "step": 54900 }, { "epoch": 8.14074074074074, "grad_norm": 3.9374470710754395, "learning_rate": 4.539913529090734e-07, "loss": 1.7181, "step": 54950 }, { "epoch": 8.148148148148149, "grad_norm": 4.6941237449646, "learning_rate": 4.462574776565054e-07, "loss": 1.6491, "step": 55000 }, { "epoch": 8.155555555555555, "grad_norm": 4.580036640167236, "learning_rate": 4.3858854040833564e-07, "loss": 1.7548, "step": 55050 }, { "epoch": 8.162962962962963, "grad_norm": 3.972926139831543, "learning_rate": 4.3098459329130813e-07, "loss": 1.718, "step": 55100 }, { "epoch": 8.170370370370371, "grad_norm": 5.282564163208008, "learning_rate": 4.2344568799041807e-07, "loss": 1.7448, "step": 55150 }, { "epoch": 8.177777777777777, "grad_norm": 5.370199680328369, "learning_rate": 4.159718757485642e-07, "loss": 1.7111, "step": 55200 }, { "epoch": 8.185185185185185, "grad_norm": 6.447271347045898, "learning_rate": 4.085632073662016e-07, "loss": 1.7007, "step": 55250 }, { "epoch": 8.192592592592593, "grad_norm": 4.709756851196289, "learning_rate": 4.012197332009915e-07, "loss": 1.6363, "step": 55300 }, { "epoch": 8.2, "grad_norm": 4.706228256225586, "learning_rate": 3.9394150316746317e-07, "loss": 1.6756, "step": 55350 }, { "epoch": 8.207407407407407, "grad_norm": 4.972978591918945, "learning_rate": 3.867285667366727e-07, "loss": 1.7409, "step": 55400 }, { "epoch": 8.214814814814815, "grad_norm": 4.665986061096191, "learning_rate": 3.7958097293586684e-07, "loss": 1.7058, "step": 55450 }, { "epoch": 8.222222222222221, "grad_norm": 5.660872459411621, "learning_rate": 3.7249877034815306e-07, "loss": 1.7001, "step": 55500 }, { "epoch": 8.22962962962963, "grad_norm": 5.380090713500977, "learning_rate": 3.654820071121612e-07, "loss": 1.7128, "step": 55550 }, { "epoch": 8.237037037037037, "grad_norm": 7.617238998413086, "learning_rate": 3.5853073092172566e-07, "loss": 1.6826, "step": 55600 }, { "epoch": 8.244444444444444, "grad_norm": 5.554752349853516, "learning_rate": 3.5164498902555687e-07, "loss": 1.7362, "step": 55650 }, { "epoch": 8.251851851851852, "grad_norm": 4.808479309082031, "learning_rate": 3.448248282269173e-07, "loss": 1.65, "step": 55700 }, { "epoch": 8.25925925925926, "grad_norm": 3.840391159057617, "learning_rate": 3.380702948833103e-07, "loss": 1.7487, "step": 55750 }, { "epoch": 8.266666666666667, "grad_norm": 5.313462257385254, "learning_rate": 3.313814349061573e-07, "loss": 1.6453, "step": 55800 }, { "epoch": 8.274074074074074, "grad_norm": 4.17125129699707, "learning_rate": 3.247582937604921e-07, "loss": 1.709, "step": 55850 }, { "epoch": 8.281481481481482, "grad_norm": 3.761216163635254, "learning_rate": 3.1820091646464825e-07, "loss": 1.6329, "step": 55900 }, { "epoch": 8.28888888888889, "grad_norm": 5.288180351257324, "learning_rate": 3.117093475899546e-07, "loss": 1.6381, "step": 55950 }, { "epoch": 8.296296296296296, "grad_norm": 4.487936973571777, "learning_rate": 3.0528363126043016e-07, "loss": 1.7076, "step": 56000 }, { "epoch": 8.303703703703704, "grad_norm": 4.927828788757324, "learning_rate": 2.9892381115248836e-07, "loss": 1.6595, "step": 56050 }, { "epoch": 8.311111111111112, "grad_norm": 4.865903377532959, "learning_rate": 2.9262993049463564e-07, "loss": 1.6762, "step": 56100 }, { "epoch": 8.318518518518518, "grad_norm": 5.675967216491699, "learning_rate": 2.864020320671812e-07, "loss": 1.6536, "step": 56150 }, { "epoch": 8.325925925925926, "grad_norm": 6.040846347808838, "learning_rate": 2.8024015820194093e-07, "loss": 1.6822, "step": 56200 }, { "epoch": 8.333333333333334, "grad_norm": 5.212625026702881, "learning_rate": 2.741443507819597e-07, "loss": 1.731, "step": 56250 }, { "epoch": 8.34074074074074, "grad_norm": 4.386477947235107, "learning_rate": 2.681146512412136e-07, "loss": 1.6979, "step": 56300 }, { "epoch": 8.348148148148148, "grad_norm": 5.230815887451172, "learning_rate": 2.621511005643407e-07, "loss": 1.7063, "step": 56350 }, { "epoch": 8.355555555555556, "grad_norm": 4.48565673828125, "learning_rate": 2.5625373928635176e-07, "loss": 1.6047, "step": 56400 }, { "epoch": 8.362962962962962, "grad_norm": 4.869395732879639, "learning_rate": 2.5042260749236434e-07, "loss": 1.7181, "step": 56450 }, { "epoch": 8.37037037037037, "grad_norm": 4.625112056732178, "learning_rate": 2.446577448173215e-07, "loss": 1.7443, "step": 56500 }, { "epoch": 8.377777777777778, "grad_norm": 4.910737991333008, "learning_rate": 2.3895919044573223e-07, "loss": 1.667, "step": 56550 }, { "epoch": 8.385185185185184, "grad_norm": 5.058411598205566, "learning_rate": 2.3332698311139378e-07, "loss": 1.7574, "step": 56600 }, { "epoch": 8.392592592592592, "grad_norm": 4.682469844818115, "learning_rate": 2.2776116109713753e-07, "loss": 1.6996, "step": 56650 }, { "epoch": 8.4, "grad_norm": 5.020571708679199, "learning_rate": 2.2226176223456353e-07, "loss": 1.6864, "step": 56700 }, { "epoch": 8.407407407407407, "grad_norm": 4.348459720611572, "learning_rate": 2.1682882390378633e-07, "loss": 1.737, "step": 56750 }, { "epoch": 8.414814814814815, "grad_norm": 4.722019672393799, "learning_rate": 2.1146238303317858e-07, "loss": 1.6447, "step": 56800 }, { "epoch": 8.422222222222222, "grad_norm": 5.096248626708984, "learning_rate": 2.0616247609912543e-07, "loss": 1.7022, "step": 56850 }, { "epoch": 8.42962962962963, "grad_norm": 6.641050338745117, "learning_rate": 2.0092913912576617e-07, "loss": 1.696, "step": 56900 }, { "epoch": 8.437037037037037, "grad_norm": 5.072902202606201, "learning_rate": 1.9576240768475975e-07, "loss": 1.6881, "step": 56950 }, { "epoch": 8.444444444444445, "grad_norm": 4.475610733032227, "learning_rate": 1.9066231689503721e-07, "loss": 1.7255, "step": 57000 }, { "epoch": 8.451851851851853, "grad_norm": 5.1616621017456055, "learning_rate": 1.856289014225654e-07, "loss": 1.7362, "step": 57050 }, { "epoch": 8.459259259259259, "grad_norm": 4.945127964019775, "learning_rate": 1.806621954801091e-07, "loss": 1.7219, "step": 57100 }, { "epoch": 8.466666666666667, "grad_norm": 4.375204086303711, "learning_rate": 1.7576223282700255e-07, "loss": 1.6566, "step": 57150 }, { "epoch": 8.474074074074075, "grad_norm": 4.772896766662598, "learning_rate": 1.7092904676891509e-07, "loss": 1.724, "step": 57200 }, { "epoch": 8.481481481481481, "grad_norm": 6.058934211730957, "learning_rate": 1.6616267015762799e-07, "loss": 1.7495, "step": 57250 }, { "epoch": 8.488888888888889, "grad_norm": 5.737463474273682, "learning_rate": 1.6146313539081026e-07, "loss": 1.76, "step": 57300 }, { "epoch": 8.496296296296297, "grad_norm": 4.475915431976318, "learning_rate": 1.5683047441179656e-07, "loss": 1.6865, "step": 57350 }, { "epoch": 8.503703703703703, "grad_norm": 4.296292781829834, "learning_rate": 1.522647187093751e-07, "loss": 1.7438, "step": 57400 }, { "epoch": 8.511111111111111, "grad_norm": 5.862701416015625, "learning_rate": 1.4776589931756902e-07, "loss": 1.7516, "step": 57450 }, { "epoch": 8.518518518518519, "grad_norm": 5.117536544799805, "learning_rate": 1.4333404681542428e-07, "loss": 1.7426, "step": 57500 }, { "epoch": 8.525925925925925, "grad_norm": 4.911297798156738, "learning_rate": 1.3896919132680875e-07, "loss": 1.7246, "step": 57550 }, { "epoch": 8.533333333333333, "grad_norm": 6.487166881561279, "learning_rate": 1.346713625202001e-07, "loss": 1.7129, "step": 57600 }, { "epoch": 8.540740740740741, "grad_norm": 5.194836616516113, "learning_rate": 1.3044058960848815e-07, "loss": 1.6967, "step": 57650 }, { "epoch": 8.548148148148147, "grad_norm": 4.895190238952637, "learning_rate": 1.2627690134877524e-07, "loss": 1.7175, "step": 57700 }, { "epoch": 8.555555555555555, "grad_norm": 4.806921005249023, "learning_rate": 1.2218032604218056e-07, "loss": 1.6729, "step": 57750 }, { "epoch": 8.562962962962963, "grad_norm": 5.2344865798950195, "learning_rate": 1.1815089153364711e-07, "loss": 1.7009, "step": 57800 }, { "epoch": 8.57037037037037, "grad_norm": 5.767990589141846, "learning_rate": 1.1418862521175634e-07, "loss": 1.7572, "step": 57850 }, { "epoch": 8.577777777777778, "grad_norm": 4.904351711273193, "learning_rate": 1.1029355400853481e-07, "loss": 1.6867, "step": 57900 }, { "epoch": 8.585185185185185, "grad_norm": 5.978799343109131, "learning_rate": 1.0646570439928006e-07, "loss": 1.7102, "step": 57950 }, { "epoch": 8.592592592592592, "grad_norm": 5.98206901550293, "learning_rate": 1.0270510240236953e-07, "loss": 1.7309, "step": 58000 }, { "epoch": 8.6, "grad_norm": 5.000571250915527, "learning_rate": 9.901177357909742e-08, "loss": 1.7079, "step": 58050 }, { "epoch": 8.607407407407408, "grad_norm": 5.031994342803955, "learning_rate": 9.538574303348813e-08, "loss": 1.6487, "step": 58100 }, { "epoch": 8.614814814814816, "grad_norm": 7.2956671714782715, "learning_rate": 9.182703541213423e-08, "loss": 1.6381, "step": 58150 }, { "epoch": 8.622222222222222, "grad_norm": 5.399662971496582, "learning_rate": 8.833567490402206e-08, "loss": 1.7005, "step": 58200 }, { "epoch": 8.62962962962963, "grad_norm": 5.364224433898926, "learning_rate": 8.49116852403764e-08, "loss": 1.6661, "step": 58250 }, { "epoch": 8.637037037037038, "grad_norm": 4.17866325378418, "learning_rate": 8.155508969448944e-08, "loss": 1.5835, "step": 58300 }, { "epoch": 8.644444444444444, "grad_norm": 5.372631072998047, "learning_rate": 7.826591108156867e-08, "loss": 1.6723, "step": 58350 }, { "epoch": 8.651851851851852, "grad_norm": 4.693414688110352, "learning_rate": 7.504417175858036e-08, "loss": 1.6853, "step": 58400 }, { "epoch": 8.65925925925926, "grad_norm": 5.828296661376953, "learning_rate": 7.188989362409638e-08, "loss": 1.7392, "step": 58450 }, { "epoch": 8.666666666666666, "grad_norm": 4.067485809326172, "learning_rate": 6.880309811814757e-08, "loss": 1.6361, "step": 58500 }, { "epoch": 8.674074074074074, "grad_norm": 4.196707248687744, "learning_rate": 6.578380622207503e-08, "loss": 1.6795, "step": 58550 }, { "epoch": 8.681481481481482, "grad_norm": 5.865671634674072, "learning_rate": 6.283203845839137e-08, "loss": 1.6751, "step": 58600 }, { "epoch": 8.688888888888888, "grad_norm": 4.712571620941162, "learning_rate": 5.994781489063738e-08, "loss": 1.6887, "step": 58650 }, { "epoch": 8.696296296296296, "grad_norm": 4.615510940551758, "learning_rate": 5.713115512324674e-08, "loss": 1.6539, "step": 58700 }, { "epoch": 8.703703703703704, "grad_norm": 4.818018913269043, "learning_rate": 5.438207830141706e-08, "loss": 1.6824, "step": 58750 }, { "epoch": 8.71111111111111, "grad_norm": 5.266839981079102, "learning_rate": 5.1700603110971246e-08, "loss": 1.6686, "step": 58800 }, { "epoch": 8.718518518518518, "grad_norm": 7.0362229347229, "learning_rate": 4.908674777823863e-08, "loss": 1.7219, "step": 58850 }, { "epoch": 8.725925925925926, "grad_norm": 3.8932015895843506, "learning_rate": 4.6540530069927317e-08, "loss": 1.7637, "step": 58900 }, { "epoch": 8.733333333333333, "grad_norm": 5.245306015014648, "learning_rate": 4.406196729300094e-08, "loss": 1.6668, "step": 58950 }, { "epoch": 8.74074074074074, "grad_norm": 6.06035852432251, "learning_rate": 4.165107629456877e-08, "loss": 1.7053, "step": 59000 }, { "epoch": 8.748148148148148, "grad_norm": 5.4172539710998535, "learning_rate": 3.930787346176357e-08, "loss": 1.7195, "step": 59050 }, { "epoch": 8.755555555555556, "grad_norm": 4.596226215362549, "learning_rate": 3.7032374721632794e-08, "loss": 1.6544, "step": 59100 }, { "epoch": 8.762962962962963, "grad_norm": 4.076667308807373, "learning_rate": 3.482459554102979e-08, "loss": 1.656, "step": 59150 }, { "epoch": 8.77037037037037, "grad_norm": 6.138818264007568, "learning_rate": 3.2684550926512795e-08, "loss": 1.7229, "step": 59200 }, { "epoch": 8.777777777777779, "grad_norm": 3.7312471866607666, "learning_rate": 3.061225542423718e-08, "loss": 1.7003, "step": 59250 }, { "epoch": 8.785185185185185, "grad_norm": 4.663834571838379, "learning_rate": 2.8607723119858932e-08, "loss": 1.729, "step": 59300 }, { "epoch": 8.792592592592593, "grad_norm": 4.302097797393799, "learning_rate": 2.6670967638439127e-08, "loss": 1.7085, "step": 59350 }, { "epoch": 8.8, "grad_norm": 4.932514667510986, "learning_rate": 2.48020021443518e-08, "loss": 1.6708, "step": 59400 }, { "epoch": 8.807407407407407, "grad_norm": 3.9513580799102783, "learning_rate": 2.3000839341192905e-08, "loss": 1.7047, "step": 59450 }, { "epoch": 8.814814814814815, "grad_norm": 5.340576648712158, "learning_rate": 2.1267491471697043e-08, "loss": 1.6384, "step": 59500 }, { "epoch": 8.822222222222223, "grad_norm": 4.683200359344482, "learning_rate": 1.9601970317647546e-08, "loss": 1.6958, "step": 59550 }, { "epoch": 8.829629629629629, "grad_norm": 4.607886791229248, "learning_rate": 1.8004287199805403e-08, "loss": 1.7749, "step": 59600 }, { "epoch": 8.837037037037037, "grad_norm": 5.207304000854492, "learning_rate": 1.6474452977827127e-08, "loss": 1.7023, "step": 59650 }, { "epoch": 8.844444444444445, "grad_norm": 4.992751598358154, "learning_rate": 1.501247805018924e-08, "loss": 1.7879, "step": 59700 }, { "epoch": 8.851851851851851, "grad_norm": 5.176036834716797, "learning_rate": 1.3618372354121668e-08, "loss": 1.7696, "step": 59750 }, { "epoch": 8.85925925925926, "grad_norm": 6.48920202255249, "learning_rate": 1.22921453655378e-08, "loss": 1.7325, "step": 59800 }, { "epoch": 8.866666666666667, "grad_norm": 5.500480651855469, "learning_rate": 1.103380609897342e-08, "loss": 1.7892, "step": 59850 }, { "epoch": 8.874074074074073, "grad_norm": 5.276499271392822, "learning_rate": 9.843363107518988e-09, "loss": 1.7182, "step": 59900 }, { "epoch": 8.881481481481481, "grad_norm": 5.408885955810547, "learning_rate": 8.720824482767453e-09, "loss": 1.6658, "step": 59950 }, { "epoch": 8.88888888888889, "grad_norm": 4.236413955688477, "learning_rate": 7.66619785475653e-09, "loss": 1.6929, "step": 60000 }, { "epoch": 8.896296296296295, "grad_norm": 4.405470371246338, "learning_rate": 6.6794903919187306e-09, "loss": 1.6295, "step": 60050 }, { "epoch": 8.903703703703703, "grad_norm": 4.904160022735596, "learning_rate": 5.7607088010291914e-09, "loss": 1.6921, "step": 60100 }, { "epoch": 8.911111111111111, "grad_norm": 6.341813564300537, "learning_rate": 4.90985932716459e-09, "loss": 1.7889, "step": 60150 }, { "epoch": 8.918518518518518, "grad_norm": 6.032997131347656, "learning_rate": 4.126947753655408e-09, "loss": 1.7337, "step": 60200 }, { "epoch": 8.925925925925926, "grad_norm": 4.988321781158447, "learning_rate": 3.4119794020526233e-09, "loss": 1.6482, "step": 60250 }, { "epoch": 8.933333333333334, "grad_norm": 4.9094672203063965, "learning_rate": 2.764959132086631e-09, "loss": 1.7243, "step": 60300 }, { "epoch": 8.940740740740742, "grad_norm": 4.590478897094727, "learning_rate": 2.1858913416372696e-09, "loss": 1.7556, "step": 60350 }, { "epoch": 8.948148148148148, "grad_norm": 6.892956256866455, "learning_rate": 1.6747799667005128e-09, "loss": 1.6658, "step": 60400 }, { "epoch": 8.955555555555556, "grad_norm": 5.133352756500244, "learning_rate": 1.2316284813673751e-09, "loss": 1.6599, "step": 60450 }, { "epoch": 8.962962962962964, "grad_norm": 4.256348609924316, "learning_rate": 8.56439897793937e-10, "loss": 1.6971, "step": 60500 }, { "epoch": 8.97037037037037, "grad_norm": 4.697518825531006, "learning_rate": 5.492167661846903e-10, "loss": 1.6753, "step": 60550 }, { "epoch": 8.977777777777778, "grad_norm": 7.553053855895996, "learning_rate": 3.099611747747755e-10, "loss": 1.706, "step": 60600 }, { "epoch": 8.985185185185186, "grad_norm": 4.563235282897949, "learning_rate": 1.3867474981443807e-10, "loss": 1.6904, "step": 60650 }, { "epoch": 8.992592592592592, "grad_norm": 5.773245334625244, "learning_rate": 3.5358655559036834e-11, "loss": 1.8243, "step": 60700 }, { "epoch": 9.0, "grad_norm": 5.451295852661133, "learning_rate": 1.359426238245476e-14, "loss": 1.6434, "step": 60750 }, { "epoch": 9.007407407407408, "grad_norm": 4.893211841583252, "learning_rate": 4.895797438496442e-07, "loss": 1.7101, "step": 60800 }, { "epoch": 9.014814814814814, "grad_norm": 5.335631370544434, "learning_rate": 4.823600249171412e-07, "loss": 1.7203, "step": 60850 }, { "epoch": 9.022222222222222, "grad_norm": 6.353170871734619, "learning_rate": 4.7519262014055324e-07, "loss": 1.6979, "step": 60900 }, { "epoch": 9.02962962962963, "grad_norm": 6.264624118804932, "learning_rate": 4.6807756891585677e-07, "loss": 1.7308, "step": 60950 }, { "epoch": 9.037037037037036, "grad_norm": 3.9906938076019287, "learning_rate": 4.610149103512673e-07, "loss": 1.728, "step": 61000 }, { "epoch": 9.044444444444444, "grad_norm": 5.856201648712158, "learning_rate": 4.540046832670175e-07, "loss": 1.7142, "step": 61050 }, { "epoch": 9.051851851851852, "grad_norm": 4.669765949249268, "learning_rate": 4.4704692619515045e-07, "loss": 1.6873, "step": 61100 }, { "epoch": 9.059259259259258, "grad_norm": 4.824378967285156, "learning_rate": 4.4014167737930656e-07, "loss": 1.6526, "step": 61150 }, { "epoch": 9.066666666666666, "grad_norm": 4.9388041496276855, "learning_rate": 4.332889747745095e-07, "loss": 1.6905, "step": 61200 }, { "epoch": 9.074074074074074, "grad_norm": 4.846113204956055, "learning_rate": 4.2648885604696267e-07, "loss": 1.6202, "step": 61250 }, { "epoch": 9.081481481481482, "grad_norm": 6.360806465148926, "learning_rate": 4.197413585738408e-07, "loss": 1.7341, "step": 61300 }, { "epoch": 9.088888888888889, "grad_norm": 5.023980140686035, "learning_rate": 4.130465194430766e-07, "loss": 1.6094, "step": 61350 }, { "epoch": 9.096296296296297, "grad_norm": 4.734802722930908, "learning_rate": 4.064043754531699e-07, "loss": 1.6848, "step": 61400 }, { "epoch": 9.103703703703705, "grad_norm": 5.307347774505615, "learning_rate": 3.998149631129788e-07, "loss": 1.6636, "step": 61450 }, { "epoch": 9.11111111111111, "grad_norm": 5.5121235847473145, "learning_rate": 3.932783186415179e-07, "loss": 1.7264, "step": 61500 }, { "epoch": 9.118518518518519, "grad_norm": 4.755031585693359, "learning_rate": 3.8679447796776016e-07, "loss": 1.7101, "step": 61550 }, { "epoch": 9.125925925925927, "grad_norm": 6.262842655181885, "learning_rate": 3.8036347673044316e-07, "loss": 1.695, "step": 61600 }, { "epoch": 9.133333333333333, "grad_norm": 5.596631050109863, "learning_rate": 3.7398535027786455e-07, "loss": 1.7107, "step": 61650 }, { "epoch": 9.14074074074074, "grad_norm": 4.791952610015869, "learning_rate": 3.676601336676988e-07, "loss": 1.7179, "step": 61700 }, { "epoch": 9.148148148148149, "grad_norm": 5.435273170471191, "learning_rate": 3.613878616667954e-07, "loss": 1.7008, "step": 61750 }, { "epoch": 9.155555555555555, "grad_norm": 4.244542121887207, "learning_rate": 3.5516856875099314e-07, "loss": 1.7337, "step": 61800 }, { "epoch": 9.162962962962963, "grad_norm": 5.632518768310547, "learning_rate": 3.490022891049283e-07, "loss": 1.6774, "step": 61850 }, { "epoch": 9.170370370370371, "grad_norm": 5.066909313201904, "learning_rate": 3.428890566218457e-07, "loss": 1.643, "step": 61900 }, { "epoch": 9.177777777777777, "grad_norm": 5.419461250305176, "learning_rate": 3.368289049034179e-07, "loss": 1.6886, "step": 61950 }, { "epoch": 9.185185185185185, "grad_norm": 4.569416522979736, "learning_rate": 3.30821867259552e-07, "loss": 1.6416, "step": 62000 }, { "epoch": 9.192592592592593, "grad_norm": 5.561820983886719, "learning_rate": 3.24867976708213e-07, "loss": 1.7122, "step": 62050 }, { "epoch": 9.2, "grad_norm": 5.220768928527832, "learning_rate": 3.1896726597524074e-07, "loss": 1.6168, "step": 62100 }, { "epoch": 9.207407407407407, "grad_norm": 4.5533528327941895, "learning_rate": 3.1311976749416997e-07, "loss": 1.6806, "step": 62150 }, { "epoch": 9.214814814814815, "grad_norm": 4.327883243560791, "learning_rate": 3.0732551340605046e-07, "loss": 1.6956, "step": 62200 }, { "epoch": 9.222222222222221, "grad_norm": 5.154954433441162, "learning_rate": 3.015845355592728e-07, "loss": 1.7016, "step": 62250 }, { "epoch": 9.22962962962963, "grad_norm": 4.6678571701049805, "learning_rate": 2.958968655093919e-07, "loss": 1.6852, "step": 62300 }, { "epoch": 9.237037037037037, "grad_norm": 5.951033115386963, "learning_rate": 2.9026253451895357e-07, "loss": 1.6981, "step": 62350 }, { "epoch": 9.244444444444444, "grad_norm": 4.83495569229126, "learning_rate": 2.846815735573227e-07, "loss": 1.7676, "step": 62400 }, { "epoch": 9.251851851851852, "grad_norm": 4.016190052032471, "learning_rate": 2.791540133005144e-07, "loss": 1.6907, "step": 62450 }, { "epoch": 9.25925925925926, "grad_norm": 5.682389736175537, "learning_rate": 2.73679884131024e-07, "loss": 1.7596, "step": 62500 }, { "epoch": 9.266666666666667, "grad_norm": 5.448666095733643, "learning_rate": 2.682592161376607e-07, "loss": 1.7473, "step": 62550 }, { "epoch": 9.274074074074074, "grad_norm": 6.466638565063477, "learning_rate": 2.6289203911537884e-07, "loss": 1.6856, "step": 62600 }, { "epoch": 9.281481481481482, "grad_norm": 5.566060543060303, "learning_rate": 2.575783825651201e-07, "loss": 1.7308, "step": 62650 }, { "epoch": 9.28888888888889, "grad_norm": 4.0370025634765625, "learning_rate": 2.5231827569365044e-07, "loss": 1.7389, "step": 62700 }, { "epoch": 9.296296296296296, "grad_norm": 5.2997727394104, "learning_rate": 2.4711174741338996e-07, "loss": 1.7766, "step": 62750 }, { "epoch": 9.303703703703704, "grad_norm": 4.835376739501953, "learning_rate": 2.419588263422701e-07, "loss": 1.6751, "step": 62800 }, { "epoch": 9.311111111111112, "grad_norm": 4.635543346405029, "learning_rate": 2.3685954080356345e-07, "loss": 1.6458, "step": 62850 }, { "epoch": 9.318518518518518, "grad_norm": 5.237940311431885, "learning_rate": 2.318139188257318e-07, "loss": 1.6964, "step": 62900 }, { "epoch": 9.325925925925926, "grad_norm": 4.608352184295654, "learning_rate": 2.2682198814227395e-07, "loss": 1.7155, "step": 62950 }, { "epoch": 9.333333333333334, "grad_norm": 6.122480392456055, "learning_rate": 2.2188377619157374e-07, "loss": 1.7642, "step": 63000 }, { "epoch": 9.34074074074074, "grad_norm": 5.679895401000977, "learning_rate": 2.1699931011674225e-07, "loss": 1.7617, "step": 63050 }, { "epoch": 9.348148148148148, "grad_norm": 6.1035003662109375, "learning_rate": 2.1216861676547684e-07, "loss": 1.7427, "step": 63100 }, { "epoch": 9.355555555555556, "grad_norm": 5.812370300292969, "learning_rate": 2.073917226899147e-07, "loss": 1.7262, "step": 63150 }, { "epoch": 9.362962962962962, "grad_norm": 5.102630615234375, "learning_rate": 2.026686541464773e-07, "loss": 1.7531, "step": 63200 }, { "epoch": 9.37037037037037, "grad_norm": 5.473056793212891, "learning_rate": 1.9799943709573166e-07, "loss": 1.7032, "step": 63250 }, { "epoch": 9.377777777777778, "grad_norm": 5.401584625244141, "learning_rate": 1.9338409720224938e-07, "loss": 1.7035, "step": 63300 }, { "epoch": 9.385185185185184, "grad_norm": 4.778750419616699, "learning_rate": 1.8882265983446558e-07, "loss": 1.6414, "step": 63350 }, { "epoch": 9.392592592592592, "grad_norm": 4.272122859954834, "learning_rate": 1.8431515006453127e-07, "loss": 1.7321, "step": 63400 }, { "epoch": 9.4, "grad_norm": 4.675436496734619, "learning_rate": 1.7986159266818904e-07, "loss": 1.7496, "step": 63450 }, { "epoch": 9.407407407407407, "grad_norm": 5.439012050628662, "learning_rate": 1.7546201212462642e-07, "loss": 1.7051, "step": 63500 }, { "epoch": 9.414814814814815, "grad_norm": 4.923573970794678, "learning_rate": 1.7111643261634502e-07, "loss": 1.6861, "step": 63550 }, { "epoch": 9.422222222222222, "grad_norm": 4.010578632354736, "learning_rate": 1.6682487802902493e-07, "loss": 1.6795, "step": 63600 }, { "epoch": 9.42962962962963, "grad_norm": 5.064062118530273, "learning_rate": 1.625873719514004e-07, "loss": 1.6973, "step": 63650 }, { "epoch": 9.437037037037037, "grad_norm": 4.9291887283325195, "learning_rate": 1.5840393767512118e-07, "loss": 1.6806, "step": 63700 }, { "epoch": 9.444444444444445, "grad_norm": 5.1407341957092285, "learning_rate": 1.542745981946303e-07, "loss": 1.7001, "step": 63750 }, { "epoch": 9.451851851851853, "grad_norm": 5.599128723144531, "learning_rate": 1.5019937620703862e-07, "loss": 1.7244, "step": 63800 }, { "epoch": 9.459259259259259, "grad_norm": 4.7101945877075195, "learning_rate": 1.4617829411199492e-07, "loss": 1.7128, "step": 63850 }, { "epoch": 9.466666666666667, "grad_norm": 4.43619966506958, "learning_rate": 1.4221137401156492e-07, "loss": 1.6842, "step": 63900 }, { "epoch": 9.474074074074075, "grad_norm": 4.620861053466797, "learning_rate": 1.3829863771011253e-07, "loss": 1.6377, "step": 63950 }, { "epoch": 9.481481481481481, "grad_norm": 4.613444805145264, "learning_rate": 1.344401067141754e-07, "loss": 1.6818, "step": 64000 }, { "epoch": 9.488888888888889, "grad_norm": 5.942145347595215, "learning_rate": 1.3063580223235284e-07, "loss": 1.6883, "step": 64050 }, { "epoch": 9.496296296296297, "grad_norm": 4.702767848968506, "learning_rate": 1.268857451751826e-07, "loss": 1.7628, "step": 64100 }, { "epoch": 9.503703703703703, "grad_norm": 4.619032382965088, "learning_rate": 1.2318995615502983e-07, "loss": 1.7036, "step": 64150 }, { "epoch": 9.511111111111111, "grad_norm": 4.408839702606201, "learning_rate": 1.1954845548597162e-07, "loss": 1.7135, "step": 64200 }, { "epoch": 9.518518518518519, "grad_norm": 6.090911388397217, "learning_rate": 1.1596126318368928e-07, "loss": 1.7363, "step": 64250 }, { "epoch": 9.525925925925925, "grad_norm": 6.030341148376465, "learning_rate": 1.1242839896535407e-07, "loss": 1.6981, "step": 64300 }, { "epoch": 9.533333333333333, "grad_norm": 6.468939304351807, "learning_rate": 1.089498822495183e-07, "loss": 1.6546, "step": 64350 }, { "epoch": 9.540740740740741, "grad_norm": 4.483176231384277, "learning_rate": 1.0552573215601436e-07, "loss": 1.7018, "step": 64400 }, { "epoch": 9.548148148148147, "grad_norm": 4.81044864654541, "learning_rate": 1.0215596750584588e-07, "loss": 1.7386, "step": 64450 }, { "epoch": 9.555555555555555, "grad_norm": 5.380277633666992, "learning_rate": 9.88406068210801e-08, "loss": 1.634, "step": 64500 }, { "epoch": 9.562962962962963, "grad_norm": 5.343459606170654, "learning_rate": 9.557966832475341e-08, "loss": 1.7023, "step": 64550 }, { "epoch": 9.57037037037037, "grad_norm": 4.430337905883789, "learning_rate": 9.237316994076929e-08, "loss": 1.6313, "step": 64600 }, { "epoch": 9.577777777777778, "grad_norm": 7.060146331787109, "learning_rate": 8.922112929379501e-08, "loss": 1.7565, "step": 64650 }, { "epoch": 9.585185185185185, "grad_norm": 5.158222675323486, "learning_rate": 8.612356370917174e-08, "loss": 1.6618, "step": 64700 }, { "epoch": 9.592592592592592, "grad_norm": 4.8023362159729, "learning_rate": 8.308049021281461e-08, "loss": 1.729, "step": 64750 }, { "epoch": 9.6, "grad_norm": 5.179930686950684, "learning_rate": 8.009192553111833e-08, "loss": 1.7514, "step": 64800 }, { "epoch": 9.607407407407408, "grad_norm": 4.108941078186035, "learning_rate": 7.715788609087171e-08, "loss": 1.7692, "step": 64850 }, { "epoch": 9.614814814814816, "grad_norm": 4.883074760437012, "learning_rate": 7.427838801915887e-08, "loss": 1.6442, "step": 64900 }, { "epoch": 9.622222222222222, "grad_norm": 4.569499969482422, "learning_rate": 7.145344714327707e-08, "loss": 1.6202, "step": 64950 }, { "epoch": 9.62962962962963, "grad_norm": 4.791108131408691, "learning_rate": 6.868307899064675e-08, "loss": 1.7196, "step": 65000 }, { "epoch": 9.637037037037038, "grad_norm": 6.292260646820068, "learning_rate": 6.59672987887272e-08, "loss": 1.7988, "step": 65050 }, { "epoch": 9.644444444444444, "grad_norm": 5.572513103485107, "learning_rate": 6.330612146492998e-08, "loss": 1.7117, "step": 65100 }, { "epoch": 9.651851851851852, "grad_norm": 6.622492790222168, "learning_rate": 6.069956164654445e-08, "loss": 1.7011, "step": 65150 }, { "epoch": 9.65925925925926, "grad_norm": 5.106518268585205, "learning_rate": 5.8147633660647904e-08, "loss": 1.6777, "step": 65200 }, { "epoch": 9.666666666666666, "grad_norm": 4.711566925048828, "learning_rate": 5.565035153403231e-08, "loss": 1.7627, "step": 65250 }, { "epoch": 9.674074074074074, "grad_norm": 5.837401866912842, "learning_rate": 5.320772899312654e-08, "loss": 1.7406, "step": 65300 }, { "epoch": 9.681481481481482, "grad_norm": 5.084963798522949, "learning_rate": 5.081977946392092e-08, "loss": 1.6659, "step": 65350 }, { "epoch": 9.688888888888888, "grad_norm": 5.511571884155273, "learning_rate": 4.84865160718917e-08, "loss": 1.7159, "step": 65400 }, { "epoch": 9.696296296296296, "grad_norm": 6.268984317779541, "learning_rate": 4.620795164193004e-08, "loss": 1.7017, "step": 65450 }, { "epoch": 9.703703703703704, "grad_norm": 4.758915901184082, "learning_rate": 4.3984098698274245e-08, "loss": 1.6664, "step": 65500 }, { "epoch": 9.71111111111111, "grad_norm": 4.562999725341797, "learning_rate": 4.181496946443653e-08, "loss": 1.7089, "step": 65550 }, { "epoch": 9.718518518518518, "grad_norm": 5.656510829925537, "learning_rate": 3.970057586313747e-08, "loss": 1.7346, "step": 65600 }, { "epoch": 9.725925925925926, "grad_norm": 5.465713977813721, "learning_rate": 3.764092951623943e-08, "loss": 1.6746, "step": 65650 }, { "epoch": 9.733333333333333, "grad_norm": 5.345441818237305, "learning_rate": 3.563604174468771e-08, "loss": 1.7312, "step": 65700 }, { "epoch": 9.74074074074074, "grad_norm": 5.494285583496094, "learning_rate": 3.368592356844058e-08, "loss": 1.7035, "step": 65750 }, { "epoch": 9.748148148148148, "grad_norm": 5.0170135498046875, "learning_rate": 3.179058570641602e-08, "loss": 1.6879, "step": 65800 }, { "epoch": 9.755555555555556, "grad_norm": 6.04074239730835, "learning_rate": 2.99500385764262e-08, "loss": 1.6952, "step": 65850 }, { "epoch": 9.762962962962963, "grad_norm": 4.81399393081665, "learning_rate": 2.8164292295125294e-08, "loss": 1.7135, "step": 65900 }, { "epoch": 9.77037037037037, "grad_norm": 4.348941802978516, "learning_rate": 2.6433356677952883e-08, "loss": 1.6335, "step": 65950 }, { "epoch": 9.777777777777779, "grad_norm": 4.942220211029053, "learning_rate": 2.47572412390773e-08, "loss": 1.7366, "step": 66000 }, { "epoch": 9.785185185185185, "grad_norm": 4.79942512512207, "learning_rate": 2.3135955191345704e-08, "loss": 1.6299, "step": 66050 }, { "epoch": 9.792592592592593, "grad_norm": 5.222789287567139, "learning_rate": 2.1569507446232983e-08, "loss": 1.7802, "step": 66100 }, { "epoch": 9.8, "grad_norm": 5.1869940757751465, "learning_rate": 2.0057906613792922e-08, "loss": 1.7028, "step": 66150 }, { "epoch": 9.807407407407407, "grad_norm": 4.29283332824707, "learning_rate": 1.8601161002611555e-08, "loss": 1.7188, "step": 66200 }, { "epoch": 9.814814814814815, "grad_norm": 4.335317134857178, "learning_rate": 1.719927861975834e-08, "loss": 1.6448, "step": 66250 }, { "epoch": 9.822222222222223, "grad_norm": 4.7093634605407715, "learning_rate": 1.585226717074728e-08, "loss": 1.7305, "step": 66300 }, { "epoch": 9.829629629629629, "grad_norm": 5.174459934234619, "learning_rate": 1.4560134059488084e-08, "loss": 1.6758, "step": 66350 }, { "epoch": 9.837037037037037, "grad_norm": 4.238924503326416, "learning_rate": 1.3322886388252854e-08, "loss": 1.6736, "step": 66400 }, { "epoch": 9.844444444444445, "grad_norm": 5.224400043487549, "learning_rate": 1.214053095763168e-08, "loss": 1.8016, "step": 66450 }, { "epoch": 9.851851851851851, "grad_norm": 4.289679527282715, "learning_rate": 1.1013074266496005e-08, "loss": 1.6809, "step": 66500 }, { "epoch": 9.85925925925926, "grad_norm": 5.104315280914307, "learning_rate": 9.940522511965311e-09, "loss": 1.7685, "step": 66550 }, { "epoch": 9.866666666666667, "grad_norm": 6.96699857711792, "learning_rate": 8.922881589369381e-09, "loss": 1.7857, "step": 66600 }, { "epoch": 9.874074074074073, "grad_norm": 6.210925102233887, "learning_rate": 7.960157092221644e-09, "loss": 1.6882, "step": 66650 }, { "epoch": 9.881481481481481, "grad_norm": 4.829061985015869, "learning_rate": 7.052354312180321e-09, "loss": 1.6723, "step": 66700 }, { "epoch": 9.88888888888889, "grad_norm": 5.419042587280273, "learning_rate": 6.199478239027334e-09, "loss": 1.719, "step": 66750 }, { "epoch": 9.896296296296295, "grad_norm": 4.705636501312256, "learning_rate": 5.401533560636107e-09, "loss": 1.6663, "step": 66800 }, { "epoch": 9.903703703703703, "grad_norm": 4.650430202484131, "learning_rate": 4.658524662947139e-09, "loss": 1.6806, "step": 66850 }, { "epoch": 9.911111111111111, "grad_norm": 6.103776931762695, "learning_rate": 3.970455629942471e-09, "loss": 1.7272, "step": 66900 }, { "epoch": 9.918518518518518, "grad_norm": 4.458265781402588, "learning_rate": 3.337330243627923e-09, "loss": 1.6886, "step": 66950 }, { "epoch": 9.925925925925926, "grad_norm": 4.948592662811279, "learning_rate": 2.7591519840064473e-09, "loss": 1.6957, "step": 67000 }, { "epoch": 9.933333333333334, "grad_norm": 4.904621601104736, "learning_rate": 2.2359240290614757e-09, "loss": 1.68, "step": 67050 }, { "epoch": 9.940740740740742, "grad_norm": 4.690095901489258, "learning_rate": 1.7676492547402668e-09, "loss": 1.7341, "step": 67100 }, { "epoch": 9.948148148148148, "grad_norm": 7.331205368041992, "learning_rate": 1.354330234936141e-09, "loss": 1.7303, "step": 67150 }, { "epoch": 9.955555555555556, "grad_norm": 5.197513103485107, "learning_rate": 9.959692414784893e-10, "loss": 1.7282, "step": 67200 }, { "epoch": 9.962962962962964, "grad_norm": 6.541003227233887, "learning_rate": 6.925682441150106e-10, "loss": 1.6193, "step": 67250 }, { "epoch": 9.97037037037037, "grad_norm": 5.752736568450928, "learning_rate": 4.441289105017177e-10, "loss": 1.6806, "step": 67300 }, { "epoch": 9.977777777777778, "grad_norm": 4.675209999084473, "learning_rate": 2.5065260619960843e-10, "loss": 1.6589, "step": 67350 }, { "epoch": 9.985185185185186, "grad_norm": 4.999233722686768, "learning_rate": 1.121403946580113e-10, "loss": 1.7465, "step": 67400 }, { "epoch": 9.992592592592592, "grad_norm": 5.814984321594238, "learning_rate": 2.859303721791662e-11, "loss": 1.6395, "step": 67450 }, { "epoch": 10.0, "grad_norm": 5.604928016662598, "learning_rate": 1.0993098653599987e-14, "loss": 1.7599, "step": 67500 }, { "epoch": 10.007407407407408, "grad_norm": 5.701760292053223, "learning_rate": 4.046448759873345e-07, "loss": 1.7425, "step": 67550 }, { "epoch": 10.014814814814814, "grad_norm": 5.556209564208984, "learning_rate": 3.9866912851546425e-07, "loss": 1.6211, "step": 67600 }, { "epoch": 10.022222222222222, "grad_norm": 6.1393914222717285, "learning_rate": 3.9273693694206084e-07, "loss": 1.7484, "step": 67650 }, { "epoch": 10.02962962962963, "grad_norm": 6.207096099853516, "learning_rate": 3.8684832817817986e-07, "loss": 1.6913, "step": 67700 }, { "epoch": 10.037037037037036, "grad_norm": 5.700395107269287, "learning_rate": 3.8100332893716174e-07, "loss": 1.7092, "step": 67750 }, { "epoch": 10.044444444444444, "grad_norm": 4.307992458343506, "learning_rate": 3.752019657345196e-07, "loss": 1.6546, "step": 67800 }, { "epoch": 10.051851851851852, "grad_norm": 5.123924255371094, "learning_rate": 3.694442648878105e-07, "loss": 1.6957, "step": 67850 }, { "epoch": 10.059259259259258, "grad_norm": 5.588382244110107, "learning_rate": 3.6373025251652096e-07, "loss": 1.7285, "step": 67900 }, { "epoch": 10.066666666666666, "grad_norm": 5.349360466003418, "learning_rate": 3.580599545419483e-07, "loss": 1.6671, "step": 67950 }, { "epoch": 10.074074074074074, "grad_norm": 4.467996120452881, "learning_rate": 3.5243339668708075e-07, "loss": 1.7021, "step": 68000 }, { "epoch": 10.081481481481482, "grad_norm": 4.691610813140869, "learning_rate": 3.468506044764808e-07, "loss": 1.7114, "step": 68050 }, { "epoch": 10.088888888888889, "grad_norm": 4.518200397491455, "learning_rate": 3.413116032361741e-07, "loss": 1.7254, "step": 68100 }, { "epoch": 10.096296296296297, "grad_norm": 5.5028510093688965, "learning_rate": 3.358164180935275e-07, "loss": 1.7031, "step": 68150 }, { "epoch": 10.103703703703705, "grad_norm": 4.488427639007568, "learning_rate": 3.3036507397713915e-07, "loss": 1.7363, "step": 68200 }, { "epoch": 10.11111111111111, "grad_norm": 4.6745381355285645, "learning_rate": 3.2495759561672837e-07, "loss": 1.7013, "step": 68250 }, { "epoch": 10.118518518518519, "grad_norm": 6.56805419921875, "learning_rate": 3.195940075430137e-07, "loss": 1.7279, "step": 68300 }, { "epoch": 10.125925925925927, "grad_norm": 6.056013584136963, "learning_rate": 3.142743340876131e-07, "loss": 1.6627, "step": 68350 }, { "epoch": 10.133333333333333, "grad_norm": 4.359686374664307, "learning_rate": 3.089985993829281e-07, "loss": 1.6903, "step": 68400 }, { "epoch": 10.14074074074074, "grad_norm": 4.175838470458984, "learning_rate": 3.0376682736202866e-07, "loss": 1.6757, "step": 68450 }, { "epoch": 10.148148148148149, "grad_norm": 4.473140716552734, "learning_rate": 2.985790417585588e-07, "loss": 1.7066, "step": 68500 }, { "epoch": 10.155555555555555, "grad_norm": 4.994370937347412, "learning_rate": 2.9343526610661534e-07, "loss": 1.6348, "step": 68550 }, { "epoch": 10.162962962962963, "grad_norm": 4.70601224899292, "learning_rate": 2.883355237406471e-07, "loss": 1.7447, "step": 68600 }, { "epoch": 10.170370370370371, "grad_norm": 4.815600395202637, "learning_rate": 2.832798377953505e-07, "loss": 1.6319, "step": 68650 }, { "epoch": 10.177777777777777, "grad_norm": 5.1428937911987305, "learning_rate": 2.7826823120555955e-07, "loss": 1.7659, "step": 68700 }, { "epoch": 10.185185185185185, "grad_norm": 5.623711109161377, "learning_rate": 2.7330072670614604e-07, "loss": 1.6637, "step": 68750 }, { "epoch": 10.192592592592593, "grad_norm": 4.861164569854736, "learning_rate": 2.683773468319173e-07, "loss": 1.6388, "step": 68800 }, { "epoch": 10.2, "grad_norm": 4.522395133972168, "learning_rate": 2.6349811391750856e-07, "loss": 1.6704, "step": 68850 }, { "epoch": 10.207407407407407, "grad_norm": 4.430446147918701, "learning_rate": 2.586630500972853e-07, "loss": 1.6849, "step": 68900 }, { "epoch": 10.214814814814815, "grad_norm": 4.729742527008057, "learning_rate": 2.538721773052433e-07, "loss": 1.6672, "step": 68950 }, { "epoch": 10.222222222222221, "grad_norm": 6.109224319458008, "learning_rate": 2.491255172749085e-07, "loss": 1.6799, "step": 69000 }, { "epoch": 10.22962962962963, "grad_norm": 4.983971118927002, "learning_rate": 2.444230915392376e-07, "loss": 1.6689, "step": 69050 }, { "epoch": 10.237037037037037, "grad_norm": 5.701051235198975, "learning_rate": 2.397649214305198e-07, "loss": 1.6814, "step": 69100 }, { "epoch": 10.244444444444444, "grad_norm": 5.279200553894043, "learning_rate": 2.3515102808028378e-07, "loss": 1.6893, "step": 69150 }, { "epoch": 10.251851851851852, "grad_norm": 4.546329021453857, "learning_rate": 2.3058143241919906e-07, "loss": 1.7076, "step": 69200 }, { "epoch": 10.25925925925926, "grad_norm": 5.384993553161621, "learning_rate": 2.2605615517697576e-07, "loss": 1.734, "step": 69250 }, { "epoch": 10.266666666666667, "grad_norm": 4.743589878082275, "learning_rate": 2.2157521688228488e-07, "loss": 1.7385, "step": 69300 }, { "epoch": 10.274074074074074, "grad_norm": 5.3543291091918945, "learning_rate": 2.171386378626461e-07, "loss": 1.6681, "step": 69350 }, { "epoch": 10.281481481481482, "grad_norm": 4.380980491638184, "learning_rate": 2.127464382443545e-07, "loss": 1.6716, "step": 69400 }, { "epoch": 10.28888888888889, "grad_norm": 7.292746067047119, "learning_rate": 2.0839863795237724e-07, "loss": 1.747, "step": 69450 }, { "epoch": 10.296296296296296, "grad_norm": 4.844071388244629, "learning_rate": 2.0409525671026498e-07, "loss": 1.6245, "step": 69500 }, { "epoch": 10.303703703703704, "grad_norm": 4.968372821807861, "learning_rate": 1.99836314040065e-07, "loss": 1.6881, "step": 69550 }, { "epoch": 10.311111111111112, "grad_norm": 3.9518678188323975, "learning_rate": 1.956218292622325e-07, "loss": 1.6664, "step": 69600 }, { "epoch": 10.318518518518518, "grad_norm": 4.433720111846924, "learning_rate": 1.9145182149553966e-07, "loss": 1.6951, "step": 69650 }, { "epoch": 10.325925925925926, "grad_norm": 5.678008079528809, "learning_rate": 1.873263096569955e-07, "loss": 1.6644, "step": 69700 }, { "epoch": 10.333333333333334, "grad_norm": 6.03004789352417, "learning_rate": 1.832453124617495e-07, "loss": 1.7189, "step": 69750 }, { "epoch": 10.34074074074074, "grad_norm": 4.946279048919678, "learning_rate": 1.7920884842301922e-07, "loss": 1.6496, "step": 69800 }, { "epoch": 10.348148148148148, "grad_norm": 5.097282409667969, "learning_rate": 1.7521693585199505e-07, "loss": 1.6875, "step": 69850 }, { "epoch": 10.355555555555556, "grad_norm": 4.687585353851318, "learning_rate": 1.7126959285776456e-07, "loss": 1.6469, "step": 69900 }, { "epoch": 10.362962962962962, "grad_norm": 5.8932929039001465, "learning_rate": 1.6736683734722814e-07, "loss": 1.7221, "step": 69950 }, { "epoch": 10.37037037037037, "grad_norm": 4.6569743156433105, "learning_rate": 1.6350868702501798e-07, "loss": 1.6485, "step": 70000 }, { "epoch": 10.377777777777778, "grad_norm": 5.339687347412109, "learning_rate": 1.5969515939341485e-07, "loss": 1.7525, "step": 70050 }, { "epoch": 10.385185185185184, "grad_norm": 4.882029056549072, "learning_rate": 1.5592627175227248e-07, "loss": 1.7105, "step": 70100 }, { "epoch": 10.392592592592592, "grad_norm": 5.221914768218994, "learning_rate": 1.522020411989389e-07, "loss": 1.7217, "step": 70150 }, { "epoch": 10.4, "grad_norm": 5.761238098144531, "learning_rate": 1.4852248462817741e-07, "loss": 1.6958, "step": 70200 }, { "epoch": 10.407407407407407, "grad_norm": 5.501303672790527, "learning_rate": 1.4488761873209022e-07, "loss": 1.6864, "step": 70250 }, { "epoch": 10.414814814814815, "grad_norm": 4.387554168701172, "learning_rate": 1.4129746000004052e-07, "loss": 1.7137, "step": 70300 }, { "epoch": 10.422222222222222, "grad_norm": 4.2735419273376465, "learning_rate": 1.3775202471858263e-07, "loss": 1.6479, "step": 70350 }, { "epoch": 10.42962962962963, "grad_norm": 7.969890594482422, "learning_rate": 1.342513289713865e-07, "loss": 1.6789, "step": 70400 }, { "epoch": 10.437037037037037, "grad_norm": 4.56361198425293, "learning_rate": 1.3079538863915775e-07, "loss": 1.6735, "step": 70450 }, { "epoch": 10.444444444444445, "grad_norm": 5.747202396392822, "learning_rate": 1.2738421939958e-07, "loss": 1.7478, "step": 70500 }, { "epoch": 10.451851851851853, "grad_norm": 5.894442081451416, "learning_rate": 1.2401783672722822e-07, "loss": 1.6335, "step": 70550 }, { "epoch": 10.459259259259259, "grad_norm": 6.281641006469727, "learning_rate": 1.2069625589350987e-07, "loss": 1.7553, "step": 70600 }, { "epoch": 10.466666666666667, "grad_norm": 4.168921947479248, "learning_rate": 1.1741949196658942e-07, "loss": 1.6481, "step": 70650 }, { "epoch": 10.474074074074075, "grad_norm": 5.182560443878174, "learning_rate": 1.1418755981132179e-07, "loss": 1.7527, "step": 70700 }, { "epoch": 10.481481481481481, "grad_norm": 5.2377190589904785, "learning_rate": 1.1100047408918569e-07, "loss": 1.7067, "step": 70750 }, { "epoch": 10.488888888888889, "grad_norm": 5.3976263999938965, "learning_rate": 1.078582492582192e-07, "loss": 1.7029, "step": 70800 }, { "epoch": 10.496296296296297, "grad_norm": 4.8626604080200195, "learning_rate": 1.0476089957294545e-07, "loss": 1.653, "step": 70850 }, { "epoch": 10.503703703703703, "grad_norm": 4.868640422821045, "learning_rate": 1.0170843908432037e-07, "loss": 1.6797, "step": 70900 }, { "epoch": 10.511111111111111, "grad_norm": 6.026078224182129, "learning_rate": 9.87008816396573e-08, "loss": 1.7255, "step": 70950 }, { "epoch": 10.518518518518519, "grad_norm": 5.1598896980285645, "learning_rate": 9.573824088257244e-08, "loss": 1.7599, "step": 71000 }, { "epoch": 10.525925925925925, "grad_norm": 7.120807647705078, "learning_rate": 9.282053025291948e-08, "loss": 1.6233, "step": 71050 }, { "epoch": 10.533333333333333, "grad_norm": 4.827940940856934, "learning_rate": 8.994776298672847e-08, "loss": 1.7441, "step": 71100 }, { "epoch": 10.540740740740741, "grad_norm": 4.910730838775635, "learning_rate": 8.711995211614587e-08, "loss": 1.6672, "step": 71150 }, { "epoch": 10.548148148148147, "grad_norm": 4.025373458862305, "learning_rate": 8.43371104693802e-08, "loss": 1.641, "step": 71200 }, { "epoch": 10.555555555555555, "grad_norm": 5.08291482925415, "learning_rate": 8.159925067063423e-08, "loss": 1.6622, "step": 71250 }, { "epoch": 10.562962962962963, "grad_norm": 5.166038513183594, "learning_rate": 7.890638514005511e-08, "loss": 1.6466, "step": 71300 }, { "epoch": 10.57037037037037, "grad_norm": 4.054682731628418, "learning_rate": 7.625852609367546e-08, "loss": 1.7034, "step": 71350 }, { "epoch": 10.577777777777778, "grad_norm": 5.348453521728516, "learning_rate": 7.365568554336122e-08, "loss": 1.68, "step": 71400 }, { "epoch": 10.585185185185185, "grad_norm": 6.874485969543457, "learning_rate": 7.109787529675172e-08, "loss": 1.6973, "step": 71450 }, { "epoch": 10.592592592592592, "grad_norm": 3.6352336406707764, "learning_rate": 6.858510695720744e-08, "loss": 1.6651, "step": 71500 }, { "epoch": 10.6, "grad_norm": 4.947807788848877, "learning_rate": 6.611739192376344e-08, "loss": 1.6939, "step": 71550 }, { "epoch": 10.607407407407408, "grad_norm": 5.23864221572876, "learning_rate": 6.369474139107046e-08, "loss": 1.6455, "step": 71600 }, { "epoch": 10.614814814814816, "grad_norm": 4.680927276611328, "learning_rate": 6.131716634934504e-08, "loss": 1.7969, "step": 71650 }, { "epoch": 10.622222222222222, "grad_norm": 4.570713996887207, "learning_rate": 5.89846775843228e-08, "loss": 1.674, "step": 71700 }, { "epoch": 10.62962962962963, "grad_norm": 5.311273574829102, "learning_rate": 5.669728567720967e-08, "loss": 1.6284, "step": 71750 }, { "epoch": 10.637037037037038, "grad_norm": 5.45431661605835, "learning_rate": 5.4455001004629634e-08, "loss": 1.7311, "step": 71800 }, { "epoch": 10.644444444444444, "grad_norm": 5.246209621429443, "learning_rate": 5.225783373858151e-08, "loss": 1.6305, "step": 71850 }, { "epoch": 10.651851851851852, "grad_norm": 4.364826202392578, "learning_rate": 5.010579384639114e-08, "loss": 1.6551, "step": 71900 }, { "epoch": 10.65925925925926, "grad_norm": 4.401340961456299, "learning_rate": 4.799889109066591e-08, "loss": 1.6878, "step": 71950 }, { "epoch": 10.666666666666666, "grad_norm": 5.108217716217041, "learning_rate": 4.5937135029253675e-08, "loss": 1.6827, "step": 72000 }, { "epoch": 10.674074074074074, "grad_norm": 5.362737655639648, "learning_rate": 4.3920535015193886e-08, "loss": 1.7257, "step": 72050 }, { "epoch": 10.681481481481482, "grad_norm": 4.8335490226745605, "learning_rate": 4.194910019667875e-08, "loss": 1.7556, "step": 72100 }, { "epoch": 10.688888888888888, "grad_norm": 4.182542324066162, "learning_rate": 4.0022839517013246e-08, "loss": 1.7039, "step": 72150 }, { "epoch": 10.696296296296296, "grad_norm": 5.213995456695557, "learning_rate": 3.8141761714568514e-08, "loss": 1.7118, "step": 72200 }, { "epoch": 10.703703703703704, "grad_norm": 5.52512788772583, "learning_rate": 3.630587532275076e-08, "loss": 1.6531, "step": 72250 }, { "epoch": 10.71111111111111, "grad_norm": 4.814249515533447, "learning_rate": 3.4515188669954624e-08, "loss": 1.7214, "step": 72300 }, { "epoch": 10.718518518518518, "grad_norm": 5.535644054412842, "learning_rate": 3.276970987952877e-08, "loss": 1.639, "step": 72350 }, { "epoch": 10.725925925925926, "grad_norm": 4.050861358642578, "learning_rate": 3.1069446869741446e-08, "loss": 1.6789, "step": 72400 }, { "epoch": 10.733333333333333, "grad_norm": 4.139535903930664, "learning_rate": 2.941440735373835e-08, "loss": 1.6347, "step": 72450 }, { "epoch": 10.74074074074074, "grad_norm": 5.31736421585083, "learning_rate": 2.7804598839514806e-08, "loss": 1.7072, "step": 72500 }, { "epoch": 10.748148148148148, "grad_norm": 4.640048503875732, "learning_rate": 2.6240028629876958e-08, "loss": 1.7176, "step": 72550 }, { "epoch": 10.755555555555556, "grad_norm": 4.653814315795898, "learning_rate": 2.472070382240843e-08, "loss": 1.6947, "step": 72600 }, { "epoch": 10.762962962962963, "grad_norm": 4.801093578338623, "learning_rate": 2.3246631309441492e-08, "loss": 1.72, "step": 72650 }, { "epoch": 10.77037037037037, "grad_norm": 4.096927165985107, "learning_rate": 2.181781777802261e-08, "loss": 1.7342, "step": 72700 }, { "epoch": 10.777777777777779, "grad_norm": 4.117656707763672, "learning_rate": 2.0434269709885822e-08, "loss": 1.7043, "step": 72750 }, { "epoch": 10.785185185185185, "grad_norm": 5.932491302490234, "learning_rate": 1.909599338141832e-08, "loss": 1.7292, "step": 72800 }, { "epoch": 10.792592592592593, "grad_norm": 6.006489276885986, "learning_rate": 1.7802994863636013e-08, "loss": 1.7046, "step": 72850 }, { "epoch": 10.8, "grad_norm": 5.510446071624756, "learning_rate": 1.6555280022152454e-08, "loss": 1.7306, "step": 72900 }, { "epoch": 10.807407407407407, "grad_norm": 7.160583019256592, "learning_rate": 1.5352854517158843e-08, "loss": 1.7025, "step": 72950 }, { "epoch": 10.814814814814815, "grad_norm": 3.88580060005188, "learning_rate": 1.4195723803387407e-08, "loss": 1.7404, "step": 73000 }, { "epoch": 10.822222222222223, "grad_norm": 4.92537260055542, "learning_rate": 1.3083893130100278e-08, "loss": 1.6605, "step": 73050 }, { "epoch": 10.829629629629629, "grad_norm": 6.940608978271484, "learning_rate": 1.201736754105176e-08, "loss": 1.6139, "step": 73100 }, { "epoch": 10.837037037037037, "grad_norm": 5.638848781585693, "learning_rate": 1.0996151874478333e-08, "loss": 1.6273, "step": 73150 }, { "epoch": 10.844444444444445, "grad_norm": 4.141793251037598, "learning_rate": 1.0020250763064232e-08, "loss": 1.7212, "step": 73200 }, { "epoch": 10.851851851851851, "grad_norm": 5.597693920135498, "learning_rate": 9.08966863393257e-09, "loss": 1.6653, "step": 73250 }, { "epoch": 10.85925925925926, "grad_norm": 5.222171306610107, "learning_rate": 8.204409708616468e-09, "loss": 1.6919, "step": 73300 }, { "epoch": 10.866666666666667, "grad_norm": 5.148636817932129, "learning_rate": 7.3644780030424075e-09, "loss": 1.7524, "step": 73350 }, { "epoch": 10.874074074074073, "grad_norm": 5.547111988067627, "learning_rate": 6.569877327514684e-09, "loss": 1.6789, "step": 73400 }, { "epoch": 10.881481481481481, "grad_norm": 4.137996673583984, "learning_rate": 5.820611286693201e-09, "loss": 1.7212, "step": 73450 }, { "epoch": 10.88888888888889, "grad_norm": 5.037812232971191, "learning_rate": 5.116683279582369e-09, "loss": 1.6422, "step": 73500 }, { "epoch": 10.896296296296295, "grad_norm": 5.004727363586426, "learning_rate": 4.4580964995122325e-09, "loss": 1.6445, "step": 73550 }, { "epoch": 10.903703703703703, "grad_norm": 3.8220603466033936, "learning_rate": 3.844853934124038e-09, "loss": 1.6438, "step": 73600 }, { "epoch": 10.911111111111111, "grad_norm": 5.583298683166504, "learning_rate": 3.2769583653580185e-09, "loss": 1.7172, "step": 73650 }, { "epoch": 10.918518518518518, "grad_norm": 4.771687984466553, "learning_rate": 2.754412369441184e-09, "loss": 1.7834, "step": 73700 }, { "epoch": 10.925925925925926, "grad_norm": 7.343170642852783, "learning_rate": 2.277218316873997e-09, "loss": 1.6955, "step": 73750 }, { "epoch": 10.933333333333334, "grad_norm": 4.965358734130859, "learning_rate": 1.8453783724214913e-09, "loss": 1.7047, "step": 73800 }, { "epoch": 10.940740740740742, "grad_norm": 5.137145519256592, "learning_rate": 1.4588944950988394e-09, "loss": 1.7098, "step": 73850 }, { "epoch": 10.948148148148148, "grad_norm": 4.667434215545654, "learning_rate": 1.1177684381702414e-09, "loss": 1.7216, "step": 73900 }, { "epoch": 10.955555555555556, "grad_norm": 4.412621021270752, "learning_rate": 8.220017491344934e-10, "loss": 1.6661, "step": 73950 }, { "epoch": 10.962962962962964, "grad_norm": 4.227043151855469, "learning_rate": 5.715957697216556e-10, "loss": 1.7637, "step": 74000 }, { "epoch": 10.97037037037037, "grad_norm": 5.481570720672607, "learning_rate": 3.6655163588195097e-10, "loss": 1.6849, "step": 74050 }, { "epoch": 10.977777777777778, "grad_norm": 7.311110496520996, "learning_rate": 2.0687027778909518e-10, "loss": 1.6825, "step": 74100 }, { "epoch": 10.985185185185186, "grad_norm": 4.955511569976807, "learning_rate": 9.255241982697449e-11, "loss": 1.7145, "step": 74150 }, { "epoch": 10.992592592592592, "grad_norm": 5.094233989715576, "learning_rate": 2.359858059186593e-11, "loss": 1.7051, "step": 74200 }, { "epoch": 11.0, "grad_norm": 5.1888346672058105, "learning_rate": 9.07289132712208e-15, "loss": 1.7045, "step": 74250 }, { "epoch": 11.007407407407408, "grad_norm": 6.677313804626465, "learning_rate": 3.4000141682162035e-07, "loss": 1.7766, "step": 74300 }, { "epoch": 11.014814814814814, "grad_norm": 5.349052429199219, "learning_rate": 3.3497486947049575e-07, "loss": 1.7002, "step": 74350 }, { "epoch": 11.022222222222222, "grad_norm": 4.597198009490967, "learning_rate": 3.2998512237565005e-07, "loss": 1.6818, "step": 74400 }, { "epoch": 11.02962962962963, "grad_norm": 5.520595073699951, "learning_rate": 3.250321945358903e-07, "loss": 1.6704, "step": 74450 }, { "epoch": 11.037037037037036, "grad_norm": 6.03912353515625, "learning_rate": 3.201161048098367e-07, "loss": 1.6693, "step": 74500 }, { "epoch": 11.044444444444444, "grad_norm": 5.658355236053467, "learning_rate": 3.152368719158416e-07, "loss": 1.7106, "step": 74550 }, { "epoch": 11.051851851851852, "grad_norm": 5.297006130218506, "learning_rate": 3.1039451443192537e-07, "loss": 1.636, "step": 74600 }, { "epoch": 11.059259259259258, "grad_norm": 5.458147048950195, "learning_rate": 3.0558905079569933e-07, "loss": 1.6864, "step": 74650 }, { "epoch": 11.066666666666666, "grad_norm": 6.0984110832214355, "learning_rate": 3.008204993043029e-07, "loss": 1.6783, "step": 74700 }, { "epoch": 11.074074074074074, "grad_norm": 5.887842655181885, "learning_rate": 2.9608887811432674e-07, "loss": 1.7436, "step": 74750 }, { "epoch": 11.081481481481482, "grad_norm": 5.7084197998046875, "learning_rate": 2.9139420524174953e-07, "loss": 1.6846, "step": 74800 }, { "epoch": 11.088888888888889, "grad_norm": 4.7401323318481445, "learning_rate": 2.867364985618648e-07, "loss": 1.7038, "step": 74850 }, { "epoch": 11.096296296296297, "grad_norm": 6.002399444580078, "learning_rate": 2.82115775809213e-07, "loss": 1.7359, "step": 74900 }, { "epoch": 11.103703703703705, "grad_norm": 7.190243721008301, "learning_rate": 2.7753205457752174e-07, "loss": 1.72, "step": 74950 }, { "epoch": 11.11111111111111, "grad_norm": 5.5445146560668945, "learning_rate": 2.7298535231962464e-07, "loss": 1.6135, "step": 75000 }, { "epoch": 11.118518518518519, "grad_norm": 4.865965366363525, "learning_rate": 2.684756863474103e-07, "loss": 1.7384, "step": 75050 }, { "epoch": 11.125925925925927, "grad_norm": 5.077027320861816, "learning_rate": 2.6400307383174227e-07, "loss": 1.6512, "step": 75100 }, { "epoch": 11.133333333333333, "grad_norm": 4.601258277893066, "learning_rate": 2.595675318024093e-07, "loss": 1.7225, "step": 75150 }, { "epoch": 11.14074074074074, "grad_norm": 4.5170488357543945, "learning_rate": 2.5516907714804306e-07, "loss": 1.6548, "step": 75200 }, { "epoch": 11.148148148148149, "grad_norm": 4.254936218261719, "learning_rate": 2.508077266160669e-07, "loss": 1.6346, "step": 75250 }, { "epoch": 11.155555555555555, "grad_norm": 4.214375019073486, "learning_rate": 2.464834968126251e-07, "loss": 1.72, "step": 75300 }, { "epoch": 11.162962962962963, "grad_norm": 4.431253910064697, "learning_rate": 2.421964042025271e-07, "loss": 1.7185, "step": 75350 }, { "epoch": 11.170370370370371, "grad_norm": 6.110872745513916, "learning_rate": 2.3794646510917564e-07, "loss": 1.6903, "step": 75400 }, { "epoch": 11.177777777777777, "grad_norm": 4.43080472946167, "learning_rate": 2.3373369571450755e-07, "loss": 1.6936, "step": 75450 }, { "epoch": 11.185185185185185, "grad_norm": 5.446310520172119, "learning_rate": 2.295581120589363e-07, "loss": 1.7097, "step": 75500 }, { "epoch": 11.192592592592593, "grad_norm": 5.263556957244873, "learning_rate": 2.254197300412897e-07, "loss": 1.696, "step": 75550 }, { "epoch": 11.2, "grad_norm": 4.2185139656066895, "learning_rate": 2.213185654187433e-07, "loss": 1.6967, "step": 75600 }, { "epoch": 11.207407407407407, "grad_norm": 5.42978572845459, "learning_rate": 2.1725463380676824e-07, "loss": 1.7041, "step": 75650 }, { "epoch": 11.214814814814815, "grad_norm": 5.338979244232178, "learning_rate": 2.1322795067906688e-07, "loss": 1.6466, "step": 75700 }, { "epoch": 11.222222222222221, "grad_norm": 4.058988571166992, "learning_rate": 2.092385313675138e-07, "loss": 1.6315, "step": 75750 }, { "epoch": 11.22962962962963, "grad_norm": 5.591002464294434, "learning_rate": 2.0528639106210392e-07, "loss": 1.6745, "step": 75800 }, { "epoch": 11.237037037037037, "grad_norm": 4.783778190612793, "learning_rate": 2.013715448108855e-07, "loss": 1.693, "step": 75850 }, { "epoch": 11.244444444444444, "grad_norm": 4.906090259552002, "learning_rate": 1.974940075199061e-07, "loss": 1.7632, "step": 75900 }, { "epoch": 11.251851851851852, "grad_norm": 6.792934417724609, "learning_rate": 1.9365379395316243e-07, "loss": 1.6931, "step": 75950 }, { "epoch": 11.25925925925926, "grad_norm": 4.3067731857299805, "learning_rate": 1.898509187325337e-07, "loss": 1.6851, "step": 76000 }, { "epoch": 11.266666666666667, "grad_norm": 4.990530014038086, "learning_rate": 1.860853963377318e-07, "loss": 1.6251, "step": 76050 }, { "epoch": 11.274074074074074, "grad_norm": 5.153524875640869, "learning_rate": 1.8235724110624575e-07, "loss": 1.724, "step": 76100 }, { "epoch": 11.281481481481482, "grad_norm": 6.641872406005859, "learning_rate": 1.7866646723328608e-07, "loss": 1.754, "step": 76150 }, { "epoch": 11.28888888888889, "grad_norm": 4.482833385467529, "learning_rate": 1.7501308877173162e-07, "loss": 1.6947, "step": 76200 }, { "epoch": 11.296296296296296, "grad_norm": 5.8048014640808105, "learning_rate": 1.7139711963207517e-07, "loss": 1.6623, "step": 76250 }, { "epoch": 11.303703703703704, "grad_norm": 4.707399845123291, "learning_rate": 1.678185735823712e-07, "loss": 1.701, "step": 76300 }, { "epoch": 11.311111111111112, "grad_norm": 4.89939022064209, "learning_rate": 1.6427746424818258e-07, "loss": 1.6908, "step": 76350 }, { "epoch": 11.318518518518518, "grad_norm": 5.831413269042969, "learning_rate": 1.607738051125296e-07, "loss": 1.7078, "step": 76400 }, { "epoch": 11.325925925925926, "grad_norm": 7.055150985717773, "learning_rate": 1.573076095158399e-07, "loss": 1.7405, "step": 76450 }, { "epoch": 11.333333333333334, "grad_norm": 4.682511329650879, "learning_rate": 1.538788906558919e-07, "loss": 1.7782, "step": 76500 }, { "epoch": 11.34074074074074, "grad_norm": 6.273463726043701, "learning_rate": 1.5048766158777372e-07, "loss": 1.6904, "step": 76550 }, { "epoch": 11.348148148148148, "grad_norm": 5.013375759124756, "learning_rate": 1.4713393522382547e-07, "loss": 1.6795, "step": 76600 }, { "epoch": 11.355555555555556, "grad_norm": 4.531692028045654, "learning_rate": 1.4381772433359474e-07, "loss": 1.7108, "step": 76650 }, { "epoch": 11.362962962962962, "grad_norm": 5.173058032989502, "learning_rate": 1.405390415437835e-07, "loss": 1.7178, "step": 76700 }, { "epoch": 11.37037037037037, "grad_norm": 4.779514789581299, "learning_rate": 1.372978993382068e-07, "loss": 1.6769, "step": 76750 }, { "epoch": 11.377777777777778, "grad_norm": 5.0213117599487305, "learning_rate": 1.3409431005773855e-07, "loss": 1.808, "step": 76800 }, { "epoch": 11.385185185185184, "grad_norm": 5.859137535095215, "learning_rate": 1.3092828590026695e-07, "loss": 1.6981, "step": 76850 }, { "epoch": 11.392592592592592, "grad_norm": 5.248096942901611, "learning_rate": 1.277998389206514e-07, "loss": 1.6445, "step": 76900 }, { "epoch": 11.4, "grad_norm": 6.535493850708008, "learning_rate": 1.2470898103066896e-07, "loss": 1.6978, "step": 76950 }, { "epoch": 11.407407407407407, "grad_norm": 4.829598426818848, "learning_rate": 1.2165572399897908e-07, "loss": 1.7188, "step": 77000 }, { "epoch": 11.414814814814815, "grad_norm": 4.927868843078613, "learning_rate": 1.1864007945107004e-07, "loss": 1.7097, "step": 77050 }, { "epoch": 11.422222222222222, "grad_norm": 4.605199337005615, "learning_rate": 1.15662058869217e-07, "loss": 1.6289, "step": 77100 }, { "epoch": 11.42962962962963, "grad_norm": 7.104889869689941, "learning_rate": 1.1272167359244302e-07, "loss": 1.7027, "step": 77150 }, { "epoch": 11.437037037037037, "grad_norm": 4.649669170379639, "learning_rate": 1.0981893481646689e-07, "loss": 1.7473, "step": 77200 }, { "epoch": 11.444444444444445, "grad_norm": 4.562942981719971, "learning_rate": 1.0695385359367094e-07, "loss": 1.6945, "step": 77250 }, { "epoch": 11.451851851851853, "grad_norm": 5.604371547698975, "learning_rate": 1.0412644083305112e-07, "loss": 1.6964, "step": 77300 }, { "epoch": 11.459259259259259, "grad_norm": 4.250510215759277, "learning_rate": 1.0133670730017697e-07, "loss": 1.6596, "step": 77350 }, { "epoch": 11.466666666666667, "grad_norm": 4.702193737030029, "learning_rate": 9.858466361715502e-08, "loss": 1.6443, "step": 77400 }, { "epoch": 11.474074074074075, "grad_norm": 4.870285511016846, "learning_rate": 9.587032026258214e-08, "loss": 1.7001, "step": 77450 }, { "epoch": 11.481481481481481, "grad_norm": 4.658622741699219, "learning_rate": 9.319368757151004e-08, "loss": 1.7291, "step": 77500 }, { "epoch": 11.488888888888889, "grad_norm": 4.985848903656006, "learning_rate": 9.055477573540417e-08, "loss": 1.6981, "step": 77550 }, { "epoch": 11.496296296296297, "grad_norm": 4.855040550231934, "learning_rate": 8.795359480210374e-08, "loss": 1.6741, "step": 77600 }, { "epoch": 11.503703703703703, "grad_norm": 4.5977983474731445, "learning_rate": 8.53901546757896e-08, "loss": 1.6, "step": 77650 }, { "epoch": 11.511111111111111, "grad_norm": 5.066938400268555, "learning_rate": 8.286446511693635e-08, "loss": 1.7648, "step": 77700 }, { "epoch": 11.518518518518519, "grad_norm": 5.341429233551025, "learning_rate": 8.037653574228255e-08, "loss": 1.7957, "step": 77750 }, { "epoch": 11.525925925925925, "grad_norm": 4.630782127380371, "learning_rate": 7.79263760247928e-08, "loss": 1.6445, "step": 77800 }, { "epoch": 11.533333333333333, "grad_norm": 5.8252129554748535, "learning_rate": 7.551399529362125e-08, "loss": 1.6807, "step": 77850 }, { "epoch": 11.540740740740741, "grad_norm": 6.431312561035156, "learning_rate": 7.313940273407482e-08, "loss": 1.6229, "step": 77900 }, { "epoch": 11.548148148148147, "grad_norm": 5.191285133361816, "learning_rate": 7.080260738758227e-08, "loss": 1.6838, "step": 77950 }, { "epoch": 11.555555555555555, "grad_norm": 5.839999198913574, "learning_rate": 6.850361815165185e-08, "loss": 1.7031, "step": 78000 }, { "epoch": 11.562962962962963, "grad_norm": 4.388585567474365, "learning_rate": 6.62424437798459e-08, "loss": 1.7339, "step": 78050 }, { "epoch": 11.57037037037037, "grad_norm": 3.9459519386291504, "learning_rate": 6.401909288174523e-08, "loss": 1.7138, "step": 78100 }, { "epoch": 11.577777777777778, "grad_norm": 5.464336395263672, "learning_rate": 6.183357392291145e-08, "loss": 1.6937, "step": 78150 }, { "epoch": 11.585185185185185, "grad_norm": 5.290124893188477, "learning_rate": 5.968589522485912e-08, "loss": 1.6891, "step": 78200 }, { "epoch": 11.592592592592592, "grad_norm": 4.703418731689453, "learning_rate": 5.757606496502699e-08, "loss": 1.7016, "step": 78250 }, { "epoch": 11.6, "grad_norm": 6.593367099761963, "learning_rate": 5.550409117674016e-08, "loss": 1.6356, "step": 78300 }, { "epoch": 11.607407407407408, "grad_norm": 4.0880608558654785, "learning_rate": 5.346998174918128e-08, "loss": 1.6753, "step": 78350 }, { "epoch": 11.614814814814816, "grad_norm": 5.509574890136719, "learning_rate": 5.147374442736497e-08, "loss": 1.5876, "step": 78400 }, { "epoch": 11.622222222222222, "grad_norm": 5.011131286621094, "learning_rate": 4.951538681210455e-08, "loss": 1.6904, "step": 78450 }, { "epoch": 11.62962962962963, "grad_norm": 5.238678455352783, "learning_rate": 4.759491635998204e-08, "loss": 1.6404, "step": 78500 }, { "epoch": 11.637037037037038, "grad_norm": 5.725277423858643, "learning_rate": 4.571234038332262e-08, "loss": 1.6658, "step": 78550 }, { "epoch": 11.644444444444444, "grad_norm": 5.188413619995117, "learning_rate": 4.386766605016468e-08, "loss": 1.6906, "step": 78600 }, { "epoch": 11.651851851851852, "grad_norm": 5.059532165527344, "learning_rate": 4.206090038423649e-08, "loss": 1.6779, "step": 78650 }, { "epoch": 11.65925925925926, "grad_norm": 3.8585777282714844, "learning_rate": 4.029205026492178e-08, "loss": 1.6209, "step": 78700 }, { "epoch": 11.666666666666666, "grad_norm": 5.420309543609619, "learning_rate": 3.856112242724086e-08, "loss": 1.712, "step": 78750 }, { "epoch": 11.674074074074074, "grad_norm": 4.85215950012207, "learning_rate": 3.6868123461824e-08, "loss": 1.675, "step": 78800 }, { "epoch": 11.681481481481482, "grad_norm": 5.239144325256348, "learning_rate": 3.5213059814880326e-08, "loss": 1.7549, "step": 78850 }, { "epoch": 11.688888888888888, "grad_norm": 4.639777183532715, "learning_rate": 3.359593778818115e-08, "loss": 1.7411, "step": 78900 }, { "epoch": 11.696296296296296, "grad_norm": 5.959284782409668, "learning_rate": 3.201676353903005e-08, "loss": 1.6628, "step": 78950 }, { "epoch": 11.703703703703704, "grad_norm": 4.810410976409912, "learning_rate": 3.047554308024503e-08, "loss": 1.7053, "step": 79000 }, { "epoch": 11.71111111111111, "grad_norm": 5.420835971832275, "learning_rate": 2.8972282280128606e-08, "loss": 1.7336, "step": 79050 }, { "epoch": 11.718518518518518, "grad_norm": 4.595527172088623, "learning_rate": 2.7506986862451122e-08, "loss": 1.736, "step": 79100 }, { "epoch": 11.725925925925926, "grad_norm": 5.292618274688721, "learning_rate": 2.6079662406428564e-08, "loss": 1.6576, "step": 79150 }, { "epoch": 11.733333333333333, "grad_norm": 5.267721652984619, "learning_rate": 2.4690314346695888e-08, "loss": 1.6958, "step": 79200 }, { "epoch": 11.74074074074074, "grad_norm": 4.758387565612793, "learning_rate": 2.3338947973293724e-08, "loss": 1.6424, "step": 79250 }, { "epoch": 11.748148148148148, "grad_norm": 4.851783752441406, "learning_rate": 2.202556843164283e-08, "loss": 1.7366, "step": 79300 }, { "epoch": 11.755555555555556, "grad_norm": 5.405657768249512, "learning_rate": 2.0750180722529657e-08, "loss": 1.6203, "step": 79350 }, { "epoch": 11.762962962962963, "grad_norm": 4.255611896514893, "learning_rate": 1.9512789702078594e-08, "loss": 1.7053, "step": 79400 }, { "epoch": 11.77037037037037, "grad_norm": 4.81674861907959, "learning_rate": 1.8313400081744203e-08, "loss": 1.7215, "step": 79450 }, { "epoch": 11.777777777777779, "grad_norm": 4.983034610748291, "learning_rate": 1.7152016428285678e-08, "loss": 1.6665, "step": 79500 }, { "epoch": 11.785185185185185, "grad_norm": 6.600327014923096, "learning_rate": 1.60286431637513e-08, "loss": 1.7343, "step": 79550 }, { "epoch": 11.792592592592593, "grad_norm": 4.829182147979736, "learning_rate": 1.494328456546401e-08, "loss": 1.6987, "step": 79600 }, { "epoch": 11.8, "grad_norm": 4.379661560058594, "learning_rate": 1.3895944766001424e-08, "loss": 1.733, "step": 79650 }, { "epoch": 11.807407407407407, "grad_norm": 5.277597427368164, "learning_rate": 1.2886627753183611e-08, "loss": 1.5864, "step": 79700 }, { "epoch": 11.814814814814815, "grad_norm": 4.692282199859619, "learning_rate": 1.1915337370055347e-08, "loss": 1.7295, "step": 79750 }, { "epoch": 11.822222222222223, "grad_norm": 5.036397457122803, "learning_rate": 1.0982077314871664e-08, "loss": 1.6723, "step": 79800 }, { "epoch": 11.829629629629629, "grad_norm": 6.376845359802246, "learning_rate": 1.008685114108454e-08, "loss": 1.7038, "step": 79850 }, { "epoch": 11.837037037037037, "grad_norm": 4.599322319030762, "learning_rate": 9.229662257331794e-09, "loss": 1.7503, "step": 79900 }, { "epoch": 11.844444444444445, "grad_norm": 4.890994548797607, "learning_rate": 8.410513927419317e-09, "loss": 1.6431, "step": 79950 }, { "epoch": 11.851851851851851, "grad_norm": 4.89326810836792, "learning_rate": 7.629409270311083e-09, "loss": 1.744, "step": 80000 }, { "epoch": 11.85925925925926, "grad_norm": 6.547264099121094, "learning_rate": 6.8863512601169416e-09, "loss": 1.6933, "step": 80050 }, { "epoch": 11.866666666666667, "grad_norm": 4.861441135406494, "learning_rate": 6.181342726082618e-09, "loss": 1.7057, "step": 80100 }, { "epoch": 11.874074074074073, "grad_norm": 7.09140157699585, "learning_rate": 5.514386352577505e-09, "loss": 1.6498, "step": 80150 }, { "epoch": 11.881481481481481, "grad_norm": 5.269583225250244, "learning_rate": 4.885484679084673e-09, "loss": 1.694, "step": 80200 }, { "epoch": 11.88888888888889, "grad_norm": 4.094651699066162, "learning_rate": 4.294640100189762e-09, "loss": 1.7106, "step": 80250 }, { "epoch": 11.896296296296295, "grad_norm": 4.695838928222656, "learning_rate": 3.741854865574323e-09, "loss": 1.7345, "step": 80300 }, { "epoch": 11.903703703703703, "grad_norm": 4.913790702819824, "learning_rate": 3.2271310800091603e-09, "loss": 1.7232, "step": 80350 }, { "epoch": 11.911111111111111, "grad_norm": 6.038999557495117, "learning_rate": 2.750470703338781e-09, "loss": 1.6752, "step": 80400 }, { "epoch": 11.918518518518518, "grad_norm": 4.919781684875488, "learning_rate": 2.3118755504825117e-09, "loss": 1.6748, "step": 80450 }, { "epoch": 11.925925925925926, "grad_norm": 4.607057571411133, "learning_rate": 1.9113472914189524e-09, "loss": 1.6536, "step": 80500 }, { "epoch": 11.933333333333334, "grad_norm": 4.645559310913086, "learning_rate": 1.5488874511904174e-09, "loss": 1.661, "step": 80550 }, { "epoch": 11.940740740740742, "grad_norm": 4.680781841278076, "learning_rate": 1.224497409887393e-09, "loss": 1.7155, "step": 80600 }, { "epoch": 11.948148148148148, "grad_norm": 5.476153373718262, "learning_rate": 9.38178402646317e-10, "loss": 1.7228, "step": 80650 }, { "epoch": 11.955555555555556, "grad_norm": 4.937467575073242, "learning_rate": 6.899315196473577e-10, "loss": 1.6652, "step": 80700 }, { "epoch": 11.962962962962964, "grad_norm": 4.530655384063721, "learning_rate": 4.79757706107753e-10, "loss": 1.696, "step": 80750 }, { "epoch": 11.97037037037037, "grad_norm": 4.738536834716797, "learning_rate": 3.0765776227847934e-10, "loss": 1.6556, "step": 80800 }, { "epoch": 11.977777777777778, "grad_norm": 4.434654712677002, "learning_rate": 1.7363234344425217e-10, "loss": 1.723, "step": 80850 }, { "epoch": 11.985185185185186, "grad_norm": 4.7945475578308105, "learning_rate": 7.768195991353366e-11, "loss": 1.7508, "step": 80900 }, { "epoch": 11.992592592592592, "grad_norm": 5.558487415313721, "learning_rate": 1.9806977025194074e-11, "loss": 1.743, "step": 80950 }, { "epoch": 12.0, "grad_norm": 5.6761698722839355, "learning_rate": 7.615141850436659e-15, "loss": 1.6792, "step": 81000 }, { "epoch": 12.007407407407408, "grad_norm": 4.084346771240234, "learning_rate": 2.8967442665334466e-07, "loss": 1.7158, "step": 81050 }, { "epoch": 12.014814814814814, "grad_norm": 4.7874531745910645, "learning_rate": 2.8538830419264973e-07, "loss": 1.6393, "step": 81100 }, { "epoch": 12.022222222222222, "grad_norm": 5.466848373413086, "learning_rate": 2.8113366894419567e-07, "loss": 1.674, "step": 81150 }, { "epoch": 12.02962962962963, "grad_norm": 5.19240665435791, "learning_rate": 2.769105346981982e-07, "loss": 1.7633, "step": 81200 }, { "epoch": 12.037037037037036, "grad_norm": 6.463864326477051, "learning_rate": 2.727189151427723e-07, "loss": 1.6806, "step": 81250 }, { "epoch": 12.044444444444444, "grad_norm": 6.174072265625, "learning_rate": 2.685588238638881e-07, "loss": 1.7947, "step": 81300 }, { "epoch": 12.051851851851852, "grad_norm": 5.756320476531982, "learning_rate": 2.644302743453242e-07, "loss": 1.711, "step": 81350 }, { "epoch": 12.059259259259258, "grad_norm": 4.0832085609436035, "learning_rate": 2.6033327996862425e-07, "loss": 1.6688, "step": 81400 }, { "epoch": 12.066666666666666, "grad_norm": 3.832885980606079, "learning_rate": 2.562678540130592e-07, "loss": 1.6276, "step": 81450 }, { "epoch": 12.074074074074074, "grad_norm": 5.205352783203125, "learning_rate": 2.522340096555742e-07, "loss": 1.63, "step": 81500 }, { "epoch": 12.081481481481482, "grad_norm": 5.11032772064209, "learning_rate": 2.482317599707551e-07, "loss": 1.6658, "step": 81550 }, { "epoch": 12.088888888888889, "grad_norm": 5.3631672859191895, "learning_rate": 2.4426111793078076e-07, "loss": 1.7122, "step": 81600 }, { "epoch": 12.096296296296297, "grad_norm": 4.74794340133667, "learning_rate": 2.403220964053843e-07, "loss": 1.6735, "step": 81650 }, { "epoch": 12.103703703703705, "grad_norm": 5.673887252807617, "learning_rate": 2.3641470816180623e-07, "loss": 1.6518, "step": 81700 }, { "epoch": 12.11111111111111, "grad_norm": 7.255314826965332, "learning_rate": 2.3253896586476145e-07, "loss": 1.6672, "step": 81750 }, { "epoch": 12.118518518518519, "grad_norm": 4.749680995941162, "learning_rate": 2.2869488207638902e-07, "loss": 1.6897, "step": 81800 }, { "epoch": 12.125925925925927, "grad_norm": 6.6680707931518555, "learning_rate": 2.2488246925621682e-07, "loss": 1.6805, "step": 81850 }, { "epoch": 12.133333333333333, "grad_norm": 5.097541332244873, "learning_rate": 2.211017397611237e-07, "loss": 1.7271, "step": 81900 }, { "epoch": 12.14074074074074, "grad_norm": 4.618227005004883, "learning_rate": 2.1735270584529067e-07, "loss": 1.6749, "step": 81950 }, { "epoch": 12.148148148148149, "grad_norm": 4.820761203765869, "learning_rate": 2.136353796601698e-07, "loss": 1.6567, "step": 82000 }, { "epoch": 12.155555555555555, "grad_norm": 5.2633843421936035, "learning_rate": 2.0994977325443866e-07, "loss": 1.6774, "step": 82050 }, { "epoch": 12.162962962962963, "grad_norm": 4.625190734863281, "learning_rate": 2.0629589857396714e-07, "loss": 1.6732, "step": 82100 }, { "epoch": 12.170370370370371, "grad_norm": 4.52297830581665, "learning_rate": 2.026737674617729e-07, "loss": 1.6959, "step": 82150 }, { "epoch": 12.177777777777777, "grad_norm": 5.830942153930664, "learning_rate": 1.990833916579882e-07, "loss": 1.6977, "step": 82200 }, { "epoch": 12.185185185185185, "grad_norm": 5.219923973083496, "learning_rate": 1.9552478279981523e-07, "loss": 1.7054, "step": 82250 }, { "epoch": 12.192592592592593, "grad_norm": 4.906498432159424, "learning_rate": 1.9199795242149543e-07, "loss": 1.6458, "step": 82300 }, { "epoch": 12.2, "grad_norm": 4.760297775268555, "learning_rate": 1.8850291195427028e-07, "loss": 1.6583, "step": 82350 }, { "epoch": 12.207407407407407, "grad_norm": 5.414527416229248, "learning_rate": 1.850396727263415e-07, "loss": 1.7202, "step": 82400 }, { "epoch": 12.214814814814815, "grad_norm": 7.676029205322266, "learning_rate": 1.8160824596283432e-07, "loss": 1.6914, "step": 82450 }, { "epoch": 12.222222222222221, "grad_norm": 5.5357842445373535, "learning_rate": 1.7820864278576544e-07, "loss": 1.6447, "step": 82500 }, { "epoch": 12.22962962962963, "grad_norm": 4.6864776611328125, "learning_rate": 1.7484087421400175e-07, "loss": 1.7594, "step": 82550 }, { "epoch": 12.237037037037037, "grad_norm": 5.506605625152588, "learning_rate": 1.7150495116323162e-07, "loss": 1.6862, "step": 82600 }, { "epoch": 12.244444444444444, "grad_norm": 5.496425628662109, "learning_rate": 1.682008844459182e-07, "loss": 1.7077, "step": 82650 }, { "epoch": 12.251851851851852, "grad_norm": 4.794198989868164, "learning_rate": 1.6492868477127832e-07, "loss": 1.6838, "step": 82700 }, { "epoch": 12.25925925925926, "grad_norm": 5.178542137145996, "learning_rate": 1.616883627452348e-07, "loss": 1.6919, "step": 82750 }, { "epoch": 12.266666666666667, "grad_norm": 4.447923183441162, "learning_rate": 1.5847992887039086e-07, "loss": 1.6294, "step": 82800 }, { "epoch": 12.274074074074074, "grad_norm": 5.297534942626953, "learning_rate": 1.5530339354599354e-07, "loss": 1.7025, "step": 82850 }, { "epoch": 12.281481481481482, "grad_norm": 4.665591716766357, "learning_rate": 1.52158767067897e-07, "loss": 1.636, "step": 82900 }, { "epoch": 12.28888888888889, "grad_norm": 5.7167253494262695, "learning_rate": 1.490460596285348e-07, "loss": 1.7184, "step": 82950 }, { "epoch": 12.296296296296296, "grad_norm": 5.026512145996094, "learning_rate": 1.4596528131688327e-07, "loss": 1.6492, "step": 83000 }, { "epoch": 12.303703703703704, "grad_norm": 5.1102986335754395, "learning_rate": 1.4291644211842703e-07, "loss": 1.7015, "step": 83050 }, { "epoch": 12.311111111111112, "grad_norm": 6.56468391418457, "learning_rate": 1.398995519151314e-07, "loss": 1.6735, "step": 83100 }, { "epoch": 12.318518518518518, "grad_norm": 5.295648097991943, "learning_rate": 1.369146204854055e-07, "loss": 1.6439, "step": 83150 }, { "epoch": 12.325925925925926, "grad_norm": 4.968232154846191, "learning_rate": 1.3396165750407698e-07, "loss": 1.7418, "step": 83200 }, { "epoch": 12.333333333333334, "grad_norm": 5.407866954803467, "learning_rate": 1.3104067254235188e-07, "loss": 1.6915, "step": 83250 }, { "epoch": 12.34074074074074, "grad_norm": 4.8211588859558105, "learning_rate": 1.2815167506779136e-07, "loss": 1.6711, "step": 83300 }, { "epoch": 12.348148148148148, "grad_norm": 5.250539779663086, "learning_rate": 1.2529467444427846e-07, "loss": 1.6842, "step": 83350 }, { "epoch": 12.355555555555556, "grad_norm": 4.547904968261719, "learning_rate": 1.2246967993198467e-07, "loss": 1.684, "step": 83400 }, { "epoch": 12.362962962962962, "grad_norm": 4.18063497543335, "learning_rate": 1.196767006873445e-07, "loss": 1.6088, "step": 83450 }, { "epoch": 12.37037037037037, "grad_norm": 4.705265522003174, "learning_rate": 1.1691574576302323e-07, "loss": 1.7407, "step": 83500 }, { "epoch": 12.377777777777778, "grad_norm": 5.691997051239014, "learning_rate": 1.1418682410788917e-07, "loss": 1.6649, "step": 83550 }, { "epoch": 12.385185185185184, "grad_norm": 4.024935722351074, "learning_rate": 1.1148994456698258e-07, "loss": 1.6379, "step": 83600 }, { "epoch": 12.392592592592592, "grad_norm": 4.250906467437744, "learning_rate": 1.0882511588148792e-07, "loss": 1.6544, "step": 83650 }, { "epoch": 12.4, "grad_norm": 4.258973121643066, "learning_rate": 1.0619234668870826e-07, "loss": 1.6382, "step": 83700 }, { "epoch": 12.407407407407407, "grad_norm": 4.957441329956055, "learning_rate": 1.0359164552202982e-07, "loss": 1.6946, "step": 83750 }, { "epoch": 12.414814814814815, "grad_norm": 5.518395900726318, "learning_rate": 1.0102302081090531e-07, "loss": 1.7093, "step": 83800 }, { "epoch": 12.422222222222222, "grad_norm": 5.175966739654541, "learning_rate": 9.848648088081502e-08, "loss": 1.7392, "step": 83850 }, { "epoch": 12.42962962962963, "grad_norm": 4.846656322479248, "learning_rate": 9.598203395324912e-08, "loss": 1.752, "step": 83900 }, { "epoch": 12.437037037037037, "grad_norm": 4.2871551513671875, "learning_rate": 9.350968814567429e-08, "loss": 1.6824, "step": 83950 }, { "epoch": 12.444444444444445, "grad_norm": 4.269059181213379, "learning_rate": 9.106945147151381e-08, "loss": 1.6696, "step": 84000 }, { "epoch": 12.451851851851853, "grad_norm": 4.492150783538818, "learning_rate": 8.866133184011527e-08, "loss": 1.6966, "step": 84050 }, { "epoch": 12.459259259259259, "grad_norm": 5.761516094207764, "learning_rate": 8.62853370567296e-08, "loss": 1.6358, "step": 84100 }, { "epoch": 12.466666666666667, "grad_norm": 5.793025493621826, "learning_rate": 8.394147482248205e-08, "loss": 1.692, "step": 84150 }, { "epoch": 12.474074074074075, "grad_norm": 5.595602035522461, "learning_rate": 8.162975273435014e-08, "loss": 1.6476, "step": 84200 }, { "epoch": 12.481481481481481, "grad_norm": 5.4267659187316895, "learning_rate": 7.935017828513914e-08, "loss": 1.7016, "step": 84250 }, { "epoch": 12.488888888888889, "grad_norm": 4.711421966552734, "learning_rate": 7.710275886345542e-08, "loss": 1.6854, "step": 84300 }, { "epoch": 12.496296296296297, "grad_norm": 5.19965124130249, "learning_rate": 7.48875017536832e-08, "loss": 1.6466, "step": 84350 }, { "epoch": 12.503703703703703, "grad_norm": 4.186498641967773, "learning_rate": 7.27044141359634e-08, "loss": 1.7562, "step": 84400 }, { "epoch": 12.511111111111111, "grad_norm": 5.019369602203369, "learning_rate": 7.055350308616704e-08, "loss": 1.6783, "step": 84450 }, { "epoch": 12.518518518518519, "grad_norm": 5.182914733886719, "learning_rate": 6.843477557587408e-08, "loss": 1.7501, "step": 84500 }, { "epoch": 12.525925925925925, "grad_norm": 4.619785785675049, "learning_rate": 6.634823847235017e-08, "loss": 1.6467, "step": 84550 }, { "epoch": 12.533333333333333, "grad_norm": 4.456223011016846, "learning_rate": 6.429389853852553e-08, "loss": 1.71, "step": 84600 }, { "epoch": 12.540740740740741, "grad_norm": 7.912137985229492, "learning_rate": 6.227176243297272e-08, "loss": 1.8024, "step": 84650 }, { "epoch": 12.548148148148147, "grad_norm": 4.187272548675537, "learning_rate": 6.02818367098812e-08, "loss": 1.6879, "step": 84700 }, { "epoch": 12.555555555555555, "grad_norm": 5.394083023071289, "learning_rate": 5.832412781904384e-08, "loss": 1.6566, "step": 84750 }, { "epoch": 12.562962962962963, "grad_norm": 5.078476905822754, "learning_rate": 5.639864210582935e-08, "loss": 1.7164, "step": 84800 }, { "epoch": 12.57037037037037, "grad_norm": 5.757031440734863, "learning_rate": 5.450538581116327e-08, "loss": 1.6773, "step": 84850 }, { "epoch": 12.577777777777778, "grad_norm": 5.4857869148254395, "learning_rate": 5.264436507150916e-08, "loss": 1.7081, "step": 84900 }, { "epoch": 12.585185185185185, "grad_norm": 5.091628074645996, "learning_rate": 5.081558591884972e-08, "loss": 1.6757, "step": 84950 }, { "epoch": 12.592592592592592, "grad_norm": 5.476707458496094, "learning_rate": 4.901905428066345e-08, "loss": 1.6907, "step": 85000 }, { "epoch": 12.6, "grad_norm": 5.704261779785156, "learning_rate": 4.7254775979906906e-08, "loss": 1.6711, "step": 85050 }, { "epoch": 12.607407407407408, "grad_norm": 5.539083003997803, "learning_rate": 4.5522756734999173e-08, "loss": 1.708, "step": 85100 }, { "epoch": 12.614814814814816, "grad_norm": 5.648597240447998, "learning_rate": 4.382300215979851e-08, "loss": 1.6932, "step": 85150 }, { "epoch": 12.622222222222222, "grad_norm": 6.897541522979736, "learning_rate": 4.215551776358573e-08, "loss": 1.6701, "step": 85200 }, { "epoch": 12.62962962962963, "grad_norm": 5.8567938804626465, "learning_rate": 4.052030895104864e-08, "loss": 1.6799, "step": 85250 }, { "epoch": 12.637037037037038, "grad_norm": 4.133502006530762, "learning_rate": 3.891738102226206e-08, "loss": 1.7131, "step": 85300 }, { "epoch": 12.644444444444444, "grad_norm": 5.222926616668701, "learning_rate": 3.7346739172671177e-08, "loss": 1.6594, "step": 85350 }, { "epoch": 12.651851851851852, "grad_norm": 5.704329967498779, "learning_rate": 3.5808388493072665e-08, "loss": 1.692, "step": 85400 }, { "epoch": 12.65925925925926, "grad_norm": 4.969203472137451, "learning_rate": 3.43023339696058e-08, "loss": 1.7122, "step": 85450 }, { "epoch": 12.666666666666666, "grad_norm": 4.0226731300354, "learning_rate": 3.282858048372583e-08, "loss": 1.6186, "step": 85500 }, { "epoch": 12.674074074074074, "grad_norm": 5.419471740722656, "learning_rate": 3.1387132812193963e-08, "loss": 1.682, "step": 85550 }, { "epoch": 12.681481481481482, "grad_norm": 5.011806488037109, "learning_rate": 2.997799562706294e-08, "loss": 1.6846, "step": 85600 }, { "epoch": 12.688888888888888, "grad_norm": 4.759771347045898, "learning_rate": 2.8601173495659274e-08, "loss": 1.7007, "step": 85650 }, { "epoch": 12.696296296296296, "grad_norm": 5.667296886444092, "learning_rate": 2.7256670880568826e-08, "loss": 1.6965, "step": 85700 }, { "epoch": 12.703703703703704, "grad_norm": 5.863560676574707, "learning_rate": 2.5944492139623467e-08, "loss": 1.7107, "step": 85750 }, { "epoch": 12.71111111111111, "grad_norm": 5.254322052001953, "learning_rate": 2.4664641525884436e-08, "loss": 1.6653, "step": 85800 }, { "epoch": 12.718518518518518, "grad_norm": 5.583574295043945, "learning_rate": 2.341712318763123e-08, "loss": 1.6571, "step": 85850 }, { "epoch": 12.725925925925926, "grad_norm": 5.015530586242676, "learning_rate": 2.2201941168349393e-08, "loss": 1.7041, "step": 85900 }, { "epoch": 12.733333333333333, "grad_norm": 6.795839309692383, "learning_rate": 2.1019099406712762e-08, "loss": 1.6937, "step": 85950 }, { "epoch": 12.74074074074074, "grad_norm": 4.828438758850098, "learning_rate": 1.9868601736573458e-08, "loss": 1.6692, "step": 86000 }, { "epoch": 12.748148148148148, "grad_norm": 4.375940322875977, "learning_rate": 1.8750451886950795e-08, "loss": 1.743, "step": 86050 }, { "epoch": 12.755555555555556, "grad_norm": 5.71312141418457, "learning_rate": 1.7664653482015737e-08, "loss": 1.7376, "step": 86100 }, { "epoch": 12.762962962962963, "grad_norm": 4.118297100067139, "learning_rate": 1.6611210041080906e-08, "loss": 1.6836, "step": 86150 }, { "epoch": 12.77037037037037, "grad_norm": 4.494056224822998, "learning_rate": 1.5590124978592803e-08, "loss": 1.6745, "step": 86200 }, { "epoch": 12.777777777777779, "grad_norm": 5.0475993156433105, "learning_rate": 1.460140160411294e-08, "loss": 1.697, "step": 86250 }, { "epoch": 12.785185185185185, "grad_norm": 5.805638790130615, "learning_rate": 1.3645043122313407e-08, "loss": 1.6773, "step": 86300 }, { "epoch": 12.792592592592593, "grad_norm": 4.78032112121582, "learning_rate": 1.2721052632964637e-08, "loss": 1.5882, "step": 86350 }, { "epoch": 12.8, "grad_norm": 5.578061103820801, "learning_rate": 1.1829433130924328e-08, "loss": 1.6681, "step": 86400 }, { "epoch": 12.807407407407407, "grad_norm": 6.236872673034668, "learning_rate": 1.097018750612966e-08, "loss": 1.7019, "step": 86450 }, { "epoch": 12.814814814814815, "grad_norm": 5.2029571533203125, "learning_rate": 1.0143318543585079e-08, "loss": 1.7867, "step": 86500 }, { "epoch": 12.822222222222223, "grad_norm": 4.312541961669922, "learning_rate": 9.348828923358977e-09, "loss": 1.7128, "step": 86550 }, { "epoch": 12.829629629629629, "grad_norm": 4.430631637573242, "learning_rate": 8.586721220565918e-09, "loss": 1.6753, "step": 86600 }, { "epoch": 12.837037037037037, "grad_norm": 4.498734951019287, "learning_rate": 7.856997905367758e-09, "loss": 1.6709, "step": 86650 }, { "epoch": 12.844444444444445, "grad_norm": 5.665780067443848, "learning_rate": 7.159661342958091e-09, "loss": 1.6619, "step": 86700 }, { "epoch": 12.851851851851851, "grad_norm": 5.194167613983154, "learning_rate": 6.494713793561147e-09, "loss": 1.6525, "step": 86750 }, { "epoch": 12.85925925925926, "grad_norm": 4.512106418609619, "learning_rate": 5.862157412419578e-09, "loss": 1.7062, "step": 86800 }, { "epoch": 12.866666666666667, "grad_norm": 4.46773624420166, "learning_rate": 5.261994249786684e-09, "loss": 1.6675, "step": 86850 }, { "epoch": 12.874074074074073, "grad_norm": 5.507213592529297, "learning_rate": 4.694226250926415e-09, "loss": 1.6859, "step": 86900 }, { "epoch": 12.881481481481481, "grad_norm": 7.564279079437256, "learning_rate": 4.158855256101157e-09, "loss": 1.6731, "step": 86950 }, { "epoch": 12.88888888888889, "grad_norm": 4.482030868530273, "learning_rate": 3.655883000565075e-09, "loss": 1.711, "step": 87000 }, { "epoch": 12.896296296296295, "grad_norm": 4.927270412445068, "learning_rate": 3.185311114565215e-09, "loss": 1.6582, "step": 87050 }, { "epoch": 12.903703703703703, "grad_norm": 5.129358291625977, "learning_rate": 2.74714112332819e-09, "loss": 1.6187, "step": 87100 }, { "epoch": 12.911111111111111, "grad_norm": 4.1775102615356445, "learning_rate": 2.341374447060174e-09, "loss": 1.7252, "step": 87150 }, { "epoch": 12.918518518518518, "grad_norm": 4.9707159996032715, "learning_rate": 1.9680124009413548e-09, "loss": 1.6923, "step": 87200 }, { "epoch": 12.925925925925926, "grad_norm": 5.600582122802734, "learning_rate": 1.62705619512038e-09, "loss": 1.6414, "step": 87250 }, { "epoch": 12.933333333333334, "grad_norm": 4.928083896636963, "learning_rate": 1.3185069347121382e-09, "loss": 1.7132, "step": 87300 }, { "epoch": 12.940740740740742, "grad_norm": 4.8590264320373535, "learning_rate": 1.0423656197944275e-09, "loss": 1.6717, "step": 87350 }, { "epoch": 12.948148148148148, "grad_norm": 4.67088508605957, "learning_rate": 7.986331454012952e-10, "loss": 1.6741, "step": 87400 }, { "epoch": 12.955555555555556, "grad_norm": 5.243936061859131, "learning_rate": 5.873103015241466e-10, "loss": 1.6741, "step": 87450 }, { "epoch": 12.962962962962964, "grad_norm": 5.689966678619385, "learning_rate": 4.083977731073052e-10, "loss": 1.6958, "step": 87500 }, { "epoch": 12.97037037037037, "grad_norm": 5.250327110290527, "learning_rate": 2.61896140045792e-10, "loss": 1.6868, "step": 87550 }, { "epoch": 12.977777777777778, "grad_norm": 4.910762310028076, "learning_rate": 1.4780587718421503e-10, "loss": 1.6528, "step": 87600 }, { "epoch": 12.985185185185186, "grad_norm": 3.781712532043457, "learning_rate": 6.612735431343886e-11, "loss": 1.6715, "step": 87650 }, { "epoch": 12.992592592592592, "grad_norm": 5.627158164978027, "learning_rate": 1.6860836172805095e-11, "loss": 1.6577, "step": 87700 }, { "epoch": 13.0, "grad_norm": 4.692312717437744, "learning_rate": 6.4824456913470394e-15, "loss": 1.7212, "step": 87750 }, { "epoch": 13.007407407407408, "grad_norm": 4.713183879852295, "learning_rate": 2.497337918370513e-07, "loss": 1.7233, "step": 87800 }, { "epoch": 13.014814814814814, "grad_norm": 4.382723331451416, "learning_rate": 2.460361831871905e-07, "loss": 1.6443, "step": 87850 }, { "epoch": 13.022222222222222, "grad_norm": 5.862541675567627, "learning_rate": 2.423658119020356e-07, "loss": 1.7126, "step": 87900 }, { "epoch": 13.02962962962963, "grad_norm": 4.680122375488281, "learning_rate": 2.3872268823087907e-07, "loss": 1.661, "step": 87950 }, { "epoch": 13.037037037037036, "grad_norm": 5.052659511566162, "learning_rate": 2.3510682234692773e-07, "loss": 1.6396, "step": 88000 }, { "epoch": 13.044444444444444, "grad_norm": 5.200165271759033, "learning_rate": 2.3151822434727246e-07, "loss": 1.6467, "step": 88050 }, { "epoch": 13.051851851851852, "grad_norm": 6.37606143951416, "learning_rate": 2.2795690425286065e-07, "loss": 1.6887, "step": 88100 }, { "epoch": 13.059259259259258, "grad_norm": 4.551187515258789, "learning_rate": 2.2442287200846512e-07, "loss": 1.6731, "step": 88150 }, { "epoch": 13.066666666666666, "grad_norm": 4.3722052574157715, "learning_rate": 2.2091613748266183e-07, "loss": 1.7147, "step": 88200 }, { "epoch": 13.074074074074074, "grad_norm": 4.7190775871276855, "learning_rate": 2.174367104677999e-07, "loss": 1.7001, "step": 88250 }, { "epoch": 13.081481481481482, "grad_norm": 5.733537197113037, "learning_rate": 2.1398460067997174e-07, "loss": 1.6306, "step": 88300 }, { "epoch": 13.088888888888889, "grad_norm": 4.682885646820068, "learning_rate": 2.105598177589896e-07, "loss": 1.7563, "step": 88350 }, { "epoch": 13.096296296296297, "grad_norm": 5.364824295043945, "learning_rate": 2.071623712683557e-07, "loss": 1.7217, "step": 88400 }, { "epoch": 13.103703703703705, "grad_norm": 5.048696517944336, "learning_rate": 2.0379227069523776e-07, "loss": 1.6839, "step": 88450 }, { "epoch": 13.11111111111111, "grad_norm": 4.838836669921875, "learning_rate": 2.0044952545044238e-07, "loss": 1.7394, "step": 88500 }, { "epoch": 13.118518518518519, "grad_norm": 4.3506340980529785, "learning_rate": 1.97134144868385e-07, "loss": 1.6381, "step": 88550 }, { "epoch": 13.125925925925927, "grad_norm": 4.708609104156494, "learning_rate": 1.9384613820707e-07, "loss": 1.6736, "step": 88600 }, { "epoch": 13.133333333333333, "grad_norm": 5.755831241607666, "learning_rate": 1.9058551464806175e-07, "loss": 1.7046, "step": 88650 }, { "epoch": 13.14074074074074, "grad_norm": 5.2482686042785645, "learning_rate": 1.8735228329645805e-07, "loss": 1.6601, "step": 88700 }, { "epoch": 13.148148148148149, "grad_norm": 5.522831916809082, "learning_rate": 1.8414645318086344e-07, "loss": 1.7117, "step": 88750 }, { "epoch": 13.155555555555555, "grad_norm": 6.40302038192749, "learning_rate": 1.809680332533692e-07, "loss": 1.7129, "step": 88800 }, { "epoch": 13.162962962962963, "grad_norm": 5.780543804168701, "learning_rate": 1.7781703238952564e-07, "loss": 1.7132, "step": 88850 }, { "epoch": 13.170370370370371, "grad_norm": 4.71435022354126, "learning_rate": 1.7469345938831316e-07, "loss": 1.7371, "step": 88900 }, { "epoch": 13.177777777777777, "grad_norm": 6.810309886932373, "learning_rate": 1.715973229721246e-07, "loss": 1.6313, "step": 88950 }, { "epoch": 13.185185185185185, "grad_norm": 5.402350902557373, "learning_rate": 1.685286317867374e-07, "loss": 1.6334, "step": 89000 }, { "epoch": 13.192592592592593, "grad_norm": 4.750962257385254, "learning_rate": 1.6548739440129025e-07, "loss": 1.7537, "step": 89050 }, { "epoch": 13.2, "grad_norm": 5.475610256195068, "learning_rate": 1.6247361930825546e-07, "loss": 1.6997, "step": 89100 }, { "epoch": 13.207407407407407, "grad_norm": 4.554515838623047, "learning_rate": 1.5948731492342328e-07, "loss": 1.6814, "step": 89150 }, { "epoch": 13.214814814814815, "grad_norm": 4.694421768188477, "learning_rate": 1.5652848958587097e-07, "loss": 1.657, "step": 89200 }, { "epoch": 13.222222222222221, "grad_norm": 5.7868547439575195, "learning_rate": 1.5359715155794374e-07, "loss": 1.7237, "step": 89250 }, { "epoch": 13.22962962962963, "grad_norm": 4.124547958374023, "learning_rate": 1.5069330902522495e-07, "loss": 1.5919, "step": 89300 }, { "epoch": 13.237037037037037, "grad_norm": 4.9132208824157715, "learning_rate": 1.4781697009652595e-07, "loss": 1.678, "step": 89350 }, { "epoch": 13.244444444444444, "grad_norm": 5.06511926651001, "learning_rate": 1.4496814280385297e-07, "loss": 1.7352, "step": 89400 }, { "epoch": 13.251851851851852, "grad_norm": 4.450133323669434, "learning_rate": 1.421468351023836e-07, "loss": 1.6572, "step": 89450 }, { "epoch": 13.25925925925926, "grad_norm": 4.740837097167969, "learning_rate": 1.393530548704536e-07, "loss": 1.6888, "step": 89500 }, { "epoch": 13.266666666666667, "grad_norm": 5.064001083374023, "learning_rate": 1.3658680990953021e-07, "loss": 1.713, "step": 89550 }, { "epoch": 13.274074074074074, "grad_norm": 5.2124481201171875, "learning_rate": 1.3384810794418669e-07, "loss": 1.6592, "step": 89600 }, { "epoch": 13.281481481481482, "grad_norm": 3.9894776344299316, "learning_rate": 1.3113695662208436e-07, "loss": 1.6943, "step": 89650 }, { "epoch": 13.28888888888889, "grad_norm": 5.926461219787598, "learning_rate": 1.2845336351395398e-07, "loss": 1.718, "step": 89700 }, { "epoch": 13.296296296296296, "grad_norm": 4.480943202972412, "learning_rate": 1.2579733611357004e-07, "loss": 1.6878, "step": 89750 }, { "epoch": 13.303703703703704, "grad_norm": 5.713892936706543, "learning_rate": 1.231688818377297e-07, "loss": 1.6483, "step": 89800 }, { "epoch": 13.311111111111112, "grad_norm": 5.857698440551758, "learning_rate": 1.2056800802623724e-07, "loss": 1.6905, "step": 89850 }, { "epoch": 13.318518518518518, "grad_norm": 4.647960662841797, "learning_rate": 1.1799472194187755e-07, "loss": 1.6675, "step": 89900 }, { "epoch": 13.325925925925926, "grad_norm": 6.600680828094482, "learning_rate": 1.1544903077039926e-07, "loss": 1.736, "step": 89950 }, { "epoch": 13.333333333333334, "grad_norm": 5.771788120269775, "learning_rate": 1.1293094162049378e-07, "loss": 1.6509, "step": 90000 }, { "epoch": 13.34074074074074, "grad_norm": 4.439211845397949, "learning_rate": 1.104404615237753e-07, "loss": 1.7102, "step": 90050 }, { "epoch": 13.348148148148148, "grad_norm": 5.149127960205078, "learning_rate": 1.0797759743476299e-07, "loss": 1.6438, "step": 90100 }, { "epoch": 13.355555555555556, "grad_norm": 4.4529876708984375, "learning_rate": 1.0554235623085662e-07, "loss": 1.705, "step": 90150 }, { "epoch": 13.362962962962962, "grad_norm": 5.989099979400635, "learning_rate": 1.0313474471232321e-07, "loss": 1.6944, "step": 90200 }, { "epoch": 13.37037037037037, "grad_norm": 5.291862487792969, "learning_rate": 1.0075476960227481e-07, "loss": 1.6965, "step": 90250 }, { "epoch": 13.377777777777778, "grad_norm": 5.628286838531494, "learning_rate": 9.84024375466508e-08, "loss": 1.691, "step": 90300 }, { "epoch": 13.385185185185184, "grad_norm": 4.2257914543151855, "learning_rate": 9.607775511420003e-08, "loss": 1.6602, "step": 90350 }, { "epoch": 13.392592592592592, "grad_norm": 5.1985087394714355, "learning_rate": 9.37807287964576e-08, "loss": 1.6339, "step": 90400 }, { "epoch": 13.4, "grad_norm": 5.6320366859436035, "learning_rate": 9.151136500773594e-08, "loss": 1.7014, "step": 90450 }, { "epoch": 13.407407407407407, "grad_norm": 4.394415378570557, "learning_rate": 8.92696700850959e-08, "loss": 1.6796, "step": 90500 }, { "epoch": 13.414814814814815, "grad_norm": 5.783812046051025, "learning_rate": 8.705565028833906e-08, "loss": 1.7261, "step": 90550 }, { "epoch": 13.422222222222222, "grad_norm": 5.535831451416016, "learning_rate": 8.486931179998325e-08, "loss": 1.6693, "step": 90600 }, { "epoch": 13.42962962962963, "grad_norm": 5.076855659484863, "learning_rate": 8.271066072525036e-08, "loss": 1.6616, "step": 90650 }, { "epoch": 13.437037037037037, "grad_norm": 4.915444850921631, "learning_rate": 8.0579703092043e-08, "loss": 1.7366, "step": 90700 }, { "epoch": 13.444444444444445, "grad_norm": 4.743108749389648, "learning_rate": 7.847644485093675e-08, "loss": 1.651, "step": 90750 }, { "epoch": 13.451851851851853, "grad_norm": 3.879176616668701, "learning_rate": 7.640089187515465e-08, "loss": 1.6693, "step": 90800 }, { "epoch": 13.459259259259259, "grad_norm": 5.211658954620361, "learning_rate": 7.435304996055271e-08, "loss": 1.6643, "step": 90850 }, { "epoch": 13.466666666666667, "grad_norm": 5.480631351470947, "learning_rate": 7.233292482560883e-08, "loss": 1.687, "step": 90900 }, { "epoch": 13.474074074074075, "grad_norm": 5.0310797691345215, "learning_rate": 7.034052211140396e-08, "loss": 1.7151, "step": 90950 }, { "epoch": 13.481481481481481, "grad_norm": 4.684755325317383, "learning_rate": 6.83758473816043e-08, "loss": 1.6381, "step": 91000 }, { "epoch": 13.488888888888889, "grad_norm": 5.320138454437256, "learning_rate": 6.643890612244574e-08, "loss": 1.6996, "step": 91050 }, { "epoch": 13.496296296296297, "grad_norm": 4.682376861572266, "learning_rate": 6.452970374272283e-08, "loss": 1.7423, "step": 91100 }, { "epoch": 13.503703703703703, "grad_norm": 5.152847766876221, "learning_rate": 6.26482455737698e-08, "loss": 1.6966, "step": 91150 }, { "epoch": 13.511111111111111, "grad_norm": 6.191291809082031, "learning_rate": 6.079453686944514e-08, "loss": 1.6915, "step": 91200 }, { "epoch": 13.518518518518519, "grad_norm": 4.434044361114502, "learning_rate": 5.896858280612261e-08, "loss": 1.6644, "step": 91250 }, { "epoch": 13.525925925925925, "grad_norm": 4.534839630126953, "learning_rate": 5.7170388482670204e-08, "loss": 1.7303, "step": 91300 }, { "epoch": 13.533333333333333, "grad_norm": 4.041924953460693, "learning_rate": 5.539995892043793e-08, "loss": 1.6277, "step": 91350 }, { "epoch": 13.540740740740741, "grad_norm": 5.528424263000488, "learning_rate": 5.3657299063244464e-08, "loss": 1.6396, "step": 91400 }, { "epoch": 13.548148148148147, "grad_norm": 4.683634281158447, "learning_rate": 5.194241377736609e-08, "loss": 1.6793, "step": 91450 }, { "epoch": 13.555555555555555, "grad_norm": 5.272689342498779, "learning_rate": 5.025530785151778e-08, "loss": 1.6472, "step": 91500 }, { "epoch": 13.562962962962963, "grad_norm": 5.290887832641602, "learning_rate": 4.859598599684101e-08, "loss": 1.7151, "step": 91550 }, { "epoch": 13.57037037037037, "grad_norm": 4.272365570068359, "learning_rate": 4.696445284689377e-08, "loss": 1.7064, "step": 91600 }, { "epoch": 13.577777777777778, "grad_norm": 4.948604106903076, "learning_rate": 4.536071295763722e-08, "loss": 1.6692, "step": 91650 }, { "epoch": 13.585185185185185, "grad_norm": 4.477553367614746, "learning_rate": 4.378477080741794e-08, "loss": 1.6854, "step": 91700 }, { "epoch": 13.592592592592592, "grad_norm": 5.140685558319092, "learning_rate": 4.2236630796961274e-08, "loss": 1.6542, "step": 91750 }, { "epoch": 13.6, "grad_norm": 4.887373924255371, "learning_rate": 4.0716297249357994e-08, "loss": 1.7003, "step": 91800 }, { "epoch": 13.607407407407408, "grad_norm": 5.651754856109619, "learning_rate": 3.922377441004655e-08, "loss": 1.6769, "step": 91850 }, { "epoch": 13.614814814814816, "grad_norm": 4.647298336029053, "learning_rate": 3.775906644680971e-08, "loss": 1.6149, "step": 91900 }, { "epoch": 13.622222222222222, "grad_norm": 4.287783622741699, "learning_rate": 3.6322177449757964e-08, "loss": 1.7111, "step": 91950 }, { "epoch": 13.62962962962963, "grad_norm": 4.785926342010498, "learning_rate": 3.491311143131726e-08, "loss": 1.624, "step": 92000 }, { "epoch": 13.637037037037038, "grad_norm": 4.703386306762695, "learning_rate": 3.353187232622124e-08, "loss": 1.7183, "step": 92050 }, { "epoch": 13.644444444444444, "grad_norm": 4.661455154418945, "learning_rate": 3.2178463991497934e-08, "loss": 1.693, "step": 92100 }, { "epoch": 13.651851851851852, "grad_norm": 5.412303924560547, "learning_rate": 3.085289020646087e-08, "loss": 1.7245, "step": 92150 }, { "epoch": 13.65925925925926, "grad_norm": 4.2518205642700195, "learning_rate": 2.9555154672694652e-08, "loss": 1.6777, "step": 92200 }, { "epoch": 13.666666666666666, "grad_norm": 4.5041351318359375, "learning_rate": 2.8285261014050492e-08, "loss": 1.6368, "step": 92250 }, { "epoch": 13.674074074074074, "grad_norm": 5.045154571533203, "learning_rate": 2.704321277662847e-08, "loss": 1.6564, "step": 92300 }, { "epoch": 13.681481481481482, "grad_norm": 4.422163486480713, "learning_rate": 2.5829013428776416e-08, "loss": 1.6091, "step": 92350 }, { "epoch": 13.688888888888888, "grad_norm": 6.019820213317871, "learning_rate": 2.4642666361073265e-08, "loss": 1.7035, "step": 92400 }, { "epoch": 13.696296296296296, "grad_norm": 5.835787773132324, "learning_rate": 2.3484174886322375e-08, "loss": 1.6213, "step": 92450 }, { "epoch": 13.703703703703704, "grad_norm": 5.5061798095703125, "learning_rate": 2.235354223954378e-08, "loss": 1.6693, "step": 92500 }, { "epoch": 13.71111111111111, "grad_norm": 5.730589866638184, "learning_rate": 2.125077157796085e-08, "loss": 1.7064, "step": 92550 }, { "epoch": 13.718518518518518, "grad_norm": 4.635679244995117, "learning_rate": 2.0175865980995858e-08, "loss": 1.6796, "step": 92600 }, { "epoch": 13.725925925925926, "grad_norm": 6.5054121017456055, "learning_rate": 1.9128828450257764e-08, "loss": 1.6115, "step": 92650 }, { "epoch": 13.733333333333333, "grad_norm": 5.930258274078369, "learning_rate": 1.8109661909537777e-08, "loss": 1.6923, "step": 92700 }, { "epoch": 13.74074074074074, "grad_norm": 6.16044807434082, "learning_rate": 1.7118369204797148e-08, "loss": 1.7119, "step": 92750 }, { "epoch": 13.748148148148148, "grad_norm": 5.259925365447998, "learning_rate": 1.6154953104161597e-08, "loss": 1.6455, "step": 92800 }, { "epoch": 13.755555555555556, "grad_norm": 4.732719898223877, "learning_rate": 1.521941629791468e-08, "loss": 1.6123, "step": 92850 }, { "epoch": 13.762962962962963, "grad_norm": 3.6326496601104736, "learning_rate": 1.4311761398486668e-08, "loss": 1.6245, "step": 92900 }, { "epoch": 13.77037037037037, "grad_norm": 4.96213960647583, "learning_rate": 1.3431990940450113e-08, "loss": 1.6823, "step": 92950 }, { "epoch": 13.777777777777779, "grad_norm": 6.506005764007568, "learning_rate": 1.2580107380512075e-08, "loss": 1.6541, "step": 93000 }, { "epoch": 13.785185185185185, "grad_norm": 5.4200921058654785, "learning_rate": 1.1756113097506349e-08, "loss": 1.7077, "step": 93050 }, { "epoch": 13.792592592592593, "grad_norm": 4.497105598449707, "learning_rate": 1.096001039238681e-08, "loss": 1.6276, "step": 93100 }, { "epoch": 13.8, "grad_norm": 4.131155967712402, "learning_rate": 1.0191801488225184e-08, "loss": 1.7466, "step": 93150 }, { "epoch": 13.807407407407407, "grad_norm": 4.873898029327393, "learning_rate": 9.451488530198837e-09, "loss": 1.682, "step": 93200 }, { "epoch": 13.814814814814815, "grad_norm": 5.858630657196045, "learning_rate": 8.739073585586345e-09, "loss": 1.745, "step": 93250 }, { "epoch": 13.822222222222223, "grad_norm": 4.94985294342041, "learning_rate": 8.054558643765253e-09, "loss": 1.7271, "step": 93300 }, { "epoch": 13.829629629629629, "grad_norm": 4.77255916595459, "learning_rate": 7.397945616202107e-09, "loss": 1.6015, "step": 93350 }, { "epoch": 13.837037037037037, "grad_norm": 4.5871992111206055, "learning_rate": 6.7692363364513235e-09, "loss": 1.6554, "step": 93400 }, { "epoch": 13.844444444444445, "grad_norm": 5.18468713760376, "learning_rate": 6.1684325601441e-09, "loss": 1.7069, "step": 93450 }, { "epoch": 13.851851851851851, "grad_norm": 4.543237209320068, "learning_rate": 5.595535964990628e-09, "loss": 1.6495, "step": 93500 }, { "epoch": 13.85925925925926, "grad_norm": 6.7150983810424805, "learning_rate": 5.050548150771218e-09, "loss": 1.6884, "step": 93550 }, { "epoch": 13.866666666666667, "grad_norm": 4.884583950042725, "learning_rate": 4.533470639330739e-09, "loss": 1.735, "step": 93600 }, { "epoch": 13.874074074074073, "grad_norm": 4.831175327301025, "learning_rate": 4.0443048745775205e-09, "loss": 1.6228, "step": 93650 }, { "epoch": 13.881481481481481, "grad_norm": 4.210744857788086, "learning_rate": 3.5830522224777897e-09, "loss": 1.6364, "step": 93700 }, { "epoch": 13.88888888888889, "grad_norm": 6.04276180267334, "learning_rate": 3.1497139710534587e-09, "loss": 1.6024, "step": 93750 }, { "epoch": 13.896296296296295, "grad_norm": 4.816607475280762, "learning_rate": 2.744291330375459e-09, "loss": 1.6544, "step": 93800 }, { "epoch": 13.903703703703703, "grad_norm": 5.6914286613464355, "learning_rate": 2.3667854325604143e-09, "loss": 1.6407, "step": 93850 }, { "epoch": 13.911111111111111, "grad_norm": 4.977962970733643, "learning_rate": 2.0171973317739677e-09, "loss": 1.6743, "step": 93900 }, { "epoch": 13.918518518518518, "grad_norm": 6.445103645324707, "learning_rate": 1.6955280042185717e-09, "loss": 1.707, "step": 93950 }, { "epoch": 13.925925925925926, "grad_norm": 7.13929557800293, "learning_rate": 1.4017783481379277e-09, "loss": 1.7058, "step": 94000 }, { "epoch": 13.933333333333334, "grad_norm": 3.9623467922210693, "learning_rate": 1.1359491838081049e-09, "loss": 1.5874, "step": 94050 }, { "epoch": 13.940740740740742, "grad_norm": 4.890029430389404, "learning_rate": 8.980412535442018e-10, "loss": 1.7015, "step": 94100 }, { "epoch": 13.948148148148148, "grad_norm": 5.10414981842041, "learning_rate": 6.880552216870229e-10, "loss": 1.7628, "step": 94150 }, { "epoch": 13.955555555555556, "grad_norm": 6.242023944854736, "learning_rate": 5.059916746130711e-10, "loss": 1.679, "step": 94200 }, { "epoch": 13.962962962962964, "grad_norm": 5.566348552703857, "learning_rate": 3.5185112072122493e-10, "loss": 1.6291, "step": 94250 }, { "epoch": 13.97037037037037, "grad_norm": 4.626772880554199, "learning_rate": 2.256339904427307e-10, "loss": 1.7245, "step": 94300 }, { "epoch": 13.977777777777778, "grad_norm": 5.087474346160889, "learning_rate": 1.2734063622898973e-10, "loss": 1.6368, "step": 94350 }, { "epoch": 13.985185185185186, "grad_norm": 4.8912458419799805, "learning_rate": 5.697133255821996e-11, "loss": 1.6854, "step": 94400 }, { "epoch": 13.992592592592592, "grad_norm": 4.6201982498168945, "learning_rate": 1.4526275935455857e-11, "loss": 1.6349, "step": 94450 }, { "epoch": 14.0, "grad_norm": 4.993613243103027, "learning_rate": 5.584882556419758e-15, "loss": 1.6955, "step": 94500 }, { "epoch": 14.007407407407408, "grad_norm": 4.081660747528076, "learning_rate": 2.1750947185748995e-07, "loss": 1.7098, "step": 94550 }, { "epoch": 14.014814814814814, "grad_norm": 5.262495994567871, "learning_rate": 2.1428725575616372e-07, "loss": 1.6867, "step": 94600 }, { "epoch": 14.022222222222222, "grad_norm": 4.7010698318481445, "learning_rate": 2.1108882678338548e-07, "loss": 1.5879, "step": 94650 }, { "epoch": 14.02962962962963, "grad_norm": 5.260746002197266, "learning_rate": 2.0791419271390057e-07, "loss": 1.6757, "step": 94700 }, { "epoch": 14.037037037037036, "grad_norm": 5.294519424438477, "learning_rate": 2.0476336126461492e-07, "loss": 1.6988, "step": 94750 }, { "epoch": 14.044444444444444, "grad_norm": 5.61260461807251, "learning_rate": 2.0163634009457756e-07, "loss": 1.6908, "step": 94800 }, { "epoch": 14.051851851851852, "grad_norm": 4.751020431518555, "learning_rate": 1.9853313680495588e-07, "loss": 1.7395, "step": 94850 }, { "epoch": 14.059259259259258, "grad_norm": 3.8591971397399902, "learning_rate": 1.9545375893902262e-07, "loss": 1.6961, "step": 94900 }, { "epoch": 14.066666666666666, "grad_norm": 5.062747478485107, "learning_rate": 1.9239821398213455e-07, "loss": 1.6664, "step": 94950 }, { "epoch": 14.074074074074074, "grad_norm": 5.873329162597656, "learning_rate": 1.8936650936171586e-07, "loss": 1.6769, "step": 95000 }, { "epoch": 14.081481481481482, "grad_norm": 4.468142986297607, "learning_rate": 1.8635865244724162e-07, "loss": 1.6555, "step": 95050 }, { "epoch": 14.088888888888889, "grad_norm": 5.790612697601318, "learning_rate": 1.833746505502143e-07, "loss": 1.6727, "step": 95100 }, { "epoch": 14.096296296296297, "grad_norm": 4.851980686187744, "learning_rate": 1.8041451092415063e-07, "loss": 1.7384, "step": 95150 }, { "epoch": 14.103703703703705, "grad_norm": 5.899025917053223, "learning_rate": 1.7747824076456478e-07, "loss": 1.6937, "step": 95200 }, { "epoch": 14.11111111111111, "grad_norm": 4.192124366760254, "learning_rate": 1.7456584720894842e-07, "loss": 1.673, "step": 95250 }, { "epoch": 14.118518518518519, "grad_norm": 6.0272603034973145, "learning_rate": 1.7167733733674974e-07, "loss": 1.6529, "step": 95300 }, { "epoch": 14.125925925925927, "grad_norm": 4.812941074371338, "learning_rate": 1.6881271816936662e-07, "loss": 1.7446, "step": 95350 }, { "epoch": 14.133333333333333, "grad_norm": 4.837795734405518, "learning_rate": 1.659719966701201e-07, "loss": 1.7196, "step": 95400 }, { "epoch": 14.14074074074074, "grad_norm": 3.2878236770629883, "learning_rate": 1.631551797442421e-07, "loss": 1.7074, "step": 95450 }, { "epoch": 14.148148148148149, "grad_norm": 4.426816940307617, "learning_rate": 1.6036227423885443e-07, "loss": 1.6393, "step": 95500 }, { "epoch": 14.155555555555555, "grad_norm": 5.182004451751709, "learning_rate": 1.5759328694295861e-07, "loss": 1.6379, "step": 95550 }, { "epoch": 14.162962962962963, "grad_norm": 5.603666305541992, "learning_rate": 1.5484822458741388e-07, "loss": 1.7089, "step": 95600 }, { "epoch": 14.170370370370371, "grad_norm": 6.255915641784668, "learning_rate": 1.5212709384492152e-07, "loss": 1.7131, "step": 95650 }, { "epoch": 14.177777777777777, "grad_norm": 5.393916606903076, "learning_rate": 1.4942990133001266e-07, "loss": 1.6868, "step": 95700 }, { "epoch": 14.185185185185185, "grad_norm": 4.44481086730957, "learning_rate": 1.4675665359902502e-07, "loss": 1.663, "step": 95750 }, { "epoch": 14.192592592592593, "grad_norm": 5.424983024597168, "learning_rate": 1.4410735715009837e-07, "loss": 1.6858, "step": 95800 }, { "epoch": 14.2, "grad_norm": 4.479785442352295, "learning_rate": 1.414820184231447e-07, "loss": 1.6969, "step": 95850 }, { "epoch": 14.207407407407407, "grad_norm": 5.457512855529785, "learning_rate": 1.3888064379984356e-07, "loss": 1.643, "step": 95900 }, { "epoch": 14.214814814814815, "grad_norm": 4.97783899307251, "learning_rate": 1.3630323960361902e-07, "loss": 1.7059, "step": 95950 }, { "epoch": 14.222222222222221, "grad_norm": 5.594468116760254, "learning_rate": 1.3374981209963166e-07, "loss": 1.6873, "step": 96000 }, { "epoch": 14.22962962962963, "grad_norm": 5.437595367431641, "learning_rate": 1.3122036749475765e-07, "loss": 1.6673, "step": 96050 }, { "epoch": 14.237037037037037, "grad_norm": 5.472507953643799, "learning_rate": 1.2871491193757634e-07, "loss": 1.6547, "step": 96100 }, { "epoch": 14.244444444444444, "grad_norm": 4.9965925216674805, "learning_rate": 1.2623345151835164e-07, "loss": 1.6488, "step": 96150 }, { "epoch": 14.251851851851852, "grad_norm": 3.97997784614563, "learning_rate": 1.2377599226902402e-07, "loss": 1.7462, "step": 96200 }, { "epoch": 14.25925925925926, "grad_norm": 4.046382904052734, "learning_rate": 1.2134254016318847e-07, "loss": 1.5989, "step": 96250 }, { "epoch": 14.266666666666667, "grad_norm": 4.786545276641846, "learning_rate": 1.1893310111608547e-07, "loss": 1.6232, "step": 96300 }, { "epoch": 14.274074074074074, "grad_norm": 4.758726119995117, "learning_rate": 1.1654768098458446e-07, "loss": 1.6304, "step": 96350 }, { "epoch": 14.281481481481482, "grad_norm": 5.127438545227051, "learning_rate": 1.1418628556716715e-07, "loss": 1.6618, "step": 96400 }, { "epoch": 14.28888888888889, "grad_norm": 4.724511623382568, "learning_rate": 1.1184892060392083e-07, "loss": 1.7565, "step": 96450 }, { "epoch": 14.296296296296296, "grad_norm": 5.429440021514893, "learning_rate": 1.0953559177651395e-07, "loss": 1.5518, "step": 96500 }, { "epoch": 14.303703703703704, "grad_norm": 3.7063474655151367, "learning_rate": 1.0724630470819286e-07, "loss": 1.6977, "step": 96550 }, { "epoch": 14.311111111111112, "grad_norm": 6.464829444885254, "learning_rate": 1.0498106496375837e-07, "loss": 1.6884, "step": 96600 }, { "epoch": 14.318518518518518, "grad_norm": 5.439199447631836, "learning_rate": 1.0273987804956032e-07, "loss": 1.6838, "step": 96650 }, { "epoch": 14.325925925925926, "grad_norm": 4.898883819580078, "learning_rate": 1.0052274941348084e-07, "loss": 1.7748, "step": 96700 }, { "epoch": 14.333333333333334, "grad_norm": 4.992605686187744, "learning_rate": 9.832968444491886e-08, "loss": 1.6865, "step": 96750 }, { "epoch": 14.34074074074074, "grad_norm": 5.274005889892578, "learning_rate": 9.616068847478233e-08, "loss": 1.7458, "step": 96800 }, { "epoch": 14.348148148148148, "grad_norm": 4.971953392028809, "learning_rate": 9.40157667754693e-08, "loss": 1.6809, "step": 96850 }, { "epoch": 14.355555555555556, "grad_norm": 5.416146755218506, "learning_rate": 9.189492456086024e-08, "loss": 1.6902, "step": 96900 }, { "epoch": 14.362962962962962, "grad_norm": 5.203887462615967, "learning_rate": 8.979816698630128e-08, "loss": 1.651, "step": 96950 }, { "epoch": 14.37037037037037, "grad_norm": 6.0059590339660645, "learning_rate": 8.772549914859652e-08, "loss": 1.7222, "step": 97000 }, { "epoch": 14.377777777777778, "grad_norm": 5.621152400970459, "learning_rate": 8.567692608598798e-08, "loss": 1.6189, "step": 97050 }, { "epoch": 14.385185185185184, "grad_norm": 5.870107173919678, "learning_rate": 8.365245277815348e-08, "loss": 1.7122, "step": 97100 }, { "epoch": 14.392592592592592, "grad_norm": 4.718721866607666, "learning_rate": 8.165208414618542e-08, "loss": 1.7046, "step": 97150 }, { "epoch": 14.4, "grad_norm": 5.118833065032959, "learning_rate": 7.967582505258308e-08, "loss": 1.7248, "step": 97200 }, { "epoch": 14.407407407407407, "grad_norm": 5.250682353973389, "learning_rate": 7.772368030124044e-08, "loss": 1.6675, "step": 97250 }, { "epoch": 14.414814814814815, "grad_norm": 4.605399131774902, "learning_rate": 7.57956546374361e-08, "loss": 1.6864, "step": 97300 }, { "epoch": 14.422222222222222, "grad_norm": 5.241720676422119, "learning_rate": 7.389175274781668e-08, "loss": 1.7509, "step": 97350 }, { "epoch": 14.42962962962963, "grad_norm": 4.5354323387146, "learning_rate": 7.201197926039238e-08, "loss": 1.7077, "step": 97400 }, { "epoch": 14.437037037037037, "grad_norm": 5.364243030548096, "learning_rate": 7.01563387445181e-08, "loss": 1.6977, "step": 97450 }, { "epoch": 14.444444444444445, "grad_norm": 4.9642486572265625, "learning_rate": 6.832483571088899e-08, "loss": 1.6097, "step": 97500 }, { "epoch": 14.451851851851853, "grad_norm": 5.477876663208008, "learning_rate": 6.651747461152602e-08, "loss": 1.6578, "step": 97550 }, { "epoch": 14.459259259259259, "grad_norm": 4.5182929039001465, "learning_rate": 6.4734259839766e-08, "loss": 1.6748, "step": 97600 }, { "epoch": 14.466666666666667, "grad_norm": 4.248973846435547, "learning_rate": 6.297519573025046e-08, "loss": 1.6873, "step": 97650 }, { "epoch": 14.474074074074075, "grad_norm": 5.4366230964660645, "learning_rate": 6.124028655891567e-08, "loss": 1.6764, "step": 97700 }, { "epoch": 14.481481481481481, "grad_norm": 5.3513336181640625, "learning_rate": 5.952953654298266e-08, "loss": 1.6915, "step": 97750 }, { "epoch": 14.488888888888889, "grad_norm": 6.808749675750732, "learning_rate": 5.784294984094496e-08, "loss": 1.6613, "step": 97800 }, { "epoch": 14.496296296296297, "grad_norm": 4.615975379943848, "learning_rate": 5.61805305525609e-08, "loss": 1.7153, "step": 97850 }, { "epoch": 14.503703703703703, "grad_norm": 6.45193338394165, "learning_rate": 5.4542282718841324e-08, "loss": 1.713, "step": 97900 }, { "epoch": 14.511111111111111, "grad_norm": 6.218621253967285, "learning_rate": 5.2928210322044094e-08, "loss": 1.7161, "step": 97950 }, { "epoch": 14.518518518518519, "grad_norm": 3.8265745639801025, "learning_rate": 5.1338317285658524e-08, "loss": 1.6358, "step": 98000 }, { "epoch": 14.525925925925925, "grad_norm": 6.531336784362793, "learning_rate": 4.977260747439872e-08, "loss": 1.6534, "step": 98050 }, { "epoch": 14.533333333333333, "grad_norm": 4.259779930114746, "learning_rate": 4.8231084694195795e-08, "loss": 1.6495, "step": 98100 }, { "epoch": 14.540740740740741, "grad_norm": 5.932340145111084, "learning_rate": 4.67137526921857e-08, "loss": 1.7128, "step": 98150 }, { "epoch": 14.548148148148147, "grad_norm": 5.576129913330078, "learning_rate": 4.522061515670251e-08, "loss": 1.7082, "step": 98200 }, { "epoch": 14.555555555555555, "grad_norm": 5.470938682556152, "learning_rate": 4.375167571726735e-08, "loss": 1.7498, "step": 98250 }, { "epoch": 14.562962962962963, "grad_norm": 4.535619258880615, "learning_rate": 4.230693794458063e-08, "loss": 1.7499, "step": 98300 }, { "epoch": 14.57037037037037, "grad_norm": 4.099649429321289, "learning_rate": 4.0886405350514244e-08, "loss": 1.8051, "step": 98350 }, { "epoch": 14.577777777777778, "grad_norm": 5.937483310699463, "learning_rate": 3.949008138810051e-08, "loss": 1.7008, "step": 98400 }, { "epoch": 14.585185185185185, "grad_norm": 4.595200061798096, "learning_rate": 3.8117969451526574e-08, "loss": 1.6601, "step": 98450 }, { "epoch": 14.592592592592592, "grad_norm": 4.622509002685547, "learning_rate": 3.677007287612444e-08, "loss": 1.6999, "step": 98500 }, { "epoch": 14.6, "grad_norm": 4.620297908782959, "learning_rate": 3.544639493836544e-08, "loss": 1.6864, "step": 98550 }, { "epoch": 14.607407407407408, "grad_norm": 6.888701438903809, "learning_rate": 3.4146938855845744e-08, "loss": 1.6404, "step": 98600 }, { "epoch": 14.614814814814816, "grad_norm": 6.738227367401123, "learning_rate": 3.2871707787287545e-08, "loss": 1.6618, "step": 98650 }, { "epoch": 14.622222222222222, "grad_norm": 4.549380302429199, "learning_rate": 3.162070483252344e-08, "loss": 1.703, "step": 98700 }, { "epoch": 14.62962962962963, "grad_norm": 5.150018215179443, "learning_rate": 3.039393303249538e-08, "loss": 1.6822, "step": 98750 }, { "epoch": 14.637037037037038, "grad_norm": 5.686187744140625, "learning_rate": 2.9191395369240204e-08, "loss": 1.7022, "step": 98800 }, { "epoch": 14.644444444444444, "grad_norm": 5.78209114074707, "learning_rate": 2.801309476589076e-08, "loss": 1.7199, "step": 98850 }, { "epoch": 14.651851851851852, "grad_norm": 3.936169147491455, "learning_rate": 2.685903408666035e-08, "loss": 1.7143, "step": 98900 }, { "epoch": 14.65925925925926, "grad_norm": 5.642170429229736, "learning_rate": 2.572921613684498e-08, "loss": 1.7003, "step": 98950 }, { "epoch": 14.666666666666666, "grad_norm": 4.85971212387085, "learning_rate": 2.4623643662804454e-08, "loss": 1.6994, "step": 99000 }, { "epoch": 14.674074074074074, "grad_norm": 5.203510284423828, "learning_rate": 2.3542319351969046e-08, "loss": 1.6054, "step": 99050 }, { "epoch": 14.681481481481482, "grad_norm": 5.952207088470459, "learning_rate": 2.2485245832822856e-08, "loss": 1.6189, "step": 99100 }, { "epoch": 14.688888888888888, "grad_norm": 5.479672908782959, "learning_rate": 2.1452425674901577e-08, "loss": 1.6995, "step": 99150 }, { "epoch": 14.696296296296296, "grad_norm": 4.928932189941406, "learning_rate": 2.0443861388788066e-08, "loss": 1.6801, "step": 99200 }, { "epoch": 14.703703703703704, "grad_norm": 4.984295845031738, "learning_rate": 1.945955542610012e-08, "loss": 1.6402, "step": 99250 }, { "epoch": 14.71111111111111, "grad_norm": 5.150063514709473, "learning_rate": 1.8499510179491585e-08, "loss": 1.6855, "step": 99300 }, { "epoch": 14.718518518518518, "grad_norm": 6.898922920227051, "learning_rate": 1.7563727982642386e-08, "loss": 1.7513, "step": 99350 }, { "epoch": 14.725925925925926, "grad_norm": 23.222375869750977, "learning_rate": 1.6652211110254057e-08, "loss": 1.6884, "step": 99400 }, { "epoch": 14.733333333333333, "grad_norm": 4.689803600311279, "learning_rate": 1.5764961778041988e-08, "loss": 1.6416, "step": 99450 }, { "epoch": 14.74074074074074, "grad_norm": 5.273927211761475, "learning_rate": 1.4901982142735415e-08, "loss": 1.6945, "step": 99500 }, { "epoch": 14.748148148148148, "grad_norm": 4.713109493255615, "learning_rate": 1.4063274302065222e-08, "loss": 1.6675, "step": 99550 }, { "epoch": 14.755555555555556, "grad_norm": 4.455766201019287, "learning_rate": 1.324884029476392e-08, "loss": 1.7309, "step": 99600 }, { "epoch": 14.762962962962963, "grad_norm": 4.254674911499023, "learning_rate": 1.2458682100560116e-08, "loss": 1.7088, "step": 99650 }, { "epoch": 14.77037037037037, "grad_norm": 5.807867527008057, "learning_rate": 1.1692801640171835e-08, "loss": 1.697, "step": 99700 }, { "epoch": 14.777777777777779, "grad_norm": 4.785783767700195, "learning_rate": 1.0951200775302095e-08, "loss": 1.7298, "step": 99750 }, { "epoch": 14.785185185185185, "grad_norm": 5.500148296356201, "learning_rate": 1.0233881308635563e-08, "loss": 1.6814, "step": 99800 }, { "epoch": 14.792592592592593, "grad_norm": 6.528011798858643, "learning_rate": 9.54084498383412e-09, "loss": 1.7591, "step": 99850 }, { "epoch": 14.8, "grad_norm": 5.078983783721924, "learning_rate": 8.872093485531307e-09, "loss": 1.6907, "step": 99900 }, { "epoch": 14.807407407407407, "grad_norm": 5.844426155090332, "learning_rate": 8.227628439330115e-09, "loss": 1.7544, "step": 99950 }, { "epoch": 14.814814814814815, "grad_norm": 4.08227014541626, "learning_rate": 7.607451411797417e-09, "loss": 1.735, "step": 100000 }, { "epoch": 14.822222222222223, "grad_norm": 4.534005641937256, "learning_rate": 7.01156391046065e-09, "loss": 1.6321, "step": 100050 }, { "epoch": 14.829629629629629, "grad_norm": 4.907926082611084, "learning_rate": 6.43996738380337e-09, "loss": 1.6915, "step": 100100 }, { "epoch": 14.837037037037037, "grad_norm": 4.026651382446289, "learning_rate": 5.892663221264139e-09, "loss": 1.6458, "step": 100150 }, { "epoch": 14.844444444444445, "grad_norm": 5.169808387756348, "learning_rate": 5.36965275323098e-09, "loss": 1.663, "step": 100200 }, { "epoch": 14.851851851851851, "grad_norm": 4.850079536437988, "learning_rate": 4.870937251038044e-09, "loss": 1.6436, "step": 100250 }, { "epoch": 14.85925925925926, "grad_norm": 4.930896282196045, "learning_rate": 4.396517926964495e-09, "loss": 1.7308, "step": 100300 }, { "epoch": 14.866666666666667, "grad_norm": 4.7086076736450195, "learning_rate": 3.946395934230074e-09, "loss": 1.716, "step": 100350 }, { "epoch": 14.874074074074073, "grad_norm": 4.457938194274902, "learning_rate": 3.5205723669917703e-09, "loss": 1.6974, "step": 100400 }, { "epoch": 14.881481481481481, "grad_norm": 4.839356422424316, "learning_rate": 3.119048260341595e-09, "loss": 1.6968, "step": 100450 }, { "epoch": 14.88888888888889, "grad_norm": 5.151297569274902, "learning_rate": 2.7418245903054752e-09, "loss": 1.6483, "step": 100500 }, { "epoch": 14.896296296296295, "grad_norm": 5.776646614074707, "learning_rate": 2.3889022738399216e-09, "loss": 1.6593, "step": 100550 }, { "epoch": 14.903703703703703, "grad_norm": 4.784289836883545, "learning_rate": 2.060282168829808e-09, "loss": 1.6998, "step": 100600 }, { "epoch": 14.911111111111111, "grad_norm": 4.684325695037842, "learning_rate": 1.7559650740828215e-09, "loss": 1.7249, "step": 100650 }, { "epoch": 14.918518518518518, "grad_norm": 5.666114330291748, "learning_rate": 1.4759517293361225e-09, "loss": 1.7104, "step": 100700 }, { "epoch": 14.925925925925926, "grad_norm": 5.29168701171875, "learning_rate": 1.220242815246353e-09, "loss": 1.6885, "step": 100750 }, { "epoch": 14.933333333333334, "grad_norm": 4.432060718536377, "learning_rate": 9.88838953389637e-10, "loss": 1.7025, "step": 100800 }, { "epoch": 14.940740740740742, "grad_norm": 4.997260570526123, "learning_rate": 7.817407062638005e-10, "loss": 1.7068, "step": 100850 }, { "epoch": 14.948148148148148, "grad_norm": 5.093183994293213, "learning_rate": 5.989485772850412e-10, "loss": 1.6982, "step": 100900 }, { "epoch": 14.955555555555556, "grad_norm": 5.697869777679443, "learning_rate": 4.4046301078237706e-10, "loss": 1.6626, "step": 100950 }, { "epoch": 14.962962962962964, "grad_norm": 5.6799750328063965, "learning_rate": 3.062843920043079e-10, "loss": 1.7029, "step": 101000 }, { "epoch": 14.97037037037037, "grad_norm": 5.68807315826416, "learning_rate": 1.964130471110437e-10, "loss": 1.6871, "step": 101050 }, { "epoch": 14.977777777777778, "grad_norm": 4.425951957702637, "learning_rate": 1.1084924318005563e-10, "loss": 1.6629, "step": 101100 }, { "epoch": 14.985185185185186, "grad_norm": 4.260661602020264, "learning_rate": 4.959318819941494e-11, "loss": 1.6988, "step": 101150 }, { "epoch": 14.992592592592592, "grad_norm": 4.999781608581543, "learning_rate": 1.2645031070013248e-11, "loss": 1.7212, "step": 101200 }, { "epoch": 15.0, "grad_norm": 6.509689807891846, "learning_rate": 4.861604452344182e-15, "loss": 1.6589, "step": 101250 }, { "epoch": 15.007407407407408, "grad_norm": 4.362274169921875, "learning_rate": 1.9113652725600174e-07, "loss": 1.7051, "step": 101300 }, { "epoch": 15.014814814814814, "grad_norm": 4.904382705688477, "learning_rate": 1.8830376298905716e-07, "loss": 1.6349, "step": 101350 }, { "epoch": 15.022222222222222, "grad_norm": 5.014002323150635, "learning_rate": 1.8549194789315384e-07, "loss": 1.6786, "step": 101400 }, { "epoch": 15.02962962962963, "grad_norm": 5.237969875335693, "learning_rate": 1.8270108797185936e-07, "loss": 1.7113, "step": 101450 }, { "epoch": 15.037037037037036, "grad_norm": 4.123836517333984, "learning_rate": 1.7993118918400054e-07, "loss": 1.7214, "step": 101500 }, { "epoch": 15.044444444444444, "grad_norm": 5.174574375152588, "learning_rate": 1.7718225744364993e-07, "loss": 1.7668, "step": 101550 }, { "epoch": 15.051851851851852, "grad_norm": 4.138543605804443, "learning_rate": 1.7445429862011476e-07, "loss": 1.6625, "step": 101600 }, { "epoch": 15.059259259259258, "grad_norm": 5.6647539138793945, "learning_rate": 1.7174731853791814e-07, "loss": 1.7204, "step": 101650 }, { "epoch": 15.066666666666666, "grad_norm": 5.137762069702148, "learning_rate": 1.6906132297679568e-07, "loss": 1.6656, "step": 101700 }, { "epoch": 15.074074074074074, "grad_norm": 5.82663631439209, "learning_rate": 1.663963176716743e-07, "loss": 1.6957, "step": 101750 }, { "epoch": 15.081481481481482, "grad_norm": 4.870634078979492, "learning_rate": 1.6375230831266909e-07, "loss": 1.7525, "step": 101800 }, { "epoch": 15.088888888888889, "grad_norm": 5.1896071434021, "learning_rate": 1.6112930054505981e-07, "loss": 1.6674, "step": 101850 }, { "epoch": 15.096296296296297, "grad_norm": 4.642868518829346, "learning_rate": 1.5852729996929106e-07, "loss": 1.6844, "step": 101900 }, { "epoch": 15.103703703703705, "grad_norm": 7.358834266662598, "learning_rate": 1.5594631214095103e-07, "loss": 1.6857, "step": 101950 }, { "epoch": 15.11111111111111, "grad_norm": 4.920676231384277, "learning_rate": 1.533863425707649e-07, "loss": 1.7326, "step": 102000 }, { "epoch": 15.118518518518519, "grad_norm": 5.6548380851745605, "learning_rate": 1.508473967245794e-07, "loss": 1.6935, "step": 102050 }, { "epoch": 15.125925925925927, "grad_norm": 5.018331527709961, "learning_rate": 1.48329480023357e-07, "loss": 1.712, "step": 102100 }, { "epoch": 15.133333333333333, "grad_norm": 5.96315336227417, "learning_rate": 1.4583259784315518e-07, "loss": 1.7581, "step": 102150 }, { "epoch": 15.14074074074074, "grad_norm": 4.610811710357666, "learning_rate": 1.4335675551512384e-07, "loss": 1.7412, "step": 102200 }, { "epoch": 15.148148148148149, "grad_norm": 4.799295902252197, "learning_rate": 1.409019583254889e-07, "loss": 1.6462, "step": 102250 }, { "epoch": 15.155555555555555, "grad_norm": 6.016835689544678, "learning_rate": 1.3846821151554223e-07, "loss": 1.7501, "step": 102300 }, { "epoch": 15.162962962962963, "grad_norm": 5.653822898864746, "learning_rate": 1.3605552028163162e-07, "loss": 1.6666, "step": 102350 }, { "epoch": 15.170370370370371, "grad_norm": 5.209936141967773, "learning_rate": 1.3366388977514634e-07, "loss": 1.7113, "step": 102400 }, { "epoch": 15.177777777777777, "grad_norm": 4.76084566116333, "learning_rate": 1.3129332510251057e-07, "loss": 1.6819, "step": 102450 }, { "epoch": 15.185185185185185, "grad_norm": 4.614091873168945, "learning_rate": 1.289438313251701e-07, "loss": 1.7483, "step": 102500 }, { "epoch": 15.192592592592593, "grad_norm": 5.6824951171875, "learning_rate": 1.2661541345958095e-07, "loss": 1.6486, "step": 102550 }, { "epoch": 15.2, "grad_norm": 5.0764641761779785, "learning_rate": 1.2430807647720088e-07, "loss": 1.616, "step": 102600 }, { "epoch": 15.207407407407407, "grad_norm": 5.760731220245361, "learning_rate": 1.220218253044747e-07, "loss": 1.6611, "step": 102650 }, { "epoch": 15.214814814814815, "grad_norm": 4.543821811676025, "learning_rate": 1.1975666482282988e-07, "loss": 1.6207, "step": 102700 }, { "epoch": 15.222222222222221, "grad_norm": 6.1336894035339355, "learning_rate": 1.1751259986866104e-07, "loss": 1.6834, "step": 102750 }, { "epoch": 15.22962962962963, "grad_norm": 5.639930725097656, "learning_rate": 1.1528963523331993e-07, "loss": 1.659, "step": 102800 }, { "epoch": 15.237037037037037, "grad_norm": 5.458514213562012, "learning_rate": 1.1308777566310769e-07, "loss": 1.7154, "step": 102850 }, { "epoch": 15.244444444444444, "grad_norm": 5.286754608154297, "learning_rate": 1.1090702585926483e-07, "loss": 1.6816, "step": 102900 }, { "epoch": 15.251851851851852, "grad_norm": 6.629901885986328, "learning_rate": 1.087473904779579e-07, "loss": 1.6801, "step": 102950 }, { "epoch": 15.25925925925926, "grad_norm": 4.6929121017456055, "learning_rate": 1.0660887413027399e-07, "loss": 1.6725, "step": 103000 }, { "epoch": 15.266666666666667, "grad_norm": 6.1419878005981445, "learning_rate": 1.044914813822051e-07, "loss": 1.7276, "step": 103050 }, { "epoch": 15.274074074074074, "grad_norm": 5.224550247192383, "learning_rate": 1.0239521675464492e-07, "loss": 1.6544, "step": 103100 }, { "epoch": 15.281481481481482, "grad_norm": 5.665772438049316, "learning_rate": 1.0032008472337318e-07, "loss": 1.7196, "step": 103150 }, { "epoch": 15.28888888888889, "grad_norm": 5.348624229431152, "learning_rate": 9.826608971905238e-08, "loss": 1.6823, "step": 103200 }, { "epoch": 15.296296296296296, "grad_norm": 5.742287635803223, "learning_rate": 9.623323612721225e-08, "loss": 1.6856, "step": 103250 }, { "epoch": 15.303703703703704, "grad_norm": 4.182469844818115, "learning_rate": 9.422152828824305e-08, "loss": 1.7428, "step": 103300 }, { "epoch": 15.311111111111112, "grad_norm": 6.233865737915039, "learning_rate": 9.223097049738783e-08, "loss": 1.6919, "step": 103350 }, { "epoch": 15.318518518518518, "grad_norm": 5.403355121612549, "learning_rate": 9.026156700473021e-08, "loss": 1.604, "step": 103400 }, { "epoch": 15.325925925925926, "grad_norm": 4.667754173278809, "learning_rate": 8.831332201518883e-08, "loss": 1.7329, "step": 103450 }, { "epoch": 15.333333333333334, "grad_norm": 5.18316125869751, "learning_rate": 8.63862396885018e-08, "loss": 1.657, "step": 103500 }, { "epoch": 15.34074074074074, "grad_norm": 6.3779706954956055, "learning_rate": 8.448032413922891e-08, "loss": 1.6772, "step": 103550 }, { "epoch": 15.348148148148148, "grad_norm": 5.585716724395752, "learning_rate": 8.259557943673169e-08, "loss": 1.6485, "step": 103600 }, { "epoch": 15.355555555555556, "grad_norm": 5.607356548309326, "learning_rate": 8.073200960517003e-08, "loss": 1.7131, "step": 103650 }, { "epoch": 15.362962962962962, "grad_norm": 5.320977210998535, "learning_rate": 7.888961862349332e-08, "loss": 1.6469, "step": 103700 }, { "epoch": 15.37037037037037, "grad_norm": 4.2483296394348145, "learning_rate": 7.706841042543268e-08, "loss": 1.665, "step": 103750 }, { "epoch": 15.377777777777778, "grad_norm": 6.139911651611328, "learning_rate": 7.526838889948873e-08, "loss": 1.7047, "step": 103800 }, { "epoch": 15.385185185185184, "grad_norm": 5.27344274520874, "learning_rate": 7.348955788892831e-08, "loss": 1.7626, "step": 103850 }, { "epoch": 15.392592592592592, "grad_norm": 3.9390082359313965, "learning_rate": 7.17319211917733e-08, "loss": 1.6708, "step": 103900 }, { "epoch": 15.4, "grad_norm": 5.188051700592041, "learning_rate": 6.999548256079181e-08, "loss": 1.7132, "step": 103950 }, { "epoch": 15.407407407407407, "grad_norm": 5.408111572265625, "learning_rate": 6.828024570349479e-08, "loss": 1.7483, "step": 104000 }, { "epoch": 15.414814814814815, "grad_norm": 5.115500450134277, "learning_rate": 6.658621428212053e-08, "loss": 1.633, "step": 104050 }, { "epoch": 15.422222222222222, "grad_norm": 4.845335483551025, "learning_rate": 6.491339191363465e-08, "loss": 1.6942, "step": 104100 }, { "epoch": 15.42962962962963, "grad_norm": 4.13123083114624, "learning_rate": 6.326178216971568e-08, "loss": 1.6817, "step": 104150 }, { "epoch": 15.437037037037037, "grad_norm": 6.038280010223389, "learning_rate": 6.163138857675499e-08, "loss": 1.7143, "step": 104200 }, { "epoch": 15.444444444444445, "grad_norm": 4.800933837890625, "learning_rate": 6.002221461583913e-08, "loss": 1.6918, "step": 104250 }, { "epoch": 15.451851851851853, "grad_norm": 5.348603248596191, "learning_rate": 5.843426372275307e-08, "loss": 1.6809, "step": 104300 }, { "epoch": 15.459259259259259, "grad_norm": 5.338648796081543, "learning_rate": 5.6867539287966954e-08, "loss": 1.6494, "step": 104350 }, { "epoch": 15.466666666666667, "grad_norm": 6.060091495513916, "learning_rate": 5.532204465662716e-08, "loss": 1.7146, "step": 104400 }, { "epoch": 15.474074074074075, "grad_norm": 4.730654239654541, "learning_rate": 5.37977831285541e-08, "loss": 1.6168, "step": 104450 }, { "epoch": 15.481481481481481, "grad_norm": 5.17317533493042, "learning_rate": 5.2294757958233356e-08, "loss": 1.7384, "step": 104500 }, { "epoch": 15.488888888888889, "grad_norm": 5.633426666259766, "learning_rate": 5.081297235480675e-08, "loss": 1.7138, "step": 104550 }, { "epoch": 15.496296296296297, "grad_norm": 4.512260437011719, "learning_rate": 4.9352429482067975e-08, "loss": 1.7167, "step": 104600 }, { "epoch": 15.503703703703703, "grad_norm": 5.672184467315674, "learning_rate": 4.7913132458454746e-08, "loss": 1.7112, "step": 104650 }, { "epoch": 15.511111111111111, "grad_norm": 5.912533283233643, "learning_rate": 4.6495084357041084e-08, "loss": 1.666, "step": 104700 }, { "epoch": 15.518518518518519, "grad_norm": 4.872348785400391, "learning_rate": 4.509828820553397e-08, "loss": 1.7347, "step": 104750 }, { "epoch": 15.525925925925925, "grad_norm": 4.933444023132324, "learning_rate": 4.3722746986264443e-08, "loss": 1.6805, "step": 104800 }, { "epoch": 15.533333333333333, "grad_norm": 5.971294403076172, "learning_rate": 4.236846363618097e-08, "loss": 1.6679, "step": 104850 }, { "epoch": 15.540740740740741, "grad_norm": 4.82595157623291, "learning_rate": 4.103544104684276e-08, "loss": 1.5771, "step": 104900 }, { "epoch": 15.548148148148147, "grad_norm": 5.648787975311279, "learning_rate": 3.972368206441757e-08, "loss": 1.7697, "step": 104950 }, { "epoch": 15.555555555555555, "grad_norm": 5.242377281188965, "learning_rate": 3.843318948967056e-08, "loss": 1.7878, "step": 105000 }, { "epoch": 15.562962962962963, "grad_norm": 6.494607448577881, "learning_rate": 3.716396607796102e-08, "loss": 1.6129, "step": 105050 }, { "epoch": 15.57037037037037, "grad_norm": 4.520471096038818, "learning_rate": 3.5916014539236765e-08, "loss": 1.6146, "step": 105100 }, { "epoch": 15.577777777777778, "grad_norm": 5.042867183685303, "learning_rate": 3.468933753802528e-08, "loss": 1.7067, "step": 105150 }, { "epoch": 15.585185185185185, "grad_norm": 5.183259963989258, "learning_rate": 3.348393769343372e-08, "loss": 1.7039, "step": 105200 }, { "epoch": 15.592592592592592, "grad_norm": 5.700682640075684, "learning_rate": 3.22998175791378e-08, "loss": 1.686, "step": 105250 }, { "epoch": 15.6, "grad_norm": 6.726398944854736, "learning_rate": 3.113697972337848e-08, "loss": 1.7659, "step": 105300 }, { "epoch": 15.607407407407408, "grad_norm": 5.453873157501221, "learning_rate": 2.999542660895638e-08, "loss": 1.7263, "step": 105350 }, { "epoch": 15.614814814814816, "grad_norm": 8.06754207611084, "learning_rate": 2.8875160673227375e-08, "loss": 1.6145, "step": 105400 }, { "epoch": 15.622222222222222, "grad_norm": 5.7392988204956055, "learning_rate": 2.7776184308095922e-08, "loss": 1.7672, "step": 105450 }, { "epoch": 15.62962962962963, "grad_norm": 6.453995227813721, "learning_rate": 2.6698499860011718e-08, "loss": 1.6994, "step": 105500 }, { "epoch": 15.637037037037038, "grad_norm": 7.151883602142334, "learning_rate": 2.5642109629961942e-08, "loss": 1.6469, "step": 105550 }, { "epoch": 15.644444444444444, "grad_norm": 6.86553430557251, "learning_rate": 2.4607015873469032e-08, "loss": 1.7241, "step": 105600 }, { "epoch": 15.651851851851852, "grad_norm": 4.4429850578308105, "learning_rate": 2.3593220800584015e-08, "loss": 1.707, "step": 105650 }, { "epoch": 15.65925925925926, "grad_norm": 5.352493762969971, "learning_rate": 2.2600726575885413e-08, "loss": 1.6433, "step": 105700 }, { "epoch": 15.666666666666666, "grad_norm": 5.694179058074951, "learning_rate": 2.162953531846812e-08, "loss": 1.6451, "step": 105750 }, { "epoch": 15.674074074074074, "grad_norm": 4.886950969696045, "learning_rate": 2.0679649101944532e-08, "loss": 1.6637, "step": 105800 }, { "epoch": 15.681481481481482, "grad_norm": 3.8822832107543945, "learning_rate": 1.9751069954436763e-08, "loss": 1.6385, "step": 105850 }, { "epoch": 15.688888888888888, "grad_norm": 6.058652400970459, "learning_rate": 1.884379985857776e-08, "loss": 1.6387, "step": 105900 }, { "epoch": 15.696296296296296, "grad_norm": 6.563283443450928, "learning_rate": 1.795784075149687e-08, "loss": 1.6333, "step": 105950 }, { "epoch": 15.703703703703704, "grad_norm": 5.563347339630127, "learning_rate": 1.7093194524827605e-08, "loss": 1.6209, "step": 106000 }, { "epoch": 15.71111111111111, "grad_norm": 5.790256977081299, "learning_rate": 1.6249863024693223e-08, "loss": 1.671, "step": 106050 }, { "epoch": 15.718518518518518, "grad_norm": 5.468883514404297, "learning_rate": 1.542784805171116e-08, "loss": 1.6543, "step": 106100 }, { "epoch": 15.725925925925926, "grad_norm": 5.149229526519775, "learning_rate": 1.4627151360983027e-08, "loss": 1.7003, "step": 106150 }, { "epoch": 15.733333333333333, "grad_norm": 5.396352767944336, "learning_rate": 1.3847774662094637e-08, "loss": 1.6859, "step": 106200 }, { "epoch": 15.74074074074074, "grad_norm": 5.088778972625732, "learning_rate": 1.308971961911154e-08, "loss": 1.6757, "step": 106250 }, { "epoch": 15.748148148148148, "grad_norm": 4.770002365112305, "learning_rate": 1.2352987850571263e-08, "loss": 1.615, "step": 106300 }, { "epoch": 15.755555555555556, "grad_norm": 5.888800621032715, "learning_rate": 1.1637580929487747e-08, "loss": 1.6846, "step": 106350 }, { "epoch": 15.762962962962963, "grad_norm": 4.9219136238098145, "learning_rate": 1.0943500383342465e-08, "loss": 1.7002, "step": 106400 }, { "epoch": 15.77037037037037, "grad_norm": 4.826274871826172, "learning_rate": 1.0270747694082206e-08, "loss": 1.7268, "step": 106450 }, { "epoch": 15.777777777777779, "grad_norm": 5.474734783172607, "learning_rate": 9.61932429811574e-09, "loss": 1.6723, "step": 106500 }, { "epoch": 15.785185185185185, "grad_norm": 4.7281694412231445, "learning_rate": 8.989231586311598e-09, "loss": 1.7157, "step": 106550 }, { "epoch": 15.792592592592593, "grad_norm": 5.0702314376831055, "learning_rate": 8.380470903995852e-09, "loss": 1.6829, "step": 106600 }, { "epoch": 15.8, "grad_norm": 6.055559158325195, "learning_rate": 7.793043550945456e-09, "loss": 1.7089, "step": 106650 }, { "epoch": 15.807407407407407, "grad_norm": 4.831183910369873, "learning_rate": 7.226950781390463e-09, "loss": 1.7252, "step": 106700 }, { "epoch": 15.814814814814815, "grad_norm": 7.790440559387207, "learning_rate": 6.682193804008475e-09, "loss": 1.6727, "step": 106750 }, { "epoch": 15.822222222222223, "grad_norm": 5.7263031005859375, "learning_rate": 6.1587737819224224e-09, "loss": 1.6554, "step": 106800 }, { "epoch": 15.829629629629629, "grad_norm": 5.707858562469482, "learning_rate": 5.656691832696126e-09, "loss": 1.6258, "step": 106850 }, { "epoch": 15.837037037037037, "grad_norm": 4.361595153808594, "learning_rate": 5.1759490283376235e-09, "loss": 1.6215, "step": 106900 }, { "epoch": 15.844444444444445, "grad_norm": 4.160728931427002, "learning_rate": 4.7165463952913996e-09, "loss": 1.7259, "step": 106950 }, { "epoch": 15.851851851851851, "grad_norm": 6.267629146575928, "learning_rate": 4.278484914437276e-09, "loss": 1.675, "step": 107000 }, { "epoch": 15.85925925925926, "grad_norm": 4.6345014572143555, "learning_rate": 3.8617655210915205e-09, "loss": 1.7014, "step": 107050 }, { "epoch": 15.866666666666667, "grad_norm": 5.403736591339111, "learning_rate": 3.4663891050001895e-09, "loss": 1.6819, "step": 107100 }, { "epoch": 15.874074074074073, "grad_norm": 4.492266654968262, "learning_rate": 3.0923565103402333e-09, "loss": 1.6867, "step": 107150 }, { "epoch": 15.881481481481481, "grad_norm": 5.615554332733154, "learning_rate": 2.739668535717277e-09, "loss": 1.6419, "step": 107200 }, { "epoch": 15.88888888888889, "grad_norm": 5.3445000648498535, "learning_rate": 2.408325934162292e-09, "loss": 1.7122, "step": 107250 }, { "epoch": 15.896296296296295, "grad_norm": 4.250411510467529, "learning_rate": 2.098329413133815e-09, "loss": 1.6285, "step": 107300 }, { "epoch": 15.903703703703703, "grad_norm": 5.699678421020508, "learning_rate": 1.8096796345112854e-09, "loss": 1.722, "step": 107350 }, { "epoch": 15.911111111111111, "grad_norm": 4.965419292449951, "learning_rate": 1.5423772145983785e-09, "loss": 1.6915, "step": 107400 }, { "epoch": 15.918518518518518, "grad_norm": 5.15787410736084, "learning_rate": 1.2964227241163418e-09, "loss": 1.7422, "step": 107450 }, { "epoch": 15.925925925925926, "grad_norm": 4.798497200012207, "learning_rate": 1.0718166882106585e-09, "loss": 1.6678, "step": 107500 }, { "epoch": 15.933333333333334, "grad_norm": 6.525084495544434, "learning_rate": 8.685595864399433e-10, "loss": 1.6567, "step": 107550 }, { "epoch": 15.940740740740742, "grad_norm": 4.747183799743652, "learning_rate": 6.866518527848254e-10, "loss": 1.6978, "step": 107600 }, { "epoch": 15.948148148148148, "grad_norm": 4.648836612701416, "learning_rate": 5.260938756401768e-10, "loss": 1.6565, "step": 107650 }, { "epoch": 15.955555555555556, "grad_norm": 4.3354268074035645, "learning_rate": 3.868859978173323e-10, "loss": 1.6512, "step": 107700 }, { "epoch": 15.962962962962964, "grad_norm": 8.2989501953125, "learning_rate": 2.6902851654075914e-10, "loss": 1.69, "step": 107750 }, { "epoch": 15.97037037037037, "grad_norm": 5.1592230796813965, "learning_rate": 1.7252168345249787e-10, "loss": 1.714, "step": 107800 }, { "epoch": 15.977777777777778, "grad_norm": 4.358281135559082, "learning_rate": 9.736570460439077e-11, "loss": 1.6542, "step": 107850 }, { "epoch": 15.985185185185186, "grad_norm": 5.035132884979248, "learning_rate": 4.3560740465853345e-11, "loss": 1.7706, "step": 107900 }, { "epoch": 15.992592592592592, "grad_norm": 4.129238128662109, "learning_rate": 1.1106905914992639e-11, "loss": 1.6808, "step": 107950 }, { "epoch": 16.0, "grad_norm": 5.48187255859375, "learning_rate": 4.270245268500617e-15, "loss": 1.7222, "step": 108000 } ], "logging_steps": 50, "max_steps": 108000, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3104265682349158e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }