{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9992228935207619,
"eval_steps": 5000,
"global_step": 415000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012038830042418818,
"grad_norm": 8.390454292297363,
"learning_rate": 5.910962367274216e-07,
"loss": 4.9044,
"step": 500
},
{
"epoch": 0.0024077660084837636,
"grad_norm": 12.908564567565918,
"learning_rate": 1.1930272313581972e-06,
"loss": 4.8297,
"step": 1000
},
{
"epoch": 0.0036116490127256454,
"grad_norm": 16.473039627075195,
"learning_rate": 1.7949582259889727e-06,
"loss": 4.7877,
"step": 1500
},
{
"epoch": 0.004815532016967527,
"grad_norm": 20.620445251464844,
"learning_rate": 2.3968892206197483e-06,
"loss": 4.6598,
"step": 2000
},
{
"epoch": 0.0060194150212094085,
"grad_norm": 17.269424438476562,
"learning_rate": 2.996412491272001e-06,
"loss": 4.3792,
"step": 2500
},
{
"epoch": 0.007223298025451291,
"grad_norm": 20.924165725708008,
"learning_rate": 3.598343485902776e-06,
"loss": 4.1505,
"step": 3000
},
{
"epoch": 0.008427181029693172,
"grad_norm": 22.36298179626465,
"learning_rate": 4.200274480533552e-06,
"loss": 3.8286,
"step": 3500
},
{
"epoch": 0.009631064033935054,
"grad_norm": 17.17203712463379,
"learning_rate": 4.802205475164327e-06,
"loss": 3.6786,
"step": 4000
},
{
"epoch": 0.010834947038176937,
"grad_norm": 21.19932746887207,
"learning_rate": 5.404136469795103e-06,
"loss": 3.5698,
"step": 4500
},
{
"epoch": 0.012038830042418817,
"grad_norm": 26.214515686035156,
"learning_rate": 6.0060674644258785e-06,
"loss": 3.5362,
"step": 5000
},
{
"epoch": 0.012038830042418817,
"eval_runtime": 6118.3437,
"eval_samples_per_second": 135.763,
"eval_steps_per_second": 33.941,
"step": 5000
},
{
"epoch": 0.0132427130466607,
"grad_norm": 20.36381721496582,
"learning_rate": 6.6079984590566535e-06,
"loss": 3.4738,
"step": 5500
},
{
"epoch": 0.014446596050902582,
"grad_norm": 27.98621940612793,
"learning_rate": 7.209929453687429e-06,
"loss": 3.4793,
"step": 6000
},
{
"epoch": 0.015650479055144464,
"grad_norm": 36.69669723510742,
"learning_rate": 7.810656586328943e-06,
"loss": 3.4226,
"step": 6500
},
{
"epoch": 0.016854362059386344,
"grad_norm": 19.70973014831543,
"learning_rate": 8.412587580959718e-06,
"loss": 3.3237,
"step": 7000
},
{
"epoch": 0.018058245063628225,
"grad_norm": 22.874879837036133,
"learning_rate": 9.014518575590495e-06,
"loss": 3.3729,
"step": 7500
},
{
"epoch": 0.01926212806787011,
"grad_norm": 27.921424865722656,
"learning_rate": 9.61644957022127e-06,
"loss": 3.3806,
"step": 8000
},
{
"epoch": 0.02046601107211199,
"grad_norm": 20.24401092529297,
"learning_rate": 1.0218380564852045e-05,
"loss": 3.3732,
"step": 8500
},
{
"epoch": 0.021669894076353873,
"grad_norm": 24.8074893951416,
"learning_rate": 1.0820311559482821e-05,
"loss": 3.3689,
"step": 9000
},
{
"epoch": 0.022873777080595754,
"grad_norm": 20.271554946899414,
"learning_rate": 1.1422242554113596e-05,
"loss": 3.3533,
"step": 9500
},
{
"epoch": 0.024077660084837634,
"grad_norm": 25.907699584960938,
"learning_rate": 1.2024173548744371e-05,
"loss": 3.3256,
"step": 10000
},
{
"epoch": 0.024077660084837634,
"eval_runtime": 5885.2151,
"eval_samples_per_second": 141.141,
"eval_steps_per_second": 35.285,
"step": 10000
},
{
"epoch": 0.025281543089079518,
"grad_norm": 27.258106231689453,
"learning_rate": 1.2624900681385886e-05,
"loss": 3.3775,
"step": 10500
},
{
"epoch": 0.0264854260933214,
"grad_norm": 34.32582473754883,
"learning_rate": 1.3226831676016663e-05,
"loss": 3.3426,
"step": 11000
},
{
"epoch": 0.02768930909756328,
"grad_norm": 32.09712600708008,
"learning_rate": 1.3828762670647438e-05,
"loss": 3.3584,
"step": 11500
},
{
"epoch": 0.028893192101805163,
"grad_norm": 21.120548248291016,
"learning_rate": 1.4430693665278213e-05,
"loss": 3.3117,
"step": 12000
},
{
"epoch": 0.030097075106047044,
"grad_norm": 21.870174407958984,
"learning_rate": 1.503262465990899e-05,
"loss": 3.3063,
"step": 12500
},
{
"epoch": 0.03130095811028893,
"grad_norm": 20.636926651000977,
"learning_rate": 1.56333517925505e-05,
"loss": 3.3181,
"step": 13000
},
{
"epoch": 0.03250484111453081,
"grad_norm": 17.365741729736328,
"learning_rate": 1.623528278718128e-05,
"loss": 3.2712,
"step": 13500
},
{
"epoch": 0.03370872411877269,
"grad_norm": 29.897491455078125,
"learning_rate": 1.683721378181205e-05,
"loss": 3.3179,
"step": 14000
},
{
"epoch": 0.03491260712301457,
"grad_norm": 21.453102111816406,
"learning_rate": 1.7439144776442828e-05,
"loss": 3.3506,
"step": 14500
},
{
"epoch": 0.03611649012725645,
"grad_norm": 27.832408905029297,
"learning_rate": 1.8041075771073605e-05,
"loss": 3.3097,
"step": 15000
},
{
"epoch": 0.03611649012725645,
"eval_runtime": 6111.9913,
"eval_samples_per_second": 135.904,
"eval_steps_per_second": 33.976,
"step": 15000
},
{
"epoch": 0.03732037313149834,
"grad_norm": 27.701480865478516,
"learning_rate": 1.864180290371512e-05,
"loss": 3.3058,
"step": 15500
},
{
"epoch": 0.03852425613574022,
"grad_norm": 28.425107955932617,
"learning_rate": 1.9243733898345895e-05,
"loss": 3.3129,
"step": 16000
},
{
"epoch": 0.0397281391399821,
"grad_norm": 20.096038818359375,
"learning_rate": 1.9845664892976672e-05,
"loss": 3.3126,
"step": 16500
},
{
"epoch": 0.04093202214422398,
"grad_norm": 32.19493103027344,
"learning_rate": 2.0447595887607445e-05,
"loss": 3.2431,
"step": 17000
},
{
"epoch": 0.04213590514846586,
"grad_norm": 27.442581176757812,
"learning_rate": 2.1049526882238222e-05,
"loss": 3.3009,
"step": 17500
},
{
"epoch": 0.043339788152707746,
"grad_norm": 23.221601486206055,
"learning_rate": 2.1650254014879736e-05,
"loss": 3.3023,
"step": 18000
},
{
"epoch": 0.04454367115694963,
"grad_norm": 20.713502883911133,
"learning_rate": 2.225218500951051e-05,
"loss": 3.2834,
"step": 18500
},
{
"epoch": 0.04574755416119151,
"grad_norm": 26.963842391967773,
"learning_rate": 2.2854116004141285e-05,
"loss": 3.2512,
"step": 19000
},
{
"epoch": 0.04695143716543339,
"grad_norm": 26.029918670654297,
"learning_rate": 2.3456046998772062e-05,
"loss": 3.2678,
"step": 19500
},
{
"epoch": 0.04815532016967527,
"grad_norm": 24.989070892333984,
"learning_rate": 2.405797799340284e-05,
"loss": 3.2962,
"step": 20000
},
{
"epoch": 0.04815532016967527,
"eval_runtime": 6189.8097,
"eval_samples_per_second": 134.196,
"eval_steps_per_second": 33.549,
"step": 20000
},
{
"epoch": 0.049359203173917156,
"grad_norm": 31.154277801513672,
"learning_rate": 2.4659908988033615e-05,
"loss": 3.2434,
"step": 20500
},
{
"epoch": 0.050563086178159036,
"grad_norm": 25.946931838989258,
"learning_rate": 2.526183998266439e-05,
"loss": 3.2261,
"step": 21000
},
{
"epoch": 0.05176696918240092,
"grad_norm": 18.157636642456055,
"learning_rate": 2.5863770977295165e-05,
"loss": 3.2844,
"step": 21500
},
{
"epoch": 0.0529708521866428,
"grad_norm": 21.17293930053711,
"learning_rate": 2.6464498109936682e-05,
"loss": 3.2154,
"step": 22000
},
{
"epoch": 0.05417473519088468,
"grad_norm": 26.187950134277344,
"learning_rate": 2.7066429104567452e-05,
"loss": 3.2815,
"step": 22500
},
{
"epoch": 0.05537861819512656,
"grad_norm": 27.726953506469727,
"learning_rate": 2.766715623720897e-05,
"loss": 3.2538,
"step": 23000
},
{
"epoch": 0.056582501199368446,
"grad_norm": 23.557273864746094,
"learning_rate": 2.8269087231839743e-05,
"loss": 3.2708,
"step": 23500
},
{
"epoch": 0.057786384203610326,
"grad_norm": 22.728757858276367,
"learning_rate": 2.8869814364481256e-05,
"loss": 3.2052,
"step": 24000
},
{
"epoch": 0.05899026720785221,
"grad_norm": 22.837238311767578,
"learning_rate": 2.9471745359112036e-05,
"loss": 3.2341,
"step": 24500
},
{
"epoch": 0.06019415021209409,
"grad_norm": 24.8745059967041,
"learning_rate": 3.0073676353742806e-05,
"loss": 3.2309,
"step": 25000
},
{
"epoch": 0.06019415021209409,
"eval_runtime": 6182.4703,
"eval_samples_per_second": 134.355,
"eval_steps_per_second": 33.589,
"step": 25000
},
{
"epoch": 0.06139803321633597,
"grad_norm": 26.50299835205078,
"learning_rate": 3.0675607348373586e-05,
"loss": 3.2289,
"step": 25500
},
{
"epoch": 0.06260191622057786,
"grad_norm": 26.083864212036133,
"learning_rate": 3.1277538343004356e-05,
"loss": 3.2321,
"step": 26000
},
{
"epoch": 0.06380579922481973,
"grad_norm": 34.2237434387207,
"learning_rate": 3.187946933763514e-05,
"loss": 3.2297,
"step": 26500
},
{
"epoch": 0.06500968222906162,
"grad_norm": 18.643739700317383,
"learning_rate": 3.248140033226591e-05,
"loss": 3.2801,
"step": 27000
},
{
"epoch": 0.0662135652333035,
"grad_norm": 16.629045486450195,
"learning_rate": 3.3083331326896686e-05,
"loss": 3.2924,
"step": 27500
},
{
"epoch": 0.06741744823754538,
"grad_norm": 24.145009994506836,
"learning_rate": 3.368526232152746e-05,
"loss": 3.208,
"step": 28000
},
{
"epoch": 0.06862133124178726,
"grad_norm": 27.239717483520508,
"learning_rate": 3.428719331615824e-05,
"loss": 3.2432,
"step": 28500
},
{
"epoch": 0.06982521424602914,
"grad_norm": 19.675600051879883,
"learning_rate": 3.488912431078901e-05,
"loss": 3.2389,
"step": 29000
},
{
"epoch": 0.07102909725027103,
"grad_norm": 26.076309204101562,
"learning_rate": 3.549105530541979e-05,
"loss": 3.2849,
"step": 29500
},
{
"epoch": 0.0722329802545129,
"grad_norm": 26.58043670654297,
"learning_rate": 3.60917824380613e-05,
"loss": 3.2597,
"step": 30000
},
{
"epoch": 0.0722329802545129,
"eval_runtime": 6142.9637,
"eval_samples_per_second": 135.219,
"eval_steps_per_second": 33.805,
"step": 30000
},
{
"epoch": 0.07343686325875479,
"grad_norm": 28.792818069458008,
"learning_rate": 3.6693713432692076e-05,
"loss": 3.2583,
"step": 30500
},
{
"epoch": 0.07464074626299667,
"grad_norm": 15.474958419799805,
"learning_rate": 3.72944405653336e-05,
"loss": 3.2516,
"step": 31000
},
{
"epoch": 0.07584462926723855,
"grad_norm": 22.783601760864258,
"learning_rate": 3.7896371559964367e-05,
"loss": 3.252,
"step": 31500
},
{
"epoch": 0.07704851227148043,
"grad_norm": 30.097503662109375,
"learning_rate": 3.849830255459514e-05,
"loss": 3.2607,
"step": 32000
},
{
"epoch": 0.07825239527572231,
"grad_norm": 16.432775497436523,
"learning_rate": 3.910023354922592e-05,
"loss": 3.2281,
"step": 32500
},
{
"epoch": 0.0794562782799642,
"grad_norm": 22.113264083862305,
"learning_rate": 3.9702164543856697e-05,
"loss": 3.1994,
"step": 33000
},
{
"epoch": 0.08066016128420608,
"grad_norm": 23.033098220825195,
"learning_rate": 4.0304095538487466e-05,
"loss": 3.2641,
"step": 33500
},
{
"epoch": 0.08186404428844796,
"grad_norm": 22.20917510986328,
"learning_rate": 4.090602653311825e-05,
"loss": 3.2382,
"step": 34000
},
{
"epoch": 0.08306792729268984,
"grad_norm": 29.848487854003906,
"learning_rate": 4.150795752774902e-05,
"loss": 3.2067,
"step": 34500
},
{
"epoch": 0.08427181029693172,
"grad_norm": 25.557289123535156,
"learning_rate": 4.2108684660390533e-05,
"loss": 3.1953,
"step": 35000
},
{
"epoch": 0.08427181029693172,
"eval_runtime": 6095.2446,
"eval_samples_per_second": 136.278,
"eval_steps_per_second": 34.07,
"step": 35000
},
{
"epoch": 0.0854756933011736,
"grad_norm": 20.05970001220703,
"learning_rate": 4.271061565502131e-05,
"loss": 3.2184,
"step": 35500
},
{
"epoch": 0.08667957630541549,
"grad_norm": 20.745208740234375,
"learning_rate": 4.3311342787662824e-05,
"loss": 3.2016,
"step": 36000
},
{
"epoch": 0.08788345930965737,
"grad_norm": 24.379274368286133,
"learning_rate": 4.39132737822936e-05,
"loss": 3.1851,
"step": 36500
},
{
"epoch": 0.08908734231389925,
"grad_norm": 14.577661514282227,
"learning_rate": 4.451520477692438e-05,
"loss": 3.2191,
"step": 37000
},
{
"epoch": 0.09029122531814113,
"grad_norm": 19.822919845581055,
"learning_rate": 4.5117135771555154e-05,
"loss": 3.1865,
"step": 37500
},
{
"epoch": 0.09149510832238301,
"grad_norm": 20.753366470336914,
"learning_rate": 4.5719066766185924e-05,
"loss": 3.238,
"step": 38000
},
{
"epoch": 0.0926989913266249,
"grad_norm": 16.455045700073242,
"learning_rate": 4.63209977608167e-05,
"loss": 3.2118,
"step": 38500
},
{
"epoch": 0.09390287433086678,
"grad_norm": 22.421308517456055,
"learning_rate": 4.692052103146896e-05,
"loss": 3.2192,
"step": 39000
},
{
"epoch": 0.09510675733510866,
"grad_norm": 18.18105125427246,
"learning_rate": 4.752245202609973e-05,
"loss": 3.2,
"step": 39500
},
{
"epoch": 0.09631064033935054,
"grad_norm": 14.840981483459473,
"learning_rate": 4.8124383020730504e-05,
"loss": 3.1897,
"step": 40000
},
{
"epoch": 0.09631064033935054,
"eval_runtime": 6142.3167,
"eval_samples_per_second": 135.233,
"eval_steps_per_second": 33.808,
"step": 40000
},
{
"epoch": 0.09751452334359242,
"grad_norm": 16.446571350097656,
"learning_rate": 4.872631401536128e-05,
"loss": 3.2723,
"step": 40500
},
{
"epoch": 0.09871840634783431,
"grad_norm": 32.157161712646484,
"learning_rate": 4.932824500999206e-05,
"loss": 3.1675,
"step": 41000
},
{
"epoch": 0.09992228935207619,
"grad_norm": 18.431787490844727,
"learning_rate": 4.993017600462283e-05,
"loss": 3.2205,
"step": 41500
},
{
"epoch": 0.10112617235631807,
"grad_norm": 21.929658889770508,
"learning_rate": 4.994087573470594e-05,
"loss": 3.1701,
"step": 42000
},
{
"epoch": 0.10233005536055995,
"grad_norm": 20.648868560791016,
"learning_rate": 4.987399308165837e-05,
"loss": 3.1764,
"step": 42500
},
{
"epoch": 0.10353393836480183,
"grad_norm": 20.37611198425293,
"learning_rate": 4.98071104286108e-05,
"loss": 3.1684,
"step": 43000
},
{
"epoch": 0.10473782136904371,
"grad_norm": 15.72383975982666,
"learning_rate": 4.974036154086932e-05,
"loss": 3.1698,
"step": 43500
},
{
"epoch": 0.1059417043732856,
"grad_norm": 17.074922561645508,
"learning_rate": 4.967347888782174e-05,
"loss": 3.1913,
"step": 44000
},
{
"epoch": 0.10714558737752748,
"grad_norm": 16.70379066467285,
"learning_rate": 4.9606596234774164e-05,
"loss": 3.2184,
"step": 44500
},
{
"epoch": 0.10834947038176936,
"grad_norm": 12.970040321350098,
"learning_rate": 4.9539713581726594e-05,
"loss": 3.1826,
"step": 45000
},
{
"epoch": 0.10834947038176936,
"eval_runtime": 6114.7803,
"eval_samples_per_second": 135.842,
"eval_steps_per_second": 33.961,
"step": 45000
},
{
"epoch": 0.10955335338601124,
"grad_norm": 17.909025192260742,
"learning_rate": 4.9472830928679016e-05,
"loss": 3.175,
"step": 45500
},
{
"epoch": 0.11075723639025312,
"grad_norm": 17.254976272583008,
"learning_rate": 4.9405948275631445e-05,
"loss": 3.1959,
"step": 46000
},
{
"epoch": 0.111961119394495,
"grad_norm": 17.655014038085938,
"learning_rate": 4.9339199387889964e-05,
"loss": 3.194,
"step": 46500
},
{
"epoch": 0.11316500239873689,
"grad_norm": 18.48583984375,
"learning_rate": 4.9272316734842386e-05,
"loss": 3.19,
"step": 47000
},
{
"epoch": 0.11436888540297876,
"grad_norm": 13.14960765838623,
"learning_rate": 4.920556784710091e-05,
"loss": 3.159,
"step": 47500
},
{
"epoch": 0.11557276840722065,
"grad_norm": 13.100486755371094,
"learning_rate": 4.913868519405333e-05,
"loss": 3.197,
"step": 48000
},
{
"epoch": 0.11677665141146253,
"grad_norm": 20.155548095703125,
"learning_rate": 4.9071802541005756e-05,
"loss": 3.192,
"step": 48500
},
{
"epoch": 0.11798053441570441,
"grad_norm": 16.038169860839844,
"learning_rate": 4.9004919887958185e-05,
"loss": 3.1575,
"step": 49000
},
{
"epoch": 0.1191844174199463,
"grad_norm": 22.394641876220703,
"learning_rate": 4.893803723491061e-05,
"loss": 3.1787,
"step": 49500
},
{
"epoch": 0.12038830042418817,
"grad_norm": 18.06816864013672,
"learning_rate": 4.887115458186303e-05,
"loss": 3.138,
"step": 50000
},
{
"epoch": 0.12038830042418817,
"eval_runtime": 6098.0137,
"eval_samples_per_second": 136.216,
"eval_steps_per_second": 34.054,
"step": 50000
},
{
"epoch": 0.12159218342843006,
"grad_norm": 25.612211227416992,
"learning_rate": 4.880427192881546e-05,
"loss": 3.1291,
"step": 50500
},
{
"epoch": 0.12279606643267194,
"grad_norm": 23.026065826416016,
"learning_rate": 4.873738927576788e-05,
"loss": 3.1429,
"step": 51000
},
{
"epoch": 0.12399994943691382,
"grad_norm": 17.348302841186523,
"learning_rate": 4.86706403880264e-05,
"loss": 3.208,
"step": 51500
},
{
"epoch": 0.1252038324411557,
"grad_norm": 25.699726104736328,
"learning_rate": 4.860375773497883e-05,
"loss": 3.1744,
"step": 52000
},
{
"epoch": 0.12640771544539758,
"grad_norm": 20.069185256958008,
"learning_rate": 4.853687508193125e-05,
"loss": 3.1429,
"step": 52500
},
{
"epoch": 0.12761159844963946,
"grad_norm": 15.095413208007812,
"learning_rate": 4.8469992428883674e-05,
"loss": 3.1767,
"step": 53000
},
{
"epoch": 0.12881548145388136,
"grad_norm": 12.854238510131836,
"learning_rate": 4.84031097758361e-05,
"loss": 3.1726,
"step": 53500
},
{
"epoch": 0.13001936445812323,
"grad_norm": 18.715251922607422,
"learning_rate": 4.833622712278853e-05,
"loss": 3.1745,
"step": 54000
},
{
"epoch": 0.1312232474623651,
"grad_norm": 16.41513442993164,
"learning_rate": 4.8269344469740955e-05,
"loss": 3.163,
"step": 54500
},
{
"epoch": 0.132427130466607,
"grad_norm": 16.754322052001953,
"learning_rate": 4.820246181669338e-05,
"loss": 3.2186,
"step": 55000
},
{
"epoch": 0.132427130466607,
"eval_runtime": 6080.0844,
"eval_samples_per_second": 136.617,
"eval_steps_per_second": 34.154,
"step": 55000
},
{
"epoch": 0.13363101347084888,
"grad_norm": 21.774137496948242,
"learning_rate": 4.8135712928951896e-05,
"loss": 3.1601,
"step": 55500
},
{
"epoch": 0.13483489647509075,
"grad_norm": 15.781363487243652,
"learning_rate": 4.8068830275904325e-05,
"loss": 3.1762,
"step": 56000
},
{
"epoch": 0.13603877947933263,
"grad_norm": 15.000273704528809,
"learning_rate": 4.800194762285675e-05,
"loss": 3.1576,
"step": 56500
},
{
"epoch": 0.13724266248357453,
"grad_norm": 16.367721557617188,
"learning_rate": 4.7935064969809176e-05,
"loss": 3.1373,
"step": 57000
},
{
"epoch": 0.1384465454878164,
"grad_norm": 14.089083671569824,
"learning_rate": 4.7868316082067695e-05,
"loss": 3.1886,
"step": 57500
},
{
"epoch": 0.13965042849205828,
"grad_norm": 15.01375675201416,
"learning_rate": 4.780143342902012e-05,
"loss": 3.1041,
"step": 58000
},
{
"epoch": 0.14085431149630018,
"grad_norm": 15.857519149780273,
"learning_rate": 4.773455077597254e-05,
"loss": 3.157,
"step": 58500
},
{
"epoch": 0.14205819450054205,
"grad_norm": 15.649994850158691,
"learning_rate": 4.7667801888231065e-05,
"loss": 3.1307,
"step": 59000
},
{
"epoch": 0.14326207750478392,
"grad_norm": 17.55912208557129,
"learning_rate": 4.760091923518349e-05,
"loss": 3.1491,
"step": 59500
},
{
"epoch": 0.1444659605090258,
"grad_norm": 39.259796142578125,
"learning_rate": 4.753403658213592e-05,
"loss": 3.156,
"step": 60000
},
{
"epoch": 0.1444659605090258,
"eval_runtime": 6118.114,
"eval_samples_per_second": 135.768,
"eval_steps_per_second": 33.942,
"step": 60000
},
{
"epoch": 0.1456698435132677,
"grad_norm": 14.720120429992676,
"learning_rate": 4.746715392908834e-05,
"loss": 3.1288,
"step": 60500
},
{
"epoch": 0.14687372651750957,
"grad_norm": 17.65592384338379,
"learning_rate": 4.740027127604076e-05,
"loss": 3.1324,
"step": 61000
},
{
"epoch": 0.14807760952175145,
"grad_norm": 22.232887268066406,
"learning_rate": 4.7333388622993184e-05,
"loss": 3.1047,
"step": 61500
},
{
"epoch": 0.14928149252599335,
"grad_norm": 13.700913429260254,
"learning_rate": 4.726650596994561e-05,
"loss": 3.1472,
"step": 62000
},
{
"epoch": 0.15048537553023522,
"grad_norm": 12.120200157165527,
"learning_rate": 4.719962331689804e-05,
"loss": 3.1314,
"step": 62500
},
{
"epoch": 0.1516892585344771,
"grad_norm": 18.1831111907959,
"learning_rate": 4.713287442915656e-05,
"loss": 3.1473,
"step": 63000
},
{
"epoch": 0.152893141538719,
"grad_norm": 16.345584869384766,
"learning_rate": 4.706599177610898e-05,
"loss": 3.0957,
"step": 63500
},
{
"epoch": 0.15409702454296087,
"grad_norm": 12.382160186767578,
"learning_rate": 4.6999109123061405e-05,
"loss": 3.1352,
"step": 64000
},
{
"epoch": 0.15530090754720274,
"grad_norm": 11.769241333007812,
"learning_rate": 4.6932226470013834e-05,
"loss": 3.1241,
"step": 64500
},
{
"epoch": 0.15650479055144462,
"grad_norm": 16.600685119628906,
"learning_rate": 4.686547758227235e-05,
"loss": 3.1001,
"step": 65000
},
{
"epoch": 0.15650479055144462,
"eval_runtime": 6099.2,
"eval_samples_per_second": 136.189,
"eval_steps_per_second": 34.047,
"step": 65000
},
{
"epoch": 0.15770867355568652,
"grad_norm": 27.690532684326172,
"learning_rate": 4.6798594929224775e-05,
"loss": 3.1427,
"step": 65500
},
{
"epoch": 0.1589125565599284,
"grad_norm": 16.452138900756836,
"learning_rate": 4.6731712276177205e-05,
"loss": 3.1489,
"step": 66000
},
{
"epoch": 0.16011643956417027,
"grad_norm": 22.97121238708496,
"learning_rate": 4.666482962312963e-05,
"loss": 3.1554,
"step": 66500
},
{
"epoch": 0.16132032256841217,
"grad_norm": 17.040058135986328,
"learning_rate": 4.659808073538815e-05,
"loss": 3.1619,
"step": 67000
},
{
"epoch": 0.16252420557265404,
"grad_norm": 15.484249114990234,
"learning_rate": 4.653119808234057e-05,
"loss": 3.1277,
"step": 67500
},
{
"epoch": 0.1637280885768959,
"grad_norm": 14.11099910736084,
"learning_rate": 4.6464315429293e-05,
"loss": 3.1736,
"step": 68000
},
{
"epoch": 0.16493197158113782,
"grad_norm": 13.42932415008545,
"learning_rate": 4.6397432776245426e-05,
"loss": 3.1114,
"step": 68500
},
{
"epoch": 0.1661358545853797,
"grad_norm": 14.439802169799805,
"learning_rate": 4.633055012319785e-05,
"loss": 3.1526,
"step": 69000
},
{
"epoch": 0.16733973758962156,
"grad_norm": 19.759899139404297,
"learning_rate": 4.626366747015027e-05,
"loss": 3.13,
"step": 69500
},
{
"epoch": 0.16854362059386344,
"grad_norm": 14.265944480895996,
"learning_rate": 4.61967848171027e-05,
"loss": 3.1197,
"step": 70000
},
{
"epoch": 0.16854362059386344,
"eval_runtime": 6159.4194,
"eval_samples_per_second": 134.858,
"eval_steps_per_second": 33.715,
"step": 70000
},
{
"epoch": 0.16974750359810534,
"grad_norm": 14.379167556762695,
"learning_rate": 4.612990216405512e-05,
"loss": 3.1587,
"step": 70500
},
{
"epoch": 0.1709513866023472,
"grad_norm": 16.7622013092041,
"learning_rate": 4.606315327631364e-05,
"loss": 3.1265,
"step": 71000
},
{
"epoch": 0.17215526960658908,
"grad_norm": 16.946565628051758,
"learning_rate": 4.599627062326607e-05,
"loss": 3.1666,
"step": 71500
},
{
"epoch": 0.17335915261083099,
"grad_norm": 15.118630409240723,
"learning_rate": 4.592952173552459e-05,
"loss": 3.1067,
"step": 72000
},
{
"epoch": 0.17456303561507286,
"grad_norm": 13.015983581542969,
"learning_rate": 4.586263908247702e-05,
"loss": 3.0988,
"step": 72500
},
{
"epoch": 0.17576691861931473,
"grad_norm": 16.574451446533203,
"learning_rate": 4.579575642942944e-05,
"loss": 3.1373,
"step": 73000
},
{
"epoch": 0.1769708016235566,
"grad_norm": 13.634519577026367,
"learning_rate": 4.572887377638186e-05,
"loss": 3.1362,
"step": 73500
},
{
"epoch": 0.1781746846277985,
"grad_norm": 19.195165634155273,
"learning_rate": 4.566199112333429e-05,
"loss": 3.0999,
"step": 74000
},
{
"epoch": 0.17937856763204038,
"grad_norm": 16.080059051513672,
"learning_rate": 4.5595108470286714e-05,
"loss": 3.1615,
"step": 74500
},
{
"epoch": 0.18058245063628225,
"grad_norm": 17.560720443725586,
"learning_rate": 4.5528225817239137e-05,
"loss": 3.1018,
"step": 75000
},
{
"epoch": 0.18058245063628225,
"eval_runtime": 6167.2803,
"eval_samples_per_second": 134.686,
"eval_steps_per_second": 33.672,
"step": 75000
},
{
"epoch": 0.18178633364052416,
"grad_norm": 14.580060005187988,
"learning_rate": 4.5461343164191566e-05,
"loss": 3.1417,
"step": 75500
},
{
"epoch": 0.18299021664476603,
"grad_norm": 20.411680221557617,
"learning_rate": 4.5394594276450084e-05,
"loss": 3.1915,
"step": 76000
},
{
"epoch": 0.1841940996490079,
"grad_norm": 24.153568267822266,
"learning_rate": 4.53278453887086e-05,
"loss": 3.129,
"step": 76500
},
{
"epoch": 0.1853979826532498,
"grad_norm": 20.700895309448242,
"learning_rate": 4.5260962735661025e-05,
"loss": 3.1237,
"step": 77000
},
{
"epoch": 0.18660186565749168,
"grad_norm": 19.473398208618164,
"learning_rate": 4.5194080082613455e-05,
"loss": 3.1593,
"step": 77500
},
{
"epoch": 0.18780574866173355,
"grad_norm": 16.656599044799805,
"learning_rate": 4.5127197429565884e-05,
"loss": 3.1273,
"step": 78000
},
{
"epoch": 0.18900963166597542,
"grad_norm": 17.83378028869629,
"learning_rate": 4.5060314776518306e-05,
"loss": 3.1454,
"step": 78500
},
{
"epoch": 0.19021351467021733,
"grad_norm": 12.069873809814453,
"learning_rate": 4.499343212347073e-05,
"loss": 3.1567,
"step": 79000
},
{
"epoch": 0.1914173976744592,
"grad_norm": 21.635231018066406,
"learning_rate": 4.492654947042316e-05,
"loss": 3.1063,
"step": 79500
},
{
"epoch": 0.19262128067870107,
"grad_norm": 12.80612564086914,
"learning_rate": 4.4859800582681676e-05,
"loss": 3.1169,
"step": 80000
},
{
"epoch": 0.19262128067870107,
"eval_runtime": 6089.7621,
"eval_samples_per_second": 136.4,
"eval_steps_per_second": 34.1,
"step": 80000
},
{
"epoch": 0.19382516368294297,
"grad_norm": 15.964377403259277,
"learning_rate": 4.47929179296341e-05,
"loss": 3.1736,
"step": 80500
},
{
"epoch": 0.19502904668718485,
"grad_norm": 33.44254684448242,
"learning_rate": 4.472603527658653e-05,
"loss": 3.1525,
"step": 81000
},
{
"epoch": 0.19623292969142672,
"grad_norm": 13.991809844970703,
"learning_rate": 4.465915262353895e-05,
"loss": 3.1492,
"step": 81500
},
{
"epoch": 0.19743681269566862,
"grad_norm": 12.851255416870117,
"learning_rate": 4.4592403735797475e-05,
"loss": 3.1153,
"step": 82000
},
{
"epoch": 0.1986406956999105,
"grad_norm": 17.928274154663086,
"learning_rate": 4.452552108274989e-05,
"loss": 3.1518,
"step": 82500
},
{
"epoch": 0.19984457870415237,
"grad_norm": 12.124229431152344,
"learning_rate": 4.445863842970232e-05,
"loss": 3.1087,
"step": 83000
},
{
"epoch": 0.20104846170839424,
"grad_norm": 15.766402244567871,
"learning_rate": 4.439175577665475e-05,
"loss": 3.1327,
"step": 83500
},
{
"epoch": 0.20225234471263615,
"grad_norm": 16.555757522583008,
"learning_rate": 4.432487312360717e-05,
"loss": 3.0765,
"step": 84000
},
{
"epoch": 0.20345622771687802,
"grad_norm": 19.65941619873047,
"learning_rate": 4.4257990470559594e-05,
"loss": 3.1393,
"step": 84500
},
{
"epoch": 0.2046601107211199,
"grad_norm": 16.987285614013672,
"learning_rate": 4.419110781751202e-05,
"loss": 3.1145,
"step": 85000
},
{
"epoch": 0.2046601107211199,
"eval_runtime": 6166.7962,
"eval_samples_per_second": 134.696,
"eval_steps_per_second": 33.674,
"step": 85000
},
{
"epoch": 0.2058639937253618,
"grad_norm": 14.441193580627441,
"learning_rate": 4.4124225164464445e-05,
"loss": 3.1662,
"step": 85500
},
{
"epoch": 0.20706787672960367,
"grad_norm": 18.12236976623535,
"learning_rate": 4.4057476276722964e-05,
"loss": 3.0565,
"step": 86000
},
{
"epoch": 0.20827175973384554,
"grad_norm": 12.500991821289062,
"learning_rate": 4.399059362367539e-05,
"loss": 3.1047,
"step": 86500
},
{
"epoch": 0.20947564273808741,
"grad_norm": 16.244428634643555,
"learning_rate": 4.3923710970627816e-05,
"loss": 3.0893,
"step": 87000
},
{
"epoch": 0.21067952574232932,
"grad_norm": 21.911731719970703,
"learning_rate": 4.385682831758024e-05,
"loss": 3.0743,
"step": 87500
},
{
"epoch": 0.2118834087465712,
"grad_norm": 16.75537109375,
"learning_rate": 4.378994566453267e-05,
"loss": 3.1221,
"step": 88000
},
{
"epoch": 0.21308729175081306,
"grad_norm": 20.14570426940918,
"learning_rate": 4.3723063011485096e-05,
"loss": 3.1413,
"step": 88500
},
{
"epoch": 0.21429117475505496,
"grad_norm": 14.766070365905762,
"learning_rate": 4.365618035843751e-05,
"loss": 3.0955,
"step": 89000
},
{
"epoch": 0.21549505775929684,
"grad_norm": 17.830801010131836,
"learning_rate": 4.358929770538994e-05,
"loss": 3.1517,
"step": 89500
},
{
"epoch": 0.2166989407635387,
"grad_norm": 10.205118179321289,
"learning_rate": 4.352254881764846e-05,
"loss": 3.1332,
"step": 90000
},
{
"epoch": 0.2166989407635387,
"eval_runtime": 6149.5749,
"eval_samples_per_second": 135.074,
"eval_steps_per_second": 33.769,
"step": 90000
},
{
"epoch": 0.2179028237677806,
"grad_norm": 16.20384979248047,
"learning_rate": 4.345566616460089e-05,
"loss": 3.1003,
"step": 90500
},
{
"epoch": 0.21910670677202249,
"grad_norm": 17.35607147216797,
"learning_rate": 4.338878351155331e-05,
"loss": 3.1193,
"step": 91000
},
{
"epoch": 0.22031058977626436,
"grad_norm": 17.914997100830078,
"learning_rate": 4.332190085850574e-05,
"loss": 3.0944,
"step": 91500
},
{
"epoch": 0.22151447278050623,
"grad_norm": 23.45078468322754,
"learning_rate": 4.325515197076426e-05,
"loss": 3.1518,
"step": 92000
},
{
"epoch": 0.22271835578474813,
"grad_norm": 19.160053253173828,
"learning_rate": 4.318826931771668e-05,
"loss": 3.1144,
"step": 92500
},
{
"epoch": 0.22392223878899,
"grad_norm": 16.796180725097656,
"learning_rate": 4.312152042997521e-05,
"loss": 3.1354,
"step": 93000
},
{
"epoch": 0.22512612179323188,
"grad_norm": 13.598986625671387,
"learning_rate": 4.305463777692762e-05,
"loss": 3.0675,
"step": 93500
},
{
"epoch": 0.22633000479747378,
"grad_norm": 16.168975830078125,
"learning_rate": 4.298775512388005e-05,
"loss": 3.1065,
"step": 94000
},
{
"epoch": 0.22753388780171566,
"grad_norm": 22.480331420898438,
"learning_rate": 4.292087247083248e-05,
"loss": 3.1201,
"step": 94500
},
{
"epoch": 0.22873777080595753,
"grad_norm": 16.593976974487305,
"learning_rate": 4.28539898177849e-05,
"loss": 3.1264,
"step": 95000
},
{
"epoch": 0.22873777080595753,
"eval_runtime": 6100.46,
"eval_samples_per_second": 136.161,
"eval_steps_per_second": 34.04,
"step": 95000
},
{
"epoch": 0.2299416538101994,
"grad_norm": 14.308032989501953,
"learning_rate": 4.2787107164737325e-05,
"loss": 3.1278,
"step": 95500
},
{
"epoch": 0.2311455368144413,
"grad_norm": 13.68152141571045,
"learning_rate": 4.272035827699585e-05,
"loss": 3.1079,
"step": 96000
},
{
"epoch": 0.23234941981868318,
"grad_norm": 15.30040454864502,
"learning_rate": 4.265347562394827e-05,
"loss": 3.0765,
"step": 96500
},
{
"epoch": 0.23355330282292505,
"grad_norm": 17.36260223388672,
"learning_rate": 4.2586592970900695e-05,
"loss": 3.0966,
"step": 97000
},
{
"epoch": 0.23475718582716695,
"grad_norm": 16.50679588317871,
"learning_rate": 4.2519710317853125e-05,
"loss": 3.1462,
"step": 97500
},
{
"epoch": 0.23596106883140883,
"grad_norm": 15.678003311157227,
"learning_rate": 4.245282766480555e-05,
"loss": 3.1434,
"step": 98000
},
{
"epoch": 0.2371649518356507,
"grad_norm": 14.652356147766113,
"learning_rate": 4.238594501175797e-05,
"loss": 3.1461,
"step": 98500
},
{
"epoch": 0.2383688348398926,
"grad_norm": 13.707479476928711,
"learning_rate": 4.23190623587104e-05,
"loss": 3.0894,
"step": 99000
},
{
"epoch": 0.23957271784413448,
"grad_norm": 22.13295555114746,
"learning_rate": 4.225217970566282e-05,
"loss": 3.1317,
"step": 99500
},
{
"epoch": 0.24077660084837635,
"grad_norm": 14.54344367980957,
"learning_rate": 4.2185430817921346e-05,
"loss": 3.1209,
"step": 100000
},
{
"epoch": 0.24077660084837635,
"eval_runtime": 6233.0245,
"eval_samples_per_second": 133.265,
"eval_steps_per_second": 33.316,
"step": 100000
},
{
"epoch": 0.24198048385261822,
"grad_norm": 16.891630172729492,
"learning_rate": 4.211854816487377e-05,
"loss": 3.1261,
"step": 100500
},
{
"epoch": 0.24318436685686012,
"grad_norm": 17.46337127685547,
"learning_rate": 4.205166551182619e-05,
"loss": 3.1625,
"step": 101000
},
{
"epoch": 0.244388249861102,
"grad_norm": 14.349138259887695,
"learning_rate": 4.1984916624084716e-05,
"loss": 3.0834,
"step": 101500
},
{
"epoch": 0.24559213286534387,
"grad_norm": 18.939817428588867,
"learning_rate": 4.191803397103714e-05,
"loss": 3.1521,
"step": 102000
},
{
"epoch": 0.24679601586958577,
"grad_norm": 16.54868507385254,
"learning_rate": 4.185115131798956e-05,
"loss": 3.0694,
"step": 102500
},
{
"epoch": 0.24799989887382765,
"grad_norm": 14.203706741333008,
"learning_rate": 4.178426866494199e-05,
"loss": 3.1201,
"step": 103000
},
{
"epoch": 0.24920378187806952,
"grad_norm": 14.797431945800781,
"learning_rate": 4.171738601189441e-05,
"loss": 3.1252,
"step": 103500
},
{
"epoch": 0.2504076648823114,
"grad_norm": 14.449517250061035,
"learning_rate": 4.165063712415294e-05,
"loss": 3.0932,
"step": 104000
},
{
"epoch": 0.25161154788655327,
"grad_norm": 17.101430892944336,
"learning_rate": 4.158375447110536e-05,
"loss": 3.1127,
"step": 104500
},
{
"epoch": 0.25281543089079517,
"grad_norm": 20.582412719726562,
"learning_rate": 4.151700558336388e-05,
"loss": 3.0675,
"step": 105000
},
{
"epoch": 0.25281543089079517,
"eval_runtime": 6156.9182,
"eval_samples_per_second": 134.912,
"eval_steps_per_second": 33.728,
"step": 105000
},
{
"epoch": 0.25401931389503707,
"grad_norm": 14.351494789123535,
"learning_rate": 4.14501229303163e-05,
"loss": 3.0845,
"step": 105500
},
{
"epoch": 0.2552231968992789,
"grad_norm": 11.951766967773438,
"learning_rate": 4.138324027726873e-05,
"loss": 3.0907,
"step": 106000
},
{
"epoch": 0.2564270799035208,
"grad_norm": 13.831068992614746,
"learning_rate": 4.131635762422115e-05,
"loss": 3.1139,
"step": 106500
},
{
"epoch": 0.2576309629077627,
"grad_norm": 16.089948654174805,
"learning_rate": 4.124947497117358e-05,
"loss": 3.085,
"step": 107000
},
{
"epoch": 0.25883484591200456,
"grad_norm": 16.427217483520508,
"learning_rate": 4.1182592318126004e-05,
"loss": 3.1444,
"step": 107500
},
{
"epoch": 0.26003872891624646,
"grad_norm": 16.443748474121094,
"learning_rate": 4.111570966507843e-05,
"loss": 3.1197,
"step": 108000
},
{
"epoch": 0.26124261192048837,
"grad_norm": 12.318251609802246,
"learning_rate": 4.1048827012030856e-05,
"loss": 3.0734,
"step": 108500
},
{
"epoch": 0.2624464949247302,
"grad_norm": 13.695268630981445,
"learning_rate": 4.098194435898328e-05,
"loss": 3.1275,
"step": 109000
},
{
"epoch": 0.2636503779289721,
"grad_norm": 15.07443904876709,
"learning_rate": 4.09151954712418e-05,
"loss": 3.097,
"step": 109500
},
{
"epoch": 0.264854260933214,
"grad_norm": 15.240448951721191,
"learning_rate": 4.0848312818194226e-05,
"loss": 3.088,
"step": 110000
},
{
"epoch": 0.264854260933214,
"eval_runtime": 6153.3584,
"eval_samples_per_second": 134.991,
"eval_steps_per_second": 33.748,
"step": 110000
},
{
"epoch": 0.26605814393745586,
"grad_norm": 13.12667179107666,
"learning_rate": 4.078250028759541e-05,
"loss": 3.0962,
"step": 110500
},
{
"epoch": 0.26726202694169776,
"grad_norm": 17.520675659179688,
"learning_rate": 4.0715617634547834e-05,
"loss": 3.0786,
"step": 111000
},
{
"epoch": 0.2684659099459396,
"grad_norm": 27.284038543701172,
"learning_rate": 4.064886874680636e-05,
"loss": 3.1162,
"step": 111500
},
{
"epoch": 0.2696697929501815,
"grad_norm": 12.623812675476074,
"learning_rate": 4.0581986093758775e-05,
"loss": 3.0993,
"step": 112000
},
{
"epoch": 0.2708736759544234,
"grad_norm": 14.702446937561035,
"learning_rate": 4.0515103440711204e-05,
"loss": 3.0733,
"step": 112500
},
{
"epoch": 0.27207755895866526,
"grad_norm": 16.056833267211914,
"learning_rate": 4.0448220787663634e-05,
"loss": 3.0788,
"step": 113000
},
{
"epoch": 0.27328144196290716,
"grad_norm": 12.753098487854004,
"learning_rate": 4.038147189992215e-05,
"loss": 3.0991,
"step": 113500
},
{
"epoch": 0.27448532496714906,
"grad_norm": 13.137269020080566,
"learning_rate": 4.0314589246874575e-05,
"loss": 3.0871,
"step": 114000
},
{
"epoch": 0.2756892079713909,
"grad_norm": 15.072389602661133,
"learning_rate": 4.0247706593827004e-05,
"loss": 3.115,
"step": 114500
},
{
"epoch": 0.2768930909756328,
"grad_norm": 15.979447364807129,
"learning_rate": 4.0180823940779426e-05,
"loss": 3.1002,
"step": 115000
},
{
"epoch": 0.2768930909756328,
"eval_runtime": 6179.6049,
"eval_samples_per_second": 134.417,
"eval_steps_per_second": 33.604,
"step": 115000
},
{
"epoch": 0.2780969739798747,
"grad_norm": 13.973761558532715,
"learning_rate": 4.011394128773185e-05,
"loss": 3.0706,
"step": 115500
},
{
"epoch": 0.27930085698411655,
"grad_norm": 16.156885147094727,
"learning_rate": 4.004705863468428e-05,
"loss": 3.0595,
"step": 116000
},
{
"epoch": 0.28050473998835845,
"grad_norm": 14.320749282836914,
"learning_rate": 3.99801759816367e-05,
"loss": 3.1083,
"step": 116500
},
{
"epoch": 0.28170862299260035,
"grad_norm": 13.002079010009766,
"learning_rate": 3.991329332858912e-05,
"loss": 3.0554,
"step": 117000
},
{
"epoch": 0.2829125059968422,
"grad_norm": 19.574172973632812,
"learning_rate": 3.984654444084764e-05,
"loss": 3.1074,
"step": 117500
},
{
"epoch": 0.2841163890010841,
"grad_norm": 12.356159210205078,
"learning_rate": 3.977966178780007e-05,
"loss": 3.1215,
"step": 118000
},
{
"epoch": 0.285320272005326,
"grad_norm": 17.327226638793945,
"learning_rate": 3.97127791347525e-05,
"loss": 3.047,
"step": 118500
},
{
"epoch": 0.28652415500956785,
"grad_norm": 16.561124801635742,
"learning_rate": 3.964589648170492e-05,
"loss": 3.1006,
"step": 119000
},
{
"epoch": 0.28772803801380975,
"grad_norm": 14.118390083312988,
"learning_rate": 3.9579013828657344e-05,
"loss": 3.08,
"step": 119500
},
{
"epoch": 0.2889319210180516,
"grad_norm": 15.130383491516113,
"learning_rate": 3.951213117560977e-05,
"loss": 3.0229,
"step": 120000
},
{
"epoch": 0.2889319210180516,
"eval_runtime": 6265.7809,
"eval_samples_per_second": 132.568,
"eval_steps_per_second": 33.142,
"step": 120000
},
{
"epoch": 0.2901358040222935,
"grad_norm": 20.27661895751953,
"learning_rate": 3.944538228786829e-05,
"loss": 3.0565,
"step": 120500
},
{
"epoch": 0.2913396870265354,
"grad_norm": 15.461856842041016,
"learning_rate": 3.9378499634820714e-05,
"loss": 3.0717,
"step": 121000
},
{
"epoch": 0.29254357003077724,
"grad_norm": 17.019287109375,
"learning_rate": 3.931161698177314e-05,
"loss": 3.1387,
"step": 121500
},
{
"epoch": 0.29374745303501915,
"grad_norm": 18.06890106201172,
"learning_rate": 3.9244734328725566e-05,
"loss": 3.1166,
"step": 122000
},
{
"epoch": 0.29495133603926105,
"grad_norm": 31.920703887939453,
"learning_rate": 3.917798544098409e-05,
"loss": 3.095,
"step": 122500
},
{
"epoch": 0.2961552190435029,
"grad_norm": 15.199366569519043,
"learning_rate": 3.9111102787936507e-05,
"loss": 3.0706,
"step": 123000
},
{
"epoch": 0.2973591020477448,
"grad_norm": 15.413779258728027,
"learning_rate": 3.9044220134888936e-05,
"loss": 3.121,
"step": 123500
},
{
"epoch": 0.2985629850519867,
"grad_norm": 14.4086275100708,
"learning_rate": 3.8977337481841365e-05,
"loss": 3.087,
"step": 124000
},
{
"epoch": 0.29976686805622854,
"grad_norm": 12.95889663696289,
"learning_rate": 3.891045482879379e-05,
"loss": 3.0934,
"step": 124500
},
{
"epoch": 0.30097075106047044,
"grad_norm": 19.025604248046875,
"learning_rate": 3.884357217574621e-05,
"loss": 3.1332,
"step": 125000
},
{
"epoch": 0.30097075106047044,
"eval_runtime": 6218.4719,
"eval_samples_per_second": 133.577,
"eval_steps_per_second": 33.394,
"step": 125000
},
{
"epoch": 0.30217463406471234,
"grad_norm": 14.700455665588379,
"learning_rate": 3.877668952269864e-05,
"loss": 3.0799,
"step": 125500
},
{
"epoch": 0.3033785170689542,
"grad_norm": 15.362942695617676,
"learning_rate": 3.870994063495716e-05,
"loss": 3.0551,
"step": 126000
},
{
"epoch": 0.3045824000731961,
"grad_norm": 18.218399047851562,
"learning_rate": 3.864305798190958e-05,
"loss": 3.0529,
"step": 126500
},
{
"epoch": 0.305786283077438,
"grad_norm": 18.461824417114258,
"learning_rate": 3.857617532886201e-05,
"loss": 3.1065,
"step": 127000
},
{
"epoch": 0.30699016608167984,
"grad_norm": 12.244810104370117,
"learning_rate": 3.850929267581443e-05,
"loss": 3.0844,
"step": 127500
},
{
"epoch": 0.30819404908592174,
"grad_norm": 20.86441993713379,
"learning_rate": 3.8442410022766854e-05,
"loss": 3.0551,
"step": 128000
},
{
"epoch": 0.30939793209016364,
"grad_norm": 16.215953826904297,
"learning_rate": 3.837552736971928e-05,
"loss": 3.0748,
"step": 128500
},
{
"epoch": 0.3106018150944055,
"grad_norm": 17.1651554107666,
"learning_rate": 3.8308644716671705e-05,
"loss": 3.144,
"step": 129000
},
{
"epoch": 0.3118056980986474,
"grad_norm": 22.377321243286133,
"learning_rate": 3.8241762063624134e-05,
"loss": 3.1162,
"step": 129500
},
{
"epoch": 0.31300958110288923,
"grad_norm": 21.55461883544922,
"learning_rate": 3.817501317588265e-05,
"loss": 3.1048,
"step": 130000
},
{
"epoch": 0.31300958110288923,
"eval_runtime": 6198.7963,
"eval_samples_per_second": 134.001,
"eval_steps_per_second": 33.5,
"step": 130000
},
{
"epoch": 0.31421346410713114,
"grad_norm": 17.96697425842285,
"learning_rate": 3.8108130522835075e-05,
"loss": 3.0576,
"step": 130500
},
{
"epoch": 0.31541734711137304,
"grad_norm": 15.112616539001465,
"learning_rate": 3.80412478697875e-05,
"loss": 3.1265,
"step": 131000
},
{
"epoch": 0.3166212301156149,
"grad_norm": 15.317338943481445,
"learning_rate": 3.797449898204602e-05,
"loss": 3.0716,
"step": 131500
},
{
"epoch": 0.3178251131198568,
"grad_norm": 14.246545791625977,
"learning_rate": 3.7907616328998445e-05,
"loss": 3.1111,
"step": 132000
},
{
"epoch": 0.3190289961240987,
"grad_norm": 14.737203598022461,
"learning_rate": 3.7840733675950874e-05,
"loss": 3.1051,
"step": 132500
},
{
"epoch": 0.32023287912834053,
"grad_norm": 16.053455352783203,
"learning_rate": 3.77738510229033e-05,
"loss": 3.0498,
"step": 133000
},
{
"epoch": 0.32143676213258243,
"grad_norm": 15.171459197998047,
"learning_rate": 3.770696836985572e-05,
"loss": 3.0535,
"step": 133500
},
{
"epoch": 0.32264064513682433,
"grad_norm": 23.735517501831055,
"learning_rate": 3.7640219482114245e-05,
"loss": 3.0349,
"step": 134000
},
{
"epoch": 0.3238445281410662,
"grad_norm": 13.836942672729492,
"learning_rate": 3.757333682906667e-05,
"loss": 3.0985,
"step": 134500
},
{
"epoch": 0.3250484111453081,
"grad_norm": 15.954339027404785,
"learning_rate": 3.750645417601909e-05,
"loss": 3.0927,
"step": 135000
},
{
"epoch": 0.3250484111453081,
"eval_runtime": 6258.0775,
"eval_samples_per_second": 132.732,
"eval_steps_per_second": 33.183,
"step": 135000
},
{
"epoch": 0.32625229414955,
"grad_norm": 23.13224983215332,
"learning_rate": 3.7439705288277615e-05,
"loss": 3.0961,
"step": 135500
},
{
"epoch": 0.3274561771537918,
"grad_norm": 11.840916633605957,
"learning_rate": 3.737282263523004e-05,
"loss": 3.0769,
"step": 136000
},
{
"epoch": 0.32866006015803373,
"grad_norm": 11.10158634185791,
"learning_rate": 3.7305939982182466e-05,
"loss": 3.0942,
"step": 136500
},
{
"epoch": 0.32986394316227563,
"grad_norm": 14.162835121154785,
"learning_rate": 3.723905732913489e-05,
"loss": 3.1289,
"step": 137000
},
{
"epoch": 0.3310678261665175,
"grad_norm": 23.765029907226562,
"learning_rate": 3.717217467608731e-05,
"loss": 3.0774,
"step": 137500
},
{
"epoch": 0.3322717091707594,
"grad_norm": 22.40215492248535,
"learning_rate": 3.710542578834583e-05,
"loss": 3.0886,
"step": 138000
},
{
"epoch": 0.3334755921750012,
"grad_norm": 16.616819381713867,
"learning_rate": 3.703854313529826e-05,
"loss": 3.102,
"step": 138500
},
{
"epoch": 0.3346794751792431,
"grad_norm": 19.094507217407227,
"learning_rate": 3.697166048225068e-05,
"loss": 3.1027,
"step": 139000
},
{
"epoch": 0.335883358183485,
"grad_norm": 20.761945724487305,
"learning_rate": 3.690477782920311e-05,
"loss": 3.0609,
"step": 139500
},
{
"epoch": 0.33708724118772687,
"grad_norm": 11.371627807617188,
"learning_rate": 3.683789517615553e-05,
"loss": 3.0916,
"step": 140000
},
{
"epoch": 0.33708724118772687,
"eval_runtime": 6174.667,
"eval_samples_per_second": 134.525,
"eval_steps_per_second": 33.631,
"step": 140000
},
{
"epoch": 0.3382911241919688,
"grad_norm": 15.36569881439209,
"learning_rate": 3.6771012523107955e-05,
"loss": 3.0964,
"step": 140500
},
{
"epoch": 0.3394950071962107,
"grad_norm": 19.703203201293945,
"learning_rate": 3.6704129870060384e-05,
"loss": 3.0631,
"step": 141000
},
{
"epoch": 0.3406988902004525,
"grad_norm": 23.92881965637207,
"learning_rate": 3.663724721701281e-05,
"loss": 3.0702,
"step": 141500
},
{
"epoch": 0.3419027732046944,
"grad_norm": 18.54579734802246,
"learning_rate": 3.657036456396523e-05,
"loss": 3.0732,
"step": 142000
},
{
"epoch": 0.3431066562089363,
"grad_norm": 13.281709671020508,
"learning_rate": 3.650348191091766e-05,
"loss": 3.0937,
"step": 142500
},
{
"epoch": 0.34431053921317817,
"grad_norm": 17.042314529418945,
"learning_rate": 3.6436733023176177e-05,
"loss": 3.0914,
"step": 143000
},
{
"epoch": 0.34551442221742007,
"grad_norm": 16.268789291381836,
"learning_rate": 3.6369850370128606e-05,
"loss": 3.0899,
"step": 143500
},
{
"epoch": 0.34671830522166197,
"grad_norm": 26.38330841064453,
"learning_rate": 3.630296771708103e-05,
"loss": 3.0666,
"step": 144000
},
{
"epoch": 0.3479221882259038,
"grad_norm": 14.961106300354004,
"learning_rate": 3.623608506403345e-05,
"loss": 3.069,
"step": 144500
},
{
"epoch": 0.3491260712301457,
"grad_norm": 12.415295600891113,
"learning_rate": 3.616920241098588e-05,
"loss": 3.0293,
"step": 145000
},
{
"epoch": 0.3491260712301457,
"eval_runtime": 6109.5629,
"eval_samples_per_second": 135.958,
"eval_steps_per_second": 33.99,
"step": 145000
},
{
"epoch": 0.3503299542343876,
"grad_norm": 16.554115295410156,
"learning_rate": 3.61024535232444e-05,
"loss": 3.0739,
"step": 145500
},
{
"epoch": 0.35153383723862947,
"grad_norm": 20.627267837524414,
"learning_rate": 3.603557087019682e-05,
"loss": 3.0799,
"step": 146000
},
{
"epoch": 0.35273772024287137,
"grad_norm": 15.106368064880371,
"learning_rate": 3.596868821714925e-05,
"loss": 3.0417,
"step": 146500
},
{
"epoch": 0.3539416032471132,
"grad_norm": 17.705570220947266,
"learning_rate": 3.590180556410168e-05,
"loss": 3.0896,
"step": 147000
},
{
"epoch": 0.3551454862513551,
"grad_norm": 16.01241683959961,
"learning_rate": 3.5834922911054094e-05,
"loss": 3.0729,
"step": 147500
},
{
"epoch": 0.356349369255597,
"grad_norm": 17.986221313476562,
"learning_rate": 3.576817402331262e-05,
"loss": 3.11,
"step": 148000
},
{
"epoch": 0.35755325225983886,
"grad_norm": 17.471803665161133,
"learning_rate": 3.570129137026504e-05,
"loss": 3.0968,
"step": 148500
},
{
"epoch": 0.35875713526408076,
"grad_norm": 16.683828353881836,
"learning_rate": 3.563440871721747e-05,
"loss": 3.0491,
"step": 149000
},
{
"epoch": 0.35996101826832266,
"grad_norm": 18.689273834228516,
"learning_rate": 3.5567526064169894e-05,
"loss": 3.0183,
"step": 149500
},
{
"epoch": 0.3611649012725645,
"grad_norm": 14.659083366394043,
"learning_rate": 3.550064341112232e-05,
"loss": 3.0965,
"step": 150000
},
{
"epoch": 0.3611649012725645,
"eval_runtime": 6228.4893,
"eval_samples_per_second": 133.362,
"eval_steps_per_second": 33.341,
"step": 150000
},
{
"epoch": 0.3623687842768064,
"grad_norm": 16.2710018157959,
"learning_rate": 3.5433760758074745e-05,
"loss": 3.1006,
"step": 150500
},
{
"epoch": 0.3635726672810483,
"grad_norm": 16.394590377807617,
"learning_rate": 3.5367011870333264e-05,
"loss": 3.0602,
"step": 151000
},
{
"epoch": 0.36477655028529016,
"grad_norm": 15.235190391540527,
"learning_rate": 3.5300129217285686e-05,
"loss": 3.0777,
"step": 151500
},
{
"epoch": 0.36598043328953206,
"grad_norm": 15.201708793640137,
"learning_rate": 3.5233246564238115e-05,
"loss": 3.0595,
"step": 152000
},
{
"epoch": 0.36718431629377396,
"grad_norm": 22.309728622436523,
"learning_rate": 3.5166363911190544e-05,
"loss": 3.0446,
"step": 152500
},
{
"epoch": 0.3683881992980158,
"grad_norm": 13.854850769042969,
"learning_rate": 3.509961502344906e-05,
"loss": 3.0665,
"step": 153000
},
{
"epoch": 0.3695920823022577,
"grad_norm": 14.474712371826172,
"learning_rate": 3.5032732370401485e-05,
"loss": 3.1098,
"step": 153500
},
{
"epoch": 0.3707959653064996,
"grad_norm": 13.207783699035645,
"learning_rate": 3.496584971735391e-05,
"loss": 3.1007,
"step": 154000
},
{
"epoch": 0.37199984831074145,
"grad_norm": 13.456844329833984,
"learning_rate": 3.489896706430634e-05,
"loss": 3.0957,
"step": 154500
},
{
"epoch": 0.37320373131498336,
"grad_norm": 17.590436935424805,
"learning_rate": 3.483208441125876e-05,
"loss": 3.0295,
"step": 155000
},
{
"epoch": 0.37320373131498336,
"eval_runtime": 6177.0488,
"eval_samples_per_second": 134.473,
"eval_steps_per_second": 33.618,
"step": 155000
},
{
"epoch": 0.3744076143192252,
"grad_norm": 12.911888122558594,
"learning_rate": 3.476520175821119e-05,
"loss": 3.0661,
"step": 155500
},
{
"epoch": 0.3756114973234671,
"grad_norm": 14.606691360473633,
"learning_rate": 3.469831910516361e-05,
"loss": 3.0804,
"step": 156000
},
{
"epoch": 0.376815380327709,
"grad_norm": 18.043087005615234,
"learning_rate": 3.463143645211603e-05,
"loss": 3.1359,
"step": 156500
},
{
"epoch": 0.37801926333195085,
"grad_norm": 15.033346176147461,
"learning_rate": 3.456468756437455e-05,
"loss": 2.9907,
"step": 157000
},
{
"epoch": 0.37922314633619275,
"grad_norm": 17.020784378051758,
"learning_rate": 3.449780491132698e-05,
"loss": 3.0606,
"step": 157500
},
{
"epoch": 0.38042702934043465,
"grad_norm": 22.74751091003418,
"learning_rate": 3.44310560235855e-05,
"loss": 3.1285,
"step": 158000
},
{
"epoch": 0.3816309123446765,
"grad_norm": 14.052987098693848,
"learning_rate": 3.436417337053793e-05,
"loss": 3.0703,
"step": 158500
},
{
"epoch": 0.3828347953489184,
"grad_norm": 22.046268463134766,
"learning_rate": 3.429729071749035e-05,
"loss": 3.0916,
"step": 159000
},
{
"epoch": 0.3840386783531603,
"grad_norm": 23.049739837646484,
"learning_rate": 3.4230408064442773e-05,
"loss": 3.0909,
"step": 159500
},
{
"epoch": 0.38524256135740215,
"grad_norm": 15.563003540039062,
"learning_rate": 3.41635254113952e-05,
"loss": 3.051,
"step": 160000
},
{
"epoch": 0.38524256135740215,
"eval_runtime": 6216.4944,
"eval_samples_per_second": 133.62,
"eval_steps_per_second": 33.405,
"step": 160000
},
{
"epoch": 0.38644644436164405,
"grad_norm": 11.055919647216797,
"learning_rate": 3.4096642758347625e-05,
"loss": 3.0614,
"step": 160500
},
{
"epoch": 0.38765032736588595,
"grad_norm": 18.309402465820312,
"learning_rate": 3.4029760105300054e-05,
"loss": 3.0618,
"step": 161000
},
{
"epoch": 0.3888542103701278,
"grad_norm": 15.657028198242188,
"learning_rate": 3.396287745225247e-05,
"loss": 3.068,
"step": 161500
},
{
"epoch": 0.3900580933743697,
"grad_norm": 15.660598754882812,
"learning_rate": 3.3896128564510995e-05,
"loss": 3.0956,
"step": 162000
},
{
"epoch": 0.3912619763786116,
"grad_norm": 17.219053268432617,
"learning_rate": 3.382924591146342e-05,
"loss": 3.0762,
"step": 162500
},
{
"epoch": 0.39246585938285344,
"grad_norm": 15.2114896774292,
"learning_rate": 3.376249702372194e-05,
"loss": 3.0193,
"step": 163000
},
{
"epoch": 0.39366974238709534,
"grad_norm": 15.437503814697266,
"learning_rate": 3.3695614370674365e-05,
"loss": 3.0757,
"step": 163500
},
{
"epoch": 0.39487362539133725,
"grad_norm": 17.652286529541016,
"learning_rate": 3.3628731717626794e-05,
"loss": 3.0871,
"step": 164000
},
{
"epoch": 0.3960775083955791,
"grad_norm": 14.703353881835938,
"learning_rate": 3.356184906457922e-05,
"loss": 3.0025,
"step": 164500
},
{
"epoch": 0.397281391399821,
"grad_norm": 15.438825607299805,
"learning_rate": 3.349496641153164e-05,
"loss": 3.049,
"step": 165000
},
{
"epoch": 0.397281391399821,
"eval_runtime": 6142.6208,
"eval_samples_per_second": 135.226,
"eval_steps_per_second": 33.807,
"step": 165000
},
{
"epoch": 0.39848527440406284,
"grad_norm": 21.73479461669922,
"learning_rate": 3.3428217523790165e-05,
"loss": 3.0724,
"step": 165500
},
{
"epoch": 0.39968915740830474,
"grad_norm": 13.589031219482422,
"learning_rate": 3.336133487074259e-05,
"loss": 3.0599,
"step": 166000
},
{
"epoch": 0.40089304041254664,
"grad_norm": 12.588455200195312,
"learning_rate": 3.329445221769501e-05,
"loss": 3.0674,
"step": 166500
},
{
"epoch": 0.4020969234167885,
"grad_norm": 16.856395721435547,
"learning_rate": 3.322756956464744e-05,
"loss": 3.0598,
"step": 167000
},
{
"epoch": 0.4033008064210304,
"grad_norm": 14.325052261352539,
"learning_rate": 3.316068691159986e-05,
"loss": 3.1033,
"step": 167500
},
{
"epoch": 0.4045046894252723,
"grad_norm": 20.509449005126953,
"learning_rate": 3.3093938023858386e-05,
"loss": 3.0843,
"step": 168000
},
{
"epoch": 0.40570857242951414,
"grad_norm": 17.73023796081543,
"learning_rate": 3.302705537081081e-05,
"loss": 3.0367,
"step": 168500
},
{
"epoch": 0.40691245543375604,
"grad_norm": 24.057329177856445,
"learning_rate": 3.296017271776323e-05,
"loss": 3.0771,
"step": 169000
},
{
"epoch": 0.40811633843799794,
"grad_norm": 19.776145935058594,
"learning_rate": 3.289329006471566e-05,
"loss": 3.0784,
"step": 169500
},
{
"epoch": 0.4093202214422398,
"grad_norm": 23.74951934814453,
"learning_rate": 3.282654117697418e-05,
"loss": 3.0786,
"step": 170000
},
{
"epoch": 0.4093202214422398,
"eval_runtime": 6188.4105,
"eval_samples_per_second": 134.226,
"eval_steps_per_second": 33.557,
"step": 170000
},
{
"epoch": 0.4105241044464817,
"grad_norm": 17.745681762695312,
"learning_rate": 3.27596585239266e-05,
"loss": 3.0666,
"step": 170500
},
{
"epoch": 0.4117279874507236,
"grad_norm": 20.147336959838867,
"learning_rate": 3.269277587087903e-05,
"loss": 3.1238,
"step": 171000
},
{
"epoch": 0.41293187045496543,
"grad_norm": 16.938888549804688,
"learning_rate": 3.262589321783145e-05,
"loss": 3.0414,
"step": 171500
},
{
"epoch": 0.41413575345920733,
"grad_norm": 15.663901329040527,
"learning_rate": 3.2559010564783875e-05,
"loss": 3.0892,
"step": 172000
},
{
"epoch": 0.41533963646344924,
"grad_norm": 16.39117431640625,
"learning_rate": 3.2492127911736304e-05,
"loss": 3.0685,
"step": 172500
},
{
"epoch": 0.4165435194676911,
"grad_norm": 14.299029350280762,
"learning_rate": 3.242537902399482e-05,
"loss": 3.0725,
"step": 173000
},
{
"epoch": 0.417747402471933,
"grad_norm": 11.168866157531738,
"learning_rate": 3.235849637094725e-05,
"loss": 3.0502,
"step": 173500
},
{
"epoch": 0.41895128547617483,
"grad_norm": 13.38841724395752,
"learning_rate": 3.2291613717899674e-05,
"loss": 3.062,
"step": 174000
},
{
"epoch": 0.42015516848041673,
"grad_norm": 14.151941299438477,
"learning_rate": 3.2224731064852097e-05,
"loss": 3.0666,
"step": 174500
},
{
"epoch": 0.42135905148465863,
"grad_norm": 17.730104446411133,
"learning_rate": 3.215784841180452e-05,
"loss": 3.0709,
"step": 175000
},
{
"epoch": 0.42135905148465863,
"eval_runtime": 6186.2143,
"eval_samples_per_second": 134.274,
"eval_steps_per_second": 33.569,
"step": 175000
},
{
"epoch": 0.4225629344889005,
"grad_norm": 16.822513580322266,
"learning_rate": 3.209096575875695e-05,
"loss": 3.065,
"step": 175500
},
{
"epoch": 0.4237668174931424,
"grad_norm": 15.454965591430664,
"learning_rate": 3.202408310570938e-05,
"loss": 3.0476,
"step": 176000
},
{
"epoch": 0.4249707004973843,
"grad_norm": 21.14031410217285,
"learning_rate": 3.195720045266179e-05,
"loss": 3.0339,
"step": 176500
},
{
"epoch": 0.4261745835016261,
"grad_norm": 19.002689361572266,
"learning_rate": 3.189045156492032e-05,
"loss": 3.0598,
"step": 177000
},
{
"epoch": 0.427378466505868,
"grad_norm": 11.582403182983398,
"learning_rate": 3.182356891187274e-05,
"loss": 3.0454,
"step": 177500
},
{
"epoch": 0.42858234951010993,
"grad_norm": 14.35600757598877,
"learning_rate": 3.175668625882517e-05,
"loss": 3.0677,
"step": 178000
},
{
"epoch": 0.4297862325143518,
"grad_norm": 18.5367374420166,
"learning_rate": 3.168980360577759e-05,
"loss": 3.1098,
"step": 178500
},
{
"epoch": 0.4309901155185937,
"grad_norm": 17.769344329833984,
"learning_rate": 3.162305471803611e-05,
"loss": 3.052,
"step": 179000
},
{
"epoch": 0.4321939985228356,
"grad_norm": 17.472938537597656,
"learning_rate": 3.155617206498854e-05,
"loss": 3.0699,
"step": 179500
},
{
"epoch": 0.4333978815270774,
"grad_norm": 14.995344161987305,
"learning_rate": 3.148928941194096e-05,
"loss": 3.0682,
"step": 180000
},
{
"epoch": 0.4333978815270774,
"eval_runtime": 6302.7174,
"eval_samples_per_second": 131.792,
"eval_steps_per_second": 32.948,
"step": 180000
},
{
"epoch": 0.4346017645313193,
"grad_norm": 17.150964736938477,
"learning_rate": 3.1422406758893384e-05,
"loss": 3.0906,
"step": 180500
},
{
"epoch": 0.4358056475355612,
"grad_norm": 14.804174423217773,
"learning_rate": 3.1355524105845814e-05,
"loss": 3.0493,
"step": 181000
},
{
"epoch": 0.43700953053980307,
"grad_norm": 17.898832321166992,
"learning_rate": 3.128864145279824e-05,
"loss": 3.089,
"step": 181500
},
{
"epoch": 0.43821341354404497,
"grad_norm": 16.601884841918945,
"learning_rate": 3.122189256505676e-05,
"loss": 3.0688,
"step": 182000
},
{
"epoch": 0.4394172965482868,
"grad_norm": 14.000849723815918,
"learning_rate": 3.1155009912009184e-05,
"loss": 3.0295,
"step": 182500
},
{
"epoch": 0.4406211795525287,
"grad_norm": 17.828115463256836,
"learning_rate": 3.1088127258961606e-05,
"loss": 3.0588,
"step": 183000
},
{
"epoch": 0.4418250625567706,
"grad_norm": 20.30364418029785,
"learning_rate": 3.1021244605914035e-05,
"loss": 3.0203,
"step": 183500
},
{
"epoch": 0.44302894556101247,
"grad_norm": 17.606700897216797,
"learning_rate": 3.095436195286646e-05,
"loss": 3.0568,
"step": 184000
},
{
"epoch": 0.44423282856525437,
"grad_norm": 17.633464813232422,
"learning_rate": 3.0887613065124976e-05,
"loss": 3.0702,
"step": 184500
},
{
"epoch": 0.44543671156949627,
"grad_norm": 14.55715274810791,
"learning_rate": 3.0820730412077405e-05,
"loss": 3.0746,
"step": 185000
},
{
"epoch": 0.44543671156949627,
"eval_runtime": 6306.1272,
"eval_samples_per_second": 131.72,
"eval_steps_per_second": 32.93,
"step": 185000
},
{
"epoch": 0.4466405945737381,
"grad_norm": 16.668909072875977,
"learning_rate": 3.075384775902983e-05,
"loss": 3.0566,
"step": 185500
},
{
"epoch": 0.44784447757798,
"grad_norm": 14.347661018371582,
"learning_rate": 3.068696510598225e-05,
"loss": 3.0616,
"step": 186000
},
{
"epoch": 0.4490483605822219,
"grad_norm": 17.429546356201172,
"learning_rate": 3.062021621824077e-05,
"loss": 3.0875,
"step": 186500
},
{
"epoch": 0.45025224358646376,
"grad_norm": 19.362503051757812,
"learning_rate": 3.0553467330499294e-05,
"loss": 3.057,
"step": 187000
},
{
"epoch": 0.45145612659070566,
"grad_norm": 14.057225227355957,
"learning_rate": 3.048658467745172e-05,
"loss": 3.0644,
"step": 187500
},
{
"epoch": 0.45266000959494757,
"grad_norm": 21.090145111083984,
"learning_rate": 3.0419702024404146e-05,
"loss": 3.0886,
"step": 188000
},
{
"epoch": 0.4538638925991894,
"grad_norm": 13.602699279785156,
"learning_rate": 3.0352819371356568e-05,
"loss": 3.0649,
"step": 188500
},
{
"epoch": 0.4550677756034313,
"grad_norm": 14.61277961730957,
"learning_rate": 3.0285936718308994e-05,
"loss": 3.0502,
"step": 189000
},
{
"epoch": 0.4562716586076732,
"grad_norm": 14.571629524230957,
"learning_rate": 3.021905406526142e-05,
"loss": 3.0512,
"step": 189500
},
{
"epoch": 0.45747554161191506,
"grad_norm": 16.995033264160156,
"learning_rate": 3.0152171412213842e-05,
"loss": 3.0619,
"step": 190000
},
{
"epoch": 0.45747554161191506,
"eval_runtime": 6119.7371,
"eval_samples_per_second": 135.732,
"eval_steps_per_second": 33.933,
"step": 190000
},
{
"epoch": 0.45867942461615696,
"grad_norm": 14.749920845031738,
"learning_rate": 3.0085288759166268e-05,
"loss": 3.0377,
"step": 190500
},
{
"epoch": 0.4598833076203988,
"grad_norm": 18.717721939086914,
"learning_rate": 3.0018406106118697e-05,
"loss": 3.028,
"step": 191000
},
{
"epoch": 0.4610871906246407,
"grad_norm": 13.981959342956543,
"learning_rate": 2.995152345307112e-05,
"loss": 3.0643,
"step": 191500
},
{
"epoch": 0.4622910736288826,
"grad_norm": 13.590766906738281,
"learning_rate": 2.9884640800023545e-05,
"loss": 3.0734,
"step": 192000
},
{
"epoch": 0.46349495663312446,
"grad_norm": 14.754199028015137,
"learning_rate": 2.981775814697597e-05,
"loss": 3.0575,
"step": 192500
},
{
"epoch": 0.46469883963736636,
"grad_norm": 15.374496459960938,
"learning_rate": 2.9751009259234493e-05,
"loss": 3.0545,
"step": 193000
},
{
"epoch": 0.46590272264160826,
"grad_norm": 17.713016510009766,
"learning_rate": 2.968412660618691e-05,
"loss": 3.022,
"step": 193500
},
{
"epoch": 0.4671066056458501,
"grad_norm": 13.752087593078613,
"learning_rate": 2.961724395313934e-05,
"loss": 3.0129,
"step": 194000
},
{
"epoch": 0.468310488650092,
"grad_norm": 11.1192626953125,
"learning_rate": 2.9550361300091767e-05,
"loss": 3.0285,
"step": 194500
},
{
"epoch": 0.4695143716543339,
"grad_norm": 17.55103874206543,
"learning_rate": 2.9483746177656378e-05,
"loss": 3.045,
"step": 195000
},
{
"epoch": 0.4695143716543339,
"eval_runtime": 6132.8059,
"eval_samples_per_second": 135.443,
"eval_steps_per_second": 33.861,
"step": 195000
},
{
"epoch": 0.47071825465857575,
"grad_norm": 27.24392318725586,
"learning_rate": 2.9416863524608807e-05,
"loss": 3.0499,
"step": 195500
},
{
"epoch": 0.47192213766281765,
"grad_norm": 14.595544815063477,
"learning_rate": 2.9349980871561226e-05,
"loss": 3.0375,
"step": 196000
},
{
"epoch": 0.47312602066705955,
"grad_norm": 13.058863639831543,
"learning_rate": 2.9283098218513655e-05,
"loss": 3.1024,
"step": 196500
},
{
"epoch": 0.4743299036713014,
"grad_norm": 15.837779998779297,
"learning_rate": 2.921621556546608e-05,
"loss": 3.082,
"step": 197000
},
{
"epoch": 0.4755337866755433,
"grad_norm": 14.441446304321289,
"learning_rate": 2.9149466677724603e-05,
"loss": 3.0608,
"step": 197500
},
{
"epoch": 0.4767376696797852,
"grad_norm": 16.908939361572266,
"learning_rate": 2.9082584024677022e-05,
"loss": 3.0524,
"step": 198000
},
{
"epoch": 0.47794155268402705,
"grad_norm": 15.620512962341309,
"learning_rate": 2.901570137162945e-05,
"loss": 3.0614,
"step": 198500
},
{
"epoch": 0.47914543568826895,
"grad_norm": 17.97640609741211,
"learning_rate": 2.8948818718581877e-05,
"loss": 3.0483,
"step": 199000
},
{
"epoch": 0.48034931869251085,
"grad_norm": 19.494766235351562,
"learning_rate": 2.88819360655343e-05,
"loss": 3.0629,
"step": 199500
},
{
"epoch": 0.4815532016967527,
"grad_norm": 18.747150421142578,
"learning_rate": 2.8815053412486725e-05,
"loss": 3.0774,
"step": 200000
},
{
"epoch": 0.4815532016967527,
"eval_runtime": 6171.1886,
"eval_samples_per_second": 134.6,
"eval_steps_per_second": 33.65,
"step": 200000
},
{
"epoch": 0.4827570847009946,
"grad_norm": 15.972591400146484,
"learning_rate": 2.874817075943915e-05,
"loss": 3.0938,
"step": 200500
},
{
"epoch": 0.48396096770523644,
"grad_norm": 16.991474151611328,
"learning_rate": 2.8681421871697673e-05,
"loss": 3.0431,
"step": 201000
},
{
"epoch": 0.48516485070947835,
"grad_norm": 16.47597312927246,
"learning_rate": 2.8614539218650095e-05,
"loss": 3.0886,
"step": 201500
},
{
"epoch": 0.48636873371372025,
"grad_norm": 20.3975830078125,
"learning_rate": 2.854765656560252e-05,
"loss": 3.0562,
"step": 202000
},
{
"epoch": 0.4875726167179621,
"grad_norm": 17.682926177978516,
"learning_rate": 2.8480773912554947e-05,
"loss": 3.1002,
"step": 202500
},
{
"epoch": 0.488776499722204,
"grad_norm": 18.027238845825195,
"learning_rate": 2.841389125950737e-05,
"loss": 3.0798,
"step": 203000
},
{
"epoch": 0.4899803827264459,
"grad_norm": 20.950571060180664,
"learning_rate": 2.8347142371765888e-05,
"loss": 3.0573,
"step": 203500
},
{
"epoch": 0.49118426573068774,
"grad_norm": 17.63266372680664,
"learning_rate": 2.8280259718718317e-05,
"loss": 3.048,
"step": 204000
},
{
"epoch": 0.49238814873492964,
"grad_norm": 17.037296295166016,
"learning_rate": 2.8213377065670743e-05,
"loss": 3.016,
"step": 204500
},
{
"epoch": 0.49359203173917154,
"grad_norm": 21.214052200317383,
"learning_rate": 2.8146494412623165e-05,
"loss": 3.0676,
"step": 205000
},
{
"epoch": 0.49359203173917154,
"eval_runtime": 6343.1785,
"eval_samples_per_second": 130.951,
"eval_steps_per_second": 32.738,
"step": 205000
},
{
"epoch": 0.4947959147434134,
"grad_norm": 17.722492218017578,
"learning_rate": 2.807961175957559e-05,
"loss": 3.076,
"step": 205500
},
{
"epoch": 0.4959997977476553,
"grad_norm": 17.147768020629883,
"learning_rate": 2.801272910652802e-05,
"loss": 3.0684,
"step": 206000
},
{
"epoch": 0.4972036807518972,
"grad_norm": 15.113913536071777,
"learning_rate": 2.794584645348044e-05,
"loss": 3.0133,
"step": 206500
},
{
"epoch": 0.49840756375613904,
"grad_norm": 15.339323043823242,
"learning_rate": 2.7878963800432868e-05,
"loss": 3.06,
"step": 207000
},
{
"epoch": 0.49961144676038094,
"grad_norm": 14.279352188110352,
"learning_rate": 2.7812214912691387e-05,
"loss": 3.0718,
"step": 207500
},
{
"epoch": 0.5008153297646228,
"grad_norm": 15.7473726272583,
"learning_rate": 2.7745466024949905e-05,
"loss": 3.0382,
"step": 208000
},
{
"epoch": 0.5020192127688647,
"grad_norm": 16.69623374938965,
"learning_rate": 2.7678583371902334e-05,
"loss": 3.0469,
"step": 208500
},
{
"epoch": 0.5032230957731065,
"grad_norm": 12.795482635498047,
"learning_rate": 2.7611700718854753e-05,
"loss": 3.0691,
"step": 209000
},
{
"epoch": 0.5044269787773484,
"grad_norm": 15.719594955444336,
"learning_rate": 2.7544818065807182e-05,
"loss": 3.0843,
"step": 209500
},
{
"epoch": 0.5056308617815903,
"grad_norm": 16.107906341552734,
"learning_rate": 2.74780691780657e-05,
"loss": 3.0939,
"step": 210000
},
{
"epoch": 0.5056308617815903,
"eval_runtime": 6288.8164,
"eval_samples_per_second": 132.083,
"eval_steps_per_second": 33.021,
"step": 210000
},
{
"epoch": 0.5068347447858322,
"grad_norm": 22.665922164916992,
"learning_rate": 2.741118652501813e-05,
"loss": 3.0311,
"step": 210500
},
{
"epoch": 0.5080386277900741,
"grad_norm": 12.993492126464844,
"learning_rate": 2.734430387197055e-05,
"loss": 3.0409,
"step": 211000
},
{
"epoch": 0.509242510794316,
"grad_norm": 13.392237663269043,
"learning_rate": 2.727742121892298e-05,
"loss": 3.0185,
"step": 211500
},
{
"epoch": 0.5104463937985578,
"grad_norm": 18.179622650146484,
"learning_rate": 2.7210538565875404e-05,
"loss": 3.1036,
"step": 212000
},
{
"epoch": 0.5116502768027997,
"grad_norm": 16.70694923400879,
"learning_rate": 2.7143655912827826e-05,
"loss": 3.063,
"step": 212500
},
{
"epoch": 0.5128541598070416,
"grad_norm": 23.674760818481445,
"learning_rate": 2.7076773259780252e-05,
"loss": 3.0342,
"step": 213000
},
{
"epoch": 0.5140580428112835,
"grad_norm": 19.409990310668945,
"learning_rate": 2.701002437203877e-05,
"loss": 3.0462,
"step": 213500
},
{
"epoch": 0.5152619258155254,
"grad_norm": 15.574653625488281,
"learning_rate": 2.69431417189912e-05,
"loss": 3.0292,
"step": 214000
},
{
"epoch": 0.5164658088197672,
"grad_norm": 17.644498825073242,
"learning_rate": 2.6876259065943622e-05,
"loss": 3.0152,
"step": 214500
},
{
"epoch": 0.5176696918240091,
"grad_norm": 14.58530330657959,
"learning_rate": 2.6809376412896048e-05,
"loss": 3.1034,
"step": 215000
},
{
"epoch": 0.5176696918240091,
"eval_runtime": 6223.0215,
"eval_samples_per_second": 133.479,
"eval_steps_per_second": 33.37,
"step": 215000
},
{
"epoch": 0.518873574828251,
"grad_norm": 19.024547576904297,
"learning_rate": 2.674249375984847e-05,
"loss": 3.0733,
"step": 215500
},
{
"epoch": 0.5200774578324929,
"grad_norm": 17.260374069213867,
"learning_rate": 2.6675611106800896e-05,
"loss": 3.0252,
"step": 216000
},
{
"epoch": 0.5212813408367348,
"grad_norm": 18.4815673828125,
"learning_rate": 2.6608862219059415e-05,
"loss": 3.069,
"step": 216500
},
{
"epoch": 0.5224852238409767,
"grad_norm": 15.065186500549316,
"learning_rate": 2.6541979566011844e-05,
"loss": 3.0697,
"step": 217000
},
{
"epoch": 0.5236891068452185,
"grad_norm": 16.79564666748047,
"learning_rate": 2.6475096912964263e-05,
"loss": 3.0433,
"step": 217500
},
{
"epoch": 0.5248929898494604,
"grad_norm": 18.250133514404297,
"learning_rate": 2.6408214259916692e-05,
"loss": 3.0243,
"step": 218000
},
{
"epoch": 0.5260968728537023,
"grad_norm": 15.040393829345703,
"learning_rate": 2.6341331606869118e-05,
"loss": 3.0501,
"step": 218500
},
{
"epoch": 0.5273007558579442,
"grad_norm": 18.00982093811035,
"learning_rate": 2.627444895382154e-05,
"loss": 3.0481,
"step": 219000
},
{
"epoch": 0.5285046388621861,
"grad_norm": 14.428119659423828,
"learning_rate": 2.6207566300773966e-05,
"loss": 3.0788,
"step": 219500
},
{
"epoch": 0.529708521866428,
"grad_norm": 19.191162109375,
"learning_rate": 2.6140683647726395e-05,
"loss": 3.0549,
"step": 220000
},
{
"epoch": 0.529708521866428,
"eval_runtime": 6262.92,
"eval_samples_per_second": 132.629,
"eval_steps_per_second": 33.157,
"step": 220000
},
{
"epoch": 0.5309124048706698,
"grad_norm": 18.9827938079834,
"learning_rate": 2.6073934759984914e-05,
"loss": 3.032,
"step": 220500
},
{
"epoch": 0.5321162878749117,
"grad_norm": 16.249061584472656,
"learning_rate": 2.6007052106937336e-05,
"loss": 3.0587,
"step": 221000
},
{
"epoch": 0.5333201708791536,
"grad_norm": 27.886228561401367,
"learning_rate": 2.5940303219195855e-05,
"loss": 3.0959,
"step": 221500
},
{
"epoch": 0.5345240538833955,
"grad_norm": 28.477378845214844,
"learning_rate": 2.587342056614828e-05,
"loss": 3.0545,
"step": 222000
},
{
"epoch": 0.5357279368876374,
"grad_norm": 54.090702056884766,
"learning_rate": 2.580653791310071e-05,
"loss": 3.0052,
"step": 222500
},
{
"epoch": 0.5369318198918792,
"grad_norm": 20.456764221191406,
"learning_rate": 2.5739655260053132e-05,
"loss": 3.0362,
"step": 223000
},
{
"epoch": 0.5381357028961211,
"grad_norm": 18.759544372558594,
"learning_rate": 2.5672772607005558e-05,
"loss": 3.0841,
"step": 223500
},
{
"epoch": 0.539339585900363,
"grad_norm": 24.140661239624023,
"learning_rate": 2.5605889953957983e-05,
"loss": 3.0545,
"step": 224000
},
{
"epoch": 0.5405434689046049,
"grad_norm": 15.08611011505127,
"learning_rate": 2.5539007300910406e-05,
"loss": 3.0784,
"step": 224500
},
{
"epoch": 0.5417473519088468,
"grad_norm": 20.986557006835938,
"learning_rate": 2.547212464786283e-05,
"loss": 3.0682,
"step": 225000
},
{
"epoch": 0.5417473519088468,
"eval_runtime": 6240.4652,
"eval_samples_per_second": 133.106,
"eval_steps_per_second": 33.277,
"step": 225000
},
{
"epoch": 0.5429512349130887,
"grad_norm": 11.451869010925293,
"learning_rate": 2.5405375760121354e-05,
"loss": 3.0627,
"step": 225500
},
{
"epoch": 0.5441551179173305,
"grad_norm": 17.614988327026367,
"learning_rate": 2.5338626872379872e-05,
"loss": 3.0518,
"step": 226000
},
{
"epoch": 0.5453590009215724,
"grad_norm": 14.993136405944824,
"learning_rate": 2.5271744219332298e-05,
"loss": 3.0515,
"step": 226500
},
{
"epoch": 0.5465628839258143,
"grad_norm": 21.78707504272461,
"learning_rate": 2.520486156628472e-05,
"loss": 3.0632,
"step": 227000
},
{
"epoch": 0.5477667669300562,
"grad_norm": 16.39373207092285,
"learning_rate": 2.513797891323715e-05,
"loss": 3.0524,
"step": 227500
},
{
"epoch": 0.5489706499342981,
"grad_norm": 13.787343978881836,
"learning_rate": 2.5071230025495668e-05,
"loss": 3.0449,
"step": 228000
},
{
"epoch": 0.55017453293854,
"grad_norm": 19.658519744873047,
"learning_rate": 2.5004347372448094e-05,
"loss": 3.0304,
"step": 228500
},
{
"epoch": 0.5513784159427818,
"grad_norm": 16.18865203857422,
"learning_rate": 2.493746471940052e-05,
"loss": 3.0746,
"step": 229000
},
{
"epoch": 0.5525822989470237,
"grad_norm": 17.702472686767578,
"learning_rate": 2.4870582066352942e-05,
"loss": 3.07,
"step": 229500
},
{
"epoch": 0.5537861819512656,
"grad_norm": 18.08761215209961,
"learning_rate": 2.4803699413305368e-05,
"loss": 3.0417,
"step": 230000
},
{
"epoch": 0.5537861819512656,
"eval_runtime": 6192.5264,
"eval_samples_per_second": 134.137,
"eval_steps_per_second": 33.534,
"step": 230000
},
{
"epoch": 0.5549900649555075,
"grad_norm": 12.940227508544922,
"learning_rate": 2.473695052556389e-05,
"loss": 3.0623,
"step": 230500
},
{
"epoch": 0.5561939479597494,
"grad_norm": 14.184712409973145,
"learning_rate": 2.4670067872516316e-05,
"loss": 3.0565,
"step": 231000
},
{
"epoch": 0.5573978309639912,
"grad_norm": 16.096614837646484,
"learning_rate": 2.4603185219468738e-05,
"loss": 2.9976,
"step": 231500
},
{
"epoch": 0.5586017139682331,
"grad_norm": 15.835817337036133,
"learning_rate": 2.4536302566421164e-05,
"loss": 2.9842,
"step": 232000
},
{
"epoch": 0.559805596972475,
"grad_norm": 22.432340621948242,
"learning_rate": 2.446941991337359e-05,
"loss": 3.0831,
"step": 232500
},
{
"epoch": 0.5610094799767169,
"grad_norm": 19.895309448242188,
"learning_rate": 2.4402537260326015e-05,
"loss": 3.0444,
"step": 233000
},
{
"epoch": 0.5622133629809588,
"grad_norm": 14.998634338378906,
"learning_rate": 2.4335788372584534e-05,
"loss": 3.0233,
"step": 233500
},
{
"epoch": 0.5634172459852007,
"grad_norm": 12.780035972595215,
"learning_rate": 2.426890571953696e-05,
"loss": 3.0215,
"step": 234000
},
{
"epoch": 0.5646211289894425,
"grad_norm": 18.854740142822266,
"learning_rate": 2.4202023066489385e-05,
"loss": 3.0684,
"step": 234500
},
{
"epoch": 0.5658250119936844,
"grad_norm": 17.486467361450195,
"learning_rate": 2.4135140413441808e-05,
"loss": 3.053,
"step": 235000
},
{
"epoch": 0.5658250119936844,
"eval_runtime": 6288.8357,
"eval_samples_per_second": 132.082,
"eval_steps_per_second": 33.021,
"step": 235000
},
{
"epoch": 0.5670288949979263,
"grad_norm": 14.92556095123291,
"learning_rate": 2.4068257760394233e-05,
"loss": 3.0641,
"step": 235500
},
{
"epoch": 0.5682327780021682,
"grad_norm": 13.280654907226562,
"learning_rate": 2.400137510734666e-05,
"loss": 3.0217,
"step": 236000
},
{
"epoch": 0.5694366610064101,
"grad_norm": 16.9669246673584,
"learning_rate": 2.393462621960518e-05,
"loss": 3.0162,
"step": 236500
},
{
"epoch": 0.570640544010652,
"grad_norm": 14.215867042541504,
"learning_rate": 2.3867743566557604e-05,
"loss": 3.0158,
"step": 237000
},
{
"epoch": 0.5718444270148938,
"grad_norm": 19.857236862182617,
"learning_rate": 2.380086091351003e-05,
"loss": 3.0011,
"step": 237500
},
{
"epoch": 0.5730483100191357,
"grad_norm": 14.70789909362793,
"learning_rate": 2.3733978260462455e-05,
"loss": 3.0155,
"step": 238000
},
{
"epoch": 0.5742521930233776,
"grad_norm": 16.156538009643555,
"learning_rate": 2.3667229372720977e-05,
"loss": 3.0281,
"step": 238500
},
{
"epoch": 0.5754560760276195,
"grad_norm": 29.431739807128906,
"learning_rate": 2.36003467196734e-05,
"loss": 3.0404,
"step": 239000
},
{
"epoch": 0.5766599590318614,
"grad_norm": 14.224696159362793,
"learning_rate": 2.3533464066625825e-05,
"loss": 3.0172,
"step": 239500
},
{
"epoch": 0.5778638420361032,
"grad_norm": 19.29595184326172,
"learning_rate": 2.346658141357825e-05,
"loss": 3.0622,
"step": 240000
},
{
"epoch": 0.5778638420361032,
"eval_runtime": 6227.4429,
"eval_samples_per_second": 133.385,
"eval_steps_per_second": 33.346,
"step": 240000
},
{
"epoch": 0.5790677250403451,
"grad_norm": 24.003347396850586,
"learning_rate": 2.3399698760530677e-05,
"loss": 2.9962,
"step": 240500
},
{
"epoch": 0.580271608044587,
"grad_norm": 16.034706115722656,
"learning_rate": 2.33328161074831e-05,
"loss": 3.0286,
"step": 241000
},
{
"epoch": 0.5814754910488289,
"grad_norm": 16.609622955322266,
"learning_rate": 2.3265933454435525e-05,
"loss": 3.031,
"step": 241500
},
{
"epoch": 0.5826793740530708,
"grad_norm": 30.813108444213867,
"learning_rate": 2.319905080138795e-05,
"loss": 3.0143,
"step": 242000
},
{
"epoch": 0.5838832570573127,
"grad_norm": 15.091474533081055,
"learning_rate": 2.313230191364647e-05,
"loss": 3.0475,
"step": 242500
},
{
"epoch": 0.5850871400615545,
"grad_norm": 19.889976501464844,
"learning_rate": 2.3065419260598895e-05,
"loss": 3.0551,
"step": 243000
},
{
"epoch": 0.5862910230657964,
"grad_norm": 16.42539405822754,
"learning_rate": 2.299853660755132e-05,
"loss": 2.9885,
"step": 243500
},
{
"epoch": 0.5874949060700383,
"grad_norm": 18.250354766845703,
"learning_rate": 2.2931653954503746e-05,
"loss": 3.0267,
"step": 244000
},
{
"epoch": 0.5886987890742802,
"grad_norm": 11.44227409362793,
"learning_rate": 2.286477130145617e-05,
"loss": 2.9568,
"step": 244500
},
{
"epoch": 0.5899026720785221,
"grad_norm": 21.37769889831543,
"learning_rate": 2.279802241371469e-05,
"loss": 3.0259,
"step": 245000
},
{
"epoch": 0.5899026720785221,
"eval_runtime": 6297.4297,
"eval_samples_per_second": 131.902,
"eval_steps_per_second": 32.976,
"step": 245000
},
{
"epoch": 0.591106555082764,
"grad_norm": 15.137754440307617,
"learning_rate": 2.273127352597321e-05,
"loss": 3.0087,
"step": 245500
},
{
"epoch": 0.5923104380870058,
"grad_norm": 15.59156608581543,
"learning_rate": 2.2664390872925635e-05,
"loss": 3.0397,
"step": 246000
},
{
"epoch": 0.5935143210912477,
"grad_norm": 14.741199493408203,
"learning_rate": 2.259750821987806e-05,
"loss": 3.0505,
"step": 246500
},
{
"epoch": 0.5947182040954896,
"grad_norm": 37.30345153808594,
"learning_rate": 2.2530625566830483e-05,
"loss": 3.0312,
"step": 247000
},
{
"epoch": 0.5959220870997315,
"grad_norm": 16.39379119873047,
"learning_rate": 2.2463742913782912e-05,
"loss": 3.0068,
"step": 247500
},
{
"epoch": 0.5971259701039734,
"grad_norm": 16.724523544311523,
"learning_rate": 2.2396860260735335e-05,
"loss": 3.0172,
"step": 248000
},
{
"epoch": 0.5983298531082153,
"grad_norm": 13.491678237915039,
"learning_rate": 2.2330111372993857e-05,
"loss": 3.0396,
"step": 248500
},
{
"epoch": 0.5995337361124571,
"grad_norm": 17.01793670654297,
"learning_rate": 2.226322871994628e-05,
"loss": 3.0092,
"step": 249000
},
{
"epoch": 0.600737619116699,
"grad_norm": 16.2504825592041,
"learning_rate": 2.219634606689871e-05,
"loss": 3.0564,
"step": 249500
},
{
"epoch": 0.6019415021209409,
"grad_norm": 19.381729125976562,
"learning_rate": 2.212946341385113e-05,
"loss": 2.9991,
"step": 250000
},
{
"epoch": 0.6019415021209409,
"eval_runtime": 6343.8627,
"eval_samples_per_second": 130.937,
"eval_steps_per_second": 32.734,
"step": 250000
},
{
"epoch": 0.6031453851251828,
"grad_norm": 15.789433479309082,
"learning_rate": 2.2062714526109653e-05,
"loss": 3.0164,
"step": 250500
},
{
"epoch": 0.6043492681294247,
"grad_norm": 15.380681037902832,
"learning_rate": 2.1995831873062075e-05,
"loss": 3.006,
"step": 251000
},
{
"epoch": 0.6055531511336665,
"grad_norm": 12.976866722106934,
"learning_rate": 2.19289492200145e-05,
"loss": 3.099,
"step": 251500
},
{
"epoch": 0.6067570341379084,
"grad_norm": 17.682626724243164,
"learning_rate": 2.1862066566966927e-05,
"loss": 3.0381,
"step": 252000
},
{
"epoch": 0.6079609171421503,
"grad_norm": 15.32071304321289,
"learning_rate": 2.1795183913919352e-05,
"loss": 3.0404,
"step": 252500
},
{
"epoch": 0.6091648001463922,
"grad_norm": 21.887651443481445,
"learning_rate": 2.1728301260871775e-05,
"loss": 3.0282,
"step": 253000
},
{
"epoch": 0.6103686831506341,
"grad_norm": 16.731210708618164,
"learning_rate": 2.1661552373130297e-05,
"loss": 3.0219,
"step": 253500
},
{
"epoch": 0.611572566154876,
"grad_norm": 22.759746551513672,
"learning_rate": 2.1594669720082722e-05,
"loss": 3.0442,
"step": 254000
},
{
"epoch": 0.6127764491591178,
"grad_norm": 18.68710708618164,
"learning_rate": 2.1527787067035145e-05,
"loss": 3.0091,
"step": 254500
},
{
"epoch": 0.6139803321633597,
"grad_norm": 23.144712448120117,
"learning_rate": 2.146090441398757e-05,
"loss": 3.0501,
"step": 255000
},
{
"epoch": 0.6139803321633597,
"eval_runtime": 6230.1182,
"eval_samples_per_second": 133.327,
"eval_steps_per_second": 33.332,
"step": 255000
},
{
"epoch": 0.6151842151676016,
"grad_norm": 18.833757400512695,
"learning_rate": 2.1394021760939996e-05,
"loss": 3.1018,
"step": 255500
},
{
"epoch": 0.6163880981718435,
"grad_norm": 21.688997268676758,
"learning_rate": 2.132727287319852e-05,
"loss": 3.0579,
"step": 256000
},
{
"epoch": 0.6175919811760854,
"grad_norm": 17.346538543701172,
"learning_rate": 2.126039022015094e-05,
"loss": 3.0306,
"step": 256500
},
{
"epoch": 0.6187958641803273,
"grad_norm": 18.86598014831543,
"learning_rate": 2.1193507567103366e-05,
"loss": 3.0237,
"step": 257000
},
{
"epoch": 0.6199997471845691,
"grad_norm": 13.735309600830078,
"learning_rate": 2.1126624914055792e-05,
"loss": 3.0416,
"step": 257500
},
{
"epoch": 0.621203630188811,
"grad_norm": 21.433256149291992,
"learning_rate": 2.1059742261008218e-05,
"loss": 3.0162,
"step": 258000
},
{
"epoch": 0.6224075131930529,
"grad_norm": 18.01786231994629,
"learning_rate": 2.099285960796064e-05,
"loss": 3.0192,
"step": 258500
},
{
"epoch": 0.6236113961972948,
"grad_norm": 17.93750762939453,
"learning_rate": 2.092597695491307e-05,
"loss": 3.0162,
"step": 259000
},
{
"epoch": 0.6248152792015367,
"grad_norm": 19.375873565673828,
"learning_rate": 2.0859094301865492e-05,
"loss": 2.9953,
"step": 259500
},
{
"epoch": 0.6260191622057785,
"grad_norm": 16.76817512512207,
"learning_rate": 2.0792479179430107e-05,
"loss": 2.9848,
"step": 260000
},
{
"epoch": 0.6260191622057785,
"eval_runtime": 6319.8113,
"eval_samples_per_second": 131.435,
"eval_steps_per_second": 32.859,
"step": 260000
},
{
"epoch": 0.6272230452100204,
"grad_norm": 19.69635009765625,
"learning_rate": 2.0725596526382532e-05,
"loss": 3.0555,
"step": 260500
},
{
"epoch": 0.6284269282142623,
"grad_norm": 16.243324279785156,
"learning_rate": 2.0658713873334955e-05,
"loss": 3.0212,
"step": 261000
},
{
"epoch": 0.6296308112185042,
"grad_norm": 17.867599487304688,
"learning_rate": 2.0591831220287384e-05,
"loss": 3.0451,
"step": 261500
},
{
"epoch": 0.6308346942227461,
"grad_norm": 17.559730529785156,
"learning_rate": 2.0525082332545903e-05,
"loss": 3.012,
"step": 262000
},
{
"epoch": 0.632038577226988,
"grad_norm": 14.618083953857422,
"learning_rate": 2.045833344480442e-05,
"loss": 3.034,
"step": 262500
},
{
"epoch": 0.6332424602312298,
"grad_norm": 16.521699905395508,
"learning_rate": 2.0391450791756847e-05,
"loss": 3.0197,
"step": 263000
},
{
"epoch": 0.6344463432354717,
"grad_norm": 16.326717376708984,
"learning_rate": 2.0324568138709273e-05,
"loss": 3.0566,
"step": 263500
},
{
"epoch": 0.6356502262397136,
"grad_norm": 22.72909164428711,
"learning_rate": 2.02576854856617e-05,
"loss": 3.0413,
"step": 264000
},
{
"epoch": 0.6368541092439555,
"grad_norm": 21.150442123413086,
"learning_rate": 2.019080283261412e-05,
"loss": 3.0337,
"step": 264500
},
{
"epoch": 0.6380579922481974,
"grad_norm": 18.094627380371094,
"learning_rate": 2.0123920179566547e-05,
"loss": 3.0103,
"step": 265000
},
{
"epoch": 0.6380579922481974,
"eval_runtime": 6283.8462,
"eval_samples_per_second": 132.187,
"eval_steps_per_second": 33.047,
"step": 265000
},
{
"epoch": 0.6392618752524393,
"grad_norm": 16.778398513793945,
"learning_rate": 2.0057037526518972e-05,
"loss": 3.0193,
"step": 265500
},
{
"epoch": 0.6404657582566811,
"grad_norm": 16.389066696166992,
"learning_rate": 1.9990154873471398e-05,
"loss": 3.0297,
"step": 266000
},
{
"epoch": 0.641669641260923,
"grad_norm": 15.284423828125,
"learning_rate": 1.9923405985729917e-05,
"loss": 3.0253,
"step": 266500
},
{
"epoch": 0.6428735242651649,
"grad_norm": 21.423006057739258,
"learning_rate": 1.9856523332682343e-05,
"loss": 3.0313,
"step": 267000
},
{
"epoch": 0.6440774072694068,
"grad_norm": 17.86176109313965,
"learning_rate": 1.9789640679634768e-05,
"loss": 3.0644,
"step": 267500
},
{
"epoch": 0.6452812902736487,
"grad_norm": 19.17348861694336,
"learning_rate": 1.9722758026587194e-05,
"loss": 3.0494,
"step": 268000
},
{
"epoch": 0.6464851732778905,
"grad_norm": 19.088390350341797,
"learning_rate": 1.9655875373539616e-05,
"loss": 3.0172,
"step": 268500
},
{
"epoch": 0.6476890562821324,
"grad_norm": 17.714704513549805,
"learning_rate": 1.9588992720492046e-05,
"loss": 3.0296,
"step": 269000
},
{
"epoch": 0.6488929392863743,
"grad_norm": 16.175125122070312,
"learning_rate": 1.9522110067444468e-05,
"loss": 3.033,
"step": 269500
},
{
"epoch": 0.6500968222906162,
"grad_norm": 13.180002212524414,
"learning_rate": 1.9455227414396894e-05,
"loss": 3.042,
"step": 270000
},
{
"epoch": 0.6500968222906162,
"eval_runtime": 6276.2648,
"eval_samples_per_second": 132.347,
"eval_steps_per_second": 33.087,
"step": 270000
},
{
"epoch": 0.6513007052948581,
"grad_norm": 19.098552703857422,
"learning_rate": 1.9388478526655412e-05,
"loss": 3.0693,
"step": 270500
},
{
"epoch": 0.6525045882991,
"grad_norm": 17.581096649169922,
"learning_rate": 1.9321595873607838e-05,
"loss": 3.0159,
"step": 271000
},
{
"epoch": 0.6537084713033418,
"grad_norm": 16.60484504699707,
"learning_rate": 1.9254713220560264e-05,
"loss": 3.0212,
"step": 271500
},
{
"epoch": 0.6549123543075837,
"grad_norm": 16.275178909301758,
"learning_rate": 1.918783056751269e-05,
"loss": 3.0536,
"step": 272000
},
{
"epoch": 0.6561162373118256,
"grad_norm": 18.09239959716797,
"learning_rate": 1.9121081679771208e-05,
"loss": 3.0576,
"step": 272500
},
{
"epoch": 0.6573201203160675,
"grad_norm": 17.817174911499023,
"learning_rate": 1.9054199026723634e-05,
"loss": 3.006,
"step": 273000
},
{
"epoch": 0.6585240033203094,
"grad_norm": 20.33548355102539,
"learning_rate": 1.8987450138982156e-05,
"loss": 3.0236,
"step": 273500
},
{
"epoch": 0.6597278863245513,
"grad_norm": 16.80567169189453,
"learning_rate": 1.892056748593458e-05,
"loss": 3.0272,
"step": 274000
},
{
"epoch": 0.660931769328793,
"grad_norm": 14.377747535705566,
"learning_rate": 1.8853684832887004e-05,
"loss": 3.0447,
"step": 274500
},
{
"epoch": 0.662135652333035,
"grad_norm": 20.724485397338867,
"learning_rate": 1.878680217983943e-05,
"loss": 3.0422,
"step": 275000
},
{
"epoch": 0.662135652333035,
"eval_runtime": 6186.2818,
"eval_samples_per_second": 134.272,
"eval_steps_per_second": 33.568,
"step": 275000
},
{
"epoch": 0.6633395353372769,
"grad_norm": 18.72093963623047,
"learning_rate": 1.8719919526791856e-05,
"loss": 3.0455,
"step": 275500
},
{
"epoch": 0.6645434183415188,
"grad_norm": 20.733427047729492,
"learning_rate": 1.8653170639050374e-05,
"loss": 3.0217,
"step": 276000
},
{
"epoch": 0.6657473013457607,
"grad_norm": 20.21004295349121,
"learning_rate": 1.85862879860028e-05,
"loss": 3.0201,
"step": 276500
},
{
"epoch": 0.6669511843500024,
"grad_norm": 16.68962860107422,
"learning_rate": 1.8519405332955226e-05,
"loss": 3.0333,
"step": 277000
},
{
"epoch": 0.6681550673542443,
"grad_norm": 16.575241088867188,
"learning_rate": 1.8452522679907648e-05,
"loss": 3.018,
"step": 277500
},
{
"epoch": 0.6693589503584862,
"grad_norm": 19.38899803161621,
"learning_rate": 1.8385640026860074e-05,
"loss": 3.0496,
"step": 278000
},
{
"epoch": 0.6705628333627282,
"grad_norm": 14.967867851257324,
"learning_rate": 1.831902490442469e-05,
"loss": 2.999,
"step": 278500
},
{
"epoch": 0.67176671636697,
"grad_norm": 22.434553146362305,
"learning_rate": 1.8252142251377114e-05,
"loss": 3.0349,
"step": 279000
},
{
"epoch": 0.672970599371212,
"grad_norm": 16.710906982421875,
"learning_rate": 1.818525959832954e-05,
"loss": 3.0342,
"step": 279500
},
{
"epoch": 0.6741744823754537,
"grad_norm": 15.848820686340332,
"learning_rate": 1.8118376945281966e-05,
"loss": 3.0272,
"step": 280000
},
{
"epoch": 0.6741744823754537,
"eval_runtime": 6353.5428,
"eval_samples_per_second": 130.737,
"eval_steps_per_second": 32.684,
"step": 280000
},
{
"epoch": 0.6753783653796956,
"grad_norm": 15.844106674194336,
"learning_rate": 1.805149429223439e-05,
"loss": 3.0116,
"step": 280500
},
{
"epoch": 0.6765822483839375,
"grad_norm": 19.46364402770996,
"learning_rate": 1.7984611639186817e-05,
"loss": 3.0428,
"step": 281000
},
{
"epoch": 0.6777861313881794,
"grad_norm": 16.986345291137695,
"learning_rate": 1.791772898613924e-05,
"loss": 3.0407,
"step": 281500
},
{
"epoch": 0.6789900143924213,
"grad_norm": 19.00211524963379,
"learning_rate": 1.7850846333091666e-05,
"loss": 3.0754,
"step": 282000
},
{
"epoch": 0.6801938973966632,
"grad_norm": 16.347320556640625,
"learning_rate": 1.778396368004409e-05,
"loss": 3.0583,
"step": 282500
},
{
"epoch": 0.681397780400905,
"grad_norm": 17.984121322631836,
"learning_rate": 1.7717081026996517e-05,
"loss": 3.0078,
"step": 283000
},
{
"epoch": 0.6826016634051469,
"grad_norm": 13.47775936126709,
"learning_rate": 1.765019837394894e-05,
"loss": 3.0313,
"step": 283500
},
{
"epoch": 0.6838055464093888,
"grad_norm": 19.955591201782227,
"learning_rate": 1.7583449486207458e-05,
"loss": 3.0128,
"step": 284000
},
{
"epoch": 0.6850094294136307,
"grad_norm": 15.306801795959473,
"learning_rate": 1.7516566833159887e-05,
"loss": 3.0537,
"step": 284500
},
{
"epoch": 0.6862133124178726,
"grad_norm": 18.41864013671875,
"learning_rate": 1.744968418011231e-05,
"loss": 2.9884,
"step": 285000
},
{
"epoch": 0.6862133124178726,
"eval_runtime": 6358.5857,
"eval_samples_per_second": 130.634,
"eval_steps_per_second": 32.659,
"step": 285000
},
{
"epoch": 0.6874171954221144,
"grad_norm": 23.076107025146484,
"learning_rate": 1.7382801527064735e-05,
"loss": 3.0266,
"step": 285500
},
{
"epoch": 0.6886210784263563,
"grad_norm": 13.705315589904785,
"learning_rate": 1.7315918874017158e-05,
"loss": 3.0475,
"step": 286000
},
{
"epoch": 0.6898249614305982,
"grad_norm": 16.31940460205078,
"learning_rate": 1.7249036220969587e-05,
"loss": 2.9996,
"step": 286500
},
{
"epoch": 0.6910288444348401,
"grad_norm": 18.389102935791016,
"learning_rate": 1.718215356792201e-05,
"loss": 3.0546,
"step": 287000
},
{
"epoch": 0.692232727439082,
"grad_norm": 13.655202865600586,
"learning_rate": 1.711540468018053e-05,
"loss": 3.0324,
"step": 287500
},
{
"epoch": 0.6934366104433239,
"grad_norm": 16.57909393310547,
"learning_rate": 1.7048522027132954e-05,
"loss": 3.0293,
"step": 288000
},
{
"epoch": 0.6946404934475657,
"grad_norm": 20.497554779052734,
"learning_rate": 1.6981639374085383e-05,
"loss": 3.0236,
"step": 288500
},
{
"epoch": 0.6958443764518076,
"grad_norm": 18.09133529663086,
"learning_rate": 1.6914756721037805e-05,
"loss": 3.0379,
"step": 289000
},
{
"epoch": 0.6970482594560495,
"grad_norm": 26.225669860839844,
"learning_rate": 1.684787406799023e-05,
"loss": 3.0053,
"step": 289500
},
{
"epoch": 0.6982521424602914,
"grad_norm": 17.222896575927734,
"learning_rate": 1.6780991414942657e-05,
"loss": 2.9939,
"step": 290000
},
{
"epoch": 0.6982521424602914,
"eval_runtime": 6305.6016,
"eval_samples_per_second": 131.731,
"eval_steps_per_second": 32.933,
"step": 290000
},
{
"epoch": 0.6994560254645333,
"grad_norm": 13.189409255981445,
"learning_rate": 1.6714108761895082e-05,
"loss": 3.0342,
"step": 290500
},
{
"epoch": 0.7006599084687752,
"grad_norm": 16.97842025756836,
"learning_rate": 1.6647226108847505e-05,
"loss": 3.046,
"step": 291000
},
{
"epoch": 0.701863791473017,
"grad_norm": 22.634611129760742,
"learning_rate": 1.658061098641212e-05,
"loss": 3.0375,
"step": 291500
},
{
"epoch": 0.7030676744772589,
"grad_norm": 18.193796157836914,
"learning_rate": 1.6513728333364545e-05,
"loss": 3.0379,
"step": 292000
},
{
"epoch": 0.7042715574815008,
"grad_norm": 18.391408920288086,
"learning_rate": 1.644684568031697e-05,
"loss": 2.9838,
"step": 292500
},
{
"epoch": 0.7054754404857427,
"grad_norm": 20.497100830078125,
"learning_rate": 1.6380096792575493e-05,
"loss": 3.0761,
"step": 293000
},
{
"epoch": 0.7066793234899846,
"grad_norm": 18.94228744506836,
"learning_rate": 1.6313214139527915e-05,
"loss": 3.0614,
"step": 293500
},
{
"epoch": 0.7078832064942264,
"grad_norm": 15.402490615844727,
"learning_rate": 1.624633148648034e-05,
"loss": 3.0053,
"step": 294000
},
{
"epoch": 0.7090870894984683,
"grad_norm": 26.502038955688477,
"learning_rate": 1.6179448833432767e-05,
"loss": 3.0216,
"step": 294500
},
{
"epoch": 0.7102909725027102,
"grad_norm": 20.452205657958984,
"learning_rate": 1.6112566180385193e-05,
"loss": 2.9757,
"step": 295000
},
{
"epoch": 0.7102909725027102,
"eval_runtime": 6348.6102,
"eval_samples_per_second": 130.839,
"eval_steps_per_second": 32.71,
"step": 295000
},
{
"epoch": 0.7114948555069521,
"grad_norm": 104.5809097290039,
"learning_rate": 1.6045683527337615e-05,
"loss": 3.0088,
"step": 295500
},
{
"epoch": 0.712698738511194,
"grad_norm": 15.921069145202637,
"learning_rate": 1.597880087429004e-05,
"loss": 3.0624,
"step": 296000
},
{
"epoch": 0.7139026215154359,
"grad_norm": 11.739727020263672,
"learning_rate": 1.5911918221242467e-05,
"loss": 3.0515,
"step": 296500
},
{
"epoch": 0.7151065045196777,
"grad_norm": 15.340862274169922,
"learning_rate": 1.5845169333500985e-05,
"loss": 3.0208,
"step": 297000
},
{
"epoch": 0.7163103875239196,
"grad_norm": 16.77552604675293,
"learning_rate": 1.577828668045341e-05,
"loss": 3.0112,
"step": 297500
},
{
"epoch": 0.7175142705281615,
"grad_norm": 19.09606170654297,
"learning_rate": 1.5711404027405837e-05,
"loss": 3.0038,
"step": 298000
},
{
"epoch": 0.7187181535324034,
"grad_norm": 12.892488479614258,
"learning_rate": 1.5644521374358262e-05,
"loss": 3.0353,
"step": 298500
},
{
"epoch": 0.7199220365366453,
"grad_norm": 15.720181465148926,
"learning_rate": 1.5577638721310685e-05,
"loss": 3.003,
"step": 299000
},
{
"epoch": 0.7211259195408872,
"grad_norm": 16.5432186126709,
"learning_rate": 1.5510756068263114e-05,
"loss": 3.0594,
"step": 299500
},
{
"epoch": 0.722329802545129,
"grad_norm": 24.2777042388916,
"learning_rate": 1.5443873415215536e-05,
"loss": 3.0239,
"step": 300000
},
{
"epoch": 0.722329802545129,
"eval_runtime": 6203.3821,
"eval_samples_per_second": 133.902,
"eval_steps_per_second": 33.476,
"step": 300000
},
{
"epoch": 0.7235336855493709,
"grad_norm": 14.297070503234863,
"learning_rate": 1.5376990762167962e-05,
"loss": 3.0123,
"step": 300500
},
{
"epoch": 0.7247375685536128,
"grad_norm": 18.216154098510742,
"learning_rate": 1.5310108109120384e-05,
"loss": 2.9833,
"step": 301000
},
{
"epoch": 0.7259414515578547,
"grad_norm": 15.619494438171387,
"learning_rate": 1.5243359221378908e-05,
"loss": 3.0715,
"step": 301500
},
{
"epoch": 0.7271453345620966,
"grad_norm": 22.748498916625977,
"learning_rate": 1.5176476568331332e-05,
"loss": 3.0101,
"step": 302000
},
{
"epoch": 0.7283492175663384,
"grad_norm": 16.824371337890625,
"learning_rate": 1.5109593915283756e-05,
"loss": 3.0347,
"step": 302500
},
{
"epoch": 0.7295531005705803,
"grad_norm": 15.611109733581543,
"learning_rate": 1.504271126223618e-05,
"loss": 3.0386,
"step": 303000
},
{
"epoch": 0.7307569835748222,
"grad_norm": 17.015262603759766,
"learning_rate": 1.4975962374494704e-05,
"loss": 3.0148,
"step": 303500
},
{
"epoch": 0.7319608665790641,
"grad_norm": 18.96904945373535,
"learning_rate": 1.4909079721447128e-05,
"loss": 3.1005,
"step": 304000
},
{
"epoch": 0.733164749583306,
"grad_norm": 21.718101501464844,
"learning_rate": 1.4842197068399552e-05,
"loss": 3.0489,
"step": 304500
},
{
"epoch": 0.7343686325875479,
"grad_norm": 14.246601104736328,
"learning_rate": 1.4775314415351976e-05,
"loss": 3.0439,
"step": 305000
},
{
"epoch": 0.7343686325875479,
"eval_runtime": 6028.9355,
"eval_samples_per_second": 137.776,
"eval_steps_per_second": 34.444,
"step": 305000
},
{
"epoch": 0.7355725155917897,
"grad_norm": 16.374101638793945,
"learning_rate": 1.4708431762304404e-05,
"loss": 2.9875,
"step": 305500
},
{
"epoch": 0.7367763985960316,
"grad_norm": 21.80797004699707,
"learning_rate": 1.4641549109256828e-05,
"loss": 3.0288,
"step": 306000
},
{
"epoch": 0.7379802816002735,
"grad_norm": 14.981256484985352,
"learning_rate": 1.4574666456209252e-05,
"loss": 3.0079,
"step": 306500
},
{
"epoch": 0.7391841646045154,
"grad_norm": 15.336825370788574,
"learning_rate": 1.4507783803161679e-05,
"loss": 3.0317,
"step": 307000
},
{
"epoch": 0.7403880476087573,
"grad_norm": 16.014474868774414,
"learning_rate": 1.4440901150114103e-05,
"loss": 3.0247,
"step": 307500
},
{
"epoch": 0.7415919306129992,
"grad_norm": 14.997090339660645,
"learning_rate": 1.4374152262372622e-05,
"loss": 3.0177,
"step": 308000
},
{
"epoch": 0.742795813617241,
"grad_norm": 17.185972213745117,
"learning_rate": 1.4307269609325048e-05,
"loss": 3.021,
"step": 308500
},
{
"epoch": 0.7439996966214829,
"grad_norm": 14.902591705322266,
"learning_rate": 1.4240386956277473e-05,
"loss": 3.0291,
"step": 309000
},
{
"epoch": 0.7452035796257248,
"grad_norm": 17.680278778076172,
"learning_rate": 1.4173504303229897e-05,
"loss": 3.0205,
"step": 309500
},
{
"epoch": 0.7464074626299667,
"grad_norm": 18.492225646972656,
"learning_rate": 1.4106621650182321e-05,
"loss": 3.012,
"step": 310000
},
{
"epoch": 0.7464074626299667,
"eval_runtime": 6370.9929,
"eval_samples_per_second": 130.379,
"eval_steps_per_second": 32.595,
"step": 310000
},
{
"epoch": 0.7476113456342086,
"grad_norm": 18.544729232788086,
"learning_rate": 1.4039872762440842e-05,
"loss": 2.9965,
"step": 310500
},
{
"epoch": 0.7488152286384504,
"grad_norm": 19.649858474731445,
"learning_rate": 1.397299010939327e-05,
"loss": 3.0335,
"step": 311000
},
{
"epoch": 0.7500191116426923,
"grad_norm": 19.35677146911621,
"learning_rate": 1.3906107456345693e-05,
"loss": 3.0426,
"step": 311500
},
{
"epoch": 0.7512229946469342,
"grad_norm": 19.635725021362305,
"learning_rate": 1.3839224803298117e-05,
"loss": 3.0506,
"step": 312000
},
{
"epoch": 0.7524268776511761,
"grad_norm": 16.11264991760254,
"learning_rate": 1.3772342150250541e-05,
"loss": 3.0185,
"step": 312500
},
{
"epoch": 0.753630760655418,
"grad_norm": 16.436038970947266,
"learning_rate": 1.3705593262509065e-05,
"loss": 2.9902,
"step": 313000
},
{
"epoch": 0.7548346436596599,
"grad_norm": 15.412540435791016,
"learning_rate": 1.363871060946149e-05,
"loss": 2.987,
"step": 313500
},
{
"epoch": 0.7560385266639017,
"grad_norm": 15.1536283493042,
"learning_rate": 1.3571827956413913e-05,
"loss": 2.9802,
"step": 314000
},
{
"epoch": 0.7572424096681436,
"grad_norm": 12.424234390258789,
"learning_rate": 1.3504945303366337e-05,
"loss": 3.0389,
"step": 314500
},
{
"epoch": 0.7584462926723855,
"grad_norm": 18.4250431060791,
"learning_rate": 1.3438062650318765e-05,
"loss": 3.0125,
"step": 315000
},
{
"epoch": 0.7584462926723855,
"eval_runtime": 6375.7867,
"eval_samples_per_second": 130.281,
"eval_steps_per_second": 32.57,
"step": 315000
},
{
"epoch": 0.7596501756766274,
"grad_norm": 16.10649299621582,
"learning_rate": 1.3371313762577283e-05,
"loss": 2.9806,
"step": 315500
},
{
"epoch": 0.7608540586808693,
"grad_norm": 20.46068572998047,
"learning_rate": 1.3304431109529707e-05,
"loss": 3.0044,
"step": 316000
},
{
"epoch": 0.7620579416851112,
"grad_norm": 13.980119705200195,
"learning_rate": 1.3237548456482131e-05,
"loss": 3.0349,
"step": 316500
},
{
"epoch": 0.763261824689353,
"grad_norm": 14.805524826049805,
"learning_rate": 1.3170665803434559e-05,
"loss": 3.0352,
"step": 317000
},
{
"epoch": 0.7644657076935949,
"grad_norm": 17.586395263671875,
"learning_rate": 1.3103783150386983e-05,
"loss": 3.0501,
"step": 317500
},
{
"epoch": 0.7656695906978368,
"grad_norm": 17.75722312927246,
"learning_rate": 1.3036900497339407e-05,
"loss": 3.0598,
"step": 318000
},
{
"epoch": 0.7668734737020787,
"grad_norm": 22.714632034301758,
"learning_rate": 1.2970017844291834e-05,
"loss": 3.0555,
"step": 318500
},
{
"epoch": 0.7680773567063206,
"grad_norm": 13.692117691040039,
"learning_rate": 1.2903268956550355e-05,
"loss": 3.0004,
"step": 319000
},
{
"epoch": 0.7692812397105625,
"grad_norm": 15.780096054077148,
"learning_rate": 1.2836386303502779e-05,
"loss": 3.0278,
"step": 319500
},
{
"epoch": 0.7704851227148043,
"grad_norm": 22.532176971435547,
"learning_rate": 1.2769503650455203e-05,
"loss": 3.045,
"step": 320000
},
{
"epoch": 0.7704851227148043,
"eval_runtime": 6361.4309,
"eval_samples_per_second": 130.575,
"eval_steps_per_second": 32.644,
"step": 320000
},
{
"epoch": 0.7716890057190462,
"grad_norm": 16.199644088745117,
"learning_rate": 1.270262099740763e-05,
"loss": 3.03,
"step": 320500
},
{
"epoch": 0.7728928887232881,
"grad_norm": 23.411863327026367,
"learning_rate": 1.2635738344360054e-05,
"loss": 3.0227,
"step": 321000
},
{
"epoch": 0.77409677172753,
"grad_norm": 14.578089714050293,
"learning_rate": 1.2568989456618575e-05,
"loss": 3.0099,
"step": 321500
},
{
"epoch": 0.7753006547317719,
"grad_norm": 22.472322463989258,
"learning_rate": 1.2502106803570999e-05,
"loss": 3.0347,
"step": 322000
},
{
"epoch": 0.7765045377360137,
"grad_norm": 12.440498352050781,
"learning_rate": 1.2435224150523425e-05,
"loss": 2.9987,
"step": 322500
},
{
"epoch": 0.7777084207402556,
"grad_norm": 20.633949279785156,
"learning_rate": 1.2368341497475849e-05,
"loss": 3.0421,
"step": 323000
},
{
"epoch": 0.7789123037444975,
"grad_norm": 17.52497673034668,
"learning_rate": 1.2301458844428274e-05,
"loss": 3.0747,
"step": 323500
},
{
"epoch": 0.7801161867487394,
"grad_norm": 19.617210388183594,
"learning_rate": 1.2234576191380698e-05,
"loss": 2.9955,
"step": 324000
},
{
"epoch": 0.7813200697529813,
"grad_norm": 16.269994735717773,
"learning_rate": 1.2167827303639219e-05,
"loss": 2.94,
"step": 324500
},
{
"epoch": 0.7825239527572232,
"grad_norm": 13.604962348937988,
"learning_rate": 1.2100944650591644e-05,
"loss": 3.0736,
"step": 325000
},
{
"epoch": 0.7825239527572232,
"eval_runtime": 6376.4381,
"eval_samples_per_second": 130.268,
"eval_steps_per_second": 32.567,
"step": 325000
},
{
"epoch": 0.783727835761465,
"grad_norm": 20.704360961914062,
"learning_rate": 1.203406199754407e-05,
"loss": 3.0536,
"step": 325500
},
{
"epoch": 0.7849317187657069,
"grad_norm": 14.824162483215332,
"learning_rate": 1.1967179344496494e-05,
"loss": 3.0263,
"step": 326000
},
{
"epoch": 0.7861356017699488,
"grad_norm": 16.627286911010742,
"learning_rate": 1.190029669144892e-05,
"loss": 3.0037,
"step": 326500
},
{
"epoch": 0.7873394847741907,
"grad_norm": 13.925793647766113,
"learning_rate": 1.183354780370744e-05,
"loss": 3.0127,
"step": 327000
},
{
"epoch": 0.7885433677784326,
"grad_norm": 19.544754028320312,
"learning_rate": 1.1766665150659866e-05,
"loss": 3.0307,
"step": 327500
},
{
"epoch": 0.7897472507826745,
"grad_norm": 13.963886260986328,
"learning_rate": 1.169978249761229e-05,
"loss": 3.034,
"step": 328000
},
{
"epoch": 0.7909511337869163,
"grad_norm": 17.435409545898438,
"learning_rate": 1.1632899844564716e-05,
"loss": 3.0295,
"step": 328500
},
{
"epoch": 0.7921550167911582,
"grad_norm": 17.950336456298828,
"learning_rate": 1.156601719151714e-05,
"loss": 3.0332,
"step": 329000
},
{
"epoch": 0.7933588997954001,
"grad_norm": 18.523168563842773,
"learning_rate": 1.1499134538469566e-05,
"loss": 3.0235,
"step": 329500
},
{
"epoch": 0.794562782799642,
"grad_norm": 14.469148635864258,
"learning_rate": 1.143225188542199e-05,
"loss": 3.0022,
"step": 330000
},
{
"epoch": 0.794562782799642,
"eval_runtime": 6383.675,
"eval_samples_per_second": 130.12,
"eval_steps_per_second": 32.53,
"step": 330000
},
{
"epoch": 0.7957666658038839,
"grad_norm": 17.111066818237305,
"learning_rate": 1.136550299768051e-05,
"loss": 3.0552,
"step": 330500
},
{
"epoch": 0.7969705488081257,
"grad_norm": 15.104440689086914,
"learning_rate": 1.1298620344632934e-05,
"loss": 3.0274,
"step": 331000
},
{
"epoch": 0.7981744318123676,
"grad_norm": 16.809152603149414,
"learning_rate": 1.123173769158536e-05,
"loss": 3.0156,
"step": 331500
},
{
"epoch": 0.7993783148166095,
"grad_norm": 16.31627655029297,
"learning_rate": 1.1164855038537784e-05,
"loss": 3.0302,
"step": 332000
},
{
"epoch": 0.8005821978208514,
"grad_norm": 14.074172019958496,
"learning_rate": 1.109797238549021e-05,
"loss": 3.0415,
"step": 332500
},
{
"epoch": 0.8017860808250933,
"grad_norm": 26.245460510253906,
"learning_rate": 1.1031089732442635e-05,
"loss": 3.0031,
"step": 333000
},
{
"epoch": 0.8029899638293352,
"grad_norm": 30.44843864440918,
"learning_rate": 1.0964340844701156e-05,
"loss": 3.0017,
"step": 333500
},
{
"epoch": 0.804193846833577,
"grad_norm": 17.4643611907959,
"learning_rate": 1.0897458191653582e-05,
"loss": 3.0633,
"step": 334000
},
{
"epoch": 0.8053977298378189,
"grad_norm": 31.82565689086914,
"learning_rate": 1.0830575538606006e-05,
"loss": 3.043,
"step": 334500
},
{
"epoch": 0.8066016128420608,
"grad_norm": 17.253402709960938,
"learning_rate": 1.0763692885558431e-05,
"loss": 3.0325,
"step": 335000
},
{
"epoch": 0.8066016128420608,
"eval_runtime": 6315.4758,
"eval_samples_per_second": 131.525,
"eval_steps_per_second": 32.881,
"step": 335000
},
{
"epoch": 0.8078054958463027,
"grad_norm": 22.236631393432617,
"learning_rate": 1.0696810232510855e-05,
"loss": 3.0358,
"step": 335500
},
{
"epoch": 0.8090093788505446,
"grad_norm": 14.467453956604004,
"learning_rate": 1.0629927579463281e-05,
"loss": 2.9967,
"step": 336000
},
{
"epoch": 0.8102132618547865,
"grad_norm": 23.571836471557617,
"learning_rate": 1.0563044926415705e-05,
"loss": 3.0579,
"step": 336500
},
{
"epoch": 0.8114171448590283,
"grad_norm": 19.492727279663086,
"learning_rate": 1.0496162273368131e-05,
"loss": 3.0471,
"step": 337000
},
{
"epoch": 0.8126210278632702,
"grad_norm": 14.599898338317871,
"learning_rate": 1.0429413385626651e-05,
"loss": 3.0066,
"step": 337500
},
{
"epoch": 0.8138249108675121,
"grad_norm": 17.604732513427734,
"learning_rate": 1.0362530732579075e-05,
"loss": 3.0106,
"step": 338000
},
{
"epoch": 0.815028793871754,
"grad_norm": 15.079025268554688,
"learning_rate": 1.0295781844837596e-05,
"loss": 3.006,
"step": 338500
},
{
"epoch": 0.8162326768759959,
"grad_norm": 17.019149780273438,
"learning_rate": 1.0228899191790021e-05,
"loss": 3.0254,
"step": 339000
},
{
"epoch": 0.8174365598802377,
"grad_norm": 15.817625045776367,
"learning_rate": 1.0162016538742445e-05,
"loss": 3.002,
"step": 339500
},
{
"epoch": 0.8186404428844796,
"grad_norm": 13.755847930908203,
"learning_rate": 1.0095133885694871e-05,
"loss": 3.0058,
"step": 340000
},
{
"epoch": 0.8186404428844796,
"eval_runtime": 6241.8468,
"eval_samples_per_second": 133.077,
"eval_steps_per_second": 33.269,
"step": 340000
},
{
"epoch": 0.8198443258887215,
"grad_norm": 16.21925926208496,
"learning_rate": 1.0028251232647295e-05,
"loss": 3.0572,
"step": 340500
},
{
"epoch": 0.8210482088929634,
"grad_norm": 17.245609283447266,
"learning_rate": 9.961502344905817e-06,
"loss": 3.0659,
"step": 341000
},
{
"epoch": 0.8222520918972053,
"grad_norm": 17.12338638305664,
"learning_rate": 9.894619691858241e-06,
"loss": 3.0002,
"step": 341500
},
{
"epoch": 0.8234559749014472,
"grad_norm": 13.26212215423584,
"learning_rate": 9.827737038810667e-06,
"loss": 2.9828,
"step": 342000
},
{
"epoch": 0.824659857905689,
"grad_norm": 20.169322967529297,
"learning_rate": 9.760854385763091e-06,
"loss": 2.9912,
"step": 342500
},
{
"epoch": 0.8258637409099309,
"grad_norm": 18.99537467956543,
"learning_rate": 9.693971732715517e-06,
"loss": 3.0485,
"step": 343000
},
{
"epoch": 0.8270676239141728,
"grad_norm": 27.021839141845703,
"learning_rate": 9.627089079667943e-06,
"loss": 3.029,
"step": 343500
},
{
"epoch": 0.8282715069184147,
"grad_norm": 21.197938919067383,
"learning_rate": 9.560206426620367e-06,
"loss": 3.058,
"step": 344000
},
{
"epoch": 0.8294753899226566,
"grad_norm": 15.80473518371582,
"learning_rate": 9.493457538878885e-06,
"loss": 3.0378,
"step": 344500
},
{
"epoch": 0.8306792729268985,
"grad_norm": 20.992782592773438,
"learning_rate": 9.426574885831311e-06,
"loss": 3.042,
"step": 345000
},
{
"epoch": 0.8306792729268985,
"eval_runtime": 6237.488,
"eval_samples_per_second": 133.17,
"eval_steps_per_second": 33.293,
"step": 345000
},
{
"epoch": 0.8318831559311403,
"grad_norm": 15.700128555297852,
"learning_rate": 9.359692232783737e-06,
"loss": 3.007,
"step": 345500
},
{
"epoch": 0.8330870389353822,
"grad_norm": 15.391378402709961,
"learning_rate": 9.292809579736161e-06,
"loss": 3.0211,
"step": 346000
},
{
"epoch": 0.8342909219396241,
"grad_norm": 17.32360076904297,
"learning_rate": 9.225926926688587e-06,
"loss": 3.0727,
"step": 346500
},
{
"epoch": 0.835494804943866,
"grad_norm": 15.85698127746582,
"learning_rate": 9.159178038947107e-06,
"loss": 3.0066,
"step": 347000
},
{
"epoch": 0.8366986879481079,
"grad_norm": 15.092347145080566,
"learning_rate": 9.092295385899533e-06,
"loss": 3.0106,
"step": 347500
},
{
"epoch": 0.8379025709523497,
"grad_norm": 14.47977352142334,
"learning_rate": 9.025412732851957e-06,
"loss": 3.0139,
"step": 348000
},
{
"epoch": 0.8391064539565916,
"grad_norm": 12.257486343383789,
"learning_rate": 8.958530079804383e-06,
"loss": 3.0264,
"step": 348500
},
{
"epoch": 0.8403103369608335,
"grad_norm": 17.00981330871582,
"learning_rate": 8.891781192062903e-06,
"loss": 3.0321,
"step": 349000
},
{
"epoch": 0.8415142199650754,
"grad_norm": 17.08600616455078,
"learning_rate": 8.824898539015327e-06,
"loss": 3.0046,
"step": 349500
},
{
"epoch": 0.8427181029693173,
"grad_norm": 14.907938003540039,
"learning_rate": 8.758015885967753e-06,
"loss": 3.0485,
"step": 350000
},
{
"epoch": 0.8427181029693173,
"eval_runtime": 6388.9903,
"eval_samples_per_second": 130.012,
"eval_steps_per_second": 32.503,
"step": 350000
},
{
"epoch": 0.8439219859735592,
"grad_norm": 14.369677543640137,
"learning_rate": 8.691133232920177e-06,
"loss": 3.0205,
"step": 350500
},
{
"epoch": 0.845125868977801,
"grad_norm": 19.901779174804688,
"learning_rate": 8.624250579872602e-06,
"loss": 3.0481,
"step": 351000
},
{
"epoch": 0.8463297519820429,
"grad_norm": 14.823498725891113,
"learning_rate": 8.557367926825027e-06,
"loss": 2.9577,
"step": 351500
},
{
"epoch": 0.8475336349862848,
"grad_norm": 19.70775032043457,
"learning_rate": 8.490485273777452e-06,
"loss": 3.0341,
"step": 352000
},
{
"epoch": 0.8487375179905267,
"grad_norm": 17.01579475402832,
"learning_rate": 8.423736386035973e-06,
"loss": 2.9874,
"step": 352500
},
{
"epoch": 0.8499414009947686,
"grad_norm": 16.942848205566406,
"learning_rate": 8.356853732988397e-06,
"loss": 3.0226,
"step": 353000
},
{
"epoch": 0.8511452839990105,
"grad_norm": 16.905664443969727,
"learning_rate": 8.289971079940822e-06,
"loss": 2.9484,
"step": 353500
},
{
"epoch": 0.8523491670032523,
"grad_norm": 15.149470329284668,
"learning_rate": 8.223088426893248e-06,
"loss": 2.9945,
"step": 354000
},
{
"epoch": 0.8535530500074942,
"grad_norm": 21.70083236694336,
"learning_rate": 8.156205773845672e-06,
"loss": 3.0103,
"step": 354500
},
{
"epoch": 0.854756933011736,
"grad_norm": 12.760059356689453,
"learning_rate": 8.089323120798098e-06,
"loss": 3.0178,
"step": 355000
},
{
"epoch": 0.854756933011736,
"eval_runtime": 6293.9927,
"eval_samples_per_second": 131.974,
"eval_steps_per_second": 32.994,
"step": 355000
},
{
"epoch": 0.855960816015978,
"grad_norm": 28.85261344909668,
"learning_rate": 8.022440467750522e-06,
"loss": 2.9688,
"step": 355500
},
{
"epoch": 0.8571646990202199,
"grad_norm": 13.942831039428711,
"learning_rate": 7.955557814702948e-06,
"loss": 2.9447,
"step": 356000
},
{
"epoch": 0.8583685820244616,
"grad_norm": 14.091262817382812,
"learning_rate": 7.888808926961468e-06,
"loss": 2.9865,
"step": 356500
},
{
"epoch": 0.8595724650287035,
"grad_norm": 19.63146209716797,
"learning_rate": 7.821926273913894e-06,
"loss": 3.0031,
"step": 357000
},
{
"epoch": 0.8607763480329454,
"grad_norm": 12.868454933166504,
"learning_rate": 7.755043620866318e-06,
"loss": 2.9701,
"step": 357500
},
{
"epoch": 0.8619802310371873,
"grad_norm": 18.4489803314209,
"learning_rate": 7.688160967818744e-06,
"loss": 2.9628,
"step": 358000
},
{
"epoch": 0.8631841140414293,
"grad_norm": 14.441180229187012,
"learning_rate": 7.6212783147711685e-06,
"loss": 3.0001,
"step": 358500
},
{
"epoch": 0.8643879970456712,
"grad_norm": 14.59991455078125,
"learning_rate": 7.554529427029688e-06,
"loss": 3.0118,
"step": 359000
},
{
"epoch": 0.8655918800499129,
"grad_norm": 24.200435638427734,
"learning_rate": 7.487646773982113e-06,
"loss": 3.0567,
"step": 359500
},
{
"epoch": 0.8667957630541548,
"grad_norm": 17.150327682495117,
"learning_rate": 7.420764120934539e-06,
"loss": 3.0472,
"step": 360000
},
{
"epoch": 0.8667957630541548,
"eval_runtime": 6262.3633,
"eval_samples_per_second": 132.641,
"eval_steps_per_second": 33.16,
"step": 360000
},
{
"epoch": 0.8679996460583967,
"grad_norm": 20.363269805908203,
"learning_rate": 7.353881467886964e-06,
"loss": 2.9548,
"step": 360500
},
{
"epoch": 0.8692035290626386,
"grad_norm": 16.118206024169922,
"learning_rate": 7.2869988148393885e-06,
"loss": 3.0507,
"step": 361000
},
{
"epoch": 0.8704074120668805,
"grad_norm": 16.389257431030273,
"learning_rate": 7.220116161791813e-06,
"loss": 3.0169,
"step": 361500
},
{
"epoch": 0.8716112950711224,
"grad_norm": 15.485569953918457,
"learning_rate": 7.153233508744238e-06,
"loss": 3.016,
"step": 362000
},
{
"epoch": 0.8728151780753642,
"grad_norm": 18.530200958251953,
"learning_rate": 7.086350855696663e-06,
"loss": 3.0083,
"step": 362500
},
{
"epoch": 0.8740190610796061,
"grad_norm": 14.700156211853027,
"learning_rate": 7.0196019679551835e-06,
"loss": 2.9861,
"step": 363000
},
{
"epoch": 0.875222944083848,
"grad_norm": 19.87506675720215,
"learning_rate": 6.952719314907609e-06,
"loss": 3.0287,
"step": 363500
},
{
"epoch": 0.8764268270880899,
"grad_norm": 25.59213638305664,
"learning_rate": 6.885836661860033e-06,
"loss": 3.0149,
"step": 364000
},
{
"epoch": 0.8776307100923318,
"grad_norm": 16.81450653076172,
"learning_rate": 6.818954008812459e-06,
"loss": 3.0167,
"step": 364500
},
{
"epoch": 0.8788345930965736,
"grad_norm": 20.761167526245117,
"learning_rate": 6.7522051210709786e-06,
"loss": 3.037,
"step": 365000
},
{
"epoch": 0.8788345930965736,
"eval_runtime": 6325.2523,
"eval_samples_per_second": 131.322,
"eval_steps_per_second": 32.831,
"step": 365000
},
{
"epoch": 0.8800384761008155,
"grad_norm": 18.997737884521484,
"learning_rate": 6.685322468023404e-06,
"loss": 3.0299,
"step": 365500
},
{
"epoch": 0.8812423591050574,
"grad_norm": 18.71440315246582,
"learning_rate": 6.618439814975828e-06,
"loss": 3.018,
"step": 366000
},
{
"epoch": 0.8824462421092993,
"grad_norm": 17.6945858001709,
"learning_rate": 6.551557161928254e-06,
"loss": 3.0215,
"step": 366500
},
{
"epoch": 0.8836501251135412,
"grad_norm": 17.693279266357422,
"learning_rate": 6.48467450888068e-06,
"loss": 3.0327,
"step": 367000
},
{
"epoch": 0.8848540081177831,
"grad_norm": 12.849013328552246,
"learning_rate": 6.417791855833104e-06,
"loss": 3.0219,
"step": 367500
},
{
"epoch": 0.8860578911220249,
"grad_norm": 15.688241958618164,
"learning_rate": 6.351042968091623e-06,
"loss": 3.0481,
"step": 368000
},
{
"epoch": 0.8872617741262668,
"grad_norm": 16.61380958557129,
"learning_rate": 6.284160315044049e-06,
"loss": 2.9957,
"step": 368500
},
{
"epoch": 0.8884656571305087,
"grad_norm": 14.891318321228027,
"learning_rate": 6.217277661996474e-06,
"loss": 3.0474,
"step": 369000
},
{
"epoch": 0.8896695401347506,
"grad_norm": 20.029443740844727,
"learning_rate": 6.150395008948899e-06,
"loss": 3.0241,
"step": 369500
},
{
"epoch": 0.8908734231389925,
"grad_norm": 13.43873119354248,
"learning_rate": 6.083512355901325e-06,
"loss": 3.0318,
"step": 370000
},
{
"epoch": 0.8908734231389925,
"eval_runtime": 6403.6333,
"eval_samples_per_second": 129.715,
"eval_steps_per_second": 32.429,
"step": 370000
},
{
"epoch": 0.8920773061432344,
"grad_norm": 16.173236846923828,
"learning_rate": 6.016763468159844e-06,
"loss": 3.0562,
"step": 370500
},
{
"epoch": 0.8932811891474762,
"grad_norm": 21.55840301513672,
"learning_rate": 5.949880815112269e-06,
"loss": 2.9967,
"step": 371000
},
{
"epoch": 0.8944850721517181,
"grad_norm": 15.276843070983887,
"learning_rate": 5.882998162064694e-06,
"loss": 3.0263,
"step": 371500
},
{
"epoch": 0.89568895515596,
"grad_norm": 15.087631225585938,
"learning_rate": 5.81611550901712e-06,
"loss": 2.9793,
"step": 372000
},
{
"epoch": 0.8968928381602019,
"grad_norm": 12.954302787780762,
"learning_rate": 5.749232855969545e-06,
"loss": 3.0192,
"step": 372500
},
{
"epoch": 0.8980967211644438,
"grad_norm": 20.37034797668457,
"learning_rate": 5.6823502029219695e-06,
"loss": 3.0274,
"step": 373000
},
{
"epoch": 0.8993006041686856,
"grad_norm": 16.947673797607422,
"learning_rate": 5.61560131518049e-06,
"loss": 3.0792,
"step": 373500
},
{
"epoch": 0.9005044871729275,
"grad_norm": 14.517135620117188,
"learning_rate": 5.548718662132915e-06,
"loss": 2.9878,
"step": 374000
},
{
"epoch": 0.9017083701771694,
"grad_norm": 15.187361717224121,
"learning_rate": 5.48183600908534e-06,
"loss": 3.0541,
"step": 374500
},
{
"epoch": 0.9029122531814113,
"grad_norm": 15.383942604064941,
"learning_rate": 5.4149533560377646e-06,
"loss": 3.021,
"step": 375000
},
{
"epoch": 0.9029122531814113,
"eval_runtime": 6390.605,
"eval_samples_per_second": 129.979,
"eval_steps_per_second": 32.495,
"step": 375000
},
{
"epoch": 0.9041161361856532,
"grad_norm": 17.510334014892578,
"learning_rate": 5.3480707029901895e-06,
"loss": 3.0943,
"step": 375500
},
{
"epoch": 0.9053200191898951,
"grad_norm": 16.601346969604492,
"learning_rate": 5.281321815248711e-06,
"loss": 3.0723,
"step": 376000
},
{
"epoch": 0.9065239021941369,
"grad_norm": 22.802818298339844,
"learning_rate": 5.2144391622011356e-06,
"loss": 3.0491,
"step": 376500
},
{
"epoch": 0.9077277851983788,
"grad_norm": 17.018939971923828,
"learning_rate": 5.1475565091535605e-06,
"loss": 3.0556,
"step": 377000
},
{
"epoch": 0.9089316682026207,
"grad_norm": 19.08505630493164,
"learning_rate": 5.080673856105985e-06,
"loss": 3.046,
"step": 377500
},
{
"epoch": 0.9101355512068626,
"grad_norm": 16.25370216369629,
"learning_rate": 5.01379120305841e-06,
"loss": 3.0191,
"step": 378000
},
{
"epoch": 0.9113394342111045,
"grad_norm": 16.954275131225586,
"learning_rate": 4.946908550010835e-06,
"loss": 3.0025,
"step": 378500
},
{
"epoch": 0.9125433172153464,
"grad_norm": 26.870176315307617,
"learning_rate": 4.8801596622693555e-06,
"loss": 3.0288,
"step": 379000
},
{
"epoch": 0.9137472002195882,
"grad_norm": 14.162908554077148,
"learning_rate": 4.81327700922178e-06,
"loss": 3.0278,
"step": 379500
},
{
"epoch": 0.9149510832238301,
"grad_norm": 16.129444122314453,
"learning_rate": 4.746394356174205e-06,
"loss": 3.0409,
"step": 380000
},
{
"epoch": 0.9149510832238301,
"eval_runtime": 6329.5669,
"eval_samples_per_second": 131.233,
"eval_steps_per_second": 32.808,
"step": 380000
},
{
"epoch": 0.916154966228072,
"grad_norm": 19.689468383789062,
"learning_rate": 4.67951170312663e-06,
"loss": 3.0399,
"step": 380500
},
{
"epoch": 0.9173588492323139,
"grad_norm": 17.123493194580078,
"learning_rate": 4.6127628153851506e-06,
"loss": 3.0142,
"step": 381000
},
{
"epoch": 0.9185627322365558,
"grad_norm": 15.44541072845459,
"learning_rate": 4.5458801623375755e-06,
"loss": 2.9937,
"step": 381500
},
{
"epoch": 0.9197666152407976,
"grad_norm": 20.037689208984375,
"learning_rate": 4.47899750929e-06,
"loss": 3.0889,
"step": 382000
},
{
"epoch": 0.9209704982450395,
"grad_norm": 17.4291934967041,
"learning_rate": 4.412114856242425e-06,
"loss": 2.9653,
"step": 382500
},
{
"epoch": 0.9221743812492814,
"grad_norm": 18.911190032958984,
"learning_rate": 4.345232203194851e-06,
"loss": 3.0299,
"step": 383000
},
{
"epoch": 0.9233782642535233,
"grad_norm": 18.403993606567383,
"learning_rate": 4.278349550147276e-06,
"loss": 3.0437,
"step": 383500
},
{
"epoch": 0.9245821472577652,
"grad_norm": 17.68988800048828,
"learning_rate": 4.211600662405795e-06,
"loss": 2.9655,
"step": 384000
},
{
"epoch": 0.9257860302620071,
"grad_norm": 15.752707481384277,
"learning_rate": 4.144718009358221e-06,
"loss": 3.0118,
"step": 384500
},
{
"epoch": 0.9269899132662489,
"grad_norm": 15.633676528930664,
"learning_rate": 4.077835356310646e-06,
"loss": 2.98,
"step": 385000
},
{
"epoch": 0.9269899132662489,
"eval_runtime": 6416.4378,
"eval_samples_per_second": 129.456,
"eval_steps_per_second": 32.364,
"step": 385000
},
{
"epoch": 0.9281937962704908,
"grad_norm": 22.764881134033203,
"learning_rate": 4.010952703263071e-06,
"loss": 3.0464,
"step": 385500
},
{
"epoch": 0.9293976792747327,
"grad_norm": 16.236614227294922,
"learning_rate": 3.944070050215496e-06,
"loss": 3.0362,
"step": 386000
},
{
"epoch": 0.9306015622789746,
"grad_norm": 14.50631332397461,
"learning_rate": 3.877321162474017e-06,
"loss": 3.071,
"step": 386500
},
{
"epoch": 0.9318054452832165,
"grad_norm": 13.831846237182617,
"learning_rate": 3.8104385094264415e-06,
"loss": 3.0001,
"step": 387000
},
{
"epoch": 0.9330093282874584,
"grad_norm": 12.26697826385498,
"learning_rate": 3.7435558563788664e-06,
"loss": 3.0437,
"step": 387500
},
{
"epoch": 0.9342132112917002,
"grad_norm": 20.174835205078125,
"learning_rate": 3.6766732033312913e-06,
"loss": 3.0136,
"step": 388000
},
{
"epoch": 0.9354170942959421,
"grad_norm": 19.26807975769043,
"learning_rate": 3.609790550283716e-06,
"loss": 3.0054,
"step": 388500
},
{
"epoch": 0.936620977300184,
"grad_norm": 13.987044334411621,
"learning_rate": 3.542907897236141e-06,
"loss": 3.004,
"step": 389000
},
{
"epoch": 0.9378248603044259,
"grad_norm": 19.408586502075195,
"learning_rate": 3.476025244188567e-06,
"loss": 3.0438,
"step": 389500
},
{
"epoch": 0.9390287433086678,
"grad_norm": 20.116239547729492,
"learning_rate": 3.4092763564470868e-06,
"loss": 3.0043,
"step": 390000
},
{
"epoch": 0.9390287433086678,
"eval_runtime": 6418.8738,
"eval_samples_per_second": 129.407,
"eval_steps_per_second": 32.352,
"step": 390000
},
{
"epoch": 0.9402326263129097,
"grad_norm": 15.818509101867676,
"learning_rate": 3.3423937033995117e-06,
"loss": 2.9467,
"step": 390500
},
{
"epoch": 0.9414365093171515,
"grad_norm": 17.208309173583984,
"learning_rate": 3.2755110503519366e-06,
"loss": 3.0507,
"step": 391000
},
{
"epoch": 0.9426403923213934,
"grad_norm": 14.738162994384766,
"learning_rate": 3.208628397304362e-06,
"loss": 3.0281,
"step": 391500
},
{
"epoch": 0.9438442753256353,
"grad_norm": 15.624344825744629,
"learning_rate": 3.141879509562882e-06,
"loss": 3.0248,
"step": 392000
},
{
"epoch": 0.9450481583298772,
"grad_norm": 17.159011840820312,
"learning_rate": 3.074996856515307e-06,
"loss": 2.9597,
"step": 392500
},
{
"epoch": 0.9462520413341191,
"grad_norm": 13.915901184082031,
"learning_rate": 3.008114203467732e-06,
"loss": 2.9937,
"step": 393000
},
{
"epoch": 0.9474559243383609,
"grad_norm": 20.13627052307129,
"learning_rate": 2.941231550420157e-06,
"loss": 2.9966,
"step": 393500
},
{
"epoch": 0.9486598073426028,
"grad_norm": 26.449026107788086,
"learning_rate": 2.8743488973725822e-06,
"loss": 2.9904,
"step": 394000
},
{
"epoch": 0.9498636903468447,
"grad_norm": 18.189252853393555,
"learning_rate": 2.807600009631102e-06,
"loss": 3.0078,
"step": 394500
},
{
"epoch": 0.9510675733510866,
"grad_norm": 20.91954803466797,
"learning_rate": 2.740717356583527e-06,
"loss": 3.0439,
"step": 395000
},
{
"epoch": 0.9510675733510866,
"eval_runtime": 6350.5821,
"eval_samples_per_second": 130.798,
"eval_steps_per_second": 32.7,
"step": 395000
},
{
"epoch": 0.9522714563553285,
"grad_norm": 18.318206787109375,
"learning_rate": 2.673834703535952e-06,
"loss": 2.989,
"step": 395500
},
{
"epoch": 0.9534753393595704,
"grad_norm": 16.19314193725586,
"learning_rate": 2.6069520504883773e-06,
"loss": 2.9842,
"step": 396000
},
{
"epoch": 0.9546792223638122,
"grad_norm": 16.36551856994629,
"learning_rate": 2.540069397440802e-06,
"loss": 2.9938,
"step": 396500
},
{
"epoch": 0.9558831053680541,
"grad_norm": 19.816038131713867,
"learning_rate": 2.473186744393227e-06,
"loss": 3.0204,
"step": 397000
},
{
"epoch": 0.957086988372296,
"grad_norm": 14.318347930908203,
"learning_rate": 2.4064378566517474e-06,
"loss": 3.0851,
"step": 397500
},
{
"epoch": 0.9582908713765379,
"grad_norm": 17.114421844482422,
"learning_rate": 2.3395552036041728e-06,
"loss": 3.0096,
"step": 398000
},
{
"epoch": 0.9594947543807798,
"grad_norm": 18.27849578857422,
"learning_rate": 2.2726725505565977e-06,
"loss": 3.0374,
"step": 398500
},
{
"epoch": 0.9606986373850217,
"grad_norm": 16.87068748474121,
"learning_rate": 2.2057898975090225e-06,
"loss": 3.0484,
"step": 399000
},
{
"epoch": 0.9619025203892635,
"grad_norm": 22.162954330444336,
"learning_rate": 2.138907244461448e-06,
"loss": 3.04,
"step": 399500
},
{
"epoch": 0.9631064033935054,
"grad_norm": 16.329286575317383,
"learning_rate": 2.0720245914138728e-06,
"loss": 2.9491,
"step": 400000
},
{
"epoch": 0.9631064033935054,
"eval_runtime": 6424.6879,
"eval_samples_per_second": 129.29,
"eval_steps_per_second": 32.323,
"step": 400000
},
{
"epoch": 0.9643102863977473,
"grad_norm": 16.189512252807617,
"learning_rate": 2.0052757036723927e-06,
"loss": 3.045,
"step": 400500
},
{
"epoch": 0.9655141694019892,
"grad_norm": 18.53325080871582,
"learning_rate": 1.9383930506248176e-06,
"loss": 3.0405,
"step": 401000
},
{
"epoch": 0.9667180524062311,
"grad_norm": 21.920936584472656,
"learning_rate": 1.871510397577243e-06,
"loss": 3.0347,
"step": 401500
},
{
"epoch": 0.9679219354104729,
"grad_norm": 10.188512802124023,
"learning_rate": 1.8046277445296678e-06,
"loss": 2.9497,
"step": 402000
},
{
"epoch": 0.9691258184147148,
"grad_norm": 23.691808700561523,
"learning_rate": 1.7377450914820931e-06,
"loss": 3.0046,
"step": 402500
},
{
"epoch": 0.9703297014189567,
"grad_norm": 17.392013549804688,
"learning_rate": 1.6709962037406129e-06,
"loss": 2.996,
"step": 403000
},
{
"epoch": 0.9715335844231986,
"grad_norm": 19.87090492248535,
"learning_rate": 1.6041135506930382e-06,
"loss": 3.042,
"step": 403500
},
{
"epoch": 0.9727374674274405,
"grad_norm": 19.895801544189453,
"learning_rate": 1.537230897645463e-06,
"loss": 2.978,
"step": 404000
},
{
"epoch": 0.9739413504316824,
"grad_norm": 16.795654296875,
"learning_rate": 1.4703482445978882e-06,
"loss": 3.0219,
"step": 404500
},
{
"epoch": 0.9751452334359242,
"grad_norm": 13.37932014465332,
"learning_rate": 1.4034655915503133e-06,
"loss": 3.0323,
"step": 405000
},
{
"epoch": 0.9751452334359242,
"eval_runtime": 6375.6502,
"eval_samples_per_second": 130.284,
"eval_steps_per_second": 32.571,
"step": 405000
},
{
"epoch": 0.9763491164401661,
"grad_norm": 14.84689712524414,
"learning_rate": 1.3367167038088334e-06,
"loss": 2.9709,
"step": 405500
},
{
"epoch": 0.977552999444408,
"grad_norm": 14.532979011535645,
"learning_rate": 1.2698340507612583e-06,
"loss": 3.0614,
"step": 406000
},
{
"epoch": 0.9787568824486499,
"grad_norm": 15.914132118225098,
"learning_rate": 1.2029513977136834e-06,
"loss": 3.0498,
"step": 406500
},
{
"epoch": 0.9799607654528918,
"grad_norm": 14.478850364685059,
"learning_rate": 1.1360687446661085e-06,
"loss": 2.9675,
"step": 407000
},
{
"epoch": 0.9811646484571337,
"grad_norm": 22.8538818359375,
"learning_rate": 1.0691860916185336e-06,
"loss": 3.0232,
"step": 407500
},
{
"epoch": 0.9823685314613755,
"grad_norm": 15.004932403564453,
"learning_rate": 1.0023034385709585e-06,
"loss": 2.9698,
"step": 408000
},
{
"epoch": 0.9835724144656174,
"grad_norm": 15.036443710327148,
"learning_rate": 9.354207855233835e-07,
"loss": 2.9961,
"step": 408500
},
{
"epoch": 0.9847762974698593,
"grad_norm": 19.975051879882812,
"learning_rate": 8.685381324758087e-07,
"loss": 3.0067,
"step": 409000
},
{
"epoch": 0.9859801804741012,
"grad_norm": 17.99605369567871,
"learning_rate": 8.017892447343288e-07,
"loss": 3.0702,
"step": 409500
},
{
"epoch": 0.9871840634783431,
"grad_norm": 15.935543060302734,
"learning_rate": 7.349065916867538e-07,
"loss": 3.0132,
"step": 410000
},
{
"epoch": 0.9871840634783431,
"eval_runtime": 6347.5902,
"eval_samples_per_second": 130.86,
"eval_steps_per_second": 32.715,
"step": 410000
},
{
"epoch": 0.9883879464825849,
"grad_norm": 12.5308256149292,
"learning_rate": 6.680239386391788e-07,
"loss": 2.977,
"step": 410500
},
{
"epoch": 0.9895918294868268,
"grad_norm": 15.325048446655273,
"learning_rate": 6.011412855916039e-07,
"loss": 3.0383,
"step": 411000
},
{
"epoch": 0.9907957124910687,
"grad_norm": 16.378740310668945,
"learning_rate": 5.342586325440288e-07,
"loss": 3.0278,
"step": 411500
},
{
"epoch": 0.9919995954953106,
"grad_norm": 17.669631958007812,
"learning_rate": 4.673759794964539e-07,
"loss": 2.9929,
"step": 412000
},
{
"epoch": 0.9932034784995525,
"grad_norm": 16.54693603515625,
"learning_rate": 4.004933264488789e-07,
"loss": 2.9768,
"step": 412500
},
{
"epoch": 0.9944073615037944,
"grad_norm": 16.434072494506836,
"learning_rate": 3.337444387073991e-07,
"loss": 3.0664,
"step": 413000
},
{
"epoch": 0.9956112445080362,
"grad_norm": 28.83799171447754,
"learning_rate": 2.6686178565982417e-07,
"loss": 3.0716,
"step": 413500
},
{
"epoch": 0.9968151275122781,
"grad_norm": 15.776455879211426,
"learning_rate": 1.9997913261224917e-07,
"loss": 3.0396,
"step": 414000
},
{
"epoch": 0.99801901051652,
"grad_norm": 18.937358856201172,
"learning_rate": 1.330964795646742e-07,
"loss": 3.0165,
"step": 414500
},
{
"epoch": 0.9992228935207619,
"grad_norm": 20.05877685546875,
"learning_rate": 6.621382651709922e-08,
"loss": 3.0059,
"step": 415000
},
{
"epoch": 0.9992228935207619,
"eval_runtime": 6436.495,
"eval_samples_per_second": 129.052,
"eval_steps_per_second": 32.263,
"step": 415000
}
],
"logging_steps": 500,
"max_steps": 415322,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1124779666016266e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}