{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 300,
"global_step": 49697,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002012224262394044,
"grad_norm": 1.5522648096084595,
"learning_rate": 1.9919517102615694e-06,
"loss": 1.2489,
"step": 100
},
{
"epoch": 0.004024448524788088,
"grad_norm": 1.59534752368927,
"learning_rate": 4.0040241448692155e-06,
"loss": 1.1249,
"step": 200
},
{
"epoch": 0.006036672787182132,
"grad_norm": 1.5959556102752686,
"learning_rate": 6.016096579476862e-06,
"loss": 1.0698,
"step": 300
},
{
"epoch": 0.006036672787182132,
"eval_loss": 0.9532507061958313,
"eval_runtime": 11.1234,
"eval_samples_per_second": 33.982,
"eval_steps_per_second": 1.169,
"step": 300
},
{
"epoch": 0.008048897049576176,
"grad_norm": 1.6886017322540283,
"learning_rate": 8.028169014084509e-06,
"loss": 1.0365,
"step": 400
},
{
"epoch": 0.01006112131197022,
"grad_norm": 1.421373963356018,
"learning_rate": 1.0040241448692154e-05,
"loss": 1.0155,
"step": 500
},
{
"epoch": 0.012073345574364264,
"grad_norm": 1.578765630722046,
"learning_rate": 1.20523138832998e-05,
"loss": 1.006,
"step": 600
},
{
"epoch": 0.012073345574364264,
"eval_loss": 0.8175720572471619,
"eval_runtime": 11.5611,
"eval_samples_per_second": 32.696,
"eval_steps_per_second": 1.124,
"step": 600
},
{
"epoch": 0.014085569836758306,
"grad_norm": 1.1900346279144287,
"learning_rate": 1.4064386317907446e-05,
"loss": 0.9972,
"step": 700
},
{
"epoch": 0.01609779409915235,
"grad_norm": 1.4590531587600708,
"learning_rate": 1.607645875251509e-05,
"loss": 0.9895,
"step": 800
},
{
"epoch": 0.018110018361546396,
"grad_norm": 1.8518555164337158,
"learning_rate": 1.8088531187122737e-05,
"loss": 0.9718,
"step": 900
},
{
"epoch": 0.018110018361546396,
"eval_loss": 0.781577467918396,
"eval_runtime": 11.4278,
"eval_samples_per_second": 33.077,
"eval_steps_per_second": 1.138,
"step": 900
},
{
"epoch": 0.02012224262394044,
"grad_norm": 1.351710319519043,
"learning_rate": 1.999999947988626e-05,
"loss": 0.972,
"step": 1000
},
{
"epoch": 0.022134466886334483,
"grad_norm": 1.2841336727142334,
"learning_rate": 1.9999770630715236e-05,
"loss": 0.9662,
"step": 1100
},
{
"epoch": 0.024146691148728527,
"grad_norm": 1.2296431064605713,
"learning_rate": 1.9999125701534677e-05,
"loss": 0.9578,
"step": 1200
},
{
"epoch": 0.024146691148728527,
"eval_loss": 0.9337042570114136,
"eval_runtime": 11.324,
"eval_samples_per_second": 33.38,
"eval_steps_per_second": 1.148,
"step": 1200
},
{
"epoch": 0.026158915411122568,
"grad_norm": 1.3778767585754395,
"learning_rate": 1.9998064719179408e-05,
"loss": 0.9614,
"step": 1300
},
{
"epoch": 0.028171139673516612,
"grad_norm": 1.3921650648117065,
"learning_rate": 1.9996587727795803e-05,
"loss": 0.9541,
"step": 1400
},
{
"epoch": 0.030183363935910656,
"grad_norm": 1.3527588844299316,
"learning_rate": 1.9994694788839924e-05,
"loss": 0.9488,
"step": 1500
},
{
"epoch": 0.030183363935910656,
"eval_loss": 0.7029635310173035,
"eval_runtime": 11.506,
"eval_samples_per_second": 32.853,
"eval_steps_per_second": 1.13,
"step": 1500
},
{
"epoch": 0.0321955881983047,
"grad_norm": 0.8907983303070068,
"learning_rate": 1.9992385981074994e-05,
"loss": 0.9418,
"step": 1600
},
{
"epoch": 0.03420781246069875,
"grad_norm": 1.136816382408142,
"learning_rate": 1.998966140056808e-05,
"loss": 0.946,
"step": 1700
},
{
"epoch": 0.03622003672309279,
"grad_norm": 1.0668370723724365,
"learning_rate": 1.9986521160686134e-05,
"loss": 0.9357,
"step": 1800
},
{
"epoch": 0.03622003672309279,
"eval_loss": 0.68252032995224,
"eval_runtime": 11.3032,
"eval_samples_per_second": 33.442,
"eval_steps_per_second": 1.15,
"step": 1800
},
{
"epoch": 0.038232260985486835,
"grad_norm": 0.8517168760299683,
"learning_rate": 1.9982965392091262e-05,
"loss": 0.936,
"step": 1900
},
{
"epoch": 0.04024448524788088,
"grad_norm": 1.0746815204620361,
"learning_rate": 1.9978994242735275e-05,
"loss": 0.9384,
"step": 2000
},
{
"epoch": 0.04225670951027492,
"grad_norm": 1.0119695663452148,
"learning_rate": 1.9974607877853555e-05,
"loss": 0.9252,
"step": 2100
},
{
"epoch": 0.04225670951027492,
"eval_loss": 0.672024130821228,
"eval_runtime": 11.3298,
"eval_samples_per_second": 33.363,
"eval_steps_per_second": 1.147,
"step": 2100
},
{
"epoch": 0.04426893377266897,
"grad_norm": 0.7535356283187866,
"learning_rate": 1.9969806479958154e-05,
"loss": 0.9215,
"step": 2200
},
{
"epoch": 0.04628115803506301,
"grad_norm": 0.837115466594696,
"learning_rate": 1.996459024883023e-05,
"loss": 0.9229,
"step": 2300
},
{
"epoch": 0.048293382297457055,
"grad_norm": 0.9772033095359802,
"learning_rate": 1.995895940151171e-05,
"loss": 0.9155,
"step": 2400
},
{
"epoch": 0.048293382297457055,
"eval_loss": 0.6609585285186768,
"eval_runtime": 11.8546,
"eval_samples_per_second": 31.886,
"eval_steps_per_second": 1.097,
"step": 2400
},
{
"epoch": 0.0503056065598511,
"grad_norm": 0.9059876799583435,
"learning_rate": 1.9952914172296264e-05,
"loss": 0.9104,
"step": 2500
},
{
"epoch": 0.052317830822245136,
"grad_norm": 1.090819239616394,
"learning_rate": 1.9946454812719572e-05,
"loss": 0.9056,
"step": 2600
},
{
"epoch": 0.05433005508463918,
"grad_norm": 0.8924378156661987,
"learning_rate": 1.9939581591548833e-05,
"loss": 0.9102,
"step": 2700
},
{
"epoch": 0.05433005508463918,
"eval_loss": 0.6568426489830017,
"eval_runtime": 11.3424,
"eval_samples_per_second": 33.326,
"eval_steps_per_second": 1.146,
"step": 2700
},
{
"epoch": 0.056342279347033224,
"grad_norm": 0.9142224788665771,
"learning_rate": 1.9932294794771596e-05,
"loss": 0.9101,
"step": 2800
},
{
"epoch": 0.05835450360942727,
"grad_norm": 1.060359239578247,
"learning_rate": 1.992459472558387e-05,
"loss": 0.9013,
"step": 2900
},
{
"epoch": 0.06036672787182131,
"grad_norm": 0.7167413234710693,
"learning_rate": 1.9916481704377487e-05,
"loss": 0.9002,
"step": 3000
},
{
"epoch": 0.06036672787182131,
"eval_loss": 0.6527668237686157,
"eval_runtime": 11.4,
"eval_samples_per_second": 33.158,
"eval_steps_per_second": 1.14,
"step": 3000
},
{
"epoch": 0.062378952134215356,
"grad_norm": 0.783549427986145,
"learning_rate": 1.9907956068726782e-05,
"loss": 0.897,
"step": 3100
},
{
"epoch": 0.0643911763966094,
"grad_norm": 0.9683724045753479,
"learning_rate": 1.9899018173374552e-05,
"loss": 0.9294,
"step": 3200
},
{
"epoch": 0.06640340065900345,
"grad_norm": 1.1547231674194336,
"learning_rate": 1.9889668390217284e-05,
"loss": 0.901,
"step": 3300
},
{
"epoch": 0.06640340065900345,
"eval_loss": 0.6419159173965454,
"eval_runtime": 11.429,
"eval_samples_per_second": 33.074,
"eval_steps_per_second": 1.137,
"step": 3300
},
{
"epoch": 0.0684156249213975,
"grad_norm": 0.81548011302948,
"learning_rate": 1.9879907108289684e-05,
"loss": 0.9008,
"step": 3400
},
{
"epoch": 0.07042784918379154,
"grad_norm": 0.7857891321182251,
"learning_rate": 1.98697347337485e-05,
"loss": 0.8928,
"step": 3500
},
{
"epoch": 0.07244007344618558,
"grad_norm": 0.8332715630531311,
"learning_rate": 1.985915168985561e-05,
"loss": 0.8889,
"step": 3600
},
{
"epoch": 0.07244007344618558,
"eval_loss": 0.6356409192085266,
"eval_runtime": 11.2917,
"eval_samples_per_second": 33.476,
"eval_steps_per_second": 1.151,
"step": 3600
},
{
"epoch": 0.07445229770857963,
"grad_norm": 0.9201735258102417,
"learning_rate": 1.9848158416960414e-05,
"loss": 0.8869,
"step": 3700
},
{
"epoch": 0.07646452197097367,
"grad_norm": 0.7852803468704224,
"learning_rate": 1.9836755372481512e-05,
"loss": 0.8973,
"step": 3800
},
{
"epoch": 0.07847674623336771,
"grad_norm": 0.7758309841156006,
"learning_rate": 1.982494303088767e-05,
"loss": 0.8925,
"step": 3900
},
{
"epoch": 0.07847674623336771,
"eval_loss": 0.6345422863960266,
"eval_runtime": 11.3533,
"eval_samples_per_second": 33.294,
"eval_steps_per_second": 1.145,
"step": 3900
},
{
"epoch": 0.08048897049576176,
"grad_norm": 0.9436432123184204,
"learning_rate": 1.981272188367809e-05,
"loss": 0.8847,
"step": 4000
},
{
"epoch": 0.0825011947581558,
"grad_norm": 0.8394960165023804,
"learning_rate": 1.980009243936193e-05,
"loss": 0.8923,
"step": 4100
},
{
"epoch": 0.08451341902054985,
"grad_norm": 0.8079524636268616,
"learning_rate": 1.9787055223437184e-05,
"loss": 0.8828,
"step": 4200
},
{
"epoch": 0.08451341902054985,
"eval_loss": 0.6277508735656738,
"eval_runtime": 11.2988,
"eval_samples_per_second": 33.455,
"eval_steps_per_second": 1.151,
"step": 4200
},
{
"epoch": 0.08652564328294389,
"grad_norm": 0.8562188744544983,
"learning_rate": 1.977361077836878e-05,
"loss": 0.8801,
"step": 4300
},
{
"epoch": 0.08853786754533793,
"grad_norm": 0.9642734527587891,
"learning_rate": 1.9759759663566032e-05,
"loss": 0.896,
"step": 4400
},
{
"epoch": 0.09055009180773198,
"grad_norm": 0.8723398447036743,
"learning_rate": 1.9745502455359367e-05,
"loss": 0.8879,
"step": 4500
},
{
"epoch": 0.09055009180773198,
"eval_loss": 0.6282201409339905,
"eval_runtime": 11.4757,
"eval_samples_per_second": 32.939,
"eval_steps_per_second": 1.133,
"step": 4500
},
{
"epoch": 0.09256231607012602,
"grad_norm": 0.8613621592521667,
"learning_rate": 1.9730839746976314e-05,
"loss": 0.8854,
"step": 4600
},
{
"epoch": 0.09457454033252007,
"grad_norm": 0.7336219549179077,
"learning_rate": 1.9715772148516855e-05,
"loss": 0.8806,
"step": 4700
},
{
"epoch": 0.09658676459491411,
"grad_norm": 0.7842460870742798,
"learning_rate": 1.970030028692802e-05,
"loss": 0.8798,
"step": 4800
},
{
"epoch": 0.09658676459491411,
"eval_loss": 0.6203732490539551,
"eval_runtime": 11.2931,
"eval_samples_per_second": 33.472,
"eval_steps_per_second": 1.151,
"step": 4800
},
{
"epoch": 0.09859898885730815,
"grad_norm": 1.042386770248413,
"learning_rate": 1.968442480597781e-05,
"loss": 0.8786,
"step": 4900
},
{
"epoch": 0.1006112131197022,
"grad_norm": 0.8358279466629028,
"learning_rate": 1.9668146366228398e-05,
"loss": 0.8834,
"step": 5000
},
{
"epoch": 0.10262343738209624,
"grad_norm": 0.9129268527030945,
"learning_rate": 1.965146564500866e-05,
"loss": 0.8763,
"step": 5100
},
{
"epoch": 0.10262343738209624,
"eval_loss": 0.6140510439872742,
"eval_runtime": 11.3122,
"eval_samples_per_second": 33.415,
"eval_steps_per_second": 1.149,
"step": 5100
},
{
"epoch": 0.10463566164449027,
"grad_norm": 0.9329330325126648,
"learning_rate": 1.963438333638598e-05,
"loss": 0.8724,
"step": 5200
},
{
"epoch": 0.10664788590688432,
"grad_norm": 0.9156613349914551,
"learning_rate": 1.9616900151137375e-05,
"loss": 0.8798,
"step": 5300
},
{
"epoch": 0.10866011016927836,
"grad_norm": 1.0988123416900635,
"learning_rate": 1.9599016816719912e-05,
"loss": 0.8864,
"step": 5400
},
{
"epoch": 0.10866011016927836,
"eval_loss": 0.613735556602478,
"eval_runtime": 11.5595,
"eval_samples_per_second": 32.7,
"eval_steps_per_second": 1.125,
"step": 5400
},
{
"epoch": 0.1106723344316724,
"grad_norm": 0.9962302446365356,
"learning_rate": 1.9580734077240467e-05,
"loss": 0.879,
"step": 5500
},
{
"epoch": 0.11268455869406645,
"grad_norm": 0.6542097926139832,
"learning_rate": 1.9562052693424724e-05,
"loss": 0.8754,
"step": 5600
},
{
"epoch": 0.11469678295646049,
"grad_norm": 0.8420646786689758,
"learning_rate": 1.9542973442585542e-05,
"loss": 0.8753,
"step": 5700
},
{
"epoch": 0.11469678295646049,
"eval_loss": 0.6112973690032959,
"eval_runtime": 11.3099,
"eval_samples_per_second": 33.422,
"eval_steps_per_second": 1.149,
"step": 5700
},
{
"epoch": 0.11670900721885454,
"grad_norm": 1.0234030485153198,
"learning_rate": 1.9523497118590625e-05,
"loss": 0.869,
"step": 5800
},
{
"epoch": 0.11872123148124858,
"grad_norm": 0.7687940001487732,
"learning_rate": 1.9503624531829463e-05,
"loss": 0.875,
"step": 5900
},
{
"epoch": 0.12073345574364262,
"grad_norm": 0.858860194683075,
"learning_rate": 1.9483356509179633e-05,
"loss": 0.8682,
"step": 6000
},
{
"epoch": 0.12073345574364262,
"eval_loss": 0.6082560420036316,
"eval_runtime": 11.2984,
"eval_samples_per_second": 33.456,
"eval_steps_per_second": 1.151,
"step": 6000
},
{
"epoch": 0.12274568000603667,
"grad_norm": 0.7500011324882507,
"learning_rate": 1.946269389397239e-05,
"loss": 0.8667,
"step": 6100
},
{
"epoch": 0.12475790426843071,
"grad_norm": 0.8498502373695374,
"learning_rate": 1.9441637545957558e-05,
"loss": 0.8717,
"step": 6200
},
{
"epoch": 0.12677012853082475,
"grad_norm": 0.9230628609657288,
"learning_rate": 1.9420188341267783e-05,
"loss": 0.8689,
"step": 6300
},
{
"epoch": 0.12677012853082475,
"eval_loss": 0.6047795414924622,
"eval_runtime": 11.3052,
"eval_samples_per_second": 33.436,
"eval_steps_per_second": 1.15,
"step": 6300
},
{
"epoch": 0.1287823527932188,
"grad_norm": 0.7312197089195251,
"learning_rate": 1.939834717238207e-05,
"loss": 0.8676,
"step": 6400
},
{
"epoch": 0.13079457705561284,
"grad_norm": 0.7080931067466736,
"learning_rate": 1.9376114948088634e-05,
"loss": 0.8632,
"step": 6500
},
{
"epoch": 0.1328068013180069,
"grad_norm": 0.793525755405426,
"learning_rate": 1.9353492593447107e-05,
"loss": 0.8682,
"step": 6600
},
{
"epoch": 0.1328068013180069,
"eval_loss": 0.6011930704116821,
"eval_runtime": 11.4543,
"eval_samples_per_second": 33.001,
"eval_steps_per_second": 1.135,
"step": 6600
},
{
"epoch": 0.13481902558040093,
"grad_norm": 0.7798284292221069,
"learning_rate": 1.9330481049750028e-05,
"loss": 0.8636,
"step": 6700
},
{
"epoch": 0.136831249842795,
"grad_norm": 0.9270545840263367,
"learning_rate": 1.9307081274483698e-05,
"loss": 0.8644,
"step": 6800
},
{
"epoch": 0.13884347410518902,
"grad_norm": 0.7777066826820374,
"learning_rate": 1.9283294241288315e-05,
"loss": 0.8682,
"step": 6900
},
{
"epoch": 0.13884347410518902,
"eval_loss": 0.6046885848045349,
"eval_runtime": 11.4509,
"eval_samples_per_second": 33.01,
"eval_steps_per_second": 1.135,
"step": 6900
},
{
"epoch": 0.14085569836758308,
"grad_norm": 0.7538514733314514,
"learning_rate": 1.925912093991748e-05,
"loss": 0.8654,
"step": 7000
},
{
"epoch": 0.1428679226299771,
"grad_norm": 0.6866621375083923,
"learning_rate": 1.9234562376197015e-05,
"loss": 0.8497,
"step": 7100
},
{
"epoch": 0.14488014689237116,
"grad_norm": 0.829768717288971,
"learning_rate": 1.92096195719831e-05,
"loss": 0.8575,
"step": 7200
},
{
"epoch": 0.14488014689237116,
"eval_loss": 0.6001401543617249,
"eval_runtime": 11.2516,
"eval_samples_per_second": 33.595,
"eval_steps_per_second": 1.155,
"step": 7200
},
{
"epoch": 0.1468923711547652,
"grad_norm": 0.8665058016777039,
"learning_rate": 1.9184293565119755e-05,
"loss": 0.8612,
"step": 7300
},
{
"epoch": 0.14890459541715925,
"grad_norm": 0.7740942239761353,
"learning_rate": 1.9158585409395674e-05,
"loss": 0.8596,
"step": 7400
},
{
"epoch": 0.15091681967955328,
"grad_norm": 0.672917902469635,
"learning_rate": 1.9132496174500364e-05,
"loss": 0.854,
"step": 7500
},
{
"epoch": 0.15091681967955328,
"eval_loss": 0.5939906239509583,
"eval_runtime": 11.3101,
"eval_samples_per_second": 33.421,
"eval_steps_per_second": 1.149,
"step": 7500
},
{
"epoch": 0.15292904394194734,
"grad_norm": 0.719465970993042,
"learning_rate": 1.9106026945979627e-05,
"loss": 0.8615,
"step": 7600
},
{
"epoch": 0.15494126820434137,
"grad_norm": 0.7433097958564758,
"learning_rate": 1.9079178825190416e-05,
"loss": 0.8564,
"step": 7700
},
{
"epoch": 0.15695349246673543,
"grad_norm": 0.7390840649604797,
"learning_rate": 1.9051952929254983e-05,
"loss": 0.8526,
"step": 7800
},
{
"epoch": 0.15695349246673543,
"eval_loss": 0.5941105484962463,
"eval_runtime": 11.2494,
"eval_samples_per_second": 33.602,
"eval_steps_per_second": 1.156,
"step": 7800
},
{
"epoch": 0.15896571672912946,
"grad_norm": 0.721076488494873,
"learning_rate": 1.902435039101442e-05,
"loss": 0.8535,
"step": 7900
},
{
"epoch": 0.16097794099152352,
"grad_norm": 0.7117634415626526,
"learning_rate": 1.899637235898151e-05,
"loss": 0.8548,
"step": 8000
},
{
"epoch": 0.16299016525391755,
"grad_norm": 0.7325859069824219,
"learning_rate": 1.8968019997292937e-05,
"loss": 0.8661,
"step": 8100
},
{
"epoch": 0.16299016525391755,
"eval_loss": 0.5943772196769714,
"eval_runtime": 11.2277,
"eval_samples_per_second": 33.667,
"eval_steps_per_second": 1.158,
"step": 8100
},
{
"epoch": 0.1650023895163116,
"grad_norm": 0.8927565217018127,
"learning_rate": 1.893929448566085e-05,
"loss": 0.8535,
"step": 8200
},
{
"epoch": 0.16701461377870563,
"grad_norm": 0.9083840250968933,
"learning_rate": 1.8910197019323782e-05,
"loss": 0.8581,
"step": 8300
},
{
"epoch": 0.1690268380410997,
"grad_norm": 0.7133694291114807,
"learning_rate": 1.8880728808996906e-05,
"loss": 0.8491,
"step": 8400
},
{
"epoch": 0.1690268380410997,
"eval_loss": 0.5923792719841003,
"eval_runtime": 11.2757,
"eval_samples_per_second": 33.523,
"eval_steps_per_second": 1.153,
"step": 8400
},
{
"epoch": 0.17103906230349372,
"grad_norm": 0.7994174361228943,
"learning_rate": 1.8850891080821673e-05,
"loss": 0.8577,
"step": 8500
},
{
"epoch": 0.17305128656588778,
"grad_norm": 1.106224775314331,
"learning_rate": 1.8820685076314782e-05,
"loss": 0.849,
"step": 8600
},
{
"epoch": 0.1750635108282818,
"grad_norm": 1.0492300987243652,
"learning_rate": 1.8790112052316523e-05,
"loss": 0.8579,
"step": 8700
},
{
"epoch": 0.1750635108282818,
"eval_loss": 0.6185858845710754,
"eval_runtime": 11.3469,
"eval_samples_per_second": 33.313,
"eval_steps_per_second": 1.146,
"step": 8700
},
{
"epoch": 0.17707573509067587,
"grad_norm": 0.7523091435432434,
"learning_rate": 1.875917328093849e-05,
"loss": 0.8548,
"step": 8800
},
{
"epoch": 0.1790879593530699,
"grad_norm": 0.8177125453948975,
"learning_rate": 1.8727870049510636e-05,
"loss": 0.8512,
"step": 8900
},
{
"epoch": 0.18110018361546396,
"grad_norm": 0.7863544821739197,
"learning_rate": 1.869620366052772e-05,
"loss": 0.8474,
"step": 9000
},
{
"epoch": 0.18110018361546396,
"eval_loss": 0.5867164134979248,
"eval_runtime": 11.2542,
"eval_samples_per_second": 33.588,
"eval_steps_per_second": 1.155,
"step": 9000
},
{
"epoch": 0.18311240787785799,
"grad_norm": 0.7436131834983826,
"learning_rate": 1.8664175431595106e-05,
"loss": 0.8587,
"step": 9100
},
{
"epoch": 0.18512463214025204,
"grad_norm": 0.803816020488739,
"learning_rate": 1.8631786695373943e-05,
"loss": 0.8455,
"step": 9200
},
{
"epoch": 0.18713685640264607,
"grad_norm": 0.9202460050582886,
"learning_rate": 1.8599038799525712e-05,
"loss": 0.8513,
"step": 9300
},
{
"epoch": 0.18713685640264607,
"eval_loss": 0.583454430103302,
"eval_runtime": 11.2388,
"eval_samples_per_second": 33.633,
"eval_steps_per_second": 1.157,
"step": 9300
},
{
"epoch": 0.18914908066504013,
"grad_norm": 0.8134105801582336,
"learning_rate": 1.856593310665614e-05,
"loss": 0.8499,
"step": 9400
},
{
"epoch": 0.19116130492743416,
"grad_norm": 0.7113932967185974,
"learning_rate": 1.8532470994258533e-05,
"loss": 0.849,
"step": 9500
},
{
"epoch": 0.19317352918982822,
"grad_norm": 0.8230564594268799,
"learning_rate": 1.8498653854656424e-05,
"loss": 0.8413,
"step": 9600
},
{
"epoch": 0.19317352918982822,
"eval_loss": 0.5848163962364197,
"eval_runtime": 11.2801,
"eval_samples_per_second": 33.51,
"eval_steps_per_second": 1.152,
"step": 9600
},
{
"epoch": 0.19518575345222225,
"grad_norm": 0.6756404638290405,
"learning_rate": 1.8464483094945667e-05,
"loss": 0.8543,
"step": 9700
},
{
"epoch": 0.1971979777146163,
"grad_norm": 0.7398785352706909,
"learning_rate": 1.8429960136935878e-05,
"loss": 0.8428,
"step": 9800
},
{
"epoch": 0.19921020197701034,
"grad_norm": 0.7419747710227966,
"learning_rate": 1.8395086417091272e-05,
"loss": 0.8516,
"step": 9900
},
{
"epoch": 0.19921020197701034,
"eval_loss": 0.5863896608352661,
"eval_runtime": 11.3198,
"eval_samples_per_second": 33.393,
"eval_steps_per_second": 1.148,
"step": 9900
},
{
"epoch": 0.2012224262394044,
"grad_norm": 0.8145945072174072,
"learning_rate": 1.8359863386470904e-05,
"loss": 0.8508,
"step": 10000
},
{
"epoch": 0.20323465050179843,
"grad_norm": 0.7068437933921814,
"learning_rate": 1.8324292510668278e-05,
"loss": 0.8495,
"step": 10100
},
{
"epoch": 0.20524687476419248,
"grad_norm": 0.7419267892837524,
"learning_rate": 1.828837526975038e-05,
"loss": 0.8461,
"step": 10200
},
{
"epoch": 0.20524687476419248,
"eval_loss": 0.5834963917732239,
"eval_runtime": 11.7842,
"eval_samples_per_second": 32.077,
"eval_steps_per_second": 1.103,
"step": 10200
},
{
"epoch": 0.2072590990265865,
"grad_norm": 1.129436731338501,
"learning_rate": 1.8252113158196078e-05,
"loss": 0.8435,
"step": 10300
},
{
"epoch": 0.20927132328898054,
"grad_norm": 0.6937255859375,
"learning_rate": 1.821550768483396e-05,
"loss": 0.8485,
"step": 10400
},
{
"epoch": 0.2112835475513746,
"grad_norm": 0.8506975769996643,
"learning_rate": 1.8178560372779525e-05,
"loss": 0.8473,
"step": 10500
},
{
"epoch": 0.2112835475513746,
"eval_loss": 0.5813661217689514,
"eval_runtime": 11.832,
"eval_samples_per_second": 31.947,
"eval_steps_per_second": 1.099,
"step": 10500
},
{
"epoch": 0.21329577181376863,
"grad_norm": 0.733964204788208,
"learning_rate": 1.814127275937183e-05,
"loss": 0.836,
"step": 10600
},
{
"epoch": 0.2153079960761627,
"grad_norm": 0.7400948405265808,
"learning_rate": 1.8103646396109523e-05,
"loss": 0.8473,
"step": 10700
},
{
"epoch": 0.21732022033855672,
"grad_norm": 0.9023438096046448,
"learning_rate": 1.8065682848586266e-05,
"loss": 0.8468,
"step": 10800
},
{
"epoch": 0.21732022033855672,
"eval_loss": 0.5793610215187073,
"eval_runtime": 11.234,
"eval_samples_per_second": 33.648,
"eval_steps_per_second": 1.157,
"step": 10800
},
{
"epoch": 0.21933244460095078,
"grad_norm": 0.82066810131073,
"learning_rate": 1.8027383696425613e-05,
"loss": 0.8457,
"step": 10900
},
{
"epoch": 0.2213446688633448,
"grad_norm": 0.6094478964805603,
"learning_rate": 1.7988750533215276e-05,
"loss": 0.8408,
"step": 11000
},
{
"epoch": 0.22335689312573886,
"grad_norm": 0.7535290122032166,
"learning_rate": 1.7949784966440823e-05,
"loss": 0.8403,
"step": 11100
},
{
"epoch": 0.22335689312573886,
"eval_loss": 0.578126072883606,
"eval_runtime": 11.202,
"eval_samples_per_second": 33.744,
"eval_steps_per_second": 1.161,
"step": 11100
},
{
"epoch": 0.2253691173881329,
"grad_norm": 0.7472143769264221,
"learning_rate": 1.791048861741877e-05,
"loss": 0.8434,
"step": 11200
},
{
"epoch": 0.22738134165052695,
"grad_norm": 0.8236815333366394,
"learning_rate": 1.7870863121229162e-05,
"loss": 0.8273,
"step": 11300
},
{
"epoch": 0.22939356591292098,
"grad_norm": 0.6772099137306213,
"learning_rate": 1.783091012664749e-05,
"loss": 0.8355,
"step": 11400
},
{
"epoch": 0.22939356591292098,
"eval_loss": 0.5848814249038696,
"eval_runtime": 11.4019,
"eval_samples_per_second": 33.152,
"eval_steps_per_second": 1.14,
"step": 11400
},
{
"epoch": 0.23140579017531504,
"grad_norm": 0.7480434775352478,
"learning_rate": 1.779063129607612e-05,
"loss": 0.8437,
"step": 11500
},
{
"epoch": 0.23341801443770907,
"grad_norm": 0.8341161608695984,
"learning_rate": 1.7750028305475125e-05,
"loss": 0.8384,
"step": 11600
},
{
"epoch": 0.23543023870010313,
"grad_norm": 0.9399694800376892,
"learning_rate": 1.7709102844292516e-05,
"loss": 0.8419,
"step": 11700
},
{
"epoch": 0.23543023870010313,
"eval_loss": 0.5769637227058411,
"eval_runtime": 11.2547,
"eval_samples_per_second": 33.586,
"eval_steps_per_second": 1.155,
"step": 11700
},
{
"epoch": 0.23744246296249716,
"grad_norm": 0.8473734855651855,
"learning_rate": 1.7667856615393987e-05,
"loss": 0.8346,
"step": 11800
},
{
"epoch": 0.23945468722489122,
"grad_norm": 0.6887069940567017,
"learning_rate": 1.7626291334992027e-05,
"loss": 0.8381,
"step": 11900
},
{
"epoch": 0.24146691148728525,
"grad_norm": 0.6946566700935364,
"learning_rate": 1.758440873257454e-05,
"loss": 0.8345,
"step": 12000
},
{
"epoch": 0.24146691148728525,
"eval_loss": 0.5747541785240173,
"eval_runtime": 11.4122,
"eval_samples_per_second": 33.122,
"eval_steps_per_second": 1.139,
"step": 12000
},
{
"epoch": 0.2434791357496793,
"grad_norm": 0.681305468082428,
"learning_rate": 1.7542210550832854e-05,
"loss": 0.841,
"step": 12100
},
{
"epoch": 0.24549136001207333,
"grad_norm": 0.8475384712219238,
"learning_rate": 1.749969854558923e-05,
"loss": 0.8392,
"step": 12200
},
{
"epoch": 0.2475035842744674,
"grad_norm": 1.1652250289916992,
"learning_rate": 1.745687448572379e-05,
"loss": 0.8388,
"step": 12300
},
{
"epoch": 0.2475035842744674,
"eval_loss": 0.5746700763702393,
"eval_runtime": 11.4476,
"eval_samples_per_second": 33.02,
"eval_steps_per_second": 1.136,
"step": 12300
},
{
"epoch": 0.24951580853686142,
"grad_norm": 0.7575956583023071,
"learning_rate": 1.741374015310094e-05,
"loss": 0.8362,
"step": 12400
},
{
"epoch": 0.25152803279925545,
"grad_norm": 0.7489831447601318,
"learning_rate": 1.737029734249519e-05,
"loss": 0.836,
"step": 12500
},
{
"epoch": 0.2535402570616495,
"grad_norm": 0.7467206716537476,
"learning_rate": 1.732654786151651e-05,
"loss": 0.8317,
"step": 12600
},
{
"epoch": 0.2535402570616495,
"eval_loss": 0.5750060081481934,
"eval_runtime": 11.2549,
"eval_samples_per_second": 33.585,
"eval_steps_per_second": 1.155,
"step": 12600
},
{
"epoch": 0.25555248132404357,
"grad_norm": 0.7825116515159607,
"learning_rate": 1.7282493530535095e-05,
"loss": 0.8335,
"step": 12700
},
{
"epoch": 0.2575647055864376,
"grad_norm": 0.8054665923118591,
"learning_rate": 1.723813618260564e-05,
"loss": 0.8332,
"step": 12800
},
{
"epoch": 0.25957692984883163,
"grad_norm": 0.740932822227478,
"learning_rate": 1.7193477663391055e-05,
"loss": 0.8333,
"step": 12900
},
{
"epoch": 0.25957692984883163,
"eval_loss": 0.574753999710083,
"eval_runtime": 11.3005,
"eval_samples_per_second": 33.45,
"eval_steps_per_second": 1.15,
"step": 12900
},
{
"epoch": 0.2615891541112257,
"grad_norm": 0.6655648350715637,
"learning_rate": 1.714851983108567e-05,
"loss": 0.8332,
"step": 13000
},
{
"epoch": 0.26360137837361974,
"grad_norm": 0.8892366886138916,
"learning_rate": 1.710326455633792e-05,
"loss": 0.833,
"step": 13100
},
{
"epoch": 0.2656136026360138,
"grad_norm": 0.7081986665725708,
"learning_rate": 1.7057713722172505e-05,
"loss": 0.8352,
"step": 13200
},
{
"epoch": 0.2656136026360138,
"eval_loss": 0.569306492805481,
"eval_runtime": 11.2208,
"eval_samples_per_second": 33.688,
"eval_steps_per_second": 1.159,
"step": 13200
},
{
"epoch": 0.2676258268984078,
"grad_norm": 0.7726171612739563,
"learning_rate": 1.701186922391206e-05,
"loss": 0.8325,
"step": 13300
},
{
"epoch": 0.26963805116080186,
"grad_norm": 0.6000068187713623,
"learning_rate": 1.6965732969098262e-05,
"loss": 0.8303,
"step": 13400
},
{
"epoch": 0.2716502754231959,
"grad_norm": 0.7751488089561462,
"learning_rate": 1.6919306877412474e-05,
"loss": 0.8311,
"step": 13500
},
{
"epoch": 0.2716502754231959,
"eval_loss": 0.5708428621292114,
"eval_runtime": 11.2236,
"eval_samples_per_second": 33.679,
"eval_steps_per_second": 1.158,
"step": 13500
},
{
"epoch": 0.27366249968559,
"grad_norm": 0.7674184441566467,
"learning_rate": 1.6872592880595872e-05,
"loss": 0.8391,
"step": 13600
},
{
"epoch": 0.275674723947984,
"grad_norm": 0.999799370765686,
"learning_rate": 1.6825592922369066e-05,
"loss": 0.8215,
"step": 13700
},
{
"epoch": 0.27768694821037804,
"grad_norm": 0.7192254662513733,
"learning_rate": 1.6778308958351213e-05,
"loss": 0.8304,
"step": 13800
},
{
"epoch": 0.27768694821037804,
"eval_loss": 0.5696760416030884,
"eval_runtime": 11.2331,
"eval_samples_per_second": 33.65,
"eval_steps_per_second": 1.157,
"step": 13800
},
{
"epoch": 0.2796991724727721,
"grad_norm": 1.1758594512939453,
"learning_rate": 1.673074295597867e-05,
"loss": 0.8346,
"step": 13900
},
{
"epoch": 0.28171139673516615,
"grad_norm": 0.5974677801132202,
"learning_rate": 1.6682896894423094e-05,
"loss": 0.824,
"step": 14000
},
{
"epoch": 0.28372362099756016,
"grad_norm": 0.720886766910553,
"learning_rate": 1.6634772764509128e-05,
"loss": 0.8246,
"step": 14100
},
{
"epoch": 0.28372362099756016,
"eval_loss": 0.5675772428512573,
"eval_runtime": 11.3956,
"eval_samples_per_second": 33.171,
"eval_steps_per_second": 1.141,
"step": 14100
},
{
"epoch": 0.2857358452599542,
"grad_norm": 0.6889091730117798,
"learning_rate": 1.6586372568631545e-05,
"loss": 0.8231,
"step": 14200
},
{
"epoch": 0.28774806952234827,
"grad_norm": 0.6523007154464722,
"learning_rate": 1.6537698320671933e-05,
"loss": 0.8272,
"step": 14300
},
{
"epoch": 0.28976029378474233,
"grad_norm": 0.7638033628463745,
"learning_rate": 1.64887520459149e-05,
"loss": 0.8306,
"step": 14400
},
{
"epoch": 0.28976029378474233,
"eval_loss": 0.569464921951294,
"eval_runtime": 11.248,
"eval_samples_per_second": 33.606,
"eval_steps_per_second": 1.156,
"step": 14400
},
{
"epoch": 0.29177251804713633,
"grad_norm": 0.6883799433708191,
"learning_rate": 1.6439535780963808e-05,
"loss": 0.8327,
"step": 14500
},
{
"epoch": 0.2937847423095304,
"grad_norm": 0.8693552017211914,
"learning_rate": 1.6390051573656028e-05,
"loss": 0.8299,
"step": 14600
},
{
"epoch": 0.29579696657192445,
"grad_norm": 0.6811352372169495,
"learning_rate": 1.634030148297773e-05,
"loss": 0.8257,
"step": 14700
},
{
"epoch": 0.29579696657192445,
"eval_loss": 0.5680450797080994,
"eval_runtime": 11.451,
"eval_samples_per_second": 33.01,
"eval_steps_per_second": 1.135,
"step": 14700
},
{
"epoch": 0.2978091908343185,
"grad_norm": 0.7108572721481323,
"learning_rate": 1.629028757897821e-05,
"loss": 0.826,
"step": 14800
},
{
"epoch": 0.2998214150967125,
"grad_norm": 0.701524555683136,
"learning_rate": 1.6240011942683774e-05,
"loss": 0.8233,
"step": 14900
},
{
"epoch": 0.30183363935910656,
"grad_norm": 0.6415804028511047,
"learning_rate": 1.6189476666011123e-05,
"loss": 0.8174,
"step": 15000
},
{
"epoch": 0.30183363935910656,
"eval_loss": 0.5662389397621155,
"eval_runtime": 11.3747,
"eval_samples_per_second": 33.232,
"eval_steps_per_second": 1.143,
"step": 15000
},
{
"epoch": 0.3038458636215006,
"grad_norm": 0.593760073184967,
"learning_rate": 1.6138683851680328e-05,
"loss": 0.8269,
"step": 15100
},
{
"epoch": 0.3058580878838947,
"grad_norm": 0.6708555221557617,
"learning_rate": 1.608763561312733e-05,
"loss": 0.8277,
"step": 15200
},
{
"epoch": 0.3078703121462887,
"grad_norm": 0.5819365382194519,
"learning_rate": 1.603633407441601e-05,
"loss": 0.8237,
"step": 15300
},
{
"epoch": 0.3078703121462887,
"eval_loss": 0.5628697872161865,
"eval_runtime": 11.3199,
"eval_samples_per_second": 33.393,
"eval_steps_per_second": 1.148,
"step": 15300
},
{
"epoch": 0.30988253640868274,
"grad_norm": 0.725537896156311,
"learning_rate": 1.5984781370149798e-05,
"loss": 0.8355,
"step": 15400
},
{
"epoch": 0.3118947606710768,
"grad_norm": 0.642382800579071,
"learning_rate": 1.5932979645382863e-05,
"loss": 0.8292,
"step": 15500
},
{
"epoch": 0.31390698493347086,
"grad_norm": 0.6141934394836426,
"learning_rate": 1.588093105553086e-05,
"loss": 0.8306,
"step": 15600
},
{
"epoch": 0.31390698493347086,
"eval_loss": 0.5633600354194641,
"eval_runtime": 11.3793,
"eval_samples_per_second": 33.218,
"eval_steps_per_second": 1.142,
"step": 15600
},
{
"epoch": 0.31591920919586486,
"grad_norm": 0.6902384757995605,
"learning_rate": 1.5828637766281238e-05,
"loss": 0.8243,
"step": 15700
},
{
"epoch": 0.3179314334582589,
"grad_norm": 0.7464603781700134,
"learning_rate": 1.5776101953503134e-05,
"loss": 0.8296,
"step": 15800
},
{
"epoch": 0.319943657720653,
"grad_norm": 0.6735148429870605,
"learning_rate": 1.5723325803156834e-05,
"loss": 0.8168,
"step": 15900
},
{
"epoch": 0.319943657720653,
"eval_loss": 0.5626727938652039,
"eval_runtime": 11.3991,
"eval_samples_per_second": 33.16,
"eval_steps_per_second": 1.14,
"step": 15900
},
{
"epoch": 0.32195588198304703,
"grad_norm": 0.7461301684379578,
"learning_rate": 1.5670311511202823e-05,
"loss": 0.8175,
"step": 16000
},
{
"epoch": 0.32396810624544103,
"grad_norm": 0.6454249620437622,
"learning_rate": 1.5617061283510404e-05,
"loss": 0.8287,
"step": 16100
},
{
"epoch": 0.3259803305078351,
"grad_norm": 0.723892331123352,
"learning_rate": 1.5563577335765925e-05,
"loss": 0.8256,
"step": 16200
},
{
"epoch": 0.3259803305078351,
"eval_loss": 0.5635449290275574,
"eval_runtime": 11.3171,
"eval_samples_per_second": 33.401,
"eval_steps_per_second": 1.149,
"step": 16200
},
{
"epoch": 0.32799255477022915,
"grad_norm": 0.6277914047241211,
"learning_rate": 1.5509861893380576e-05,
"loss": 0.8274,
"step": 16300
},
{
"epoch": 0.3300047790326232,
"grad_norm": 0.6103200316429138,
"learning_rate": 1.5455917191397806e-05,
"loss": 0.8207,
"step": 16400
},
{
"epoch": 0.3320170032950172,
"grad_norm": 0.6216299533843994,
"learning_rate": 1.5401745474400306e-05,
"loss": 0.8218,
"step": 16500
},
{
"epoch": 0.3320170032950172,
"eval_loss": 0.5613713264465332,
"eval_runtime": 11.3097,
"eval_samples_per_second": 33.423,
"eval_steps_per_second": 1.149,
"step": 16500
},
{
"epoch": 0.33402922755741127,
"grad_norm": 0.6130411624908447,
"learning_rate": 1.5347348996416626e-05,
"loss": 0.8193,
"step": 16600
},
{
"epoch": 0.3360414518198053,
"grad_norm": 0.7175905704498291,
"learning_rate": 1.5292730020827394e-05,
"loss": 0.8205,
"step": 16700
},
{
"epoch": 0.3380536760821994,
"grad_norm": 0.5804928541183472,
"learning_rate": 1.5237890820271124e-05,
"loss": 0.8256,
"step": 16800
},
{
"epoch": 0.3380536760821994,
"eval_loss": 0.558940589427948,
"eval_runtime": 11.507,
"eval_samples_per_second": 32.849,
"eval_steps_per_second": 1.13,
"step": 16800
},
{
"epoch": 0.3400659003445934,
"grad_norm": 0.7494300007820129,
"learning_rate": 1.518283367654966e-05,
"loss": 0.8225,
"step": 16900
},
{
"epoch": 0.34207812460698744,
"grad_norm": 0.5440366268157959,
"learning_rate": 1.5127560880533242e-05,
"loss": 0.8272,
"step": 17000
},
{
"epoch": 0.3440903488693815,
"grad_norm": 0.5601567625999451,
"learning_rate": 1.5072074732065165e-05,
"loss": 0.829,
"step": 17100
},
{
"epoch": 0.3440903488693815,
"eval_loss": 0.5592995285987854,
"eval_runtime": 11.056,
"eval_samples_per_second": 34.19,
"eval_steps_per_second": 1.176,
"step": 17100
},
{
"epoch": 0.34610257313177556,
"grad_norm": 0.6553789377212524,
"learning_rate": 1.5016377539866106e-05,
"loss": 0.824,
"step": 17200
},
{
"epoch": 0.34811479739416956,
"grad_norm": 0.7243614792823792,
"learning_rate": 1.4960471621438047e-05,
"loss": 0.8206,
"step": 17300
},
{
"epoch": 0.3501270216565636,
"grad_norm": 0.7584229111671448,
"learning_rate": 1.4904359302967848e-05,
"loss": 0.8264,
"step": 17400
},
{
"epoch": 0.3501270216565636,
"eval_loss": 0.5582433342933655,
"eval_runtime": 11.4613,
"eval_samples_per_second": 32.98,
"eval_steps_per_second": 1.134,
"step": 17400
},
{
"epoch": 0.3521392459189577,
"grad_norm": 0.9413104057312012,
"learning_rate": 1.4848042919230464e-05,
"loss": 0.8082,
"step": 17500
},
{
"epoch": 0.35415147018135174,
"grad_norm": 0.7952352166175842,
"learning_rate": 1.4791524813491789e-05,
"loss": 0.8138,
"step": 17600
},
{
"epoch": 0.35616369444374574,
"grad_norm": 0.6611462235450745,
"learning_rate": 1.4734807337411166e-05,
"loss": 0.817,
"step": 17700
},
{
"epoch": 0.35616369444374574,
"eval_loss": 0.5570442080497742,
"eval_runtime": 11.4931,
"eval_samples_per_second": 32.889,
"eval_steps_per_second": 1.131,
"step": 17700
},
{
"epoch": 0.3581759187061398,
"grad_norm": 0.8845998644828796,
"learning_rate": 1.4677892850943516e-05,
"loss": 0.8124,
"step": 17800
},
{
"epoch": 0.36018814296853385,
"grad_norm": 0.6421878337860107,
"learning_rate": 1.462078372224117e-05,
"loss": 0.814,
"step": 17900
},
{
"epoch": 0.3622003672309279,
"grad_norm": 0.6532554030418396,
"learning_rate": 1.456348232755531e-05,
"loss": 0.8081,
"step": 18000
},
{
"epoch": 0.3622003672309279,
"eval_loss": 0.5557852983474731,
"eval_runtime": 11.4159,
"eval_samples_per_second": 33.112,
"eval_steps_per_second": 1.139,
"step": 18000
},
{
"epoch": 0.3642125914933219,
"grad_norm": 0.8483557105064392,
"learning_rate": 1.4505991051137112e-05,
"loss": 0.8137,
"step": 18100
},
{
"epoch": 0.36622481575571597,
"grad_norm": 0.7414484620094299,
"learning_rate": 1.4448312285138524e-05,
"loss": 0.8095,
"step": 18200
},
{
"epoch": 0.36823704001811003,
"grad_norm": 0.6685389280319214,
"learning_rate": 1.4390448429512747e-05,
"loss": 0.8108,
"step": 18300
},
{
"epoch": 0.36823704001811003,
"eval_loss": 0.5559925436973572,
"eval_runtime": 11.4267,
"eval_samples_per_second": 33.081,
"eval_steps_per_second": 1.138,
"step": 18300
},
{
"epoch": 0.3702492642805041,
"grad_norm": 0.5973154306411743,
"learning_rate": 1.4332401891914365e-05,
"loss": 0.8144,
"step": 18400
},
{
"epoch": 0.3722614885428981,
"grad_norm": 0.6153602004051208,
"learning_rate": 1.4274175087599166e-05,
"loss": 0.8234,
"step": 18500
},
{
"epoch": 0.37427371280529215,
"grad_norm": 0.6379988789558411,
"learning_rate": 1.4215770439323657e-05,
"loss": 0.8137,
"step": 18600
},
{
"epoch": 0.37427371280529215,
"eval_loss": 0.5545734763145447,
"eval_runtime": 11.3444,
"eval_samples_per_second": 33.32,
"eval_steps_per_second": 1.146,
"step": 18600
},
{
"epoch": 0.3762859370676862,
"grad_norm": 0.6836999654769897,
"learning_rate": 1.4157190377244233e-05,
"loss": 0.811,
"step": 18700
},
{
"epoch": 0.37829816133008026,
"grad_norm": 0.5659916400909424,
"learning_rate": 1.409843733881608e-05,
"loss": 0.8175,
"step": 18800
},
{
"epoch": 0.38031038559247426,
"grad_norm": 0.6270354986190796,
"learning_rate": 1.4039513768691753e-05,
"loss": 0.8221,
"step": 18900
},
{
"epoch": 0.38031038559247426,
"eval_loss": 0.5561990737915039,
"eval_runtime": 11.437,
"eval_samples_per_second": 33.051,
"eval_steps_per_second": 1.137,
"step": 18900
},
{
"epoch": 0.3823226098548683,
"grad_norm": 0.6403433680534363,
"learning_rate": 1.3980422118619447e-05,
"loss": 0.8156,
"step": 19000
},
{
"epoch": 0.3843348341172624,
"grad_norm": 0.5956655144691467,
"learning_rate": 1.3921164847340996e-05,
"loss": 0.8161,
"step": 19100
},
{
"epoch": 0.38634705837965644,
"grad_norm": 1.1075905561447144,
"learning_rate": 1.3861744420489547e-05,
"loss": 0.8115,
"step": 19200
},
{
"epoch": 0.38634705837965644,
"eval_loss": 0.5551438927650452,
"eval_runtime": 11.6061,
"eval_samples_per_second": 32.569,
"eval_steps_per_second": 1.12,
"step": 19200
},
{
"epoch": 0.38835928264205044,
"grad_norm": 0.5919958353042603,
"learning_rate": 1.380216331048699e-05,
"loss": 0.8042,
"step": 19300
},
{
"epoch": 0.3903715069044445,
"grad_norm": 0.599104106426239,
"learning_rate": 1.3742423996441067e-05,
"loss": 0.8107,
"step": 19400
},
{
"epoch": 0.39238373116683856,
"grad_norm": 0.6891294121742249,
"learning_rate": 1.3682528964042234e-05,
"loss": 0.8082,
"step": 19500
},
{
"epoch": 0.39238373116683856,
"eval_loss": 0.5554007291793823,
"eval_runtime": 11.5763,
"eval_samples_per_second": 32.653,
"eval_steps_per_second": 1.123,
"step": 19500
},
{
"epoch": 0.3943959554292326,
"grad_norm": 0.6625336408615112,
"learning_rate": 1.3622480705460217e-05,
"loss": 0.8161,
"step": 19600
},
{
"epoch": 0.3964081796916266,
"grad_norm": 0.6874691843986511,
"learning_rate": 1.3562281719240323e-05,
"loss": 0.808,
"step": 19700
},
{
"epoch": 0.3984204039540207,
"grad_norm": 0.6335239410400391,
"learning_rate": 1.3501934510199479e-05,
"loss": 0.8172,
"step": 19800
},
{
"epoch": 0.3984204039540207,
"eval_loss": 0.5533725023269653,
"eval_runtime": 11.4224,
"eval_samples_per_second": 33.093,
"eval_steps_per_second": 1.138,
"step": 19800
},
{
"epoch": 0.40043262821641473,
"grad_norm": 0.6799935102462769,
"learning_rate": 1.3441441589322013e-05,
"loss": 0.8102,
"step": 19900
},
{
"epoch": 0.4024448524788088,
"grad_norm": 0.7125223278999329,
"learning_rate": 1.338080547365517e-05,
"loss": 0.8196,
"step": 20000
},
{
"epoch": 0.4044570767412028,
"grad_norm": 0.6379702091217041,
"learning_rate": 1.3320028686204378e-05,
"loss": 0.7988,
"step": 20100
},
{
"epoch": 0.4044570767412028,
"eval_loss": 0.5532128214836121,
"eval_runtime": 11.5518,
"eval_samples_per_second": 32.722,
"eval_steps_per_second": 1.125,
"step": 20100
},
{
"epoch": 0.40646930100359685,
"grad_norm": 0.6244897842407227,
"learning_rate": 1.325911375582827e-05,
"loss": 0.8078,
"step": 20200
},
{
"epoch": 0.4084815252659909,
"grad_norm": 0.6567655801773071,
"learning_rate": 1.319806321713346e-05,
"loss": 0.812,
"step": 20300
},
{
"epoch": 0.41049374952838497,
"grad_norm": 0.7605450749397278,
"learning_rate": 1.3136879610369091e-05,
"loss": 0.8078,
"step": 20400
},
{
"epoch": 0.41049374952838497,
"eval_loss": 0.5506391525268555,
"eval_runtime": 11.3697,
"eval_samples_per_second": 33.246,
"eval_steps_per_second": 1.143,
"step": 20400
},
{
"epoch": 0.41250597379077897,
"grad_norm": 0.669282853603363,
"learning_rate": 1.3075565481321122e-05,
"loss": 0.8086,
"step": 20500
},
{
"epoch": 0.414518198053173,
"grad_norm": 0.6792070269584656,
"learning_rate": 1.301412338120641e-05,
"loss": 0.8075,
"step": 20600
},
{
"epoch": 0.4165304223155671,
"grad_norm": 0.5937780737876892,
"learning_rate": 1.2952555866566554e-05,
"loss": 0.8151,
"step": 20700
},
{
"epoch": 0.4165304223155671,
"eval_loss": 0.5495349168777466,
"eval_runtime": 11.3633,
"eval_samples_per_second": 33.265,
"eval_steps_per_second": 1.144,
"step": 20700
},
{
"epoch": 0.4185426465779611,
"grad_norm": 0.6547305583953857,
"learning_rate": 1.2890865499161522e-05,
"loss": 0.8022,
"step": 20800
},
{
"epoch": 0.42055487084035514,
"grad_norm": 0.5942917466163635,
"learning_rate": 1.2829054845863054e-05,
"loss": 0.8079,
"step": 20900
},
{
"epoch": 0.4225670951027492,
"grad_norm": 0.5794849991798401,
"learning_rate": 1.2767126478547865e-05,
"loss": 0.8152,
"step": 21000
},
{
"epoch": 0.4225670951027492,
"eval_loss": 0.5491987466812134,
"eval_runtime": 11.3343,
"eval_samples_per_second": 33.35,
"eval_steps_per_second": 1.147,
"step": 21000
},
{
"epoch": 0.42457931936514326,
"grad_norm": 0.6574000120162964,
"learning_rate": 1.2705082973990623e-05,
"loss": 0.8087,
"step": 21100
},
{
"epoch": 0.42659154362753726,
"grad_norm": 0.6523112654685974,
"learning_rate": 1.264292691375674e-05,
"loss": 0.8098,
"step": 21200
},
{
"epoch": 0.4286037678899313,
"grad_norm": 0.6403859853744507,
"learning_rate": 1.2580660884094944e-05,
"loss": 0.8125,
"step": 21300
},
{
"epoch": 0.4286037678899313,
"eval_loss": 0.5487639307975769,
"eval_runtime": 11.6017,
"eval_samples_per_second": 32.581,
"eval_steps_per_second": 1.121,
"step": 21300
},
{
"epoch": 0.4306159921523254,
"grad_norm": 0.6883541345596313,
"learning_rate": 1.2518287475829687e-05,
"loss": 0.804,
"step": 21400
},
{
"epoch": 0.43262821641471944,
"grad_norm": 0.6650357246398926,
"learning_rate": 1.2455809284253329e-05,
"loss": 0.8097,
"step": 21500
},
{
"epoch": 0.43464044067711344,
"grad_norm": 0.6048406958580017,
"learning_rate": 1.239322890901815e-05,
"loss": 0.8059,
"step": 21600
},
{
"epoch": 0.43464044067711344,
"eval_loss": 0.5487421751022339,
"eval_runtime": 11.4779,
"eval_samples_per_second": 32.933,
"eval_steps_per_second": 1.133,
"step": 21600
},
{
"epoch": 0.4366526649395075,
"grad_norm": 0.6876850724220276,
"learning_rate": 1.233054895402819e-05,
"loss": 0.8027,
"step": 21700
},
{
"epoch": 0.43866488920190155,
"grad_norm": 0.656778872013092,
"learning_rate": 1.2267772027330893e-05,
"loss": 0.8124,
"step": 21800
},
{
"epoch": 0.4406771134642956,
"grad_norm": 0.6603732109069824,
"learning_rate": 1.22049007410086e-05,
"loss": 0.8032,
"step": 21900
},
{
"epoch": 0.4406771134642956,
"eval_loss": 0.547619104385376,
"eval_runtime": 11.4392,
"eval_samples_per_second": 33.044,
"eval_steps_per_second": 1.136,
"step": 21900
},
{
"epoch": 0.4426893377266896,
"grad_norm": 0.5987362861633301,
"learning_rate": 1.2141937711069857e-05,
"loss": 0.8075,
"step": 22000
},
{
"epoch": 0.44470156198908367,
"grad_norm": 0.6756895780563354,
"learning_rate": 1.2078885557340562e-05,
"loss": 0.8092,
"step": 22100
},
{
"epoch": 0.44671378625147773,
"grad_norm": 0.7242164015769958,
"learning_rate": 1.2015746903354968e-05,
"loss": 0.8156,
"step": 22200
},
{
"epoch": 0.44671378625147773,
"eval_loss": 0.5490314364433289,
"eval_runtime": 11.6139,
"eval_samples_per_second": 32.547,
"eval_steps_per_second": 1.119,
"step": 22200
},
{
"epoch": 0.4487260105138718,
"grad_norm": 0.77918541431427,
"learning_rate": 1.1952524376246504e-05,
"loss": 0.8063,
"step": 22300
},
{
"epoch": 0.4507382347762658,
"grad_norm": 0.6913318634033203,
"learning_rate": 1.1889220606638476e-05,
"loss": 0.8079,
"step": 22400
},
{
"epoch": 0.45275045903865985,
"grad_norm": 0.747986376285553,
"learning_rate": 1.1825838228534607e-05,
"loss": 0.8033,
"step": 22500
},
{
"epoch": 0.45275045903865985,
"eval_loss": 0.5468713045120239,
"eval_runtime": 11.4,
"eval_samples_per_second": 33.158,
"eval_steps_per_second": 1.14,
"step": 22500
},
{
"epoch": 0.4547626833010539,
"grad_norm": 0.6693961024284363,
"learning_rate": 1.1762379879209442e-05,
"loss": 0.8089,
"step": 22600
},
{
"epoch": 0.45677490756344796,
"grad_norm": 0.6168875098228455,
"learning_rate": 1.1698848199098596e-05,
"loss": 0.7998,
"step": 22700
},
{
"epoch": 0.45878713182584197,
"grad_norm": 0.6753715872764587,
"learning_rate": 1.1635245831688913e-05,
"loss": 0.8057,
"step": 22800
},
{
"epoch": 0.45878713182584197,
"eval_loss": 0.5467536449432373,
"eval_runtime": 11.3082,
"eval_samples_per_second": 33.427,
"eval_steps_per_second": 1.15,
"step": 22800
},
{
"epoch": 0.460799356088236,
"grad_norm": 0.6399224996566772,
"learning_rate": 1.1571575423408456e-05,
"loss": 0.7965,
"step": 22900
},
{
"epoch": 0.4628115803506301,
"grad_norm": 0.5371870994567871,
"learning_rate": 1.1507839623516401e-05,
"loss": 0.8014,
"step": 23000
},
{
"epoch": 0.46482380461302414,
"grad_norm": 0.711793839931488,
"learning_rate": 1.1444041083992801e-05,
"loss": 0.8081,
"step": 23100
},
{
"epoch": 0.46482380461302414,
"eval_loss": 0.5455725193023682,
"eval_runtime": 11.4796,
"eval_samples_per_second": 32.928,
"eval_steps_per_second": 1.132,
"step": 23100
},
{
"epoch": 0.46683602887541814,
"grad_norm": 0.566677451133728,
"learning_rate": 1.1380182459428234e-05,
"loss": 0.8027,
"step": 23200
},
{
"epoch": 0.4688482531378122,
"grad_norm": 0.7086474895477295,
"learning_rate": 1.1316266406913355e-05,
"loss": 0.8024,
"step": 23300
},
{
"epoch": 0.47086047740020626,
"grad_norm": 0.6261083483695984,
"learning_rate": 1.1252295585928343e-05,
"loss": 0.8054,
"step": 23400
},
{
"epoch": 0.47086047740020626,
"eval_loss": 0.5444592833518982,
"eval_runtime": 11.5945,
"eval_samples_per_second": 32.602,
"eval_steps_per_second": 1.121,
"step": 23400
},
{
"epoch": 0.4728727016626003,
"grad_norm": 0.6763809323310852,
"learning_rate": 1.1188272658232228e-05,
"loss": 0.7952,
"step": 23500
},
{
"epoch": 0.4748849259249943,
"grad_norm": 0.6690487265586853,
"learning_rate": 1.1124200287752157e-05,
"loss": 0.807,
"step": 23600
},
{
"epoch": 0.4768971501873884,
"grad_norm": 0.5711999535560608,
"learning_rate": 1.1060081140472519e-05,
"loss": 0.8052,
"step": 23700
},
{
"epoch": 0.4768971501873884,
"eval_loss": 0.5443876385688782,
"eval_runtime": 11.4195,
"eval_samples_per_second": 33.101,
"eval_steps_per_second": 1.138,
"step": 23700
},
{
"epoch": 0.47890937444978243,
"grad_norm": 0.6411765217781067,
"learning_rate": 1.0995917884324056e-05,
"loss": 0.7976,
"step": 23800
},
{
"epoch": 0.4809215987121765,
"grad_norm": 0.5719566941261292,
"learning_rate": 1.0931713189072827e-05,
"loss": 0.7992,
"step": 23900
},
{
"epoch": 0.4829338229745705,
"grad_norm": 0.5175074934959412,
"learning_rate": 1.086746972620913e-05,
"loss": 0.8009,
"step": 24000
},
{
"epoch": 0.4829338229745705,
"eval_loss": 0.5424737334251404,
"eval_runtime": 11.3763,
"eval_samples_per_second": 33.227,
"eval_steps_per_second": 1.143,
"step": 24000
},
{
"epoch": 0.48494604723696455,
"grad_norm": 0.6476929783821106,
"learning_rate": 1.0803190168836341e-05,
"loss": 0.7984,
"step": 24100
},
{
"epoch": 0.4869582714993586,
"grad_norm": 0.6742759943008423,
"learning_rate": 1.0738877191559691e-05,
"loss": 0.7989,
"step": 24200
},
{
"epoch": 0.48897049576175267,
"grad_norm": 0.5645999908447266,
"learning_rate": 1.067453347037498e-05,
"loss": 0.7985,
"step": 24300
},
{
"epoch": 0.48897049576175267,
"eval_loss": 0.5427749752998352,
"eval_runtime": 11.4256,
"eval_samples_per_second": 33.084,
"eval_steps_per_second": 1.138,
"step": 24300
},
{
"epoch": 0.49098272002414667,
"grad_norm": 0.5972943902015686,
"learning_rate": 1.0610161682557225e-05,
"loss": 0.7961,
"step": 24400
},
{
"epoch": 0.4929949442865407,
"grad_norm": 0.6340279579162598,
"learning_rate": 1.0545764506549273e-05,
"loss": 0.8033,
"step": 24500
},
{
"epoch": 0.4950071685489348,
"grad_norm": 0.6096486449241638,
"learning_rate": 1.0481344621850347e-05,
"loss": 0.7955,
"step": 24600
},
{
"epoch": 0.4950071685489348,
"eval_loss": 0.5418882369995117,
"eval_runtime": 11.4157,
"eval_samples_per_second": 33.112,
"eval_steps_per_second": 1.139,
"step": 24600
},
{
"epoch": 0.49701939281132884,
"grad_norm": 0.5778651833534241,
"learning_rate": 1.041690470890455e-05,
"loss": 0.7954,
"step": 24700
},
{
"epoch": 0.49903161707372284,
"grad_norm": 0.5838211178779602,
"learning_rate": 1.0352447448989337e-05,
"loss": 0.7854,
"step": 24800
},
{
"epoch": 0.5010438413361169,
"grad_norm": 0.5919055342674255,
"learning_rate": 1.0287975524103964e-05,
"loss": 0.7925,
"step": 24900
},
{
"epoch": 0.5010438413361169,
"eval_loss": 0.541851818561554,
"eval_runtime": 11.2979,
"eval_samples_per_second": 33.457,
"eval_steps_per_second": 1.151,
"step": 24900
},
{
"epoch": 0.5030560655985109,
"grad_norm": 0.5358749628067017,
"learning_rate": 1.022349161685787e-05,
"loss": 0.7986,
"step": 25000
},
{
"epoch": 0.505068289860905,
"grad_norm": 0.6401896476745605,
"learning_rate": 1.0158998410359074e-05,
"loss": 0.7914,
"step": 25100
},
{
"epoch": 0.507080514123299,
"grad_norm": 0.5817869901657104,
"learning_rate": 1.0094498588102523e-05,
"loss": 0.7956,
"step": 25200
},
{
"epoch": 0.507080514123299,
"eval_loss": 0.5417122840881348,
"eval_runtime": 11.503,
"eval_samples_per_second": 32.861,
"eval_steps_per_second": 1.13,
"step": 25200
},
{
"epoch": 0.5090927383856931,
"grad_norm": 0.5595591068267822,
"learning_rate": 1.0029994833858438e-05,
"loss": 0.7943,
"step": 25300
},
{
"epoch": 0.5111049626480871,
"grad_norm": 0.5861169099807739,
"learning_rate": 9.965489831560652e-06,
"loss": 0.8006,
"step": 25400
},
{
"epoch": 0.5131171869104811,
"grad_norm": 0.5644922852516174,
"learning_rate": 9.900986265194924e-06,
"loss": 0.7868,
"step": 25500
},
{
"epoch": 0.5131171869104811,
"eval_loss": 0.5409750938415527,
"eval_runtime": 11.3254,
"eval_samples_per_second": 33.376,
"eval_steps_per_second": 1.148,
"step": 25500
},
{
"epoch": 0.5151294111728753,
"grad_norm": 0.5210478901863098,
"learning_rate": 9.836486818687262e-06,
"loss": 0.7967,
"step": 25600
},
{
"epoch": 0.5171416354352693,
"grad_norm": 0.5937855839729309,
"learning_rate": 9.771994175792262e-06,
"loss": 0.7839,
"step": 25700
},
{
"epoch": 0.5191538596976633,
"grad_norm": 0.68199622631073,
"learning_rate": 9.707511019981416e-06,
"loss": 0.7929,
"step": 25800
},
{
"epoch": 0.5191538596976633,
"eval_loss": 0.53957599401474,
"eval_runtime": 11.2847,
"eval_samples_per_second": 33.497,
"eval_steps_per_second": 1.152,
"step": 25800
},
{
"epoch": 0.5211660839600574,
"grad_norm": 0.6363146305084229,
"learning_rate": 9.643040034331475e-06,
"loss": 0.7893,
"step": 25900
},
{
"epoch": 0.5231783082224514,
"grad_norm": 0.6275014877319336,
"learning_rate": 9.578583901412802e-06,
"loss": 0.7883,
"step": 26000
},
{
"epoch": 0.5251905324848455,
"grad_norm": 0.5840523838996887,
"learning_rate": 9.514145303177751e-06,
"loss": 0.7961,
"step": 26100
},
{
"epoch": 0.5251905324848455,
"eval_loss": 0.5387553572654724,
"eval_runtime": 11.2936,
"eval_samples_per_second": 33.47,
"eval_steps_per_second": 1.151,
"step": 26100
},
{
"epoch": 0.5272027567472395,
"grad_norm": 0.706901490688324,
"learning_rate": 9.449726920849085e-06,
"loss": 0.795,
"step": 26200
},
{
"epoch": 0.5292149810096335,
"grad_norm": 0.5236905813217163,
"learning_rate": 9.385331434808386e-06,
"loss": 0.7919,
"step": 26300
},
{
"epoch": 0.5312272052720276,
"grad_norm": 0.6014547348022461,
"learning_rate": 9.320961524484565e-06,
"loss": 0.7917,
"step": 26400
},
{
"epoch": 0.5312272052720276,
"eval_loss": 0.5388390421867371,
"eval_runtime": 11.3827,
"eval_samples_per_second": 33.208,
"eval_steps_per_second": 1.142,
"step": 26400
},
{
"epoch": 0.5332394295344216,
"grad_norm": 0.5613085031509399,
"learning_rate": 9.256619868242341e-06,
"loss": 0.7957,
"step": 26500
},
{
"epoch": 0.5352516537968156,
"grad_norm": 0.6822344064712524,
"learning_rate": 9.192309143270818e-06,
"loss": 0.7867,
"step": 26600
},
{
"epoch": 0.5372638780592097,
"grad_norm": 0.6041319370269775,
"learning_rate": 9.128032025472077e-06,
"loss": 0.7884,
"step": 26700
},
{
"epoch": 0.5372638780592097,
"eval_loss": 0.5368719696998596,
"eval_runtime": 11.3484,
"eval_samples_per_second": 33.309,
"eval_steps_per_second": 1.146,
"step": 26700
},
{
"epoch": 0.5392761023216037,
"grad_norm": 0.644088089466095,
"learning_rate": 9.063791189349841e-06,
"loss": 0.7867,
"step": 26800
},
{
"epoch": 0.5412883265839978,
"grad_norm": 0.627928614616394,
"learning_rate": 8.999589307898192e-06,
"loss": 0.7896,
"step": 26900
},
{
"epoch": 0.5433005508463918,
"grad_norm": 0.6207029819488525,
"learning_rate": 8.935429052490347e-06,
"loss": 0.7853,
"step": 27000
},
{
"epoch": 0.5433005508463918,
"eval_loss": 0.5371023416519165,
"eval_runtime": 11.3461,
"eval_samples_per_second": 33.316,
"eval_steps_per_second": 1.146,
"step": 27000
},
{
"epoch": 0.5453127751087858,
"grad_norm": 0.541533887386322,
"learning_rate": 8.87131309276751e-06,
"loss": 0.7916,
"step": 27100
},
{
"epoch": 0.54732499937118,
"grad_norm": 0.590813934803009,
"learning_rate": 8.807244096527783e-06,
"loss": 0.7948,
"step": 27200
},
{
"epoch": 0.549337223633574,
"grad_norm": 0.584229588508606,
"learning_rate": 8.743224729615168e-06,
"loss": 0.7918,
"step": 27300
},
{
"epoch": 0.549337223633574,
"eval_loss": 0.5366615653038025,
"eval_runtime": 11.3157,
"eval_samples_per_second": 33.405,
"eval_steps_per_second": 1.149,
"step": 27300
},
{
"epoch": 0.551349447895968,
"grad_norm": 0.6746295094490051,
"learning_rate": 8.679257655808645e-06,
"loss": 0.7911,
"step": 27400
},
{
"epoch": 0.5533616721583621,
"grad_norm": 0.6765587329864502,
"learning_rate": 8.615345536711331e-06,
"loss": 0.7906,
"step": 27500
},
{
"epoch": 0.5553738964207561,
"grad_norm": 0.5838325619697571,
"learning_rate": 8.551491031639736e-06,
"loss": 0.7937,
"step": 27600
},
{
"epoch": 0.5553738964207561,
"eval_loss": 0.5361348390579224,
"eval_runtime": 11.3123,
"eval_samples_per_second": 33.415,
"eval_steps_per_second": 1.149,
"step": 27600
},
{
"epoch": 0.5573861206831502,
"grad_norm": 0.6001378893852234,
"learning_rate": 8.487696797513108e-06,
"loss": 0.7777,
"step": 27700
},
{
"epoch": 0.5593983449455442,
"grad_norm": 0.5667701363563538,
"learning_rate": 8.423965488742885e-06,
"loss": 0.7856,
"step": 27800
},
{
"epoch": 0.5614105692079382,
"grad_norm": 0.632291316986084,
"learning_rate": 8.360299757122247e-06,
"loss": 0.7792,
"step": 27900
},
{
"epoch": 0.5614105692079382,
"eval_loss": 0.5353109240531921,
"eval_runtime": 11.3749,
"eval_samples_per_second": 33.231,
"eval_steps_per_second": 1.143,
"step": 27900
},
{
"epoch": 0.5634227934703323,
"grad_norm": 0.5472155213356018,
"learning_rate": 8.296702251715778e-06,
"loss": 0.7831,
"step": 28000
},
{
"epoch": 0.5654350177327263,
"grad_norm": 0.590352475643158,
"learning_rate": 8.233175618749243e-06,
"loss": 0.7833,
"step": 28100
},
{
"epoch": 0.5674472419951203,
"grad_norm": 0.5392365455627441,
"learning_rate": 8.16972250149947e-06,
"loss": 0.7846,
"step": 28200
},
{
"epoch": 0.5674472419951203,
"eval_loss": 0.5345659852027893,
"eval_runtime": 11.3797,
"eval_samples_per_second": 33.217,
"eval_steps_per_second": 1.142,
"step": 28200
},
{
"epoch": 0.5694594662575144,
"grad_norm": 0.5367996692657471,
"learning_rate": 8.106345540184382e-06,
"loss": 0.7881,
"step": 28300
},
{
"epoch": 0.5714716905199084,
"grad_norm": 0.7017585039138794,
"learning_rate": 8.043047371853135e-06,
"loss": 0.7902,
"step": 28400
},
{
"epoch": 0.5734839147823025,
"grad_norm": 0.6775383353233337,
"learning_rate": 7.979830630276384e-06,
"loss": 0.795,
"step": 28500
},
{
"epoch": 0.5734839147823025,
"eval_loss": 0.5349369645118713,
"eval_runtime": 11.3477,
"eval_samples_per_second": 33.311,
"eval_steps_per_second": 1.146,
"step": 28500
},
{
"epoch": 0.5754961390446965,
"grad_norm": 0.5782616138458252,
"learning_rate": 7.91669794583671e-06,
"loss": 0.7902,
"step": 28600
},
{
"epoch": 0.5775083633070905,
"grad_norm": 0.5419892072677612,
"learning_rate": 7.853651945419155e-06,
"loss": 0.7858,
"step": 28700
},
{
"epoch": 0.5795205875694847,
"grad_norm": 0.6611707210540771,
"learning_rate": 7.790695252301938e-06,
"loss": 0.7894,
"step": 28800
},
{
"epoch": 0.5795205875694847,
"eval_loss": 0.5343945026397705,
"eval_runtime": 11.4492,
"eval_samples_per_second": 33.015,
"eval_steps_per_second": 1.135,
"step": 28800
},
{
"epoch": 0.5815328118318787,
"grad_norm": 0.5788918137550354,
"learning_rate": 7.727830486047288e-06,
"loss": 0.7868,
"step": 28900
},
{
"epoch": 0.5835450360942727,
"grad_norm": 0.5480091571807861,
"learning_rate": 7.665060262392461e-06,
"loss": 0.7858,
"step": 29000
},
{
"epoch": 0.5855572603566668,
"grad_norm": 0.730056881904602,
"learning_rate": 7.602387193140887e-06,
"loss": 0.7884,
"step": 29100
},
{
"epoch": 0.5855572603566668,
"eval_loss": 0.5339014530181885,
"eval_runtime": 11.3802,
"eval_samples_per_second": 33.216,
"eval_steps_per_second": 1.142,
"step": 29100
},
{
"epoch": 0.5875694846190608,
"grad_norm": 0.5774337649345398,
"learning_rate": 7.539813886053502e-06,
"loss": 0.7893,
"step": 29200
},
{
"epoch": 0.5895817088814549,
"grad_norm": 0.615470290184021,
"learning_rate": 7.477342944740249e-06,
"loss": 0.7817,
"step": 29300
},
{
"epoch": 0.5915939331438489,
"grad_norm": 0.6776989698410034,
"learning_rate": 7.414976968551735e-06,
"loss": 0.7783,
"step": 29400
},
{
"epoch": 0.5915939331438489,
"eval_loss": 0.533939003944397,
"eval_runtime": 11.3711,
"eval_samples_per_second": 33.242,
"eval_steps_per_second": 1.143,
"step": 29400
},
{
"epoch": 0.5936061574062429,
"grad_norm": 0.5885875821113586,
"learning_rate": 7.352718552471077e-06,
"loss": 0.784,
"step": 29500
},
{
"epoch": 0.595618381668637,
"grad_norm": 0.5772850513458252,
"learning_rate": 7.290570287005931e-06,
"loss": 0.7819,
"step": 29600
},
{
"epoch": 0.597630605931031,
"grad_norm": 0.6122897863388062,
"learning_rate": 7.228534758080694e-06,
"loss": 0.7891,
"step": 29700
},
{
"epoch": 0.597630605931031,
"eval_loss": 0.5327485799789429,
"eval_runtime": 11.3326,
"eval_samples_per_second": 33.355,
"eval_steps_per_second": 1.147,
"step": 29700
},
{
"epoch": 0.599642830193425,
"grad_norm": 0.6210538148880005,
"learning_rate": 7.1666145469289226e-06,
"loss": 0.7832,
"step": 29800
},
{
"epoch": 0.6016550544558191,
"grad_norm": 0.593087911605835,
"learning_rate": 7.1048122299859145e-06,
"loss": 0.7888,
"step": 29900
},
{
"epoch": 0.6036672787182131,
"grad_norm": 0.5805263519287109,
"learning_rate": 7.043130378781516e-06,
"loss": 0.7825,
"step": 30000
},
{
"epoch": 0.6036672787182131,
"eval_loss": 0.5322030782699585,
"eval_runtime": 11.3763,
"eval_samples_per_second": 33.227,
"eval_steps_per_second": 1.143,
"step": 30000
},
{
"epoch": 0.6056795029806072,
"grad_norm": 0.5463854074478149,
"learning_rate": 6.981571559833122e-06,
"loss": 0.7881,
"step": 30100
},
{
"epoch": 0.6076917272430012,
"grad_norm": 0.5730445384979248,
"learning_rate": 6.920138334538878e-06,
"loss": 0.7858,
"step": 30200
},
{
"epoch": 0.6097039515053952,
"grad_norm": 0.5871597528457642,
"learning_rate": 6.858833259071108e-06,
"loss": 0.7777,
"step": 30300
},
{
"epoch": 0.6097039515053952,
"eval_loss": 0.5328507423400879,
"eval_runtime": 11.3806,
"eval_samples_per_second": 33.215,
"eval_steps_per_second": 1.142,
"step": 30300
},
{
"epoch": 0.6117161757677894,
"grad_norm": 0.6252338290214539,
"learning_rate": 6.797658884269962e-06,
"loss": 0.778,
"step": 30400
},
{
"epoch": 0.6137284000301834,
"grad_norm": 0.588524580001831,
"learning_rate": 6.736617755537267e-06,
"loss": 0.7772,
"step": 30500
},
{
"epoch": 0.6157406242925774,
"grad_norm": 0.621525228023529,
"learning_rate": 6.675712412730625e-06,
"loss": 0.7832,
"step": 30600
},
{
"epoch": 0.6157406242925774,
"eval_loss": 0.5325730443000793,
"eval_runtime": 11.3314,
"eval_samples_per_second": 33.359,
"eval_steps_per_second": 1.147,
"step": 30600
},
{
"epoch": 0.6177528485549715,
"grad_norm": 0.5612871646881104,
"learning_rate": 6.614945390057723e-06,
"loss": 0.7831,
"step": 30700
},
{
"epoch": 0.6197650728173655,
"grad_norm": 0.5247837901115417,
"learning_rate": 6.554319215970895e-06,
"loss": 0.7828,
"step": 30800
},
{
"epoch": 0.6217772970797596,
"grad_norm": 0.5758721232414246,
"learning_rate": 6.493836413061907e-06,
"loss": 0.781,
"step": 30900
},
{
"epoch": 0.6217772970797596,
"eval_loss": 0.5314515829086304,
"eval_runtime": 11.3823,
"eval_samples_per_second": 33.21,
"eval_steps_per_second": 1.142,
"step": 30900
},
{
"epoch": 0.6237895213421536,
"grad_norm": 0.7134236693382263,
"learning_rate": 6.433499497957006e-06,
"loss": 0.7852,
"step": 31000
},
{
"epoch": 0.6258017456045476,
"grad_norm": 0.5432785153388977,
"learning_rate": 6.373310981212197e-06,
"loss": 0.7776,
"step": 31100
},
{
"epoch": 0.6278139698669417,
"grad_norm": 0.6110942959785461,
"learning_rate": 6.3132733672087875e-06,
"loss": 0.787,
"step": 31200
},
{
"epoch": 0.6278139698669417,
"eval_loss": 0.5303037166595459,
"eval_runtime": 11.4219,
"eval_samples_per_second": 33.094,
"eval_steps_per_second": 1.138,
"step": 31200
},
{
"epoch": 0.6298261941293357,
"grad_norm": 0.5783369541168213,
"learning_rate": 6.253389154049177e-06,
"loss": 0.7807,
"step": 31300
},
{
"epoch": 0.6318384183917297,
"grad_norm": 0.5356603860855103,
"learning_rate": 6.19366083345291e-06,
"loss": 0.7801,
"step": 31400
},
{
"epoch": 0.6338506426541238,
"grad_norm": 0.5529428124427795,
"learning_rate": 6.134090890653015e-06,
"loss": 0.7774,
"step": 31500
},
{
"epoch": 0.6338506426541238,
"eval_loss": 0.5301904678344727,
"eval_runtime": 11.4476,
"eval_samples_per_second": 33.02,
"eval_steps_per_second": 1.136,
"step": 31500
},
{
"epoch": 0.6358628669165178,
"grad_norm": 0.5553627610206604,
"learning_rate": 6.074681804292581e-06,
"loss": 0.7791,
"step": 31600
},
{
"epoch": 0.6378750911789118,
"grad_norm": 0.5281953811645508,
"learning_rate": 6.0154360463216325e-06,
"loss": 0.7769,
"step": 31700
},
{
"epoch": 0.639887315441306,
"grad_norm": 0.6406475305557251,
"learning_rate": 5.956356081894259e-06,
"loss": 0.7799,
"step": 31800
},
{
"epoch": 0.639887315441306,
"eval_loss": 0.5294053554534912,
"eval_runtime": 11.3422,
"eval_samples_per_second": 33.327,
"eval_steps_per_second": 1.146,
"step": 31800
},
{
"epoch": 0.6418995397037,
"grad_norm": 0.49855828285217285,
"learning_rate": 5.897444369266066e-06,
"loss": 0.7759,
"step": 31900
},
{
"epoch": 0.6439117639660941,
"grad_norm": 0.5699638724327087,
"learning_rate": 5.838703359691873e-06,
"loss": 0.7673,
"step": 32000
},
{
"epoch": 0.6459239882284881,
"grad_norm": 0.5306676030158997,
"learning_rate": 5.780135497323724e-06,
"loss": 0.7799,
"step": 32100
},
{
"epoch": 0.6459239882284881,
"eval_loss": 0.5290261507034302,
"eval_runtime": 11.3435,
"eval_samples_per_second": 33.323,
"eval_steps_per_second": 1.146,
"step": 32100
},
{
"epoch": 0.6479362124908821,
"grad_norm": 0.5989037752151489,
"learning_rate": 5.721743219109187e-06,
"loss": 0.7757,
"step": 32200
},
{
"epoch": 0.6499484367532762,
"grad_norm": 0.5595914721488953,
"learning_rate": 5.663528954689958e-06,
"loss": 0.7761,
"step": 32300
},
{
"epoch": 0.6519606610156702,
"grad_norm": 0.5618345737457275,
"learning_rate": 5.605495126300766e-06,
"loss": 0.779,
"step": 32400
},
{
"epoch": 0.6519606610156702,
"eval_loss": 0.529247522354126,
"eval_runtime": 11.3716,
"eval_samples_per_second": 33.241,
"eval_steps_per_second": 1.143,
"step": 32400
},
{
"epoch": 0.6539728852780642,
"grad_norm": 0.5271475315093994,
"learning_rate": 5.547644148668585e-06,
"loss": 0.7747,
"step": 32500
},
{
"epoch": 0.6559851095404583,
"grad_norm": 0.5703973770141602,
"learning_rate": 5.489978428912157e-06,
"loss": 0.7801,
"step": 32600
},
{
"epoch": 0.6579973338028523,
"grad_norm": 0.570797860622406,
"learning_rate": 5.432500366441843e-06,
"loss": 0.7756,
"step": 32700
},
{
"epoch": 0.6579973338028523,
"eval_loss": 0.5275307893753052,
"eval_runtime": 11.3412,
"eval_samples_per_second": 33.33,
"eval_steps_per_second": 1.146,
"step": 32700
},
{
"epoch": 0.6600095580652464,
"grad_norm": 0.564414918422699,
"learning_rate": 5.3752123528597746e-06,
"loss": 0.7688,
"step": 32800
},
{
"epoch": 0.6620217823276404,
"grad_norm": 0.5405446290969849,
"learning_rate": 5.318116771860351e-06,
"loss": 0.7777,
"step": 32900
},
{
"epoch": 0.6640340065900344,
"grad_norm": 0.5645068883895874,
"learning_rate": 5.261215999131055e-06,
"loss": 0.7723,
"step": 33000
},
{
"epoch": 0.6640340065900344,
"eval_loss": 0.5280060172080994,
"eval_runtime": 11.3103,
"eval_samples_per_second": 33.421,
"eval_steps_per_second": 1.149,
"step": 33000
},
{
"epoch": 0.6660462308524285,
"grad_norm": 0.5821409225463867,
"learning_rate": 5.204512402253592e-06,
"loss": 0.7857,
"step": 33100
},
{
"epoch": 0.6680584551148225,
"grad_norm": 0.5534176230430603,
"learning_rate": 5.148008340605393e-06,
"loss": 0.7726,
"step": 33200
},
{
"epoch": 0.6700706793772165,
"grad_norm": 0.5734113454818726,
"learning_rate": 5.091706165261438e-06,
"loss": 0.7806,
"step": 33300
},
{
"epoch": 0.6700706793772165,
"eval_loss": 0.527226984500885,
"eval_runtime": 11.3532,
"eval_samples_per_second": 33.295,
"eval_steps_per_second": 1.145,
"step": 33300
},
{
"epoch": 0.6720829036396107,
"grad_norm": 0.5118337273597717,
"learning_rate": 5.035608218896424e-06,
"loss": 0.7794,
"step": 33400
},
{
"epoch": 0.6740951279020047,
"grad_norm": 0.520524799823761,
"learning_rate": 4.979716835687296e-06,
"loss": 0.7833,
"step": 33500
},
{
"epoch": 0.6761073521643988,
"grad_norm": 0.5260956883430481,
"learning_rate": 4.924034341216123e-06,
"loss": 0.7722,
"step": 33600
},
{
"epoch": 0.6761073521643988,
"eval_loss": 0.5266076326370239,
"eval_runtime": 11.3351,
"eval_samples_per_second": 33.348,
"eval_steps_per_second": 1.147,
"step": 33600
},
{
"epoch": 0.6781195764267928,
"grad_norm": 0.5933238863945007,
"learning_rate": 4.868563052373329e-06,
"loss": 0.778,
"step": 33700
},
{
"epoch": 0.6801318006891868,
"grad_norm": 0.5882487297058105,
"learning_rate": 4.813305277261294e-06,
"loss": 0.778,
"step": 33800
},
{
"epoch": 0.6821440249515809,
"grad_norm": 0.5495398640632629,
"learning_rate": 4.758263315098319e-06,
"loss": 0.7749,
"step": 33900
},
{
"epoch": 0.6821440249515809,
"eval_loss": 0.527021050453186,
"eval_runtime": 11.3019,
"eval_samples_per_second": 33.446,
"eval_steps_per_second": 1.15,
"step": 33900
},
{
"epoch": 0.6841562492139749,
"grad_norm": 0.5372888445854187,
"learning_rate": 4.703439456122942e-06,
"loss": 0.7726,
"step": 34000
},
{
"epoch": 0.6861684734763689,
"grad_norm": 0.5453928709030151,
"learning_rate": 4.648835981498665e-06,
"loss": 0.7736,
"step": 34100
},
{
"epoch": 0.688180697738763,
"grad_norm": 0.534249484539032,
"learning_rate": 4.594455163219025e-06,
"loss": 0.7669,
"step": 34200
},
{
"epoch": 0.688180697738763,
"eval_loss": 0.5258325934410095,
"eval_runtime": 11.3315,
"eval_samples_per_second": 33.358,
"eval_steps_per_second": 1.147,
"step": 34200
},
{
"epoch": 0.690192922001157,
"grad_norm": 0.602557897567749,
"learning_rate": 4.5402992640130615e-06,
"loss": 0.7776,
"step": 34300
},
{
"epoch": 0.6922051462635511,
"grad_norm": 0.6340908408164978,
"learning_rate": 4.486370537251166e-06,
"loss": 0.7724,
"step": 34400
},
{
"epoch": 0.6942173705259451,
"grad_norm": 0.5442144870758057,
"learning_rate": 4.43267122685132e-06,
"loss": 0.7678,
"step": 34500
},
{
"epoch": 0.6942173705259451,
"eval_loss": 0.52588951587677,
"eval_runtime": 11.3113,
"eval_samples_per_second": 33.418,
"eval_steps_per_second": 1.149,
"step": 34500
},
{
"epoch": 0.6962295947883391,
"grad_norm": 0.5438702702522278,
"learning_rate": 4.379203567185733e-06,
"loss": 0.7722,
"step": 34600
},
{
"epoch": 0.6982418190507332,
"grad_norm": 0.575579822063446,
"learning_rate": 4.325969782987868e-06,
"loss": 0.7806,
"step": 34700
},
{
"epoch": 0.7002540433131272,
"grad_norm": 0.53037029504776,
"learning_rate": 4.2729720892598725e-06,
"loss": 0.7677,
"step": 34800
},
{
"epoch": 0.7002540433131272,
"eval_loss": 0.5252464413642883,
"eval_runtime": 11.2976,
"eval_samples_per_second": 33.458,
"eval_steps_per_second": 1.151,
"step": 34800
},
{
"epoch": 0.7022662675755212,
"grad_norm": 0.5570893883705139,
"learning_rate": 4.220212691180422e-06,
"loss": 0.7674,
"step": 34900
},
{
"epoch": 0.7042784918379154,
"grad_norm": 0.564457893371582,
"learning_rate": 4.167693784012948e-06,
"loss": 0.7774,
"step": 35000
},
{
"epoch": 0.7062907161003094,
"grad_norm": 0.6193362474441528,
"learning_rate": 4.115417553014317e-06,
"loss": 0.7739,
"step": 35100
},
{
"epoch": 0.7062907161003094,
"eval_loss": 0.5251539349555969,
"eval_runtime": 11.3037,
"eval_samples_per_second": 33.44,
"eval_steps_per_second": 1.15,
"step": 35100
},
{
"epoch": 0.7083029403627035,
"grad_norm": 0.5650792121887207,
"learning_rate": 4.063386173343888e-06,
"loss": 0.775,
"step": 35200
},
{
"epoch": 0.7103151646250975,
"grad_norm": 0.5598296523094177,
"learning_rate": 4.0116018099730155e-06,
"loss": 0.7736,
"step": 35300
},
{
"epoch": 0.7123273888874915,
"grad_norm": 0.5999264717102051,
"learning_rate": 3.960066617594962e-06,
"loss": 0.7728,
"step": 35400
},
{
"epoch": 0.7123273888874915,
"eval_loss": 0.5251903533935547,
"eval_runtime": 11.3608,
"eval_samples_per_second": 33.272,
"eval_steps_per_second": 1.144,
"step": 35400
},
{
"epoch": 0.7143396131498856,
"grad_norm": 0.5485169291496277,
"learning_rate": 3.908782740535244e-06,
"loss": 0.7663,
"step": 35500
},
{
"epoch": 0.7163518374122796,
"grad_norm": 0.5973437428474426,
"learning_rate": 3.857752312662413e-06,
"loss": 0.7731,
"step": 35600
},
{
"epoch": 0.7183640616746736,
"grad_norm": 0.559617280960083,
"learning_rate": 3.8069774572992614e-06,
"loss": 0.7623,
"step": 35700
},
{
"epoch": 0.7183640616746736,
"eval_loss": 0.5247710347175598,
"eval_runtime": 11.3529,
"eval_samples_per_second": 33.296,
"eval_steps_per_second": 1.145,
"step": 35700
},
{
"epoch": 0.7203762859370677,
"grad_norm": 0.5565606355667114,
"learning_rate": 3.756460287134479e-06,
"loss": 0.7773,
"step": 35800
},
{
"epoch": 0.7223885101994617,
"grad_norm": 0.5371571779251099,
"learning_rate": 3.706202904134747e-06,
"loss": 0.7761,
"step": 35900
},
{
"epoch": 0.7244007344618558,
"grad_norm": 0.5425861477851868,
"learning_rate": 3.6562073994572624e-06,
"loss": 0.7775,
"step": 36000
},
{
"epoch": 0.7244007344618558,
"eval_loss": 0.5243012309074402,
"eval_runtime": 11.3858,
"eval_samples_per_second": 33.199,
"eval_steps_per_second": 1.142,
"step": 36000
},
{
"epoch": 0.7264129587242498,
"grad_norm": 0.5546737909317017,
"learning_rate": 3.6064758533627496e-06,
"loss": 0.7712,
"step": 36100
},
{
"epoch": 0.7284251829866438,
"grad_norm": 0.6678885221481323,
"learning_rate": 3.55701033512889e-06,
"loss": 0.769,
"step": 36200
},
{
"epoch": 0.7304374072490379,
"grad_norm": 0.5747791528701782,
"learning_rate": 3.5078129029642192e-06,
"loss": 0.7671,
"step": 36300
},
{
"epoch": 0.7304374072490379,
"eval_loss": 0.523876428604126,
"eval_runtime": 11.3643,
"eval_samples_per_second": 33.262,
"eval_steps_per_second": 1.144,
"step": 36300
},
{
"epoch": 0.7324496315114319,
"grad_norm": 0.6479108333587646,
"learning_rate": 3.458885603922498e-06,
"loss": 0.7678,
"step": 36400
},
{
"epoch": 0.734461855773826,
"grad_norm": 0.5260623693466187,
"learning_rate": 3.4102304738175264e-06,
"loss": 0.7686,
"step": 36500
},
{
"epoch": 0.7364740800362201,
"grad_norm": 0.5565561056137085,
"learning_rate": 3.3618495371384384e-06,
"loss": 0.7722,
"step": 36600
},
{
"epoch": 0.7364740800362201,
"eval_loss": 0.5241602659225464,
"eval_runtime": 11.2637,
"eval_samples_per_second": 33.559,
"eval_steps_per_second": 1.154,
"step": 36600
},
{
"epoch": 0.7384863042986141,
"grad_norm": 0.5522435307502747,
"learning_rate": 3.3137448069654687e-06,
"loss": 0.7753,
"step": 36700
},
{
"epoch": 0.7404985285610082,
"grad_norm": 0.5111953020095825,
"learning_rate": 3.265918284886186e-06,
"loss": 0.7739,
"step": 36800
},
{
"epoch": 0.7425107528234022,
"grad_norm": 0.5280485153198242,
"learning_rate": 3.2183719609122146e-06,
"loss": 0.7626,
"step": 36900
},
{
"epoch": 0.7425107528234022,
"eval_loss": 0.5227437615394592,
"eval_runtime": 11.3194,
"eval_samples_per_second": 33.394,
"eval_steps_per_second": 1.148,
"step": 36900
},
{
"epoch": 0.7445229770857962,
"grad_norm": 0.5183678865432739,
"learning_rate": 3.171107813396418e-06,
"loss": 0.7745,
"step": 37000
},
{
"epoch": 0.7465352013481903,
"grad_norm": 0.5712314248085022,
"learning_rate": 3.124127808950602e-06,
"loss": 0.7711,
"step": 37100
},
{
"epoch": 0.7485474256105843,
"grad_norm": 0.5488412380218506,
"learning_rate": 3.0774339023636756e-06,
"loss": 0.7689,
"step": 37200
},
{
"epoch": 0.7485474256105843,
"eval_loss": 0.5230608582496643,
"eval_runtime": 11.338,
"eval_samples_per_second": 33.339,
"eval_steps_per_second": 1.147,
"step": 37200
},
{
"epoch": 0.7505596498729783,
"grad_norm": 0.5331023335456848,
"learning_rate": 3.0310280365203102e-06,
"loss": 0.7663,
"step": 37300
},
{
"epoch": 0.7525718741353724,
"grad_norm": 0.5227448344230652,
"learning_rate": 2.9849121423201054e-06,
"loss": 0.7645,
"step": 37400
},
{
"epoch": 0.7545840983977664,
"grad_norm": 0.5383438467979431,
"learning_rate": 2.9390881385972445e-06,
"loss": 0.7624,
"step": 37500
},
{
"epoch": 0.7545840983977664,
"eval_loss": 0.5230525732040405,
"eval_runtime": 11.3076,
"eval_samples_per_second": 33.429,
"eval_steps_per_second": 1.15,
"step": 37500
},
{
"epoch": 0.7565963226601605,
"grad_norm": 0.5267183184623718,
"learning_rate": 2.8935579320406504e-06,
"loss": 0.7744,
"step": 37600
},
{
"epoch": 0.7586085469225545,
"grad_norm": 0.5995730757713318,
"learning_rate": 2.8483234171146544e-06,
"loss": 0.77,
"step": 37700
},
{
"epoch": 0.7606207711849485,
"grad_norm": 0.5342182517051697,
"learning_rate": 2.803386475980171e-06,
"loss": 0.772,
"step": 37800
},
{
"epoch": 0.7606207711849485,
"eval_loss": 0.5222497582435608,
"eval_runtime": 11.6813,
"eval_samples_per_second": 32.36,
"eval_steps_per_second": 1.113,
"step": 37800
},
{
"epoch": 0.7626329954473426,
"grad_norm": 0.5149078965187073,
"learning_rate": 2.758748978416369e-06,
"loss": 0.7675,
"step": 37900
},
{
"epoch": 0.7646452197097366,
"grad_norm": 0.5688450932502747,
"learning_rate": 2.7144127817428965e-06,
"loss": 0.7655,
"step": 38000
},
{
"epoch": 0.7666574439721306,
"grad_norm": 0.5706648826599121,
"learning_rate": 2.6703797307425792e-06,
"loss": 0.7645,
"step": 38100
},
{
"epoch": 0.7666574439721306,
"eval_loss": 0.5218858122825623,
"eval_runtime": 11.6659,
"eval_samples_per_second": 32.402,
"eval_steps_per_second": 1.114,
"step": 38100
},
{
"epoch": 0.7686696682345248,
"grad_norm": 0.5271847248077393,
"learning_rate": 2.626651657584672e-06,
"loss": 0.7699,
"step": 38200
},
{
"epoch": 0.7706818924969188,
"grad_norm": 0.5311073064804077,
"learning_rate": 2.5832303817486137e-06,
"loss": 0.766,
"step": 38300
},
{
"epoch": 0.7726941167593129,
"grad_norm": 0.5762016177177429,
"learning_rate": 2.540117709948332e-06,
"loss": 0.7612,
"step": 38400
},
{
"epoch": 0.7726941167593129,
"eval_loss": 0.5214508175849915,
"eval_runtime": 11.4525,
"eval_samples_per_second": 33.006,
"eval_steps_per_second": 1.135,
"step": 38400
},
{
"epoch": 0.7747063410217069,
"grad_norm": 0.5659816861152649,
"learning_rate": 2.497315436057064e-06,
"loss": 0.7693,
"step": 38500
},
{
"epoch": 0.7767185652841009,
"grad_norm": 0.530085563659668,
"learning_rate": 2.4548253410327104e-06,
"loss": 0.7598,
"step": 38600
},
{
"epoch": 0.778730789546495,
"grad_norm": 0.624070405960083,
"learning_rate": 2.412649192843739e-06,
"loss": 0.7722,
"step": 38700
},
{
"epoch": 0.778730789546495,
"eval_loss": 0.5214821100234985,
"eval_runtime": 11.3194,
"eval_samples_per_second": 33.394,
"eval_steps_per_second": 1.148,
"step": 38700
},
{
"epoch": 0.780743013808889,
"grad_norm": 0.5348799228668213,
"learning_rate": 2.3707887463956146e-06,
"loss": 0.7615,
"step": 38800
},
{
"epoch": 0.782755238071283,
"grad_norm": 0.5490187406539917,
"learning_rate": 2.3292457434577854e-06,
"loss": 0.7714,
"step": 38900
},
{
"epoch": 0.7847674623336771,
"grad_norm": 0.5568532943725586,
"learning_rate": 2.2880219125912064e-06,
"loss": 0.7604,
"step": 39000
},
{
"epoch": 0.7847674623336771,
"eval_loss": 0.5214923620223999,
"eval_runtime": 11.3214,
"eval_samples_per_second": 33.388,
"eval_steps_per_second": 1.148,
"step": 39000
},
{
"epoch": 0.7867796865960711,
"grad_norm": 0.5511381030082703,
"learning_rate": 2.2471189690764093e-06,
"loss": 0.7644,
"step": 39100
},
{
"epoch": 0.7887919108584652,
"grad_norm": 0.5425460338592529,
"learning_rate": 2.2065386148421486e-06,
"loss": 0.7633,
"step": 39200
},
{
"epoch": 0.7908041351208592,
"grad_norm": 0.4867189824581146,
"learning_rate": 2.1662825383945686e-06,
"loss": 0.7674,
"step": 39300
},
{
"epoch": 0.7908041351208592,
"eval_loss": 0.5209300518035889,
"eval_runtime": 11.3182,
"eval_samples_per_second": 33.397,
"eval_steps_per_second": 1.149,
"step": 39300
},
{
"epoch": 0.7928163593832532,
"grad_norm": 0.5154452919960022,
"learning_rate": 2.1263524147469573e-06,
"loss": 0.7663,
"step": 39400
},
{
"epoch": 0.7948285836456473,
"grad_norm": 0.5264437198638916,
"learning_rate": 2.0867499053500473e-06,
"loss": 0.7642,
"step": 39500
},
{
"epoch": 0.7968408079080413,
"grad_norm": 0.5303503274917603,
"learning_rate": 2.047476658022881e-06,
"loss": 0.7722,
"step": 39600
},
{
"epoch": 0.7968408079080413,
"eval_loss": 0.5208966135978699,
"eval_runtime": 11.3632,
"eval_samples_per_second": 33.265,
"eval_steps_per_second": 1.144,
"step": 39600
},
{
"epoch": 0.7988530321704354,
"grad_norm": 0.5367266535758972,
"learning_rate": 2.0085343068842546e-06,
"loss": 0.753,
"step": 39700
},
{
"epoch": 0.8008652564328295,
"grad_norm": 0.5081086754798889,
"learning_rate": 1.9699244722847143e-06,
"loss": 0.7571,
"step": 39800
},
{
"epoch": 0.8028774806952235,
"grad_norm": 0.5019336938858032,
"learning_rate": 1.9316487607391465e-06,
"loss": 0.7723,
"step": 39900
},
{
"epoch": 0.8028774806952235,
"eval_loss": 0.5206644535064697,
"eval_runtime": 11.3602,
"eval_samples_per_second": 33.274,
"eval_steps_per_second": 1.144,
"step": 39900
},
{
"epoch": 0.8048897049576176,
"grad_norm": 0.5184951424598694,
"learning_rate": 1.893708764859924e-06,
"loss": 0.7677,
"step": 40000
},
{
"epoch": 0.8069019292200116,
"grad_norm": 0.5265465974807739,
"learning_rate": 1.8561060632906369e-06,
"loss": 0.7686,
"step": 40100
},
{
"epoch": 0.8089141534824056,
"grad_norm": 0.5161654353141785,
"learning_rate": 1.8188422206404165e-06,
"loss": 0.769,
"step": 40200
},
{
"epoch": 0.8089141534824056,
"eval_loss": 0.5201809406280518,
"eval_runtime": 11.369,
"eval_samples_per_second": 33.248,
"eval_steps_per_second": 1.143,
"step": 40200
},
{
"epoch": 0.8109263777447997,
"grad_norm": 0.5580165982246399,
"learning_rate": 1.7819187874188293e-06,
"loss": 0.7686,
"step": 40300
},
{
"epoch": 0.8129386020071937,
"grad_norm": 0.5577532052993774,
"learning_rate": 1.7453372999713557e-06,
"loss": 0.7616,
"step": 40400
},
{
"epoch": 0.8149508262695877,
"grad_norm": 0.5307947993278503,
"learning_rate": 1.709099280415476e-06,
"loss": 0.7705,
"step": 40500
},
{
"epoch": 0.8149508262695877,
"eval_loss": 0.5200989842414856,
"eval_runtime": 11.3357,
"eval_samples_per_second": 33.346,
"eval_steps_per_second": 1.147,
"step": 40500
},
{
"epoch": 0.8169630505319818,
"grad_norm": 0.5261068940162659,
"learning_rate": 1.6732062365773272e-06,
"loss": 0.7674,
"step": 40600
},
{
"epoch": 0.8189752747943758,
"grad_norm": 0.4946574568748474,
"learning_rate": 1.6376596619289653e-06,
"loss": 0.7654,
"step": 40700
},
{
"epoch": 0.8209874990567699,
"grad_norm": 0.5491064786911011,
"learning_rate": 1.6024610355262282e-06,
"loss": 0.7695,
"step": 40800
},
{
"epoch": 0.8209874990567699,
"eval_loss": 0.5198547840118408,
"eval_runtime": 11.316,
"eval_samples_per_second": 33.404,
"eval_steps_per_second": 1.149,
"step": 40800
},
{
"epoch": 0.8229997233191639,
"grad_norm": 0.5306958556175232,
"learning_rate": 1.5676118219471891e-06,
"loss": 0.7619,
"step": 40900
},
{
"epoch": 0.8250119475815579,
"grad_norm": 0.5380471348762512,
"learning_rate": 1.5331134712312235e-06,
"loss": 0.767,
"step": 41000
},
{
"epoch": 0.827024171843952,
"grad_norm": 0.5167573094367981,
"learning_rate": 1.4989674188186598e-06,
"loss": 0.7599,
"step": 41100
},
{
"epoch": 0.827024171843952,
"eval_loss": 0.5196862816810608,
"eval_runtime": 11.2973,
"eval_samples_per_second": 33.459,
"eval_steps_per_second": 1.151,
"step": 41100
},
{
"epoch": 0.829036396106346,
"grad_norm": 0.5409244894981384,
"learning_rate": 1.4651750854910685e-06,
"loss": 0.7587,
"step": 41200
},
{
"epoch": 0.83104862036874,
"grad_norm": 0.5431727170944214,
"learning_rate": 1.4317378773121393e-06,
"loss": 0.7579,
"step": 41300
},
{
"epoch": 0.8330608446311342,
"grad_norm": 0.53000807762146,
"learning_rate": 1.3986571855691744e-06,
"loss": 0.7688,
"step": 41400
},
{
"epoch": 0.8330608446311342,
"eval_loss": 0.5197826623916626,
"eval_runtime": 11.3928,
"eval_samples_per_second": 33.179,
"eval_steps_per_second": 1.141,
"step": 41400
},
{
"epoch": 0.8350730688935282,
"grad_norm": 0.5434339046478271,
"learning_rate": 1.3659343867151975e-06,
"loss": 0.7695,
"step": 41500
},
{
"epoch": 0.8370852931559222,
"grad_norm": 0.5368450284004211,
"learning_rate": 1.3335708423116856e-06,
"loss": 0.7636,
"step": 41600
},
{
"epoch": 0.8390975174183163,
"grad_norm": 0.5331200361251831,
"learning_rate": 1.3015678989719116e-06,
"loss": 0.7696,
"step": 41700
},
{
"epoch": 0.8390975174183163,
"eval_loss": 0.519400954246521,
"eval_runtime": 11.3064,
"eval_samples_per_second": 33.432,
"eval_steps_per_second": 1.15,
"step": 41700
},
{
"epoch": 0.8411097416807103,
"grad_norm": 0.5858904123306274,
"learning_rate": 1.2699268883049154e-06,
"loss": 0.7648,
"step": 41800
},
{
"epoch": 0.8431219659431044,
"grad_norm": 0.5302870273590088,
"learning_rate": 1.2386491268600976e-06,
"loss": 0.7553,
"step": 41900
},
{
"epoch": 0.8451341902054984,
"grad_norm": 0.4971041679382324,
"learning_rate": 1.2077359160724388e-06,
"loss": 0.7655,
"step": 42000
},
{
"epoch": 0.8451341902054984,
"eval_loss": 0.519396960735321,
"eval_runtime": 11.3912,
"eval_samples_per_second": 33.183,
"eval_steps_per_second": 1.141,
"step": 42000
},
{
"epoch": 0.8471464144678924,
"grad_norm": 0.5351930856704712,
"learning_rate": 1.1771885422083418e-06,
"loss": 0.7603,
"step": 42100
},
{
"epoch": 0.8491586387302865,
"grad_norm": 0.4970718026161194,
"learning_rate": 1.1470082763121227e-06,
"loss": 0.7661,
"step": 42200
},
{
"epoch": 0.8511708629926805,
"grad_norm": 0.5322678089141846,
"learning_rate": 1.1171963741531178e-06,
"loss": 0.7616,
"step": 42300
},
{
"epoch": 0.8511708629926805,
"eval_loss": 0.5193082094192505,
"eval_runtime": 11.3559,
"eval_samples_per_second": 33.287,
"eval_steps_per_second": 1.145,
"step": 42300
},
{
"epoch": 0.8531830872550745,
"grad_norm": 0.5380090475082397,
"learning_rate": 1.0877540761734317e-06,
"loss": 0.7623,
"step": 42400
},
{
"epoch": 0.8551953115174686,
"grad_norm": 0.5419859290122986,
"learning_rate": 1.0586826074363277e-06,
"loss": 0.761,
"step": 42500
},
{
"epoch": 0.8572075357798626,
"grad_norm": 0.5447313189506531,
"learning_rate": 1.0299831775752478e-06,
"loss": 0.7635,
"step": 42600
},
{
"epoch": 0.8572075357798626,
"eval_loss": 0.5189518332481384,
"eval_runtime": 11.3146,
"eval_samples_per_second": 33.408,
"eval_steps_per_second": 1.149,
"step": 42600
},
{
"epoch": 0.8592197600422568,
"grad_norm": 0.5054132342338562,
"learning_rate": 1.0016569807434894e-06,
"loss": 0.7553,
"step": 42700
},
{
"epoch": 0.8612319843046508,
"grad_norm": 0.5626354217529297,
"learning_rate": 9.737051955645104e-07,
"loss": 0.76,
"step": 42800
},
{
"epoch": 0.8632442085670448,
"grad_norm": 0.6139233112335205,
"learning_rate": 9.461289850828936e-07,
"loss": 0.7586,
"step": 42900
},
{
"epoch": 0.8632442085670448,
"eval_loss": 0.5188504457473755,
"eval_runtime": 11.3931,
"eval_samples_per_second": 33.178,
"eval_steps_per_second": 1.141,
"step": 42900
},
{
"epoch": 0.8652564328294389,
"grad_norm": 0.5168823003768921,
"learning_rate": 9.189294967159457e-07,
"loss": 0.7569,
"step": 43000
},
{
"epoch": 0.8672686570918329,
"grad_norm": 0.5103846192359924,
"learning_rate": 8.921078622059643e-07,
"loss": 0.7598,
"step": 43100
},
{
"epoch": 0.8692808813542269,
"grad_norm": 0.5376741290092468,
"learning_rate": 8.656651975731434e-07,
"loss": 0.7687,
"step": 43200
},
{
"epoch": 0.8692808813542269,
"eval_loss": 0.5187187790870667,
"eval_runtime": 11.3132,
"eval_samples_per_second": 33.412,
"eval_steps_per_second": 1.149,
"step": 43200
},
{
"epoch": 0.871293105616621,
"grad_norm": 0.5139674544334412,
"learning_rate": 8.396026030691329e-07,
"loss": 0.7543,
"step": 43300
},
{
"epoch": 0.873305329879015,
"grad_norm": 0.4912608563899994,
"learning_rate": 8.139211631312638e-07,
"loss": 0.759,
"step": 43400
},
{
"epoch": 0.8753175541414091,
"grad_norm": 0.5286913514137268,
"learning_rate": 7.886219463374256e-07,
"loss": 0.7579,
"step": 43500
},
{
"epoch": 0.8753175541414091,
"eval_loss": 0.5185059905052185,
"eval_runtime": 11.3249,
"eval_samples_per_second": 33.378,
"eval_steps_per_second": 1.148,
"step": 43500
},
{
"epoch": 0.8773297784038031,
"grad_norm": 0.4960270822048187,
"learning_rate": 7.637060053615963e-07,
"loss": 0.7582,
"step": 43600
},
{
"epoch": 0.8793420026661971,
"grad_norm": 0.5134163498878479,
"learning_rate": 7.391743769300541e-07,
"loss": 0.7624,
"step": 43700
},
{
"epoch": 0.8813542269285912,
"grad_norm": 0.5594838857650757,
"learning_rate": 7.150280817782296e-07,
"loss": 0.7626,
"step": 43800
},
{
"epoch": 0.8813542269285912,
"eval_loss": 0.5184139013290405,
"eval_runtime": 11.3303,
"eval_samples_per_second": 33.362,
"eval_steps_per_second": 1.147,
"step": 43800
},
{
"epoch": 0.8833664511909852,
"grad_norm": 0.523009717464447,
"learning_rate": 6.912681246082409e-07,
"loss": 0.7554,
"step": 43900
},
{
"epoch": 0.8853786754533792,
"grad_norm": 0.50362229347229,
"learning_rate": 6.678954940470806e-07,
"loss": 0.758,
"step": 44000
},
{
"epoch": 0.8873908997157733,
"grad_norm": 0.5441898107528687,
"learning_rate": 6.449111626054927e-07,
"loss": 0.7573,
"step": 44100
},
{
"epoch": 0.8873908997157733,
"eval_loss": 0.5184325575828552,
"eval_runtime": 11.3938,
"eval_samples_per_second": 33.176,
"eval_steps_per_second": 1.141,
"step": 44100
},
{
"epoch": 0.8894031239781673,
"grad_norm": 0.520699679851532,
"learning_rate": 6.223160866374967e-07,
"loss": 0.7638,
"step": 44200
},
{
"epoch": 0.8914153482405615,
"grad_norm": 0.4745332598686218,
"learning_rate": 6.001112063005998e-07,
"loss": 0.7577,
"step": 44300
},
{
"epoch": 0.8934275725029555,
"grad_norm": 0.49645400047302246,
"learning_rate": 5.782974455166767e-07,
"loss": 0.7619,
"step": 44400
},
{
"epoch": 0.8934275725029555,
"eval_loss": 0.518170952796936,
"eval_runtime": 11.3133,
"eval_samples_per_second": 33.412,
"eval_steps_per_second": 1.149,
"step": 44400
},
{
"epoch": 0.8954397967653495,
"grad_norm": 0.5159271955490112,
"learning_rate": 5.568757119335244e-07,
"loss": 0.7571,
"step": 44500
},
{
"epoch": 0.8974520210277436,
"grad_norm": 0.5097435712814331,
"learning_rate": 5.358468968871e-07,
"loss": 0.7697,
"step": 44600
},
{
"epoch": 0.8994642452901376,
"grad_norm": 0.5482389330863953,
"learning_rate": 5.152118753644275e-07,
"loss": 0.7682,
"step": 44700
},
{
"epoch": 0.8994642452901376,
"eval_loss": 0.5181338787078857,
"eval_runtime": 11.4656,
"eval_samples_per_second": 32.968,
"eval_steps_per_second": 1.134,
"step": 44700
},
{
"epoch": 0.9014764695525316,
"grad_norm": 0.5253916382789612,
"learning_rate": 4.949715059671978e-07,
"loss": 0.7656,
"step": 44800
},
{
"epoch": 0.9034886938149257,
"grad_norm": 0.4978592097759247,
"learning_rate": 4.7512663087603826e-07,
"loss": 0.7621,
"step": 44900
},
{
"epoch": 0.9055009180773197,
"grad_norm": 0.5216113924980164,
"learning_rate": 4.5567807581546664e-07,
"loss": 0.7595,
"step": 45000
},
{
"epoch": 0.9055009180773197,
"eval_loss": 0.5181112885475159,
"eval_runtime": 11.5213,
"eval_samples_per_second": 32.809,
"eval_steps_per_second": 1.128,
"step": 45000
},
{
"epoch": 0.9075131423397138,
"grad_norm": 0.5027504563331604,
"learning_rate": 4.366266500195426e-07,
"loss": 0.7588,
"step": 45100
},
{
"epoch": 0.9095253666021078,
"grad_norm": 0.5365561842918396,
"learning_rate": 4.1797314619819285e-07,
"loss": 0.7612,
"step": 45200
},
{
"epoch": 0.9115375908645018,
"grad_norm": 0.5316836833953857,
"learning_rate": 3.997183405042238e-07,
"loss": 0.7639,
"step": 45300
},
{
"epoch": 0.9115375908645018,
"eval_loss": 0.5180224776268005,
"eval_runtime": 11.5144,
"eval_samples_per_second": 32.828,
"eval_steps_per_second": 1.129,
"step": 45300
},
{
"epoch": 0.9135498151268959,
"grad_norm": 0.5350984930992126,
"learning_rate": 3.8186299250103085e-07,
"loss": 0.7582,
"step": 45400
},
{
"epoch": 0.9155620393892899,
"grad_norm": 0.5509154796600342,
"learning_rate": 3.644078451309907e-07,
"loss": 0.7686,
"step": 45500
},
{
"epoch": 0.9175742636516839,
"grad_norm": 0.5419358611106873,
"learning_rate": 3.47353624684551e-07,
"loss": 0.762,
"step": 45600
},
{
"epoch": 0.9175742636516839,
"eval_loss": 0.5179212689399719,
"eval_runtime": 11.4423,
"eval_samples_per_second": 33.035,
"eval_steps_per_second": 1.136,
"step": 45600
},
{
"epoch": 0.919586487914078,
"grad_norm": 0.5258903503417969,
"learning_rate": 3.307010407700084e-07,
"loss": 0.7598,
"step": 45700
},
{
"epoch": 0.921598712176472,
"grad_norm": 0.519910454750061,
"learning_rate": 3.1445078628398294e-07,
"loss": 0.7589,
"step": 45800
},
{
"epoch": 0.9236109364388662,
"grad_norm": 0.5140842795372009,
"learning_rate": 2.986035373825902e-07,
"loss": 0.762,
"step": 45900
},
{
"epoch": 0.9236109364388662,
"eval_loss": 0.5178348422050476,
"eval_runtime": 11.4694,
"eval_samples_per_second": 32.957,
"eval_steps_per_second": 1.133,
"step": 45900
},
{
"epoch": 0.9256231607012602,
"grad_norm": 0.5274850726127625,
"learning_rate": 2.8315995345329804e-07,
"loss": 0.758,
"step": 46000
},
{
"epoch": 0.9276353849636542,
"grad_norm": 0.5443992018699646,
"learning_rate": 2.681206770875022e-07,
"loss": 0.7614,
"step": 46100
},
{
"epoch": 0.9296476092260483,
"grad_norm": 0.5250468254089355,
"learning_rate": 2.5348633405378296e-07,
"loss": 0.7666,
"step": 46200
},
{
"epoch": 0.9296476092260483,
"eval_loss": 0.5178038477897644,
"eval_runtime": 11.6986,
"eval_samples_per_second": 32.311,
"eval_steps_per_second": 1.111,
"step": 46200
},
{
"epoch": 0.9316598334884423,
"grad_norm": 0.5096211433410645,
"learning_rate": 2.392575332718627e-07,
"loss": 0.7697,
"step": 46300
},
{
"epoch": 0.9336720577508363,
"grad_norm": 0.549790620803833,
"learning_rate": 2.2543486678727855e-07,
"loss": 0.7676,
"step": 46400
},
{
"epoch": 0.9356842820132304,
"grad_norm": 0.524726152420044,
"learning_rate": 2.120189097467451e-07,
"loss": 0.7673,
"step": 46500
},
{
"epoch": 0.9356842820132304,
"eval_loss": 0.5176617503166199,
"eval_runtime": 11.5673,
"eval_samples_per_second": 32.678,
"eval_steps_per_second": 1.124,
"step": 46500
},
{
"epoch": 0.9376965062756244,
"grad_norm": 0.5029181838035583,
"learning_rate": 1.9901022037421723e-07,
"loss": 0.7642,
"step": 46600
},
{
"epoch": 0.9397087305380185,
"grad_norm": 0.5207979679107666,
"learning_rate": 1.8640933994767073e-07,
"loss": 0.7592,
"step": 46700
},
{
"epoch": 0.9417209548004125,
"grad_norm": 0.5468851923942566,
"learning_rate": 1.74216792776577e-07,
"loss": 0.7631,
"step": 46800
},
{
"epoch": 0.9417209548004125,
"eval_loss": 0.5177092552185059,
"eval_runtime": 11.4559,
"eval_samples_per_second": 32.996,
"eval_steps_per_second": 1.135,
"step": 46800
},
{
"epoch": 0.9437331790628065,
"grad_norm": 0.5044853091239929,
"learning_rate": 1.62433086180086e-07,
"loss": 0.7644,
"step": 46900
},
{
"epoch": 0.9457454033252006,
"grad_norm": 0.5245229005813599,
"learning_rate": 1.5105871046592e-07,
"loss": 0.7605,
"step": 47000
},
{
"epoch": 0.9477576275875946,
"grad_norm": 0.49839621782302856,
"learning_rate": 1.400941389099697e-07,
"loss": 0.7565,
"step": 47100
},
{
"epoch": 0.9477576275875946,
"eval_loss": 0.5176432132720947,
"eval_runtime": 11.5662,
"eval_samples_per_second": 32.681,
"eval_steps_per_second": 1.124,
"step": 47100
},
{
"epoch": 0.9497698518499886,
"grad_norm": 0.4973909556865692,
"learning_rate": 1.2953982773660223e-07,
"loss": 0.7656,
"step": 47200
},
{
"epoch": 0.9517820761123827,
"grad_norm": 0.5007102489471436,
"learning_rate": 1.1939621609968088e-07,
"loss": 0.7506,
"step": 47300
},
{
"epoch": 0.9537943003747767,
"grad_norm": 0.49358874559402466,
"learning_rate": 1.0966372606428855e-07,
"loss": 0.7562,
"step": 47400
},
{
"epoch": 0.9537943003747767,
"eval_loss": 0.5176478624343872,
"eval_runtime": 11.3727,
"eval_samples_per_second": 33.237,
"eval_steps_per_second": 1.143,
"step": 47400
},
{
"epoch": 0.9558065246371709,
"grad_norm": 0.5771644115447998,
"learning_rate": 1.0034276258916953e-07,
"loss": 0.766,
"step": 47500
},
{
"epoch": 0.9578187488995649,
"grad_norm": 0.5385919213294983,
"learning_rate": 9.14337135098764e-08,
"loss": 0.7605,
"step": 47600
},
{
"epoch": 0.9598309731619589,
"grad_norm": 0.5119192004203796,
"learning_rate": 8.293694952263286e-08,
"loss": 0.757,
"step": 47700
},
{
"epoch": 0.9598309731619589,
"eval_loss": 0.5176236033439636,
"eval_runtime": 11.3818,
"eval_samples_per_second": 33.211,
"eval_steps_per_second": 1.142,
"step": 47700
},
{
"epoch": 0.961843197424353,
"grad_norm": 0.5380053520202637,
"learning_rate": 7.485282416891393e-08,
"loss": 0.7574,
"step": 47800
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.5267532467842102,
"learning_rate": 6.718167382072983e-08,
"loss": 0.7668,
"step": 47900
},
{
"epoch": 0.965867645949141,
"grad_norm": 0.5199303030967712,
"learning_rate": 5.99238176666328e-08,
"loss": 0.756,
"step": 48000
},
{
"epoch": 0.965867645949141,
"eval_loss": 0.5175907015800476,
"eval_runtime": 11.4752,
"eval_samples_per_second": 32.941,
"eval_steps_per_second": 1.133,
"step": 48000
},
{
"epoch": 0.9678798702115351,
"grad_norm": 0.5405638217926025,
"learning_rate": 5.307955769843443e-08,
"loss": 0.7612,
"step": 48100
},
{
"epoch": 0.9698920944739291,
"grad_norm": 0.47063717246055603,
"learning_rate": 4.664917869864338e-08,
"loss": 0.7667,
"step": 48200
},
{
"epoch": 0.9719043187363232,
"grad_norm": 0.48465442657470703,
"learning_rate": 4.063294822861163e-08,
"loss": 0.7605,
"step": 48300
},
{
"epoch": 0.9719043187363232,
"eval_loss": 0.5175836682319641,
"eval_runtime": 11.3838,
"eval_samples_per_second": 33.205,
"eval_steps_per_second": 1.142,
"step": 48300
},
{
"epoch": 0.9739165429987172,
"grad_norm": 0.48423367738723755,
"learning_rate": 3.5031116617404435e-08,
"loss": 0.7574,
"step": 48400
},
{
"epoch": 0.9759287672611112,
"grad_norm": 0.5320655107498169,
"learning_rate": 2.9843916951382e-08,
"loss": 0.767,
"step": 48500
},
{
"epoch": 0.9779409915235053,
"grad_norm": 0.5267395377159119,
"learning_rate": 2.5071565064506143e-08,
"loss": 0.7593,
"step": 48600
},
{
"epoch": 0.9779409915235053,
"eval_loss": 0.5175591707229614,
"eval_runtime": 11.3501,
"eval_samples_per_second": 33.304,
"eval_steps_per_second": 1.145,
"step": 48600
},
{
"epoch": 0.9799532157858993,
"grad_norm": 0.514837920665741,
"learning_rate": 2.071425952934969e-08,
"loss": 0.7641,
"step": 48700
},
{
"epoch": 0.9819654400482933,
"grad_norm": 0.5345449447631836,
"learning_rate": 1.677218164884753e-08,
"loss": 0.7685,
"step": 48800
},
{
"epoch": 0.9839776643106874,
"grad_norm": 0.5339971780776978,
"learning_rate": 1.3245495448739321e-08,
"loss": 0.7612,
"step": 48900
},
{
"epoch": 0.9839776643106874,
"eval_loss": 0.5175919532775879,
"eval_runtime": 11.3829,
"eval_samples_per_second": 33.208,
"eval_steps_per_second": 1.142,
"step": 48900
},
{
"epoch": 0.9859898885730815,
"grad_norm": 0.49889686703681946,
"learning_rate": 1.013434767075605e-08,
"loss": 0.7692,
"step": 49000
},
{
"epoch": 0.9880021128354756,
"grad_norm": 0.5119482278823853,
"learning_rate": 7.438867766504931e-09,
"loss": 0.7578,
"step": 49100
},
{
"epoch": 0.9900143370978696,
"grad_norm": 0.5316244959831238,
"learning_rate": 5.159167892089256e-09,
"loss": 0.7568,
"step": 49200
},
{
"epoch": 0.9900143370978696,
"eval_loss": 0.5176030993461609,
"eval_runtime": 11.4046,
"eval_samples_per_second": 33.145,
"eval_steps_per_second": 1.14,
"step": 49200
},
{
"epoch": 0.9920265613602636,
"grad_norm": 7.261257648468018,
"learning_rate": 3.2953429034399133e-09,
"loss": 0.7576,
"step": 49300
},
{
"epoch": 0.9940387856226577,
"grad_norm": 0.48430758714675903,
"learning_rate": 1.847470352367431e-09,
"loss": 0.7577,
"step": 49400
},
{
"epoch": 0.9960510098850517,
"grad_norm": 0.4918181598186493,
"learning_rate": 8.156104833345613e-10,
"loss": 0.7649,
"step": 49500
},
{
"epoch": 0.9960510098850517,
"eval_loss": 0.5175663232803345,
"eval_runtime": 11.4598,
"eval_samples_per_second": 32.985,
"eval_steps_per_second": 1.134,
"step": 49500
},
{
"epoch": 0.9980632341474457,
"grad_norm": 0.5409220457077026,
"learning_rate": 1.9980623095494645e-10,
"loss": 0.7531,
"step": 49600
}
],
"logging_steps": 100,
"max_steps": 49697,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.166027593741658e+19,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}