mbert_urd-arab / trainer_state.json
DGurgurov's picture
Uploading checkpoint-96000 for mbert - urd-arab
a0d8ee1 verified
{
"best_metric": 0.6338008642196655,
"best_model_checkpoint": "./model_fine-tune/glot/mbert/urd-Arab/checkpoint-96000",
"epoch": 20.416843896214377,
"eval_steps": 500,
"global_step": 96000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.10633772862611654,
"grad_norm": 3.0985794067382812,
"learning_rate": 9.95e-05,
"loss": 1.6616,
"step": 500
},
{
"epoch": 0.10633772862611654,
"eval_accuracy": 0.7045292975858304,
"eval_loss": 1.4788062572479248,
"eval_runtime": 184.8638,
"eval_samples_per_second": 148.39,
"eval_steps_per_second": 4.641,
"step": 500
},
{
"epoch": 0.21267545725223308,
"grad_norm": 2.9091954231262207,
"learning_rate": 9.900000000000001e-05,
"loss": 1.4534,
"step": 1000
},
{
"epoch": 0.21267545725223308,
"eval_accuracy": 0.7242012199076614,
"eval_loss": 1.3632394075393677,
"eval_runtime": 186.0107,
"eval_samples_per_second": 147.475,
"eval_steps_per_second": 4.613,
"step": 1000
},
{
"epoch": 0.31901318587834965,
"grad_norm": 3.358151435852051,
"learning_rate": 9.850000000000001e-05,
"loss": 1.3783,
"step": 1500
},
{
"epoch": 0.31901318587834965,
"eval_accuracy": 0.7372946823579634,
"eval_loss": 1.2977707386016846,
"eval_runtime": 185.966,
"eval_samples_per_second": 147.511,
"eval_steps_per_second": 4.614,
"step": 1500
},
{
"epoch": 0.42535091450446616,
"grad_norm": 2.854158878326416,
"learning_rate": 9.8e-05,
"loss": 1.3043,
"step": 2000
},
{
"epoch": 0.42535091450446616,
"eval_accuracy": 0.7475462534858491,
"eval_loss": 1.2377636432647705,
"eval_runtime": 190.1291,
"eval_samples_per_second": 144.281,
"eval_steps_per_second": 4.513,
"step": 2000
},
{
"epoch": 0.5316886431305827,
"grad_norm": 2.8080673217773438,
"learning_rate": 9.75e-05,
"loss": 1.266,
"step": 2500
},
{
"epoch": 0.5316886431305827,
"eval_accuracy": 0.7544512379735423,
"eval_loss": 1.2024192810058594,
"eval_runtime": 187.1935,
"eval_samples_per_second": 146.544,
"eval_steps_per_second": 4.583,
"step": 2500
},
{
"epoch": 0.6380263717566993,
"grad_norm": 3.4922664165496826,
"learning_rate": 9.7e-05,
"loss": 1.2307,
"step": 3000
},
{
"epoch": 0.6380263717566993,
"eval_accuracy": 0.7603215651478341,
"eval_loss": 1.1685340404510498,
"eval_runtime": 185.8428,
"eval_samples_per_second": 147.609,
"eval_steps_per_second": 4.617,
"step": 3000
},
{
"epoch": 0.7443641003828159,
"grad_norm": 2.4908382892608643,
"learning_rate": 9.65e-05,
"loss": 1.1959,
"step": 3500
},
{
"epoch": 0.7443641003828159,
"eval_accuracy": 0.7657556526801927,
"eval_loss": 1.1382145881652832,
"eval_runtime": 186.72,
"eval_samples_per_second": 146.915,
"eval_steps_per_second": 4.595,
"step": 3500
},
{
"epoch": 0.8507018290089323,
"grad_norm": 2.883516311645508,
"learning_rate": 9.6e-05,
"loss": 1.1782,
"step": 4000
},
{
"epoch": 0.8507018290089323,
"eval_accuracy": 0.7699587383241813,
"eval_loss": 1.130513310432434,
"eval_runtime": 186.2137,
"eval_samples_per_second": 147.315,
"eval_steps_per_second": 4.608,
"step": 4000
},
{
"epoch": 0.9570395576350489,
"grad_norm": 2.5834362506866455,
"learning_rate": 9.55e-05,
"loss": 1.1451,
"step": 4500
},
{
"epoch": 0.9570395576350489,
"eval_accuracy": 0.7739563469660221,
"eval_loss": 1.0999319553375244,
"eval_runtime": 186.1476,
"eval_samples_per_second": 147.367,
"eval_steps_per_second": 4.609,
"step": 4500
},
{
"epoch": 1.0633772862611655,
"grad_norm": 2.7231101989746094,
"learning_rate": 9.5e-05,
"loss": 1.1261,
"step": 5000
},
{
"epoch": 1.0633772862611655,
"eval_accuracy": 0.7763694500850334,
"eval_loss": 1.0874555110931396,
"eval_runtime": 186.83,
"eval_samples_per_second": 146.829,
"eval_steps_per_second": 4.592,
"step": 5000
},
{
"epoch": 1.169715014887282,
"grad_norm": 3.062772512435913,
"learning_rate": 9.449999999999999e-05,
"loss": 1.1155,
"step": 5500
},
{
"epoch": 1.169715014887282,
"eval_accuracy": 0.7799436407812638,
"eval_loss": 1.0624312162399292,
"eval_runtime": 187.3404,
"eval_samples_per_second": 146.429,
"eval_steps_per_second": 4.58,
"step": 5500
},
{
"epoch": 1.2760527435133986,
"grad_norm": 2.577993631362915,
"learning_rate": 9.4e-05,
"loss": 1.0835,
"step": 6000
},
{
"epoch": 1.2760527435133986,
"eval_accuracy": 0.7829938347419719,
"eval_loss": 1.0484861135482788,
"eval_runtime": 186.5523,
"eval_samples_per_second": 147.047,
"eval_steps_per_second": 4.599,
"step": 6000
},
{
"epoch": 1.382390472139515,
"grad_norm": 2.816096544265747,
"learning_rate": 9.350000000000001e-05,
"loss": 1.0662,
"step": 6500
},
{
"epoch": 1.382390472139515,
"eval_accuracy": 0.7847991903133893,
"eval_loss": 1.0540517568588257,
"eval_runtime": 187.2662,
"eval_samples_per_second": 146.487,
"eval_steps_per_second": 4.582,
"step": 6500
},
{
"epoch": 1.4887282007656317,
"grad_norm": 2.4449338912963867,
"learning_rate": 9.300000000000001e-05,
"loss": 1.0537,
"step": 7000
},
{
"epoch": 1.4887282007656317,
"eval_accuracy": 0.7870963170316769,
"eval_loss": 1.0328177213668823,
"eval_runtime": 187.5096,
"eval_samples_per_second": 146.297,
"eval_steps_per_second": 4.576,
"step": 7000
},
{
"epoch": 1.5950659293917482,
"grad_norm": 2.4896557331085205,
"learning_rate": 9.250000000000001e-05,
"loss": 1.0489,
"step": 7500
},
{
"epoch": 1.5950659293917482,
"eval_accuracy": 0.7898641999964792,
"eval_loss": 1.0257970094680786,
"eval_runtime": 187.4659,
"eval_samples_per_second": 146.331,
"eval_steps_per_second": 4.577,
"step": 7500
},
{
"epoch": 1.7014036580178646,
"grad_norm": 4.281688690185547,
"learning_rate": 9.200000000000001e-05,
"loss": 1.0417,
"step": 8000
},
{
"epoch": 1.7014036580178646,
"eval_accuracy": 0.7918014683810514,
"eval_loss": 1.0114617347717285,
"eval_runtime": 185.6247,
"eval_samples_per_second": 147.782,
"eval_steps_per_second": 4.622,
"step": 8000
},
{
"epoch": 1.8077413866439813,
"grad_norm": 2.6903951168060303,
"learning_rate": 9.15e-05,
"loss": 1.0238,
"step": 8500
},
{
"epoch": 1.8077413866439813,
"eval_accuracy": 0.7941965557582847,
"eval_loss": 0.9940029978752136,
"eval_runtime": 185.6726,
"eval_samples_per_second": 147.744,
"eval_steps_per_second": 4.621,
"step": 8500
},
{
"epoch": 1.914079115270098,
"grad_norm": 2.6284191608428955,
"learning_rate": 9.1e-05,
"loss": 1.0261,
"step": 9000
},
{
"epoch": 1.914079115270098,
"eval_accuracy": 0.7950889770370101,
"eval_loss": 0.9995012283325195,
"eval_runtime": 187.0194,
"eval_samples_per_second": 146.68,
"eval_steps_per_second": 4.588,
"step": 9000
},
{
"epoch": 2.0204168438962142,
"grad_norm": 2.3006789684295654,
"learning_rate": 9.05e-05,
"loss": 1.0054,
"step": 9500
},
{
"epoch": 2.0204168438962142,
"eval_accuracy": 0.7962621919626219,
"eval_loss": 0.989319384098053,
"eval_runtime": 187.091,
"eval_samples_per_second": 146.624,
"eval_steps_per_second": 4.586,
"step": 9500
},
{
"epoch": 2.126754572522331,
"grad_norm": 2.3347225189208984,
"learning_rate": 9e-05,
"loss": 1.0012,
"step": 10000
},
{
"epoch": 2.126754572522331,
"eval_accuracy": 0.7998739212317131,
"eval_loss": 0.9665579199790955,
"eval_runtime": 187.267,
"eval_samples_per_second": 146.486,
"eval_steps_per_second": 4.582,
"step": 10000
},
{
"epoch": 2.2330923011484476,
"grad_norm": 2.7090580463409424,
"learning_rate": 8.950000000000001e-05,
"loss": 0.9855,
"step": 10500
},
{
"epoch": 2.2330923011484476,
"eval_accuracy": 0.8001766586026973,
"eval_loss": 0.9704384207725525,
"eval_runtime": 186.5023,
"eval_samples_per_second": 147.087,
"eval_steps_per_second": 4.6,
"step": 10500
},
{
"epoch": 2.339430029774564,
"grad_norm": 2.664764404296875,
"learning_rate": 8.900000000000001e-05,
"loss": 0.983,
"step": 11000
},
{
"epoch": 2.339430029774564,
"eval_accuracy": 0.8007109155537777,
"eval_loss": 0.9576543569564819,
"eval_runtime": 187.0054,
"eval_samples_per_second": 146.691,
"eval_steps_per_second": 4.588,
"step": 11000
},
{
"epoch": 2.4457677584006805,
"grad_norm": 2.3209338188171387,
"learning_rate": 8.850000000000001e-05,
"loss": 0.9661,
"step": 11500
},
{
"epoch": 2.4457677584006805,
"eval_accuracy": 0.8029339980865494,
"eval_loss": 0.9501250386238098,
"eval_runtime": 186.7101,
"eval_samples_per_second": 146.923,
"eval_steps_per_second": 4.595,
"step": 11500
},
{
"epoch": 2.552105487026797,
"grad_norm": 2.667185068130493,
"learning_rate": 8.800000000000001e-05,
"loss": 0.9661,
"step": 12000
},
{
"epoch": 2.552105487026797,
"eval_accuracy": 0.803158831479515,
"eval_loss": 0.9485698342323303,
"eval_runtime": 187.0511,
"eval_samples_per_second": 146.655,
"eval_steps_per_second": 4.587,
"step": 12000
},
{
"epoch": 2.6584432156529134,
"grad_norm": 2.3294034004211426,
"learning_rate": 8.75e-05,
"loss": 0.9635,
"step": 12500
},
{
"epoch": 2.6584432156529134,
"eval_accuracy": 0.8052219742658059,
"eval_loss": 0.936097264289856,
"eval_runtime": 187.1076,
"eval_samples_per_second": 146.611,
"eval_steps_per_second": 4.586,
"step": 12500
},
{
"epoch": 2.76478094427903,
"grad_norm": 2.4288761615753174,
"learning_rate": 8.7e-05,
"loss": 0.9564,
"step": 13000
},
{
"epoch": 2.76478094427903,
"eval_accuracy": 0.8063745585697996,
"eval_loss": 0.9337704181671143,
"eval_runtime": 187.0756,
"eval_samples_per_second": 146.636,
"eval_steps_per_second": 4.586,
"step": 13000
},
{
"epoch": 2.8711186729051468,
"grad_norm": 2.7012951374053955,
"learning_rate": 8.65e-05,
"loss": 0.9461,
"step": 13500
},
{
"epoch": 2.8711186729051468,
"eval_accuracy": 0.8086281139879846,
"eval_loss": 0.9229084253311157,
"eval_runtime": 186.8255,
"eval_samples_per_second": 146.832,
"eval_steps_per_second": 4.593,
"step": 13500
},
{
"epoch": 2.9774564015312635,
"grad_norm": 2.791017532348633,
"learning_rate": 8.6e-05,
"loss": 0.9397,
"step": 14000
},
{
"epoch": 2.9774564015312635,
"eval_accuracy": 0.8081856611194737,
"eval_loss": 0.9245437383651733,
"eval_runtime": 188.1945,
"eval_samples_per_second": 145.764,
"eval_steps_per_second": 4.559,
"step": 14000
},
{
"epoch": 3.0837941301573797,
"grad_norm": 2.684272050857544,
"learning_rate": 8.55e-05,
"loss": 0.9264,
"step": 14500
},
{
"epoch": 3.0837941301573797,
"eval_accuracy": 0.8095467896972031,
"eval_loss": 0.9195280075073242,
"eval_runtime": 188.4052,
"eval_samples_per_second": 145.601,
"eval_steps_per_second": 4.554,
"step": 14500
},
{
"epoch": 3.1901318587834964,
"grad_norm": 2.159348726272583,
"learning_rate": 8.5e-05,
"loss": 0.9204,
"step": 15000
},
{
"epoch": 3.1901318587834964,
"eval_accuracy": 0.8105982920504011,
"eval_loss": 0.9083139300346375,
"eval_runtime": 186.9331,
"eval_samples_per_second": 146.748,
"eval_steps_per_second": 4.59,
"step": 15000
},
{
"epoch": 3.296469587409613,
"grad_norm": 2.2706503868103027,
"learning_rate": 8.450000000000001e-05,
"loss": 0.9213,
"step": 15500
},
{
"epoch": 3.296469587409613,
"eval_accuracy": 0.8117506926014157,
"eval_loss": 0.9083714485168457,
"eval_runtime": 186.0993,
"eval_samples_per_second": 147.405,
"eval_steps_per_second": 4.61,
"step": 15500
},
{
"epoch": 3.4028073160357293,
"grad_norm": 2.560739755630493,
"learning_rate": 8.4e-05,
"loss": 0.9193,
"step": 16000
},
{
"epoch": 3.4028073160357293,
"eval_accuracy": 0.8120885199882083,
"eval_loss": 0.9039110541343689,
"eval_runtime": 187.0648,
"eval_samples_per_second": 146.644,
"eval_steps_per_second": 4.587,
"step": 16000
},
{
"epoch": 3.509145044661846,
"grad_norm": 2.2117762565612793,
"learning_rate": 8.35e-05,
"loss": 0.9127,
"step": 16500
},
{
"epoch": 3.509145044661846,
"eval_accuracy": 0.8133210445095941,
"eval_loss": 0.8947284817695618,
"eval_runtime": 187.0387,
"eval_samples_per_second": 146.665,
"eval_steps_per_second": 4.587,
"step": 16500
},
{
"epoch": 3.6154827732879626,
"grad_norm": 7.251231670379639,
"learning_rate": 8.3e-05,
"loss": 0.9069,
"step": 17000
},
{
"epoch": 3.6154827732879626,
"eval_accuracy": 0.8143866647342304,
"eval_loss": 0.8961222767829895,
"eval_runtime": 186.0212,
"eval_samples_per_second": 147.467,
"eval_steps_per_second": 4.612,
"step": 17000
},
{
"epoch": 3.7218205019140793,
"grad_norm": 2.4916841983795166,
"learning_rate": 8.25e-05,
"loss": 0.9011,
"step": 17500
},
{
"epoch": 3.7218205019140793,
"eval_accuracy": 0.8143561971265061,
"eval_loss": 0.8977431058883667,
"eval_runtime": 186.782,
"eval_samples_per_second": 146.866,
"eval_steps_per_second": 4.594,
"step": 17500
},
{
"epoch": 3.8281582305401956,
"grad_norm": 2.5640275478363037,
"learning_rate": 8.2e-05,
"loss": 0.9044,
"step": 18000
},
{
"epoch": 3.8281582305401956,
"eval_accuracy": 0.8150782896033226,
"eval_loss": 0.8876448273658752,
"eval_runtime": 187.6582,
"eval_samples_per_second": 146.181,
"eval_steps_per_second": 4.572,
"step": 18000
},
{
"epoch": 3.9344959591663122,
"grad_norm": 2.1754488945007324,
"learning_rate": 8.15e-05,
"loss": 0.8889,
"step": 18500
},
{
"epoch": 3.9344959591663122,
"eval_accuracy": 0.8168180011691912,
"eval_loss": 0.8829460740089417,
"eval_runtime": 187.8254,
"eval_samples_per_second": 146.051,
"eval_steps_per_second": 4.568,
"step": 18500
},
{
"epoch": 4.0408336877924285,
"grad_norm": 2.2828450202941895,
"learning_rate": 8.1e-05,
"loss": 0.8781,
"step": 19000
},
{
"epoch": 4.0408336877924285,
"eval_accuracy": 0.8172543715113816,
"eval_loss": 0.881291925907135,
"eval_runtime": 186.9833,
"eval_samples_per_second": 146.708,
"eval_steps_per_second": 4.589,
"step": 19000
},
{
"epoch": 4.147171416418545,
"grad_norm": 2.2914931774139404,
"learning_rate": 8.05e-05,
"loss": 0.8776,
"step": 19500
},
{
"epoch": 4.147171416418545,
"eval_accuracy": 0.8177812507969122,
"eval_loss": 0.8755871057510376,
"eval_runtime": 187.0401,
"eval_samples_per_second": 146.664,
"eval_steps_per_second": 4.587,
"step": 19500
},
{
"epoch": 4.253509145044662,
"grad_norm": 2.8364596366882324,
"learning_rate": 8e-05,
"loss": 0.8751,
"step": 20000
},
{
"epoch": 4.253509145044662,
"eval_accuracy": 0.818812246001166,
"eval_loss": 0.8739203214645386,
"eval_runtime": 187.8224,
"eval_samples_per_second": 146.053,
"eval_steps_per_second": 4.568,
"step": 20000
},
{
"epoch": 4.3598468736707785,
"grad_norm": 2.4278838634490967,
"learning_rate": 7.950000000000001e-05,
"loss": 0.8799,
"step": 20500
},
{
"epoch": 4.3598468736707785,
"eval_accuracy": 0.8195426064288864,
"eval_loss": 0.87138432264328,
"eval_runtime": 188.1286,
"eval_samples_per_second": 145.815,
"eval_steps_per_second": 4.561,
"step": 20500
},
{
"epoch": 4.466184602296895,
"grad_norm": 2.5234789848327637,
"learning_rate": 7.900000000000001e-05,
"loss": 0.8731,
"step": 21000
},
{
"epoch": 4.466184602296895,
"eval_accuracy": 0.8210525102479217,
"eval_loss": 0.8586520552635193,
"eval_runtime": 187.1495,
"eval_samples_per_second": 146.578,
"eval_steps_per_second": 4.585,
"step": 21000
},
{
"epoch": 4.572522330923012,
"grad_norm": 2.47117018699646,
"learning_rate": 7.850000000000001e-05,
"loss": 0.8625,
"step": 21500
},
{
"epoch": 4.572522330923012,
"eval_accuracy": 0.8216630966191832,
"eval_loss": 0.8592851758003235,
"eval_runtime": 186.2152,
"eval_samples_per_second": 147.313,
"eval_steps_per_second": 4.608,
"step": 21500
},
{
"epoch": 4.678860059549128,
"grad_norm": 2.6229896545410156,
"learning_rate": 7.800000000000001e-05,
"loss": 0.8632,
"step": 22000
},
{
"epoch": 4.678860059549128,
"eval_accuracy": 0.8215383999990303,
"eval_loss": 0.8571019768714905,
"eval_runtime": 188.0905,
"eval_samples_per_second": 145.845,
"eval_steps_per_second": 4.562,
"step": 22000
},
{
"epoch": 4.785197788175244,
"grad_norm": 2.1132686138153076,
"learning_rate": 7.75e-05,
"loss": 0.8638,
"step": 22500
},
{
"epoch": 4.785197788175244,
"eval_accuracy": 0.8227068815093186,
"eval_loss": 0.8464910387992859,
"eval_runtime": 188.695,
"eval_samples_per_second": 145.377,
"eval_steps_per_second": 4.547,
"step": 22500
},
{
"epoch": 4.891535516801361,
"grad_norm": 2.4239864349365234,
"learning_rate": 7.7e-05,
"loss": 0.8538,
"step": 23000
},
{
"epoch": 4.891535516801361,
"eval_accuracy": 0.8232777279942184,
"eval_loss": 0.8469775915145874,
"eval_runtime": 186.3063,
"eval_samples_per_second": 147.241,
"eval_steps_per_second": 4.605,
"step": 23000
},
{
"epoch": 4.997873245427478,
"grad_norm": 2.2336645126342773,
"learning_rate": 7.65e-05,
"loss": 0.8541,
"step": 23500
},
{
"epoch": 4.997873245427478,
"eval_accuracy": 0.8230626489512016,
"eval_loss": 0.8494424819946289,
"eval_runtime": 187.1494,
"eval_samples_per_second": 146.578,
"eval_steps_per_second": 4.585,
"step": 23500
},
{
"epoch": 5.104210974053594,
"grad_norm": 2.3965516090393066,
"learning_rate": 7.6e-05,
"loss": 0.841,
"step": 24000
},
{
"epoch": 5.104210974053594,
"eval_accuracy": 0.8237575480873727,
"eval_loss": 0.8488872051239014,
"eval_runtime": 187.1345,
"eval_samples_per_second": 146.59,
"eval_steps_per_second": 4.585,
"step": 24000
},
{
"epoch": 5.210548702679711,
"grad_norm": 2.575192451477051,
"learning_rate": 7.55e-05,
"loss": 0.8402,
"step": 24500
},
{
"epoch": 5.210548702679711,
"eval_accuracy": 0.8252009630693954,
"eval_loss": 0.8397406339645386,
"eval_runtime": 187.5948,
"eval_samples_per_second": 146.23,
"eval_steps_per_second": 4.574,
"step": 24500
},
{
"epoch": 5.316886431305828,
"grad_norm": 2.234575033187866,
"learning_rate": 7.500000000000001e-05,
"loss": 0.8363,
"step": 25000
},
{
"epoch": 5.316886431305828,
"eval_accuracy": 0.8249671018137493,
"eval_loss": 0.8448637127876282,
"eval_runtime": 186.3423,
"eval_samples_per_second": 147.213,
"eval_steps_per_second": 4.604,
"step": 25000
},
{
"epoch": 5.4232241599319435,
"grad_norm": 2.2523510456085205,
"learning_rate": 7.450000000000001e-05,
"loss": 0.8336,
"step": 25500
},
{
"epoch": 5.4232241599319435,
"eval_accuracy": 0.8261139098550124,
"eval_loss": 0.8395271897315979,
"eval_runtime": 187.766,
"eval_samples_per_second": 146.097,
"eval_steps_per_second": 4.57,
"step": 25500
},
{
"epoch": 5.52956188855806,
"grad_norm": 2.5487735271453857,
"learning_rate": 7.4e-05,
"loss": 0.8369,
"step": 26000
},
{
"epoch": 5.52956188855806,
"eval_accuracy": 0.826535426004618,
"eval_loss": 0.8330492377281189,
"eval_runtime": 189.1247,
"eval_samples_per_second": 145.047,
"eval_steps_per_second": 4.537,
"step": 26000
},
{
"epoch": 5.635899617184177,
"grad_norm": 2.080281972885132,
"learning_rate": 7.35e-05,
"loss": 0.8365,
"step": 26500
},
{
"epoch": 5.635899617184177,
"eval_accuracy": 0.8272166865191455,
"eval_loss": 0.8262215852737427,
"eval_runtime": 187.6646,
"eval_samples_per_second": 146.176,
"eval_steps_per_second": 4.572,
"step": 26500
},
{
"epoch": 5.7422373458102935,
"grad_norm": 2.453874111175537,
"learning_rate": 7.3e-05,
"loss": 0.8221,
"step": 27000
},
{
"epoch": 5.7422373458102935,
"eval_accuracy": 0.8277079258525418,
"eval_loss": 0.8285869359970093,
"eval_runtime": 188.2189,
"eval_samples_per_second": 145.745,
"eval_steps_per_second": 4.559,
"step": 27000
},
{
"epoch": 5.84857507443641,
"grad_norm": 2.587031841278076,
"learning_rate": 7.25e-05,
"loss": 0.8235,
"step": 27500
},
{
"epoch": 5.84857507443641,
"eval_accuracy": 0.828009316952864,
"eval_loss": 0.8248396515846252,
"eval_runtime": 186.6625,
"eval_samples_per_second": 146.96,
"eval_steps_per_second": 4.597,
"step": 27500
},
{
"epoch": 5.954912803062527,
"grad_norm": 2.36771297454834,
"learning_rate": 7.2e-05,
"loss": 0.8269,
"step": 28000
},
{
"epoch": 5.954912803062527,
"eval_accuracy": 0.8290436079701959,
"eval_loss": 0.8144590854644775,
"eval_runtime": 187.3225,
"eval_samples_per_second": 146.443,
"eval_steps_per_second": 4.58,
"step": 28000
},
{
"epoch": 6.061250531688643,
"grad_norm": 2.362884044647217,
"learning_rate": 7.15e-05,
"loss": 0.8147,
"step": 28500
},
{
"epoch": 6.061250531688643,
"eval_accuracy": 0.829482515450688,
"eval_loss": 0.8172268271446228,
"eval_runtime": 185.8934,
"eval_samples_per_second": 147.568,
"eval_steps_per_second": 4.616,
"step": 28500
},
{
"epoch": 6.167588260314759,
"grad_norm": 2.7787272930145264,
"learning_rate": 7.1e-05,
"loss": 0.809,
"step": 29000
},
{
"epoch": 6.167588260314759,
"eval_accuracy": 0.8289893847861752,
"eval_loss": 0.8245209455490112,
"eval_runtime": 185.671,
"eval_samples_per_second": 147.745,
"eval_steps_per_second": 4.621,
"step": 29000
},
{
"epoch": 6.273925988940876,
"grad_norm": 2.2524912357330322,
"learning_rate": 7.05e-05,
"loss": 0.8136,
"step": 29500
},
{
"epoch": 6.273925988940876,
"eval_accuracy": 0.8299085027558315,
"eval_loss": 0.8161247372627258,
"eval_runtime": 187.178,
"eval_samples_per_second": 146.556,
"eval_steps_per_second": 4.584,
"step": 29500
},
{
"epoch": 6.380263717566993,
"grad_norm": 2.0518226623535156,
"learning_rate": 7e-05,
"loss": 0.8097,
"step": 30000
},
{
"epoch": 6.380263717566993,
"eval_accuracy": 0.8302947709868315,
"eval_loss": 0.8094373941421509,
"eval_runtime": 188.6751,
"eval_samples_per_second": 145.393,
"eval_steps_per_second": 4.548,
"step": 30000
},
{
"epoch": 6.486601446193109,
"grad_norm": 2.2898824214935303,
"learning_rate": 6.95e-05,
"loss": 0.8027,
"step": 30500
},
{
"epoch": 6.486601446193109,
"eval_accuracy": 0.831080552836758,
"eval_loss": 0.8069068789482117,
"eval_runtime": 187.7392,
"eval_samples_per_second": 146.118,
"eval_steps_per_second": 4.57,
"step": 30500
},
{
"epoch": 6.592939174819226,
"grad_norm": 2.191114902496338,
"learning_rate": 6.9e-05,
"loss": 0.8034,
"step": 31000
},
{
"epoch": 6.592939174819226,
"eval_accuracy": 0.8322840004200974,
"eval_loss": 0.7996180057525635,
"eval_runtime": 186.9585,
"eval_samples_per_second": 146.728,
"eval_steps_per_second": 4.589,
"step": 31000
},
{
"epoch": 6.699276903445343,
"grad_norm": 2.166645050048828,
"learning_rate": 6.850000000000001e-05,
"loss": 0.8014,
"step": 31500
},
{
"epoch": 6.699276903445343,
"eval_accuracy": 0.8322805736104725,
"eval_loss": 0.8084205389022827,
"eval_runtime": 187.1325,
"eval_samples_per_second": 146.591,
"eval_steps_per_second": 4.585,
"step": 31500
},
{
"epoch": 6.805614632071459,
"grad_norm": 2.5388972759246826,
"learning_rate": 6.800000000000001e-05,
"loss": 0.7925,
"step": 32000
},
{
"epoch": 6.805614632071459,
"eval_accuracy": 0.8326493216513361,
"eval_loss": 0.7997660040855408,
"eval_runtime": 186.3753,
"eval_samples_per_second": 147.187,
"eval_steps_per_second": 4.604,
"step": 32000
},
{
"epoch": 6.911952360697575,
"grad_norm": 2.3604350090026855,
"learning_rate": 6.750000000000001e-05,
"loss": 0.8069,
"step": 32500
},
{
"epoch": 6.911952360697575,
"eval_accuracy": 0.8327787324739044,
"eval_loss": 0.7998002767562866,
"eval_runtime": 186.0892,
"eval_samples_per_second": 147.413,
"eval_steps_per_second": 4.611,
"step": 32500
},
{
"epoch": 7.018290089323692,
"grad_norm": 2.3000738620758057,
"learning_rate": 6.7e-05,
"loss": 0.7916,
"step": 33000
},
{
"epoch": 7.018290089323692,
"eval_accuracy": 0.8333142335688971,
"eval_loss": 0.7988072037696838,
"eval_runtime": 187.1327,
"eval_samples_per_second": 146.591,
"eval_steps_per_second": 4.585,
"step": 33000
},
{
"epoch": 7.124627817949809,
"grad_norm": 2.031378746032715,
"learning_rate": 6.65e-05,
"loss": 0.7796,
"step": 33500
},
{
"epoch": 7.124627817949809,
"eval_accuracy": 0.834044136797238,
"eval_loss": 0.7932332754135132,
"eval_runtime": 186.7728,
"eval_samples_per_second": 146.874,
"eval_steps_per_second": 4.594,
"step": 33500
},
{
"epoch": 7.230965546575925,
"grad_norm": 2.151765823364258,
"learning_rate": 6.6e-05,
"loss": 0.783,
"step": 34000
},
{
"epoch": 7.230965546575925,
"eval_accuracy": 0.833573114410175,
"eval_loss": 0.7935149073600769,
"eval_runtime": 188.5605,
"eval_samples_per_second": 145.481,
"eval_steps_per_second": 4.55,
"step": 34000
},
{
"epoch": 7.337303275202042,
"grad_norm": 2.1417930126190186,
"learning_rate": 6.55e-05,
"loss": 0.785,
"step": 34500
},
{
"epoch": 7.337303275202042,
"eval_accuracy": 0.8342695371268045,
"eval_loss": 0.7929127216339111,
"eval_runtime": 188.5148,
"eval_samples_per_second": 145.516,
"eval_steps_per_second": 4.551,
"step": 34500
},
{
"epoch": 7.443641003828159,
"grad_norm": 1.9593024253845215,
"learning_rate": 6.500000000000001e-05,
"loss": 0.7849,
"step": 35000
},
{
"epoch": 7.443641003828159,
"eval_accuracy": 0.8352518766512985,
"eval_loss": 0.7853254675865173,
"eval_runtime": 187.2799,
"eval_samples_per_second": 146.476,
"eval_steps_per_second": 4.581,
"step": 35000
},
{
"epoch": 7.549978732454274,
"grad_norm": 2.447838544845581,
"learning_rate": 6.450000000000001e-05,
"loss": 0.7791,
"step": 35500
},
{
"epoch": 7.549978732454274,
"eval_accuracy": 0.8354121212121212,
"eval_loss": 0.7829086184501648,
"eval_runtime": 187.3697,
"eval_samples_per_second": 146.406,
"eval_steps_per_second": 4.579,
"step": 35500
},
{
"epoch": 7.656316461080391,
"grad_norm": 2.3611719608306885,
"learning_rate": 6.400000000000001e-05,
"loss": 0.772,
"step": 36000
},
{
"epoch": 7.656316461080391,
"eval_accuracy": 0.8359586422824412,
"eval_loss": 0.7856774926185608,
"eval_runtime": 187.3584,
"eval_samples_per_second": 146.415,
"eval_steps_per_second": 4.579,
"step": 36000
},
{
"epoch": 7.762654189706508,
"grad_norm": 2.6035830974578857,
"learning_rate": 6.35e-05,
"loss": 0.7725,
"step": 36500
},
{
"epoch": 7.762654189706508,
"eval_accuracy": 0.8363098633948102,
"eval_loss": 0.7833205461502075,
"eval_runtime": 187.0804,
"eval_samples_per_second": 146.632,
"eval_steps_per_second": 4.586,
"step": 36500
},
{
"epoch": 7.8689919183326245,
"grad_norm": 2.1747405529022217,
"learning_rate": 6.3e-05,
"loss": 0.7726,
"step": 37000
},
{
"epoch": 7.8689919183326245,
"eval_accuracy": 0.8363886951914692,
"eval_loss": 0.7823408842086792,
"eval_runtime": 187.2496,
"eval_samples_per_second": 146.5,
"eval_steps_per_second": 4.582,
"step": 37000
},
{
"epoch": 7.975329646958741,
"grad_norm": 2.4442336559295654,
"learning_rate": 6.25e-05,
"loss": 0.7706,
"step": 37500
},
{
"epoch": 7.975329646958741,
"eval_accuracy": 0.8373462373755904,
"eval_loss": 0.7759196758270264,
"eval_runtime": 187.0689,
"eval_samples_per_second": 146.641,
"eval_steps_per_second": 4.587,
"step": 37500
},
{
"epoch": 8.081667375584857,
"grad_norm": 2.2232158184051514,
"learning_rate": 6.2e-05,
"loss": 0.7681,
"step": 38000
},
{
"epoch": 8.081667375584857,
"eval_accuracy": 0.8376378883351716,
"eval_loss": 0.773087203502655,
"eval_runtime": 185.8545,
"eval_samples_per_second": 147.599,
"eval_steps_per_second": 4.617,
"step": 38000
},
{
"epoch": 8.188005104210974,
"grad_norm": 2.019041061401367,
"learning_rate": 6.15e-05,
"loss": 0.7598,
"step": 38500
},
{
"epoch": 8.188005104210974,
"eval_accuracy": 0.8383952690813153,
"eval_loss": 0.7797868847846985,
"eval_runtime": 187.2699,
"eval_samples_per_second": 146.484,
"eval_steps_per_second": 4.582,
"step": 38500
},
{
"epoch": 8.29434283283709,
"grad_norm": 2.744246482849121,
"learning_rate": 6.1e-05,
"loss": 0.7585,
"step": 39000
},
{
"epoch": 8.29434283283709,
"eval_accuracy": 0.8387710202737253,
"eval_loss": 0.77022385597229,
"eval_runtime": 185.74,
"eval_samples_per_second": 147.69,
"eval_steps_per_second": 4.619,
"step": 39000
},
{
"epoch": 8.400680561463208,
"grad_norm": 2.3689775466918945,
"learning_rate": 6.05e-05,
"loss": 0.7567,
"step": 39500
},
{
"epoch": 8.400680561463208,
"eval_accuracy": 0.8392269122978646,
"eval_loss": 0.7701305747032166,
"eval_runtime": 187.3057,
"eval_samples_per_second": 146.456,
"eval_steps_per_second": 4.581,
"step": 39500
},
{
"epoch": 8.507018290089324,
"grad_norm": 2.43866229057312,
"learning_rate": 6e-05,
"loss": 0.7549,
"step": 40000
},
{
"epoch": 8.507018290089324,
"eval_accuracy": 0.8386237512526904,
"eval_loss": 0.7731093168258667,
"eval_runtime": 186.3871,
"eval_samples_per_second": 147.178,
"eval_steps_per_second": 4.603,
"step": 40000
},
{
"epoch": 8.61335601871544,
"grad_norm": 2.1197094917297363,
"learning_rate": 5.95e-05,
"loss": 0.7566,
"step": 40500
},
{
"epoch": 8.61335601871544,
"eval_accuracy": 0.8395869011774794,
"eval_loss": 0.7649192214012146,
"eval_runtime": 187.8414,
"eval_samples_per_second": 146.038,
"eval_steps_per_second": 4.568,
"step": 40500
},
{
"epoch": 8.719693747341557,
"grad_norm": 2.322613000869751,
"learning_rate": 5.9e-05,
"loss": 0.7556,
"step": 41000
},
{
"epoch": 8.719693747341557,
"eval_accuracy": 0.8396322982752535,
"eval_loss": 0.7655606865882874,
"eval_runtime": 187.4692,
"eval_samples_per_second": 146.328,
"eval_steps_per_second": 4.577,
"step": 41000
},
{
"epoch": 8.826031475967673,
"grad_norm": 2.060520887374878,
"learning_rate": 5.85e-05,
"loss": 0.7477,
"step": 41500
},
{
"epoch": 8.826031475967673,
"eval_accuracy": 0.8404079064518989,
"eval_loss": 0.7558547258377075,
"eval_runtime": 187.201,
"eval_samples_per_second": 146.538,
"eval_steps_per_second": 4.583,
"step": 41500
},
{
"epoch": 8.93236920459379,
"grad_norm": 2.261857509613037,
"learning_rate": 5.8e-05,
"loss": 0.7438,
"step": 42000
},
{
"epoch": 8.93236920459379,
"eval_accuracy": 0.84021015485733,
"eval_loss": 0.7601897120475769,
"eval_runtime": 187.2973,
"eval_samples_per_second": 146.462,
"eval_steps_per_second": 4.581,
"step": 42000
},
{
"epoch": 9.038706933219906,
"grad_norm": 2.352423906326294,
"learning_rate": 5.7499999999999995e-05,
"loss": 0.7454,
"step": 42500
},
{
"epoch": 9.038706933219906,
"eval_accuracy": 0.8411417281077362,
"eval_loss": 0.7569445371627808,
"eval_runtime": 188.8431,
"eval_samples_per_second": 145.263,
"eval_steps_per_second": 4.543,
"step": 42500
},
{
"epoch": 9.145044661846024,
"grad_norm": 2.5105371475219727,
"learning_rate": 5.6999999999999996e-05,
"loss": 0.7441,
"step": 43000
},
{
"epoch": 9.145044661846024,
"eval_accuracy": 0.8420837788872061,
"eval_loss": 0.751946210861206,
"eval_runtime": 187.964,
"eval_samples_per_second": 145.943,
"eval_steps_per_second": 4.565,
"step": 43000
},
{
"epoch": 9.25138239047214,
"grad_norm": 2.747666835784912,
"learning_rate": 5.65e-05,
"loss": 0.7391,
"step": 43500
},
{
"epoch": 9.25138239047214,
"eval_accuracy": 0.8416609664144786,
"eval_loss": 0.7541568279266357,
"eval_runtime": 187.0727,
"eval_samples_per_second": 146.638,
"eval_steps_per_second": 4.586,
"step": 43500
},
{
"epoch": 9.357720119098255,
"grad_norm": 2.229844093322754,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.7345,
"step": 44000
},
{
"epoch": 9.357720119098255,
"eval_accuracy": 0.8417848056902848,
"eval_loss": 0.758493185043335,
"eval_runtime": 187.0332,
"eval_samples_per_second": 146.669,
"eval_steps_per_second": 4.587,
"step": 44000
},
{
"epoch": 9.464057847724373,
"grad_norm": 2.2251367568969727,
"learning_rate": 5.550000000000001e-05,
"loss": 0.7381,
"step": 44500
},
{
"epoch": 9.464057847724373,
"eval_accuracy": 0.8423865712157858,
"eval_loss": 0.7545250654220581,
"eval_runtime": 185.7507,
"eval_samples_per_second": 147.682,
"eval_steps_per_second": 4.619,
"step": 44500
},
{
"epoch": 9.570395576350489,
"grad_norm": 2.3922371864318848,
"learning_rate": 5.500000000000001e-05,
"loss": 0.7385,
"step": 45000
},
{
"epoch": 9.570395576350489,
"eval_accuracy": 0.8431280219033379,
"eval_loss": 0.7468847632408142,
"eval_runtime": 187.1862,
"eval_samples_per_second": 146.549,
"eval_steps_per_second": 4.584,
"step": 45000
},
{
"epoch": 9.676733304976606,
"grad_norm": 2.024217367172241,
"learning_rate": 5.45e-05,
"loss": 0.7319,
"step": 45500
},
{
"epoch": 9.676733304976606,
"eval_accuracy": 0.8432375296998541,
"eval_loss": 0.7428926229476929,
"eval_runtime": 185.6653,
"eval_samples_per_second": 147.75,
"eval_steps_per_second": 4.621,
"step": 45500
},
{
"epoch": 9.783071033602722,
"grad_norm": 2.946664571762085,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.7312,
"step": 46000
},
{
"epoch": 9.783071033602722,
"eval_accuracy": 0.8432175410960274,
"eval_loss": 0.7447838187217712,
"eval_runtime": 187.318,
"eval_samples_per_second": 146.446,
"eval_steps_per_second": 4.58,
"step": 46000
},
{
"epoch": 9.88940876222884,
"grad_norm": 2.362518787384033,
"learning_rate": 5.3500000000000006e-05,
"loss": 0.7264,
"step": 46500
},
{
"epoch": 9.88940876222884,
"eval_accuracy": 0.8436135597948867,
"eval_loss": 0.7432146668434143,
"eval_runtime": 189.1721,
"eval_samples_per_second": 145.011,
"eval_steps_per_second": 4.536,
"step": 46500
},
{
"epoch": 9.995746490854955,
"grad_norm": 2.3549137115478516,
"learning_rate": 5.300000000000001e-05,
"loss": 0.73,
"step": 47000
},
{
"epoch": 9.995746490854955,
"eval_accuracy": 0.8444893924139688,
"eval_loss": 0.7410290837287903,
"eval_runtime": 189.0361,
"eval_samples_per_second": 145.115,
"eval_steps_per_second": 4.539,
"step": 47000
},
{
"epoch": 10.102084219481071,
"grad_norm": 2.3257205486297607,
"learning_rate": 5.25e-05,
"loss": 0.7208,
"step": 47500
},
{
"epoch": 10.102084219481071,
"eval_accuracy": 0.8442721638196152,
"eval_loss": 0.7432807087898254,
"eval_runtime": 187.0905,
"eval_samples_per_second": 146.624,
"eval_steps_per_second": 4.586,
"step": 47500
},
{
"epoch": 10.208421948107189,
"grad_norm": 2.081179618835449,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.7184,
"step": 48000
},
{
"epoch": 10.208421948107189,
"eval_accuracy": 0.8441397724142315,
"eval_loss": 0.7414455413818359,
"eval_runtime": 187.245,
"eval_samples_per_second": 146.503,
"eval_steps_per_second": 4.582,
"step": 48000
},
{
"epoch": 10.314759676733305,
"grad_norm": 2.1536877155303955,
"learning_rate": 5.1500000000000005e-05,
"loss": 0.7197,
"step": 48500
},
{
"epoch": 10.314759676733305,
"eval_accuracy": 0.8453421523524357,
"eval_loss": 0.7430285215377808,
"eval_runtime": 186.8496,
"eval_samples_per_second": 146.813,
"eval_steps_per_second": 4.592,
"step": 48500
},
{
"epoch": 10.421097405359422,
"grad_norm": 2.0332043170928955,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.7216,
"step": 49000
},
{
"epoch": 10.421097405359422,
"eval_accuracy": 0.8457369996973606,
"eval_loss": 0.735456645488739,
"eval_runtime": 186.981,
"eval_samples_per_second": 146.71,
"eval_steps_per_second": 4.589,
"step": 49000
},
{
"epoch": 10.527435133985538,
"grad_norm": 2.408484935760498,
"learning_rate": 5.05e-05,
"loss": 0.7136,
"step": 49500
},
{
"epoch": 10.527435133985538,
"eval_accuracy": 0.844874850108529,
"eval_loss": 0.7428346872329712,
"eval_runtime": 187.2438,
"eval_samples_per_second": 146.504,
"eval_steps_per_second": 4.582,
"step": 49500
},
{
"epoch": 10.633772862611655,
"grad_norm": 2.188323497772217,
"learning_rate": 5e-05,
"loss": 0.7181,
"step": 50000
},
{
"epoch": 10.633772862611655,
"eval_accuracy": 0.8461618686867156,
"eval_loss": 0.7299309372901917,
"eval_runtime": 186.3046,
"eval_samples_per_second": 147.243,
"eval_steps_per_second": 4.605,
"step": 50000
},
{
"epoch": 10.740110591237771,
"grad_norm": 2.0275685787200928,
"learning_rate": 4.9500000000000004e-05,
"loss": 0.7146,
"step": 50500
},
{
"epoch": 10.740110591237771,
"eval_accuracy": 0.8468640226233667,
"eval_loss": 0.7294827103614807,
"eval_runtime": 188.3539,
"eval_samples_per_second": 145.641,
"eval_steps_per_second": 4.555,
"step": 50500
},
{
"epoch": 10.846448319863887,
"grad_norm": 2.075991630554199,
"learning_rate": 4.9e-05,
"loss": 0.7155,
"step": 51000
},
{
"epoch": 10.846448319863887,
"eval_accuracy": 0.8462509637157972,
"eval_loss": 0.7348110675811768,
"eval_runtime": 187.3432,
"eval_samples_per_second": 146.426,
"eval_steps_per_second": 4.58,
"step": 51000
},
{
"epoch": 10.952786048490005,
"grad_norm": 2.083899736404419,
"learning_rate": 4.85e-05,
"loss": 0.709,
"step": 51500
},
{
"epoch": 10.952786048490005,
"eval_accuracy": 0.8467475927498435,
"eval_loss": 0.7246316075325012,
"eval_runtime": 186.2473,
"eval_samples_per_second": 147.288,
"eval_steps_per_second": 4.607,
"step": 51500
},
{
"epoch": 11.05912377711612,
"grad_norm": 2.051508903503418,
"learning_rate": 4.8e-05,
"loss": 0.709,
"step": 52000
},
{
"epoch": 11.05912377711612,
"eval_accuracy": 0.8473245959585851,
"eval_loss": 0.727922797203064,
"eval_runtime": 186.0664,
"eval_samples_per_second": 147.431,
"eval_steps_per_second": 4.611,
"step": 52000
},
{
"epoch": 11.165461505742238,
"grad_norm": 1.8798682689666748,
"learning_rate": 4.75e-05,
"loss": 0.7038,
"step": 52500
},
{
"epoch": 11.165461505742238,
"eval_accuracy": 0.8470500338964946,
"eval_loss": 0.7311117649078369,
"eval_runtime": 185.7398,
"eval_samples_per_second": 147.69,
"eval_steps_per_second": 4.619,
"step": 52500
},
{
"epoch": 11.271799234368354,
"grad_norm": 2.2613861560821533,
"learning_rate": 4.7e-05,
"loss": 0.7072,
"step": 53000
},
{
"epoch": 11.271799234368354,
"eval_accuracy": 0.8469547536409501,
"eval_loss": 0.7293407917022705,
"eval_runtime": 186.6537,
"eval_samples_per_second": 146.967,
"eval_steps_per_second": 4.597,
"step": 53000
},
{
"epoch": 11.378136962994471,
"grad_norm": 2.284278631210327,
"learning_rate": 4.6500000000000005e-05,
"loss": 0.6982,
"step": 53500
},
{
"epoch": 11.378136962994471,
"eval_accuracy": 0.8488894363564872,
"eval_loss": 0.7241687774658203,
"eval_runtime": 187.2053,
"eval_samples_per_second": 146.534,
"eval_steps_per_second": 4.583,
"step": 53500
},
{
"epoch": 11.484474691620587,
"grad_norm": 2.0332393646240234,
"learning_rate": 4.600000000000001e-05,
"loss": 0.6979,
"step": 54000
},
{
"epoch": 11.484474691620587,
"eval_accuracy": 0.8488223472254323,
"eval_loss": 0.7191620469093323,
"eval_runtime": 186.9506,
"eval_samples_per_second": 146.734,
"eval_steps_per_second": 4.589,
"step": 54000
},
{
"epoch": 11.590812420246703,
"grad_norm": 2.3902156352996826,
"learning_rate": 4.55e-05,
"loss": 0.7061,
"step": 54500
},
{
"epoch": 11.590812420246703,
"eval_accuracy": 0.8484331299628768,
"eval_loss": 0.7225244641304016,
"eval_runtime": 188.9334,
"eval_samples_per_second": 145.194,
"eval_steps_per_second": 4.541,
"step": 54500
},
{
"epoch": 11.69715014887282,
"grad_norm": 2.1105005741119385,
"learning_rate": 4.5e-05,
"loss": 0.6908,
"step": 55000
},
{
"epoch": 11.69715014887282,
"eval_accuracy": 0.8491300232964473,
"eval_loss": 0.7186225652694702,
"eval_runtime": 187.6216,
"eval_samples_per_second": 146.209,
"eval_steps_per_second": 4.573,
"step": 55000
},
{
"epoch": 11.803487877498936,
"grad_norm": 2.3430845737457275,
"learning_rate": 4.4500000000000004e-05,
"loss": 0.6982,
"step": 55500
},
{
"epoch": 11.803487877498936,
"eval_accuracy": 0.848985207075478,
"eval_loss": 0.7230582237243652,
"eval_runtime": 187.7234,
"eval_samples_per_second": 146.13,
"eval_steps_per_second": 4.571,
"step": 55500
},
{
"epoch": 11.909825606125054,
"grad_norm": 2.3165669441223145,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.6968,
"step": 56000
},
{
"epoch": 11.909825606125054,
"eval_accuracy": 0.8496422899025508,
"eval_loss": 0.7111759185791016,
"eval_runtime": 187.4207,
"eval_samples_per_second": 146.366,
"eval_steps_per_second": 4.578,
"step": 56000
},
{
"epoch": 12.01616333475117,
"grad_norm": 2.1263792514801025,
"learning_rate": 4.35e-05,
"loss": 0.6899,
"step": 56500
},
{
"epoch": 12.01616333475117,
"eval_accuracy": 0.849960832248035,
"eval_loss": 0.7166005373001099,
"eval_runtime": 187.5327,
"eval_samples_per_second": 146.278,
"eval_steps_per_second": 4.575,
"step": 56500
},
{
"epoch": 12.122501063377285,
"grad_norm": 1.994381070137024,
"learning_rate": 4.3e-05,
"loss": 0.6861,
"step": 57000
},
{
"epoch": 12.122501063377285,
"eval_accuracy": 0.8500555930888853,
"eval_loss": 0.7198529839515686,
"eval_runtime": 186.319,
"eval_samples_per_second": 147.231,
"eval_steps_per_second": 4.605,
"step": 57000
},
{
"epoch": 12.228838792003403,
"grad_norm": 2.1838319301605225,
"learning_rate": 4.25e-05,
"loss": 0.6904,
"step": 57500
},
{
"epoch": 12.228838792003403,
"eval_accuracy": 0.850506019072164,
"eval_loss": 0.7138729095458984,
"eval_runtime": 186.3075,
"eval_samples_per_second": 147.24,
"eval_steps_per_second": 4.605,
"step": 57500
},
{
"epoch": 12.335176520629519,
"grad_norm": 2.116994857788086,
"learning_rate": 4.2e-05,
"loss": 0.6865,
"step": 58000
},
{
"epoch": 12.335176520629519,
"eval_accuracy": 0.8506285997887376,
"eval_loss": 0.7130934596061707,
"eval_runtime": 187.4216,
"eval_samples_per_second": 146.365,
"eval_steps_per_second": 4.578,
"step": 58000
},
{
"epoch": 12.441514249255636,
"grad_norm": 2.377434015274048,
"learning_rate": 4.15e-05,
"loss": 0.6812,
"step": 58500
},
{
"epoch": 12.441514249255636,
"eval_accuracy": 0.8510783043990745,
"eval_loss": 0.7050937414169312,
"eval_runtime": 189.3457,
"eval_samples_per_second": 144.878,
"eval_steps_per_second": 4.531,
"step": 58500
},
{
"epoch": 12.547851977881752,
"grad_norm": 2.320155143737793,
"learning_rate": 4.1e-05,
"loss": 0.6907,
"step": 59000
},
{
"epoch": 12.547851977881752,
"eval_accuracy": 0.8511123081850741,
"eval_loss": 0.7095110416412354,
"eval_runtime": 187.9801,
"eval_samples_per_second": 145.93,
"eval_steps_per_second": 4.564,
"step": 59000
},
{
"epoch": 12.65418970650787,
"grad_norm": 2.1185412406921387,
"learning_rate": 4.05e-05,
"loss": 0.6849,
"step": 59500
},
{
"epoch": 12.65418970650787,
"eval_accuracy": 0.8515246474410098,
"eval_loss": 0.7070408463478088,
"eval_runtime": 187.5124,
"eval_samples_per_second": 146.294,
"eval_steps_per_second": 4.576,
"step": 59500
},
{
"epoch": 12.760527435133985,
"grad_norm": 2.21907639503479,
"learning_rate": 4e-05,
"loss": 0.6789,
"step": 60000
},
{
"epoch": 12.760527435133985,
"eval_accuracy": 0.8517749754931363,
"eval_loss": 0.7055286765098572,
"eval_runtime": 188.9775,
"eval_samples_per_second": 145.16,
"eval_steps_per_second": 4.54,
"step": 60000
},
{
"epoch": 12.866865163760101,
"grad_norm": 2.1278369426727295,
"learning_rate": 3.9500000000000005e-05,
"loss": 0.6767,
"step": 60500
},
{
"epoch": 12.866865163760101,
"eval_accuracy": 0.8519451108850803,
"eval_loss": 0.706390917301178,
"eval_runtime": 188.7478,
"eval_samples_per_second": 145.337,
"eval_steps_per_second": 4.546,
"step": 60500
},
{
"epoch": 12.973202892386219,
"grad_norm": 2.2294068336486816,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.6828,
"step": 61000
},
{
"epoch": 12.973202892386219,
"eval_accuracy": 0.8526284277549754,
"eval_loss": 0.7011674046516418,
"eval_runtime": 188.2328,
"eval_samples_per_second": 145.734,
"eval_steps_per_second": 4.558,
"step": 61000
},
{
"epoch": 13.079540621012335,
"grad_norm": 2.4725828170776367,
"learning_rate": 3.85e-05,
"loss": 0.6706,
"step": 61500
},
{
"epoch": 13.079540621012335,
"eval_accuracy": 0.8527708579766078,
"eval_loss": 0.7007238268852234,
"eval_runtime": 188.5594,
"eval_samples_per_second": 145.482,
"eval_steps_per_second": 4.55,
"step": 61500
},
{
"epoch": 13.185878349638452,
"grad_norm": 2.164578676223755,
"learning_rate": 3.8e-05,
"loss": 0.6713,
"step": 62000
},
{
"epoch": 13.185878349638452,
"eval_accuracy": 0.8523022405661091,
"eval_loss": 0.7039057612419128,
"eval_runtime": 189.6905,
"eval_samples_per_second": 144.615,
"eval_steps_per_second": 4.523,
"step": 62000
},
{
"epoch": 13.292216078264568,
"grad_norm": 2.413243532180786,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.6688,
"step": 62500
},
{
"epoch": 13.292216078264568,
"eval_accuracy": 0.8524316826829147,
"eval_loss": 0.7022396922111511,
"eval_runtime": 187.5826,
"eval_samples_per_second": 146.24,
"eval_steps_per_second": 4.574,
"step": 62500
},
{
"epoch": 13.398553806890686,
"grad_norm": 2.042698383331299,
"learning_rate": 3.7e-05,
"loss": 0.6768,
"step": 63000
},
{
"epoch": 13.398553806890686,
"eval_accuracy": 0.8535561550741191,
"eval_loss": 0.6964840888977051,
"eval_runtime": 187.9201,
"eval_samples_per_second": 145.977,
"eval_steps_per_second": 4.566,
"step": 63000
},
{
"epoch": 13.504891535516801,
"grad_norm": 2.041522741317749,
"learning_rate": 3.65e-05,
"loss": 0.6645,
"step": 63500
},
{
"epoch": 13.504891535516801,
"eval_accuracy": 0.853386487539177,
"eval_loss": 0.7000829577445984,
"eval_runtime": 187.7215,
"eval_samples_per_second": 146.131,
"eval_steps_per_second": 4.571,
"step": 63500
},
{
"epoch": 13.611229264142917,
"grad_norm": 1.9478411674499512,
"learning_rate": 3.6e-05,
"loss": 0.6652,
"step": 64000
},
{
"epoch": 13.611229264142917,
"eval_accuracy": 0.8545154193036631,
"eval_loss": 0.6918612718582153,
"eval_runtime": 186.6017,
"eval_samples_per_second": 147.008,
"eval_steps_per_second": 4.598,
"step": 64000
},
{
"epoch": 13.717566992769035,
"grad_norm": 2.164696216583252,
"learning_rate": 3.55e-05,
"loss": 0.669,
"step": 64500
},
{
"epoch": 13.717566992769035,
"eval_accuracy": 0.8544395509174251,
"eval_loss": 0.6917126774787903,
"eval_runtime": 187.4287,
"eval_samples_per_second": 146.36,
"eval_steps_per_second": 4.578,
"step": 64500
},
{
"epoch": 13.82390472139515,
"grad_norm": 2.247715711593628,
"learning_rate": 3.5e-05,
"loss": 0.6683,
"step": 65000
},
{
"epoch": 13.82390472139515,
"eval_accuracy": 0.8542704238784817,
"eval_loss": 0.6937560439109802,
"eval_runtime": 188.8636,
"eval_samples_per_second": 145.248,
"eval_steps_per_second": 4.543,
"step": 65000
},
{
"epoch": 13.930242450021268,
"grad_norm": 2.264913558959961,
"learning_rate": 3.45e-05,
"loss": 0.6655,
"step": 65500
},
{
"epoch": 13.930242450021268,
"eval_accuracy": 0.8545225605898715,
"eval_loss": 0.6955862045288086,
"eval_runtime": 187.5492,
"eval_samples_per_second": 146.266,
"eval_steps_per_second": 4.575,
"step": 65500
},
{
"epoch": 14.036580178647384,
"grad_norm": 2.3346023559570312,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.6587,
"step": 66000
},
{
"epoch": 14.036580178647384,
"eval_accuracy": 0.8548906265850504,
"eval_loss": 0.6948685646057129,
"eval_runtime": 187.7667,
"eval_samples_per_second": 146.096,
"eval_steps_per_second": 4.57,
"step": 66000
},
{
"epoch": 14.142917907273501,
"grad_norm": 2.054490089416504,
"learning_rate": 3.35e-05,
"loss": 0.6562,
"step": 66500
},
{
"epoch": 14.142917907273501,
"eval_accuracy": 0.8547657424014736,
"eval_loss": 0.6918261647224426,
"eval_runtime": 188.2884,
"eval_samples_per_second": 145.691,
"eval_steps_per_second": 4.557,
"step": 66500
},
{
"epoch": 14.249255635899617,
"grad_norm": 2.101318359375,
"learning_rate": 3.3e-05,
"loss": 0.6524,
"step": 67000
},
{
"epoch": 14.249255635899617,
"eval_accuracy": 0.8545756667919997,
"eval_loss": 0.6881661415100098,
"eval_runtime": 188.3077,
"eval_samples_per_second": 145.676,
"eval_steps_per_second": 4.556,
"step": 67000
},
{
"epoch": 14.355593364525733,
"grad_norm": 2.3317761421203613,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.653,
"step": 67500
},
{
"epoch": 14.355593364525733,
"eval_accuracy": 0.8556463396940239,
"eval_loss": 0.6863633394241333,
"eval_runtime": 187.6402,
"eval_samples_per_second": 146.195,
"eval_steps_per_second": 4.573,
"step": 67500
},
{
"epoch": 14.46193109315185,
"grad_norm": 2.2883026599884033,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.6592,
"step": 68000
},
{
"epoch": 14.46193109315185,
"eval_accuracy": 0.8556758477466689,
"eval_loss": 0.6878767609596252,
"eval_runtime": 187.2987,
"eval_samples_per_second": 146.461,
"eval_steps_per_second": 4.581,
"step": 68000
},
{
"epoch": 14.568268821777966,
"grad_norm": 2.058140993118286,
"learning_rate": 3.15e-05,
"loss": 0.654,
"step": 68500
},
{
"epoch": 14.568268821777966,
"eval_accuracy": 0.8560685191587963,
"eval_loss": 0.6820354461669922,
"eval_runtime": 187.489,
"eval_samples_per_second": 146.313,
"eval_steps_per_second": 4.576,
"step": 68500
},
{
"epoch": 14.674606550404084,
"grad_norm": 2.3248109817504883,
"learning_rate": 3.1e-05,
"loss": 0.6535,
"step": 69000
},
{
"epoch": 14.674606550404084,
"eval_accuracy": 0.8565050750014839,
"eval_loss": 0.6821831464767456,
"eval_runtime": 187.5661,
"eval_samples_per_second": 146.252,
"eval_steps_per_second": 4.574,
"step": 69000
},
{
"epoch": 14.7809442790302,
"grad_norm": 2.507769823074341,
"learning_rate": 3.05e-05,
"loss": 0.6572,
"step": 69500
},
{
"epoch": 14.7809442790302,
"eval_accuracy": 0.8568994154448439,
"eval_loss": 0.6748529672622681,
"eval_runtime": 187.4579,
"eval_samples_per_second": 146.337,
"eval_steps_per_second": 4.577,
"step": 69500
},
{
"epoch": 14.887282007656317,
"grad_norm": 1.7414127588272095,
"learning_rate": 3e-05,
"loss": 0.653,
"step": 70000
},
{
"epoch": 14.887282007656317,
"eval_accuracy": 0.8568193158641805,
"eval_loss": 0.6834676861763,
"eval_runtime": 187.2966,
"eval_samples_per_second": 146.463,
"eval_steps_per_second": 4.581,
"step": 70000
},
{
"epoch": 14.993619736282433,
"grad_norm": 2.1479337215423584,
"learning_rate": 2.95e-05,
"loss": 0.6476,
"step": 70500
},
{
"epoch": 14.993619736282433,
"eval_accuracy": 0.8570982400326026,
"eval_loss": 0.6762750148773193,
"eval_runtime": 186.0961,
"eval_samples_per_second": 147.408,
"eval_steps_per_second": 4.611,
"step": 70500
},
{
"epoch": 15.099957464908549,
"grad_norm": 2.4283432960510254,
"learning_rate": 2.9e-05,
"loss": 0.6454,
"step": 71000
},
{
"epoch": 15.099957464908549,
"eval_accuracy": 0.8570932978007869,
"eval_loss": 0.6805168986320496,
"eval_runtime": 187.4834,
"eval_samples_per_second": 146.317,
"eval_steps_per_second": 4.576,
"step": 71000
},
{
"epoch": 15.206295193534666,
"grad_norm": 2.3493471145629883,
"learning_rate": 2.8499999999999998e-05,
"loss": 0.6492,
"step": 71500
},
{
"epoch": 15.206295193534666,
"eval_accuracy": 0.857002769504863,
"eval_loss": 0.6780962944030762,
"eval_runtime": 187.5133,
"eval_samples_per_second": 146.294,
"eval_steps_per_second": 4.576,
"step": 71500
},
{
"epoch": 15.312632922160782,
"grad_norm": 2.315495729446411,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.6417,
"step": 72000
},
{
"epoch": 15.312632922160782,
"eval_accuracy": 0.8574317167307508,
"eval_loss": 0.67529296875,
"eval_runtime": 187.9011,
"eval_samples_per_second": 145.992,
"eval_steps_per_second": 4.566,
"step": 72000
},
{
"epoch": 15.4189706507869,
"grad_norm": 2.3548035621643066,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.6411,
"step": 72500
},
{
"epoch": 15.4189706507869,
"eval_accuracy": 0.8574519944946007,
"eval_loss": 0.6762493252754211,
"eval_runtime": 187.6451,
"eval_samples_per_second": 146.191,
"eval_steps_per_second": 4.572,
"step": 72500
},
{
"epoch": 15.525308379413016,
"grad_norm": 2.073268175125122,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.6427,
"step": 73000
},
{
"epoch": 15.525308379413016,
"eval_accuracy": 0.8576201590411181,
"eval_loss": 0.6761339902877808,
"eval_runtime": 189.4074,
"eval_samples_per_second": 144.831,
"eval_steps_per_second": 4.53,
"step": 73000
},
{
"epoch": 15.631646108039131,
"grad_norm": 1.8768184185028076,
"learning_rate": 2.6500000000000004e-05,
"loss": 0.6427,
"step": 73500
},
{
"epoch": 15.631646108039131,
"eval_accuracy": 0.8576710592022652,
"eval_loss": 0.6767549514770508,
"eval_runtime": 189.8952,
"eval_samples_per_second": 144.459,
"eval_steps_per_second": 4.518,
"step": 73500
},
{
"epoch": 15.737983836665249,
"grad_norm": 2.476339817047119,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.6418,
"step": 74000
},
{
"epoch": 15.737983836665249,
"eval_accuracy": 0.8583988090867217,
"eval_loss": 0.6754936575889587,
"eval_runtime": 187.0962,
"eval_samples_per_second": 146.62,
"eval_steps_per_second": 4.586,
"step": 74000
},
{
"epoch": 15.844321565291365,
"grad_norm": 2.2399730682373047,
"learning_rate": 2.5500000000000003e-05,
"loss": 0.6391,
"step": 74500
},
{
"epoch": 15.844321565291365,
"eval_accuracy": 0.8587047439506456,
"eval_loss": 0.6762666702270508,
"eval_runtime": 187.0028,
"eval_samples_per_second": 146.693,
"eval_steps_per_second": 4.588,
"step": 74500
},
{
"epoch": 15.950659293917482,
"grad_norm": 2.1147332191467285,
"learning_rate": 2.5e-05,
"loss": 0.6324,
"step": 75000
},
{
"epoch": 15.950659293917482,
"eval_accuracy": 0.8583270548777524,
"eval_loss": 0.6746546626091003,
"eval_runtime": 186.0971,
"eval_samples_per_second": 147.407,
"eval_steps_per_second": 4.61,
"step": 75000
},
{
"epoch": 16.056997022543598,
"grad_norm": 1.9580026865005493,
"learning_rate": 2.45e-05,
"loss": 0.635,
"step": 75500
},
{
"epoch": 16.056997022543598,
"eval_accuracy": 0.8590129341873303,
"eval_loss": 0.665653645992279,
"eval_runtime": 186.3662,
"eval_samples_per_second": 147.194,
"eval_steps_per_second": 4.604,
"step": 75500
},
{
"epoch": 16.163334751169714,
"grad_norm": 1.9090304374694824,
"learning_rate": 2.4e-05,
"loss": 0.6301,
"step": 76000
},
{
"epoch": 16.163334751169714,
"eval_accuracy": 0.8595268739500587,
"eval_loss": 0.6707313060760498,
"eval_runtime": 186.5814,
"eval_samples_per_second": 147.024,
"eval_steps_per_second": 4.599,
"step": 76000
},
{
"epoch": 16.269672479795833,
"grad_norm": 2.125847101211548,
"learning_rate": 2.35e-05,
"loss": 0.6325,
"step": 76500
},
{
"epoch": 16.269672479795833,
"eval_accuracy": 0.8596205331865414,
"eval_loss": 0.6737939715385437,
"eval_runtime": 186.3349,
"eval_samples_per_second": 147.219,
"eval_steps_per_second": 4.605,
"step": 76500
},
{
"epoch": 16.37601020842195,
"grad_norm": 2.348851203918457,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.6278,
"step": 77000
},
{
"epoch": 16.37601020842195,
"eval_accuracy": 0.8598069827368299,
"eval_loss": 0.6713533401489258,
"eval_runtime": 187.2971,
"eval_samples_per_second": 146.462,
"eval_steps_per_second": 4.581,
"step": 77000
},
{
"epoch": 16.482347937048065,
"grad_norm": 1.935435175895691,
"learning_rate": 2.25e-05,
"loss": 0.6333,
"step": 77500
},
{
"epoch": 16.482347937048065,
"eval_accuracy": 0.8600093652609475,
"eval_loss": 0.6650431752204895,
"eval_runtime": 187.5594,
"eval_samples_per_second": 146.258,
"eval_steps_per_second": 4.575,
"step": 77500
},
{
"epoch": 16.58868566567418,
"grad_norm": 2.2629425525665283,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.6296,
"step": 78000
},
{
"epoch": 16.58868566567418,
"eval_accuracy": 0.8603086354661712,
"eval_loss": 0.6635463237762451,
"eval_runtime": 186.2259,
"eval_samples_per_second": 147.305,
"eval_steps_per_second": 4.607,
"step": 78000
},
{
"epoch": 16.695023394300296,
"grad_norm": 2.1555726528167725,
"learning_rate": 2.15e-05,
"loss": 0.6278,
"step": 78500
},
{
"epoch": 16.695023394300296,
"eval_accuracy": 0.8604913225558962,
"eval_loss": 0.6623604893684387,
"eval_runtime": 187.7075,
"eval_samples_per_second": 146.142,
"eval_steps_per_second": 4.571,
"step": 78500
},
{
"epoch": 16.801361122926416,
"grad_norm": 2.1904544830322266,
"learning_rate": 2.1e-05,
"loss": 0.6264,
"step": 79000
},
{
"epoch": 16.801361122926416,
"eval_accuracy": 0.8604346510079303,
"eval_loss": 0.6690334677696228,
"eval_runtime": 187.4243,
"eval_samples_per_second": 146.363,
"eval_steps_per_second": 4.578,
"step": 79000
},
{
"epoch": 16.90769885155253,
"grad_norm": 2.208678722381592,
"learning_rate": 2.05e-05,
"loss": 0.6303,
"step": 79500
},
{
"epoch": 16.90769885155253,
"eval_accuracy": 0.8602657766644671,
"eval_loss": 0.663872480392456,
"eval_runtime": 187.6784,
"eval_samples_per_second": 146.165,
"eval_steps_per_second": 4.572,
"step": 79500
},
{
"epoch": 17.014036580178647,
"grad_norm": 2.0841898918151855,
"learning_rate": 2e-05,
"loss": 0.6261,
"step": 80000
},
{
"epoch": 17.014036580178647,
"eval_accuracy": 0.860494846911361,
"eval_loss": 0.6641469597816467,
"eval_runtime": 186.8732,
"eval_samples_per_second": 146.795,
"eval_steps_per_second": 4.591,
"step": 80000
},
{
"epoch": 17.120374308804763,
"grad_norm": 1.9734201431274414,
"learning_rate": 1.9500000000000003e-05,
"loss": 0.6188,
"step": 80500
},
{
"epoch": 17.120374308804763,
"eval_accuracy": 0.8611235949597102,
"eval_loss": 0.6597367525100708,
"eval_runtime": 187.1259,
"eval_samples_per_second": 146.596,
"eval_steps_per_second": 4.585,
"step": 80500
},
{
"epoch": 17.22671203743088,
"grad_norm": 2.002861976623535,
"learning_rate": 1.9e-05,
"loss": 0.6169,
"step": 81000
},
{
"epoch": 17.22671203743088,
"eval_accuracy": 0.8612186249545289,
"eval_loss": 0.656814694404602,
"eval_runtime": 186.6713,
"eval_samples_per_second": 146.953,
"eval_steps_per_second": 4.596,
"step": 81000
},
{
"epoch": 17.333049766056998,
"grad_norm": 2.489762783050537,
"learning_rate": 1.85e-05,
"loss": 0.6224,
"step": 81500
},
{
"epoch": 17.333049766056998,
"eval_accuracy": 0.8613080153687351,
"eval_loss": 0.6605936884880066,
"eval_runtime": 188.8518,
"eval_samples_per_second": 145.257,
"eval_steps_per_second": 4.543,
"step": 81500
},
{
"epoch": 17.439387494683114,
"grad_norm": 2.0980629920959473,
"learning_rate": 1.8e-05,
"loss": 0.6199,
"step": 82000
},
{
"epoch": 17.439387494683114,
"eval_accuracy": 0.861816008556555,
"eval_loss": 0.6584839820861816,
"eval_runtime": 186.4347,
"eval_samples_per_second": 147.14,
"eval_steps_per_second": 4.602,
"step": 82000
},
{
"epoch": 17.54572522330923,
"grad_norm": 1.9317203760147095,
"learning_rate": 1.75e-05,
"loss": 0.6241,
"step": 82500
},
{
"epoch": 17.54572522330923,
"eval_accuracy": 0.8614589561697034,
"eval_loss": 0.661562442779541,
"eval_runtime": 187.7317,
"eval_samples_per_second": 146.123,
"eval_steps_per_second": 4.57,
"step": 82500
},
{
"epoch": 17.652062951935346,
"grad_norm": 2.1747238636016846,
"learning_rate": 1.7000000000000003e-05,
"loss": 0.6154,
"step": 83000
},
{
"epoch": 17.652062951935346,
"eval_accuracy": 0.8619722616948544,
"eval_loss": 0.6551214456558228,
"eval_runtime": 187.7435,
"eval_samples_per_second": 146.114,
"eval_steps_per_second": 4.57,
"step": 83000
},
{
"epoch": 17.758400680561465,
"grad_norm": 2.34098482131958,
"learning_rate": 1.65e-05,
"loss": 0.6167,
"step": 83500
},
{
"epoch": 17.758400680561465,
"eval_accuracy": 0.8625058751823201,
"eval_loss": 0.6509849429130554,
"eval_runtime": 186.5789,
"eval_samples_per_second": 147.026,
"eval_steps_per_second": 4.599,
"step": 83500
},
{
"epoch": 17.86473840918758,
"grad_norm": 2.2351200580596924,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.6162,
"step": 84000
},
{
"epoch": 17.86473840918758,
"eval_accuracy": 0.8624151082072831,
"eval_loss": 0.6552938222885132,
"eval_runtime": 186.7383,
"eval_samples_per_second": 146.901,
"eval_steps_per_second": 4.595,
"step": 84000
},
{
"epoch": 17.971076137813697,
"grad_norm": 2.1403043270111084,
"learning_rate": 1.55e-05,
"loss": 0.6127,
"step": 84500
},
{
"epoch": 17.971076137813697,
"eval_accuracy": 0.8623448436360815,
"eval_loss": 0.6527832746505737,
"eval_runtime": 186.8175,
"eval_samples_per_second": 146.838,
"eval_steps_per_second": 4.593,
"step": 84500
},
{
"epoch": 18.077413866439812,
"grad_norm": 1.9795105457305908,
"learning_rate": 1.5e-05,
"loss": 0.61,
"step": 85000
},
{
"epoch": 18.077413866439812,
"eval_accuracy": 0.8622515357619651,
"eval_loss": 0.6563568115234375,
"eval_runtime": 188.8808,
"eval_samples_per_second": 145.234,
"eval_steps_per_second": 4.543,
"step": 85000
},
{
"epoch": 18.183751595065928,
"grad_norm": 2.548271656036377,
"learning_rate": 1.45e-05,
"loss": 0.6115,
"step": 85500
},
{
"epoch": 18.183751595065928,
"eval_accuracy": 0.862900921721882,
"eval_loss": 0.6507585644721985,
"eval_runtime": 187.5686,
"eval_samples_per_second": 146.251,
"eval_steps_per_second": 4.574,
"step": 85500
},
{
"epoch": 18.290089323692047,
"grad_norm": 2.2638394832611084,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.602,
"step": 86000
},
{
"epoch": 18.290089323692047,
"eval_accuracy": 0.8630698221693879,
"eval_loss": 0.6584257483482361,
"eval_runtime": 186.8872,
"eval_samples_per_second": 146.784,
"eval_steps_per_second": 4.591,
"step": 86000
},
{
"epoch": 18.396427052318163,
"grad_norm": 1.9320064783096313,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.6134,
"step": 86500
},
{
"epoch": 18.396427052318163,
"eval_accuracy": 0.8632312148222042,
"eval_loss": 0.6499433517456055,
"eval_runtime": 187.7637,
"eval_samples_per_second": 146.098,
"eval_steps_per_second": 4.57,
"step": 86500
},
{
"epoch": 18.50276478094428,
"grad_norm": 1.891508936882019,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.6112,
"step": 87000
},
{
"epoch": 18.50276478094428,
"eval_accuracy": 0.8630286860815074,
"eval_loss": 0.6519419550895691,
"eval_runtime": 187.4682,
"eval_samples_per_second": 146.329,
"eval_steps_per_second": 4.577,
"step": 87000
},
{
"epoch": 18.609102509570395,
"grad_norm": 2.208890438079834,
"learning_rate": 1.25e-05,
"loss": 0.6109,
"step": 87500
},
{
"epoch": 18.609102509570395,
"eval_accuracy": 0.8636812324617463,
"eval_loss": 0.6446605324745178,
"eval_runtime": 188.845,
"eval_samples_per_second": 145.262,
"eval_steps_per_second": 4.543,
"step": 87500
},
{
"epoch": 18.71544023819651,
"grad_norm": 2.313291549682617,
"learning_rate": 1.2e-05,
"loss": 0.6018,
"step": 88000
},
{
"epoch": 18.71544023819651,
"eval_accuracy": 0.86378315200226,
"eval_loss": 0.6436861753463745,
"eval_runtime": 186.7793,
"eval_samples_per_second": 146.869,
"eval_steps_per_second": 4.594,
"step": 88000
},
{
"epoch": 18.82177796682263,
"grad_norm": 1.9980500936508179,
"learning_rate": 1.1500000000000002e-05,
"loss": 0.6081,
"step": 88500
},
{
"epoch": 18.82177796682263,
"eval_accuracy": 0.8636463710418437,
"eval_loss": 0.6478908061981201,
"eval_runtime": 189.7409,
"eval_samples_per_second": 144.576,
"eval_steps_per_second": 4.522,
"step": 88500
},
{
"epoch": 18.928115695448746,
"grad_norm": 2.0310556888580322,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.6033,
"step": 89000
},
{
"epoch": 18.928115695448746,
"eval_accuracy": 0.8635644696497025,
"eval_loss": 0.6475953459739685,
"eval_runtime": 187.8946,
"eval_samples_per_second": 145.997,
"eval_steps_per_second": 4.566,
"step": 89000
},
{
"epoch": 19.03445342407486,
"grad_norm": 2.399073839187622,
"learning_rate": 1.05e-05,
"loss": 0.6035,
"step": 89500
},
{
"epoch": 19.03445342407486,
"eval_accuracy": 0.8637125413143368,
"eval_loss": 0.6477306485176086,
"eval_runtime": 188.0038,
"eval_samples_per_second": 145.912,
"eval_steps_per_second": 4.564,
"step": 89500
},
{
"epoch": 19.140791152700977,
"grad_norm": 2.218600034713745,
"learning_rate": 1e-05,
"loss": 0.5977,
"step": 90000
},
{
"epoch": 19.140791152700977,
"eval_accuracy": 0.8640764513496372,
"eval_loss": 0.6459035873413086,
"eval_runtime": 186.7125,
"eval_samples_per_second": 146.921,
"eval_steps_per_second": 4.595,
"step": 90000
},
{
"epoch": 19.247128881327093,
"grad_norm": 2.1308956146240234,
"learning_rate": 9.5e-06,
"loss": 0.6092,
"step": 90500
},
{
"epoch": 19.247128881327093,
"eval_accuracy": 0.8635675775343433,
"eval_loss": 0.6506515145301819,
"eval_runtime": 186.9565,
"eval_samples_per_second": 146.729,
"eval_steps_per_second": 4.589,
"step": 90500
},
{
"epoch": 19.353466609953212,
"grad_norm": 2.1165056228637695,
"learning_rate": 9e-06,
"loss": 0.594,
"step": 91000
},
{
"epoch": 19.353466609953212,
"eval_accuracy": 0.8645061649822221,
"eval_loss": 0.6416978240013123,
"eval_runtime": 187.0244,
"eval_samples_per_second": 146.676,
"eval_steps_per_second": 4.588,
"step": 91000
},
{
"epoch": 19.45980433857933,
"grad_norm": 2.277674436569214,
"learning_rate": 8.500000000000002e-06,
"loss": 0.601,
"step": 91500
},
{
"epoch": 19.45980433857933,
"eval_accuracy": 0.8641424971302022,
"eval_loss": 0.6443900465965271,
"eval_runtime": 186.9341,
"eval_samples_per_second": 146.747,
"eval_steps_per_second": 4.59,
"step": 91500
},
{
"epoch": 19.566142067205444,
"grad_norm": 2.283182144165039,
"learning_rate": 8.000000000000001e-06,
"loss": 0.6008,
"step": 92000
},
{
"epoch": 19.566142067205444,
"eval_accuracy": 0.8650054540372305,
"eval_loss": 0.6387457847595215,
"eval_runtime": 188.6398,
"eval_samples_per_second": 145.42,
"eval_steps_per_second": 4.548,
"step": 92000
},
{
"epoch": 19.67247979583156,
"grad_norm": 1.9207966327667236,
"learning_rate": 7.5e-06,
"loss": 0.5995,
"step": 92500
},
{
"epoch": 19.67247979583156,
"eval_accuracy": 0.8643799885041582,
"eval_loss": 0.6439911127090454,
"eval_runtime": 186.8593,
"eval_samples_per_second": 146.806,
"eval_steps_per_second": 4.592,
"step": 92500
},
{
"epoch": 19.77881752445768,
"grad_norm": 2.332043409347534,
"learning_rate": 7.000000000000001e-06,
"loss": 0.5945,
"step": 93000
},
{
"epoch": 19.77881752445768,
"eval_accuracy": 0.8648918324061133,
"eval_loss": 0.6363367438316345,
"eval_runtime": 187.6222,
"eval_samples_per_second": 146.209,
"eval_steps_per_second": 4.573,
"step": 93000
},
{
"epoch": 19.885155253083795,
"grad_norm": 2.0394229888916016,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.5985,
"step": 93500
},
{
"epoch": 19.885155253083795,
"eval_accuracy": 0.8647885102924586,
"eval_loss": 0.6401004791259766,
"eval_runtime": 187.4927,
"eval_samples_per_second": 146.31,
"eval_steps_per_second": 4.576,
"step": 93500
},
{
"epoch": 19.99149298170991,
"grad_norm": 2.3234775066375732,
"learning_rate": 6e-06,
"loss": 0.5945,
"step": 94000
},
{
"epoch": 19.99149298170991,
"eval_accuracy": 0.8650117508813161,
"eval_loss": 0.6405218839645386,
"eval_runtime": 188.3987,
"eval_samples_per_second": 145.606,
"eval_steps_per_second": 4.554,
"step": 94000
},
{
"epoch": 20.097830710336027,
"grad_norm": 1.8002312183380127,
"learning_rate": 5.500000000000001e-06,
"loss": 0.5943,
"step": 94500
},
{
"epoch": 20.097830710336027,
"eval_accuracy": 0.8649030683652459,
"eval_loss": 0.6379217505455017,
"eval_runtime": 186.858,
"eval_samples_per_second": 146.807,
"eval_steps_per_second": 4.592,
"step": 94500
},
{
"epoch": 20.204168438962142,
"grad_norm": 2.004221200942993,
"learning_rate": 5e-06,
"loss": 0.5874,
"step": 95000
},
{
"epoch": 20.204168438962142,
"eval_accuracy": 0.8656674134373837,
"eval_loss": 0.6364595293998718,
"eval_runtime": 186.2416,
"eval_samples_per_second": 147.293,
"eval_steps_per_second": 4.607,
"step": 95000
},
{
"epoch": 20.31050616758826,
"grad_norm": 2.2905805110931396,
"learning_rate": 4.5e-06,
"loss": 0.5919,
"step": 95500
},
{
"epoch": 20.31050616758826,
"eval_accuracy": 0.8654995569474285,
"eval_loss": 0.6359232664108276,
"eval_runtime": 188.7313,
"eval_samples_per_second": 145.349,
"eval_steps_per_second": 4.546,
"step": 95500
},
{
"epoch": 20.416843896214377,
"grad_norm": 2.1926088333129883,
"learning_rate": 4.000000000000001e-06,
"loss": 0.5959,
"step": 96000
},
{
"epoch": 20.416843896214377,
"eval_accuracy": 0.8659267527708855,
"eval_loss": 0.6338008642196655,
"eval_runtime": 187.0107,
"eval_samples_per_second": 146.687,
"eval_steps_per_second": 4.588,
"step": 96000
}
],
"logging_steps": 500,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 22,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.094053129886106e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}