mbert_npi-deva / trainer_state.json
DGurgurov's picture
Uploading checkpoint-99000 for mbert - npi-deva
d11be91 verified
{
"best_metric": 0.3867943286895752,
"best_model_checkpoint": "./model_fine-tune/glot/mbert/npi-Deva/checkpoint-99000",
"epoch": 22.307345651194233,
"eval_steps": 500,
"global_step": 99000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11266336187471834,
"grad_norm": 2.965751886367798,
"learning_rate": 9.95e-05,
"loss": 1.2301,
"step": 500
},
{
"epoch": 0.11266336187471834,
"eval_accuracy": 0.773184921040326,
"eval_loss": 1.0424206256866455,
"eval_runtime": 220.277,
"eval_samples_per_second": 125.542,
"eval_steps_per_second": 3.927,
"step": 500
},
{
"epoch": 0.22532672374943669,
"grad_norm": 2.740410327911377,
"learning_rate": 9.900000000000001e-05,
"loss": 1.0543,
"step": 1000
},
{
"epoch": 0.22532672374943669,
"eval_accuracy": 0.7909380298838988,
"eval_loss": 0.9471855163574219,
"eval_runtime": 220.5287,
"eval_samples_per_second": 125.399,
"eval_steps_per_second": 3.922,
"step": 1000
},
{
"epoch": 0.337990085624155,
"grad_norm": 2.7863168716430664,
"learning_rate": 9.850000000000001e-05,
"loss": 0.9779,
"step": 1500
},
{
"epoch": 0.337990085624155,
"eval_accuracy": 0.8025380313108481,
"eval_loss": 0.8843335509300232,
"eval_runtime": 220.1694,
"eval_samples_per_second": 125.603,
"eval_steps_per_second": 3.929,
"step": 1500
},
{
"epoch": 0.45065344749887337,
"grad_norm": 2.5414557456970215,
"learning_rate": 9.8e-05,
"loss": 0.926,
"step": 2000
},
{
"epoch": 0.45065344749887337,
"eval_accuracy": 0.8101656739468518,
"eval_loss": 0.8543083667755127,
"eval_runtime": 220.7797,
"eval_samples_per_second": 125.256,
"eval_steps_per_second": 3.918,
"step": 2000
},
{
"epoch": 0.5633168093735917,
"grad_norm": 2.377737283706665,
"learning_rate": 9.75e-05,
"loss": 0.8865,
"step": 2500
},
{
"epoch": 0.5633168093735917,
"eval_accuracy": 0.8173040702951884,
"eval_loss": 0.810703456401825,
"eval_runtime": 220.8366,
"eval_samples_per_second": 125.224,
"eval_steps_per_second": 3.917,
"step": 2500
},
{
"epoch": 0.67598017124831,
"grad_norm": 2.3488988876342773,
"learning_rate": 9.7e-05,
"loss": 0.8609,
"step": 3000
},
{
"epoch": 0.67598017124831,
"eval_accuracy": 0.821973743458109,
"eval_loss": 0.7787520289421082,
"eval_runtime": 221.2787,
"eval_samples_per_second": 124.974,
"eval_steps_per_second": 3.909,
"step": 3000
},
{
"epoch": 0.7886435331230284,
"grad_norm": 2.1220295429229736,
"learning_rate": 9.65e-05,
"loss": 0.8342,
"step": 3500
},
{
"epoch": 0.7886435331230284,
"eval_accuracy": 0.8262216641689282,
"eval_loss": 0.762144923210144,
"eval_runtime": 220.8289,
"eval_samples_per_second": 125.228,
"eval_steps_per_second": 3.917,
"step": 3500
},
{
"epoch": 0.9013068949977467,
"grad_norm": 2.0968008041381836,
"learning_rate": 9.6e-05,
"loss": 0.819,
"step": 4000
},
{
"epoch": 0.9013068949977467,
"eval_accuracy": 0.8298798163116309,
"eval_loss": 0.7466955184936523,
"eval_runtime": 220.013,
"eval_samples_per_second": 125.693,
"eval_steps_per_second": 3.932,
"step": 4000
},
{
"epoch": 1.0139702568724651,
"grad_norm": 2.1498773097991943,
"learning_rate": 9.55e-05,
"loss": 0.7979,
"step": 4500
},
{
"epoch": 1.0139702568724651,
"eval_accuracy": 0.8318434647099575,
"eval_loss": 0.7348815202713013,
"eval_runtime": 220.8229,
"eval_samples_per_second": 125.232,
"eval_steps_per_second": 3.917,
"step": 4500
},
{
"epoch": 1.1266336187471835,
"grad_norm": 2.163381576538086,
"learning_rate": 9.5e-05,
"loss": 0.7814,
"step": 5000
},
{
"epoch": 1.1266336187471835,
"eval_accuracy": 0.8349864178467281,
"eval_loss": 0.7180664539337158,
"eval_runtime": 219.7442,
"eval_samples_per_second": 125.846,
"eval_steps_per_second": 3.936,
"step": 5000
},
{
"epoch": 1.2392969806219019,
"grad_norm": 2.431119680404663,
"learning_rate": 9.449999999999999e-05,
"loss": 0.7665,
"step": 5500
},
{
"epoch": 1.2392969806219019,
"eval_accuracy": 0.8369151584278837,
"eval_loss": 0.7159287333488464,
"eval_runtime": 220.9495,
"eval_samples_per_second": 125.16,
"eval_steps_per_second": 3.915,
"step": 5500
},
{
"epoch": 1.35196034249662,
"grad_norm": 2.2182135581970215,
"learning_rate": 9.4e-05,
"loss": 0.7555,
"step": 6000
},
{
"epoch": 1.35196034249662,
"eval_accuracy": 0.8390386817390197,
"eval_loss": 0.6999027132987976,
"eval_runtime": 221.2257,
"eval_samples_per_second": 125.004,
"eval_steps_per_second": 3.91,
"step": 6000
},
{
"epoch": 1.4646237043713384,
"grad_norm": 2.1569323539733887,
"learning_rate": 9.350000000000001e-05,
"loss": 0.7479,
"step": 6500
},
{
"epoch": 1.4646237043713384,
"eval_accuracy": 0.8422681179825285,
"eval_loss": 0.689199686050415,
"eval_runtime": 222.1646,
"eval_samples_per_second": 124.475,
"eval_steps_per_second": 3.894,
"step": 6500
},
{
"epoch": 1.5772870662460567,
"grad_norm": 2.1323976516723633,
"learning_rate": 9.300000000000001e-05,
"loss": 0.733,
"step": 7000
},
{
"epoch": 1.5772870662460567,
"eval_accuracy": 0.8439326355101854,
"eval_loss": 0.6770957112312317,
"eval_runtime": 220.0047,
"eval_samples_per_second": 125.697,
"eval_steps_per_second": 3.932,
"step": 7000
},
{
"epoch": 1.6899504281207751,
"grad_norm": 5.134857177734375,
"learning_rate": 9.250000000000001e-05,
"loss": 0.7254,
"step": 7500
},
{
"epoch": 1.6899504281207751,
"eval_accuracy": 0.8453398865939418,
"eval_loss": 0.6668263077735901,
"eval_runtime": 220.3226,
"eval_samples_per_second": 125.516,
"eval_steps_per_second": 3.926,
"step": 7500
},
{
"epoch": 1.8026137899954935,
"grad_norm": 2.0616443157196045,
"learning_rate": 9.200000000000001e-05,
"loss": 0.7155,
"step": 8000
},
{
"epoch": 1.8026137899954935,
"eval_accuracy": 0.8463415002486427,
"eval_loss": 0.6613638997077942,
"eval_runtime": 221.4149,
"eval_samples_per_second": 124.897,
"eval_steps_per_second": 3.907,
"step": 8000
},
{
"epoch": 1.9152771518702119,
"grad_norm": 1.8310041427612305,
"learning_rate": 9.15e-05,
"loss": 0.7057,
"step": 8500
},
{
"epoch": 1.9152771518702119,
"eval_accuracy": 0.8482940895875467,
"eval_loss": 0.6528915762901306,
"eval_runtime": 222.056,
"eval_samples_per_second": 124.536,
"eval_steps_per_second": 3.895,
"step": 8500
},
{
"epoch": 2.0279405137449302,
"grad_norm": 1.93686842918396,
"learning_rate": 9.1e-05,
"loss": 0.7005,
"step": 9000
},
{
"epoch": 2.0279405137449302,
"eval_accuracy": 0.849507663539711,
"eval_loss": 0.6522949934005737,
"eval_runtime": 221.8006,
"eval_samples_per_second": 124.68,
"eval_steps_per_second": 3.9,
"step": 9000
},
{
"epoch": 2.1406038756196484,
"grad_norm": 6.06415319442749,
"learning_rate": 9.05e-05,
"loss": 0.6884,
"step": 9500
},
{
"epoch": 2.1406038756196484,
"eval_accuracy": 0.8502759566677828,
"eval_loss": 0.6491975784301758,
"eval_runtime": 221.9876,
"eval_samples_per_second": 124.575,
"eval_steps_per_second": 3.897,
"step": 9500
},
{
"epoch": 2.253267237494367,
"grad_norm": 1.9235719442367554,
"learning_rate": 9e-05,
"loss": 0.6821,
"step": 10000
},
{
"epoch": 2.253267237494367,
"eval_accuracy": 0.8524909295282275,
"eval_loss": 0.6343050599098206,
"eval_runtime": 222.2817,
"eval_samples_per_second": 124.41,
"eval_steps_per_second": 3.891,
"step": 10000
},
{
"epoch": 2.365930599369085,
"grad_norm": 1.8421759605407715,
"learning_rate": 8.950000000000001e-05,
"loss": 0.6767,
"step": 10500
},
{
"epoch": 2.365930599369085,
"eval_accuracy": 0.8534436334949597,
"eval_loss": 0.623904287815094,
"eval_runtime": 221.9784,
"eval_samples_per_second": 124.58,
"eval_steps_per_second": 3.897,
"step": 10500
},
{
"epoch": 2.4785939612438037,
"grad_norm": 1.9507330656051636,
"learning_rate": 8.900000000000001e-05,
"loss": 0.6792,
"step": 11000
},
{
"epoch": 2.4785939612438037,
"eval_accuracy": 0.8552298873542783,
"eval_loss": 0.6220438480377197,
"eval_runtime": 221.6861,
"eval_samples_per_second": 124.744,
"eval_steps_per_second": 3.902,
"step": 11000
},
{
"epoch": 2.591257323118522,
"grad_norm": 2.11086106300354,
"learning_rate": 8.850000000000001e-05,
"loss": 0.668,
"step": 11500
},
{
"epoch": 2.591257323118522,
"eval_accuracy": 0.8557371543230742,
"eval_loss": 0.6222216486930847,
"eval_runtime": 221.2659,
"eval_samples_per_second": 124.981,
"eval_steps_per_second": 3.909,
"step": 11500
},
{
"epoch": 2.70392068499324,
"grad_norm": 2.1847715377807617,
"learning_rate": 8.800000000000001e-05,
"loss": 0.6636,
"step": 12000
},
{
"epoch": 2.70392068499324,
"eval_accuracy": 0.8559949812795903,
"eval_loss": 0.6197636127471924,
"eval_runtime": 222.3204,
"eval_samples_per_second": 124.388,
"eval_steps_per_second": 3.891,
"step": 12000
},
{
"epoch": 2.8165840468679586,
"grad_norm": 2.1351499557495117,
"learning_rate": 8.75e-05,
"loss": 0.6576,
"step": 12500
},
{
"epoch": 2.8165840468679586,
"eval_accuracy": 0.8577423671742627,
"eval_loss": 0.6103814840316772,
"eval_runtime": 222.1201,
"eval_samples_per_second": 124.5,
"eval_steps_per_second": 3.894,
"step": 12500
},
{
"epoch": 2.9292474087426768,
"grad_norm": 3.9510111808776855,
"learning_rate": 8.7e-05,
"loss": 0.6488,
"step": 13000
},
{
"epoch": 2.9292474087426768,
"eval_accuracy": 0.858561338833408,
"eval_loss": 0.6049174070358276,
"eval_runtime": 221.9998,
"eval_samples_per_second": 124.568,
"eval_steps_per_second": 3.896,
"step": 13000
},
{
"epoch": 3.0419107706173953,
"grad_norm": 1.8234397172927856,
"learning_rate": 8.65e-05,
"loss": 0.6438,
"step": 13500
},
{
"epoch": 3.0419107706173953,
"eval_accuracy": 0.8591052959636053,
"eval_loss": 0.6051846742630005,
"eval_runtime": 220.9713,
"eval_samples_per_second": 125.147,
"eval_steps_per_second": 3.915,
"step": 13500
},
{
"epoch": 3.1545741324921135,
"grad_norm": 1.9275134801864624,
"learning_rate": 8.6e-05,
"loss": 0.6369,
"step": 14000
},
{
"epoch": 3.1545741324921135,
"eval_accuracy": 0.8599297280864646,
"eval_loss": 0.6021236181259155,
"eval_runtime": 222.1682,
"eval_samples_per_second": 124.473,
"eval_steps_per_second": 3.893,
"step": 14000
},
{
"epoch": 3.267237494366832,
"grad_norm": 2.4342575073242188,
"learning_rate": 8.55e-05,
"loss": 0.6375,
"step": 14500
},
{
"epoch": 3.267237494366832,
"eval_accuracy": 0.8612232782302242,
"eval_loss": 0.5935059785842896,
"eval_runtime": 221.4028,
"eval_samples_per_second": 124.904,
"eval_steps_per_second": 3.907,
"step": 14500
},
{
"epoch": 3.3799008562415502,
"grad_norm": 1.8208547830581665,
"learning_rate": 8.5e-05,
"loss": 0.6327,
"step": 15000
},
{
"epoch": 3.3799008562415502,
"eval_accuracy": 0.8619705169680766,
"eval_loss": 0.5865727663040161,
"eval_runtime": 221.4519,
"eval_samples_per_second": 124.876,
"eval_steps_per_second": 3.906,
"step": 15000
},
{
"epoch": 3.492564218116269,
"grad_norm": 1.8497122526168823,
"learning_rate": 8.450000000000001e-05,
"loss": 0.6289,
"step": 15500
},
{
"epoch": 3.492564218116269,
"eval_accuracy": 0.8624703434485368,
"eval_loss": 0.5854940414428711,
"eval_runtime": 222.074,
"eval_samples_per_second": 124.526,
"eval_steps_per_second": 3.895,
"step": 15500
},
{
"epoch": 3.605227579990987,
"grad_norm": 1.7389825582504272,
"learning_rate": 8.4e-05,
"loss": 0.6231,
"step": 16000
},
{
"epoch": 3.605227579990987,
"eval_accuracy": 0.8635307164001665,
"eval_loss": 0.5809486508369446,
"eval_runtime": 222.3436,
"eval_samples_per_second": 124.375,
"eval_steps_per_second": 3.89,
"step": 16000
},
{
"epoch": 3.717890941865705,
"grad_norm": 1.7109190225601196,
"learning_rate": 8.35e-05,
"loss": 0.6193,
"step": 16500
},
{
"epoch": 3.717890941865705,
"eval_accuracy": 0.8642588913962003,
"eval_loss": 0.5757493376731873,
"eval_runtime": 220.862,
"eval_samples_per_second": 125.209,
"eval_steps_per_second": 3.916,
"step": 16500
},
{
"epoch": 3.8305543037404237,
"grad_norm": 2.09114408493042,
"learning_rate": 8.3e-05,
"loss": 0.619,
"step": 17000
},
{
"epoch": 3.8305543037404237,
"eval_accuracy": 0.8644031427528578,
"eval_loss": 0.5797725319862366,
"eval_runtime": 220.9835,
"eval_samples_per_second": 125.141,
"eval_steps_per_second": 3.914,
"step": 17000
},
{
"epoch": 3.943217665615142,
"grad_norm": 6.745112419128418,
"learning_rate": 8.25e-05,
"loss": 0.6127,
"step": 17500
},
{
"epoch": 3.943217665615142,
"eval_accuracy": 0.8645245282957764,
"eval_loss": 0.5759025812149048,
"eval_runtime": 222.2291,
"eval_samples_per_second": 124.439,
"eval_steps_per_second": 3.892,
"step": 17500
},
{
"epoch": 4.0558810274898605,
"grad_norm": 1.7710591554641724,
"learning_rate": 8.2e-05,
"loss": 0.6081,
"step": 18000
},
{
"epoch": 4.0558810274898605,
"eval_accuracy": 0.8658915432042757,
"eval_loss": 0.5714759230613708,
"eval_runtime": 221.6135,
"eval_samples_per_second": 124.785,
"eval_steps_per_second": 3.903,
"step": 18000
},
{
"epoch": 4.168544389364579,
"grad_norm": 1.8267593383789062,
"learning_rate": 8.15e-05,
"loss": 0.5988,
"step": 18500
},
{
"epoch": 4.168544389364579,
"eval_accuracy": 0.8665697779685045,
"eval_loss": 0.5671255588531494,
"eval_runtime": 221.0373,
"eval_samples_per_second": 125.11,
"eval_steps_per_second": 3.913,
"step": 18500
},
{
"epoch": 4.281207751239297,
"grad_norm": 1.6686463356018066,
"learning_rate": 8.1e-05,
"loss": 0.5981,
"step": 19000
},
{
"epoch": 4.281207751239297,
"eval_accuracy": 0.8667210799508446,
"eval_loss": 0.5654014348983765,
"eval_runtime": 221.1716,
"eval_samples_per_second": 125.034,
"eval_steps_per_second": 3.911,
"step": 19000
},
{
"epoch": 4.393871113114015,
"grad_norm": 1.6965349912643433,
"learning_rate": 8.05e-05,
"loss": 0.599,
"step": 19500
},
{
"epoch": 4.393871113114015,
"eval_accuracy": 0.8677269725072129,
"eval_loss": 0.5655470490455627,
"eval_runtime": 221.4343,
"eval_samples_per_second": 124.886,
"eval_steps_per_second": 3.906,
"step": 19500
},
{
"epoch": 4.506534474988734,
"grad_norm": 1.653952956199646,
"learning_rate": 8e-05,
"loss": 0.5976,
"step": 20000
},
{
"epoch": 4.506534474988734,
"eval_accuracy": 0.8685987876288259,
"eval_loss": 0.5560412406921387,
"eval_runtime": 220.5715,
"eval_samples_per_second": 125.374,
"eval_steps_per_second": 3.922,
"step": 20000
},
{
"epoch": 4.619197836863452,
"grad_norm": 1.7568910121917725,
"learning_rate": 7.950000000000001e-05,
"loss": 0.5941,
"step": 20500
},
{
"epoch": 4.619197836863452,
"eval_accuracy": 0.868412802308659,
"eval_loss": 0.5624808669090271,
"eval_runtime": 220.7945,
"eval_samples_per_second": 125.248,
"eval_steps_per_second": 3.918,
"step": 20500
},
{
"epoch": 4.73186119873817,
"grad_norm": 1.7545663118362427,
"learning_rate": 7.900000000000001e-05,
"loss": 0.5871,
"step": 21000
},
{
"epoch": 4.73186119873817,
"eval_accuracy": 0.8700149406874658,
"eval_loss": 0.5546574592590332,
"eval_runtime": 220.2428,
"eval_samples_per_second": 125.561,
"eval_steps_per_second": 3.927,
"step": 21000
},
{
"epoch": 4.844524560612888,
"grad_norm": 1.9459997415542603,
"learning_rate": 7.850000000000001e-05,
"loss": 0.5891,
"step": 21500
},
{
"epoch": 4.844524560612888,
"eval_accuracy": 0.8703311716376315,
"eval_loss": 0.5456222295761108,
"eval_runtime": 220.5867,
"eval_samples_per_second": 125.366,
"eval_steps_per_second": 3.921,
"step": 21500
},
{
"epoch": 4.957187922487607,
"grad_norm": 1.9034132957458496,
"learning_rate": 7.800000000000001e-05,
"loss": 0.5828,
"step": 22000
},
{
"epoch": 4.957187922487607,
"eval_accuracy": 0.8704027728365514,
"eval_loss": 0.549776554107666,
"eval_runtime": 221.4908,
"eval_samples_per_second": 124.854,
"eval_steps_per_second": 3.905,
"step": 22000
},
{
"epoch": 5.069851284362326,
"grad_norm": 1.881596565246582,
"learning_rate": 7.75e-05,
"loss": 0.5767,
"step": 22500
},
{
"epoch": 5.069851284362326,
"eval_accuracy": 0.8711589106147363,
"eval_loss": 0.5461272597312927,
"eval_runtime": 220.457,
"eval_samples_per_second": 125.439,
"eval_steps_per_second": 3.924,
"step": 22500
},
{
"epoch": 5.182514646237044,
"grad_norm": 1.9157260656356812,
"learning_rate": 7.7e-05,
"loss": 0.5731,
"step": 23000
},
{
"epoch": 5.182514646237044,
"eval_accuracy": 0.871975417070376,
"eval_loss": 0.5400785207748413,
"eval_runtime": 220.8692,
"eval_samples_per_second": 125.205,
"eval_steps_per_second": 3.916,
"step": 23000
},
{
"epoch": 5.295178008111762,
"grad_norm": 1.9823201894760132,
"learning_rate": 7.65e-05,
"loss": 0.5736,
"step": 23500
},
{
"epoch": 5.295178008111762,
"eval_accuracy": 0.8723751389743424,
"eval_loss": 0.5401638746261597,
"eval_runtime": 221.6042,
"eval_samples_per_second": 124.79,
"eval_steps_per_second": 3.903,
"step": 23500
},
{
"epoch": 5.40784136998648,
"grad_norm": 1.905613660812378,
"learning_rate": 7.6e-05,
"loss": 0.5747,
"step": 24000
},
{
"epoch": 5.40784136998648,
"eval_accuracy": 0.8724923660478054,
"eval_loss": 0.5441656112670898,
"eval_runtime": 221.3067,
"eval_samples_per_second": 124.958,
"eval_steps_per_second": 3.909,
"step": 24000
},
{
"epoch": 5.520504731861199,
"grad_norm": 1.5278126001358032,
"learning_rate": 7.55e-05,
"loss": 0.5681,
"step": 24500
},
{
"epoch": 5.520504731861199,
"eval_accuracy": 0.8728878650306285,
"eval_loss": 0.538100004196167,
"eval_runtime": 222.0369,
"eval_samples_per_second": 124.547,
"eval_steps_per_second": 3.896,
"step": 24500
},
{
"epoch": 5.633168093735917,
"grad_norm": 1.6478660106658936,
"learning_rate": 7.500000000000001e-05,
"loss": 0.5658,
"step": 25000
},
{
"epoch": 5.633168093735917,
"eval_accuracy": 0.8736624848239579,
"eval_loss": 0.5357881784439087,
"eval_runtime": 220.4147,
"eval_samples_per_second": 125.463,
"eval_steps_per_second": 3.924,
"step": 25000
},
{
"epoch": 5.745831455610635,
"grad_norm": 3.0473523139953613,
"learning_rate": 7.450000000000001e-05,
"loss": 0.5644,
"step": 25500
},
{
"epoch": 5.745831455610635,
"eval_accuracy": 0.8743903767129565,
"eval_loss": 0.5344362854957581,
"eval_runtime": 221.5481,
"eval_samples_per_second": 124.822,
"eval_steps_per_second": 3.904,
"step": 25500
},
{
"epoch": 5.8584948174853535,
"grad_norm": 1.8053028583526611,
"learning_rate": 7.4e-05,
"loss": 0.5622,
"step": 26000
},
{
"epoch": 5.8584948174853535,
"eval_accuracy": 0.874178054098396,
"eval_loss": 0.5315510630607605,
"eval_runtime": 221.4537,
"eval_samples_per_second": 124.875,
"eval_steps_per_second": 3.906,
"step": 26000
},
{
"epoch": 5.9711581793600725,
"grad_norm": 1.5863131284713745,
"learning_rate": 7.35e-05,
"loss": 0.5578,
"step": 26500
},
{
"epoch": 5.9711581793600725,
"eval_accuracy": 0.8753070050808498,
"eval_loss": 0.5271232724189758,
"eval_runtime": 221.5103,
"eval_samples_per_second": 124.843,
"eval_steps_per_second": 3.905,
"step": 26500
},
{
"epoch": 6.083821541234791,
"grad_norm": 1.7924689054489136,
"learning_rate": 7.3e-05,
"loss": 0.5546,
"step": 27000
},
{
"epoch": 6.083821541234791,
"eval_accuracy": 0.8749559789605048,
"eval_loss": 0.5305372476577759,
"eval_runtime": 220.6828,
"eval_samples_per_second": 125.311,
"eval_steps_per_second": 3.92,
"step": 27000
},
{
"epoch": 6.196484903109509,
"grad_norm": 1.6176671981811523,
"learning_rate": 7.25e-05,
"loss": 0.5553,
"step": 27500
},
{
"epoch": 6.196484903109509,
"eval_accuracy": 0.8752024294778373,
"eval_loss": 0.5255776047706604,
"eval_runtime": 220.8919,
"eval_samples_per_second": 125.192,
"eval_steps_per_second": 3.916,
"step": 27500
},
{
"epoch": 6.309148264984227,
"grad_norm": 1.855047583580017,
"learning_rate": 7.2e-05,
"loss": 0.5506,
"step": 28000
},
{
"epoch": 6.309148264984227,
"eval_accuracy": 0.8761331460452507,
"eval_loss": 0.52358478307724,
"eval_runtime": 220.7028,
"eval_samples_per_second": 125.3,
"eval_steps_per_second": 3.919,
"step": 28000
},
{
"epoch": 6.421811626858945,
"grad_norm": 1.6553348302841187,
"learning_rate": 7.15e-05,
"loss": 0.5439,
"step": 28500
},
{
"epoch": 6.421811626858945,
"eval_accuracy": 0.8768096662621753,
"eval_loss": 0.5175614953041077,
"eval_runtime": 221.3868,
"eval_samples_per_second": 124.913,
"eval_steps_per_second": 3.907,
"step": 28500
},
{
"epoch": 6.534474988733664,
"grad_norm": 1.8099743127822876,
"learning_rate": 7.1e-05,
"loss": 0.5486,
"step": 29000
},
{
"epoch": 6.534474988733664,
"eval_accuracy": 0.8767345488093528,
"eval_loss": 0.5191013216972351,
"eval_runtime": 221.0646,
"eval_samples_per_second": 125.095,
"eval_steps_per_second": 3.913,
"step": 29000
},
{
"epoch": 6.647138350608382,
"grad_norm": 1.7723827362060547,
"learning_rate": 7.05e-05,
"loss": 0.5442,
"step": 29500
},
{
"epoch": 6.647138350608382,
"eval_accuracy": 0.8777180592418201,
"eval_loss": 0.5211535096168518,
"eval_runtime": 222.034,
"eval_samples_per_second": 124.548,
"eval_steps_per_second": 3.896,
"step": 29500
},
{
"epoch": 6.7598017124831005,
"grad_norm": 1.7134077548980713,
"learning_rate": 7e-05,
"loss": 0.5412,
"step": 30000
},
{
"epoch": 6.7598017124831005,
"eval_accuracy": 0.8771853054768167,
"eval_loss": 0.5161250829696655,
"eval_runtime": 221.7362,
"eval_samples_per_second": 124.716,
"eval_steps_per_second": 3.901,
"step": 30000
},
{
"epoch": 6.872465074357819,
"grad_norm": 1.7683045864105225,
"learning_rate": 6.95e-05,
"loss": 0.5402,
"step": 30500
},
{
"epoch": 6.872465074357819,
"eval_accuracy": 0.8773839402820733,
"eval_loss": 0.5139411687850952,
"eval_runtime": 220.8209,
"eval_samples_per_second": 125.233,
"eval_steps_per_second": 3.917,
"step": 30500
},
{
"epoch": 6.985128436232538,
"grad_norm": 1.8624660968780518,
"learning_rate": 6.9e-05,
"loss": 0.5395,
"step": 31000
},
{
"epoch": 6.985128436232538,
"eval_accuracy": 0.8783624777319803,
"eval_loss": 0.5147821307182312,
"eval_runtime": 221.2924,
"eval_samples_per_second": 124.966,
"eval_steps_per_second": 3.909,
"step": 31000
},
{
"epoch": 7.097791798107256,
"grad_norm": 1.6134588718414307,
"learning_rate": 6.850000000000001e-05,
"loss": 0.5323,
"step": 31500
},
{
"epoch": 7.097791798107256,
"eval_accuracy": 0.8783227617479554,
"eval_loss": 0.5111725330352783,
"eval_runtime": 221.33,
"eval_samples_per_second": 124.945,
"eval_steps_per_second": 3.908,
"step": 31500
},
{
"epoch": 7.210455159981974,
"grad_norm": 1.8190521001815796,
"learning_rate": 6.800000000000001e-05,
"loss": 0.5341,
"step": 32000
},
{
"epoch": 7.210455159981974,
"eval_accuracy": 0.8789992011172492,
"eval_loss": 0.5084385275840759,
"eval_runtime": 221.6067,
"eval_samples_per_second": 124.789,
"eval_steps_per_second": 3.903,
"step": 32000
},
{
"epoch": 7.323118521856692,
"grad_norm": 1.7497199773788452,
"learning_rate": 6.750000000000001e-05,
"loss": 0.5325,
"step": 32500
},
{
"epoch": 7.323118521856692,
"eval_accuracy": 0.8800775404890228,
"eval_loss": 0.5039363503456116,
"eval_runtime": 222.1191,
"eval_samples_per_second": 124.501,
"eval_steps_per_second": 3.894,
"step": 32500
},
{
"epoch": 7.43578188373141,
"grad_norm": 1.6325268745422363,
"learning_rate": 6.7e-05,
"loss": 0.5309,
"step": 33000
},
{
"epoch": 7.43578188373141,
"eval_accuracy": 0.8801297464169966,
"eval_loss": 0.505262017250061,
"eval_runtime": 221.5433,
"eval_samples_per_second": 124.824,
"eval_steps_per_second": 3.904,
"step": 33000
},
{
"epoch": 7.548445245606128,
"grad_norm": 1.7531828880310059,
"learning_rate": 6.65e-05,
"loss": 0.5283,
"step": 33500
},
{
"epoch": 7.548445245606128,
"eval_accuracy": 0.8802065110814512,
"eval_loss": 0.5030723810195923,
"eval_runtime": 221.3827,
"eval_samples_per_second": 124.915,
"eval_steps_per_second": 3.907,
"step": 33500
},
{
"epoch": 7.661108607480847,
"grad_norm": 1.7174723148345947,
"learning_rate": 6.6e-05,
"loss": 0.5254,
"step": 34000
},
{
"epoch": 7.661108607480847,
"eval_accuracy": 0.8808179417817528,
"eval_loss": 0.5008535385131836,
"eval_runtime": 220.9595,
"eval_samples_per_second": 125.154,
"eval_steps_per_second": 3.915,
"step": 34000
},
{
"epoch": 7.773771969355566,
"grad_norm": 1.4874796867370605,
"learning_rate": 6.55e-05,
"loss": 0.5217,
"step": 34500
},
{
"epoch": 7.773771969355566,
"eval_accuracy": 0.8805426353661477,
"eval_loss": 0.5023674368858337,
"eval_runtime": 221.8005,
"eval_samples_per_second": 124.68,
"eval_steps_per_second": 3.9,
"step": 34500
},
{
"epoch": 7.886435331230284,
"grad_norm": 1.721684455871582,
"learning_rate": 6.500000000000001e-05,
"loss": 0.5268,
"step": 35000
},
{
"epoch": 7.886435331230284,
"eval_accuracy": 0.8809166125769631,
"eval_loss": 0.49913424253463745,
"eval_runtime": 221.9778,
"eval_samples_per_second": 124.58,
"eval_steps_per_second": 3.897,
"step": 35000
},
{
"epoch": 7.999098693105002,
"grad_norm": 1.6311215162277222,
"learning_rate": 6.450000000000001e-05,
"loss": 0.5279,
"step": 35500
},
{
"epoch": 7.999098693105002,
"eval_accuracy": 0.8812218745311298,
"eval_loss": 0.49702906608581543,
"eval_runtime": 221.8919,
"eval_samples_per_second": 124.628,
"eval_steps_per_second": 3.898,
"step": 35500
},
{
"epoch": 8.111762054979721,
"grad_norm": 1.843680739402771,
"learning_rate": 6.400000000000001e-05,
"loss": 0.5113,
"step": 36000
},
{
"epoch": 8.111762054979721,
"eval_accuracy": 0.8819420345077154,
"eval_loss": 0.4924590289592743,
"eval_runtime": 220.9686,
"eval_samples_per_second": 125.149,
"eval_steps_per_second": 3.915,
"step": 36000
},
{
"epoch": 8.224425416854439,
"grad_norm": 2.186274290084839,
"learning_rate": 6.35e-05,
"loss": 0.519,
"step": 36500
},
{
"epoch": 8.224425416854439,
"eval_accuracy": 0.8821893135016634,
"eval_loss": 0.49259641766548157,
"eval_runtime": 221.6028,
"eval_samples_per_second": 124.791,
"eval_steps_per_second": 3.903,
"step": 36500
},
{
"epoch": 8.337088778729157,
"grad_norm": 1.6842992305755615,
"learning_rate": 6.3e-05,
"loss": 0.5142,
"step": 37000
},
{
"epoch": 8.337088778729157,
"eval_accuracy": 0.8831766874688345,
"eval_loss": 0.49090540409088135,
"eval_runtime": 221.5262,
"eval_samples_per_second": 124.834,
"eval_steps_per_second": 3.905,
"step": 37000
},
{
"epoch": 8.449752140603875,
"grad_norm": 1.69620943069458,
"learning_rate": 6.25e-05,
"loss": 0.5118,
"step": 37500
},
{
"epoch": 8.449752140603875,
"eval_accuracy": 0.8829115546267233,
"eval_loss": 0.48741188645362854,
"eval_runtime": 220.7417,
"eval_samples_per_second": 125.278,
"eval_steps_per_second": 3.919,
"step": 37500
},
{
"epoch": 8.562415502478594,
"grad_norm": 1.7428566217422485,
"learning_rate": 6.2e-05,
"loss": 0.513,
"step": 38000
},
{
"epoch": 8.562415502478594,
"eval_accuracy": 0.883224147742304,
"eval_loss": 0.4888308644294739,
"eval_runtime": 221.978,
"eval_samples_per_second": 124.58,
"eval_steps_per_second": 3.897,
"step": 38000
},
{
"epoch": 8.675078864353312,
"grad_norm": 1.7894220352172852,
"learning_rate": 6.15e-05,
"loss": 0.5158,
"step": 38500
},
{
"epoch": 8.675078864353312,
"eval_accuracy": 0.8831726096386725,
"eval_loss": 0.48892539739608765,
"eval_runtime": 221.6541,
"eval_samples_per_second": 124.762,
"eval_steps_per_second": 3.902,
"step": 38500
},
{
"epoch": 8.78774222622803,
"grad_norm": 1.8389184474945068,
"learning_rate": 6.1e-05,
"loss": 0.5074,
"step": 39000
},
{
"epoch": 8.78774222622803,
"eval_accuracy": 0.8836604617832933,
"eval_loss": 0.4904680550098419,
"eval_runtime": 220.5534,
"eval_samples_per_second": 125.385,
"eval_steps_per_second": 3.922,
"step": 39000
},
{
"epoch": 8.90040558810275,
"grad_norm": 1.6285669803619385,
"learning_rate": 6.05e-05,
"loss": 0.5115,
"step": 39500
},
{
"epoch": 8.90040558810275,
"eval_accuracy": 0.8834682744822249,
"eval_loss": 0.4873930513858795,
"eval_runtime": 221.8348,
"eval_samples_per_second": 124.66,
"eval_steps_per_second": 3.899,
"step": 39500
},
{
"epoch": 9.013068949977468,
"grad_norm": 1.892903447151184,
"learning_rate": 6e-05,
"loss": 0.506,
"step": 40000
},
{
"epoch": 9.013068949977468,
"eval_accuracy": 0.8844121521679462,
"eval_loss": 0.48553282022476196,
"eval_runtime": 221.0439,
"eval_samples_per_second": 125.106,
"eval_steps_per_second": 3.913,
"step": 40000
},
{
"epoch": 9.125732311852186,
"grad_norm": 1.5161460638046265,
"learning_rate": 5.95e-05,
"loss": 0.5006,
"step": 40500
},
{
"epoch": 9.125732311852186,
"eval_accuracy": 0.8845845321702142,
"eval_loss": 0.4854166805744171,
"eval_runtime": 221.5376,
"eval_samples_per_second": 124.828,
"eval_steps_per_second": 3.905,
"step": 40500
},
{
"epoch": 9.238395673726904,
"grad_norm": 1.6559338569641113,
"learning_rate": 5.9e-05,
"loss": 0.4998,
"step": 41000
},
{
"epoch": 9.238395673726904,
"eval_accuracy": 0.8846773258713508,
"eval_loss": 0.47937873005867004,
"eval_runtime": 221.7432,
"eval_samples_per_second": 124.712,
"eval_steps_per_second": 3.901,
"step": 41000
},
{
"epoch": 9.351059035601622,
"grad_norm": 1.5425843000411987,
"learning_rate": 5.85e-05,
"loss": 0.4993,
"step": 41500
},
{
"epoch": 9.351059035601622,
"eval_accuracy": 0.8852505184740784,
"eval_loss": 0.47994357347488403,
"eval_runtime": 220.8861,
"eval_samples_per_second": 125.196,
"eval_steps_per_second": 3.916,
"step": 41500
},
{
"epoch": 9.46372239747634,
"grad_norm": 1.6957345008850098,
"learning_rate": 5.8e-05,
"loss": 0.4978,
"step": 42000
},
{
"epoch": 9.46372239747634,
"eval_accuracy": 0.8847172732012654,
"eval_loss": 0.48131656646728516,
"eval_runtime": 222.0591,
"eval_samples_per_second": 124.534,
"eval_steps_per_second": 3.895,
"step": 42000
},
{
"epoch": 9.576385759351059,
"grad_norm": 1.9139741659164429,
"learning_rate": 5.7499999999999995e-05,
"loss": 0.4989,
"step": 42500
},
{
"epoch": 9.576385759351059,
"eval_accuracy": 0.8862352978048973,
"eval_loss": 0.4748667776584625,
"eval_runtime": 221.9766,
"eval_samples_per_second": 124.581,
"eval_steps_per_second": 3.897,
"step": 42500
},
{
"epoch": 9.689049121225777,
"grad_norm": 1.770585536956787,
"learning_rate": 5.6999999999999996e-05,
"loss": 0.4974,
"step": 43000
},
{
"epoch": 9.689049121225777,
"eval_accuracy": 0.8855722252421147,
"eval_loss": 0.4763648211956024,
"eval_runtime": 220.554,
"eval_samples_per_second": 125.384,
"eval_steps_per_second": 3.922,
"step": 43000
},
{
"epoch": 9.801712483100495,
"grad_norm": 1.6551371812820435,
"learning_rate": 5.65e-05,
"loss": 0.4978,
"step": 43500
},
{
"epoch": 9.801712483100495,
"eval_accuracy": 0.8858765050235756,
"eval_loss": 0.47770920395851135,
"eval_runtime": 221.8932,
"eval_samples_per_second": 124.628,
"eval_steps_per_second": 3.898,
"step": 43500
},
{
"epoch": 9.914375844975215,
"grad_norm": 1.6118969917297363,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.4942,
"step": 44000
},
{
"epoch": 9.914375844975215,
"eval_accuracy": 0.8865845660569847,
"eval_loss": 0.47676002979278564,
"eval_runtime": 221.7004,
"eval_samples_per_second": 124.736,
"eval_steps_per_second": 3.902,
"step": 44000
},
{
"epoch": 10.027039206849933,
"grad_norm": 1.8588035106658936,
"learning_rate": 5.550000000000001e-05,
"loss": 0.4955,
"step": 44500
},
{
"epoch": 10.027039206849933,
"eval_accuracy": 0.8870998796760368,
"eval_loss": 0.47594934701919556,
"eval_runtime": 221.976,
"eval_samples_per_second": 124.581,
"eval_steps_per_second": 3.897,
"step": 44500
},
{
"epoch": 10.139702568724651,
"grad_norm": 1.6966643333435059,
"learning_rate": 5.500000000000001e-05,
"loss": 0.489,
"step": 45000
},
{
"epoch": 10.139702568724651,
"eval_accuracy": 0.8869448016018396,
"eval_loss": 0.477344274520874,
"eval_runtime": 221.5008,
"eval_samples_per_second": 124.848,
"eval_steps_per_second": 3.905,
"step": 45000
},
{
"epoch": 10.25236593059937,
"grad_norm": 1.7615017890930176,
"learning_rate": 5.45e-05,
"loss": 0.4849,
"step": 45500
},
{
"epoch": 10.25236593059937,
"eval_accuracy": 0.8868306586288885,
"eval_loss": 0.4725435972213745,
"eval_runtime": 221.7608,
"eval_samples_per_second": 124.702,
"eval_steps_per_second": 3.901,
"step": 45500
},
{
"epoch": 10.365029292474087,
"grad_norm": 1.7889434099197388,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.4818,
"step": 46000
},
{
"epoch": 10.365029292474087,
"eval_accuracy": 0.887833900017014,
"eval_loss": 0.4671822190284729,
"eval_runtime": 222.1274,
"eval_samples_per_second": 124.496,
"eval_steps_per_second": 3.894,
"step": 46000
},
{
"epoch": 10.477692654348806,
"grad_norm": 1.7761868238449097,
"learning_rate": 5.3500000000000006e-05,
"loss": 0.4864,
"step": 46500
},
{
"epoch": 10.477692654348806,
"eval_accuracy": 0.887966177980069,
"eval_loss": 0.46516725420951843,
"eval_runtime": 221.4768,
"eval_samples_per_second": 124.862,
"eval_steps_per_second": 3.906,
"step": 46500
},
{
"epoch": 10.590356016223524,
"grad_norm": 1.7193918228149414,
"learning_rate": 5.300000000000001e-05,
"loss": 0.4854,
"step": 47000
},
{
"epoch": 10.590356016223524,
"eval_accuracy": 0.8878875431862944,
"eval_loss": 0.4649243652820587,
"eval_runtime": 221.9203,
"eval_samples_per_second": 124.612,
"eval_steps_per_second": 3.898,
"step": 47000
},
{
"epoch": 10.703019378098242,
"grad_norm": 1.681303858757019,
"learning_rate": 5.25e-05,
"loss": 0.4842,
"step": 47500
},
{
"epoch": 10.703019378098242,
"eval_accuracy": 0.8880860212733241,
"eval_loss": 0.4627833366394043,
"eval_runtime": 220.7325,
"eval_samples_per_second": 125.283,
"eval_steps_per_second": 3.919,
"step": 47500
},
{
"epoch": 10.81568273997296,
"grad_norm": 1.689483642578125,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.4853,
"step": 48000
},
{
"epoch": 10.81568273997296,
"eval_accuracy": 0.8884850427627177,
"eval_loss": 0.4670482873916626,
"eval_runtime": 222.2087,
"eval_samples_per_second": 124.451,
"eval_steps_per_second": 3.893,
"step": 48000
},
{
"epoch": 10.92834610184768,
"grad_norm": 1.6489872932434082,
"learning_rate": 5.1500000000000005e-05,
"loss": 0.4825,
"step": 48500
},
{
"epoch": 10.92834610184768,
"eval_accuracy": 0.8886944679602043,
"eval_loss": 0.4673362970352173,
"eval_runtime": 221.8366,
"eval_samples_per_second": 124.659,
"eval_steps_per_second": 3.899,
"step": 48500
},
{
"epoch": 11.041009463722398,
"grad_norm": 1.6207237243652344,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.4783,
"step": 49000
},
{
"epoch": 11.041009463722398,
"eval_accuracy": 0.8887642317859056,
"eval_loss": 0.46382275223731995,
"eval_runtime": 222.0203,
"eval_samples_per_second": 124.556,
"eval_steps_per_second": 3.896,
"step": 49000
},
{
"epoch": 11.153672825597116,
"grad_norm": 1.7849069833755493,
"learning_rate": 5.05e-05,
"loss": 0.4755,
"step": 49500
},
{
"epoch": 11.153672825597116,
"eval_accuracy": 0.8889988225245398,
"eval_loss": 0.4611697793006897,
"eval_runtime": 221.8086,
"eval_samples_per_second": 124.675,
"eval_steps_per_second": 3.9,
"step": 49500
},
{
"epoch": 11.266336187471834,
"grad_norm": 1.7341585159301758,
"learning_rate": 5e-05,
"loss": 0.4766,
"step": 50000
},
{
"epoch": 11.266336187471834,
"eval_accuracy": 0.8896719975387671,
"eval_loss": 0.45947107672691345,
"eval_runtime": 221.6153,
"eval_samples_per_second": 124.784,
"eval_steps_per_second": 3.903,
"step": 50000
},
{
"epoch": 11.378999549346553,
"grad_norm": 1.6157374382019043,
"learning_rate": 4.9500000000000004e-05,
"loss": 0.4758,
"step": 50500
},
{
"epoch": 11.378999549346553,
"eval_accuracy": 0.8899909483321304,
"eval_loss": 0.4591013193130493,
"eval_runtime": 221.8952,
"eval_samples_per_second": 124.626,
"eval_steps_per_second": 3.898,
"step": 50500
},
{
"epoch": 11.49166291122127,
"grad_norm": 1.3931312561035156,
"learning_rate": 4.9e-05,
"loss": 0.4749,
"step": 51000
},
{
"epoch": 11.49166291122127,
"eval_accuracy": 0.8898331334878133,
"eval_loss": 0.4599143862724304,
"eval_runtime": 221.5141,
"eval_samples_per_second": 124.841,
"eval_steps_per_second": 3.905,
"step": 51000
},
{
"epoch": 11.604326273095989,
"grad_norm": 1.5027562379837036,
"learning_rate": 4.85e-05,
"loss": 0.4696,
"step": 51500
},
{
"epoch": 11.604326273095989,
"eval_accuracy": 0.8903252192404275,
"eval_loss": 0.4557996988296509,
"eval_runtime": 221.4742,
"eval_samples_per_second": 124.863,
"eval_steps_per_second": 3.906,
"step": 51500
},
{
"epoch": 11.716989634970707,
"grad_norm": 2.007624864578247,
"learning_rate": 4.8e-05,
"loss": 0.4731,
"step": 52000
},
{
"epoch": 11.716989634970707,
"eval_accuracy": 0.8906047731898101,
"eval_loss": 0.4601598381996155,
"eval_runtime": 221.6161,
"eval_samples_per_second": 124.783,
"eval_steps_per_second": 3.903,
"step": 52000
},
{
"epoch": 11.829652996845425,
"grad_norm": 1.623124361038208,
"learning_rate": 4.75e-05,
"loss": 0.4705,
"step": 52500
},
{
"epoch": 11.829652996845425,
"eval_accuracy": 0.8907063641623542,
"eval_loss": 0.4568343460559845,
"eval_runtime": 221.7063,
"eval_samples_per_second": 124.733,
"eval_steps_per_second": 3.902,
"step": 52500
},
{
"epoch": 11.942316358720145,
"grad_norm": 1.7550790309906006,
"learning_rate": 4.7e-05,
"loss": 0.4712,
"step": 53000
},
{
"epoch": 11.942316358720145,
"eval_accuracy": 0.8906701808811146,
"eval_loss": 0.4544416666030884,
"eval_runtime": 221.7786,
"eval_samples_per_second": 124.692,
"eval_steps_per_second": 3.9,
"step": 53000
},
{
"epoch": 12.054979720594863,
"grad_norm": 1.8783979415893555,
"learning_rate": 4.6500000000000005e-05,
"loss": 0.4672,
"step": 53500
},
{
"epoch": 12.054979720594863,
"eval_accuracy": 0.8910758036453728,
"eval_loss": 0.45520085096359253,
"eval_runtime": 221.6441,
"eval_samples_per_second": 124.768,
"eval_steps_per_second": 3.903,
"step": 53500
},
{
"epoch": 12.167643082469581,
"grad_norm": 1.7316193580627441,
"learning_rate": 4.600000000000001e-05,
"loss": 0.4643,
"step": 54000
},
{
"epoch": 12.167643082469581,
"eval_accuracy": 0.8910305824271204,
"eval_loss": 0.4555051028728485,
"eval_runtime": 221.45,
"eval_samples_per_second": 124.877,
"eval_steps_per_second": 3.906,
"step": 54000
},
{
"epoch": 12.2803064443443,
"grad_norm": 1.6475858688354492,
"learning_rate": 4.55e-05,
"loss": 0.4634,
"step": 54500
},
{
"epoch": 12.2803064443443,
"eval_accuracy": 0.8916132904164534,
"eval_loss": 0.450579971075058,
"eval_runtime": 221.3289,
"eval_samples_per_second": 124.945,
"eval_steps_per_second": 3.908,
"step": 54500
},
{
"epoch": 12.392969806219018,
"grad_norm": 1.6666234731674194,
"learning_rate": 4.5e-05,
"loss": 0.4629,
"step": 55000
},
{
"epoch": 12.392969806219018,
"eval_accuracy": 0.8920182501631405,
"eval_loss": 0.4492991268634796,
"eval_runtime": 220.8234,
"eval_samples_per_second": 125.231,
"eval_steps_per_second": 3.917,
"step": 55000
},
{
"epoch": 12.505633168093736,
"grad_norm": 2.040255308151245,
"learning_rate": 4.4500000000000004e-05,
"loss": 0.4577,
"step": 55500
},
{
"epoch": 12.505633168093736,
"eval_accuracy": 0.8917890008025184,
"eval_loss": 0.45352259278297424,
"eval_runtime": 222.1131,
"eval_samples_per_second": 124.504,
"eval_steps_per_second": 3.894,
"step": 55500
},
{
"epoch": 12.618296529968454,
"grad_norm": 1.6200906038284302,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.4597,
"step": 56000
},
{
"epoch": 12.618296529968454,
"eval_accuracy": 0.8922171868098442,
"eval_loss": 0.45517975091934204,
"eval_runtime": 221.5067,
"eval_samples_per_second": 124.845,
"eval_steps_per_second": 3.905,
"step": 56000
},
{
"epoch": 12.730959891843172,
"grad_norm": 1.8632248640060425,
"learning_rate": 4.35e-05,
"loss": 0.4624,
"step": 56500
},
{
"epoch": 12.730959891843172,
"eval_accuracy": 0.8927531961352251,
"eval_loss": 0.44637200236320496,
"eval_runtime": 221.488,
"eval_samples_per_second": 124.856,
"eval_steps_per_second": 3.905,
"step": 56500
},
{
"epoch": 12.84362325371789,
"grad_norm": 1.6908427476882935,
"learning_rate": 4.3e-05,
"loss": 0.46,
"step": 57000
},
{
"epoch": 12.84362325371789,
"eval_accuracy": 0.8920658168036726,
"eval_loss": 0.44909459352493286,
"eval_runtime": 221.6242,
"eval_samples_per_second": 124.779,
"eval_steps_per_second": 3.903,
"step": 57000
},
{
"epoch": 12.95628661559261,
"grad_norm": 1.7786799669265747,
"learning_rate": 4.25e-05,
"loss": 0.4586,
"step": 57500
},
{
"epoch": 12.95628661559261,
"eval_accuracy": 0.8929494005257145,
"eval_loss": 0.4447159469127655,
"eval_runtime": 221.7737,
"eval_samples_per_second": 124.695,
"eval_steps_per_second": 3.9,
"step": 57500
},
{
"epoch": 13.068949977467328,
"grad_norm": 1.7628467082977295,
"learning_rate": 4.2e-05,
"loss": 0.4558,
"step": 58000
},
{
"epoch": 13.068949977467328,
"eval_accuracy": 0.8926064267317521,
"eval_loss": 0.4458833336830139,
"eval_runtime": 221.4227,
"eval_samples_per_second": 124.892,
"eval_steps_per_second": 3.907,
"step": 58000
},
{
"epoch": 13.181613339342046,
"grad_norm": 1.5658234357833862,
"learning_rate": 4.15e-05,
"loss": 0.4542,
"step": 58500
},
{
"epoch": 13.181613339342046,
"eval_accuracy": 0.8932430818063928,
"eval_loss": 0.4461354613304138,
"eval_runtime": 221.5061,
"eval_samples_per_second": 124.845,
"eval_steps_per_second": 3.905,
"step": 58500
},
{
"epoch": 13.294276701216765,
"grad_norm": 1.5327554941177368,
"learning_rate": 4.1e-05,
"loss": 0.455,
"step": 59000
},
{
"epoch": 13.294276701216765,
"eval_accuracy": 0.8931742171282963,
"eval_loss": 0.4385415017604828,
"eval_runtime": 220.6336,
"eval_samples_per_second": 125.339,
"eval_steps_per_second": 3.921,
"step": 59000
},
{
"epoch": 13.406940063091483,
"grad_norm": 1.804396390914917,
"learning_rate": 4.05e-05,
"loss": 0.4506,
"step": 59500
},
{
"epoch": 13.406940063091483,
"eval_accuracy": 0.8937421063264991,
"eval_loss": 0.4429979622364044,
"eval_runtime": 221.8018,
"eval_samples_per_second": 124.679,
"eval_steps_per_second": 3.9,
"step": 59500
},
{
"epoch": 13.519603424966201,
"grad_norm": 1.8558369874954224,
"learning_rate": 4e-05,
"loss": 0.4542,
"step": 60000
},
{
"epoch": 13.519603424966201,
"eval_accuracy": 0.8935920533223497,
"eval_loss": 0.4469524025917053,
"eval_runtime": 220.934,
"eval_samples_per_second": 125.169,
"eval_steps_per_second": 3.915,
"step": 60000
},
{
"epoch": 13.632266786840919,
"grad_norm": 1.7201515436172485,
"learning_rate": 3.9500000000000005e-05,
"loss": 0.4535,
"step": 60500
},
{
"epoch": 13.632266786840919,
"eval_accuracy": 0.8939343684906524,
"eval_loss": 0.44035276770591736,
"eval_runtime": 220.7553,
"eval_samples_per_second": 125.27,
"eval_steps_per_second": 3.918,
"step": 60500
},
{
"epoch": 13.744930148715637,
"grad_norm": 1.5173367261886597,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.4519,
"step": 61000
},
{
"epoch": 13.744930148715637,
"eval_accuracy": 0.8938849251143673,
"eval_loss": 0.44248726963996887,
"eval_runtime": 220.2187,
"eval_samples_per_second": 125.575,
"eval_steps_per_second": 3.928,
"step": 61000
},
{
"epoch": 13.857593510590355,
"grad_norm": 1.6886624097824097,
"learning_rate": 3.85e-05,
"loss": 0.4492,
"step": 61500
},
{
"epoch": 13.857593510590355,
"eval_accuracy": 0.8941456344925675,
"eval_loss": 0.44254130125045776,
"eval_runtime": 220.9103,
"eval_samples_per_second": 125.182,
"eval_steps_per_second": 3.916,
"step": 61500
},
{
"epoch": 13.970256872465074,
"grad_norm": 1.560421109199524,
"learning_rate": 3.8e-05,
"loss": 0.4495,
"step": 62000
},
{
"epoch": 13.970256872465074,
"eval_accuracy": 0.8943100925877457,
"eval_loss": 0.43967217206954956,
"eval_runtime": 221.8474,
"eval_samples_per_second": 124.653,
"eval_steps_per_second": 3.899,
"step": 62000
},
{
"epoch": 14.082920234339793,
"grad_norm": 2.146169662475586,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.4438,
"step": 62500
},
{
"epoch": 14.082920234339793,
"eval_accuracy": 0.8951792684420733,
"eval_loss": 0.43378108739852905,
"eval_runtime": 221.3932,
"eval_samples_per_second": 124.909,
"eval_steps_per_second": 3.907,
"step": 62500
},
{
"epoch": 14.195583596214512,
"grad_norm": 1.6352427005767822,
"learning_rate": 3.7e-05,
"loss": 0.4437,
"step": 63000
},
{
"epoch": 14.195583596214512,
"eval_accuracy": 0.8944950022699799,
"eval_loss": 0.4368970990180969,
"eval_runtime": 221.6512,
"eval_samples_per_second": 124.764,
"eval_steps_per_second": 3.903,
"step": 63000
},
{
"epoch": 14.30824695808923,
"grad_norm": 1.6570438146591187,
"learning_rate": 3.65e-05,
"loss": 0.4407,
"step": 63500
},
{
"epoch": 14.30824695808923,
"eval_accuracy": 0.8954035156096067,
"eval_loss": 0.43203291296958923,
"eval_runtime": 220.9732,
"eval_samples_per_second": 125.146,
"eval_steps_per_second": 3.915,
"step": 63500
},
{
"epoch": 14.420910319963948,
"grad_norm": 1.6666638851165771,
"learning_rate": 3.6e-05,
"loss": 0.4409,
"step": 64000
},
{
"epoch": 14.420910319963948,
"eval_accuracy": 0.8951364676803809,
"eval_loss": 0.43579936027526855,
"eval_runtime": 221.9618,
"eval_samples_per_second": 124.589,
"eval_steps_per_second": 3.897,
"step": 64000
},
{
"epoch": 14.533573681838666,
"grad_norm": 1.5540229082107544,
"learning_rate": 3.55e-05,
"loss": 0.4425,
"step": 64500
},
{
"epoch": 14.533573681838666,
"eval_accuracy": 0.8954575425427099,
"eval_loss": 0.432124525308609,
"eval_runtime": 222.2861,
"eval_samples_per_second": 124.407,
"eval_steps_per_second": 3.891,
"step": 64500
},
{
"epoch": 14.646237043713384,
"grad_norm": 1.6039586067199707,
"learning_rate": 3.5e-05,
"loss": 0.4375,
"step": 65000
},
{
"epoch": 14.646237043713384,
"eval_accuracy": 0.8959293125901446,
"eval_loss": 0.4307084083557129,
"eval_runtime": 221.3855,
"eval_samples_per_second": 124.913,
"eval_steps_per_second": 3.907,
"step": 65000
},
{
"epoch": 14.758900405588102,
"grad_norm": 1.4141193628311157,
"learning_rate": 3.45e-05,
"loss": 0.4412,
"step": 65500
},
{
"epoch": 14.758900405588102,
"eval_accuracy": 0.8955789560165742,
"eval_loss": 0.4335871934890747,
"eval_runtime": 220.8948,
"eval_samples_per_second": 125.191,
"eval_steps_per_second": 3.916,
"step": 65500
},
{
"epoch": 14.87156376746282,
"grad_norm": 1.479407548904419,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.4402,
"step": 66000
},
{
"epoch": 14.87156376746282,
"eval_accuracy": 0.8960790351082009,
"eval_loss": 0.428357869386673,
"eval_runtime": 221.3572,
"eval_samples_per_second": 124.929,
"eval_steps_per_second": 3.908,
"step": 66000
},
{
"epoch": 14.984227129337539,
"grad_norm": 1.6063992977142334,
"learning_rate": 3.35e-05,
"loss": 0.4386,
"step": 66500
},
{
"epoch": 14.984227129337539,
"eval_accuracy": 0.8961887828028311,
"eval_loss": 0.42679697275161743,
"eval_runtime": 220.1773,
"eval_samples_per_second": 125.599,
"eval_steps_per_second": 3.929,
"step": 66500
},
{
"epoch": 15.096890491212259,
"grad_norm": 1.7383469343185425,
"learning_rate": 3.3e-05,
"loss": 0.4342,
"step": 67000
},
{
"epoch": 15.096890491212259,
"eval_accuracy": 0.8963356120392529,
"eval_loss": 0.43058517575263977,
"eval_runtime": 221.0267,
"eval_samples_per_second": 125.116,
"eval_steps_per_second": 3.914,
"step": 67000
},
{
"epoch": 15.209553853086977,
"grad_norm": 1.4529184103012085,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.4355,
"step": 67500
},
{
"epoch": 15.209553853086977,
"eval_accuracy": 0.8963537523060265,
"eval_loss": 0.4319207966327667,
"eval_runtime": 221.0822,
"eval_samples_per_second": 125.085,
"eval_steps_per_second": 3.913,
"step": 67500
},
{
"epoch": 15.322217214961695,
"grad_norm": 1.5925979614257812,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.434,
"step": 68000
},
{
"epoch": 15.322217214961695,
"eval_accuracy": 0.8967275614111444,
"eval_loss": 0.4326106905937195,
"eval_runtime": 221.1171,
"eval_samples_per_second": 125.065,
"eval_steps_per_second": 3.912,
"step": 68000
},
{
"epoch": 15.434880576836413,
"grad_norm": 1.5591844320297241,
"learning_rate": 3.15e-05,
"loss": 0.4299,
"step": 68500
},
{
"epoch": 15.434880576836413,
"eval_accuracy": 0.8968720289079662,
"eval_loss": 0.42554253339767456,
"eval_runtime": 221.0599,
"eval_samples_per_second": 125.097,
"eval_steps_per_second": 3.913,
"step": 68500
},
{
"epoch": 15.547543938711131,
"grad_norm": 1.6964800357818604,
"learning_rate": 3.1e-05,
"loss": 0.4302,
"step": 69000
},
{
"epoch": 15.547543938711131,
"eval_accuracy": 0.8968983814075581,
"eval_loss": 0.43178391456604004,
"eval_runtime": 220.2723,
"eval_samples_per_second": 125.545,
"eval_steps_per_second": 3.927,
"step": 69000
},
{
"epoch": 15.66020730058585,
"grad_norm": 1.7176204919815063,
"learning_rate": 3.05e-05,
"loss": 0.4317,
"step": 69500
},
{
"epoch": 15.66020730058585,
"eval_accuracy": 0.8971937797880897,
"eval_loss": 0.42581045627593994,
"eval_runtime": 221.117,
"eval_samples_per_second": 125.065,
"eval_steps_per_second": 3.912,
"step": 69500
},
{
"epoch": 15.772870662460567,
"grad_norm": 1.4802976846694946,
"learning_rate": 3e-05,
"loss": 0.4335,
"step": 70000
},
{
"epoch": 15.772870662460567,
"eval_accuracy": 0.8972389358742884,
"eval_loss": 0.4227333068847656,
"eval_runtime": 220.2787,
"eval_samples_per_second": 125.541,
"eval_steps_per_second": 3.927,
"step": 70000
},
{
"epoch": 15.885534024335286,
"grad_norm": 1.4625871181488037,
"learning_rate": 2.95e-05,
"loss": 0.4313,
"step": 70500
},
{
"epoch": 15.885534024335286,
"eval_accuracy": 0.8974343061715017,
"eval_loss": 0.420085072517395,
"eval_runtime": 221.0709,
"eval_samples_per_second": 125.091,
"eval_steps_per_second": 3.913,
"step": 70500
},
{
"epoch": 15.998197386210004,
"grad_norm": 1.4574440717697144,
"learning_rate": 2.9e-05,
"loss": 0.4288,
"step": 71000
},
{
"epoch": 15.998197386210004,
"eval_accuracy": 0.8976216192291354,
"eval_loss": 0.42089083790779114,
"eval_runtime": 221.0359,
"eval_samples_per_second": 125.111,
"eval_steps_per_second": 3.913,
"step": 71000
},
{
"epoch": 16.110860748084722,
"grad_norm": 1.415560245513916,
"learning_rate": 2.8499999999999998e-05,
"loss": 0.4245,
"step": 71500
},
{
"epoch": 16.110860748084722,
"eval_accuracy": 0.8975563777935319,
"eval_loss": 0.42667824029922485,
"eval_runtime": 220.6503,
"eval_samples_per_second": 125.33,
"eval_steps_per_second": 3.92,
"step": 71500
},
{
"epoch": 16.223524109959442,
"grad_norm": 1.6393336057662964,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.4271,
"step": 72000
},
{
"epoch": 16.223524109959442,
"eval_accuracy": 0.8984646682572308,
"eval_loss": 0.4213043749332428,
"eval_runtime": 219.8857,
"eval_samples_per_second": 125.765,
"eval_steps_per_second": 3.934,
"step": 72000
},
{
"epoch": 16.336187471834158,
"grad_norm": 1.6446831226348877,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.4234,
"step": 72500
},
{
"epoch": 16.336187471834158,
"eval_accuracy": 0.8985838129375973,
"eval_loss": 0.42193278670310974,
"eval_runtime": 221.0608,
"eval_samples_per_second": 125.097,
"eval_steps_per_second": 3.913,
"step": 72500
},
{
"epoch": 16.448850833708878,
"grad_norm": 1.725674033164978,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.4251,
"step": 73000
},
{
"epoch": 16.448850833708878,
"eval_accuracy": 0.8987038252593214,
"eval_loss": 0.4171987771987915,
"eval_runtime": 220.8886,
"eval_samples_per_second": 125.194,
"eval_steps_per_second": 3.916,
"step": 73000
},
{
"epoch": 16.561514195583594,
"grad_norm": 1.5979257822036743,
"learning_rate": 2.6500000000000004e-05,
"loss": 0.4217,
"step": 73500
},
{
"epoch": 16.561514195583594,
"eval_accuracy": 0.8987482542802462,
"eval_loss": 0.418377161026001,
"eval_runtime": 221.121,
"eval_samples_per_second": 125.063,
"eval_steps_per_second": 3.912,
"step": 73500
},
{
"epoch": 16.674177557458314,
"grad_norm": 1.4892100095748901,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.4205,
"step": 74000
},
{
"epoch": 16.674177557458314,
"eval_accuracy": 0.8989114915507669,
"eval_loss": 0.4165091812610626,
"eval_runtime": 222.5924,
"eval_samples_per_second": 124.236,
"eval_steps_per_second": 3.886,
"step": 74000
},
{
"epoch": 16.786840919333034,
"grad_norm": 1.4461849927902222,
"learning_rate": 2.5500000000000003e-05,
"loss": 0.4228,
"step": 74500
},
{
"epoch": 16.786840919333034,
"eval_accuracy": 0.8989125542022752,
"eval_loss": 0.4175247848033905,
"eval_runtime": 222.3981,
"eval_samples_per_second": 124.345,
"eval_steps_per_second": 3.889,
"step": 74500
},
{
"epoch": 16.89950428120775,
"grad_norm": 1.768370509147644,
"learning_rate": 2.5e-05,
"loss": 0.421,
"step": 75000
},
{
"epoch": 16.89950428120775,
"eval_accuracy": 0.8991065894891168,
"eval_loss": 0.41619065403938293,
"eval_runtime": 222.6024,
"eval_samples_per_second": 124.23,
"eval_steps_per_second": 3.886,
"step": 75000
},
{
"epoch": 17.01216764308247,
"grad_norm": 1.4250850677490234,
"learning_rate": 2.45e-05,
"loss": 0.4178,
"step": 75500
},
{
"epoch": 17.01216764308247,
"eval_accuracy": 0.8994014895612595,
"eval_loss": 0.4117368161678314,
"eval_runtime": 222.14,
"eval_samples_per_second": 124.489,
"eval_steps_per_second": 3.894,
"step": 75500
},
{
"epoch": 17.124831004957187,
"grad_norm": 1.4036965370178223,
"learning_rate": 2.4e-05,
"loss": 0.4176,
"step": 76000
},
{
"epoch": 17.124831004957187,
"eval_accuracy": 0.8995830389073786,
"eval_loss": 0.4121379852294922,
"eval_runtime": 222.2839,
"eval_samples_per_second": 124.408,
"eval_steps_per_second": 3.891,
"step": 76000
},
{
"epoch": 17.237494366831907,
"grad_norm": 1.395093321800232,
"learning_rate": 2.35e-05,
"loss": 0.4172,
"step": 76500
},
{
"epoch": 17.237494366831907,
"eval_accuracy": 0.8998577766066815,
"eval_loss": 0.41285398602485657,
"eval_runtime": 222.2267,
"eval_samples_per_second": 124.44,
"eval_steps_per_second": 3.892,
"step": 76500
},
{
"epoch": 17.350157728706623,
"grad_norm": 1.5492697954177856,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.4133,
"step": 77000
},
{
"epoch": 17.350157728706623,
"eval_accuracy": 0.8992890854661668,
"eval_loss": 0.41492369771003723,
"eval_runtime": 219.0021,
"eval_samples_per_second": 126.273,
"eval_steps_per_second": 3.95,
"step": 77000
},
{
"epoch": 17.462821090581343,
"grad_norm": 1.4863234758377075,
"learning_rate": 2.25e-05,
"loss": 0.4166,
"step": 77500
},
{
"epoch": 17.462821090581343,
"eval_accuracy": 0.8995439142560963,
"eval_loss": 0.41370296478271484,
"eval_runtime": 220.4874,
"eval_samples_per_second": 125.422,
"eval_steps_per_second": 3.923,
"step": 77500
},
{
"epoch": 17.57548445245606,
"grad_norm": 1.8134657144546509,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.4167,
"step": 78000
},
{
"epoch": 17.57548445245606,
"eval_accuracy": 0.8998953243247179,
"eval_loss": 0.4118014872074127,
"eval_runtime": 220.8002,
"eval_samples_per_second": 125.244,
"eval_steps_per_second": 3.918,
"step": 78000
},
{
"epoch": 17.68814781433078,
"grad_norm": 1.7903392314910889,
"learning_rate": 2.15e-05,
"loss": 0.4164,
"step": 78500
},
{
"epoch": 17.68814781433078,
"eval_accuracy": 0.9001628949311502,
"eval_loss": 0.4123002886772156,
"eval_runtime": 219.63,
"eval_samples_per_second": 125.912,
"eval_steps_per_second": 3.938,
"step": 78500
},
{
"epoch": 17.8008111762055,
"grad_norm": 1.6216607093811035,
"learning_rate": 2.1e-05,
"loss": 0.4143,
"step": 79000
},
{
"epoch": 17.8008111762055,
"eval_accuracy": 0.9001079811521843,
"eval_loss": 0.40997758507728577,
"eval_runtime": 219.8198,
"eval_samples_per_second": 125.803,
"eval_steps_per_second": 3.935,
"step": 79000
},
{
"epoch": 17.913474538080216,
"grad_norm": 1.5128173828125,
"learning_rate": 2.05e-05,
"loss": 0.4136,
"step": 79500
},
{
"epoch": 17.913474538080216,
"eval_accuracy": 0.9006287821890727,
"eval_loss": 0.41052308678627014,
"eval_runtime": 219.697,
"eval_samples_per_second": 125.873,
"eval_steps_per_second": 3.937,
"step": 79500
},
{
"epoch": 18.026137899954936,
"grad_norm": 1.413712978363037,
"learning_rate": 2e-05,
"loss": 0.4132,
"step": 80000
},
{
"epoch": 18.026137899954936,
"eval_accuracy": 0.9007660373895346,
"eval_loss": 0.4081571400165558,
"eval_runtime": 220.6703,
"eval_samples_per_second": 125.318,
"eval_steps_per_second": 3.92,
"step": 80000
},
{
"epoch": 18.138801261829652,
"grad_norm": 1.7320311069488525,
"learning_rate": 1.9500000000000003e-05,
"loss": 0.4102,
"step": 80500
},
{
"epoch": 18.138801261829652,
"eval_accuracy": 0.9009554825729237,
"eval_loss": 0.407240092754364,
"eval_runtime": 221.4224,
"eval_samples_per_second": 124.893,
"eval_steps_per_second": 3.907,
"step": 80500
},
{
"epoch": 18.251464623704372,
"grad_norm": 1.8033103942871094,
"learning_rate": 1.9e-05,
"loss": 0.4097,
"step": 81000
},
{
"epoch": 18.251464623704372,
"eval_accuracy": 0.9008985457774398,
"eval_loss": 0.4109956920146942,
"eval_runtime": 221.1014,
"eval_samples_per_second": 125.074,
"eval_steps_per_second": 3.912,
"step": 81000
},
{
"epoch": 18.36412798557909,
"grad_norm": 1.8222883939743042,
"learning_rate": 1.85e-05,
"loss": 0.4085,
"step": 81500
},
{
"epoch": 18.36412798557909,
"eval_accuracy": 0.9007539025464132,
"eval_loss": 0.4095366299152374,
"eval_runtime": 220.8203,
"eval_samples_per_second": 125.233,
"eval_steps_per_second": 3.917,
"step": 81500
},
{
"epoch": 18.47679134745381,
"grad_norm": 1.4663125276565552,
"learning_rate": 1.8e-05,
"loss": 0.4105,
"step": 82000
},
{
"epoch": 18.47679134745381,
"eval_accuracy": 0.9014532811520996,
"eval_loss": 0.4047625958919525,
"eval_runtime": 219.6263,
"eval_samples_per_second": 125.914,
"eval_steps_per_second": 3.939,
"step": 82000
},
{
"epoch": 18.589454709328525,
"grad_norm": 1.8482975959777832,
"learning_rate": 1.75e-05,
"loss": 0.4096,
"step": 82500
},
{
"epoch": 18.589454709328525,
"eval_accuracy": 0.9010233806097327,
"eval_loss": 0.4072835445404053,
"eval_runtime": 220.7586,
"eval_samples_per_second": 125.268,
"eval_steps_per_second": 3.918,
"step": 82500
},
{
"epoch": 18.702118071203245,
"grad_norm": 1.4483723640441895,
"learning_rate": 1.7000000000000003e-05,
"loss": 0.4041,
"step": 83000
},
{
"epoch": 18.702118071203245,
"eval_accuracy": 0.9015295597674521,
"eval_loss": 0.4039141833782196,
"eval_runtime": 220.7392,
"eval_samples_per_second": 125.279,
"eval_steps_per_second": 3.919,
"step": 83000
},
{
"epoch": 18.814781433077965,
"grad_norm": 1.6040253639221191,
"learning_rate": 1.65e-05,
"loss": 0.4062,
"step": 83500
},
{
"epoch": 18.814781433077965,
"eval_accuracy": 0.9016612318058135,
"eval_loss": 0.40488725900650024,
"eval_runtime": 221.3884,
"eval_samples_per_second": 124.912,
"eval_steps_per_second": 3.907,
"step": 83500
},
{
"epoch": 18.92744479495268,
"grad_norm": 1.3560248613357544,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.4045,
"step": 84000
},
{
"epoch": 18.92744479495268,
"eval_accuracy": 0.9015874866980568,
"eval_loss": 0.4032597243785858,
"eval_runtime": 221.9037,
"eval_samples_per_second": 124.622,
"eval_steps_per_second": 3.898,
"step": 84000
},
{
"epoch": 19.0401081568274,
"grad_norm": 1.6236895322799683,
"learning_rate": 1.55e-05,
"loss": 0.4038,
"step": 84500
},
{
"epoch": 19.0401081568274,
"eval_accuracy": 0.901710217516976,
"eval_loss": 0.4084183871746063,
"eval_runtime": 220.8431,
"eval_samples_per_second": 125.22,
"eval_steps_per_second": 3.917,
"step": 84500
},
{
"epoch": 19.152771518702117,
"grad_norm": 1.6514983177185059,
"learning_rate": 1.5e-05,
"loss": 0.4037,
"step": 85000
},
{
"epoch": 19.152771518702117,
"eval_accuracy": 0.9016946022320732,
"eval_loss": 0.4033704102039337,
"eval_runtime": 221.6212,
"eval_samples_per_second": 124.78,
"eval_steps_per_second": 3.903,
"step": 85000
},
{
"epoch": 19.265434880576837,
"grad_norm": 1.3684407472610474,
"learning_rate": 1.45e-05,
"loss": 0.4022,
"step": 85500
},
{
"epoch": 19.265434880576837,
"eval_accuracy": 0.9021324676993308,
"eval_loss": 0.40617531538009644,
"eval_runtime": 221.8256,
"eval_samples_per_second": 124.666,
"eval_steps_per_second": 3.899,
"step": 85500
},
{
"epoch": 19.378098242451554,
"grad_norm": 1.592301607131958,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.4059,
"step": 86000
},
{
"epoch": 19.378098242451554,
"eval_accuracy": 0.902363044454423,
"eval_loss": 0.3991073668003082,
"eval_runtime": 220.8011,
"eval_samples_per_second": 125.244,
"eval_steps_per_second": 3.918,
"step": 86000
},
{
"epoch": 19.490761604326273,
"grad_norm": 1.5463926792144775,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.4013,
"step": 86500
},
{
"epoch": 19.490761604326273,
"eval_accuracy": 0.9023868906868481,
"eval_loss": 0.39859089255332947,
"eval_runtime": 220.6504,
"eval_samples_per_second": 125.329,
"eval_steps_per_second": 3.92,
"step": 86500
},
{
"epoch": 19.60342496620099,
"grad_norm": 1.6952037811279297,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.4004,
"step": 87000
},
{
"epoch": 19.60342496620099,
"eval_accuracy": 0.9029012333672634,
"eval_loss": 0.4017859995365143,
"eval_runtime": 220.6857,
"eval_samples_per_second": 125.309,
"eval_steps_per_second": 3.92,
"step": 87000
},
{
"epoch": 19.71608832807571,
"grad_norm": 1.5156389474868774,
"learning_rate": 1.25e-05,
"loss": 0.4023,
"step": 87500
},
{
"epoch": 19.71608832807571,
"eval_accuracy": 0.9022691715502759,
"eval_loss": 0.40082216262817383,
"eval_runtime": 220.7786,
"eval_samples_per_second": 125.257,
"eval_steps_per_second": 3.918,
"step": 87500
},
{
"epoch": 19.82875168995043,
"grad_norm": 1.5951709747314453,
"learning_rate": 1.2e-05,
"loss": 0.3987,
"step": 88000
},
{
"epoch": 19.82875168995043,
"eval_accuracy": 0.9028266490406112,
"eval_loss": 0.4010894000530243,
"eval_runtime": 220.1664,
"eval_samples_per_second": 125.605,
"eval_steps_per_second": 3.929,
"step": 88000
},
{
"epoch": 19.941415051825146,
"grad_norm": 1.4990533590316772,
"learning_rate": 1.1500000000000002e-05,
"loss": 0.3935,
"step": 88500
},
{
"epoch": 19.941415051825146,
"eval_accuracy": 0.9027395900326748,
"eval_loss": 0.401162326335907,
"eval_runtime": 220.111,
"eval_samples_per_second": 125.637,
"eval_steps_per_second": 3.93,
"step": 88500
},
{
"epoch": 20.054078413699866,
"grad_norm": 1.5961695909500122,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.3978,
"step": 89000
},
{
"epoch": 20.054078413699866,
"eval_accuracy": 0.902977115716753,
"eval_loss": 0.3981638252735138,
"eval_runtime": 219.6972,
"eval_samples_per_second": 125.873,
"eval_steps_per_second": 3.937,
"step": 89000
},
{
"epoch": 20.166741775574582,
"grad_norm": 1.5186184644699097,
"learning_rate": 1.05e-05,
"loss": 0.4012,
"step": 89500
},
{
"epoch": 20.166741775574582,
"eval_accuracy": 0.9029895131243953,
"eval_loss": 0.39535069465637207,
"eval_runtime": 220.4034,
"eval_samples_per_second": 125.47,
"eval_steps_per_second": 3.925,
"step": 89500
},
{
"epoch": 20.279405137449302,
"grad_norm": 1.7340284585952759,
"learning_rate": 1e-05,
"loss": 0.3954,
"step": 90000
},
{
"epoch": 20.279405137449302,
"eval_accuracy": 0.9031845731573412,
"eval_loss": 0.3972371816635132,
"eval_runtime": 220.9175,
"eval_samples_per_second": 125.178,
"eval_steps_per_second": 3.915,
"step": 90000
},
{
"epoch": 20.39206849932402,
"grad_norm": 1.4601465463638306,
"learning_rate": 9.5e-06,
"loss": 0.3933,
"step": 90500
},
{
"epoch": 20.39206849932402,
"eval_accuracy": 0.9031702530935091,
"eval_loss": 0.39749225974082947,
"eval_runtime": 220.5834,
"eval_samples_per_second": 125.368,
"eval_steps_per_second": 3.921,
"step": 90500
},
{
"epoch": 20.50473186119874,
"grad_norm": 1.6822484731674194,
"learning_rate": 9e-06,
"loss": 0.3985,
"step": 91000
},
{
"epoch": 20.50473186119874,
"eval_accuracy": 0.903283638473266,
"eval_loss": 0.39412999153137207,
"eval_runtime": 220.402,
"eval_samples_per_second": 125.471,
"eval_steps_per_second": 3.925,
"step": 91000
},
{
"epoch": 20.617395223073455,
"grad_norm": 1.5493133068084717,
"learning_rate": 8.500000000000002e-06,
"loss": 0.3952,
"step": 91500
},
{
"epoch": 20.617395223073455,
"eval_accuracy": 0.9031870870760611,
"eval_loss": 0.39998504519462585,
"eval_runtime": 219.1703,
"eval_samples_per_second": 126.176,
"eval_steps_per_second": 3.947,
"step": 91500
},
{
"epoch": 20.730058584948175,
"grad_norm": 1.6142163276672363,
"learning_rate": 8.000000000000001e-06,
"loss": 0.395,
"step": 92000
},
{
"epoch": 20.730058584948175,
"eval_accuracy": 0.9037042508521438,
"eval_loss": 0.39454683661460876,
"eval_runtime": 220.1482,
"eval_samples_per_second": 125.615,
"eval_steps_per_second": 3.929,
"step": 92000
},
{
"epoch": 20.842721946822895,
"grad_norm": 1.3768945932388306,
"learning_rate": 7.5e-06,
"loss": 0.3925,
"step": 92500
},
{
"epoch": 20.842721946822895,
"eval_accuracy": 0.9035520393735632,
"eval_loss": 0.3969292640686035,
"eval_runtime": 218.8787,
"eval_samples_per_second": 126.344,
"eval_steps_per_second": 3.952,
"step": 92500
},
{
"epoch": 20.95538530869761,
"grad_norm": 1.8161870241165161,
"learning_rate": 7.000000000000001e-06,
"loss": 0.3911,
"step": 93000
},
{
"epoch": 20.95538530869761,
"eval_accuracy": 0.9034115695768419,
"eval_loss": 0.39153432846069336,
"eval_runtime": 219.6974,
"eval_samples_per_second": 125.873,
"eval_steps_per_second": 3.937,
"step": 93000
},
{
"epoch": 21.06804867057233,
"grad_norm": 1.7550774812698364,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.3927,
"step": 93500
},
{
"epoch": 21.06804867057233,
"eval_accuracy": 0.9035121668560334,
"eval_loss": 0.39775171875953674,
"eval_runtime": 221.0095,
"eval_samples_per_second": 125.126,
"eval_steps_per_second": 3.914,
"step": 93500
},
{
"epoch": 21.180712032447047,
"grad_norm": 1.5582369565963745,
"learning_rate": 6e-06,
"loss": 0.3891,
"step": 94000
},
{
"epoch": 21.180712032447047,
"eval_accuracy": 0.9037201879273532,
"eval_loss": 0.3943246006965637,
"eval_runtime": 220.3117,
"eval_samples_per_second": 125.522,
"eval_steps_per_second": 3.926,
"step": 94000
},
{
"epoch": 21.293375394321767,
"grad_norm": 1.6729559898376465,
"learning_rate": 5.500000000000001e-06,
"loss": 0.3912,
"step": 94500
},
{
"epoch": 21.293375394321767,
"eval_accuracy": 0.9036670141570837,
"eval_loss": 0.39444249868392944,
"eval_runtime": 219.9471,
"eval_samples_per_second": 125.73,
"eval_steps_per_second": 3.933,
"step": 94500
},
{
"epoch": 21.406038756196484,
"grad_norm": 1.6871699094772339,
"learning_rate": 5e-06,
"loss": 0.3908,
"step": 95000
},
{
"epoch": 21.406038756196484,
"eval_accuracy": 0.9037736297217607,
"eval_loss": 0.39369192719459534,
"eval_runtime": 219.9205,
"eval_samples_per_second": 125.745,
"eval_steps_per_second": 3.933,
"step": 95000
},
{
"epoch": 21.518702118071204,
"grad_norm": 1.486741304397583,
"learning_rate": 4.5e-06,
"loss": 0.3902,
"step": 95500
},
{
"epoch": 21.518702118071204,
"eval_accuracy": 0.9034302972672164,
"eval_loss": 0.39573636651039124,
"eval_runtime": 219.8759,
"eval_samples_per_second": 125.771,
"eval_steps_per_second": 3.934,
"step": 95500
},
{
"epoch": 21.63136547994592,
"grad_norm": 1.8056081533432007,
"learning_rate": 4.000000000000001e-06,
"loss": 0.3891,
"step": 96000
},
{
"epoch": 21.63136547994592,
"eval_accuracy": 0.9045647365783699,
"eval_loss": 0.39023157954216003,
"eval_runtime": 221.3034,
"eval_samples_per_second": 124.96,
"eval_steps_per_second": 3.909,
"step": 96000
},
{
"epoch": 21.74402884182064,
"grad_norm": 1.552370309829712,
"learning_rate": 3.5000000000000004e-06,
"loss": 0.3894,
"step": 96500
},
{
"epoch": 21.74402884182064,
"eval_accuracy": 0.9044615558398447,
"eval_loss": 0.39400991797447205,
"eval_runtime": 219.8746,
"eval_samples_per_second": 125.772,
"eval_steps_per_second": 3.934,
"step": 96500
},
{
"epoch": 21.85669220369536,
"grad_norm": 1.506536841392517,
"learning_rate": 3e-06,
"loss": 0.3904,
"step": 97000
},
{
"epoch": 21.85669220369536,
"eval_accuracy": 0.9044962394479266,
"eval_loss": 0.390458881855011,
"eval_runtime": 220.131,
"eval_samples_per_second": 125.625,
"eval_steps_per_second": 3.929,
"step": 97000
},
{
"epoch": 21.969355565570076,
"grad_norm": 1.6080279350280762,
"learning_rate": 2.5e-06,
"loss": 0.3882,
"step": 97500
},
{
"epoch": 21.969355565570076,
"eval_accuracy": 0.9043700852475594,
"eval_loss": 0.39395132660865784,
"eval_runtime": 220.1175,
"eval_samples_per_second": 125.633,
"eval_steps_per_second": 3.93,
"step": 97500
},
{
"epoch": 22.082018927444796,
"grad_norm": 1.6551542282104492,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.388,
"step": 98000
},
{
"epoch": 22.082018927444796,
"eval_accuracy": 0.904642958920198,
"eval_loss": 0.39477479457855225,
"eval_runtime": 219.108,
"eval_samples_per_second": 126.212,
"eval_steps_per_second": 3.948,
"step": 98000
},
{
"epoch": 22.194682289319513,
"grad_norm": 1.3376331329345703,
"learning_rate": 1.5e-06,
"loss": 0.3888,
"step": 98500
},
{
"epoch": 22.194682289319513,
"eval_accuracy": 0.9042594879589607,
"eval_loss": 0.39155128598213196,
"eval_runtime": 221.2476,
"eval_samples_per_second": 124.991,
"eval_steps_per_second": 3.91,
"step": 98500
},
{
"epoch": 22.307345651194233,
"grad_norm": 1.6391901969909668,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.385,
"step": 99000
},
{
"epoch": 22.307345651194233,
"eval_accuracy": 0.9047423169505552,
"eval_loss": 0.3867943286895752,
"eval_runtime": 220.9463,
"eval_samples_per_second": 125.162,
"eval_steps_per_second": 3.915,
"step": 99000
}
],
"logging_steps": 500,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 23,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.346992290195046e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}