{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.5728105382319644,
"eval_steps": 250,
"global_step": 25000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005145621076463929,
"grad_norm": 10.373374938964844,
"learning_rate": 9.600000000000001e-06,
"loss": 4.8488,
"step": 50
},
{
"epoch": 0.010291242152927859,
"grad_norm": 11.32767391204834,
"learning_rate": 1.9600000000000002e-05,
"loss": 2.2402,
"step": 100
},
{
"epoch": 0.015436863229391787,
"grad_norm": 7.465405464172363,
"learning_rate": 1.9990110229731125e-05,
"loss": 1.8204,
"step": 150
},
{
"epoch": 0.020582484305855717,
"grad_norm": 11.595544815063477,
"learning_rate": 1.9979808385701043e-05,
"loss": 1.7765,
"step": 200
},
{
"epoch": 0.025728105382319646,
"grad_norm": 15.59911060333252,
"learning_rate": 1.996950654167096e-05,
"loss": 1.7482,
"step": 250
},
{
"epoch": 0.025728105382319646,
"eval_accuracy": 0.9227184653282166,
"eval_loss": 1.1625392436981201,
"eval_runtime": 418.704,
"eval_samples_per_second": 82.514,
"eval_steps_per_second": 2.579,
"step": 250
},
{
"epoch": 0.030873726458783574,
"grad_norm": 7.875217914581299,
"learning_rate": 1.995920469764088e-05,
"loss": 1.7821,
"step": 300
},
{
"epoch": 0.036019347535247506,
"grad_norm": 11.855839729309082,
"learning_rate": 1.99489028536108e-05,
"loss": 1.6761,
"step": 350
},
{
"epoch": 0.041164968611711435,
"grad_norm": 7.26309061050415,
"learning_rate": 1.9938601009580717e-05,
"loss": 1.4887,
"step": 400
},
{
"epoch": 0.04631058968817536,
"grad_norm": 8.920960426330566,
"learning_rate": 1.9928299165550635e-05,
"loss": 1.6001,
"step": 450
},
{
"epoch": 0.05145621076463929,
"grad_norm": 7.441282749176025,
"learning_rate": 1.9917997321520554e-05,
"loss": 1.7426,
"step": 500
},
{
"epoch": 0.05145621076463929,
"eval_accuracy": 0.9316622614860535,
"eval_loss": 1.1087791919708252,
"eval_runtime": 417.9867,
"eval_samples_per_second": 82.656,
"eval_steps_per_second": 2.584,
"step": 500
},
{
"epoch": 0.05660183184110322,
"grad_norm": 5.284599781036377,
"learning_rate": 1.9907695477490472e-05,
"loss": 1.5562,
"step": 550
},
{
"epoch": 0.06174745291756715,
"grad_norm": 9.429953575134277,
"learning_rate": 1.989739363346039e-05,
"loss": 1.6811,
"step": 600
},
{
"epoch": 0.06689307399403108,
"grad_norm": 13.159565925598145,
"learning_rate": 1.988709178943031e-05,
"loss": 1.5994,
"step": 650
},
{
"epoch": 0.07203869507049501,
"grad_norm": 6.322610378265381,
"learning_rate": 1.9876789945400228e-05,
"loss": 1.5981,
"step": 700
},
{
"epoch": 0.07718431614695893,
"grad_norm": 13.995536804199219,
"learning_rate": 1.9866488101370146e-05,
"loss": 1.5713,
"step": 750
},
{
"epoch": 0.07718431614695893,
"eval_accuracy": 0.9368722438812256,
"eval_loss": 1.0817046165466309,
"eval_runtime": 418.366,
"eval_samples_per_second": 82.581,
"eval_steps_per_second": 2.581,
"step": 750
},
{
"epoch": 0.08232993722342287,
"grad_norm": 10.23471450805664,
"learning_rate": 1.9856186257340068e-05,
"loss": 1.6516,
"step": 800
},
{
"epoch": 0.08747555829988679,
"grad_norm": 9.448832511901855,
"learning_rate": 1.9845884413309983e-05,
"loss": 1.5768,
"step": 850
},
{
"epoch": 0.09262117937635073,
"grad_norm": 9.08662223815918,
"learning_rate": 1.9835582569279905e-05,
"loss": 1.5902,
"step": 900
},
{
"epoch": 0.09776680045281465,
"grad_norm": 9.396324157714844,
"learning_rate": 1.982528072524982e-05,
"loss": 1.4613,
"step": 950
},
{
"epoch": 0.10291242152927858,
"grad_norm": 6.668619632720947,
"learning_rate": 1.9814978881219738e-05,
"loss": 1.6295,
"step": 1000
},
{
"epoch": 0.10291242152927858,
"eval_accuracy": 0.9374222159385681,
"eval_loss": 1.0676764249801636,
"eval_runtime": 418.3645,
"eval_samples_per_second": 82.581,
"eval_steps_per_second": 2.581,
"step": 1000
},
{
"epoch": 0.10805804260574252,
"grad_norm": 12.718502044677734,
"learning_rate": 1.980467703718966e-05,
"loss": 1.5301,
"step": 1050
},
{
"epoch": 0.11320366368220644,
"grad_norm": 6.540752410888672,
"learning_rate": 1.9794375193159575e-05,
"loss": 1.6072,
"step": 1100
},
{
"epoch": 0.11834928475867038,
"grad_norm": 11.122970581054688,
"learning_rate": 1.9784073349129497e-05,
"loss": 1.4644,
"step": 1150
},
{
"epoch": 0.1234949058351343,
"grad_norm": 9.239429473876953,
"learning_rate": 1.9773771505099415e-05,
"loss": 1.6331,
"step": 1200
},
{
"epoch": 0.12864052691159822,
"grad_norm": 7.708181858062744,
"learning_rate": 1.9763469661069334e-05,
"loss": 1.5464,
"step": 1250
},
{
"epoch": 0.12864052691159822,
"eval_accuracy": 0.9407797455787659,
"eval_loss": 1.0546813011169434,
"eval_runtime": 418.2896,
"eval_samples_per_second": 82.596,
"eval_steps_per_second": 2.582,
"step": 1250
},
{
"epoch": 0.13378614798806215,
"grad_norm": 8.665009498596191,
"learning_rate": 1.9753167817039252e-05,
"loss": 1.5406,
"step": 1300
},
{
"epoch": 0.1389317690645261,
"grad_norm": 8.5620756149292,
"learning_rate": 1.974286597300917e-05,
"loss": 1.5471,
"step": 1350
},
{
"epoch": 0.14407739014099002,
"grad_norm": 10.859761238098145,
"learning_rate": 1.973256412897909e-05,
"loss": 1.6685,
"step": 1400
},
{
"epoch": 0.14922301121745396,
"grad_norm": 6.381153106689453,
"learning_rate": 1.9722262284949007e-05,
"loss": 1.5644,
"step": 1450
},
{
"epoch": 0.15436863229391787,
"grad_norm": 3.7411134243011475,
"learning_rate": 1.9711960440918926e-05,
"loss": 1.6587,
"step": 1500
},
{
"epoch": 0.15436863229391787,
"eval_accuracy": 0.9419954419136047,
"eval_loss": 1.0589897632598877,
"eval_runtime": 418.2875,
"eval_samples_per_second": 82.596,
"eval_steps_per_second": 2.582,
"step": 1500
},
{
"epoch": 0.1595142533703818,
"grad_norm": 7.128472328186035,
"learning_rate": 1.9701658596888844e-05,
"loss": 1.5793,
"step": 1550
},
{
"epoch": 0.16465987444684574,
"grad_norm": 7.605306625366211,
"learning_rate": 1.9691356752858763e-05,
"loss": 1.4877,
"step": 1600
},
{
"epoch": 0.16980549552330967,
"grad_norm": 10.431309700012207,
"learning_rate": 1.9681054908828684e-05,
"loss": 1.5781,
"step": 1650
},
{
"epoch": 0.17495111659977358,
"grad_norm": 7.39127254486084,
"learning_rate": 1.96707530647986e-05,
"loss": 1.5081,
"step": 1700
},
{
"epoch": 0.18009673767623752,
"grad_norm": 8.452966690063477,
"learning_rate": 1.9660451220768518e-05,
"loss": 1.5434,
"step": 1750
},
{
"epoch": 0.18009673767623752,
"eval_accuracy": 0.9395930171012878,
"eval_loss": 1.0563884973526,
"eval_runtime": 418.2001,
"eval_samples_per_second": 82.614,
"eval_steps_per_second": 2.582,
"step": 1750
},
{
"epoch": 0.18524235875270145,
"grad_norm": 11.400819778442383,
"learning_rate": 1.965014937673844e-05,
"loss": 1.4617,
"step": 1800
},
{
"epoch": 0.1903879798291654,
"grad_norm": 2.2198734283447266,
"learning_rate": 1.9639847532708355e-05,
"loss": 1.4531,
"step": 1850
},
{
"epoch": 0.1955336009056293,
"grad_norm": 9.084458351135254,
"learning_rate": 1.9629545688678276e-05,
"loss": 1.5713,
"step": 1900
},
{
"epoch": 0.20067922198209323,
"grad_norm": 9.126426696777344,
"learning_rate": 1.961924384464819e-05,
"loss": 1.5166,
"step": 1950
},
{
"epoch": 0.20582484305855717,
"grad_norm": 18.784996032714844,
"learning_rate": 1.9608942000618113e-05,
"loss": 1.4771,
"step": 2000
},
{
"epoch": 0.20582484305855717,
"eval_accuracy": 0.943095326423645,
"eval_loss": 1.0343589782714844,
"eval_runtime": 418.12,
"eval_samples_per_second": 82.629,
"eval_steps_per_second": 2.583,
"step": 2000
},
{
"epoch": 0.2109704641350211,
"grad_norm": 4.355062961578369,
"learning_rate": 1.9598640156588032e-05,
"loss": 1.4706,
"step": 2050
},
{
"epoch": 0.21611608521148504,
"grad_norm": 7.846619606018066,
"learning_rate": 1.958833831255795e-05,
"loss": 1.5276,
"step": 2100
},
{
"epoch": 0.22126170628794894,
"grad_norm": 11.4408597946167,
"learning_rate": 1.957803646852787e-05,
"loss": 1.4002,
"step": 2150
},
{
"epoch": 0.22640732736441288,
"grad_norm": 10.485089302062988,
"learning_rate": 1.9567734624497787e-05,
"loss": 1.5605,
"step": 2200
},
{
"epoch": 0.23155294844087682,
"grad_norm": 5.763485431671143,
"learning_rate": 1.9557432780467705e-05,
"loss": 1.4871,
"step": 2250
},
{
"epoch": 0.23155294844087682,
"eval_accuracy": 0.9440793991088867,
"eval_loss": 1.035501480102539,
"eval_runtime": 418.1856,
"eval_samples_per_second": 82.616,
"eval_steps_per_second": 2.583,
"step": 2250
},
{
"epoch": 0.23669856951734075,
"grad_norm": 6.086212635040283,
"learning_rate": 1.9547130936437624e-05,
"loss": 1.56,
"step": 2300
},
{
"epoch": 0.24184419059380466,
"grad_norm": 10.038729667663574,
"learning_rate": 1.9536829092407542e-05,
"loss": 1.4322,
"step": 2350
},
{
"epoch": 0.2469898116702686,
"grad_norm": 8.869370460510254,
"learning_rate": 1.952652724837746e-05,
"loss": 1.4682,
"step": 2400
},
{
"epoch": 0.25213543274673256,
"grad_norm": 9.509527206420898,
"learning_rate": 1.951622540434738e-05,
"loss": 1.4375,
"step": 2450
},
{
"epoch": 0.25728105382319644,
"grad_norm": 14.525392532348633,
"learning_rate": 1.95061295971979e-05,
"loss": 1.4499,
"step": 2500
},
{
"epoch": 0.25728105382319644,
"eval_accuracy": 0.9433557987213135,
"eval_loss": 1.030641794204712,
"eval_runtime": 418.9422,
"eval_samples_per_second": 82.467,
"eval_steps_per_second": 2.578,
"step": 2500
},
{
"epoch": 0.2624266748996604,
"grad_norm": 10.151646614074707,
"learning_rate": 1.949582775316782e-05,
"loss": 1.5088,
"step": 2550
},
{
"epoch": 0.2675722959761243,
"grad_norm": 9.49378776550293,
"learning_rate": 1.948552590913774e-05,
"loss": 1.5577,
"step": 2600
},
{
"epoch": 0.27271791705258824,
"grad_norm": 11.339292526245117,
"learning_rate": 1.9475224065107657e-05,
"loss": 1.4221,
"step": 2650
},
{
"epoch": 0.2778635381290522,
"grad_norm": 6.203045845031738,
"learning_rate": 1.9464922221077572e-05,
"loss": 1.5105,
"step": 2700
},
{
"epoch": 0.2830091592055161,
"grad_norm": 8.609308242797852,
"learning_rate": 1.9454620377047494e-05,
"loss": 1.4681,
"step": 2750
},
{
"epoch": 0.2830091592055161,
"eval_accuracy": 0.9453240633010864,
"eval_loss": 1.0219130516052246,
"eval_runtime": 418.4116,
"eval_samples_per_second": 82.572,
"eval_steps_per_second": 2.581,
"step": 2750
},
{
"epoch": 0.28815478028198005,
"grad_norm": 5.019013404846191,
"learning_rate": 1.9444318533017412e-05,
"loss": 1.4354,
"step": 2800
},
{
"epoch": 0.293300401358444,
"grad_norm": 11.37190055847168,
"learning_rate": 1.943401668898733e-05,
"loss": 1.4982,
"step": 2850
},
{
"epoch": 0.2984460224349079,
"grad_norm": 4.953094005584717,
"learning_rate": 1.942371484495725e-05,
"loss": 1.5374,
"step": 2900
},
{
"epoch": 0.3035916435113718,
"grad_norm": 12.19895076751709,
"learning_rate": 1.9413413000927167e-05,
"loss": 1.4769,
"step": 2950
},
{
"epoch": 0.30873726458783574,
"grad_norm": 6.096263408660889,
"learning_rate": 1.9403111156897086e-05,
"loss": 1.5767,
"step": 3000
},
{
"epoch": 0.30873726458783574,
"eval_accuracy": 0.9450345635414124,
"eval_loss": 1.0167551040649414,
"eval_runtime": 419.6576,
"eval_samples_per_second": 82.327,
"eval_steps_per_second": 2.574,
"step": 3000
},
{
"epoch": 0.31388288566429967,
"grad_norm": 5.119435787200928,
"learning_rate": 1.9392809312867004e-05,
"loss": 1.3712,
"step": 3050
},
{
"epoch": 0.3190285067407636,
"grad_norm": 3.375337600708008,
"learning_rate": 1.9382507468836923e-05,
"loss": 1.4979,
"step": 3100
},
{
"epoch": 0.32417412781722754,
"grad_norm": 12.87149715423584,
"learning_rate": 1.937220562480684e-05,
"loss": 1.4633,
"step": 3150
},
{
"epoch": 0.3293197488936915,
"grad_norm": 8.743680000305176,
"learning_rate": 1.936190378077676e-05,
"loss": 1.5025,
"step": 3200
},
{
"epoch": 0.3344653699701554,
"grad_norm": 7.334400653839111,
"learning_rate": 1.935160193674668e-05,
"loss": 1.5206,
"step": 3250
},
{
"epoch": 0.3344653699701554,
"eval_accuracy": 0.9457292556762695,
"eval_loss": 1.016142725944519,
"eval_runtime": 419.0299,
"eval_samples_per_second": 82.45,
"eval_steps_per_second": 2.577,
"step": 3250
},
{
"epoch": 0.33961099104661935,
"grad_norm": 15.349682807922363,
"learning_rate": 1.9341300092716596e-05,
"loss": 1.5119,
"step": 3300
},
{
"epoch": 0.3447566121230833,
"grad_norm": 10.006840705871582,
"learning_rate": 1.9330998248686518e-05,
"loss": 1.6285,
"step": 3350
},
{
"epoch": 0.34990223319954716,
"grad_norm": 10.248564720153809,
"learning_rate": 1.9320696404656437e-05,
"loss": 1.4421,
"step": 3400
},
{
"epoch": 0.3550478542760111,
"grad_norm": 10.41620922088623,
"learning_rate": 1.931039456062635e-05,
"loss": 1.4866,
"step": 3450
},
{
"epoch": 0.36019347535247503,
"grad_norm": 11.682571411132812,
"learning_rate": 1.9300092716596273e-05,
"loss": 1.4651,
"step": 3500
},
{
"epoch": 0.36019347535247503,
"eval_accuracy": 0.9464818239212036,
"eval_loss": 1.0084654092788696,
"eval_runtime": 418.3089,
"eval_samples_per_second": 82.592,
"eval_steps_per_second": 2.582,
"step": 3500
},
{
"epoch": 0.36533909642893897,
"grad_norm": 4.340071678161621,
"learning_rate": 1.9289790872566192e-05,
"loss": 1.3777,
"step": 3550
},
{
"epoch": 0.3704847175054029,
"grad_norm": 5.201279163360596,
"learning_rate": 1.927948902853611e-05,
"loss": 1.5256,
"step": 3600
},
{
"epoch": 0.37563033858186684,
"grad_norm": 13.030069351196289,
"learning_rate": 1.926918718450603e-05,
"loss": 1.358,
"step": 3650
},
{
"epoch": 0.3807759596583308,
"grad_norm": 6.489394664764404,
"learning_rate": 1.9259091377356548e-05,
"loss": 1.4384,
"step": 3700
},
{
"epoch": 0.3859215807347947,
"grad_norm": 9.361295700073242,
"learning_rate": 1.9248789533326466e-05,
"loss": 1.4847,
"step": 3750
},
{
"epoch": 0.3859215807347947,
"eval_accuracy": 0.9460765719413757,
"eval_loss": 1.0093164443969727,
"eval_runtime": 418.9794,
"eval_samples_per_second": 82.46,
"eval_steps_per_second": 2.578,
"step": 3750
},
{
"epoch": 0.3910672018112586,
"grad_norm": 5.094785213470459,
"learning_rate": 1.9238487689296385e-05,
"loss": 1.327,
"step": 3800
},
{
"epoch": 0.3962128228877225,
"grad_norm": 2.1664323806762695,
"learning_rate": 1.9228185845266307e-05,
"loss": 1.4463,
"step": 3850
},
{
"epoch": 0.40135844396418646,
"grad_norm": 2.0584053993225098,
"learning_rate": 1.921788400123622e-05,
"loss": 1.3179,
"step": 3900
},
{
"epoch": 0.4065040650406504,
"grad_norm": 7.324461936950684,
"learning_rate": 1.920758215720614e-05,
"loss": 1.4312,
"step": 3950
},
{
"epoch": 0.41164968611711433,
"grad_norm": 13.046509742736816,
"learning_rate": 1.9197280313176062e-05,
"loss": 1.4179,
"step": 4000
},
{
"epoch": 0.41164968611711433,
"eval_accuracy": 0.9460186958312988,
"eval_loss": 1.0144544839859009,
"eval_runtime": 418.3617,
"eval_samples_per_second": 82.582,
"eval_steps_per_second": 2.581,
"step": 4000
},
{
"epoch": 0.41679530719357827,
"grad_norm": 4.131565093994141,
"learning_rate": 1.9186978469145977e-05,
"loss": 1.4828,
"step": 4050
},
{
"epoch": 0.4219409282700422,
"grad_norm": 14.746273040771484,
"learning_rate": 1.91766766251159e-05,
"loss": 1.4568,
"step": 4100
},
{
"epoch": 0.42708654934650614,
"grad_norm": 9.327777862548828,
"learning_rate": 1.9166374781085817e-05,
"loss": 1.4921,
"step": 4150
},
{
"epoch": 0.4322321704229701,
"grad_norm": 11.792791366577148,
"learning_rate": 1.9156072937055735e-05,
"loss": 1.4485,
"step": 4200
},
{
"epoch": 0.43737779149943395,
"grad_norm": 9.524967193603516,
"learning_rate": 1.9145771093025654e-05,
"loss": 1.4908,
"step": 4250
},
{
"epoch": 0.43737779149943395,
"eval_accuracy": 0.9477843046188354,
"eval_loss": 1.0120937824249268,
"eval_runtime": 418.3734,
"eval_samples_per_second": 82.579,
"eval_steps_per_second": 2.581,
"step": 4250
},
{
"epoch": 0.4425234125758979,
"grad_norm": 6.7903361320495605,
"learning_rate": 1.9135469248995572e-05,
"loss": 1.295,
"step": 4300
},
{
"epoch": 0.4476690336523618,
"grad_norm": 7.337329387664795,
"learning_rate": 1.912516740496549e-05,
"loss": 1.4687,
"step": 4350
},
{
"epoch": 0.45281465472882576,
"grad_norm": 8.49622631072998,
"learning_rate": 1.911486556093541e-05,
"loss": 1.3846,
"step": 4400
},
{
"epoch": 0.4579602758052897,
"grad_norm": 4.385276794433594,
"learning_rate": 1.9104563716905328e-05,
"loss": 1.4704,
"step": 4450
},
{
"epoch": 0.46310589688175363,
"grad_norm": 9.520790100097656,
"learning_rate": 1.9094261872875246e-05,
"loss": 1.3646,
"step": 4500
},
{
"epoch": 0.46310589688175363,
"eval_accuracy": 0.9479579925537109,
"eval_loss": 1.0055809020996094,
"eval_runtime": 419.9183,
"eval_samples_per_second": 82.276,
"eval_steps_per_second": 2.572,
"step": 4500
},
{
"epoch": 0.46825151795821757,
"grad_norm": 11.194172859191895,
"learning_rate": 1.9083960028845164e-05,
"loss": 1.4779,
"step": 4550
},
{
"epoch": 0.4733971390346815,
"grad_norm": 8.098811149597168,
"learning_rate": 1.9073658184815083e-05,
"loss": 1.4581,
"step": 4600
},
{
"epoch": 0.47854276011114544,
"grad_norm": 3.986377477645874,
"learning_rate": 1.9063356340785e-05,
"loss": 1.3786,
"step": 4650
},
{
"epoch": 0.4836883811876093,
"grad_norm": 7.204378128051758,
"learning_rate": 1.9053054496754923e-05,
"loss": 1.56,
"step": 4700
},
{
"epoch": 0.48883400226407325,
"grad_norm": 6.332393169403076,
"learning_rate": 1.9042752652724838e-05,
"loss": 1.4334,
"step": 4750
},
{
"epoch": 0.48883400226407325,
"eval_accuracy": 0.947523832321167,
"eval_loss": 1.0032474994659424,
"eval_runtime": 418.1822,
"eval_samples_per_second": 82.617,
"eval_steps_per_second": 2.583,
"step": 4750
},
{
"epoch": 0.4939796233405372,
"grad_norm": 5.785167694091797,
"learning_rate": 1.9032450808694756e-05,
"loss": 1.3877,
"step": 4800
},
{
"epoch": 0.4991252444170011,
"grad_norm": 2.13838529586792,
"learning_rate": 1.9022148964664678e-05,
"loss": 1.3485,
"step": 4850
},
{
"epoch": 0.5042708654934651,
"grad_norm": 5.9960618019104,
"learning_rate": 1.9011847120634593e-05,
"loss": 1.4509,
"step": 4900
},
{
"epoch": 0.509416486569929,
"grad_norm": 3.8960604667663574,
"learning_rate": 1.9001545276604515e-05,
"loss": 1.3693,
"step": 4950
},
{
"epoch": 0.5145621076463929,
"grad_norm": 5.9818115234375,
"learning_rate": 1.8991243432574434e-05,
"loss": 1.5226,
"step": 5000
},
{
"epoch": 0.5145621076463929,
"eval_accuracy": 0.9477264285087585,
"eval_loss": 0.9975742101669312,
"eval_runtime": 418.0863,
"eval_samples_per_second": 82.636,
"eval_steps_per_second": 2.583,
"step": 5000
},
{
"epoch": 0.5197077287228569,
"grad_norm": 9.811531066894531,
"learning_rate": 1.8980941588544352e-05,
"loss": 1.4423,
"step": 5050
},
{
"epoch": 0.5248533497993207,
"grad_norm": 4.812816619873047,
"learning_rate": 1.897063974451427e-05,
"loss": 1.4191,
"step": 5100
},
{
"epoch": 0.5299989708757847,
"grad_norm": 7.36176872253418,
"learning_rate": 1.896033790048419e-05,
"loss": 1.5109,
"step": 5150
},
{
"epoch": 0.5351445919522486,
"grad_norm": 12.9472017288208,
"learning_rate": 1.8950036056454107e-05,
"loss": 1.4509,
"step": 5200
},
{
"epoch": 0.5402902130287126,
"grad_norm": 6.96859073638916,
"learning_rate": 1.8939734212424026e-05,
"loss": 1.4351,
"step": 5250
},
{
"epoch": 0.5402902130287126,
"eval_accuracy": 0.9485947489738464,
"eval_loss": 1.000069499015808,
"eval_runtime": 418.3411,
"eval_samples_per_second": 82.586,
"eval_steps_per_second": 2.582,
"step": 5250
},
{
"epoch": 0.5454358341051765,
"grad_norm": 11.015822410583496,
"learning_rate": 1.8929432368393944e-05,
"loss": 1.3868,
"step": 5300
},
{
"epoch": 0.5505814551816405,
"grad_norm": 7.5497050285339355,
"learning_rate": 1.8919130524363863e-05,
"loss": 1.4339,
"step": 5350
},
{
"epoch": 0.5557270762581044,
"grad_norm": 11.352765083312988,
"learning_rate": 1.890882868033378e-05,
"loss": 1.365,
"step": 5400
},
{
"epoch": 0.5608726973345682,
"grad_norm": 10.072178840637207,
"learning_rate": 1.88985268363037e-05,
"loss": 1.44,
"step": 5450
},
{
"epoch": 0.5660183184110322,
"grad_norm": 5.5806803703308105,
"learning_rate": 1.8888224992273618e-05,
"loss": 1.2895,
"step": 5500
},
{
"epoch": 0.5660183184110322,
"eval_accuracy": 0.9490578770637512,
"eval_loss": 1.0065183639526367,
"eval_runtime": 418.105,
"eval_samples_per_second": 82.632,
"eval_steps_per_second": 2.583,
"step": 5500
},
{
"epoch": 0.5711639394874961,
"grad_norm": 12.096445083618164,
"learning_rate": 1.8877923148243536e-05,
"loss": 1.4253,
"step": 5550
},
{
"epoch": 0.5763095605639601,
"grad_norm": 6.126964569091797,
"learning_rate": 1.8867621304213455e-05,
"loss": 1.4438,
"step": 5600
},
{
"epoch": 0.581455181640424,
"grad_norm": 10.5121488571167,
"learning_rate": 1.8857319460183373e-05,
"loss": 1.3543,
"step": 5650
},
{
"epoch": 0.586600802716888,
"grad_norm": 4.3215227127075195,
"learning_rate": 1.8847017616153295e-05,
"loss": 1.5587,
"step": 5700
},
{
"epoch": 0.5917464237933519,
"grad_norm": 6.327254295349121,
"learning_rate": 1.883671577212321e-05,
"loss": 1.342,
"step": 5750
},
{
"epoch": 0.5917464237933519,
"eval_accuracy": 0.9487684369087219,
"eval_loss": 0.9927480816841125,
"eval_runtime": 418.6843,
"eval_samples_per_second": 82.518,
"eval_steps_per_second": 2.58,
"step": 5750
},
{
"epoch": 0.5968920448698158,
"grad_norm": 9.713254928588867,
"learning_rate": 1.882641392809313e-05,
"loss": 1.4503,
"step": 5800
},
{
"epoch": 0.6020376659462797,
"grad_norm": 5.628683090209961,
"learning_rate": 1.881611208406305e-05,
"loss": 1.4045,
"step": 5850
},
{
"epoch": 0.6071832870227436,
"grad_norm": 11.369056701660156,
"learning_rate": 1.8805810240032965e-05,
"loss": 1.4092,
"step": 5900
},
{
"epoch": 0.6123289080992076,
"grad_norm": 2.5842366218566895,
"learning_rate": 1.8795508396002887e-05,
"loss": 1.3318,
"step": 5950
},
{
"epoch": 0.6174745291756715,
"grad_norm": 11.178747177124023,
"learning_rate": 1.8785206551972805e-05,
"loss": 1.416,
"step": 6000
},
{
"epoch": 0.6174745291756715,
"eval_accuracy": 0.9503603577613831,
"eval_loss": 0.9909718632698059,
"eval_runtime": 418.5904,
"eval_samples_per_second": 82.537,
"eval_steps_per_second": 2.58,
"step": 6000
},
{
"epoch": 0.6226201502521355,
"grad_norm": 7.2673115730285645,
"learning_rate": 1.8774904707942724e-05,
"loss": 1.5132,
"step": 6050
},
{
"epoch": 0.6277657713285993,
"grad_norm": 4.217124938964844,
"learning_rate": 1.8764602863912642e-05,
"loss": 1.3275,
"step": 6100
},
{
"epoch": 0.6329113924050633,
"grad_norm": 2.112212896347046,
"learning_rate": 1.875430101988256e-05,
"loss": 1.4595,
"step": 6150
},
{
"epoch": 0.6380570134815272,
"grad_norm": 5.421743392944336,
"learning_rate": 1.874399917585248e-05,
"loss": 1.5112,
"step": 6200
},
{
"epoch": 0.6432026345579912,
"grad_norm": 9.458545684814453,
"learning_rate": 1.8733697331822397e-05,
"loss": 1.4435,
"step": 6250
},
{
"epoch": 0.6432026345579912,
"eval_accuracy": 0.9514892101287842,
"eval_loss": 0.9927791357040405,
"eval_runtime": 418.0438,
"eval_samples_per_second": 82.644,
"eval_steps_per_second": 2.583,
"step": 6250
},
{
"epoch": 0.6483482556344551,
"grad_norm": 8.746692657470703,
"learning_rate": 1.8723395487792316e-05,
"loss": 1.4268,
"step": 6300
},
{
"epoch": 0.653493876710919,
"grad_norm": 6.339073657989502,
"learning_rate": 1.8713093643762234e-05,
"loss": 1.5071,
"step": 6350
},
{
"epoch": 0.658639497787383,
"grad_norm": 10.726541519165039,
"learning_rate": 1.8702791799732153e-05,
"loss": 1.3817,
"step": 6400
},
{
"epoch": 0.6637851188638468,
"grad_norm": 6.412696361541748,
"learning_rate": 1.869248995570207e-05,
"loss": 1.5101,
"step": 6450
},
{
"epoch": 0.6689307399403108,
"grad_norm": 8.011473655700684,
"learning_rate": 1.868218811167199e-05,
"loss": 1.4014,
"step": 6500
},
{
"epoch": 0.6689307399403108,
"eval_accuracy": 0.9489710330963135,
"eval_loss": 0.9953876733779907,
"eval_runtime": 418.8528,
"eval_samples_per_second": 82.485,
"eval_steps_per_second": 2.578,
"step": 6500
},
{
"epoch": 0.6740763610167747,
"grad_norm": 2.333108901977539,
"learning_rate": 1.867188626764191e-05,
"loss": 1.2797,
"step": 6550
},
{
"epoch": 0.6792219820932387,
"grad_norm": 13.239009857177734,
"learning_rate": 1.8661584423611826e-05,
"loss": 1.3829,
"step": 6600
},
{
"epoch": 0.6843676031697026,
"grad_norm": 7.554291248321533,
"learning_rate": 1.8651282579581745e-05,
"loss": 1.4907,
"step": 6650
},
{
"epoch": 0.6895132242461666,
"grad_norm": 8.046769142150879,
"learning_rate": 1.8640980735551667e-05,
"loss": 1.4098,
"step": 6700
},
{
"epoch": 0.6946588453226304,
"grad_norm": 3.5291695594787598,
"learning_rate": 1.863067889152158e-05,
"loss": 1.482,
"step": 6750
},
{
"epoch": 0.6946588453226304,
"eval_accuracy": 0.9492025971412659,
"eval_loss": 0.9936777949333191,
"eval_runtime": 418.0424,
"eval_samples_per_second": 82.645,
"eval_steps_per_second": 2.583,
"step": 6750
},
{
"epoch": 0.6998044663990943,
"grad_norm": 5.854330539703369,
"learning_rate": 1.8620377047491503e-05,
"loss": 1.3779,
"step": 6800
},
{
"epoch": 0.7049500874755583,
"grad_norm": 9.476693153381348,
"learning_rate": 1.8610075203461422e-05,
"loss": 1.3791,
"step": 6850
},
{
"epoch": 0.7100957085520222,
"grad_norm": 9.646202087402344,
"learning_rate": 1.859977335943134e-05,
"loss": 1.5183,
"step": 6900
},
{
"epoch": 0.7152413296284862,
"grad_norm": 3.42673397064209,
"learning_rate": 1.858947151540126e-05,
"loss": 1.4022,
"step": 6950
},
{
"epoch": 0.7203869507049501,
"grad_norm": 9.239468574523926,
"learning_rate": 1.8579169671371177e-05,
"loss": 1.544,
"step": 7000
},
{
"epoch": 0.7203869507049501,
"eval_accuracy": 0.9508234858512878,
"eval_loss": 0.9934782385826111,
"eval_runtime": 418.095,
"eval_samples_per_second": 82.634,
"eval_steps_per_second": 2.583,
"step": 7000
},
{
"epoch": 0.7255325717814141,
"grad_norm": 5.876420021057129,
"learning_rate": 1.8568867827341096e-05,
"loss": 1.4566,
"step": 7050
},
{
"epoch": 0.7306781928578779,
"grad_norm": 2.191608190536499,
"learning_rate": 1.8558565983311014e-05,
"loss": 1.4641,
"step": 7100
},
{
"epoch": 0.7358238139343419,
"grad_norm": 10.467001914978027,
"learning_rate": 1.8548264139280932e-05,
"loss": 1.4208,
"step": 7150
},
{
"epoch": 0.7409694350108058,
"grad_norm": 9.560342788696289,
"learning_rate": 1.853796229525085e-05,
"loss": 1.3391,
"step": 7200
},
{
"epoch": 0.7461150560872697,
"grad_norm": 10.074899673461914,
"learning_rate": 1.852766045122077e-05,
"loss": 1.5002,
"step": 7250
},
{
"epoch": 0.7461150560872697,
"eval_accuracy": 0.9496946334838867,
"eval_loss": 0.9860528707504272,
"eval_runtime": 418.2736,
"eval_samples_per_second": 82.599,
"eval_steps_per_second": 2.582,
"step": 7250
},
{
"epoch": 0.7512606771637337,
"grad_norm": 5.140987873077393,
"learning_rate": 1.851735860719069e-05,
"loss": 1.2985,
"step": 7300
},
{
"epoch": 0.7564062982401976,
"grad_norm": 4.276757717132568,
"learning_rate": 1.8507056763160606e-05,
"loss": 1.5496,
"step": 7350
},
{
"epoch": 0.7615519193166616,
"grad_norm": 8.268556594848633,
"learning_rate": 1.8496754919130528e-05,
"loss": 1.5046,
"step": 7400
},
{
"epoch": 0.7666975403931254,
"grad_norm": 7.343358516693115,
"learning_rate": 1.8486453075100443e-05,
"loss": 1.3687,
"step": 7450
},
{
"epoch": 0.7718431614695894,
"grad_norm": 5.345001220703125,
"learning_rate": 1.847615123107036e-05,
"loss": 1.3841,
"step": 7500
},
{
"epoch": 0.7718431614695894,
"eval_accuracy": 0.9501287937164307,
"eval_loss": 0.9868325591087341,
"eval_runtime": 418.192,
"eval_samples_per_second": 82.615,
"eval_steps_per_second": 2.583,
"step": 7500
},
{
"epoch": 0.7769887825460533,
"grad_norm": 11.624256134033203,
"learning_rate": 1.8465849387040283e-05,
"loss": 1.3996,
"step": 7550
},
{
"epoch": 0.7821344036225172,
"grad_norm": 6.849825859069824,
"learning_rate": 1.8455547543010198e-05,
"loss": 1.5112,
"step": 7600
},
{
"epoch": 0.7872800246989812,
"grad_norm": 9.704992294311523,
"learning_rate": 1.844524569898012e-05,
"loss": 1.4335,
"step": 7650
},
{
"epoch": 0.792425645775445,
"grad_norm": 5.669846534729004,
"learning_rate": 1.843494385495004e-05,
"loss": 1.3867,
"step": 7700
},
{
"epoch": 0.797571266851909,
"grad_norm": 6.519596099853516,
"learning_rate": 1.8424642010919957e-05,
"loss": 1.3865,
"step": 7750
},
{
"epoch": 0.797571266851909,
"eval_accuracy": 0.9511418342590332,
"eval_loss": 0.986303448677063,
"eval_runtime": 418.5501,
"eval_samples_per_second": 82.544,
"eval_steps_per_second": 2.58,
"step": 7750
},
{
"epoch": 0.8027168879283729,
"grad_norm": 7.500890731811523,
"learning_rate": 1.8414340166889875e-05,
"loss": 1.4039,
"step": 7800
},
{
"epoch": 0.8078625090048369,
"grad_norm": 8.141263961791992,
"learning_rate": 1.8404038322859794e-05,
"loss": 1.379,
"step": 7850
},
{
"epoch": 0.8130081300813008,
"grad_norm": 8.75843620300293,
"learning_rate": 1.8393736478829712e-05,
"loss": 1.3459,
"step": 7900
},
{
"epoch": 0.8181537511577648,
"grad_norm": 9.22071647644043,
"learning_rate": 1.838343463479963e-05,
"loss": 1.3996,
"step": 7950
},
{
"epoch": 0.8232993722342287,
"grad_norm": 5.6009345054626465,
"learning_rate": 1.837313279076955e-05,
"loss": 1.4151,
"step": 8000
},
{
"epoch": 0.8232993722342287,
"eval_accuracy": 0.9510839581489563,
"eval_loss": 0.9821743369102478,
"eval_runtime": 418.1692,
"eval_samples_per_second": 82.62,
"eval_steps_per_second": 2.583,
"step": 8000
},
{
"epoch": 0.8284449933106925,
"grad_norm": 6.377861976623535,
"learning_rate": 1.8362830946739467e-05,
"loss": 1.3745,
"step": 8050
},
{
"epoch": 0.8335906143871565,
"grad_norm": 6.7617902755737305,
"learning_rate": 1.8352529102709386e-05,
"loss": 1.4404,
"step": 8100
},
{
"epoch": 0.8387362354636204,
"grad_norm": 10.52645492553711,
"learning_rate": 1.8342227258679308e-05,
"loss": 1.4776,
"step": 8150
},
{
"epoch": 0.8438818565400844,
"grad_norm": 7.829946517944336,
"learning_rate": 1.8331925414649223e-05,
"loss": 1.398,
"step": 8200
},
{
"epoch": 0.8490274776165483,
"grad_norm": 6.536490440368652,
"learning_rate": 1.832162357061914e-05,
"loss": 1.4482,
"step": 8250
},
{
"epoch": 0.8490274776165483,
"eval_accuracy": 0.9505919218063354,
"eval_loss": 0.9802690744400024,
"eval_runtime": 417.9193,
"eval_samples_per_second": 82.669,
"eval_steps_per_second": 2.584,
"step": 8250
},
{
"epoch": 0.8541730986930123,
"grad_norm": 8.002507209777832,
"learning_rate": 1.8311321726589063e-05,
"loss": 1.4551,
"step": 8300
},
{
"epoch": 0.8593187197694762,
"grad_norm": 10.97170352935791,
"learning_rate": 1.8301019882558978e-05,
"loss": 1.46,
"step": 8350
},
{
"epoch": 0.8644643408459401,
"grad_norm": 9.144811630249023,
"learning_rate": 1.82907180385289e-05,
"loss": 1.5179,
"step": 8400
},
{
"epoch": 0.869609961922404,
"grad_norm": 11.398577690124512,
"learning_rate": 1.8280416194498818e-05,
"loss": 1.4067,
"step": 8450
},
{
"epoch": 0.8747555829988679,
"grad_norm": 8.858057022094727,
"learning_rate": 1.8270320387349337e-05,
"loss": 1.4393,
"step": 8500
},
{
"epoch": 0.8747555829988679,
"eval_accuracy": 0.9503893256187439,
"eval_loss": 0.9808804392814636,
"eval_runtime": 418.2126,
"eval_samples_per_second": 82.611,
"eval_steps_per_second": 2.582,
"step": 8500
},
{
"epoch": 0.8799012040753319,
"grad_norm": 9.175712585449219,
"learning_rate": 1.8260018543319256e-05,
"loss": 1.4995,
"step": 8550
},
{
"epoch": 0.8850468251517958,
"grad_norm": 9.636043548583984,
"learning_rate": 1.8249716699289174e-05,
"loss": 1.4077,
"step": 8600
},
{
"epoch": 0.8901924462282598,
"grad_norm": 8.578084945678711,
"learning_rate": 1.8239414855259093e-05,
"loss": 1.4088,
"step": 8650
},
{
"epoch": 0.8953380673047237,
"grad_norm": 7.253017425537109,
"learning_rate": 1.822911301122901e-05,
"loss": 1.3464,
"step": 8700
},
{
"epoch": 0.9004836883811876,
"grad_norm": 8.55578899383545,
"learning_rate": 1.821881116719893e-05,
"loss": 1.3455,
"step": 8750
},
{
"epoch": 0.9004836883811876,
"eval_accuracy": 0.9506208300590515,
"eval_loss": 0.9797450304031372,
"eval_runtime": 418.1418,
"eval_samples_per_second": 82.625,
"eval_steps_per_second": 2.583,
"step": 8750
},
{
"epoch": 0.9056293094576515,
"grad_norm": 9.603639602661133,
"learning_rate": 1.8208509323168848e-05,
"loss": 1.5172,
"step": 8800
},
{
"epoch": 0.9107749305341155,
"grad_norm": 5.811156272888184,
"learning_rate": 1.8198207479138766e-05,
"loss": 1.3922,
"step": 8850
},
{
"epoch": 0.9159205516105794,
"grad_norm": 7.18412971496582,
"learning_rate": 1.8187905635108688e-05,
"loss": 1.3645,
"step": 8900
},
{
"epoch": 0.9210661726870433,
"grad_norm": 10.653360366821289,
"learning_rate": 1.8177603791078603e-05,
"loss": 1.3627,
"step": 8950
},
{
"epoch": 0.9262117937635073,
"grad_norm": 9.01271915435791,
"learning_rate": 1.8167301947048525e-05,
"loss": 1.3896,
"step": 9000
},
{
"epoch": 0.9262117937635073,
"eval_accuracy": 0.9506497979164124,
"eval_loss": 0.9806250929832458,
"eval_runtime": 417.4847,
"eval_samples_per_second": 82.755,
"eval_steps_per_second": 2.587,
"step": 9000
},
{
"epoch": 0.9313574148399711,
"grad_norm": 6.072149276733398,
"learning_rate": 1.8157000103018443e-05,
"loss": 1.433,
"step": 9050
},
{
"epoch": 0.9365030359164351,
"grad_norm": 5.18344783782959,
"learning_rate": 1.814669825898836e-05,
"loss": 1.4678,
"step": 9100
},
{
"epoch": 0.941648656992899,
"grad_norm": 12.650690078735352,
"learning_rate": 1.813639641495828e-05,
"loss": 1.3206,
"step": 9150
},
{
"epoch": 0.946794278069363,
"grad_norm": 4.13425350189209,
"learning_rate": 1.8126094570928195e-05,
"loss": 1.4589,
"step": 9200
},
{
"epoch": 0.9519398991458269,
"grad_norm": 9.408120155334473,
"learning_rate": 1.8115792726898117e-05,
"loss": 1.3494,
"step": 9250
},
{
"epoch": 0.9519398991458269,
"eval_accuracy": 0.9509103298187256,
"eval_loss": 0.9760673642158508,
"eval_runtime": 418.3905,
"eval_samples_per_second": 82.576,
"eval_steps_per_second": 2.581,
"step": 9250
},
{
"epoch": 0.9570855202222909,
"grad_norm": 8.437677383422852,
"learning_rate": 1.8105490882868035e-05,
"loss": 1.3768,
"step": 9300
},
{
"epoch": 0.9622311412987548,
"grad_norm": 5.862843990325928,
"learning_rate": 1.8095189038837954e-05,
"loss": 1.4449,
"step": 9350
},
{
"epoch": 0.9673767623752186,
"grad_norm": 5.639468193054199,
"learning_rate": 1.8084887194807872e-05,
"loss": 1.4187,
"step": 9400
},
{
"epoch": 0.9725223834516826,
"grad_norm": 5.7434401512146,
"learning_rate": 1.807458535077779e-05,
"loss": 1.3046,
"step": 9450
},
{
"epoch": 0.9776680045281465,
"grad_norm": 8.578060150146484,
"learning_rate": 1.806428350674771e-05,
"loss": 1.3586,
"step": 9500
},
{
"epoch": 0.9776680045281465,
"eval_accuracy": 0.9511997699737549,
"eval_loss": 0.9817301034927368,
"eval_runtime": 417.7965,
"eval_samples_per_second": 82.693,
"eval_steps_per_second": 2.585,
"step": 9500
},
{
"epoch": 0.9828136256046105,
"grad_norm": 6.870723247528076,
"learning_rate": 1.8053981662717628e-05,
"loss": 1.4631,
"step": 9550
},
{
"epoch": 0.9879592466810744,
"grad_norm": 8.596879005432129,
"learning_rate": 1.8043679818687546e-05,
"loss": 1.3113,
"step": 9600
},
{
"epoch": 0.9931048677575384,
"grad_norm": 5.606679439544678,
"learning_rate": 1.8033377974657464e-05,
"loss": 1.2972,
"step": 9650
},
{
"epoch": 0.9982504888340022,
"grad_norm": 1.0621393918991089,
"learning_rate": 1.8023076130627383e-05,
"loss": 1.3793,
"step": 9700
},
{
"epoch": 1.0033961099104662,
"grad_norm": 1.193249225616455,
"learning_rate": 1.8012774286597305e-05,
"loss": 1.1729,
"step": 9750
},
{
"epoch": 1.0033961099104662,
"eval_accuracy": 0.9509392380714417,
"eval_loss": 0.9846755266189575,
"eval_runtime": 417.935,
"eval_samples_per_second": 82.666,
"eval_steps_per_second": 2.584,
"step": 9750
},
{
"epoch": 1.0085417309869302,
"grad_norm": 10.64986515045166,
"learning_rate": 1.800247244256722e-05,
"loss": 1.2009,
"step": 9800
},
{
"epoch": 1.013687352063394,
"grad_norm": 9.815643310546875,
"learning_rate": 1.799217059853714e-05,
"loss": 1.2576,
"step": 9850
},
{
"epoch": 1.018832973139858,
"grad_norm": 9.344294548034668,
"learning_rate": 1.798186875450706e-05,
"loss": 1.3483,
"step": 9900
},
{
"epoch": 1.023978594216322,
"grad_norm": 2.3761701583862305,
"learning_rate": 1.7971566910476975e-05,
"loss": 1.2609,
"step": 9950
},
{
"epoch": 1.0291242152927857,
"grad_norm": 9.36589527130127,
"learning_rate": 1.7961265066446897e-05,
"loss": 1.3099,
"step": 10000
},
{
"epoch": 1.0291242152927857,
"eval_accuracy": 0.9513155221939087,
"eval_loss": 0.9894696474075317,
"eval_runtime": 417.5493,
"eval_samples_per_second": 82.742,
"eval_steps_per_second": 2.587,
"step": 10000
},
{
"epoch": 1.0342698363692497,
"grad_norm": 14.563089370727539,
"learning_rate": 1.7950963222416815e-05,
"loss": 1.2224,
"step": 10050
},
{
"epoch": 1.0394154574457137,
"grad_norm": 11.867334365844727,
"learning_rate": 1.7940661378386734e-05,
"loss": 1.3552,
"step": 10100
},
{
"epoch": 1.0445610785221777,
"grad_norm": 1.510968565940857,
"learning_rate": 1.7930359534356652e-05,
"loss": 1.3508,
"step": 10150
},
{
"epoch": 1.0497066995986415,
"grad_norm": 0.8010023832321167,
"learning_rate": 1.792005769032657e-05,
"loss": 1.3242,
"step": 10200
},
{
"epoch": 1.0548523206751055,
"grad_norm": 5.283142566680908,
"learning_rate": 1.790975584629649e-05,
"loss": 1.2287,
"step": 10250
},
{
"epoch": 1.0548523206751055,
"eval_accuracy": 0.951170802116394,
"eval_loss": 0.9977254867553711,
"eval_runtime": 417.1052,
"eval_samples_per_second": 82.83,
"eval_steps_per_second": 2.589,
"step": 10250
},
{
"epoch": 1.0599979417515695,
"grad_norm": 6.146693706512451,
"learning_rate": 1.7899454002266407e-05,
"loss": 1.2863,
"step": 10300
},
{
"epoch": 1.0651435628280332,
"grad_norm": 9.038339614868164,
"learning_rate": 1.7889152158236326e-05,
"loss": 1.2377,
"step": 10350
},
{
"epoch": 1.0702891839044972,
"grad_norm": 8.985528945922852,
"learning_rate": 1.7878850314206244e-05,
"loss": 1.3058,
"step": 10400
},
{
"epoch": 1.0754348049809612,
"grad_norm": 6.91862154006958,
"learning_rate": 1.7868548470176162e-05,
"loss": 1.3013,
"step": 10450
},
{
"epoch": 1.0805804260574252,
"grad_norm": 4.811442852020264,
"learning_rate": 1.785824662614608e-05,
"loss": 1.3233,
"step": 10500
},
{
"epoch": 1.0805804260574252,
"eval_accuracy": 0.9488263130187988,
"eval_loss": 0.9947823286056519,
"eval_runtime": 418.0737,
"eval_samples_per_second": 82.639,
"eval_steps_per_second": 2.583,
"step": 10500
},
{
"epoch": 1.085726047133889,
"grad_norm": 3.9576923847198486,
"learning_rate": 1.7847944782116e-05,
"loss": 1.334,
"step": 10550
},
{
"epoch": 1.090871668210353,
"grad_norm": 11.280867576599121,
"learning_rate": 1.783764293808592e-05,
"loss": 1.246,
"step": 10600
},
{
"epoch": 1.096017289286817,
"grad_norm": 10.32507038116455,
"learning_rate": 1.7827341094055836e-05,
"loss": 1.2298,
"step": 10650
},
{
"epoch": 1.1011629103632807,
"grad_norm": 9.05435848236084,
"learning_rate": 1.7817039250025755e-05,
"loss": 1.2016,
"step": 10700
},
{
"epoch": 1.1063085314397447,
"grad_norm": 10.334163665771484,
"learning_rate": 1.7806737405995676e-05,
"loss": 1.3035,
"step": 10750
},
{
"epoch": 1.1063085314397447,
"eval_accuracy": 0.9506497979164124,
"eval_loss": 0.9946981072425842,
"eval_runtime": 417.8974,
"eval_samples_per_second": 82.673,
"eval_steps_per_second": 2.584,
"step": 10750
},
{
"epoch": 1.1114541525162087,
"grad_norm": 11.852724075317383,
"learning_rate": 1.779643556196559e-05,
"loss": 1.2457,
"step": 10800
},
{
"epoch": 1.1165997735926727,
"grad_norm": 10.447225570678711,
"learning_rate": 1.7786133717935513e-05,
"loss": 1.2882,
"step": 10850
},
{
"epoch": 1.1217453946691365,
"grad_norm": 3.3465206623077393,
"learning_rate": 1.777583187390543e-05,
"loss": 1.2365,
"step": 10900
},
{
"epoch": 1.1268910157456005,
"grad_norm": 6.849998950958252,
"learning_rate": 1.776553002987535e-05,
"loss": 1.19,
"step": 10950
},
{
"epoch": 1.1320366368220645,
"grad_norm": 11.492406845092773,
"learning_rate": 1.775522818584527e-05,
"loss": 1.2377,
"step": 11000
},
{
"epoch": 1.1320366368220645,
"eval_accuracy": 0.9511129260063171,
"eval_loss": 0.9914972186088562,
"eval_runtime": 417.922,
"eval_samples_per_second": 82.669,
"eval_steps_per_second": 2.584,
"step": 11000
},
{
"epoch": 1.1371822578985284,
"grad_norm": 7.080196857452393,
"learning_rate": 1.7744926341815187e-05,
"loss": 1.3028,
"step": 11050
},
{
"epoch": 1.1423278789749922,
"grad_norm": 3.5371875762939453,
"learning_rate": 1.7734624497785105e-05,
"loss": 1.319,
"step": 11100
},
{
"epoch": 1.1474735000514562,
"grad_norm": 5.618402004241943,
"learning_rate": 1.7724322653755024e-05,
"loss": 1.3315,
"step": 11150
},
{
"epoch": 1.1526191211279202,
"grad_norm": 6.200303554534912,
"learning_rate": 1.7714020809724942e-05,
"loss": 1.2161,
"step": 11200
},
{
"epoch": 1.157764742204384,
"grad_norm": 8.898612976074219,
"learning_rate": 1.770371896569486e-05,
"loss": 1.3555,
"step": 11250
},
{
"epoch": 1.157764742204384,
"eval_accuracy": 0.9510550498962402,
"eval_loss": 0.990160346031189,
"eval_runtime": 417.6517,
"eval_samples_per_second": 82.722,
"eval_steps_per_second": 2.586,
"step": 11250
},
{
"epoch": 1.162910363280848,
"grad_norm": 4.882264137268066,
"learning_rate": 1.769341712166478e-05,
"loss": 1.1874,
"step": 11300
},
{
"epoch": 1.168055984357312,
"grad_norm": 3.1759836673736572,
"learning_rate": 1.7683115277634697e-05,
"loss": 1.2373,
"step": 11350
},
{
"epoch": 1.173201605433776,
"grad_norm": 13.944663047790527,
"learning_rate": 1.7672813433604616e-05,
"loss": 1.2474,
"step": 11400
},
{
"epoch": 1.1783472265102397,
"grad_norm": 6.393034934997559,
"learning_rate": 1.7662511589574534e-05,
"loss": 1.2838,
"step": 11450
},
{
"epoch": 1.1834928475867037,
"grad_norm": 9.834447860717773,
"learning_rate": 1.7652209745544453e-05,
"loss": 1.2242,
"step": 11500
},
{
"epoch": 1.1834928475867037,
"eval_accuracy": 0.9518075585365295,
"eval_loss": 0.992717444896698,
"eval_runtime": 417.5743,
"eval_samples_per_second": 82.737,
"eval_steps_per_second": 2.586,
"step": 11500
},
{
"epoch": 1.1886384686631677,
"grad_norm": 7.190279006958008,
"learning_rate": 1.764190790151437e-05,
"loss": 1.3123,
"step": 11550
},
{
"epoch": 1.1937840897396317,
"grad_norm": 3.1001381874084473,
"learning_rate": 1.7631606057484293e-05,
"loss": 1.2874,
"step": 11600
},
{
"epoch": 1.1989297108160955,
"grad_norm": 10.424577713012695,
"learning_rate": 1.7621304213454208e-05,
"loss": 1.2568,
"step": 11650
},
{
"epoch": 1.2040753318925594,
"grad_norm": 2.9702112674713135,
"learning_rate": 1.761100236942413e-05,
"loss": 1.2526,
"step": 11700
},
{
"epoch": 1.2092209529690234,
"grad_norm": 4.956679821014404,
"learning_rate": 1.7600700525394048e-05,
"loss": 1.347,
"step": 11750
},
{
"epoch": 1.2092209529690234,
"eval_accuracy": 0.9508523941040039,
"eval_loss": 0.9882821440696716,
"eval_runtime": 417.5587,
"eval_samples_per_second": 82.74,
"eval_steps_per_second": 2.586,
"step": 11750
},
{
"epoch": 1.2143665740454872,
"grad_norm": 7.329675674438477,
"learning_rate": 1.7590398681363963e-05,
"loss": 1.3098,
"step": 11800
},
{
"epoch": 1.2195121951219512,
"grad_norm": 2.8485448360443115,
"learning_rate": 1.7580096837333885e-05,
"loss": 1.2541,
"step": 11850
},
{
"epoch": 1.2246578161984152,
"grad_norm": 13.313427925109863,
"learning_rate": 1.7569794993303803e-05,
"loss": 1.2791,
"step": 11900
},
{
"epoch": 1.2298034372748792,
"grad_norm": 10.920377731323242,
"learning_rate": 1.7559493149273722e-05,
"loss": 1.2333,
"step": 11950
},
{
"epoch": 1.234949058351343,
"grad_norm": 3.033597946166992,
"learning_rate": 1.754919130524364e-05,
"loss": 1.3827,
"step": 12000
},
{
"epoch": 1.234949058351343,
"eval_accuracy": 0.9507366418838501,
"eval_loss": 0.9942870140075684,
"eval_runtime": 417.7023,
"eval_samples_per_second": 82.712,
"eval_steps_per_second": 2.586,
"step": 12000
},
{
"epoch": 1.240094679427807,
"grad_norm": 1.0334879159927368,
"learning_rate": 1.753888946121356e-05,
"loss": 1.2732,
"step": 12050
},
{
"epoch": 1.245240300504271,
"grad_norm": 7.173407077789307,
"learning_rate": 1.7528587617183477e-05,
"loss": 1.2993,
"step": 12100
},
{
"epoch": 1.250385921580735,
"grad_norm": 15.351693153381348,
"learning_rate": 1.7518285773153396e-05,
"loss": 1.2947,
"step": 12150
},
{
"epoch": 1.2555315426571987,
"grad_norm": 13.320657730102539,
"learning_rate": 1.7507983929123314e-05,
"loss": 1.3001,
"step": 12200
},
{
"epoch": 1.2606771637336627,
"grad_norm": 4.0671186447143555,
"learning_rate": 1.7497682085093232e-05,
"loss": 1.2957,
"step": 12250
},
{
"epoch": 1.2606771637336627,
"eval_accuracy": 0.9514023661613464,
"eval_loss": 0.9864968657493591,
"eval_runtime": 417.8412,
"eval_samples_per_second": 82.685,
"eval_steps_per_second": 2.585,
"step": 12250
},
{
"epoch": 1.2658227848101267,
"grad_norm": 7.0425519943237305,
"learning_rate": 1.748738024106315e-05,
"loss": 1.1393,
"step": 12300
},
{
"epoch": 1.2709684058865904,
"grad_norm": 4.306710243225098,
"learning_rate": 1.747707839703307e-05,
"loss": 1.2996,
"step": 12350
},
{
"epoch": 1.2761140269630544,
"grad_norm": 10.586379051208496,
"learning_rate": 1.7466776553002988e-05,
"loss": 1.3218,
"step": 12400
},
{
"epoch": 1.2812596480395184,
"grad_norm": 6.002781867980957,
"learning_rate": 1.745647470897291e-05,
"loss": 1.2138,
"step": 12450
},
{
"epoch": 1.2864052691159822,
"grad_norm": 7.406036853790283,
"learning_rate": 1.7446172864942825e-05,
"loss": 1.1731,
"step": 12500
},
{
"epoch": 1.2864052691159822,
"eval_accuracy": 0.9509682059288025,
"eval_loss": 0.9963937997817993,
"eval_runtime": 417.622,
"eval_samples_per_second": 82.728,
"eval_steps_per_second": 2.586,
"step": 12500
},
{
"epoch": 1.2915508901924462,
"grad_norm": 11.54760456085205,
"learning_rate": 1.7436077057793347e-05,
"loss": 1.3326,
"step": 12550
},
{
"epoch": 1.2966965112689102,
"grad_norm": 3.4204094409942627,
"learning_rate": 1.7425775213763265e-05,
"loss": 1.3575,
"step": 12600
},
{
"epoch": 1.3018421323453742,
"grad_norm": 9.140461921691895,
"learning_rate": 1.7415473369733184e-05,
"loss": 1.2948,
"step": 12650
},
{
"epoch": 1.3069877534218381,
"grad_norm": 6.069116592407227,
"learning_rate": 1.7405171525703102e-05,
"loss": 1.2921,
"step": 12700
},
{
"epoch": 1.312133374498302,
"grad_norm": 13.66699504852295,
"learning_rate": 1.739486968167302e-05,
"loss": 1.3052,
"step": 12750
},
{
"epoch": 1.312133374498302,
"eval_accuracy": 0.9509103298187256,
"eval_loss": 0.9840078949928284,
"eval_runtime": 418.1556,
"eval_samples_per_second": 82.622,
"eval_steps_per_second": 2.583,
"step": 12750
},
{
"epoch": 1.317278995574766,
"grad_norm": 6.949051380157471,
"learning_rate": 1.738456783764294e-05,
"loss": 1.3662,
"step": 12800
},
{
"epoch": 1.32242461665123,
"grad_norm": 9.286051750183105,
"learning_rate": 1.7374265993612858e-05,
"loss": 1.3673,
"step": 12850
},
{
"epoch": 1.3275702377276937,
"grad_norm": 9.19774341583252,
"learning_rate": 1.7363964149582776e-05,
"loss": 1.3006,
"step": 12900
},
{
"epoch": 1.3327158588041577,
"grad_norm": 5.003039360046387,
"learning_rate": 1.7353662305552694e-05,
"loss": 1.4217,
"step": 12950
},
{
"epoch": 1.3378614798806217,
"grad_norm": 4.849103927612305,
"learning_rate": 1.7343360461522613e-05,
"loss": 1.1608,
"step": 13000
},
{
"epoch": 1.3378614798806217,
"eval_accuracy": 0.9520102143287659,
"eval_loss": 0.98476642370224,
"eval_runtime": 417.8875,
"eval_samples_per_second": 82.675,
"eval_steps_per_second": 2.584,
"step": 13000
},
{
"epoch": 1.3430071009570854,
"grad_norm": 3.944049596786499,
"learning_rate": 1.7333058617492535e-05,
"loss": 1.2066,
"step": 13050
},
{
"epoch": 1.3481527220335494,
"grad_norm": 8.767118453979492,
"learning_rate": 1.732275677346245e-05,
"loss": 1.408,
"step": 13100
},
{
"epoch": 1.3532983431100134,
"grad_norm": 8.175588607788086,
"learning_rate": 1.7312454929432368e-05,
"loss": 1.3574,
"step": 13150
},
{
"epoch": 1.3584439641864772,
"grad_norm": 5.246455192565918,
"learning_rate": 1.730215308540229e-05,
"loss": 1.3171,
"step": 13200
},
{
"epoch": 1.3635895852629412,
"grad_norm": 8.986821174621582,
"learning_rate": 1.7291851241372205e-05,
"loss": 1.3188,
"step": 13250
},
{
"epoch": 1.3635895852629412,
"eval_accuracy": 0.9502446055412292,
"eval_loss": 0.9888262152671814,
"eval_runtime": 418.0556,
"eval_samples_per_second": 82.642,
"eval_steps_per_second": 2.583,
"step": 13250
},
{
"epoch": 1.3687352063394052,
"grad_norm": 3.4874706268310547,
"learning_rate": 1.7281549397342127e-05,
"loss": 1.299,
"step": 13300
},
{
"epoch": 1.3738808274158691,
"grad_norm": 5.339372158050537,
"learning_rate": 1.7271247553312045e-05,
"loss": 1.3015,
"step": 13350
},
{
"epoch": 1.3790264484923331,
"grad_norm": 0.7593218684196472,
"learning_rate": 1.7260945709281964e-05,
"loss": 1.3159,
"step": 13400
},
{
"epoch": 1.384172069568797,
"grad_norm": 6.2086896896362305,
"learning_rate": 1.7250643865251882e-05,
"loss": 1.2139,
"step": 13450
},
{
"epoch": 1.389317690645261,
"grad_norm": 8.667464256286621,
"learning_rate": 1.72403420212218e-05,
"loss": 1.2855,
"step": 13500
},
{
"epoch": 1.389317690645261,
"eval_accuracy": 0.9513733983039856,
"eval_loss": 0.9957149624824524,
"eval_runtime": 417.8702,
"eval_samples_per_second": 82.679,
"eval_steps_per_second": 2.585,
"step": 13500
},
{
"epoch": 1.3944633117217249,
"grad_norm": 11.632777214050293,
"learning_rate": 1.723004017719172e-05,
"loss": 1.2705,
"step": 13550
},
{
"epoch": 1.3996089327981887,
"grad_norm": 24.493167877197266,
"learning_rate": 1.7219738333161637e-05,
"loss": 1.3099,
"step": 13600
},
{
"epoch": 1.4047545538746526,
"grad_norm": 6.20335054397583,
"learning_rate": 1.7209436489131556e-05,
"loss": 1.3144,
"step": 13650
},
{
"epoch": 1.4099001749511166,
"grad_norm": 9.66215991973877,
"learning_rate": 1.7199134645101474e-05,
"loss": 1.2948,
"step": 13700
},
{
"epoch": 1.4150457960275804,
"grad_norm": 3.034616470336914,
"learning_rate": 1.7188832801071393e-05,
"loss": 1.3313,
"step": 13750
},
{
"epoch": 1.4150457960275804,
"eval_accuracy": 0.9511997699737549,
"eval_loss": 0.9909895658493042,
"eval_runtime": 417.5787,
"eval_samples_per_second": 82.737,
"eval_steps_per_second": 2.586,
"step": 13750
},
{
"epoch": 1.4201914171040444,
"grad_norm": 5.056656360626221,
"learning_rate": 1.7178530957041314e-05,
"loss": 1.3473,
"step": 13800
},
{
"epoch": 1.4253370381805084,
"grad_norm": 2.1632890701293945,
"learning_rate": 1.716822911301123e-05,
"loss": 1.2037,
"step": 13850
},
{
"epoch": 1.4304826592569724,
"grad_norm": 8.617193222045898,
"learning_rate": 1.7157927268981148e-05,
"loss": 1.3059,
"step": 13900
},
{
"epoch": 1.4356282803334364,
"grad_norm": 4.062990188598633,
"learning_rate": 1.714762542495107e-05,
"loss": 1.3763,
"step": 13950
},
{
"epoch": 1.4407739014099001,
"grad_norm": 8.483048439025879,
"learning_rate": 1.7137323580920985e-05,
"loss": 1.2606,
"step": 14000
},
{
"epoch": 1.4407739014099001,
"eval_accuracy": 0.9522996544837952,
"eval_loss": 0.9875785708427429,
"eval_runtime": 417.781,
"eval_samples_per_second": 82.696,
"eval_steps_per_second": 2.585,
"step": 14000
},
{
"epoch": 1.4459195224863641,
"grad_norm": 8.226116180419922,
"learning_rate": 1.7127021736890906e-05,
"loss": 1.2394,
"step": 14050
},
{
"epoch": 1.4510651435628281,
"grad_norm": 0.5191435813903809,
"learning_rate": 1.711671989286082e-05,
"loss": 1.219,
"step": 14100
},
{
"epoch": 1.4562107646392919,
"grad_norm": 8.271252632141113,
"learning_rate": 1.7106418048830743e-05,
"loss": 1.3501,
"step": 14150
},
{
"epoch": 1.4613563857157559,
"grad_norm": 6.9849066734313965,
"learning_rate": 1.7096116204800662e-05,
"loss": 1.2664,
"step": 14200
},
{
"epoch": 1.4665020067922199,
"grad_norm": 3.286569595336914,
"learning_rate": 1.7085814360770577e-05,
"loss": 1.2704,
"step": 14250
},
{
"epoch": 1.4665020067922199,
"eval_accuracy": 0.9512576460838318,
"eval_loss": 0.994490385055542,
"eval_runtime": 418.1263,
"eval_samples_per_second": 82.628,
"eval_steps_per_second": 2.583,
"step": 14250
},
{
"epoch": 1.4716476278686836,
"grad_norm": 6.6526618003845215,
"learning_rate": 1.70755125167405e-05,
"loss": 1.2332,
"step": 14300
},
{
"epoch": 1.4767932489451476,
"grad_norm": 7.4313578605651855,
"learning_rate": 1.7065210672710417e-05,
"loss": 1.2286,
"step": 14350
},
{
"epoch": 1.4819388700216116,
"grad_norm": 6.093780517578125,
"learning_rate": 1.7054908828680335e-05,
"loss": 1.2123,
"step": 14400
},
{
"epoch": 1.4870844910980756,
"grad_norm": 6.429713726043701,
"learning_rate": 1.7044606984650254e-05,
"loss": 1.2437,
"step": 14450
},
{
"epoch": 1.4922301121745396,
"grad_norm": 8.225885391235352,
"learning_rate": 1.7034305140620172e-05,
"loss": 1.2292,
"step": 14500
},
{
"epoch": 1.4922301121745396,
"eval_accuracy": 0.9502446055412292,
"eval_loss": 0.9886476993560791,
"eval_runtime": 417.1781,
"eval_samples_per_second": 82.816,
"eval_steps_per_second": 2.589,
"step": 14500
},
{
"epoch": 1.4973757332510034,
"grad_norm": 6.095223903656006,
"learning_rate": 1.7024209333470695e-05,
"loss": 1.3007,
"step": 14550
},
{
"epoch": 1.5025213543274674,
"grad_norm": 12.490996360778809,
"learning_rate": 1.701390748944061e-05,
"loss": 1.308,
"step": 14600
},
{
"epoch": 1.5076669754039314,
"grad_norm": 9.118165016174316,
"learning_rate": 1.700360564541053e-05,
"loss": 1.174,
"step": 14650
},
{
"epoch": 1.5128125964803951,
"grad_norm": 6.5648722648620605,
"learning_rate": 1.6993303801380447e-05,
"loss": 1.2648,
"step": 14700
},
{
"epoch": 1.5179582175568591,
"grad_norm": 8.813359260559082,
"learning_rate": 1.698300195735037e-05,
"loss": 1.2533,
"step": 14750
},
{
"epoch": 1.5179582175568591,
"eval_accuracy": 0.9517496824264526,
"eval_loss": 0.9885143041610718,
"eval_runtime": 417.9702,
"eval_samples_per_second": 82.659,
"eval_steps_per_second": 2.584,
"step": 14750
},
{
"epoch": 1.523103838633323,
"grad_norm": 1.7033747434616089,
"learning_rate": 1.6972700113320287e-05,
"loss": 1.2576,
"step": 14800
},
{
"epoch": 1.5282494597097869,
"grad_norm": 5.316808700561523,
"learning_rate": 1.6962398269290202e-05,
"loss": 1.3659,
"step": 14850
},
{
"epoch": 1.5333950807862509,
"grad_norm": 3.3904647827148438,
"learning_rate": 1.6952096425260124e-05,
"loss": 1.298,
"step": 14900
},
{
"epoch": 1.5385407018627149,
"grad_norm": 0.8259275555610657,
"learning_rate": 1.6941794581230042e-05,
"loss": 1.2723,
"step": 14950
},
{
"epoch": 1.5436863229391786,
"grad_norm": 7.67642068862915,
"learning_rate": 1.693149273719996e-05,
"loss": 1.3099,
"step": 15000
},
{
"epoch": 1.5436863229391786,
"eval_accuracy": 0.9517786502838135,
"eval_loss": 0.9875179529190063,
"eval_runtime": 416.9745,
"eval_samples_per_second": 82.856,
"eval_steps_per_second": 2.59,
"step": 15000
},
{
"epoch": 1.5488319440156428,
"grad_norm": 2.3492562770843506,
"learning_rate": 1.692119089316988e-05,
"loss": 1.2984,
"step": 15050
},
{
"epoch": 1.5539775650921066,
"grad_norm": 5.415560722351074,
"learning_rate": 1.6910889049139797e-05,
"loss": 1.2128,
"step": 15100
},
{
"epoch": 1.5591231861685706,
"grad_norm": 12.2908935546875,
"learning_rate": 1.6900587205109716e-05,
"loss": 1.2689,
"step": 15150
},
{
"epoch": 1.5642688072450346,
"grad_norm": 8.375056266784668,
"learning_rate": 1.6890285361079634e-05,
"loss": 1.2516,
"step": 15200
},
{
"epoch": 1.5694144283214984,
"grad_norm": 9.067890167236328,
"learning_rate": 1.6879983517049553e-05,
"loss": 1.3028,
"step": 15250
},
{
"epoch": 1.5694144283214984,
"eval_accuracy": 0.9523285627365112,
"eval_loss": 0.9856404066085815,
"eval_runtime": 417.1329,
"eval_samples_per_second": 82.825,
"eval_steps_per_second": 2.589,
"step": 15250
},
{
"epoch": 1.5745600493979623,
"grad_norm": 10.568164825439453,
"learning_rate": 1.686968167301947e-05,
"loss": 1.3619,
"step": 15300
},
{
"epoch": 1.5797056704744263,
"grad_norm": 15.765814781188965,
"learning_rate": 1.685937982898939e-05,
"loss": 1.3524,
"step": 15350
},
{
"epoch": 1.58485129155089,
"grad_norm": 11.065564155578613,
"learning_rate": 1.684907798495931e-05,
"loss": 1.1749,
"step": 15400
},
{
"epoch": 1.589996912627354,
"grad_norm": 7.860668659210205,
"learning_rate": 1.6838776140929226e-05,
"loss": 1.205,
"step": 15450
},
{
"epoch": 1.595142533703818,
"grad_norm": 2.4386684894561768,
"learning_rate": 1.6828474296899148e-05,
"loss": 1.297,
"step": 15500
},
{
"epoch": 1.595142533703818,
"eval_accuracy": 0.9513155221939087,
"eval_loss": 0.9780011177062988,
"eval_runtime": 418.0332,
"eval_samples_per_second": 82.647,
"eval_steps_per_second": 2.584,
"step": 15500
},
{
"epoch": 1.6002881547802819,
"grad_norm": 7.5391316413879395,
"learning_rate": 1.6818172452869067e-05,
"loss": 1.2469,
"step": 15550
},
{
"epoch": 1.605433775856746,
"grad_norm": 9.402176856994629,
"learning_rate": 1.680787060883898e-05,
"loss": 1.2285,
"step": 15600
},
{
"epoch": 1.6105793969332098,
"grad_norm": 5.171482563018799,
"learning_rate": 1.6797568764808903e-05,
"loss": 1.2963,
"step": 15650
},
{
"epoch": 1.6157250180096736,
"grad_norm": 7.366409778594971,
"learning_rate": 1.678726692077882e-05,
"loss": 1.2406,
"step": 15700
},
{
"epoch": 1.6208706390861378,
"grad_norm": 10.613348007202148,
"learning_rate": 1.677696507674874e-05,
"loss": 1.3049,
"step": 15750
},
{
"epoch": 1.6208706390861378,
"eval_accuracy": 0.9511997699737549,
"eval_loss": 0.9873180389404297,
"eval_runtime": 417.7747,
"eval_samples_per_second": 82.698,
"eval_steps_per_second": 2.585,
"step": 15750
},
{
"epoch": 1.6260162601626016,
"grad_norm": 3.9607322216033936,
"learning_rate": 1.676666323271866e-05,
"loss": 1.2174,
"step": 15800
},
{
"epoch": 1.6311618812390656,
"grad_norm": 8.552703857421875,
"learning_rate": 1.6756361388688577e-05,
"loss": 1.2789,
"step": 15850
},
{
"epoch": 1.6363075023155296,
"grad_norm": 5.216203689575195,
"learning_rate": 1.6746059544658496e-05,
"loss": 1.289,
"step": 15900
},
{
"epoch": 1.6414531233919933,
"grad_norm": 7.981589317321777,
"learning_rate": 1.6735757700628414e-05,
"loss": 1.3242,
"step": 15950
},
{
"epoch": 1.6465987444684573,
"grad_norm": 9.827128410339355,
"learning_rate": 1.6725455856598332e-05,
"loss": 1.2974,
"step": 16000
},
{
"epoch": 1.6465987444684573,
"eval_accuracy": 0.9522417187690735,
"eval_loss": 0.9755061268806458,
"eval_runtime": 418.3094,
"eval_samples_per_second": 82.592,
"eval_steps_per_second": 2.582,
"step": 16000
},
{
"epoch": 1.6517443655449213,
"grad_norm": 8.793742179870605,
"learning_rate": 1.671515401256825e-05,
"loss": 1.2741,
"step": 16050
},
{
"epoch": 1.656889986621385,
"grad_norm": 4.681251049041748,
"learning_rate": 1.670485216853817e-05,
"loss": 1.1625,
"step": 16100
},
{
"epoch": 1.662035607697849,
"grad_norm": 9.398008346557617,
"learning_rate": 1.6694550324508088e-05,
"loss": 1.2795,
"step": 16150
},
{
"epoch": 1.667181228774313,
"grad_norm": 7.628296852111816,
"learning_rate": 1.6684248480478006e-05,
"loss": 1.2301,
"step": 16200
},
{
"epoch": 1.6723268498507768,
"grad_norm": 7.104902267456055,
"learning_rate": 1.6673946636447928e-05,
"loss": 1.2348,
"step": 16250
},
{
"epoch": 1.6723268498507768,
"eval_accuracy": 0.952791690826416,
"eval_loss": 0.980122447013855,
"eval_runtime": 418.027,
"eval_samples_per_second": 82.648,
"eval_steps_per_second": 2.584,
"step": 16250
},
{
"epoch": 1.677472470927241,
"grad_norm": 6.678224563598633,
"learning_rate": 1.6663644792417843e-05,
"loss": 1.2408,
"step": 16300
},
{
"epoch": 1.6826180920037048,
"grad_norm": 13.851053237915039,
"learning_rate": 1.665334294838776e-05,
"loss": 1.2477,
"step": 16350
},
{
"epoch": 1.6877637130801688,
"grad_norm": 3.6658806800842285,
"learning_rate": 1.6643041104357683e-05,
"loss": 1.3386,
"step": 16400
},
{
"epoch": 1.6929093341566328,
"grad_norm": 5.4644927978515625,
"learning_rate": 1.6632739260327598e-05,
"loss": 1.2346,
"step": 16450
},
{
"epoch": 1.6980549552330966,
"grad_norm": 1.6028341054916382,
"learning_rate": 1.662243741629752e-05,
"loss": 1.2904,
"step": 16500
},
{
"epoch": 1.6980549552330966,
"eval_accuracy": 0.9520391225814819,
"eval_loss": 0.9905561208724976,
"eval_runtime": 417.9366,
"eval_samples_per_second": 82.666,
"eval_steps_per_second": 2.584,
"step": 16500
},
{
"epoch": 1.7032005763095606,
"grad_norm": 0.9734807014465332,
"learning_rate": 1.661234160914804e-05,
"loss": 1.2947,
"step": 16550
},
{
"epoch": 1.7083461973860246,
"grad_norm": 3.4319236278533936,
"learning_rate": 1.6602039765117958e-05,
"loss": 1.2572,
"step": 16600
},
{
"epoch": 1.7134918184624883,
"grad_norm": 3.019766092300415,
"learning_rate": 1.6591737921087876e-05,
"loss": 1.2738,
"step": 16650
},
{
"epoch": 1.7186374395389523,
"grad_norm": 9.71827507019043,
"learning_rate": 1.6581436077057794e-05,
"loss": 1.2686,
"step": 16700
},
{
"epoch": 1.7237830606154163,
"grad_norm": 5.171957969665527,
"learning_rate": 1.6571134233027713e-05,
"loss": 1.4041,
"step": 16750
},
{
"epoch": 1.7237830606154163,
"eval_accuracy": 0.952791690826416,
"eval_loss": 0.9791179895401001,
"eval_runtime": 418.3742,
"eval_samples_per_second": 82.579,
"eval_steps_per_second": 2.581,
"step": 16750
},
{
"epoch": 1.72892868169188,
"grad_norm": 5.277884006500244,
"learning_rate": 1.656083238899763e-05,
"loss": 1.2935,
"step": 16800
},
{
"epoch": 1.7340743027683443,
"grad_norm": 10.89902400970459,
"learning_rate": 1.655053054496755e-05,
"loss": 1.2501,
"step": 16850
},
{
"epoch": 1.739219923844808,
"grad_norm": 2.373206377029419,
"learning_rate": 1.6540434737818072e-05,
"loss": 1.3208,
"step": 16900
},
{
"epoch": 1.744365544921272,
"grad_norm": 1.7645074129104614,
"learning_rate": 1.653013289378799e-05,
"loss": 1.2486,
"step": 16950
},
{
"epoch": 1.749511165997736,
"grad_norm": 3.979423999786377,
"learning_rate": 1.651983104975791e-05,
"loss": 1.2587,
"step": 17000
},
{
"epoch": 1.749511165997736,
"eval_accuracy": 0.9519522786140442,
"eval_loss": 0.9862294793128967,
"eval_runtime": 417.504,
"eval_samples_per_second": 82.751,
"eval_steps_per_second": 2.587,
"step": 17000
},
{
"epoch": 1.7546567870741998,
"grad_norm": 3.5347177982330322,
"learning_rate": 1.6509529205727824e-05,
"loss": 1.3325,
"step": 17050
},
{
"epoch": 1.7598024081506638,
"grad_norm": 5.752897262573242,
"learning_rate": 1.6499227361697746e-05,
"loss": 1.3104,
"step": 17100
},
{
"epoch": 1.7649480292271278,
"grad_norm": 8.936431884765625,
"learning_rate": 1.6488925517667664e-05,
"loss": 1.2504,
"step": 17150
},
{
"epoch": 1.7700936503035916,
"grad_norm": 11.348810195922852,
"learning_rate": 1.6478623673637583e-05,
"loss": 1.3153,
"step": 17200
},
{
"epoch": 1.7752392713800556,
"grad_norm": 8.096456527709961,
"learning_rate": 1.64683218296075e-05,
"loss": 1.328,
"step": 17250
},
{
"epoch": 1.7752392713800556,
"eval_accuracy": 0.9529942870140076,
"eval_loss": 0.9803459644317627,
"eval_runtime": 417.43,
"eval_samples_per_second": 82.766,
"eval_steps_per_second": 2.587,
"step": 17250
},
{
"epoch": 1.7803848924565195,
"grad_norm": 4.984877586364746,
"learning_rate": 1.645801998557742e-05,
"loss": 1.3417,
"step": 17300
},
{
"epoch": 1.7855305135329833,
"grad_norm": 8.615971565246582,
"learning_rate": 1.6447718141547338e-05,
"loss": 1.2486,
"step": 17350
},
{
"epoch": 1.7906761346094475,
"grad_norm": 6.480031490325928,
"learning_rate": 1.6437416297517256e-05,
"loss": 1.2869,
"step": 17400
},
{
"epoch": 1.7958217556859113,
"grad_norm": 3.220890522003174,
"learning_rate": 1.6427114453487175e-05,
"loss": 1.3599,
"step": 17450
},
{
"epoch": 1.800967376762375,
"grad_norm": 6.310009956359863,
"learning_rate": 1.6416812609457093e-05,
"loss": 1.2822,
"step": 17500
},
{
"epoch": 1.800967376762375,
"eval_accuracy": 0.9526180028915405,
"eval_loss": 0.9846508502960205,
"eval_runtime": 417.34,
"eval_samples_per_second": 82.784,
"eval_steps_per_second": 2.588,
"step": 17500
},
{
"epoch": 1.8061129978388393,
"grad_norm": 9.1428804397583,
"learning_rate": 1.640651076542701e-05,
"loss": 1.3001,
"step": 17550
},
{
"epoch": 1.811258618915303,
"grad_norm": 11.503830909729004,
"learning_rate": 1.6396208921396934e-05,
"loss": 1.0848,
"step": 17600
},
{
"epoch": 1.816404239991767,
"grad_norm": 6.229241847991943,
"learning_rate": 1.638590707736685e-05,
"loss": 1.3171,
"step": 17650
},
{
"epoch": 1.821549861068231,
"grad_norm": 6.697323799133301,
"learning_rate": 1.637560523333677e-05,
"loss": 1.3387,
"step": 17700
},
{
"epoch": 1.8266954821446948,
"grad_norm": 12.814096450805664,
"learning_rate": 1.636530338930669e-05,
"loss": 1.2401,
"step": 17750
},
{
"epoch": 1.8266954821446948,
"eval_accuracy": 0.9528206586837769,
"eval_loss": 0.9803994297981262,
"eval_runtime": 417.6448,
"eval_samples_per_second": 82.723,
"eval_steps_per_second": 2.586,
"step": 17750
},
{
"epoch": 1.8318411032211588,
"grad_norm": 5.222059726715088,
"learning_rate": 1.6355001545276604e-05,
"loss": 1.2979,
"step": 17800
},
{
"epoch": 1.8369867242976228,
"grad_norm": 3.9017868041992188,
"learning_rate": 1.6344699701246526e-05,
"loss": 1.2222,
"step": 17850
},
{
"epoch": 1.8421323453740865,
"grad_norm": 7.524175643920898,
"learning_rate": 1.6334397857216444e-05,
"loss": 1.27,
"step": 17900
},
{
"epoch": 1.8472779664505505,
"grad_norm": 4.3478593826293945,
"learning_rate": 1.6324096013186362e-05,
"loss": 1.3109,
"step": 17950
},
{
"epoch": 1.8524235875270145,
"grad_norm": 8.614230155944824,
"learning_rate": 1.631379416915628e-05,
"loss": 1.2306,
"step": 18000
},
{
"epoch": 1.8524235875270145,
"eval_accuracy": 0.9536600112915039,
"eval_loss": 0.987566351890564,
"eval_runtime": 417.5552,
"eval_samples_per_second": 82.741,
"eval_steps_per_second": 2.586,
"step": 18000
},
{
"epoch": 1.8575692086034783,
"grad_norm": 7.108985900878906,
"learning_rate": 1.63034923251262e-05,
"loss": 1.1878,
"step": 18050
},
{
"epoch": 1.8627148296799425,
"grad_norm": 10.433032989501953,
"learning_rate": 1.6293190481096118e-05,
"loss": 1.2398,
"step": 18100
},
{
"epoch": 1.8678604507564063,
"grad_norm": 10.102560043334961,
"learning_rate": 1.6282888637066036e-05,
"loss": 1.2576,
"step": 18150
},
{
"epoch": 1.8730060718328703,
"grad_norm": 4.380664348602295,
"learning_rate": 1.6272586793035955e-05,
"loss": 1.1579,
"step": 18200
},
{
"epoch": 1.8781516929093343,
"grad_norm": 2.1999149322509766,
"learning_rate": 1.6262284949005873e-05,
"loss": 1.2889,
"step": 18250
},
{
"epoch": 1.8781516929093343,
"eval_accuracy": 0.9519233703613281,
"eval_loss": 0.9859423041343689,
"eval_runtime": 417.7845,
"eval_samples_per_second": 82.696,
"eval_steps_per_second": 2.585,
"step": 18250
},
{
"epoch": 1.883297313985798,
"grad_norm": 5.329191207885742,
"learning_rate": 1.625198310497579e-05,
"loss": 1.3331,
"step": 18300
},
{
"epoch": 1.888442935062262,
"grad_norm": 17.370649337768555,
"learning_rate": 1.624168126094571e-05,
"loss": 1.2957,
"step": 18350
},
{
"epoch": 1.893588556138726,
"grad_norm": 10.373506546020508,
"learning_rate": 1.6231379416915628e-05,
"loss": 1.2286,
"step": 18400
},
{
"epoch": 1.8987341772151898,
"grad_norm": 13.445988655090332,
"learning_rate": 1.622107757288555e-05,
"loss": 1.2513,
"step": 18450
},
{
"epoch": 1.9038797982916538,
"grad_norm": 11.915916442871094,
"learning_rate": 1.6210775728855465e-05,
"loss": 1.1702,
"step": 18500
},
{
"epoch": 1.9038797982916538,
"eval_accuracy": 0.9540941715240479,
"eval_loss": 0.9839755296707153,
"eval_runtime": 417.3645,
"eval_samples_per_second": 82.779,
"eval_steps_per_second": 2.588,
"step": 18500
},
{
"epoch": 1.9090254193681178,
"grad_norm": 13.058712005615234,
"learning_rate": 1.6200473884825383e-05,
"loss": 1.3181,
"step": 18550
},
{
"epoch": 1.9141710404445815,
"grad_norm": 8.620599746704102,
"learning_rate": 1.6190172040795305e-05,
"loss": 1.1976,
"step": 18600
},
{
"epoch": 1.9193166615210457,
"grad_norm": 7.074895858764648,
"learning_rate": 1.617987019676522e-05,
"loss": 1.3623,
"step": 18650
},
{
"epoch": 1.9244622825975095,
"grad_norm": 10.293702125549316,
"learning_rate": 1.6169568352735142e-05,
"loss": 1.2594,
"step": 18700
},
{
"epoch": 1.9296079036739735,
"grad_norm": 7.491464138031006,
"learning_rate": 1.615926650870506e-05,
"loss": 1.2902,
"step": 18750
},
{
"epoch": 1.9296079036739735,
"eval_accuracy": 0.9522128105163574,
"eval_loss": 0.9844051003456116,
"eval_runtime": 418.2592,
"eval_samples_per_second": 82.602,
"eval_steps_per_second": 2.582,
"step": 18750
},
{
"epoch": 1.9347535247504375,
"grad_norm": 2.0575156211853027,
"learning_rate": 1.614896466467498e-05,
"loss": 1.3283,
"step": 18800
},
{
"epoch": 1.9398991458269013,
"grad_norm": 10.51094913482666,
"learning_rate": 1.6138662820644897e-05,
"loss": 1.2987,
"step": 18850
},
{
"epoch": 1.9450447669033653,
"grad_norm": 6.296252727508545,
"learning_rate": 1.6128567013495417e-05,
"loss": 1.1987,
"step": 18900
},
{
"epoch": 1.9501903879798292,
"grad_norm": 2.280179738998413,
"learning_rate": 1.6118265169465335e-05,
"loss": 1.2385,
"step": 18950
},
{
"epoch": 1.955336009056293,
"grad_norm": 5.830591678619385,
"learning_rate": 1.6107963325435253e-05,
"loss": 1.2772,
"step": 19000
},
{
"epoch": 1.955336009056293,
"eval_accuracy": 0.9533126950263977,
"eval_loss": 0.9861400723457336,
"eval_runtime": 418.0997,
"eval_samples_per_second": 82.633,
"eval_steps_per_second": 2.583,
"step": 19000
},
{
"epoch": 1.960481630132757,
"grad_norm": 6.468533515930176,
"learning_rate": 1.6097661481405172e-05,
"loss": 1.1906,
"step": 19050
},
{
"epoch": 1.965627251209221,
"grad_norm": 7.839109420776367,
"learning_rate": 1.608735963737509e-05,
"loss": 1.3041,
"step": 19100
},
{
"epoch": 1.9707728722856848,
"grad_norm": 12.740795135498047,
"learning_rate": 1.607705779334501e-05,
"loss": 1.2345,
"step": 19150
},
{
"epoch": 1.975918493362149,
"grad_norm": 7.1893134117126465,
"learning_rate": 1.606675594931493e-05,
"loss": 1.2586,
"step": 19200
},
{
"epoch": 1.9810641144386127,
"grad_norm": 14.163928031921387,
"learning_rate": 1.6056454105284846e-05,
"loss": 1.196,
"step": 19250
},
{
"epoch": 1.9810641144386127,
"eval_accuracy": 0.9521838426589966,
"eval_loss": 0.9835113286972046,
"eval_runtime": 417.4557,
"eval_samples_per_second": 82.761,
"eval_steps_per_second": 2.587,
"step": 19250
},
{
"epoch": 1.9862097355150765,
"grad_norm": 5.9427618980407715,
"learning_rate": 1.6046152261254767e-05,
"loss": 1.2872,
"step": 19300
},
{
"epoch": 1.9913553565915407,
"grad_norm": 14.67308235168457,
"learning_rate": 1.6035850417224686e-05,
"loss": 1.2449,
"step": 19350
},
{
"epoch": 1.9965009776680045,
"grad_norm": 3.581702947616577,
"learning_rate": 1.6025548573194604e-05,
"loss": 1.2435,
"step": 19400
},
{
"epoch": 2.0016465987444683,
"grad_norm": 13.742449760437012,
"learning_rate": 1.6015246729164523e-05,
"loss": 1.3096,
"step": 19450
},
{
"epoch": 2.0067922198209325,
"grad_norm": 10.677633285522461,
"learning_rate": 1.600494488513444e-05,
"loss": 1.1697,
"step": 19500
},
{
"epoch": 2.0067922198209325,
"eval_accuracy": 0.9514312744140625,
"eval_loss": 1.0035802125930786,
"eval_runtime": 418.5248,
"eval_samples_per_second": 82.549,
"eval_steps_per_second": 2.58,
"step": 19500
},
{
"epoch": 2.0119378408973962,
"grad_norm": 6.8798418045043945,
"learning_rate": 1.599464304110436e-05,
"loss": 1.0556,
"step": 19550
},
{
"epoch": 2.0170834619738605,
"grad_norm": 8.785711288452148,
"learning_rate": 1.5984341197074278e-05,
"loss": 1.1592,
"step": 19600
},
{
"epoch": 2.0222290830503242,
"grad_norm": 11.857321739196777,
"learning_rate": 1.5974039353044196e-05,
"loss": 1.1808,
"step": 19650
},
{
"epoch": 2.027374704126788,
"grad_norm": 3.2849769592285156,
"learning_rate": 1.5963737509014115e-05,
"loss": 1.141,
"step": 19700
},
{
"epoch": 2.032520325203252,
"grad_norm": 7.293135166168213,
"learning_rate": 1.5953435664984033e-05,
"loss": 1.1139,
"step": 19750
},
{
"epoch": 2.032520325203252,
"eval_accuracy": 0.9516628384590149,
"eval_loss": 1.0205085277557373,
"eval_runtime": 417.5267,
"eval_samples_per_second": 82.747,
"eval_steps_per_second": 2.587,
"step": 19750
},
{
"epoch": 2.037665946279716,
"grad_norm": 10.044347763061523,
"learning_rate": 1.594313382095395e-05,
"loss": 1.1959,
"step": 19800
},
{
"epoch": 2.0428115673561797,
"grad_norm": 0.14353907108306885,
"learning_rate": 1.593283197692387e-05,
"loss": 1.0762,
"step": 19850
},
{
"epoch": 2.047957188432644,
"grad_norm": 9.524683952331543,
"learning_rate": 1.592253013289379e-05,
"loss": 1.3522,
"step": 19900
},
{
"epoch": 2.0531028095091077,
"grad_norm": 12.576092720031738,
"learning_rate": 1.5912228288863707e-05,
"loss": 1.1175,
"step": 19950
},
{
"epoch": 2.0582484305855715,
"grad_norm": 0.6325793862342834,
"learning_rate": 1.5901926444833625e-05,
"loss": 1.178,
"step": 20000
},
{
"epoch": 2.0582484305855715,
"eval_accuracy": 0.951228678226471,
"eval_loss": 1.018436074256897,
"eval_runtime": 417.7277,
"eval_samples_per_second": 82.707,
"eval_steps_per_second": 2.585,
"step": 20000
},
{
"epoch": 2.0633940516620357,
"grad_norm": 12.589435577392578,
"learning_rate": 1.5891624600803547e-05,
"loss": 1.1416,
"step": 20050
},
{
"epoch": 2.0685396727384995,
"grad_norm": 4.007430076599121,
"learning_rate": 1.5881322756773462e-05,
"loss": 1.1523,
"step": 20100
},
{
"epoch": 2.0736852938149637,
"grad_norm": 4.076907157897949,
"learning_rate": 1.5871020912743384e-05,
"loss": 1.2561,
"step": 20150
},
{
"epoch": 2.0788309148914275,
"grad_norm": 1.5451477766036987,
"learning_rate": 1.5860719068713302e-05,
"loss": 1.119,
"step": 20200
},
{
"epoch": 2.0839765359678912,
"grad_norm": 5.0513691902160645,
"learning_rate": 1.5850417224683217e-05,
"loss": 1.095,
"step": 20250
},
{
"epoch": 2.0839765359678912,
"eval_accuracy": 0.95041823387146,
"eval_loss": 1.0154516696929932,
"eval_runtime": 418.0138,
"eval_samples_per_second": 82.65,
"eval_steps_per_second": 2.584,
"step": 20250
},
{
"epoch": 2.0891221570443554,
"grad_norm": 9.095207214355469,
"learning_rate": 1.584011538065314e-05,
"loss": 1.1432,
"step": 20300
},
{
"epoch": 2.094267778120819,
"grad_norm": 9.274755477905273,
"learning_rate": 1.5829813536623058e-05,
"loss": 1.1455,
"step": 20350
},
{
"epoch": 2.099413399197283,
"grad_norm": 14.58219051361084,
"learning_rate": 1.5819511692592976e-05,
"loss": 1.0913,
"step": 20400
},
{
"epoch": 2.104559020273747,
"grad_norm": 12.373740196228027,
"learning_rate": 1.5809209848562894e-05,
"loss": 1.1671,
"step": 20450
},
{
"epoch": 2.109704641350211,
"grad_norm": 7.772844314575195,
"learning_rate": 1.5798908004532813e-05,
"loss": 1.2776,
"step": 20500
},
{
"epoch": 2.109704641350211,
"eval_accuracy": 0.9514023661613464,
"eval_loss": 1.0333930253982544,
"eval_runtime": 418.0428,
"eval_samples_per_second": 82.645,
"eval_steps_per_second": 2.583,
"step": 20500
},
{
"epoch": 2.1148502624266747,
"grad_norm": 7.367280006408691,
"learning_rate": 1.578860616050273e-05,
"loss": 1.3092,
"step": 20550
},
{
"epoch": 2.119995883503139,
"grad_norm": 10.191935539245605,
"learning_rate": 1.577830431647265e-05,
"loss": 1.1981,
"step": 20600
},
{
"epoch": 2.1251415045796027,
"grad_norm": 7.1885199546813965,
"learning_rate": 1.5768002472442568e-05,
"loss": 1.1399,
"step": 20650
},
{
"epoch": 2.1302871256560665,
"grad_norm": 1.4416226148605347,
"learning_rate": 1.5757700628412486e-05,
"loss": 1.0976,
"step": 20700
},
{
"epoch": 2.1354327467325307,
"grad_norm": 13.531189918518066,
"learning_rate": 1.5747398784382405e-05,
"loss": 1.1335,
"step": 20750
},
{
"epoch": 2.1354327467325307,
"eval_accuracy": 0.9518365263938904,
"eval_loss": 1.0136040449142456,
"eval_runtime": 418.6163,
"eval_samples_per_second": 82.531,
"eval_steps_per_second": 2.58,
"step": 20750
},
{
"epoch": 2.1405783678089945,
"grad_norm": 0.48753559589385986,
"learning_rate": 1.5737096940352323e-05,
"loss": 1.1567,
"step": 20800
},
{
"epoch": 2.1457239888854587,
"grad_norm": 1.1756892204284668,
"learning_rate": 1.5726795096322242e-05,
"loss": 1.2536,
"step": 20850
},
{
"epoch": 2.1508696099619224,
"grad_norm": 13.005239486694336,
"learning_rate": 1.5716493252292164e-05,
"loss": 1.1717,
"step": 20900
},
{
"epoch": 2.156015231038386,
"grad_norm": 13.348917961120605,
"learning_rate": 1.570619140826208e-05,
"loss": 1.1433,
"step": 20950
},
{
"epoch": 2.1611608521148504,
"grad_norm": 4.952757835388184,
"learning_rate": 1.5695889564231997e-05,
"loss": 1.1885,
"step": 21000
},
{
"epoch": 2.1611608521148504,
"eval_accuracy": 0.951170802116394,
"eval_loss": 1.0185319185256958,
"eval_runtime": 418.1975,
"eval_samples_per_second": 82.614,
"eval_steps_per_second": 2.583,
"step": 21000
},
{
"epoch": 2.166306473191314,
"grad_norm": 10.125651359558105,
"learning_rate": 1.568558772020192e-05,
"loss": 1.0543,
"step": 21050
},
{
"epoch": 2.171452094267778,
"grad_norm": 5.2072062492370605,
"learning_rate": 1.5675285876171834e-05,
"loss": 1.1122,
"step": 21100
},
{
"epoch": 2.176597715344242,
"grad_norm": 3.4542808532714844,
"learning_rate": 1.5664984032141756e-05,
"loss": 1.17,
"step": 21150
},
{
"epoch": 2.181743336420706,
"grad_norm": 1.4935418367385864,
"learning_rate": 1.5654682188111674e-05,
"loss": 1.0757,
"step": 21200
},
{
"epoch": 2.1868889574971697,
"grad_norm": 2.735926389694214,
"learning_rate": 1.5644380344081593e-05,
"loss": 1.3008,
"step": 21250
},
{
"epoch": 2.1868889574971697,
"eval_accuracy": 0.9506497979164124,
"eval_loss": 1.016100287437439,
"eval_runtime": 417.7311,
"eval_samples_per_second": 82.706,
"eval_steps_per_second": 2.585,
"step": 21250
},
{
"epoch": 2.192034578573634,
"grad_norm": 11.821269989013672,
"learning_rate": 1.563407850005151e-05,
"loss": 1.1723,
"step": 21300
},
{
"epoch": 2.1971801996500977,
"grad_norm": 1.3524460792541504,
"learning_rate": 1.562377665602143e-05,
"loss": 1.2517,
"step": 21350
},
{
"epoch": 2.2023258207265615,
"grad_norm": 6.308670520782471,
"learning_rate": 1.5613474811991348e-05,
"loss": 1.1834,
"step": 21400
},
{
"epoch": 2.2074714418030257,
"grad_norm": 10.960680961608887,
"learning_rate": 1.5603172967961266e-05,
"loss": 1.1284,
"step": 21450
},
{
"epoch": 2.2126170628794894,
"grad_norm": 8.426880836486816,
"learning_rate": 1.5592871123931185e-05,
"loss": 1.28,
"step": 21500
},
{
"epoch": 2.2126170628794894,
"eval_accuracy": 0.9507076740264893,
"eval_loss": 1.0217114686965942,
"eval_runtime": 418.3067,
"eval_samples_per_second": 82.593,
"eval_steps_per_second": 2.582,
"step": 21500
},
{
"epoch": 2.2177626839559537,
"grad_norm": 3.2604434490203857,
"learning_rate": 1.5582569279901103e-05,
"loss": 1.2478,
"step": 21550
},
{
"epoch": 2.2229083050324174,
"grad_norm": 1.98189115524292,
"learning_rate": 1.557226743587102e-05,
"loss": 1.1798,
"step": 21600
},
{
"epoch": 2.228053926108881,
"grad_norm": 4.054050445556641,
"learning_rate": 1.5562171628721544e-05,
"loss": 1.1218,
"step": 21650
},
{
"epoch": 2.2331995471853454,
"grad_norm": 10.367090225219727,
"learning_rate": 1.555186978469146e-05,
"loss": 1.2787,
"step": 21700
},
{
"epoch": 2.238345168261809,
"grad_norm": 12.966401100158691,
"learning_rate": 1.554156794066138e-05,
"loss": 1.1254,
"step": 21750
},
{
"epoch": 2.238345168261809,
"eval_accuracy": 0.9507656097412109,
"eval_loss": 1.0311578512191772,
"eval_runtime": 417.8078,
"eval_samples_per_second": 82.691,
"eval_steps_per_second": 2.585,
"step": 21750
},
{
"epoch": 2.243490789338273,
"grad_norm": 10.402215957641602,
"learning_rate": 1.55312660966313e-05,
"loss": 1.2375,
"step": 21800
},
{
"epoch": 2.248636410414737,
"grad_norm": 15.226551055908203,
"learning_rate": 1.5520964252601218e-05,
"loss": 1.1074,
"step": 21850
},
{
"epoch": 2.253782031491201,
"grad_norm": 7.523915767669678,
"learning_rate": 1.5510662408571136e-05,
"loss": 1.0927,
"step": 21900
},
{
"epoch": 2.2589276525676647,
"grad_norm": 8.177473068237305,
"learning_rate": 1.5500360564541055e-05,
"loss": 1.1691,
"step": 21950
},
{
"epoch": 2.264073273644129,
"grad_norm": 4.812458038330078,
"learning_rate": 1.5490058720510973e-05,
"loss": 1.1703,
"step": 22000
},
{
"epoch": 2.264073273644129,
"eval_accuracy": 0.9499261975288391,
"eval_loss": 1.0275415182113647,
"eval_runtime": 417.8493,
"eval_samples_per_second": 82.683,
"eval_steps_per_second": 2.585,
"step": 22000
},
{
"epoch": 2.2692188947205927,
"grad_norm": 5.703485012054443,
"learning_rate": 1.547975687648089e-05,
"loss": 1.2158,
"step": 22050
},
{
"epoch": 2.274364515797057,
"grad_norm": 10.64054012298584,
"learning_rate": 1.546945503245081e-05,
"loss": 1.1026,
"step": 22100
},
{
"epoch": 2.2795101368735207,
"grad_norm": 0.8261292576789856,
"learning_rate": 1.5459153188420728e-05,
"loss": 1.0644,
"step": 22150
},
{
"epoch": 2.2846557579499844,
"grad_norm": 5.98064661026001,
"learning_rate": 1.5448851344390647e-05,
"loss": 1.1092,
"step": 22200
},
{
"epoch": 2.2898013790264486,
"grad_norm": 10.404533386230469,
"learning_rate": 1.543854950036057e-05,
"loss": 1.1686,
"step": 22250
},
{
"epoch": 2.2898013790264486,
"eval_accuracy": 0.9511997699737549,
"eval_loss": 1.0342940092086792,
"eval_runtime": 417.6196,
"eval_samples_per_second": 82.728,
"eval_steps_per_second": 2.586,
"step": 22250
},
{
"epoch": 2.2949470001029124,
"grad_norm": 3.5781595706939697,
"learning_rate": 1.5428247656330483e-05,
"loss": 1.2711,
"step": 22300
},
{
"epoch": 2.300092621179376,
"grad_norm": 5.956209182739258,
"learning_rate": 1.5417945812300402e-05,
"loss": 1.2942,
"step": 22350
},
{
"epoch": 2.3052382422558404,
"grad_norm": 0.08046738803386688,
"learning_rate": 1.5407643968270324e-05,
"loss": 1.2073,
"step": 22400
},
{
"epoch": 2.310383863332304,
"grad_norm": 6.365548610687256,
"learning_rate": 1.539734212424024e-05,
"loss": 1.2131,
"step": 22450
},
{
"epoch": 2.315529484408768,
"grad_norm": 6.6707258224487305,
"learning_rate": 1.538704028021016e-05,
"loss": 1.1445,
"step": 22500
},
{
"epoch": 2.315529484408768,
"eval_accuracy": 0.9516628384590149,
"eval_loss": 1.0127946138381958,
"eval_runtime": 417.7487,
"eval_samples_per_second": 82.703,
"eval_steps_per_second": 2.585,
"step": 22500
},
{
"epoch": 2.320675105485232,
"grad_norm": 0.25063377618789673,
"learning_rate": 1.5376738436180076e-05,
"loss": 1.1553,
"step": 22550
},
{
"epoch": 2.325820726561696,
"grad_norm": 8.28696060180664,
"learning_rate": 1.5366436592149997e-05,
"loss": 1.1512,
"step": 22600
},
{
"epoch": 2.33096634763816,
"grad_norm": 1.361279845237732,
"learning_rate": 1.5356134748119916e-05,
"loss": 1.2069,
"step": 22650
},
{
"epoch": 2.336111968714624,
"grad_norm": 1.9882014989852905,
"learning_rate": 1.534583290408983e-05,
"loss": 1.1345,
"step": 22700
},
{
"epoch": 2.3412575897910877,
"grad_norm": 4.8411865234375,
"learning_rate": 1.5335531060059753e-05,
"loss": 1.1681,
"step": 22750
},
{
"epoch": 2.3412575897910877,
"eval_accuracy": 0.9508813619613647,
"eval_loss": 1.0100795030593872,
"eval_runtime": 417.4967,
"eval_samples_per_second": 82.753,
"eval_steps_per_second": 2.587,
"step": 22750
},
{
"epoch": 2.346403210867552,
"grad_norm": 6.883627414703369,
"learning_rate": 1.532522921602967e-05,
"loss": 1.1372,
"step": 22800
},
{
"epoch": 2.3515488319440156,
"grad_norm": 9.81013298034668,
"learning_rate": 1.531492737199959e-05,
"loss": 1.1393,
"step": 22850
},
{
"epoch": 2.3566944530204794,
"grad_norm": 7.514392852783203,
"learning_rate": 1.5304625527969508e-05,
"loss": 1.1327,
"step": 22900
},
{
"epoch": 2.3618400740969436,
"grad_norm": 2.8904621601104736,
"learning_rate": 1.5294323683939426e-05,
"loss": 1.0903,
"step": 22950
},
{
"epoch": 2.3669856951734074,
"grad_norm": 7.99860954284668,
"learning_rate": 1.5284021839909345e-05,
"loss": 1.1354,
"step": 23000
},
{
"epoch": 2.3669856951734074,
"eval_accuracy": 0.9513444900512695,
"eval_loss": 1.0172919034957886,
"eval_runtime": 418.1914,
"eval_samples_per_second": 82.615,
"eval_steps_per_second": 2.583,
"step": 23000
},
{
"epoch": 2.372131316249871,
"grad_norm": 3.2717432975769043,
"learning_rate": 1.5273719995879263e-05,
"loss": 1.2517,
"step": 23050
},
{
"epoch": 2.3772769373263354,
"grad_norm": 4.3879594802856445,
"learning_rate": 1.526341815184918e-05,
"loss": 1.0634,
"step": 23100
},
{
"epoch": 2.382422558402799,
"grad_norm": 8.286825180053711,
"learning_rate": 1.52531163078191e-05,
"loss": 1.2095,
"step": 23150
},
{
"epoch": 2.3875681794792634,
"grad_norm": 2.0084967613220215,
"learning_rate": 1.524281446378902e-05,
"loss": 1.1686,
"step": 23200
},
{
"epoch": 2.392713800555727,
"grad_norm": 5.974940776824951,
"learning_rate": 1.5232512619758939e-05,
"loss": 1.1063,
"step": 23250
},
{
"epoch": 2.392713800555727,
"eval_accuracy": 0.9516628384590149,
"eval_loss": 1.0242797136306763,
"eval_runtime": 417.7754,
"eval_samples_per_second": 82.698,
"eval_steps_per_second": 2.585,
"step": 23250
},
{
"epoch": 2.397859421632191,
"grad_norm": 7.380057334899902,
"learning_rate": 1.5222210775728857e-05,
"loss": 1.1309,
"step": 23300
},
{
"epoch": 2.403005042708655,
"grad_norm": 1.6446843147277832,
"learning_rate": 1.5211908931698775e-05,
"loss": 1.1869,
"step": 23350
},
{
"epoch": 2.408150663785119,
"grad_norm": 5.716843605041504,
"learning_rate": 1.5201607087668696e-05,
"loss": 1.1743,
"step": 23400
},
{
"epoch": 2.4132962848615827,
"grad_norm": 2.141338586807251,
"learning_rate": 1.5191305243638612e-05,
"loss": 1.1001,
"step": 23450
},
{
"epoch": 2.418441905938047,
"grad_norm": 1.5462799072265625,
"learning_rate": 1.5181003399608532e-05,
"loss": 1.1696,
"step": 23500
},
{
"epoch": 2.418441905938047,
"eval_accuracy": 0.9524732828140259,
"eval_loss": 1.0314745903015137,
"eval_runtime": 418.5814,
"eval_samples_per_second": 82.538,
"eval_steps_per_second": 2.58,
"step": 23500
},
{
"epoch": 2.4235875270145106,
"grad_norm": 10.894279479980469,
"learning_rate": 1.5170701555578449e-05,
"loss": 1.1493,
"step": 23550
},
{
"epoch": 2.4287331480909744,
"grad_norm": 10.256954193115234,
"learning_rate": 1.5160399711548367e-05,
"loss": 1.1486,
"step": 23600
},
{
"epoch": 2.4338787691674386,
"grad_norm": 14.089293479919434,
"learning_rate": 1.5150097867518288e-05,
"loss": 1.2302,
"step": 23650
},
{
"epoch": 2.4390243902439024,
"grad_norm": 7.1174492835998535,
"learning_rate": 1.5139796023488204e-05,
"loss": 1.1427,
"step": 23700
},
{
"epoch": 2.4441700113203666,
"grad_norm": 1.6686251163482666,
"learning_rate": 1.5129494179458124e-05,
"loss": 1.2123,
"step": 23750
},
{
"epoch": 2.4441700113203666,
"eval_accuracy": 0.9509971141815186,
"eval_loss": 1.0296884775161743,
"eval_runtime": 418.2348,
"eval_samples_per_second": 82.607,
"eval_steps_per_second": 2.582,
"step": 23750
},
{
"epoch": 2.4493156323968304,
"grad_norm": 0.8798663020133972,
"learning_rate": 1.5119192335428043e-05,
"loss": 1.1169,
"step": 23800
},
{
"epoch": 2.454461253473294,
"grad_norm": 2.511453151702881,
"learning_rate": 1.5108890491397961e-05,
"loss": 1.1688,
"step": 23850
},
{
"epoch": 2.4596068745497583,
"grad_norm": 8.896649360656738,
"learning_rate": 1.509858864736788e-05,
"loss": 1.0506,
"step": 23900
},
{
"epoch": 2.464752495626222,
"grad_norm": 12.617236137390137,
"learning_rate": 1.50882868033378e-05,
"loss": 1.1965,
"step": 23950
},
{
"epoch": 2.469898116702686,
"grad_norm": 14.843036651611328,
"learning_rate": 1.5077984959307717e-05,
"loss": 1.1253,
"step": 24000
},
{
"epoch": 2.469898116702686,
"eval_accuracy": 0.9508234858512878,
"eval_loss": 1.0238152742385864,
"eval_runtime": 418.2418,
"eval_samples_per_second": 82.605,
"eval_steps_per_second": 2.582,
"step": 24000
},
{
"epoch": 2.47504373777915,
"grad_norm": 13.253664016723633,
"learning_rate": 1.5067683115277637e-05,
"loss": 1.1957,
"step": 24050
},
{
"epoch": 2.480189358855614,
"grad_norm": 4.080730438232422,
"learning_rate": 1.5057381271247555e-05,
"loss": 1.1395,
"step": 24100
},
{
"epoch": 2.4853349799320776,
"grad_norm": 1.9019577503204346,
"learning_rate": 1.5047079427217472e-05,
"loss": 1.1238,
"step": 24150
},
{
"epoch": 2.490480601008542,
"grad_norm": 10.57223129272461,
"learning_rate": 1.5036983620067993e-05,
"loss": 1.1342,
"step": 24200
},
{
"epoch": 2.4956262220850056,
"grad_norm": 11.598908424377441,
"learning_rate": 1.5026681776037913e-05,
"loss": 1.1703,
"step": 24250
},
{
"epoch": 2.4956262220850056,
"eval_accuracy": 0.9505629539489746,
"eval_loss": 1.0218814611434937,
"eval_runtime": 418.3594,
"eval_samples_per_second": 82.582,
"eval_steps_per_second": 2.582,
"step": 24250
},
{
"epoch": 2.50077184316147,
"grad_norm": 9.800350189208984,
"learning_rate": 1.501637993200783e-05,
"loss": 1.0947,
"step": 24300
},
{
"epoch": 2.5059174642379336,
"grad_norm": 3.784536361694336,
"learning_rate": 1.500607808797775e-05,
"loss": 1.1281,
"step": 24350
},
{
"epoch": 2.5110630853143974,
"grad_norm": 2.499333620071411,
"learning_rate": 1.4995776243947668e-05,
"loss": 1.1029,
"step": 24400
},
{
"epoch": 2.516208706390861,
"grad_norm": 9.453124046325684,
"learning_rate": 1.4985474399917585e-05,
"loss": 1.1784,
"step": 24450
},
{
"epoch": 2.5213543274673254,
"grad_norm": 2.7490689754486084,
"learning_rate": 1.4975172555887505e-05,
"loss": 1.101,
"step": 24500
},
{
"epoch": 2.5213543274673254,
"eval_accuracy": 0.9527627229690552,
"eval_loss": 1.0266767740249634,
"eval_runtime": 417.6079,
"eval_samples_per_second": 82.731,
"eval_steps_per_second": 2.586,
"step": 24500
},
{
"epoch": 2.526499948543789,
"grad_norm": 8.228843688964844,
"learning_rate": 1.4964870711857425e-05,
"loss": 1.1231,
"step": 24550
},
{
"epoch": 2.5316455696202533,
"grad_norm": 8.344508171081543,
"learning_rate": 1.4954568867827342e-05,
"loss": 1.1364,
"step": 24600
},
{
"epoch": 2.536791190696717,
"grad_norm": 2.6875457763671875,
"learning_rate": 1.494426702379726e-05,
"loss": 1.1778,
"step": 24650
},
{
"epoch": 2.541936811773181,
"grad_norm": 6.898427486419678,
"learning_rate": 1.493396517976718e-05,
"loss": 1.1089,
"step": 24700
},
{
"epoch": 2.547082432849645,
"grad_norm": 3.228970766067505,
"learning_rate": 1.4923663335737097e-05,
"loss": 1.1626,
"step": 24750
},
{
"epoch": 2.547082432849645,
"eval_accuracy": 0.9508234858512878,
"eval_loss": 1.0254093408584595,
"eval_runtime": 417.4897,
"eval_samples_per_second": 82.754,
"eval_steps_per_second": 2.587,
"step": 24750
},
{
"epoch": 2.552228053926109,
"grad_norm": 4.928084850311279,
"learning_rate": 1.4913361491707017e-05,
"loss": 1.2019,
"step": 24800
},
{
"epoch": 2.557373675002573,
"grad_norm": 15.422240257263184,
"learning_rate": 1.4903059647676936e-05,
"loss": 1.1503,
"step": 24850
},
{
"epoch": 2.562519296079037,
"grad_norm": 11.451377868652344,
"learning_rate": 1.4892757803646854e-05,
"loss": 1.1697,
"step": 24900
},
{
"epoch": 2.5676649171555006,
"grad_norm": 7.738549709320068,
"learning_rate": 1.4882455959616772e-05,
"loss": 1.0921,
"step": 24950
},
{
"epoch": 2.5728105382319644,
"grad_norm": 2.7136483192443848,
"learning_rate": 1.4872154115586692e-05,
"loss": 1.3136,
"step": 25000
},
{
"epoch": 2.5728105382319644,
"eval_accuracy": 0.9512865543365479,
"eval_loss": 1.0222209692001343,
"eval_runtime": 419.1601,
"eval_samples_per_second": 82.424,
"eval_steps_per_second": 2.577,
"step": 25000
}
],
"logging_steps": 50,
"max_steps": 97170,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}