BOND-reranker / trainer_state.json
rgrupesh's picture
Upload folder using huggingface_hub
770160f verified
{
"best_global_step": 69500,
"best_metric": 0.9733653983882032,
"best_model_checkpoint": "./results/checkpoint-69500",
"epoch": 2.278541733290694,
"eval_steps": 500,
"global_step": 85500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.332498301064666e-05,
"grad_norm": 2.171241283416748,
"learning_rate": 0.0,
"loss": 1.1419,
"step": 1
},
{
"epoch": 0.0013324983010646661,
"grad_norm": 3.923346757888794,
"learning_rate": 1.319120586275816e-07,
"loss": 0.9369,
"step": 100
},
{
"epoch": 0.0026649966021293323,
"grad_norm": 5.213994026184082,
"learning_rate": 2.651565622918055e-07,
"loss": 0.9031,
"step": 200
},
{
"epoch": 0.003997494903193999,
"grad_norm": 3.4589016437530518,
"learning_rate": 3.984010659560293e-07,
"loss": 0.8309,
"step": 300
},
{
"epoch": 0.0053299932042586646,
"grad_norm": 2.302459239959717,
"learning_rate": 5.316455696202532e-07,
"loss": 0.7406,
"step": 400
},
{
"epoch": 0.0066624915053233305,
"grad_norm": 2.8590707778930664,
"learning_rate": 6.64890073284477e-07,
"loss": 0.7311,
"step": 500
},
{
"epoch": 0.0066624915053233305,
"eval_dev_accuracy": 0.9312310116323776,
"eval_dev_accuracy_threshold": 0.48881804943084717,
"eval_dev_average_precision": 0.1090500582180461,
"eval_dev_f1": 0.1867953275774505,
"eval_dev_f1_threshold": 0.31155017018318176,
"eval_dev_precision": 0.17409826753763136,
"eval_dev_recall": 0.20149008436507068,
"eval_loss": 0.7330209612846375,
"eval_runtime": 567.4178,
"eval_samples_per_second": 233.773,
"eval_steps_per_second": 7.307,
"step": 500
},
{
"epoch": 0.007994989806387997,
"grad_norm": 2.162013530731201,
"learning_rate": 7.981345769487009e-07,
"loss": 0.725,
"step": 600
},
{
"epoch": 0.009327488107452663,
"grad_norm": 3.412961959838867,
"learning_rate": 9.313790806129248e-07,
"loss": 0.6892,
"step": 700
},
{
"epoch": 0.010659986408517329,
"grad_norm": 3.037612199783325,
"learning_rate": 1.0646235842771487e-06,
"loss": 0.74,
"step": 800
},
{
"epoch": 0.011992484709581995,
"grad_norm": 3.178318977355957,
"learning_rate": 1.1978680879413725e-06,
"loss": 0.6857,
"step": 900
},
{
"epoch": 0.013324983010646661,
"grad_norm": 3.9319422245025635,
"learning_rate": 1.3311125916055965e-06,
"loss": 0.6784,
"step": 1000
},
{
"epoch": 0.013324983010646661,
"eval_dev_accuracy": 0.9312913220804089,
"eval_dev_accuracy_threshold": 0.5571334362030029,
"eval_dev_average_precision": 0.2217988734731733,
"eval_dev_f1": 0.3029693004529441,
"eval_dev_f1_threshold": 0.3375406265258789,
"eval_dev_precision": 0.2600942655145326,
"eval_dev_recall": 0.36276980387860197,
"eval_loss": 0.6632949113845825,
"eval_runtime": 567.4434,
"eval_samples_per_second": 233.763,
"eval_steps_per_second": 7.306,
"step": 1000
},
{
"epoch": 0.014657481311711327,
"grad_norm": 3.4704461097717285,
"learning_rate": 1.4643570952698202e-06,
"loss": 0.6753,
"step": 1100
},
{
"epoch": 0.015989979612775995,
"grad_norm": 5.541119575500488,
"learning_rate": 1.597601598934044e-06,
"loss": 0.6707,
"step": 1200
},
{
"epoch": 0.01732247791384066,
"grad_norm": 4.9743475914001465,
"learning_rate": 1.7308461025982678e-06,
"loss": 0.6679,
"step": 1300
},
{
"epoch": 0.018654976214905326,
"grad_norm": 7.222622394561768,
"learning_rate": 1.864090606262492e-06,
"loss": 0.5831,
"step": 1400
},
{
"epoch": 0.01998747451596999,
"grad_norm": 3.6720590591430664,
"learning_rate": 1.9973351099267156e-06,
"loss": 0.5589,
"step": 1500
},
{
"epoch": 0.01998747451596999,
"eval_dev_accuracy": 0.9353095056804903,
"eval_dev_accuracy_threshold": 0.5631594657897949,
"eval_dev_average_precision": 0.338116839920073,
"eval_dev_f1": 0.39591571740541814,
"eval_dev_f1_threshold": 0.4508041739463806,
"eval_dev_precision": 0.4291197543500512,
"eval_dev_recall": 0.3674811000328695,
"eval_loss": 0.6119648814201355,
"eval_runtime": 567.5553,
"eval_samples_per_second": 233.716,
"eval_steps_per_second": 7.305,
"step": 1500
},
{
"epoch": 0.021319972817034658,
"grad_norm": 10.199407577514648,
"learning_rate": 2.1305796135909398e-06,
"loss": 0.6065,
"step": 1600
},
{
"epoch": 0.022652471118099326,
"grad_norm": 6.087101459503174,
"learning_rate": 2.2638241172551636e-06,
"loss": 0.5724,
"step": 1700
},
{
"epoch": 0.02398496941916399,
"grad_norm": 16.529647827148438,
"learning_rate": 2.3970686209193873e-06,
"loss": 0.5568,
"step": 1800
},
{
"epoch": 0.025317467720228658,
"grad_norm": 14.971884727478027,
"learning_rate": 2.530313124583611e-06,
"loss": 0.5603,
"step": 1900
},
{
"epoch": 0.026649966021293322,
"grad_norm": 4.663777828216553,
"learning_rate": 2.663557628247835e-06,
"loss": 0.5553,
"step": 2000
},
{
"epoch": 0.026649966021293322,
"eval_dev_accuracy": 0.9353848937405294,
"eval_dev_accuracy_threshold": 0.6437499523162842,
"eval_dev_average_precision": 0.38323093653846474,
"eval_dev_f1": 0.4514054443643622,
"eval_dev_f1_threshold": 0.6111855506896973,
"eval_dev_precision": 0.4569023569023569,
"eval_dev_recall": 0.44603922427960996,
"eval_loss": 0.5613667964935303,
"eval_runtime": 568.0344,
"eval_samples_per_second": 233.519,
"eval_steps_per_second": 7.299,
"step": 2000
},
{
"epoch": 0.02798246432235799,
"grad_norm": 5.051695823669434,
"learning_rate": 2.7968021319120587e-06,
"loss": 0.5506,
"step": 2100
},
{
"epoch": 0.029314962623422654,
"grad_norm": 12.604368209838867,
"learning_rate": 2.930046635576283e-06,
"loss": 0.5446,
"step": 2200
},
{
"epoch": 0.03064746092448732,
"grad_norm": 3.9183976650238037,
"learning_rate": 3.0632911392405066e-06,
"loss": 0.5432,
"step": 2300
},
{
"epoch": 0.03197995922555199,
"grad_norm": 5.165050983428955,
"learning_rate": 3.1965356429047304e-06,
"loss": 0.5091,
"step": 2400
},
{
"epoch": 0.03331245752661666,
"grad_norm": 10.820756912231445,
"learning_rate": 3.3297801465689546e-06,
"loss": 0.5099,
"step": 2500
},
{
"epoch": 0.03331245752661666,
"eval_dev_accuracy": 0.9395613922666928,
"eval_dev_accuracy_threshold": 0.700435996055603,
"eval_dev_average_precision": 0.4498892909951412,
"eval_dev_f1": 0.4910784423745932,
"eval_dev_f1_threshold": 0.5620608925819397,
"eval_dev_precision": 0.5032777458309373,
"eval_dev_recall": 0.47945655746685656,
"eval_loss": 0.5332435369491577,
"eval_runtime": 567.4907,
"eval_samples_per_second": 233.743,
"eval_steps_per_second": 7.306,
"step": 2500
},
{
"epoch": 0.03464495582768132,
"grad_norm": 5.984354496002197,
"learning_rate": 3.4630246502331784e-06,
"loss": 0.5168,
"step": 2600
},
{
"epoch": 0.035977454128745985,
"grad_norm": 11.091134071350098,
"learning_rate": 3.596269153897402e-06,
"loss": 0.4763,
"step": 2700
},
{
"epoch": 0.03730995242981065,
"grad_norm": 25.33905601501465,
"learning_rate": 3.729513657561626e-06,
"loss": 0.4916,
"step": 2800
},
{
"epoch": 0.03864245073087532,
"grad_norm": 7.44692325592041,
"learning_rate": 3.862758161225849e-06,
"loss": 0.4842,
"step": 2900
},
{
"epoch": 0.03997494903193998,
"grad_norm": 11.449934005737305,
"learning_rate": 3.996002664890073e-06,
"loss": 0.5246,
"step": 3000
},
{
"epoch": 0.03997494903193998,
"eval_dev_accuracy": 0.9320828967108189,
"eval_dev_accuracy_threshold": 0.7955138683319092,
"eval_dev_average_precision": 0.40640646661588725,
"eval_dev_f1": 0.5273073175258689,
"eval_dev_f1_threshold": 0.7331215143203735,
"eval_dev_precision": 0.45734063103670314,
"eval_dev_recall": 0.6225484825243782,
"eval_loss": 0.4692871868610382,
"eval_runtime": 565.8018,
"eval_samples_per_second": 234.441,
"eval_steps_per_second": 7.328,
"step": 3000
},
{
"epoch": 0.04130744733300465,
"grad_norm": 2.739481210708618,
"learning_rate": 4.129247168554298e-06,
"loss": 0.4399,
"step": 3100
},
{
"epoch": 0.042639945634069316,
"grad_norm": 18.604293823242188,
"learning_rate": 4.2624916722185215e-06,
"loss": 0.4532,
"step": 3200
},
{
"epoch": 0.043972443935133984,
"grad_norm": 2.9506380558013916,
"learning_rate": 4.395736175882745e-06,
"loss": 0.5107,
"step": 3300
},
{
"epoch": 0.04530494223619865,
"grad_norm": 6.515221118927002,
"learning_rate": 4.528980679546969e-06,
"loss": 0.4249,
"step": 3400
},
{
"epoch": 0.04663744053726331,
"grad_norm": 8.708155632019043,
"learning_rate": 4.662225183211193e-06,
"loss": 0.4526,
"step": 3500
},
{
"epoch": 0.04663744053726331,
"eval_dev_accuracy": 0.9425166042202237,
"eval_dev_accuracy_threshold": 0.8969273567199707,
"eval_dev_average_precision": 0.5181562757024104,
"eval_dev_f1": 0.5669769324160259,
"eval_dev_f1_threshold": 0.7522543668746948,
"eval_dev_precision": 0.5266422328728503,
"eval_dev_recall": 0.6140024104305906,
"eval_loss": 0.44245800375938416,
"eval_runtime": 566.66,
"eval_samples_per_second": 234.086,
"eval_steps_per_second": 7.317,
"step": 3500
},
{
"epoch": 0.04796993883832798,
"grad_norm": 20.27404022216797,
"learning_rate": 4.795469686875417e-06,
"loss": 0.4791,
"step": 3600
},
{
"epoch": 0.04930243713939265,
"grad_norm": 26.697437286376953,
"learning_rate": 4.92871419053964e-06,
"loss": 0.4151,
"step": 3700
},
{
"epoch": 0.050634935440457315,
"grad_norm": 29.9031982421875,
"learning_rate": 5.061958694203864e-06,
"loss": 0.4842,
"step": 3800
},
{
"epoch": 0.051967433741521976,
"grad_norm": 33.03110885620117,
"learning_rate": 5.195203197868088e-06,
"loss": 0.4062,
"step": 3900
},
{
"epoch": 0.053299932042586644,
"grad_norm": 18.199092864990234,
"learning_rate": 5.328447701532313e-06,
"loss": 0.4491,
"step": 4000
},
{
"epoch": 0.053299932042586644,
"eval_dev_accuracy": 0.9464367833422542,
"eval_dev_accuracy_threshold": 0.8940709829330444,
"eval_dev_average_precision": 0.5665972703365837,
"eval_dev_f1": 0.5783120410421486,
"eval_dev_f1_threshold": 0.532160758972168,
"eval_dev_precision": 0.5345420734542073,
"eval_dev_recall": 0.6298893393228882,
"eval_loss": 0.4243237376213074,
"eval_runtime": 565.9608,
"eval_samples_per_second": 234.375,
"eval_steps_per_second": 7.326,
"step": 4000
},
{
"epoch": 0.05463243034365131,
"grad_norm": 2.3968331813812256,
"learning_rate": 5.461692205196536e-06,
"loss": 0.3937,
"step": 4100
},
{
"epoch": 0.05596492864471598,
"grad_norm": 3.501485586166382,
"learning_rate": 5.59493670886076e-06,
"loss": 0.4806,
"step": 4200
},
{
"epoch": 0.05729742694578065,
"grad_norm": 20.607412338256836,
"learning_rate": 5.728181212524984e-06,
"loss": 0.4355,
"step": 4300
},
{
"epoch": 0.05862992524684531,
"grad_norm": 11.288957595825195,
"learning_rate": 5.861425716189208e-06,
"loss": 0.4579,
"step": 4400
},
{
"epoch": 0.059962423547909975,
"grad_norm": 23.52041244506836,
"learning_rate": 5.9946702198534315e-06,
"loss": 0.4232,
"step": 4500
},
{
"epoch": 0.059962423547909975,
"eval_dev_accuracy": 0.9480651654390978,
"eval_dev_accuracy_threshold": 0.925118088722229,
"eval_dev_average_precision": 0.5978901727391869,
"eval_dev_f1": 0.5897354160025502,
"eval_dev_f1_threshold": 0.8643622994422913,
"eval_dev_precision": 0.5724600309437855,
"eval_dev_recall": 0.6080858989810453,
"eval_loss": 0.4087965786457062,
"eval_runtime": 564.9476,
"eval_samples_per_second": 234.795,
"eval_steps_per_second": 7.339,
"step": 4500
},
{
"epoch": 0.06129492184897464,
"grad_norm": 8.910244941711426,
"learning_rate": 6.127914723517655e-06,
"loss": 0.4195,
"step": 4600
},
{
"epoch": 0.0626274201500393,
"grad_norm": 10.253131866455078,
"learning_rate": 6.261159227181879e-06,
"loss": 0.4332,
"step": 4700
},
{
"epoch": 0.06395991845110398,
"grad_norm": 6.71283483505249,
"learning_rate": 6.394403730846103e-06,
"loss": 0.433,
"step": 4800
},
{
"epoch": 0.06529241675216864,
"grad_norm": 13.018428802490234,
"learning_rate": 6.527648234510327e-06,
"loss": 0.3978,
"step": 4900
},
{
"epoch": 0.06662491505323331,
"grad_norm": 5.483168601989746,
"learning_rate": 6.660892738174551e-06,
"loss": 0.4165,
"step": 5000
},
{
"epoch": 0.06662491505323331,
"eval_dev_accuracy": 0.9488642788755117,
"eval_dev_accuracy_threshold": 0.9351813793182373,
"eval_dev_average_precision": 0.611806667891919,
"eval_dev_f1": 0.594213494881972,
"eval_dev_f1_threshold": 0.8715409636497498,
"eval_dev_precision": 0.5677078135914579,
"eval_dev_recall": 0.6233154377122823,
"eval_loss": 0.4015994369983673,
"eval_runtime": 565.0203,
"eval_samples_per_second": 234.765,
"eval_steps_per_second": 7.338,
"step": 5000
},
{
"epoch": 0.06795741335429797,
"grad_norm": 44.895931243896484,
"learning_rate": 6.794137241838775e-06,
"loss": 0.3173,
"step": 5100
},
{
"epoch": 0.06928991165536263,
"grad_norm": 19.51112937927246,
"learning_rate": 6.927381745502999e-06,
"loss": 0.4279,
"step": 5200
},
{
"epoch": 0.07062240995642731,
"grad_norm": 11.284177780151367,
"learning_rate": 7.0606262491672225e-06,
"loss": 0.4278,
"step": 5300
},
{
"epoch": 0.07195490825749197,
"grad_norm": 12.088862419128418,
"learning_rate": 7.193870752831446e-06,
"loss": 0.394,
"step": 5400
},
{
"epoch": 0.07328740655855664,
"grad_norm": 5.778110504150391,
"learning_rate": 7.32711525649567e-06,
"loss": 0.4033,
"step": 5500
},
{
"epoch": 0.07328740655855664,
"eval_dev_accuracy": 0.9500780266421405,
"eval_dev_accuracy_threshold": 0.9353994131088257,
"eval_dev_average_precision": 0.6400665115573199,
"eval_dev_f1": 0.6073723716004319,
"eval_dev_f1_threshold": 0.8007456064224243,
"eval_dev_precision": 0.5721619527314994,
"eval_dev_recall": 0.6472006135641504,
"eval_loss": 0.3888355791568756,
"eval_runtime": 566.2917,
"eval_samples_per_second": 234.238,
"eval_steps_per_second": 7.321,
"step": 5500
},
{
"epoch": 0.0746199048596213,
"grad_norm": 7.1619439125061035,
"learning_rate": 7.460359760159894e-06,
"loss": 0.3775,
"step": 5600
},
{
"epoch": 0.07595240316068597,
"grad_norm": 12.566367149353027,
"learning_rate": 7.593604263824118e-06,
"loss": 0.3944,
"step": 5700
},
{
"epoch": 0.07728490146175064,
"grad_norm": 10.173190116882324,
"learning_rate": 7.726848767488342e-06,
"loss": 0.4256,
"step": 5800
},
{
"epoch": 0.0786173997628153,
"grad_norm": 1.7395318746566772,
"learning_rate": 7.860093271152565e-06,
"loss": 0.3984,
"step": 5900
},
{
"epoch": 0.07994989806387996,
"grad_norm": 3.9586873054504395,
"learning_rate": 7.99333777481679e-06,
"loss": 0.3545,
"step": 6000
},
{
"epoch": 0.07994989806387996,
"eval_dev_accuracy": 0.949716163953953,
"eval_dev_accuracy_threshold": 0.9251655340194702,
"eval_dev_average_precision": 0.6562730620483952,
"eval_dev_f1": 0.6157150706828513,
"eval_dev_f1_threshold": 0.4972879886627197,
"eval_dev_precision": 0.5658281307381564,
"eval_dev_recall": 0.6752492604360688,
"eval_loss": 0.38393494486808777,
"eval_runtime": 567.4221,
"eval_samples_per_second": 233.771,
"eval_steps_per_second": 7.307,
"step": 6000
},
{
"epoch": 0.08128239636494464,
"grad_norm": 4.9177398681640625,
"learning_rate": 8.126582278481013e-06,
"loss": 0.4551,
"step": 6100
},
{
"epoch": 0.0826148946660093,
"grad_norm": 14.003257751464844,
"learning_rate": 8.259826782145237e-06,
"loss": 0.3817,
"step": 6200
},
{
"epoch": 0.08394739296707397,
"grad_norm": 7.29791259765625,
"learning_rate": 8.39307128580946e-06,
"loss": 0.408,
"step": 6300
},
{
"epoch": 0.08527989126813863,
"grad_norm": 26.11504554748535,
"learning_rate": 8.526315789473685e-06,
"loss": 0.4176,
"step": 6400
},
{
"epoch": 0.0866123895692033,
"grad_norm": 21.16114616394043,
"learning_rate": 8.659560293137908e-06,
"loss": 0.4128,
"step": 6500
},
{
"epoch": 0.0866123895692033,
"eval_dev_accuracy": 0.9528523072515775,
"eval_dev_accuracy_threshold": 0.9102756977081299,
"eval_dev_average_precision": 0.6708215133735886,
"eval_dev_f1": 0.6232578397212545,
"eval_dev_f1_threshold": 0.6869294047355652,
"eval_dev_precision": 0.6194134833892436,
"eval_dev_recall": 0.6271502136518023,
"eval_loss": 0.35909053683280945,
"eval_runtime": 568.2468,
"eval_samples_per_second": 233.432,
"eval_steps_per_second": 7.296,
"step": 6500
},
{
"epoch": 0.08794488787026797,
"grad_norm": 2.0384860038757324,
"learning_rate": 8.792804796802133e-06,
"loss": 0.3837,
"step": 6600
},
{
"epoch": 0.08927738617133263,
"grad_norm": 9.94750690460205,
"learning_rate": 8.926049300466355e-06,
"loss": 0.3618,
"step": 6700
},
{
"epoch": 0.0906098844723973,
"grad_norm": 3.8198211193084717,
"learning_rate": 9.05929380413058e-06,
"loss": 0.3643,
"step": 6800
},
{
"epoch": 0.09194238277346196,
"grad_norm": 14.838878631591797,
"learning_rate": 9.192538307794803e-06,
"loss": 0.3409,
"step": 6900
},
{
"epoch": 0.09327488107452662,
"grad_norm": 25.42053985595703,
"learning_rate": 9.325782811459028e-06,
"loss": 0.4001,
"step": 7000
},
{
"epoch": 0.09327488107452662,
"eval_dev_accuracy": 0.9507640579884958,
"eval_dev_accuracy_threshold": 0.7285012006759644,
"eval_dev_average_precision": 0.6341417194028635,
"eval_dev_f1": 0.6159875449616148,
"eval_dev_f1_threshold": 0.3488144874572754,
"eval_dev_precision": 0.6038947368421053,
"eval_dev_recall": 0.628574559000767,
"eval_loss": 0.46753522753715515,
"eval_runtime": 566.0195,
"eval_samples_per_second": 234.351,
"eval_steps_per_second": 7.325,
"step": 7000
},
{
"epoch": 0.0946073793755913,
"grad_norm": 21.88748550415039,
"learning_rate": 9.459027315123252e-06,
"loss": 0.4186,
"step": 7100
},
{
"epoch": 0.09593987767665596,
"grad_norm": 5.960207939147949,
"learning_rate": 9.592271818787475e-06,
"loss": 0.3478,
"step": 7200
},
{
"epoch": 0.09727237597772063,
"grad_norm": 16.917625427246094,
"learning_rate": 9.7255163224517e-06,
"loss": 0.3492,
"step": 7300
},
{
"epoch": 0.0986048742787853,
"grad_norm": 14.463135719299316,
"learning_rate": 9.858760826115924e-06,
"loss": 0.3522,
"step": 7400
},
{
"epoch": 0.09993737257984996,
"grad_norm": 3.8919215202331543,
"learning_rate": 9.992005329780147e-06,
"loss": 0.3445,
"step": 7500
},
{
"epoch": 0.09993737257984996,
"eval_dev_accuracy": 0.9513897788868199,
"eval_dev_accuracy_threshold": 0.933416485786438,
"eval_dev_average_precision": 0.672072308065407,
"eval_dev_f1": 0.6241289651586063,
"eval_dev_f1_threshold": 0.4514094591140747,
"eval_dev_precision": 0.5939819855488468,
"eval_dev_recall": 0.6574997260874329,
"eval_loss": 0.4463006556034088,
"eval_runtime": 565.9962,
"eval_samples_per_second": 234.36,
"eval_steps_per_second": 7.325,
"step": 7500
},
{
"epoch": 0.10126987088091463,
"grad_norm": 5.6276421546936035,
"learning_rate": 1.012524983344437e-05,
"loss": 0.3611,
"step": 7600
},
{
"epoch": 0.10260236918197929,
"grad_norm": 25.222440719604492,
"learning_rate": 1.0258494337108595e-05,
"loss": 0.3694,
"step": 7700
},
{
"epoch": 0.10393486748304395,
"grad_norm": 10.44590950012207,
"learning_rate": 1.0391738840772818e-05,
"loss": 0.34,
"step": 7800
},
{
"epoch": 0.10526736578410863,
"grad_norm": 15.12126350402832,
"learning_rate": 1.0524983344437042e-05,
"loss": 0.3839,
"step": 7900
},
{
"epoch": 0.10659986408517329,
"grad_norm": 10.425951957702637,
"learning_rate": 1.0658227848101265e-05,
"loss": 0.3408,
"step": 8000
},
{
"epoch": 0.10659986408517329,
"eval_dev_accuracy": 0.947168047524633,
"eval_dev_accuracy_threshold": 0.9196346402168274,
"eval_dev_average_precision": 0.6278546311695713,
"eval_dev_f1": 0.5864126161957174,
"eval_dev_f1_threshold": 0.5887953042984009,
"eval_dev_precision": 0.5073623559539052,
"eval_dev_recall": 0.6946422701873562,
"eval_loss": 0.36468541622161865,
"eval_runtime": 566.1553,
"eval_samples_per_second": 234.294,
"eval_steps_per_second": 7.323,
"step": 8000
},
{
"epoch": 0.10793236238623796,
"grad_norm": 19.852497100830078,
"learning_rate": 1.079147235176549e-05,
"loss": 0.4011,
"step": 8100
},
{
"epoch": 0.10926486068730262,
"grad_norm": 66.98611450195312,
"learning_rate": 1.0924716855429713e-05,
"loss": 0.3037,
"step": 8200
},
{
"epoch": 0.11059735898836728,
"grad_norm": 2.033569812774658,
"learning_rate": 1.1057961359093938e-05,
"loss": 0.3632,
"step": 8300
},
{
"epoch": 0.11192985728943196,
"grad_norm": 1.7951024770736694,
"learning_rate": 1.1191205862758164e-05,
"loss": 0.3878,
"step": 8400
},
{
"epoch": 0.11326235559049662,
"grad_norm": 3.2986645698547363,
"learning_rate": 1.1324450366422385e-05,
"loss": 0.3849,
"step": 8500
},
{
"epoch": 0.11326235559049662,
"eval_dev_accuracy": 0.9516611759029605,
"eval_dev_accuracy_threshold": 0.9289531707763672,
"eval_dev_average_precision": 0.6795165568410317,
"eval_dev_f1": 0.6335993534700474,
"eval_dev_f1_threshold": 0.7476029396057129,
"eval_dev_precision": 0.5877612220035611,
"eval_dev_recall": 0.6871918483620029,
"eval_loss": 0.3556542694568634,
"eval_runtime": 568.4381,
"eval_samples_per_second": 233.353,
"eval_steps_per_second": 7.294,
"step": 8500
},
{
"epoch": 0.1145948538915613,
"grad_norm": 1.4321446418762207,
"learning_rate": 1.1457694870086611e-05,
"loss": 0.3833,
"step": 8600
},
{
"epoch": 0.11592735219262595,
"grad_norm": 54.76650619506836,
"learning_rate": 1.1590939373750833e-05,
"loss": 0.3797,
"step": 8700
},
{
"epoch": 0.11725985049369061,
"grad_norm": 31.644800186157227,
"learning_rate": 1.1724183877415059e-05,
"loss": 0.3705,
"step": 8800
},
{
"epoch": 0.11859234879475529,
"grad_norm": 10.417598724365234,
"learning_rate": 1.1857428381079282e-05,
"loss": 0.3556,
"step": 8900
},
{
"epoch": 0.11992484709581995,
"grad_norm": 9.85118579864502,
"learning_rate": 1.1990672884743507e-05,
"loss": 0.3771,
"step": 9000
},
{
"epoch": 0.11992484709581995,
"eval_dev_accuracy": 0.955196875918792,
"eval_dev_accuracy_threshold": 0.9099207520484924,
"eval_dev_average_precision": 0.7086261480764138,
"eval_dev_f1": 0.6543492478744277,
"eval_dev_f1_threshold": 0.8123365640640259,
"eval_dev_precision": 0.6510139898058779,
"eval_dev_recall": 0.6577188561411198,
"eval_loss": 0.33506301045417786,
"eval_runtime": 567.679,
"eval_samples_per_second": 233.665,
"eval_steps_per_second": 7.303,
"step": 9000
},
{
"epoch": 0.12125734539688462,
"grad_norm": 22.609596252441406,
"learning_rate": 1.212391738840773e-05,
"loss": 0.3649,
"step": 9100
},
{
"epoch": 0.12258984369794929,
"grad_norm": 13.67054271697998,
"learning_rate": 1.2257161892071954e-05,
"loss": 0.3687,
"step": 9200
},
{
"epoch": 0.12392234199901395,
"grad_norm": 11.858447074890137,
"learning_rate": 1.2390406395736177e-05,
"loss": 0.406,
"step": 9300
},
{
"epoch": 0.1252548403000786,
"grad_norm": 22.195842742919922,
"learning_rate": 1.2523650899400402e-05,
"loss": 0.3362,
"step": 9400
},
{
"epoch": 0.1265873386011433,
"grad_norm": 1.6114740371704102,
"learning_rate": 1.2656895403064625e-05,
"loss": 0.2749,
"step": 9500
},
{
"epoch": 0.1265873386011433,
"eval_dev_accuracy": 0.9535986490459641,
"eval_dev_accuracy_threshold": 0.9009051322937012,
"eval_dev_average_precision": 0.6983138303648807,
"eval_dev_f1": 0.6486718540381003,
"eval_dev_f1_threshold": 0.7954304218292236,
"eval_dev_precision": 0.6356752208666386,
"eval_dev_recall": 0.6622110222417005,
"eval_loss": 0.35359007120132446,
"eval_runtime": 567.3028,
"eval_samples_per_second": 233.82,
"eval_steps_per_second": 7.308,
"step": 9500
},
{
"epoch": 0.12791983690220796,
"grad_norm": 15.079890251159668,
"learning_rate": 1.279013990672885e-05,
"loss": 0.3038,
"step": 9600
},
{
"epoch": 0.12925233520327262,
"grad_norm": 19.459815979003906,
"learning_rate": 1.2923384410393072e-05,
"loss": 0.3273,
"step": 9700
},
{
"epoch": 0.13058483350433728,
"grad_norm": 21.132827758789062,
"learning_rate": 1.3056628914057297e-05,
"loss": 0.3823,
"step": 9800
},
{
"epoch": 0.13191733180540194,
"grad_norm": 4.1918158531188965,
"learning_rate": 1.318987341772152e-05,
"loss": 0.3406,
"step": 9900
},
{
"epoch": 0.13324983010646663,
"grad_norm": 22.806039810180664,
"learning_rate": 1.3323117921385744e-05,
"loss": 0.4069,
"step": 10000
},
{
"epoch": 0.13324983010646663,
"eval_dev_accuracy": 0.9545711550204679,
"eval_dev_accuracy_threshold": 0.9562267065048218,
"eval_dev_average_precision": 0.7100922283297543,
"eval_dev_f1": 0.6518728053062817,
"eval_dev_f1_threshold": 0.8285595178604126,
"eval_dev_precision": 0.5874132020743605,
"eval_dev_recall": 0.7322230743946532,
"eval_loss": 0.3286122977733612,
"eval_runtime": 566.8451,
"eval_samples_per_second": 234.009,
"eval_steps_per_second": 7.314,
"step": 10000
},
{
"epoch": 0.1345823284075313,
"grad_norm": 12.236410140991211,
"learning_rate": 1.3456362425049967e-05,
"loss": 0.3618,
"step": 10100
},
{
"epoch": 0.13591482670859595,
"grad_norm": 5.4430060386657715,
"learning_rate": 1.3589606928714192e-05,
"loss": 0.3619,
"step": 10200
},
{
"epoch": 0.1372473250096606,
"grad_norm": 13.798270225524902,
"learning_rate": 1.3722851432378415e-05,
"loss": 0.3413,
"step": 10300
},
{
"epoch": 0.13857982331072527,
"grad_norm": 3.899458169937134,
"learning_rate": 1.385609593604264e-05,
"loss": 0.3374,
"step": 10400
},
{
"epoch": 0.13991232161178996,
"grad_norm": 6.147464752197266,
"learning_rate": 1.3989340439706862e-05,
"loss": 0.3725,
"step": 10500
},
{
"epoch": 0.13991232161178996,
"eval_dev_accuracy": 0.9561316878632762,
"eval_dev_accuracy_threshold": 0.9145029187202454,
"eval_dev_average_precision": 0.7194237355423562,
"eval_dev_f1": 0.6639100398366194,
"eval_dev_f1_threshold": 0.7489595413208008,
"eval_dev_precision": 0.6150037369207773,
"eval_dev_recall": 0.7212665717103101,
"eval_loss": 0.30797863006591797,
"eval_runtime": 567.191,
"eval_samples_per_second": 233.867,
"eval_steps_per_second": 7.31,
"step": 10500
},
{
"epoch": 0.14124481991285462,
"grad_norm": 0.32248708605766296,
"learning_rate": 1.4122584943371087e-05,
"loss": 0.3289,
"step": 10600
},
{
"epoch": 0.14257731821391928,
"grad_norm": 3.342273235321045,
"learning_rate": 1.4255829447035312e-05,
"loss": 0.335,
"step": 10700
},
{
"epoch": 0.14390981651498394,
"grad_norm": 5.640665531158447,
"learning_rate": 1.4389073950699535e-05,
"loss": 0.3298,
"step": 10800
},
{
"epoch": 0.1452423148160486,
"grad_norm": 1.3349778652191162,
"learning_rate": 1.452231845436376e-05,
"loss": 0.3805,
"step": 10900
},
{
"epoch": 0.1465748131171133,
"grad_norm": 8.876007080078125,
"learning_rate": 1.4655562958027982e-05,
"loss": 0.3545,
"step": 11000
},
{
"epoch": 0.1465748131171133,
"eval_dev_accuracy": 0.9554607341289286,
"eval_dev_accuracy_threshold": 0.8000156283378601,
"eval_dev_average_precision": 0.706945036170435,
"eval_dev_f1": 0.6470619459631616,
"eval_dev_f1_threshold": 0.35837632417678833,
"eval_dev_precision": 0.6275741350906096,
"eval_dev_recall": 0.6677988386107154,
"eval_loss": 0.40842413902282715,
"eval_runtime": 566.601,
"eval_samples_per_second": 234.11,
"eval_steps_per_second": 7.317,
"step": 11000
},
{
"epoch": 0.14790731141817795,
"grad_norm": 1.4081681966781616,
"learning_rate": 1.4788807461692207e-05,
"loss": 0.3591,
"step": 11100
},
{
"epoch": 0.1492398097192426,
"grad_norm": 15.024413108825684,
"learning_rate": 1.492205196535643e-05,
"loss": 0.3523,
"step": 11200
},
{
"epoch": 0.15057230802030727,
"grad_norm": 18.281108856201172,
"learning_rate": 1.5055296469020654e-05,
"loss": 0.3601,
"step": 11300
},
{
"epoch": 0.15190480632137193,
"grad_norm": 6.56211519241333,
"learning_rate": 1.5188540972684877e-05,
"loss": 0.3365,
"step": 11400
},
{
"epoch": 0.1532373046224366,
"grad_norm": 43.26646041870117,
"learning_rate": 1.5321785476349102e-05,
"loss": 0.3859,
"step": 11500
},
{
"epoch": 0.1532373046224366,
"eval_dev_accuracy": 0.9578354580201588,
"eval_dev_accuracy_threshold": 0.7837315797805786,
"eval_dev_average_precision": 0.7318177300477213,
"eval_dev_f1": 0.6784168212739641,
"eval_dev_f1_threshold": 0.5022754669189453,
"eval_dev_precision": 0.6404592779994162,
"eval_dev_recall": 0.7211570066834666,
"eval_loss": 0.3321084976196289,
"eval_runtime": 566.5331,
"eval_samples_per_second": 234.138,
"eval_steps_per_second": 7.318,
"step": 11500
},
{
"epoch": 0.15456980292350128,
"grad_norm": 28.52861785888672,
"learning_rate": 1.5455029980013325e-05,
"loss": 0.3581,
"step": 11600
},
{
"epoch": 0.15590230122456594,
"grad_norm": 7.024416923522949,
"learning_rate": 1.558827448367755e-05,
"loss": 0.3133,
"step": 11700
},
{
"epoch": 0.1572347995256306,
"grad_norm": 0.6226129531860352,
"learning_rate": 1.5721518987341774e-05,
"loss": 0.295,
"step": 11800
},
{
"epoch": 0.15856729782669526,
"grad_norm": 1.0621097087860107,
"learning_rate": 1.5854763491005997e-05,
"loss": 0.3027,
"step": 11900
},
{
"epoch": 0.15989979612775992,
"grad_norm": 1.318295955657959,
"learning_rate": 1.598800799467022e-05,
"loss": 0.3216,
"step": 12000
},
{
"epoch": 0.15989979612775992,
"eval_dev_accuracy": 0.9555587386069794,
"eval_dev_accuracy_threshold": 0.8190538287162781,
"eval_dev_average_precision": 0.7133029701747852,
"eval_dev_f1": 0.6531785971038309,
"eval_dev_f1_threshold": 0.3298466205596924,
"eval_dev_precision": 0.6146709191069876,
"eval_dev_recall": 0.6968335707242248,
"eval_loss": 0.3916049897670746,
"eval_runtime": 554.3199,
"eval_samples_per_second": 239.297,
"eval_steps_per_second": 7.479,
"step": 12000
},
{
"epoch": 0.1612322944288246,
"grad_norm": 9.81628704071045,
"learning_rate": 1.6121252498334446e-05,
"loss": 0.3522,
"step": 12100
},
{
"epoch": 0.16256479272988927,
"grad_norm": 4.447005271911621,
"learning_rate": 1.625449700199867e-05,
"loss": 0.3266,
"step": 12200
},
{
"epoch": 0.16389729103095393,
"grad_norm": 14.646246910095215,
"learning_rate": 1.6387741505662892e-05,
"loss": 0.3292,
"step": 12300
},
{
"epoch": 0.1652297893320186,
"grad_norm": 16.482669830322266,
"learning_rate": 1.6520986009327115e-05,
"loss": 0.3446,
"step": 12400
},
{
"epoch": 0.16656228763308326,
"grad_norm": 7.77319860458374,
"learning_rate": 1.665423051299134e-05,
"loss": 0.3236,
"step": 12500
},
{
"epoch": 0.16656228763308326,
"eval_dev_accuracy": 0.9569684953297097,
"eval_dev_accuracy_threshold": 0.9383260011672974,
"eval_dev_average_precision": 0.7292398003419696,
"eval_dev_f1": 0.66701062841812,
"eval_dev_f1_threshold": 0.8000765442848206,
"eval_dev_precision": 0.6303266699171136,
"eval_dev_recall": 0.7082283335159417,
"eval_loss": 0.35466820001602173,
"eval_runtime": 561.336,
"eval_samples_per_second": 236.306,
"eval_steps_per_second": 7.386,
"step": 12500
},
{
"epoch": 0.16789478593414794,
"grad_norm": 6.646021366119385,
"learning_rate": 1.6787475016655564e-05,
"loss": 0.3134,
"step": 12600
},
{
"epoch": 0.1692272842352126,
"grad_norm": 87.47698211669922,
"learning_rate": 1.6920719520319787e-05,
"loss": 0.3249,
"step": 12700
},
{
"epoch": 0.17055978253627727,
"grad_norm": 17.500768661499023,
"learning_rate": 1.705396402398401e-05,
"loss": 0.3811,
"step": 12800
},
{
"epoch": 0.17189228083734193,
"grad_norm": 7.166949272155762,
"learning_rate": 1.7187208527648237e-05,
"loss": 0.3127,
"step": 12900
},
{
"epoch": 0.1732247791384066,
"grad_norm": 4.106062889099121,
"learning_rate": 1.732045303131246e-05,
"loss": 0.3219,
"step": 13000
},
{
"epoch": 0.1732247791384066,
"eval_dev_accuracy": 0.9564407789094364,
"eval_dev_accuracy_threshold": 0.8795315623283386,
"eval_dev_average_precision": 0.726842832162545,
"eval_dev_f1": 0.6643535054597408,
"eval_dev_f1_threshold": 0.4876420497894287,
"eval_dev_precision": 0.621717123483908,
"eval_dev_recall": 0.7132683247507395,
"eval_loss": 0.3564859926700592,
"eval_runtime": 558.1535,
"eval_samples_per_second": 237.653,
"eval_steps_per_second": 7.428,
"step": 13000
},
{
"epoch": 0.17455727743947128,
"grad_norm": 1.3857766389846802,
"learning_rate": 1.7453697534976682e-05,
"loss": 0.3526,
"step": 13100
},
{
"epoch": 0.17588977574053594,
"grad_norm": 20.39262580871582,
"learning_rate": 1.758694203864091e-05,
"loss": 0.3299,
"step": 13200
},
{
"epoch": 0.1772222740416006,
"grad_norm": 18.849407196044922,
"learning_rate": 1.772018654230513e-05,
"loss": 0.3303,
"step": 13300
},
{
"epoch": 0.17855477234266526,
"grad_norm": 42.82183837890625,
"learning_rate": 1.7853431045969355e-05,
"loss": 0.3739,
"step": 13400
},
{
"epoch": 0.17988727064372992,
"grad_norm": 4.524885654449463,
"learning_rate": 1.7986675549633577e-05,
"loss": 0.3544,
"step": 13500
},
{
"epoch": 0.17988727064372992,
"eval_dev_accuracy": 0.9572021983158308,
"eval_dev_accuracy_threshold": 0.9545025825500488,
"eval_dev_average_precision": 0.7351364884979171,
"eval_dev_f1": 0.6692303640099035,
"eval_dev_f1_threshold": 0.7856150269508362,
"eval_dev_precision": 0.6444805194805194,
"eval_dev_recall": 0.6959570505094774,
"eval_loss": 0.33416542410850525,
"eval_runtime": 558.7974,
"eval_samples_per_second": 237.379,
"eval_steps_per_second": 7.42,
"step": 13500
},
{
"epoch": 0.1812197689447946,
"grad_norm": 2.0763137340545654,
"learning_rate": 1.8119920053297804e-05,
"loss": 0.3584,
"step": 13600
},
{
"epoch": 0.18255226724585927,
"grad_norm": 4.722475051879883,
"learning_rate": 1.8253164556962027e-05,
"loss": 0.341,
"step": 13700
},
{
"epoch": 0.18388476554692393,
"grad_norm": 4.084864139556885,
"learning_rate": 1.838640906062625e-05,
"loss": 0.3371,
"step": 13800
},
{
"epoch": 0.1852172638479886,
"grad_norm": 0.32559067010879517,
"learning_rate": 1.8519653564290473e-05,
"loss": 0.3322,
"step": 13900
},
{
"epoch": 0.18654976214905325,
"grad_norm": 9.505677223205566,
"learning_rate": 1.86528980679547e-05,
"loss": 0.3493,
"step": 14000
},
{
"epoch": 0.18654976214905325,
"eval_dev_accuracy": 0.9565010893574676,
"eval_dev_accuracy_threshold": 0.9463940858840942,
"eval_dev_average_precision": 0.7367044841602028,
"eval_dev_f1": 0.6669865642994243,
"eval_dev_f1_threshold": 0.8662494421005249,
"eval_dev_precision": 0.6496001661647107,
"eval_dev_recall": 0.6853292429056645,
"eval_loss": 0.29620230197906494,
"eval_runtime": 559.2923,
"eval_samples_per_second": 237.169,
"eval_steps_per_second": 7.413,
"step": 14000
},
{
"epoch": 0.18788226045011794,
"grad_norm": 19.357847213745117,
"learning_rate": 1.8786142571618922e-05,
"loss": 0.3021,
"step": 14100
},
{
"epoch": 0.1892147587511826,
"grad_norm": 0.8998715281486511,
"learning_rate": 1.8919387075283148e-05,
"loss": 0.3249,
"step": 14200
},
{
"epoch": 0.19054725705224726,
"grad_norm": 17.16973304748535,
"learning_rate": 1.9052631578947368e-05,
"loss": 0.3389,
"step": 14300
},
{
"epoch": 0.19187975535331192,
"grad_norm": 1.553682565689087,
"learning_rate": 1.9185876082611594e-05,
"loss": 0.3547,
"step": 14400
},
{
"epoch": 0.19321225365437658,
"grad_norm": 10.778045654296875,
"learning_rate": 1.9319120586275817e-05,
"loss": 0.3205,
"step": 14500
},
{
"epoch": 0.19321225365437658,
"eval_dev_accuracy": 0.9589587401147406,
"eval_dev_accuracy_threshold": 0.9295341968536377,
"eval_dev_average_precision": 0.7537310584722261,
"eval_dev_f1": 0.6830523319465732,
"eval_dev_f1_threshold": 0.7479926347732544,
"eval_dev_precision": 0.6825292637567005,
"eval_dev_recall": 0.6835762024761696,
"eval_loss": 0.3468180298805237,
"eval_runtime": 558.8191,
"eval_samples_per_second": 237.37,
"eval_steps_per_second": 7.419,
"step": 14500
},
{
"epoch": 0.19454475195544127,
"grad_norm": 2.241529941558838,
"learning_rate": 1.9452365089940043e-05,
"loss": 0.3168,
"step": 14600
},
{
"epoch": 0.19587725025650593,
"grad_norm": 1.1848278045654297,
"learning_rate": 1.9585609593604263e-05,
"loss": 0.3348,
"step": 14700
},
{
"epoch": 0.1972097485575706,
"grad_norm": 16.031787872314453,
"learning_rate": 1.971885409726849e-05,
"loss": 0.3237,
"step": 14800
},
{
"epoch": 0.19854224685863525,
"grad_norm": 12.078638076782227,
"learning_rate": 1.9852098600932712e-05,
"loss": 0.3428,
"step": 14900
},
{
"epoch": 0.1998747451596999,
"grad_norm": 5.735422134399414,
"learning_rate": 1.998534310459694e-05,
"loss": 0.3179,
"step": 15000
},
{
"epoch": 0.1998747451596999,
"eval_dev_accuracy": 0.9547219311405459,
"eval_dev_accuracy_threshold": 0.9576058387756348,
"eval_dev_average_precision": 0.6908515568536635,
"eval_dev_f1": 0.6764229341974599,
"eval_dev_f1_threshold": 0.926771879196167,
"eval_dev_precision": 0.6467119728163102,
"eval_dev_recall": 0.7089952887038458,
"eval_loss": 0.34104466438293457,
"eval_runtime": 561.385,
"eval_samples_per_second": 236.285,
"eval_steps_per_second": 7.385,
"step": 15000
},
{
"epoch": 0.2012072434607646,
"grad_norm": 19.35861587524414,
"learning_rate": 1.9986823013828432e-05,
"loss": 0.3409,
"step": 15100
},
{
"epoch": 0.20253974176182926,
"grad_norm": 35.545223236083984,
"learning_rate": 1.997201741138847e-05,
"loss": 0.331,
"step": 15200
},
{
"epoch": 0.20387224006289392,
"grad_norm": 17.14919662475586,
"learning_rate": 1.9957211808948506e-05,
"loss": 0.3493,
"step": 15300
},
{
"epoch": 0.20520473836395858,
"grad_norm": 2.735530138015747,
"learning_rate": 1.9942406206508544e-05,
"loss": 0.3205,
"step": 15400
},
{
"epoch": 0.20653723666502324,
"grad_norm": 1.0762556791305542,
"learning_rate": 1.9927600604068582e-05,
"loss": 0.3307,
"step": 15500
},
{
"epoch": 0.20653723666502324,
"eval_dev_accuracy": 0.9532292475517727,
"eval_dev_accuracy_threshold": 0.8670874238014221,
"eval_dev_average_precision": 0.6865319564110608,
"eval_dev_f1": 0.660230457801308,
"eval_dev_f1_threshold": 0.6122031211853027,
"eval_dev_precision": 0.6272807969227735,
"eval_dev_recall": 0.6968335707242248,
"eval_loss": 0.3867639899253845,
"eval_runtime": 563.4002,
"eval_samples_per_second": 235.44,
"eval_steps_per_second": 7.359,
"step": 15500
},
{
"epoch": 0.2078697349660879,
"grad_norm": 8.77493953704834,
"learning_rate": 1.9912795001628617e-05,
"loss": 0.3683,
"step": 15600
},
{
"epoch": 0.2092022332671526,
"grad_norm": 0.7768261432647705,
"learning_rate": 1.9897989399188656e-05,
"loss": 0.318,
"step": 15700
},
{
"epoch": 0.21053473156821725,
"grad_norm": 12.180807113647461,
"learning_rate": 1.988318379674869e-05,
"loss": 0.3498,
"step": 15800
},
{
"epoch": 0.21186722986928191,
"grad_norm": 4.719166278839111,
"learning_rate": 1.986837819430873e-05,
"loss": 0.3043,
"step": 15900
},
{
"epoch": 0.21319972817034658,
"grad_norm": 6.112349987030029,
"learning_rate": 1.9853572591868764e-05,
"loss": 0.3393,
"step": 16000
},
{
"epoch": 0.21319972817034658,
"eval_dev_accuracy": 0.9534780281499016,
"eval_dev_accuracy_threshold": 0.9171842336654663,
"eval_dev_average_precision": 0.7089238013031434,
"eval_dev_f1": 0.6696855863736944,
"eval_dev_f1_threshold": 0.8700560331344604,
"eval_dev_precision": 0.6313797787696488,
"eval_dev_recall": 0.7129396296702093,
"eval_loss": 0.33279770612716675,
"eval_runtime": 559.9008,
"eval_samples_per_second": 236.912,
"eval_steps_per_second": 7.405,
"step": 16000
},
{
"epoch": 0.21453222647141124,
"grad_norm": 47.89255905151367,
"learning_rate": 1.9838766989428802e-05,
"loss": 0.3326,
"step": 16100
},
{
"epoch": 0.21586472477247592,
"grad_norm": 6.826938152313232,
"learning_rate": 1.982396138698884e-05,
"loss": 0.3094,
"step": 16200
},
{
"epoch": 0.21719722307354059,
"grad_norm": 13.803174018859863,
"learning_rate": 1.9809155784548875e-05,
"loss": 0.3572,
"step": 16300
},
{
"epoch": 0.21852972137460525,
"grad_norm": 7.402415752410889,
"learning_rate": 1.9794350182108914e-05,
"loss": 0.3447,
"step": 16400
},
{
"epoch": 0.2198622196756699,
"grad_norm": 28.723724365234375,
"learning_rate": 1.977954457966895e-05,
"loss": 0.3484,
"step": 16500
},
{
"epoch": 0.2198622196756699,
"eval_dev_accuracy": 0.9592753699669047,
"eval_dev_accuracy_threshold": 0.8685222864151001,
"eval_dev_average_precision": 0.7462578581871518,
"eval_dev_f1": 0.6837169650468883,
"eval_dev_f1_threshold": 0.35429614782333374,
"eval_dev_precision": 0.6654911316253501,
"eval_dev_recall": 0.702969212227457,
"eval_loss": 0.3938925862312317,
"eval_runtime": 560.3292,
"eval_samples_per_second": 236.73,
"eval_steps_per_second": 7.399,
"step": 16500
},
{
"epoch": 0.22119471797673457,
"grad_norm": 16.560897827148438,
"learning_rate": 1.9764738977228987e-05,
"loss": 0.3182,
"step": 16600
},
{
"epoch": 0.22252721627779926,
"grad_norm": 1.3701841831207275,
"learning_rate": 1.9749933374789022e-05,
"loss": 0.283,
"step": 16700
},
{
"epoch": 0.22385971457886392,
"grad_norm": 12.799971580505371,
"learning_rate": 1.973512777234906e-05,
"loss": 0.3134,
"step": 16800
},
{
"epoch": 0.22519221287992858,
"grad_norm": 4.794546127319336,
"learning_rate": 1.9720322169909095e-05,
"loss": 0.3454,
"step": 16900
},
{
"epoch": 0.22652471118099324,
"grad_norm": 21.59016990661621,
"learning_rate": 1.970551656746913e-05,
"loss": 0.3485,
"step": 17000
},
{
"epoch": 0.22652471118099324,
"eval_dev_accuracy": 0.9593356804149359,
"eval_dev_accuracy_threshold": 0.9145892858505249,
"eval_dev_average_precision": 0.747695886787644,
"eval_dev_f1": 0.6768515829218704,
"eval_dev_f1_threshold": 0.8686491847038269,
"eval_dev_precision": 0.7203264094955489,
"eval_dev_recall": 0.6383258463898324,
"eval_loss": 0.34743377566337585,
"eval_runtime": 563.0506,
"eval_samples_per_second": 235.586,
"eval_steps_per_second": 7.363,
"step": 17000
},
{
"epoch": 0.2278572094820579,
"grad_norm": 14.225733757019043,
"learning_rate": 1.969071096502917e-05,
"loss": 0.2942,
"step": 17100
},
{
"epoch": 0.2291897077831226,
"grad_norm": 7.983681678771973,
"learning_rate": 1.9675905362589203e-05,
"loss": 0.2865,
"step": 17200
},
{
"epoch": 0.23052220608418725,
"grad_norm": 2.1385481357574463,
"learning_rate": 1.9661099760149242e-05,
"loss": 0.3489,
"step": 17300
},
{
"epoch": 0.2318547043852519,
"grad_norm": 12.413968086242676,
"learning_rate": 1.9646294157709277e-05,
"loss": 0.2959,
"step": 17400
},
{
"epoch": 0.23318720268631657,
"grad_norm": 15.191158294677734,
"learning_rate": 1.9631488555269315e-05,
"loss": 0.3626,
"step": 17500
},
{
"epoch": 0.23318720268631657,
"eval_dev_accuracy": 0.959637232655092,
"eval_dev_accuracy_threshold": 0.9243228435516357,
"eval_dev_average_precision": 0.7598083206267562,
"eval_dev_f1": 0.6903569873748368,
"eval_dev_f1_threshold": 0.7939244508743286,
"eval_dev_precision": 0.6858038706887231,
"eval_dev_recall": 0.6949709652678865,
"eval_loss": 0.28921985626220703,
"eval_runtime": 562.3661,
"eval_samples_per_second": 235.873,
"eval_steps_per_second": 7.372,
"step": 17500
},
{
"epoch": 0.23451970098738123,
"grad_norm": 2.7491862773895264,
"learning_rate": 1.961668295282935e-05,
"loss": 0.3107,
"step": 17600
},
{
"epoch": 0.23585219928844592,
"grad_norm": 52.241886138916016,
"learning_rate": 1.960187735038939e-05,
"loss": 0.2914,
"step": 17700
},
{
"epoch": 0.23718469758951058,
"grad_norm": 11.401723861694336,
"learning_rate": 1.9587071747949427e-05,
"loss": 0.3298,
"step": 17800
},
{
"epoch": 0.23851719589057524,
"grad_norm": 4.170936107635498,
"learning_rate": 1.957226614550946e-05,
"loss": 0.3315,
"step": 17900
},
{
"epoch": 0.2398496941916399,
"grad_norm": 5.668073654174805,
"learning_rate": 1.95574605430695e-05,
"loss": 0.3097,
"step": 18000
},
{
"epoch": 0.2398496941916399,
"eval_dev_accuracy": 0.9607454371376661,
"eval_dev_accuracy_threshold": 0.9465240836143494,
"eval_dev_average_precision": 0.7633624538366494,
"eval_dev_f1": 0.6969561824060653,
"eval_dev_f1_threshold": 0.8802664279937744,
"eval_dev_precision": 0.7094540914765634,
"eval_dev_recall": 0.6848909827982907,
"eval_loss": 0.2974649667739868,
"eval_runtime": 565.9458,
"eval_samples_per_second": 234.381,
"eval_steps_per_second": 7.326,
"step": 18000
},
{
"epoch": 0.24118219249270456,
"grad_norm": 18.609146118164062,
"learning_rate": 1.9542654940629535e-05,
"loss": 0.3474,
"step": 18100
},
{
"epoch": 0.24251469079376925,
"grad_norm": 5.154010772705078,
"learning_rate": 1.9527849338189573e-05,
"loss": 0.2917,
"step": 18200
},
{
"epoch": 0.2438471890948339,
"grad_norm": 9.549324035644531,
"learning_rate": 1.9513043735749608e-05,
"loss": 0.3557,
"step": 18300
},
{
"epoch": 0.24517968739589857,
"grad_norm": 1.6343746185302734,
"learning_rate": 1.9498238133309647e-05,
"loss": 0.3394,
"step": 18400
},
{
"epoch": 0.24651218569696323,
"grad_norm": 9.207841873168945,
"learning_rate": 1.9483432530869685e-05,
"loss": 0.2891,
"step": 18500
},
{
"epoch": 0.24651218569696323,
"eval_dev_accuracy": 0.9630900058048806,
"eval_dev_accuracy_threshold": 0.9168897271156311,
"eval_dev_average_precision": 0.7806456437261002,
"eval_dev_f1": 0.7187227550130775,
"eval_dev_f1_threshold": 0.6090723276138306,
"eval_dev_precision": 0.7149051490514905,
"eval_dev_recall": 0.7225813520324312,
"eval_loss": 0.3342207372188568,
"eval_runtime": 562.3653,
"eval_samples_per_second": 235.873,
"eval_steps_per_second": 7.372,
"step": 18500
},
{
"epoch": 0.2478446839980279,
"grad_norm": 30.978008270263672,
"learning_rate": 1.946862692842972e-05,
"loss": 0.2514,
"step": 18600
},
{
"epoch": 0.24917718229909258,
"grad_norm": 26.20627784729004,
"learning_rate": 1.9453821325989758e-05,
"loss": 0.3139,
"step": 18700
},
{
"epoch": 0.2505096806001572,
"grad_norm": 29.30525779724121,
"learning_rate": 1.9439015723549793e-05,
"loss": 0.2896,
"step": 18800
},
{
"epoch": 0.2518421789012219,
"grad_norm": 1.5062270164489746,
"learning_rate": 1.942421012110983e-05,
"loss": 0.3161,
"step": 18900
},
{
"epoch": 0.2531746772022866,
"grad_norm": 5.7221784591674805,
"learning_rate": 1.9409404518669866e-05,
"loss": 0.3331,
"step": 19000
},
{
"epoch": 0.2531746772022866,
"eval_dev_accuracy": 0.9605946610175881,
"eval_dev_accuracy_threshold": 0.8986555337905884,
"eval_dev_average_precision": 0.7733706796615292,
"eval_dev_f1": 0.7069406003832233,
"eval_dev_f1_threshold": 0.5075786113739014,
"eval_dev_precision": 0.6874029603560708,
"eval_dev_recall": 0.727621343267229,
"eval_loss": 0.333689421415329,
"eval_runtime": 564.7555,
"eval_samples_per_second": 234.875,
"eval_steps_per_second": 7.341,
"step": 19000
},
{
"epoch": 0.2545071755033512,
"grad_norm": 0.41423532366752625,
"learning_rate": 1.9394598916229905e-05,
"loss": 0.2959,
"step": 19100
},
{
"epoch": 0.2558396738044159,
"grad_norm": 18.290435791015625,
"learning_rate": 1.937979331378994e-05,
"loss": 0.3236,
"step": 19200
},
{
"epoch": 0.25717217210548055,
"grad_norm": 1.2307929992675781,
"learning_rate": 1.9364987711349975e-05,
"loss": 0.3527,
"step": 19300
},
{
"epoch": 0.25850467040654523,
"grad_norm": 1.151492714881897,
"learning_rate": 1.9350182108910013e-05,
"loss": 0.3106,
"step": 19400
},
{
"epoch": 0.2598371687076099,
"grad_norm": 1.676810383796692,
"learning_rate": 1.9335376506470048e-05,
"loss": 0.3271,
"step": 19500
},
{
"epoch": 0.2598371687076099,
"eval_dev_accuracy": 0.9614239296780176,
"eval_dev_accuracy_threshold": 0.9547422528266907,
"eval_dev_average_precision": 0.7587239294098156,
"eval_dev_f1": 0.7068855932203391,
"eval_dev_f1_threshold": 0.7505875825881958,
"eval_dev_precision": 0.6841997334153593,
"eval_dev_recall": 0.7311274241262189,
"eval_loss": 0.3253738582134247,
"eval_runtime": 562.4242,
"eval_samples_per_second": 235.849,
"eval_steps_per_second": 7.372,
"step": 19500
},
{
"epoch": 0.26116966700867456,
"grad_norm": 18.608806610107422,
"learning_rate": 1.9320570904030086e-05,
"loss": 0.3504,
"step": 19600
},
{
"epoch": 0.26250216530973924,
"grad_norm": 20.453174591064453,
"learning_rate": 1.930576530159012e-05,
"loss": 0.303,
"step": 19700
},
{
"epoch": 0.2638346636108039,
"grad_norm": 5.1661248207092285,
"learning_rate": 1.929095969915016e-05,
"loss": 0.2478,
"step": 19800
},
{
"epoch": 0.26516716191186857,
"grad_norm": 1.2466572523117065,
"learning_rate": 1.9276154096710194e-05,
"loss": 0.3309,
"step": 19900
},
{
"epoch": 0.26649966021293325,
"grad_norm": 2.0681653022766113,
"learning_rate": 1.9261348494270233e-05,
"loss": 0.3063,
"step": 20000
},
{
"epoch": 0.26649966021293325,
"eval_dev_accuracy": 0.9621928878904159,
"eval_dev_accuracy_threshold": 0.5716267228126526,
"eval_dev_average_precision": 0.7784750233173614,
"eval_dev_f1": 0.7154299699632884,
"eval_dev_f1_threshold": 0.2591094672679901,
"eval_dev_precision": 0.7265845667156253,
"eval_dev_recall": 0.7046126876301084,
"eval_loss": 0.45214059948921204,
"eval_runtime": 560.4518,
"eval_samples_per_second": 236.679,
"eval_steps_per_second": 7.398,
"step": 20000
},
{
"epoch": 0.2678321585139979,
"grad_norm": 18.87665557861328,
"learning_rate": 1.924654289183027e-05,
"loss": 0.3482,
"step": 20100
},
{
"epoch": 0.2691646568150626,
"grad_norm": 1.1184475421905518,
"learning_rate": 1.9231737289390306e-05,
"loss": 0.3033,
"step": 20200
},
{
"epoch": 0.2704971551161272,
"grad_norm": 13.190022468566895,
"learning_rate": 1.9216931686950344e-05,
"loss": 0.288,
"step": 20300
},
{
"epoch": 0.2718296534171919,
"grad_norm": 5.855016231536865,
"learning_rate": 1.920212608451038e-05,
"loss": 0.3609,
"step": 20400
},
{
"epoch": 0.2731621517182566,
"grad_norm": 0.26388707756996155,
"learning_rate": 1.9187320482070418e-05,
"loss": 0.3071,
"step": 20500
},
{
"epoch": 0.2731621517182566,
"eval_dev_accuracy": 0.9630975446108845,
"eval_dev_accuracy_threshold": 0.9040592908859253,
"eval_dev_average_precision": 0.7844185876274975,
"eval_dev_f1": 0.7150979850952249,
"eval_dev_f1_threshold": 0.6517728567123413,
"eval_dev_precision": 0.7206275033377837,
"eval_dev_recall": 0.7096526788649063,
"eval_loss": 0.3398449718952179,
"eval_runtime": 559.7754,
"eval_samples_per_second": 236.965,
"eval_steps_per_second": 7.407,
"step": 20500
},
{
"epoch": 0.2744946500193212,
"grad_norm": 4.928101539611816,
"learning_rate": 1.9172514879630453e-05,
"loss": 0.3778,
"step": 20600
},
{
"epoch": 0.2758271483203859,
"grad_norm": 32.13788604736328,
"learning_rate": 1.915770927719049e-05,
"loss": 0.2681,
"step": 20700
},
{
"epoch": 0.27715964662145054,
"grad_norm": 4.934467792510986,
"learning_rate": 1.914290367475053e-05,
"loss": 0.3358,
"step": 20800
},
{
"epoch": 0.27849214492251523,
"grad_norm": 20.491180419921875,
"learning_rate": 1.9128098072310564e-05,
"loss": 0.2964,
"step": 20900
},
{
"epoch": 0.2798246432235799,
"grad_norm": 1.0770193338394165,
"learning_rate": 1.9113292469870603e-05,
"loss": 0.2193,
"step": 21000
},
{
"epoch": 0.2798246432235799,
"eval_dev_accuracy": 0.9630221565508454,
"eval_dev_accuracy_threshold": 0.9147968292236328,
"eval_dev_average_precision": 0.7911413496458403,
"eval_dev_f1": 0.7163220463124683,
"eval_dev_f1_threshold": 0.7818174362182617,
"eval_dev_precision": 0.7372999304105776,
"eval_dev_recall": 0.6965048756436946,
"eval_loss": 0.3126268982887268,
"eval_runtime": 558.7817,
"eval_samples_per_second": 237.386,
"eval_steps_per_second": 7.42,
"step": 21000
},
{
"epoch": 0.28115714152464455,
"grad_norm": 24.751399993896484,
"learning_rate": 1.9098486867430638e-05,
"loss": 0.3136,
"step": 21100
},
{
"epoch": 0.28248963982570924,
"grad_norm": 38.034759521484375,
"learning_rate": 1.9083681264990676e-05,
"loss": 0.3121,
"step": 21200
},
{
"epoch": 0.28382213812677387,
"grad_norm": 22.520530700683594,
"learning_rate": 1.906887566255071e-05,
"loss": 0.2893,
"step": 21300
},
{
"epoch": 0.28515463642783856,
"grad_norm": 13.158409118652344,
"learning_rate": 1.905407006011075e-05,
"loss": 0.2987,
"step": 21400
},
{
"epoch": 0.28648713472890325,
"grad_norm": 2.2500672340393066,
"learning_rate": 1.9039264457670784e-05,
"loss": 0.2781,
"step": 21500
},
{
"epoch": 0.28648713472890325,
"eval_dev_accuracy": 0.96250951774258,
"eval_dev_accuracy_threshold": 0.9360392093658447,
"eval_dev_average_precision": 0.7809073848360293,
"eval_dev_f1": 0.724827056110684,
"eval_dev_f1_threshold": 0.9214021563529968,
"eval_dev_precision": 0.7264223616154947,
"eval_dev_recall": 0.7232387421934918,
"eval_loss": 0.32232773303985596,
"eval_runtime": 558.841,
"eval_samples_per_second": 237.361,
"eval_steps_per_second": 7.419,
"step": 21500
},
{
"epoch": 0.2878196330299679,
"grad_norm": 7.364509582519531,
"learning_rate": 1.902445885523082e-05,
"loss": 0.2444,
"step": 21600
},
{
"epoch": 0.28915213133103257,
"grad_norm": 14.986044883728027,
"learning_rate": 1.9009653252790857e-05,
"loss": 0.2917,
"step": 21700
},
{
"epoch": 0.2904846296320972,
"grad_norm": 1.4703857898712158,
"learning_rate": 1.8994847650350892e-05,
"loss": 0.32,
"step": 21800
},
{
"epoch": 0.2918171279331619,
"grad_norm": 4.144439220428467,
"learning_rate": 1.898004204791093e-05,
"loss": 0.2873,
"step": 21900
},
{
"epoch": 0.2931496262342266,
"grad_norm": 3.1540684700012207,
"learning_rate": 1.8965236445470966e-05,
"loss": 0.2877,
"step": 22000
},
{
"epoch": 0.2931496262342266,
"eval_dev_accuracy": 0.9638212699872594,
"eval_dev_accuracy_threshold": 0.8727903366088867,
"eval_dev_average_precision": 0.7927687696111004,
"eval_dev_f1": 0.7246392958609548,
"eval_dev_f1_threshold": 0.7912191152572632,
"eval_dev_precision": 0.7370806890299184,
"eval_dev_recall": 0.712610934589679,
"eval_loss": 0.30881205201148987,
"eval_runtime": 559.5919,
"eval_samples_per_second": 237.042,
"eval_steps_per_second": 7.409,
"step": 22000
},
{
"epoch": 0.2944821245352912,
"grad_norm": 6.18324613571167,
"learning_rate": 1.8950430843031004e-05,
"loss": 0.2947,
"step": 22100
},
{
"epoch": 0.2958146228363559,
"grad_norm": 12.850146293640137,
"learning_rate": 1.893562524059104e-05,
"loss": 0.2619,
"step": 22200
},
{
"epoch": 0.29714712113742053,
"grad_norm": 5.986371040344238,
"learning_rate": 1.8920819638151077e-05,
"loss": 0.3143,
"step": 22300
},
{
"epoch": 0.2984796194384852,
"grad_norm": 6.889712810516357,
"learning_rate": 1.8906014035711116e-05,
"loss": 0.3585,
"step": 22400
},
{
"epoch": 0.29981211773954985,
"grad_norm": 14.721301078796387,
"learning_rate": 1.889120843327115e-05,
"loss": 0.28,
"step": 22500
},
{
"epoch": 0.29981211773954985,
"eval_dev_accuracy": 0.9627809147587205,
"eval_dev_accuracy_threshold": 0.949242889881134,
"eval_dev_average_precision": 0.7827527934861719,
"eval_dev_f1": 0.7189280438911163,
"eval_dev_f1_threshold": 0.41740649938583374,
"eval_dev_precision": 0.6932546545935497,
"eval_dev_recall": 0.7465760929111428,
"eval_loss": 0.3353007137775421,
"eval_runtime": 558.6719,
"eval_samples_per_second": 237.433,
"eval_steps_per_second": 7.421,
"step": 22500
},
{
"epoch": 0.30114461604061454,
"grad_norm": 1.0338587760925293,
"learning_rate": 1.887640283083119e-05,
"loss": 0.284,
"step": 22600
},
{
"epoch": 0.30247711434167923,
"grad_norm": 0.5249596834182739,
"learning_rate": 1.8861597228391224e-05,
"loss": 0.2821,
"step": 22700
},
{
"epoch": 0.30380961264274386,
"grad_norm": 2.10871958732605,
"learning_rate": 1.8846791625951262e-05,
"loss": 0.2762,
"step": 22800
},
{
"epoch": 0.30514211094380855,
"grad_norm": 8.820456504821777,
"learning_rate": 1.8831986023511297e-05,
"loss": 0.3152,
"step": 22900
},
{
"epoch": 0.3064746092448732,
"grad_norm": 0.5152029395103455,
"learning_rate": 1.8817180421071335e-05,
"loss": 0.2879,
"step": 23000
},
{
"epoch": 0.3064746092448732,
"eval_dev_accuracy": 0.9637911147632438,
"eval_dev_accuracy_threshold": 0.8721863627433777,
"eval_dev_average_precision": 0.7944006641896747,
"eval_dev_f1": 0.7282656663724625,
"eval_dev_f1_threshold": 0.7819468975067139,
"eval_dev_precision": 0.7333629596711476,
"eval_dev_recall": 0.7232387421934918,
"eval_loss": 0.27257823944091797,
"eval_runtime": 559.9281,
"eval_samples_per_second": 236.9,
"eval_steps_per_second": 7.405,
"step": 23000
},
{
"epoch": 0.3078071075459379,
"grad_norm": 7.670559406280518,
"learning_rate": 1.8802374818631374e-05,
"loss": 0.2738,
"step": 23100
},
{
"epoch": 0.30913960584700256,
"grad_norm": 1.2862569093704224,
"learning_rate": 1.878756921619141e-05,
"loss": 0.2624,
"step": 23200
},
{
"epoch": 0.3104721041480672,
"grad_norm": 6.1086249351501465,
"learning_rate": 1.8772763613751447e-05,
"loss": 0.2698,
"step": 23300
},
{
"epoch": 0.3118046024491319,
"grad_norm": 2.7864394187927246,
"learning_rate": 1.8757958011311482e-05,
"loss": 0.278,
"step": 23400
},
{
"epoch": 0.3131371007501965,
"grad_norm": 0.4662020206451416,
"learning_rate": 1.874315240887152e-05,
"loss": 0.3024,
"step": 23500
},
{
"epoch": 0.3131371007501965,
"eval_dev_accuracy": 0.9640398953613727,
"eval_dev_accuracy_threshold": 0.9022700786590576,
"eval_dev_average_precision": 0.789665459307555,
"eval_dev_f1": 0.7233386555084511,
"eval_dev_f1_threshold": 0.49045658111572266,
"eval_dev_precision": 0.7269809650287737,
"eval_dev_recall": 0.719732661334502,
"eval_loss": 0.3414628207683563,
"eval_runtime": 561.5298,
"eval_samples_per_second": 236.224,
"eval_steps_per_second": 7.383,
"step": 23500
},
{
"epoch": 0.3144695990512612,
"grad_norm": 16.328683853149414,
"learning_rate": 1.8728346806431555e-05,
"loss": 0.3255,
"step": 23600
},
{
"epoch": 0.3158020973523259,
"grad_norm": 6.683753490447998,
"learning_rate": 1.8713541203991594e-05,
"loss": 0.3298,
"step": 23700
},
{
"epoch": 0.3171345956533905,
"grad_norm": 14.66252613067627,
"learning_rate": 1.869873560155163e-05,
"loss": 0.2902,
"step": 23800
},
{
"epoch": 0.3184670939544552,
"grad_norm": 1.7640432119369507,
"learning_rate": 1.8683929999111664e-05,
"loss": 0.283,
"step": 23900
},
{
"epoch": 0.31979959225551985,
"grad_norm": 20.055587768554688,
"learning_rate": 1.8669124396671702e-05,
"loss": 0.3098,
"step": 24000
},
{
"epoch": 0.31979959225551985,
"eval_dev_accuracy": 0.9638288087932633,
"eval_dev_accuracy_threshold": 0.9375428557395935,
"eval_dev_average_precision": 0.7947515841312096,
"eval_dev_f1": 0.731536653364675,
"eval_dev_f1_threshold": 0.8831270337104797,
"eval_dev_precision": 0.7299803622081605,
"eval_dev_recall": 0.7330995946094007,
"eval_loss": 0.2945517897605896,
"eval_runtime": 562.1643,
"eval_samples_per_second": 235.958,
"eval_steps_per_second": 7.375,
"step": 24000
},
{
"epoch": 0.32113209055658454,
"grad_norm": 40.83311080932617,
"learning_rate": 1.8654318794231737e-05,
"loss": 0.2592,
"step": 24100
},
{
"epoch": 0.3224645888576492,
"grad_norm": 5.973490238189697,
"learning_rate": 1.8639513191791775e-05,
"loss": 0.27,
"step": 24200
},
{
"epoch": 0.32379708715871386,
"grad_norm": 8.698867797851562,
"learning_rate": 1.862470758935181e-05,
"loss": 0.2738,
"step": 24300
},
{
"epoch": 0.32512958545977855,
"grad_norm": 8.795327186584473,
"learning_rate": 1.860990198691185e-05,
"loss": 0.2528,
"step": 24400
},
{
"epoch": 0.3264620837608432,
"grad_norm": 0.2583109438419342,
"learning_rate": 1.8595096384471883e-05,
"loss": 0.2694,
"step": 24500
},
{
"epoch": 0.3264620837608432,
"eval_dev_accuracy": 0.9628261475947439,
"eval_dev_accuracy_threshold": 0.9562203884124756,
"eval_dev_average_precision": 0.7884777296034856,
"eval_dev_f1": 0.7260596117035821,
"eval_dev_f1_threshold": 0.9503564834594727,
"eval_dev_precision": 0.7248307490718497,
"eval_dev_recall": 0.7272926481866988,
"eval_loss": 0.3025730550289154,
"eval_runtime": 561.6413,
"eval_samples_per_second": 236.177,
"eval_steps_per_second": 7.382,
"step": 24500
},
{
"epoch": 0.32779458206190787,
"grad_norm": 2.1876091957092285,
"learning_rate": 1.8580290782031922e-05,
"loss": 0.2288,
"step": 24600
},
{
"epoch": 0.32912708036297256,
"grad_norm": 7.1153459548950195,
"learning_rate": 1.856548517959196e-05,
"loss": 0.2966,
"step": 24700
},
{
"epoch": 0.3304595786640372,
"grad_norm": 0.5204883217811584,
"learning_rate": 1.8550679577151995e-05,
"loss": 0.3103,
"step": 24800
},
{
"epoch": 0.3317920769651019,
"grad_norm": 0.5321233868598938,
"learning_rate": 1.8535873974712033e-05,
"loss": 0.2403,
"step": 24900
},
{
"epoch": 0.3331245752661665,
"grad_norm": 0.5437518358230591,
"learning_rate": 1.8521068372272068e-05,
"loss": 0.2986,
"step": 25000
},
{
"epoch": 0.3331245752661665,
"eval_dev_accuracy": 0.964250981929482,
"eval_dev_accuracy_threshold": 0.8822938203811646,
"eval_dev_average_precision": 0.7923663060740336,
"eval_dev_f1": 0.7238444852327716,
"eval_dev_f1_threshold": 0.3350263833999634,
"eval_dev_precision": 0.7350881156800723,
"eval_dev_recall": 0.7129396296702093,
"eval_loss": 0.39168474078178406,
"eval_runtime": 559.9637,
"eval_samples_per_second": 236.885,
"eval_steps_per_second": 7.404,
"step": 25000
},
{
"epoch": 0.3344570735672312,
"grad_norm": 10.434455871582031,
"learning_rate": 1.8506262769832107e-05,
"loss": 0.2954,
"step": 25100
},
{
"epoch": 0.3357895718682959,
"grad_norm": 29.660995483398438,
"learning_rate": 1.849145716739214e-05,
"loss": 0.2778,
"step": 25200
},
{
"epoch": 0.3371220701693605,
"grad_norm": 17.967578887939453,
"learning_rate": 1.847665156495218e-05,
"loss": 0.2522,
"step": 25300
},
{
"epoch": 0.3384545684704252,
"grad_norm": 16.963655471801758,
"learning_rate": 1.8461845962512218e-05,
"loss": 0.3071,
"step": 25400
},
{
"epoch": 0.33978706677148984,
"grad_norm": 3.178967237472534,
"learning_rate": 1.8447040360072253e-05,
"loss": 0.3088,
"step": 25500
},
{
"epoch": 0.33978706677148984,
"eval_dev_accuracy": 0.9653064147700288,
"eval_dev_accuracy_threshold": 0.9469561576843262,
"eval_dev_average_precision": 0.8090508028224602,
"eval_dev_f1": 0.7406513872135102,
"eval_dev_f1_threshold": 0.9149296879768372,
"eval_dev_precision": 0.7413017231917463,
"eval_dev_recall": 0.7400021913005369,
"eval_loss": 0.28880587220191956,
"eval_runtime": 559.426,
"eval_samples_per_second": 237.113,
"eval_steps_per_second": 7.411,
"step": 25500
},
{
"epoch": 0.34111956507255453,
"grad_norm": 31.83365821838379,
"learning_rate": 1.843223475763229e-05,
"loss": 0.3328,
"step": 25600
},
{
"epoch": 0.3424520633736192,
"grad_norm": 73.58321380615234,
"learning_rate": 1.8417429155192326e-05,
"loss": 0.249,
"step": 25700
},
{
"epoch": 0.34378456167468385,
"grad_norm": 31.073486328125,
"learning_rate": 1.8402623552752365e-05,
"loss": 0.248,
"step": 25800
},
{
"epoch": 0.34511705997574854,
"grad_norm": 2.6796510219573975,
"learning_rate": 1.83878179503124e-05,
"loss": 0.2735,
"step": 25900
},
{
"epoch": 0.3464495582768132,
"grad_norm": 19.556621551513672,
"learning_rate": 1.8373012347872438e-05,
"loss": 0.3087,
"step": 26000
},
{
"epoch": 0.3464495582768132,
"eval_dev_accuracy": 0.9655853505921732,
"eval_dev_accuracy_threshold": 0.9267855882644653,
"eval_dev_average_precision": 0.8095342358911112,
"eval_dev_f1": 0.7389250472391351,
"eval_dev_f1_threshold": 0.6512651443481445,
"eval_dev_precision": 0.7092191435768262,
"eval_dev_recall": 0.7712282239509148,
"eval_loss": 0.26777184009552,
"eval_runtime": 562.5407,
"eval_samples_per_second": 235.8,
"eval_steps_per_second": 7.37,
"step": 26000
},
{
"epoch": 0.34778205657787786,
"grad_norm": 10.894082069396973,
"learning_rate": 1.8358206745432473e-05,
"loss": 0.2852,
"step": 26100
},
{
"epoch": 0.34911455487894255,
"grad_norm": 43.44607162475586,
"learning_rate": 1.8343401142992508e-05,
"loss": 0.255,
"step": 26200
},
{
"epoch": 0.3504470531800072,
"grad_norm": 0.060168083757162094,
"learning_rate": 1.8328595540552546e-05,
"loss": 0.27,
"step": 26300
},
{
"epoch": 0.3517795514810719,
"grad_norm": 0.13352444767951965,
"learning_rate": 1.831378993811258e-05,
"loss": 0.3315,
"step": 26400
},
{
"epoch": 0.3531120497821365,
"grad_norm": 2.8769795894622803,
"learning_rate": 1.829898433567262e-05,
"loss": 0.2548,
"step": 26500
},
{
"epoch": 0.3531120497821365,
"eval_dev_accuracy": 0.9633161699849978,
"eval_dev_accuracy_threshold": 0.9598461389541626,
"eval_dev_average_precision": 0.7967367390804647,
"eval_dev_f1": 0.7211769095463995,
"eval_dev_f1_threshold": 0.9407143592834473,
"eval_dev_precision": 0.7022005397550343,
"eval_dev_recall": 0.7412074065958146,
"eval_loss": 0.26967185735702515,
"eval_runtime": 561.1397,
"eval_samples_per_second": 236.389,
"eval_steps_per_second": 7.389,
"step": 26500
},
{
"epoch": 0.3544445480832012,
"grad_norm": 6.555627822875977,
"learning_rate": 1.8284178733232655e-05,
"loss": 0.2967,
"step": 26600
},
{
"epoch": 0.3557770463842659,
"grad_norm": 18.727455139160156,
"learning_rate": 1.8269373130792693e-05,
"loss": 0.2907,
"step": 26700
},
{
"epoch": 0.3571095446853305,
"grad_norm": 16.004812240600586,
"learning_rate": 1.825456752835273e-05,
"loss": 0.2871,
"step": 26800
},
{
"epoch": 0.3584420429863952,
"grad_norm": 0.3446504771709442,
"learning_rate": 1.8239761925912766e-05,
"loss": 0.287,
"step": 26900
},
{
"epoch": 0.35977454128745984,
"grad_norm": 1.3801554441452026,
"learning_rate": 1.8224956323472805e-05,
"loss": 0.2461,
"step": 27000
},
{
"epoch": 0.35977454128745984,
"eval_dev_accuracy": 0.9659321356683529,
"eval_dev_accuracy_threshold": 0.9563218355178833,
"eval_dev_average_precision": 0.8166384763438364,
"eval_dev_f1": 0.7424130273871207,
"eval_dev_f1_threshold": 0.5458764433860779,
"eval_dev_precision": 0.7173801982221314,
"eval_dev_recall": 0.7692560534677331,
"eval_loss": 0.3241870701313019,
"eval_runtime": 560.7697,
"eval_samples_per_second": 236.545,
"eval_steps_per_second": 7.393,
"step": 27000
},
{
"epoch": 0.3611070395885245,
"grad_norm": 11.259644508361816,
"learning_rate": 1.821015072103284e-05,
"loss": 0.3134,
"step": 27100
},
{
"epoch": 0.3624395378895892,
"grad_norm": 15.958681106567383,
"learning_rate": 1.8195345118592878e-05,
"loss": 0.229,
"step": 27200
},
{
"epoch": 0.36377203619065385,
"grad_norm": 3.471926689147949,
"learning_rate": 1.8180539516152913e-05,
"loss": 0.2318,
"step": 27300
},
{
"epoch": 0.36510453449171854,
"grad_norm": 57.36378479003906,
"learning_rate": 1.816573391371295e-05,
"loss": 0.2584,
"step": 27400
},
{
"epoch": 0.36643703279278317,
"grad_norm": 15.649163246154785,
"learning_rate": 1.8150928311272986e-05,
"loss": 0.3092,
"step": 27500
},
{
"epoch": 0.36643703279278317,
"eval_dev_accuracy": 0.9644696073035952,
"eval_dev_accuracy_threshold": 0.9345089793205261,
"eval_dev_average_precision": 0.8036883896122946,
"eval_dev_f1": 0.7414679756895747,
"eval_dev_f1_threshold": 0.8182344436645508,
"eval_dev_precision": 0.7049585144211774,
"eval_dev_recall": 0.7819655965815712,
"eval_loss": 0.2574635446071625,
"eval_runtime": 562.3168,
"eval_samples_per_second": 235.894,
"eval_steps_per_second": 7.373,
"step": 27500
},
{
"epoch": 0.36776953109384786,
"grad_norm": 31.03179931640625,
"learning_rate": 1.8136122708833024e-05,
"loss": 0.2781,
"step": 27600
},
{
"epoch": 0.36910202939491255,
"grad_norm": 32.65872573852539,
"learning_rate": 1.8121317106393063e-05,
"loss": 0.2411,
"step": 27700
},
{
"epoch": 0.3704345276959772,
"grad_norm": 10.414048194885254,
"learning_rate": 1.8106511503953098e-05,
"loss": 0.2768,
"step": 27800
},
{
"epoch": 0.37176702599704187,
"grad_norm": 0.27181100845336914,
"learning_rate": 1.8091705901513136e-05,
"loss": 0.256,
"step": 27900
},
{
"epoch": 0.3730995242981065,
"grad_norm": 15.69724178314209,
"learning_rate": 1.807690029907317e-05,
"loss": 0.3024,
"step": 28000
},
{
"epoch": 0.3730995242981065,
"eval_dev_accuracy": 0.9660904505944349,
"eval_dev_accuracy_threshold": 0.960444450378418,
"eval_dev_average_precision": 0.8143885872198954,
"eval_dev_f1": 0.7409103007718926,
"eval_dev_f1_threshold": 0.8899838328361511,
"eval_dev_precision": 0.7205425553944916,
"eval_dev_recall": 0.7624630218034404,
"eval_loss": 0.2652537524700165,
"eval_runtime": 560.0512,
"eval_samples_per_second": 236.848,
"eval_steps_per_second": 7.403,
"step": 28000
},
{
"epoch": 0.3744320225991712,
"grad_norm": 4.027531623840332,
"learning_rate": 1.806209469663321e-05,
"loss": 0.2676,
"step": 28100
},
{
"epoch": 0.3757645209002359,
"grad_norm": 6.543447494506836,
"learning_rate": 1.8047289094193244e-05,
"loss": 0.2384,
"step": 28200
},
{
"epoch": 0.3770970192013005,
"grad_norm": 35.99159622192383,
"learning_rate": 1.8032483491753283e-05,
"loss": 0.2586,
"step": 28300
},
{
"epoch": 0.3784295175023652,
"grad_norm": 1.3943774700164795,
"learning_rate": 1.8017677889313318e-05,
"loss": 0.2663,
"step": 28400
},
{
"epoch": 0.37976201580342983,
"grad_norm": 0.43371257185935974,
"learning_rate": 1.8002872286873352e-05,
"loss": 0.3077,
"step": 28500
},
{
"epoch": 0.37976201580342983,
"eval_dev_accuracy": 0.9662638431325247,
"eval_dev_accuracy_threshold": 0.9389976263046265,
"eval_dev_average_precision": 0.8185963813825948,
"eval_dev_f1": 0.7529551465428834,
"eval_dev_f1_threshold": 0.8002798557281494,
"eval_dev_precision": 0.7420212765957447,
"eval_dev_recall": 0.7642160622329353,
"eval_loss": 0.2862532138824463,
"eval_runtime": 562.8872,
"eval_samples_per_second": 235.655,
"eval_steps_per_second": 7.366,
"step": 28500
},
{
"epoch": 0.3810945141044945,
"grad_norm": 7.868191719055176,
"learning_rate": 1.798806668443339e-05,
"loss": 0.2609,
"step": 28600
},
{
"epoch": 0.3824270124055592,
"grad_norm": 0.37841853499412537,
"learning_rate": 1.7973261081993426e-05,
"loss": 0.277,
"step": 28700
},
{
"epoch": 0.38375951070662384,
"grad_norm": 1.237690806388855,
"learning_rate": 1.7958455479553464e-05,
"loss": 0.2635,
"step": 28800
},
{
"epoch": 0.38509200900768853,
"grad_norm": 14.932636260986328,
"learning_rate": 1.79436498771135e-05,
"loss": 0.2518,
"step": 28900
},
{
"epoch": 0.38642450730875316,
"grad_norm": 7.698137283325195,
"learning_rate": 1.7928844274673537e-05,
"loss": 0.2686,
"step": 29000
},
{
"epoch": 0.38642450730875316,
"eval_dev_accuracy": 0.9663693864165793,
"eval_dev_accuracy_threshold": 0.9125785231590271,
"eval_dev_average_precision": 0.8194613717227588,
"eval_dev_f1": 0.7500950931913275,
"eval_dev_f1_threshold": 0.7369703054428101,
"eval_dev_precision": 0.7440707201379905,
"eval_dev_recall": 0.7562178152733647,
"eval_loss": 0.25516369938850403,
"eval_runtime": 561.2432,
"eval_samples_per_second": 236.345,
"eval_steps_per_second": 7.387,
"step": 29000
},
{
"epoch": 0.38775700560981785,
"grad_norm": 11.858484268188477,
"learning_rate": 1.7914038672233576e-05,
"loss": 0.2419,
"step": 29100
},
{
"epoch": 0.38908950391088254,
"grad_norm": 1.3223813772201538,
"learning_rate": 1.789923306979361e-05,
"loss": 0.268,
"step": 29200
},
{
"epoch": 0.3904220022119472,
"grad_norm": 1.3486851453781128,
"learning_rate": 1.788442746735365e-05,
"loss": 0.2851,
"step": 29300
},
{
"epoch": 0.39175450051301186,
"grad_norm": 4.85157585144043,
"learning_rate": 1.7869621864913684e-05,
"loss": 0.2212,
"step": 29400
},
{
"epoch": 0.3930869988140765,
"grad_norm": 6.538160800933838,
"learning_rate": 1.7854816262473722e-05,
"loss": 0.2571,
"step": 29500
},
{
"epoch": 0.3930869988140765,
"eval_dev_accuracy": 0.9645676117816461,
"eval_dev_accuracy_threshold": 0.8994825482368469,
"eval_dev_average_precision": 0.8082227405172548,
"eval_dev_f1": 0.7435443565181175,
"eval_dev_f1_threshold": 0.609738826751709,
"eval_dev_precision": 0.7083622656482492,
"eval_dev_recall": 0.7824038566889449,
"eval_loss": 0.2665890157222748,
"eval_runtime": 562.6368,
"eval_samples_per_second": 235.76,
"eval_steps_per_second": 7.369,
"step": 29500
},
{
"epoch": 0.3944194971151412,
"grad_norm": 10.298799514770508,
"learning_rate": 1.7840010660033757e-05,
"loss": 0.2803,
"step": 29600
},
{
"epoch": 0.39575199541620587,
"grad_norm": 46.07704162597656,
"learning_rate": 1.7825205057593796e-05,
"loss": 0.3034,
"step": 29700
},
{
"epoch": 0.3970844937172705,
"grad_norm": 12.525829315185547,
"learning_rate": 1.781039945515383e-05,
"loss": 0.2332,
"step": 29800
},
{
"epoch": 0.3984169920183352,
"grad_norm": 3.9645519256591797,
"learning_rate": 1.779559385271387e-05,
"loss": 0.2444,
"step": 29900
},
{
"epoch": 0.3997494903193998,
"grad_norm": 18.388866424560547,
"learning_rate": 1.7780788250273907e-05,
"loss": 0.247,
"step": 30000
},
{
"epoch": 0.3997494903193998,
"eval_dev_accuracy": 0.9654571908901068,
"eval_dev_accuracy_threshold": 0.9365599155426025,
"eval_dev_average_precision": 0.8171252302464322,
"eval_dev_f1": 0.747335818153184,
"eval_dev_f1_threshold": 0.8443748354911804,
"eval_dev_precision": 0.7173956863535578,
"eval_dev_recall": 0.779883861071546,
"eval_loss": 0.267426073551178,
"eval_runtime": 564.1091,
"eval_samples_per_second": 235.144,
"eval_steps_per_second": 7.35,
"step": 30000
},
{
"epoch": 0.4010819886204645,
"grad_norm": 23.66806411743164,
"learning_rate": 1.7765982647833942e-05,
"loss": 0.2861,
"step": 30100
},
{
"epoch": 0.4024144869215292,
"grad_norm": 3.966848611831665,
"learning_rate": 1.775117704539398e-05,
"loss": 0.2409,
"step": 30200
},
{
"epoch": 0.40374698522259383,
"grad_norm": 14.780499458312988,
"learning_rate": 1.7736371442954015e-05,
"loss": 0.2658,
"step": 30300
},
{
"epoch": 0.4050794835236585,
"grad_norm": 30.90425682067871,
"learning_rate": 1.7721565840514054e-05,
"loss": 0.3114,
"step": 30400
},
{
"epoch": 0.40641198182472316,
"grad_norm": 5.639667987823486,
"learning_rate": 1.770676023807409e-05,
"loss": 0.2685,
"step": 30500
},
{
"epoch": 0.40641198182472316,
"eval_dev_accuracy": 0.9670704953749425,
"eval_dev_accuracy_threshold": 0.9521620869636536,
"eval_dev_average_precision": 0.8255021501170436,
"eval_dev_f1": 0.7578924800343035,
"eval_dev_f1_threshold": 0.8574447631835938,
"eval_dev_precision": 0.7418677859391396,
"eval_dev_recall": 0.7746247397830612,
"eval_loss": 0.27643173933029175,
"eval_runtime": 561.8887,
"eval_samples_per_second": 236.073,
"eval_steps_per_second": 7.379,
"step": 30500
},
{
"epoch": 0.40774448012578784,
"grad_norm": 0.6215185523033142,
"learning_rate": 1.7691954635634127e-05,
"loss": 0.2354,
"step": 30600
},
{
"epoch": 0.40907697842685253,
"grad_norm": 4.660243034362793,
"learning_rate": 1.7677149033194162e-05,
"loss": 0.2576,
"step": 30700
},
{
"epoch": 0.41040947672791717,
"grad_norm": 0.37590527534484863,
"learning_rate": 1.7662343430754197e-05,
"loss": 0.2647,
"step": 30800
},
{
"epoch": 0.41174197502898185,
"grad_norm": 0.8927075862884521,
"learning_rate": 1.7647537828314235e-05,
"loss": 0.2175,
"step": 30900
},
{
"epoch": 0.4130744733300465,
"grad_norm": 3.024475336074829,
"learning_rate": 1.763273222587427e-05,
"loss": 0.3085,
"step": 31000
},
{
"epoch": 0.4130744733300465,
"eval_dev_accuracy": 0.9660376789524076,
"eval_dev_accuracy_threshold": 0.9548216462135315,
"eval_dev_average_precision": 0.8156242337854964,
"eval_dev_f1": 0.7478032096816627,
"eval_dev_f1_threshold": 0.6426188945770264,
"eval_dev_precision": 0.7193763919821826,
"eval_dev_recall": 0.7785690807494248,
"eval_loss": 0.26265889406204224,
"eval_runtime": 565.3292,
"eval_samples_per_second": 234.637,
"eval_steps_per_second": 7.334,
"step": 31000
},
{
"epoch": 0.4144069716311112,
"grad_norm": 0.6045613884925842,
"learning_rate": 1.761792662343431e-05,
"loss": 0.2637,
"step": 31100
},
{
"epoch": 0.4157394699321758,
"grad_norm": 0.6080629229545593,
"learning_rate": 1.7603121020994344e-05,
"loss": 0.2567,
"step": 31200
},
{
"epoch": 0.4170719682332405,
"grad_norm": 0.933800995349884,
"learning_rate": 1.7588315418554382e-05,
"loss": 0.2906,
"step": 31300
},
{
"epoch": 0.4184044665343052,
"grad_norm": 3.305546522140503,
"learning_rate": 1.757350981611442e-05,
"loss": 0.2516,
"step": 31400
},
{
"epoch": 0.4197369648353698,
"grad_norm": 9.856147766113281,
"learning_rate": 1.7558704213674455e-05,
"loss": 0.2342,
"step": 31500
},
{
"epoch": 0.4197369648353698,
"eval_dev_accuracy": 0.9664523132826223,
"eval_dev_accuracy_threshold": 0.6949450373649597,
"eval_dev_average_precision": 0.8198951977617771,
"eval_dev_f1": 0.752799668187474,
"eval_dev_f1_threshold": 0.14068716764450073,
"eval_dev_precision": 0.7144966046648952,
"eval_dev_recall": 0.7954420948833133,
"eval_loss": 0.3560490906238556,
"eval_runtime": 566.4442,
"eval_samples_per_second": 234.175,
"eval_steps_per_second": 7.319,
"step": 31500
},
{
"epoch": 0.4210694631364345,
"grad_norm": 6.468503952026367,
"learning_rate": 1.7543898611234493e-05,
"loss": 0.2595,
"step": 31600
},
{
"epoch": 0.42240196143749914,
"grad_norm": 2.2248482704162598,
"learning_rate": 1.752909300879453e-05,
"loss": 0.259,
"step": 31700
},
{
"epoch": 0.42373445973856383,
"grad_norm": 2.2780916690826416,
"learning_rate": 1.7514287406354567e-05,
"loss": 0.2563,
"step": 31800
},
{
"epoch": 0.4250669580396285,
"grad_norm": 5.997177600860596,
"learning_rate": 1.74994818039146e-05,
"loss": 0.2504,
"step": 31900
},
{
"epoch": 0.42639945634069315,
"grad_norm": 5.018893241882324,
"learning_rate": 1.748467620147464e-05,
"loss": 0.2751,
"step": 32000
},
{
"epoch": 0.42639945634069315,
"eval_dev_accuracy": 0.9660979894004388,
"eval_dev_accuracy_threshold": 0.9447215795516968,
"eval_dev_average_precision": 0.818149670082586,
"eval_dev_f1": 0.7564001884718078,
"eval_dev_f1_threshold": 0.7197975516319275,
"eval_dev_precision": 0.7242831361540004,
"eval_dev_recall": 0.7914977539169498,
"eval_loss": 0.23995983600616455,
"eval_runtime": 559.4727,
"eval_samples_per_second": 237.093,
"eval_steps_per_second": 7.411,
"step": 32000
},
{
"epoch": 0.42773195464175784,
"grad_norm": 9.826861381530762,
"learning_rate": 1.7469870599034675e-05,
"loss": 0.2521,
"step": 32100
},
{
"epoch": 0.42906445294282247,
"grad_norm": 7.288123607635498,
"learning_rate": 1.7455064996594713e-05,
"loss": 0.2406,
"step": 32200
},
{
"epoch": 0.43039695124388716,
"grad_norm": 11.257208824157715,
"learning_rate": 1.744025939415475e-05,
"loss": 0.3026,
"step": 32300
},
{
"epoch": 0.43172944954495185,
"grad_norm": 0.21672357618808746,
"learning_rate": 1.7425453791714787e-05,
"loss": 0.234,
"step": 32400
},
{
"epoch": 0.4330619478460165,
"grad_norm": 1.5854872465133667,
"learning_rate": 1.7410648189274825e-05,
"loss": 0.2639,
"step": 32500
},
{
"epoch": 0.4330619478460165,
"eval_dev_accuracy": 0.9651707162619584,
"eval_dev_accuracy_threshold": 0.8978205919265747,
"eval_dev_average_precision": 0.8087336536278384,
"eval_dev_f1": 0.740958788898234,
"eval_dev_f1_threshold": 0.7787094712257385,
"eval_dev_precision": 0.7121349904011317,
"eval_dev_recall": 0.7722143091925058,
"eval_loss": 0.2519395053386688,
"eval_runtime": 557.1987,
"eval_samples_per_second": 238.061,
"eval_steps_per_second": 7.441,
"step": 32500
},
{
"epoch": 0.43439444614708117,
"grad_norm": 5.898445129394531,
"learning_rate": 1.739584258683486e-05,
"loss": 0.2321,
"step": 32600
},
{
"epoch": 0.4357269444481458,
"grad_norm": 0.27915239334106445,
"learning_rate": 1.7381036984394898e-05,
"loss": 0.1894,
"step": 32700
},
{
"epoch": 0.4370594427492105,
"grad_norm": 0.3429672122001648,
"learning_rate": 1.7366231381954933e-05,
"loss": 0.3076,
"step": 32800
},
{
"epoch": 0.4383919410502752,
"grad_norm": 0.6808755397796631,
"learning_rate": 1.735142577951497e-05,
"loss": 0.2392,
"step": 32900
},
{
"epoch": 0.4397244393513398,
"grad_norm": 36.33818435668945,
"learning_rate": 1.7336620177075006e-05,
"loss": 0.2742,
"step": 33000
},
{
"epoch": 0.4397244393513398,
"eval_dev_accuracy": 0.9674248192571261,
"eval_dev_accuracy_threshold": 0.9071935415267944,
"eval_dev_average_precision": 0.8132130323917695,
"eval_dev_f1": 0.7505652677438923,
"eval_dev_f1_threshold": 0.5399670600891113,
"eval_dev_precision": 0.7556073728625361,
"eval_dev_recall": 0.7455900076695519,
"eval_loss": 0.30597466230392456,
"eval_runtime": 519.866,
"eval_samples_per_second": 255.156,
"eval_steps_per_second": 7.975,
"step": 33000
},
{
"epoch": 0.4410569376524045,
"grad_norm": 6.550230503082275,
"learning_rate": 1.732181457463504e-05,
"loss": 0.2624,
"step": 33100
},
{
"epoch": 0.44238943595346913,
"grad_norm": 15.728365898132324,
"learning_rate": 1.730700897219508e-05,
"loss": 0.2481,
"step": 33200
},
{
"epoch": 0.4437219342545338,
"grad_norm": 1.1476960182189941,
"learning_rate": 1.7292203369755115e-05,
"loss": 0.2289,
"step": 33300
},
{
"epoch": 0.4450544325555985,
"grad_norm": 89.61054992675781,
"learning_rate": 1.7277397767315153e-05,
"loss": 0.2854,
"step": 33400
},
{
"epoch": 0.44638693085666314,
"grad_norm": 4.351845741271973,
"learning_rate": 1.7262592164875188e-05,
"loss": 0.2733,
"step": 33500
},
{
"epoch": 0.44638693085666314,
"eval_dev_accuracy": 0.9650500953658959,
"eval_dev_accuracy_threshold": 0.9060708284378052,
"eval_dev_average_precision": 0.8133536713572236,
"eval_dev_f1": 0.744153082919915,
"eval_dev_f1_threshold": 0.8223495483398438,
"eval_dev_precision": 0.7405598958333334,
"eval_dev_recall": 0.7477813082064205,
"eval_loss": 0.272208571434021,
"eval_runtime": 521.749,
"eval_samples_per_second": 254.235,
"eval_steps_per_second": 7.946,
"step": 33500
},
{
"epoch": 0.44771942915772783,
"grad_norm": 6.246555805206299,
"learning_rate": 1.7247786562435226e-05,
"loss": 0.2759,
"step": 33600
},
{
"epoch": 0.44905192745879247,
"grad_norm": 52.076377868652344,
"learning_rate": 1.7232980959995265e-05,
"loss": 0.2588,
"step": 33700
},
{
"epoch": 0.45038442575985715,
"grad_norm": 5.682718276977539,
"learning_rate": 1.72181753575553e-05,
"loss": 0.2106,
"step": 33800
},
{
"epoch": 0.45171692406092184,
"grad_norm": 2.271516799926758,
"learning_rate": 1.7203369755115338e-05,
"loss": 0.2631,
"step": 33900
},
{
"epoch": 0.4530494223619865,
"grad_norm": 1.0763822793960571,
"learning_rate": 1.7188564152675373e-05,
"loss": 0.304,
"step": 34000
},
{
"epoch": 0.4530494223619865,
"eval_dev_accuracy": 0.9669197192548644,
"eval_dev_accuracy_threshold": 0.8872429132461548,
"eval_dev_average_precision": 0.8222864572131344,
"eval_dev_f1": 0.7534934497816593,
"eval_dev_f1_threshold": 0.4772883951663971,
"eval_dev_precision": 0.750788643533123,
"eval_dev_recall": 0.7562178152733647,
"eval_loss": 0.2554573118686676,
"eval_runtime": 520.4082,
"eval_samples_per_second": 254.89,
"eval_steps_per_second": 7.967,
"step": 34000
},
{
"epoch": 0.45438192066305116,
"grad_norm": 0.5738760828971863,
"learning_rate": 1.717375855023541e-05,
"loss": 0.2513,
"step": 34100
},
{
"epoch": 0.4557144189641158,
"grad_norm": 2.8462681770324707,
"learning_rate": 1.7158952947795446e-05,
"loss": 0.2507,
"step": 34200
},
{
"epoch": 0.4570469172651805,
"grad_norm": 8.60177993774414,
"learning_rate": 1.7144147345355484e-05,
"loss": 0.2417,
"step": 34300
},
{
"epoch": 0.4583794155662452,
"grad_norm": 1.3673675060272217,
"learning_rate": 1.712934174291552e-05,
"loss": 0.239,
"step": 34400
},
{
"epoch": 0.4597119138673098,
"grad_norm": 36.5560188293457,
"learning_rate": 1.7114536140475558e-05,
"loss": 0.2527,
"step": 34500
},
{
"epoch": 0.4597119138673098,
"eval_dev_accuracy": 0.9667614043287824,
"eval_dev_accuracy_threshold": 0.9581319093704224,
"eval_dev_average_precision": 0.818866417573704,
"eval_dev_f1": 0.7523900039134568,
"eval_dev_f1_threshold": 0.9470370411872864,
"eval_dev_precision": 0.7681506849315068,
"eval_dev_recall": 0.7372630656294511,
"eval_loss": 0.2984105348587036,
"eval_runtime": 519.7969,
"eval_samples_per_second": 255.19,
"eval_steps_per_second": 7.976,
"step": 34500
},
{
"epoch": 0.4610444121683745,
"grad_norm": 17.973974227905273,
"learning_rate": 1.7099730538035596e-05,
"loss": 0.2807,
"step": 34600
},
{
"epoch": 0.46237691046943913,
"grad_norm": 9.143497467041016,
"learning_rate": 1.708492493559563e-05,
"loss": 0.2304,
"step": 34700
},
{
"epoch": 0.4637094087705038,
"grad_norm": 8.447179794311523,
"learning_rate": 1.707011933315567e-05,
"loss": 0.2707,
"step": 34800
},
{
"epoch": 0.4650419070715685,
"grad_norm": 0.18045054376125336,
"learning_rate": 1.7055313730715704e-05,
"loss": 0.2202,
"step": 34900
},
{
"epoch": 0.46637440537263314,
"grad_norm": 18.00141716003418,
"learning_rate": 1.7040508128275743e-05,
"loss": 0.2802,
"step": 35000
},
{
"epoch": 0.46637440537263314,
"eval_dev_accuracy": 0.9667387879107707,
"eval_dev_accuracy_threshold": 0.922869086265564,
"eval_dev_average_precision": 0.8248757172419965,
"eval_dev_f1": 0.7573180276545787,
"eval_dev_f1_threshold": 0.618488073348999,
"eval_dev_precision": 0.7229527794381351,
"eval_dev_recall": 0.795113399802783,
"eval_loss": 0.2512986958026886,
"eval_runtime": 520.5462,
"eval_samples_per_second": 254.823,
"eval_steps_per_second": 7.965,
"step": 35000
},
{
"epoch": 0.4677069036736978,
"grad_norm": 0.6688315868377686,
"learning_rate": 1.7025702525835778e-05,
"loss": 0.2375,
"step": 35100
},
{
"epoch": 0.46903940197476246,
"grad_norm": 17.023473739624023,
"learning_rate": 1.7010896923395816e-05,
"loss": 0.2058,
"step": 35200
},
{
"epoch": 0.47037190027582715,
"grad_norm": 0.3867310881614685,
"learning_rate": 1.699609132095585e-05,
"loss": 0.2419,
"step": 35300
},
{
"epoch": 0.47170439857689184,
"grad_norm": 13.710586547851562,
"learning_rate": 1.6981285718515886e-05,
"loss": 0.2232,
"step": 35400
},
{
"epoch": 0.47303689687795647,
"grad_norm": 14.513033866882324,
"learning_rate": 1.6966480116075924e-05,
"loss": 0.316,
"step": 35500
},
{
"epoch": 0.47303689687795647,
"eval_dev_accuracy": 0.9672815819430518,
"eval_dev_accuracy_threshold": 0.9403676986694336,
"eval_dev_average_precision": 0.821150327893476,
"eval_dev_f1": 0.7596174282678001,
"eval_dev_f1_threshold": 0.7547413110733032,
"eval_dev_precision": 0.7374393892499742,
"eval_dev_recall": 0.7831708118768489,
"eval_loss": 0.2614619731903076,
"eval_runtime": 520.6856,
"eval_samples_per_second": 254.755,
"eval_steps_per_second": 7.963,
"step": 35500
},
{
"epoch": 0.47436939517902116,
"grad_norm": 2.2954721450805664,
"learning_rate": 1.695167451363596e-05,
"loss": 0.2527,
"step": 35600
},
{
"epoch": 0.4757018934800858,
"grad_norm": 2.294912338256836,
"learning_rate": 1.6936868911195997e-05,
"loss": 0.2732,
"step": 35700
},
{
"epoch": 0.4770343917811505,
"grad_norm": 100.57258605957031,
"learning_rate": 1.6922063308756032e-05,
"loss": 0.2806,
"step": 35800
},
{
"epoch": 0.47836689008221517,
"grad_norm": 13.040018081665039,
"learning_rate": 1.690725770631607e-05,
"loss": 0.25,
"step": 35900
},
{
"epoch": 0.4796993883832798,
"grad_norm": 0.7189066410064697,
"learning_rate": 1.689245210387611e-05,
"loss": 0.2173,
"step": 36000
},
{
"epoch": 0.4796993883832798,
"eval_dev_accuracy": 0.9675982117952159,
"eval_dev_accuracy_threshold": 0.9232138395309448,
"eval_dev_average_precision": 0.8271432363427305,
"eval_dev_f1": 0.7561493449329397,
"eval_dev_f1_threshold": 0.49452510476112366,
"eval_dev_precision": 0.7169088766692852,
"eval_dev_recall": 0.799934260983894,
"eval_loss": 0.2864265441894531,
"eval_runtime": 519.7616,
"eval_samples_per_second": 255.207,
"eval_steps_per_second": 7.977,
"step": 36000
},
{
"epoch": 0.4810318866843445,
"grad_norm": 0.35153084993362427,
"learning_rate": 1.6877646501436144e-05,
"loss": 0.2576,
"step": 36100
},
{
"epoch": 0.4823643849854091,
"grad_norm": 0.3834153413772583,
"learning_rate": 1.6862840898996182e-05,
"loss": 0.2087,
"step": 36200
},
{
"epoch": 0.4836968832864738,
"grad_norm": 0.9096924066543579,
"learning_rate": 1.6848035296556217e-05,
"loss": 0.2581,
"step": 36300
},
{
"epoch": 0.4850293815875385,
"grad_norm": 17.327335357666016,
"learning_rate": 1.6833229694116256e-05,
"loss": 0.265,
"step": 36400
},
{
"epoch": 0.48636187988860313,
"grad_norm": 3.3336431980133057,
"learning_rate": 1.681842409167629e-05,
"loss": 0.2404,
"step": 36500
},
{
"epoch": 0.48636187988860313,
"eval_dev_accuracy": 0.9678545311993486,
"eval_dev_accuracy_threshold": 0.6843677163124084,
"eval_dev_average_precision": 0.8368359833153991,
"eval_dev_f1": 0.7613580982292738,
"eval_dev_f1_threshold": 0.3513629138469696,
"eval_dev_precision": 0.7526766595289079,
"eval_dev_recall": 0.770242138709324,
"eval_loss": 0.3165341913700104,
"eval_runtime": 520.0004,
"eval_samples_per_second": 255.09,
"eval_steps_per_second": 7.973,
"step": 36500
},
{
"epoch": 0.4876943781896678,
"grad_norm": 22.69322395324707,
"learning_rate": 1.680361848923633e-05,
"loss": 0.2525,
"step": 36600
},
{
"epoch": 0.48902687649073245,
"grad_norm": 4.788589954376221,
"learning_rate": 1.6788812886796367e-05,
"loss": 0.2262,
"step": 36700
},
{
"epoch": 0.49035937479179714,
"grad_norm": 48.63047409057617,
"learning_rate": 1.6774007284356402e-05,
"loss": 0.2572,
"step": 36800
},
{
"epoch": 0.49169187309286183,
"grad_norm": 8.924850463867188,
"learning_rate": 1.675920168191644e-05,
"loss": 0.2608,
"step": 36900
},
{
"epoch": 0.49302437139392646,
"grad_norm": 0.28982293605804443,
"learning_rate": 1.6744396079476476e-05,
"loss": 0.2212,
"step": 37000
},
{
"epoch": 0.49302437139392646,
"eval_dev_accuracy": 0.9689250416519032,
"eval_dev_accuracy_threshold": 0.9271968603134155,
"eval_dev_average_precision": 0.8366709399417354,
"eval_dev_f1": 0.7682220970137786,
"eval_dev_f1_threshold": 0.2581200897693634,
"eval_dev_precision": 0.7213351288957291,
"eval_dev_recall": 0.8216281362988934,
"eval_loss": 0.29943621158599854,
"eval_runtime": 519.5326,
"eval_samples_per_second": 255.32,
"eval_steps_per_second": 7.98,
"step": 37000
},
{
"epoch": 0.49435686969499115,
"grad_norm": 8.631064414978027,
"learning_rate": 1.6729590477036514e-05,
"loss": 0.2236,
"step": 37100
},
{
"epoch": 0.4956893679960558,
"grad_norm": 0.3893554210662842,
"learning_rate": 1.671478487459655e-05,
"loss": 0.2542,
"step": 37200
},
{
"epoch": 0.4970218662971205,
"grad_norm": 11.258530616760254,
"learning_rate": 1.6699979272156587e-05,
"loss": 0.2775,
"step": 37300
},
{
"epoch": 0.49835436459818516,
"grad_norm": 23.54794692993164,
"learning_rate": 1.6685173669716622e-05,
"loss": 0.2437,
"step": 37400
},
{
"epoch": 0.4996868628992498,
"grad_norm": 15.748093605041504,
"learning_rate": 1.667036806727666e-05,
"loss": 0.3368,
"step": 37500
},
{
"epoch": 0.4996868628992498,
"eval_dev_accuracy": 0.9686687222477705,
"eval_dev_accuracy_threshold": 0.9627949595451355,
"eval_dev_average_precision": 0.8345873786652108,
"eval_dev_f1": 0.7644562041783806,
"eval_dev_f1_threshold": 0.6956943869590759,
"eval_dev_precision": 0.7692478366984691,
"eval_dev_recall": 0.7597238961323546,
"eval_loss": 0.2797723412513733,
"eval_runtime": 520.4988,
"eval_samples_per_second": 254.846,
"eval_steps_per_second": 7.965,
"step": 37500
},
{
"epoch": 0.5010193612003144,
"grad_norm": 2.5468738079071045,
"learning_rate": 1.6655562464836695e-05,
"loss": 0.2806,
"step": 37600
},
{
"epoch": 0.5023518595013792,
"grad_norm": 2.1441900730133057,
"learning_rate": 1.664075686239673e-05,
"loss": 0.2576,
"step": 37700
},
{
"epoch": 0.5036843578024438,
"grad_norm": 1.2568778991699219,
"learning_rate": 1.662595125995677e-05,
"loss": 0.2848,
"step": 37800
},
{
"epoch": 0.5050168561035084,
"grad_norm": 3.095561981201172,
"learning_rate": 1.6611145657516804e-05,
"loss": 0.215,
"step": 37900
},
{
"epoch": 0.5063493544045732,
"grad_norm": 1.0205029249191284,
"learning_rate": 1.6596340055076842e-05,
"loss": 0.2331,
"step": 38000
},
{
"epoch": 0.5063493544045732,
"eval_dev_accuracy": 0.9679223804533837,
"eval_dev_accuracy_threshold": 0.9669108390808105,
"eval_dev_average_precision": 0.8269638058273905,
"eval_dev_f1": 0.7641839204087119,
"eval_dev_f1_threshold": 0.8454810380935669,
"eval_dev_precision": 0.7504224757076469,
"eval_dev_recall": 0.7784595157225813,
"eval_loss": 0.27937838435173035,
"eval_runtime": 519.5927,
"eval_samples_per_second": 255.29,
"eval_steps_per_second": 7.979,
"step": 38000
},
{
"epoch": 0.5076818527056378,
"grad_norm": 31.32097053527832,
"learning_rate": 1.6581534452636877e-05,
"loss": 0.27,
"step": 38100
},
{
"epoch": 0.5090143510067024,
"grad_norm": 0.6534382104873657,
"learning_rate": 1.6566728850196915e-05,
"loss": 0.2522,
"step": 38200
},
{
"epoch": 0.5103468493077671,
"grad_norm": 23.841657638549805,
"learning_rate": 1.6551923247756954e-05,
"loss": 0.251,
"step": 38300
},
{
"epoch": 0.5116793476088318,
"grad_norm": 11.927959442138672,
"learning_rate": 1.653711764531699e-05,
"loss": 0.2299,
"step": 38400
},
{
"epoch": 0.5130118459098965,
"grad_norm": 1.765657663345337,
"learning_rate": 1.6522312042877027e-05,
"loss": 0.2543,
"step": 38500
},
{
"epoch": 0.5130118459098965,
"eval_dev_accuracy": 0.9683596312016103,
"eval_dev_accuracy_threshold": 0.9189764261245728,
"eval_dev_average_precision": 0.8251932254713443,
"eval_dev_f1": 0.7703276368781975,
"eval_dev_f1_threshold": 0.8014627695083618,
"eval_dev_precision": 0.765329295987888,
"eval_dev_recall": 0.7753916949709653,
"eval_loss": 0.2911910116672516,
"eval_runtime": 519.1529,
"eval_samples_per_second": 255.507,
"eval_steps_per_second": 7.986,
"step": 38500
},
{
"epoch": 0.5143443442109611,
"grad_norm": 6.26005220413208,
"learning_rate": 1.6507506440437062e-05,
"loss": 0.203,
"step": 38600
},
{
"epoch": 0.5156768425120258,
"grad_norm": 3.370025157928467,
"learning_rate": 1.64927008379971e-05,
"loss": 0.2621,
"step": 38700
},
{
"epoch": 0.5170093408130905,
"grad_norm": 29.85224151611328,
"learning_rate": 1.6477895235557135e-05,
"loss": 0.2677,
"step": 38800
},
{
"epoch": 0.5183418391141551,
"grad_norm": 13.099495887756348,
"learning_rate": 1.6463089633117173e-05,
"loss": 0.2377,
"step": 38900
},
{
"epoch": 0.5196743374152198,
"grad_norm": 17.140789031982422,
"learning_rate": 1.6448284030677212e-05,
"loss": 0.265,
"step": 39000
},
{
"epoch": 0.5196743374152198,
"eval_dev_accuracy": 0.9675529789591925,
"eval_dev_accuracy_threshold": 0.9323844909667969,
"eval_dev_average_precision": 0.818988116595722,
"eval_dev_f1": 0.7656208525773743,
"eval_dev_f1_threshold": 0.8410446643829346,
"eval_dev_precision": 0.7426364572605562,
"eval_dev_recall": 0.7900734085679851,
"eval_loss": 0.28379642963409424,
"eval_runtime": 518.2405,
"eval_samples_per_second": 255.956,
"eval_steps_per_second": 8.0,
"step": 39000
},
{
"epoch": 0.5210068357162845,
"grad_norm": 2.0083911418914795,
"learning_rate": 1.319120586275816e-07,
"loss": 0.2108,
"step": 39100
},
{
"epoch": 0.5223393340173491,
"grad_norm": 0.4948272705078125,
"learning_rate": 2.651565622918055e-07,
"loss": 0.227,
"step": 39200
},
{
"epoch": 0.5236718323184137,
"grad_norm": 11.525949478149414,
"learning_rate": 3.984010659560293e-07,
"loss": 0.2081,
"step": 39300
},
{
"epoch": 0.5250043306194785,
"grad_norm": 18.18743133544922,
"learning_rate": 5.316455696202532e-07,
"loss": 0.2782,
"step": 39400
},
{
"epoch": 0.5263368289205431,
"grad_norm": 30.067602157592773,
"learning_rate": 6.64890073284477e-07,
"loss": 0.2357,
"step": 39500
},
{
"epoch": 0.5263368289205431,
"eval_dev_accuracy": 0.9679374580653916,
"eval_dev_accuracy_threshold": 0.8992660045623779,
"eval_dev_average_precision": 0.8239503903565419,
"eval_dev_f1": 0.768843413510473,
"eval_dev_f1_threshold": 0.8412591814994812,
"eval_dev_precision": 0.7522012578616353,
"eval_dev_recall": 0.7862386326284649,
"eval_loss": 0.27902960777282715,
"eval_runtime": 522.9572,
"eval_samples_per_second": 253.648,
"eval_steps_per_second": 7.928,
"step": 39500
},
{
"epoch": 0.5276693272216078,
"grad_norm": 1.496453881263733,
"learning_rate": 7.981345769487009e-07,
"loss": 0.2654,
"step": 39600
},
{
"epoch": 0.5290018255226725,
"grad_norm": 2.676929473876953,
"learning_rate": 9.313790806129248e-07,
"loss": 0.2572,
"step": 39700
},
{
"epoch": 0.5303343238237371,
"grad_norm": 1.3355958461761475,
"learning_rate": 1.0646235842771487e-06,
"loss": 0.2452,
"step": 39800
},
{
"epoch": 0.5316668221248018,
"grad_norm": 24.94687843322754,
"learning_rate": 1.1978680879413725e-06,
"loss": 0.2412,
"step": 39900
},
{
"epoch": 0.5329993204258665,
"grad_norm": 16.272785186767578,
"learning_rate": 1.3311125916055965e-06,
"loss": 0.2656,
"step": 40000
},
{
"epoch": 0.5329993204258665,
"eval_dev_accuracy": 0.9683219371715908,
"eval_dev_accuracy_threshold": 0.8796899914741516,
"eval_dev_average_precision": 0.8334902875069624,
"eval_dev_f1": 0.7711174542763505,
"eval_dev_f1_threshold": 0.6210243701934814,
"eval_dev_precision": 0.7449698702890409,
"eval_dev_recall": 0.7991673057959899,
"eval_loss": 0.2717488408088684,
"eval_runtime": 523.9326,
"eval_samples_per_second": 253.176,
"eval_steps_per_second": 7.913,
"step": 40000
},
{
"epoch": 0.5343318187269311,
"grad_norm": 38.643516540527344,
"learning_rate": 1.4643570952698202e-06,
"loss": 0.2558,
"step": 40100
},
{
"epoch": 0.5356643170279958,
"grad_norm": 0.41367307305336,
"learning_rate": 1.597601598934044e-06,
"loss": 0.2445,
"step": 40200
},
{
"epoch": 0.5369968153290604,
"grad_norm": 0.5968548655509949,
"learning_rate": 1.7308461025982678e-06,
"loss": 0.225,
"step": 40300
},
{
"epoch": 0.5383293136301252,
"grad_norm": 3.6407761573791504,
"learning_rate": 1.864090606262492e-06,
"loss": 0.1996,
"step": 40400
},
{
"epoch": 0.5396618119311898,
"grad_norm": 4.504887580871582,
"learning_rate": 1.9973351099267156e-06,
"loss": 0.244,
"step": 40500
},
{
"epoch": 0.5396618119311898,
"eval_dev_accuracy": 0.9687214938897978,
"eval_dev_accuracy_threshold": 0.9278361797332764,
"eval_dev_average_precision": 0.8391958373486473,
"eval_dev_f1": 0.772467364332722,
"eval_dev_f1_threshold": 0.8639750480651855,
"eval_dev_precision": 0.7608118159600468,
"eval_dev_recall": 0.7844855921989701,
"eval_loss": 0.2598799467086792,
"eval_runtime": 524.043,
"eval_samples_per_second": 253.122,
"eval_steps_per_second": 7.912,
"step": 40500
},
{
"epoch": 0.5409943102322544,
"grad_norm": 102.69219970703125,
"learning_rate": 2.1305796135909398e-06,
"loss": 0.2261,
"step": 40600
},
{
"epoch": 0.5423268085333192,
"grad_norm": 0.4366992115974426,
"learning_rate": 2.2638241172551636e-06,
"loss": 0.2146,
"step": 40700
},
{
"epoch": 0.5436593068343838,
"grad_norm": 0.5195454955101013,
"learning_rate": 2.3970686209193873e-06,
"loss": 0.2287,
"step": 40800
},
{
"epoch": 0.5449918051354484,
"grad_norm": 0.5551161170005798,
"learning_rate": 2.530313124583611e-06,
"loss": 0.2278,
"step": 40900
},
{
"epoch": 0.5463243034365132,
"grad_norm": 0.49544551968574524,
"learning_rate": 2.663557628247835e-06,
"loss": 0.2482,
"step": 41000
},
{
"epoch": 0.5463243034365132,
"eval_dev_accuracy": 0.9691587446380242,
"eval_dev_accuracy_threshold": 0.9283666610717773,
"eval_dev_average_precision": 0.8431961837252191,
"eval_dev_f1": 0.7750185715801761,
"eval_dev_f1_threshold": 0.6344282627105713,
"eval_dev_precision": 0.7514147546043831,
"eval_dev_recall": 0.8001533910375808,
"eval_loss": 0.275828093290329,
"eval_runtime": 522.4079,
"eval_samples_per_second": 253.915,
"eval_steps_per_second": 7.936,
"step": 41000
},
{
"epoch": 0.5476568017375778,
"grad_norm": 0.10281296074390411,
"learning_rate": 2.7968021319120587e-06,
"loss": 0.2163,
"step": 41100
},
{
"epoch": 0.5489893000386424,
"grad_norm": 1.15056312084198,
"learning_rate": 2.930046635576283e-06,
"loss": 0.2284,
"step": 41200
},
{
"epoch": 0.5503217983397071,
"grad_norm": 0.4747524559497833,
"learning_rate": 3.0632911392405066e-06,
"loss": 0.2382,
"step": 41300
},
{
"epoch": 0.5516542966407718,
"grad_norm": 0.4341018795967102,
"learning_rate": 3.1965356429047304e-06,
"loss": 0.2355,
"step": 41400
},
{
"epoch": 0.5529867949418364,
"grad_norm": 14.61008071899414,
"learning_rate": 3.3297801465689546e-06,
"loss": 0.2247,
"step": 41500
},
{
"epoch": 0.5529867949418364,
"eval_dev_accuracy": 0.9692039774740476,
"eval_dev_accuracy_threshold": 0.9339917302131653,
"eval_dev_average_precision": 0.8436933951228754,
"eval_dev_f1": 0.7787227299138979,
"eval_dev_f1_threshold": 0.5835311412811279,
"eval_dev_precision": 0.7518359853121175,
"eval_dev_recall": 0.8076038128629341,
"eval_loss": 0.2721947729587555,
"eval_runtime": 523.5606,
"eval_samples_per_second": 253.356,
"eval_steps_per_second": 7.919,
"step": 41500
},
{
"epoch": 0.5543192932429011,
"grad_norm": 0.17993593215942383,
"learning_rate": 3.4630246502331784e-06,
"loss": 0.2731,
"step": 41600
},
{
"epoch": 0.5556517915439658,
"grad_norm": 0.47082406282424927,
"learning_rate": 3.596269153897402e-06,
"loss": 0.2493,
"step": 41700
},
{
"epoch": 0.5569842898450305,
"grad_norm": 3.0138349533081055,
"learning_rate": 3.729513657561626e-06,
"loss": 0.2002,
"step": 41800
},
{
"epoch": 0.5583167881460951,
"grad_norm": 15.761974334716797,
"learning_rate": 3.862758161225849e-06,
"loss": 0.2301,
"step": 41900
},
{
"epoch": 0.5596492864471598,
"grad_norm": 0.34038063883781433,
"learning_rate": 3.996002664890073e-06,
"loss": 0.2136,
"step": 42000
},
{
"epoch": 0.5596492864471598,
"eval_dev_accuracy": 0.9691587446380242,
"eval_dev_accuracy_threshold": 0.9421218633651733,
"eval_dev_average_precision": 0.8475633374819089,
"eval_dev_f1": 0.7781878671310496,
"eval_dev_f1_threshold": 0.3623931407928467,
"eval_dev_precision": 0.7403560830860534,
"eval_dev_recall": 0.8200942259230853,
"eval_loss": 0.2632051110267639,
"eval_runtime": 523.1078,
"eval_samples_per_second": 253.575,
"eval_steps_per_second": 7.926,
"step": 42000
},
{
"epoch": 0.5609817847482245,
"grad_norm": 0.8982422351837158,
"learning_rate": 4.129247168554298e-06,
"loss": 0.2323,
"step": 42100
},
{
"epoch": 0.5623142830492891,
"grad_norm": 3.004122495651245,
"learning_rate": 4.2624916722185215e-06,
"loss": 0.2274,
"step": 42200
},
{
"epoch": 0.5636467813503537,
"grad_norm": 7.217723846435547,
"learning_rate": 4.395736175882745e-06,
"loss": 0.2233,
"step": 42300
},
{
"epoch": 0.5649792796514185,
"grad_norm": 1.1566057205200195,
"learning_rate": 4.528980679546969e-06,
"loss": 0.2819,
"step": 42400
},
{
"epoch": 0.5663117779524831,
"grad_norm": 0.2774888575077057,
"learning_rate": 4.662225183211193e-06,
"loss": 0.2002,
"step": 42500
},
{
"epoch": 0.5663117779524831,
"eval_dev_accuracy": 0.9700181685224694,
"eval_dev_accuracy_threshold": 0.9420008063316345,
"eval_dev_average_precision": 0.8490166145203218,
"eval_dev_f1": 0.7794501933730532,
"eval_dev_f1_threshold": 0.41960281133651733,
"eval_dev_precision": 0.7451783751374038,
"eval_dev_recall": 0.8170264051714693,
"eval_loss": 0.2606056034564972,
"eval_runtime": 524.5455,
"eval_samples_per_second": 252.88,
"eval_steps_per_second": 7.904,
"step": 42500
},
{
"epoch": 0.5676442762535477,
"grad_norm": 13.932589530944824,
"learning_rate": 4.795469686875417e-06,
"loss": 0.2599,
"step": 42600
},
{
"epoch": 0.5689767745546125,
"grad_norm": 10.140316009521484,
"learning_rate": 4.92871419053964e-06,
"loss": 0.2478,
"step": 42700
},
{
"epoch": 0.5703092728556771,
"grad_norm": 13.381287574768066,
"learning_rate": 5.061958694203864e-06,
"loss": 0.2151,
"step": 42800
},
{
"epoch": 0.5716417711567418,
"grad_norm": 3.821155548095703,
"learning_rate": 5.195203197868088e-06,
"loss": 0.2207,
"step": 42900
},
{
"epoch": 0.5729742694578065,
"grad_norm": 0.3303406834602356,
"learning_rate": 5.328447701532313e-06,
"loss": 0.2683,
"step": 43000
},
{
"epoch": 0.5729742694578065,
"eval_dev_accuracy": 0.9702820267326061,
"eval_dev_accuracy_threshold": 0.9166876673698425,
"eval_dev_average_precision": 0.8539072755077529,
"eval_dev_f1": 0.7817631806395852,
"eval_dev_f1_threshold": 0.4148586690425873,
"eval_dev_precision": 0.7710175812466702,
"eval_dev_recall": 0.7928125342390709,
"eval_loss": 0.2761251628398895,
"eval_runtime": 522.8877,
"eval_samples_per_second": 253.682,
"eval_steps_per_second": 7.929,
"step": 43000
},
{
"epoch": 0.5743067677588711,
"grad_norm": 2.869353771209717,
"learning_rate": 5.461692205196536e-06,
"loss": 0.2233,
"step": 43100
},
{
"epoch": 0.5756392660599358,
"grad_norm": 1.4524685144424438,
"learning_rate": 5.59493670886076e-06,
"loss": 0.2473,
"step": 43200
},
{
"epoch": 0.5769717643610004,
"grad_norm": 0.838426411151886,
"learning_rate": 5.728181212524984e-06,
"loss": 0.2289,
"step": 43300
},
{
"epoch": 0.5783042626620651,
"grad_norm": 33.507659912109375,
"learning_rate": 5.861425716189208e-06,
"loss": 0.2757,
"step": 43400
},
{
"epoch": 0.5796367609631298,
"grad_norm": 10.75368595123291,
"learning_rate": 5.9946702198534315e-06,
"loss": 0.2489,
"step": 43500
},
{
"epoch": 0.5796367609631298,
"eval_dev_accuracy": 0.9702367938965827,
"eval_dev_accuracy_threshold": 0.9455279111862183,
"eval_dev_average_precision": 0.8513893973961074,
"eval_dev_f1": 0.7795382036446223,
"eval_dev_f1_threshold": 0.6581396460533142,
"eval_dev_precision": 0.7695921417894512,
"eval_dev_recall": 0.7897447134874548,
"eval_loss": 0.24530762434005737,
"eval_runtime": 523.1112,
"eval_samples_per_second": 253.573,
"eval_steps_per_second": 7.926,
"step": 43500
},
{
"epoch": 0.5809692592641944,
"grad_norm": 4.178175449371338,
"learning_rate": 6.127914723517655e-06,
"loss": 0.2238,
"step": 43600
},
{
"epoch": 0.5823017575652591,
"grad_norm": 7.612859725952148,
"learning_rate": 6.261159227181879e-06,
"loss": 0.2342,
"step": 43700
},
{
"epoch": 0.5836342558663238,
"grad_norm": 19.10555648803711,
"learning_rate": 6.394403730846103e-06,
"loss": 0.2209,
"step": 43800
},
{
"epoch": 0.5849667541673884,
"grad_norm": 0.2660426199436188,
"learning_rate": 6.527648234510327e-06,
"loss": 0.1982,
"step": 43900
},
{
"epoch": 0.5862992524684532,
"grad_norm": 4.176153659820557,
"learning_rate": 6.660892738174551e-06,
"loss": 0.2577,
"step": 44000
},
{
"epoch": 0.5862992524684532,
"eval_dev_accuracy": 0.9705006521067193,
"eval_dev_accuracy_threshold": 0.9348860383033752,
"eval_dev_average_precision": 0.8544433474182094,
"eval_dev_f1": 0.7824561403508773,
"eval_dev_f1_threshold": 0.41301047801971436,
"eval_dev_precision": 0.759991738097697,
"eval_dev_recall": 0.806289032540813,
"eval_loss": 0.268686980009079,
"eval_runtime": 525.8935,
"eval_samples_per_second": 252.232,
"eval_steps_per_second": 7.884,
"step": 44000
},
{
"epoch": 0.5876317507695178,
"grad_norm": 2.451788902282715,
"learning_rate": 6.794137241838775e-06,
"loss": 0.1872,
"step": 44100
},
{
"epoch": 0.5889642490705824,
"grad_norm": 0.2053864449262619,
"learning_rate": 6.927381745502999e-06,
"loss": 0.2132,
"step": 44200
},
{
"epoch": 0.5902967473716471,
"grad_norm": 2.7442498207092285,
"learning_rate": 7.0606262491672225e-06,
"loss": 0.1735,
"step": 44300
},
{
"epoch": 0.5916292456727118,
"grad_norm": 14.928565979003906,
"learning_rate": 7.193870752831446e-06,
"loss": 0.2907,
"step": 44400
},
{
"epoch": 0.5929617439737764,
"grad_norm": 1.0581625699996948,
"learning_rate": 7.32711525649567e-06,
"loss": 0.2109,
"step": 44500
},
{
"epoch": 0.5929617439737764,
"eval_dev_accuracy": 0.9710132909149849,
"eval_dev_accuracy_threshold": 0.9184995889663696,
"eval_dev_average_precision": 0.8564900386871592,
"eval_dev_f1": 0.7874429836329488,
"eval_dev_f1_threshold": 0.42533212900161743,
"eval_dev_precision": 0.7716659655027346,
"eval_dev_recall": 0.8038786019502575,
"eval_loss": 0.2596043348312378,
"eval_runtime": 521.1196,
"eval_samples_per_second": 254.542,
"eval_steps_per_second": 7.956,
"step": 44500
},
{
"epoch": 0.5942942422748411,
"grad_norm": 7.90291166305542,
"learning_rate": 7.460359760159894e-06,
"loss": 0.2621,
"step": 44600
},
{
"epoch": 0.5956267405759058,
"grad_norm": 27.323461532592773,
"learning_rate": 7.593604263824118e-06,
"loss": 0.21,
"step": 44700
},
{
"epoch": 0.5969592388769704,
"grad_norm": 0.3570970296859741,
"learning_rate": 7.726848767488342e-06,
"loss": 0.216,
"step": 44800
},
{
"epoch": 0.5982917371780351,
"grad_norm": 0.6491680145263672,
"learning_rate": 7.860093271152565e-06,
"loss": 0.2136,
"step": 44900
},
{
"epoch": 0.5996242354790997,
"grad_norm": 20.47812271118164,
"learning_rate": 7.99333777481679e-06,
"loss": 0.2099,
"step": 45000
},
{
"epoch": 0.5996242354790997,
"eval_dev_accuracy": 0.9701463282245358,
"eval_dev_accuracy_threshold": 0.7721706628799438,
"eval_dev_average_precision": 0.8515314890810202,
"eval_dev_f1": 0.7854063375727528,
"eval_dev_f1_threshold": 0.46630430221557617,
"eval_dev_precision": 0.7728285077951003,
"eval_dev_recall": 0.7984003506080859,
"eval_loss": 0.27925005555152893,
"eval_runtime": 528.9897,
"eval_samples_per_second": 250.755,
"eval_steps_per_second": 7.838,
"step": 45000
},
{
"epoch": 0.6009567337801645,
"grad_norm": 0.4902491867542267,
"learning_rate": 8.126582278481013e-06,
"loss": 0.2536,
"step": 45100
},
{
"epoch": 0.6022892320812291,
"grad_norm": 0.5637998580932617,
"learning_rate": 8.259826782145237e-06,
"loss": 0.2247,
"step": 45200
},
{
"epoch": 0.6036217303822937,
"grad_norm": 1.9175264835357666,
"learning_rate": 8.39307128580946e-06,
"loss": 0.2349,
"step": 45300
},
{
"epoch": 0.6049542286833585,
"grad_norm": 76.62299346923828,
"learning_rate": 8.526315789473685e-06,
"loss": 0.1836,
"step": 45400
},
{
"epoch": 0.6062867269844231,
"grad_norm": 1.5868983268737793,
"learning_rate": 8.659560293137908e-06,
"loss": 0.2635,
"step": 45500
},
{
"epoch": 0.6062867269844231,
"eval_dev_accuracy": 0.9700709401644968,
"eval_dev_accuracy_threshold": 0.9073478579521179,
"eval_dev_average_precision": 0.85367208401453,
"eval_dev_f1": 0.7835151777033597,
"eval_dev_f1_threshold": 0.5480349659919739,
"eval_dev_precision": 0.7726643229998935,
"eval_dev_recall": 0.7946751396954093,
"eval_loss": 0.27641019225120544,
"eval_runtime": 535.7653,
"eval_samples_per_second": 247.584,
"eval_steps_per_second": 7.738,
"step": 45500
},
{
"epoch": 0.6076192252854877,
"grad_norm": 0.3646801710128784,
"learning_rate": 8.792804796802133e-06,
"loss": 0.2259,
"step": 45600
},
{
"epoch": 0.6089517235865525,
"grad_norm": 0.1534300446510315,
"learning_rate": 8.926049300466355e-06,
"loss": 0.1824,
"step": 45700
},
{
"epoch": 0.6102842218876171,
"grad_norm": 4.515030384063721,
"learning_rate": 9.05929380413058e-06,
"loss": 0.2108,
"step": 45800
},
{
"epoch": 0.6116167201886817,
"grad_norm": 27.513139724731445,
"learning_rate": 9.192538307794803e-06,
"loss": 0.1652,
"step": 45900
},
{
"epoch": 0.6129492184897464,
"grad_norm": 0.3283866345882416,
"learning_rate": 9.325782811459028e-06,
"loss": 0.2599,
"step": 46000
},
{
"epoch": 0.6129492184897464,
"eval_dev_accuracy": 0.9699880132984537,
"eval_dev_accuracy_threshold": 0.9482549428939819,
"eval_dev_average_precision": 0.8491188703823201,
"eval_dev_f1": 0.7826180027828322,
"eval_dev_f1_threshold": 0.9011486768722534,
"eval_dev_precision": 0.7649335704571608,
"eval_dev_recall": 0.8011394762791717,
"eval_loss": 0.2594774067401886,
"eval_runtime": 527.4017,
"eval_samples_per_second": 251.51,
"eval_steps_per_second": 7.861,
"step": 46000
},
{
"epoch": 0.6142817167908111,
"grad_norm": 0.6060785055160522,
"learning_rate": 9.459027315123252e-06,
"loss": 0.231,
"step": 46100
},
{
"epoch": 0.6156142150918757,
"grad_norm": 1.9709681272506714,
"learning_rate": 9.592271818787475e-06,
"loss": 0.2364,
"step": 46200
},
{
"epoch": 0.6169467133929404,
"grad_norm": 0.13106560707092285,
"learning_rate": 9.7255163224517e-06,
"loss": 0.1774,
"step": 46300
},
{
"epoch": 0.6182792116940051,
"grad_norm": 53.972103118896484,
"learning_rate": 9.858760826115924e-06,
"loss": 0.2322,
"step": 46400
},
{
"epoch": 0.6196117099950698,
"grad_norm": 12.795185089111328,
"learning_rate": 9.992005329780147e-06,
"loss": 0.2283,
"step": 46500
},
{
"epoch": 0.6196117099950698,
"eval_dev_accuracy": 0.9702518715085905,
"eval_dev_accuracy_threshold": 0.8647300004959106,
"eval_dev_average_precision": 0.8569022880485853,
"eval_dev_f1": 0.7869809918232983,
"eval_dev_f1_threshold": 0.43426772952079773,
"eval_dev_precision": 0.7634696610693315,
"eval_dev_recall": 0.8119864139366714,
"eval_loss": 0.26569852232933044,
"eval_runtime": 528.8452,
"eval_samples_per_second": 250.824,
"eval_steps_per_second": 7.84,
"step": 46500
},
{
"epoch": 0.6209442082961344,
"grad_norm": 6.9099507331848145,
"learning_rate": 1.012524983344437e-05,
"loss": 0.2275,
"step": 46600
},
{
"epoch": 0.6222767065971991,
"grad_norm": 3.897141456604004,
"learning_rate": 1.0258494337108595e-05,
"loss": 0.1867,
"step": 46700
},
{
"epoch": 0.6236092048982638,
"grad_norm": 1.8539767265319824,
"learning_rate": 1.0391738840772818e-05,
"loss": 0.276,
"step": 46800
},
{
"epoch": 0.6249417031993284,
"grad_norm": 17.823284149169922,
"learning_rate": 1.0524983344437042e-05,
"loss": 0.2208,
"step": 46900
},
{
"epoch": 0.626274201500393,
"grad_norm": 0.8377816081047058,
"learning_rate": 1.0658227848101265e-05,
"loss": 0.2644,
"step": 47000
},
{
"epoch": 0.626274201500393,
"eval_dev_accuracy": 0.9708625147949068,
"eval_dev_accuracy_threshold": 0.8370188474655151,
"eval_dev_average_precision": 0.8568328618718613,
"eval_dev_f1": 0.7867207514944491,
"eval_dev_f1_threshold": 0.3532576858997345,
"eval_dev_precision": 0.766989280882506,
"eval_dev_recall": 0.8074942478360907,
"eval_loss": 0.2608221769332886,
"eval_runtime": 528.9364,
"eval_samples_per_second": 250.781,
"eval_steps_per_second": 7.838,
"step": 47000
},
{
"epoch": 0.6276066998014578,
"grad_norm": 23.196794509887695,
"learning_rate": 1.079147235176549e-05,
"loss": 0.1944,
"step": 47100
},
{
"epoch": 0.6289391981025224,
"grad_norm": 0.2909054458141327,
"learning_rate": 1.0924716855429713e-05,
"loss": 0.2221,
"step": 47200
},
{
"epoch": 0.630271696403587,
"grad_norm": 15.759045600891113,
"learning_rate": 1.1057961359093938e-05,
"loss": 0.2392,
"step": 47300
},
{
"epoch": 0.6316041947046518,
"grad_norm": 4.435680866241455,
"learning_rate": 1.1191205862758164e-05,
"loss": 0.1809,
"step": 47400
},
{
"epoch": 0.6329366930057164,
"grad_norm": 3.936431646347046,
"learning_rate": 1.1324450366422385e-05,
"loss": 0.1708,
"step": 47500
},
{
"epoch": 0.6329366930057164,
"eval_dev_accuracy": 0.9702594103145944,
"eval_dev_accuracy_threshold": 0.9633700847625732,
"eval_dev_average_precision": 0.8539832745264263,
"eval_dev_f1": 0.7859069988890653,
"eval_dev_f1_threshold": 0.7301878929138184,
"eval_dev_precision": 0.7598199672667758,
"eval_dev_recall": 0.8138490193930098,
"eval_loss": 0.2916560173034668,
"eval_runtime": 524.8425,
"eval_samples_per_second": 252.737,
"eval_steps_per_second": 7.9,
"step": 47500
},
{
"epoch": 0.634269191306781,
"grad_norm": 1.574413776397705,
"learning_rate": 1.1457694870086611e-05,
"loss": 0.2181,
"step": 47600
},
{
"epoch": 0.6356016896078458,
"grad_norm": 4.340725421905518,
"learning_rate": 1.1590939373750833e-05,
"loss": 0.2258,
"step": 47700
},
{
"epoch": 0.6369341879089104,
"grad_norm": 5.916915416717529,
"learning_rate": 1.1724183877415059e-05,
"loss": 0.2808,
"step": 47800
},
{
"epoch": 0.6382666862099751,
"grad_norm": 15.759284019470215,
"learning_rate": 1.1857428381079282e-05,
"loss": 0.2394,
"step": 47900
},
{
"epoch": 0.6395991845110397,
"grad_norm": 14.555028915405273,
"learning_rate": 1.1990672884743507e-05,
"loss": 0.2267,
"step": 48000
},
{
"epoch": 0.6395991845110397,
"eval_dev_accuracy": 0.9713826924091762,
"eval_dev_accuracy_threshold": 0.9341762065887451,
"eval_dev_average_precision": 0.8563315677126753,
"eval_dev_f1": 0.7862142099681866,
"eval_dev_f1_threshold": 0.4216569662094116,
"eval_dev_precision": 0.7617384156991678,
"eval_dev_recall": 0.8123151090172017,
"eval_loss": 0.26165512204170227,
"eval_runtime": 525.2884,
"eval_samples_per_second": 252.522,
"eval_steps_per_second": 7.893,
"step": 48000
},
{
"epoch": 0.6409316828121044,
"grad_norm": 8.00622844696045,
"learning_rate": 1.212391738840773e-05,
"loss": 0.2583,
"step": 48100
},
{
"epoch": 0.6422641811131691,
"grad_norm": 13.320343017578125,
"learning_rate": 1.2257161892071954e-05,
"loss": 0.2188,
"step": 48200
},
{
"epoch": 0.6435966794142337,
"grad_norm": 2.9494426250457764,
"learning_rate": 1.2390406395736177e-05,
"loss": 0.1877,
"step": 48300
},
{
"epoch": 0.6449291777152985,
"grad_norm": 0.39628902077674866,
"learning_rate": 1.2523650899400402e-05,
"loss": 0.2324,
"step": 48400
},
{
"epoch": 0.6462616760163631,
"grad_norm": 0.1506374627351761,
"learning_rate": 1.2656895403064625e-05,
"loss": 0.2239,
"step": 48500
},
{
"epoch": 0.6462616760163631,
"eval_dev_accuracy": 0.9706514282267974,
"eval_dev_accuracy_threshold": 0.8615503311157227,
"eval_dev_average_precision": 0.8586570982605375,
"eval_dev_f1": 0.7870691958322201,
"eval_dev_f1_threshold": 0.24849581718444824,
"eval_dev_precision": 0.7681476846057572,
"eval_dev_recall": 0.8069464227018736,
"eval_loss": 0.28418707847595215,
"eval_runtime": 533.0754,
"eval_samples_per_second": 248.833,
"eval_steps_per_second": 7.778,
"step": 48500
},
{
"epoch": 0.6475941743174277,
"grad_norm": 0.48906368017196655,
"learning_rate": 1.279013990672885e-05,
"loss": 0.22,
"step": 48600
},
{
"epoch": 0.6489266726184925,
"grad_norm": 71.81077575683594,
"learning_rate": 1.2923384410393072e-05,
"loss": 0.2079,
"step": 48700
},
{
"epoch": 0.6502591709195571,
"grad_norm": 17.413375854492188,
"learning_rate": 1.3056628914057297e-05,
"loss": 0.2212,
"step": 48800
},
{
"epoch": 0.6515916692206217,
"grad_norm": 0.7448732852935791,
"learning_rate": 1.318987341772152e-05,
"loss": 0.2106,
"step": 48900
},
{
"epoch": 0.6529241675216864,
"grad_norm": 0.6357948780059814,
"learning_rate": 1.3323117921385744e-05,
"loss": 0.2095,
"step": 49000
},
{
"epoch": 0.6529241675216864,
"eval_dev_accuracy": 0.971164067035063,
"eval_dev_accuracy_threshold": 0.925714373588562,
"eval_dev_average_precision": 0.8570638757463108,
"eval_dev_f1": 0.7913554743365645,
"eval_dev_f1_threshold": 0.5317444801330566,
"eval_dev_precision": 0.7659967186218212,
"eval_dev_recall": 0.8184507505204339,
"eval_loss": 0.2687513828277588,
"eval_runtime": 529.1402,
"eval_samples_per_second": 250.684,
"eval_steps_per_second": 7.835,
"step": 49000
},
{
"epoch": 0.6542566658227511,
"grad_norm": 12.15365982055664,
"learning_rate": 1.3456362425049967e-05,
"loss": 0.2359,
"step": 49100
},
{
"epoch": 0.6555891641238157,
"grad_norm": 12.457159996032715,
"learning_rate": 1.3589606928714192e-05,
"loss": 0.2392,
"step": 49200
},
{
"epoch": 0.6569216624248804,
"grad_norm": 0.6378312110900879,
"learning_rate": 1.3722851432378415e-05,
"loss": 0.2185,
"step": 49300
},
{
"epoch": 0.6582541607259451,
"grad_norm": 10.198519706726074,
"learning_rate": 1.385609593604264e-05,
"loss": 0.2497,
"step": 49400
},
{
"epoch": 0.6595866590270097,
"grad_norm": 0.6230494976043701,
"learning_rate": 1.3989340439706862e-05,
"loss": 0.2357,
"step": 49500
},
{
"epoch": 0.6595866590270097,
"eval_dev_accuracy": 0.9700030909104616,
"eval_dev_accuracy_threshold": 0.5345156192779541,
"eval_dev_average_precision": 0.8443688741553218,
"eval_dev_f1": 0.785516801361123,
"eval_dev_f1_threshold": 0.39208123087882996,
"eval_dev_precision": 0.7630410081603141,
"eval_dev_recall": 0.809356853292429,
"eval_loss": 0.270622581243515,
"eval_runtime": 527.7067,
"eval_samples_per_second": 251.365,
"eval_steps_per_second": 7.857,
"step": 49500
},
{
"epoch": 0.6609191573280744,
"grad_norm": 6.028562068939209,
"learning_rate": 1.4122584943371087e-05,
"loss": 0.2147,
"step": 49600
},
{
"epoch": 0.6622516556291391,
"grad_norm": 7.488621711730957,
"learning_rate": 1.4255829447035312e-05,
"loss": 0.2252,
"step": 49700
},
{
"epoch": 0.6635841539302038,
"grad_norm": 3.221320152282715,
"learning_rate": 1.4389073950699535e-05,
"loss": 0.2296,
"step": 49800
},
{
"epoch": 0.6649166522312684,
"grad_norm": 33.004817962646484,
"learning_rate": 1.452231845436376e-05,
"loss": 0.2434,
"step": 49900
},
{
"epoch": 0.666249150532333,
"grad_norm": 6.759824752807617,
"learning_rate": 1.4655562958027982e-05,
"loss": 0.2449,
"step": 50000
},
{
"epoch": 0.666249150532333,
"eval_dev_accuracy": 0.9705534237487466,
"eval_dev_accuracy_threshold": 0.9030373096466064,
"eval_dev_average_precision": 0.8517374123261313,
"eval_dev_f1": 0.7881202847731378,
"eval_dev_f1_threshold": 0.5092203617095947,
"eval_dev_precision": 0.7650335224342445,
"eval_dev_recall": 0.812643804097732,
"eval_loss": 0.24229487776756287,
"eval_runtime": 528.3673,
"eval_samples_per_second": 251.051,
"eval_steps_per_second": 7.847,
"step": 50000
},
{
"epoch": 0.6675816488333978,
"grad_norm": 0.4978267252445221,
"learning_rate": 1.4788807461692207e-05,
"loss": 0.3087,
"step": 50100
},
{
"epoch": 0.6689141471344624,
"grad_norm": 17.420612335205078,
"learning_rate": 1.492205196535643e-05,
"loss": 0.2188,
"step": 50200
},
{
"epoch": 0.670246645435527,
"grad_norm": 0.26254966855049133,
"learning_rate": 1.5055296469020654e-05,
"loss": 0.2214,
"step": 50300
},
{
"epoch": 0.6715791437365918,
"grad_norm": 16.93143653869629,
"learning_rate": 1.5188540972684877e-05,
"loss": 0.2141,
"step": 50400
},
{
"epoch": 0.6729116420376564,
"grad_norm": 5.481032848358154,
"learning_rate": 1.5321785476349102e-05,
"loss": 0.2534,
"step": 50500
},
{
"epoch": 0.6729116420376564,
"eval_dev_accuracy": 0.9701538670305397,
"eval_dev_accuracy_threshold": 0.9412756562232971,
"eval_dev_average_precision": 0.8418413944064206,
"eval_dev_f1": 0.78390731292517,
"eval_dev_f1_threshold": 0.8259508013725281,
"eval_dev_precision": 0.7611724636185365,
"eval_dev_recall": 0.8080420729703078,
"eval_loss": 0.28124794363975525,
"eval_runtime": 528.2314,
"eval_samples_per_second": 251.115,
"eval_steps_per_second": 7.849,
"step": 50500
},
{
"epoch": 0.674244140338721,
"grad_norm": 0.13247288763523102,
"learning_rate": 1.319120586275816e-07,
"loss": 0.2242,
"step": 50600
},
{
"epoch": 0.6755766386397858,
"grad_norm": 50.61308670043945,
"learning_rate": 2.651565622918055e-07,
"loss": 0.199,
"step": 50700
},
{
"epoch": 0.6769091369408504,
"grad_norm": 9.46574592590332,
"learning_rate": 3.984010659560293e-07,
"loss": 0.2019,
"step": 50800
},
{
"epoch": 0.678241635241915,
"grad_norm": 0.4613121449947357,
"learning_rate": 5.316455696202532e-07,
"loss": 0.2324,
"step": 50900
},
{
"epoch": 0.6795741335429797,
"grad_norm": 0.06632626801729202,
"learning_rate": 6.64890073284477e-07,
"loss": 0.2095,
"step": 51000
},
{
"epoch": 0.6795741335429797,
"eval_dev_accuracy": 0.9702820267326061,
"eval_dev_accuracy_threshold": 0.9349472522735596,
"eval_dev_average_precision": 0.8438139930773977,
"eval_dev_f1": 0.7839174599797903,
"eval_dev_f1_threshold": 0.7425632476806641,
"eval_dev_precision": 0.7616783794956593,
"eval_dev_recall": 0.8074942478360907,
"eval_loss": 0.2824593782424927,
"eval_runtime": 534.9937,
"eval_samples_per_second": 247.941,
"eval_steps_per_second": 7.75,
"step": 51000
},
{
"epoch": 0.6809066318440444,
"grad_norm": 0.5744990706443787,
"learning_rate": 7.981345769487009e-07,
"loss": 0.2757,
"step": 51100
},
{
"epoch": 0.6822391301451091,
"grad_norm": 44.8016471862793,
"learning_rate": 9.313790806129248e-07,
"loss": 0.2954,
"step": 51200
},
{
"epoch": 0.6835716284461737,
"grad_norm": 18.677654266357422,
"learning_rate": 1.0646235842771487e-06,
"loss": 0.2051,
"step": 51300
},
{
"epoch": 0.6849041267472384,
"grad_norm": 7.698785305023193,
"learning_rate": 1.1978680879413725e-06,
"loss": 0.2575,
"step": 51400
},
{
"epoch": 0.6862366250483031,
"grad_norm": 1.6236628293991089,
"learning_rate": 1.3311125916055965e-06,
"loss": 0.1763,
"step": 51500
},
{
"epoch": 0.6862366250483031,
"eval_dev_accuracy": 0.9702669491205983,
"eval_dev_accuracy_threshold": 0.9349033832550049,
"eval_dev_average_precision": 0.8468881842158165,
"eval_dev_f1": 0.783245178180264,
"eval_dev_f1_threshold": 0.763167142868042,
"eval_dev_precision": 0.7643378519290928,
"eval_dev_recall": 0.8031116467623535,
"eval_loss": 0.2643745541572571,
"eval_runtime": 526.1955,
"eval_samples_per_second": 252.087,
"eval_steps_per_second": 7.879,
"step": 51500
},
{
"epoch": 0.6875691233493677,
"grad_norm": 28.033424377441406,
"learning_rate": 1.4643570952698202e-06,
"loss": 0.2108,
"step": 51600
},
{
"epoch": 0.6889016216504324,
"grad_norm": 19.735244750976562,
"learning_rate": 1.597601598934044e-06,
"loss": 0.2313,
"step": 51700
},
{
"epoch": 0.6902341199514971,
"grad_norm": 2.9967164993286133,
"learning_rate": 1.7308461025982678e-06,
"loss": 0.2344,
"step": 51800
},
{
"epoch": 0.6915666182525617,
"grad_norm": 1.428648591041565,
"learning_rate": 1.864090606262492e-06,
"loss": 0.1968,
"step": 51900
},
{
"epoch": 0.6928991165536263,
"grad_norm": 0.3774360418319702,
"learning_rate": 1.9973351099267156e-06,
"loss": 0.2222,
"step": 52000
},
{
"epoch": 0.6928991165536263,
"eval_dev_accuracy": 0.9705609625547506,
"eval_dev_accuracy_threshold": 0.9212765693664551,
"eval_dev_average_precision": 0.8504652727472383,
"eval_dev_f1": 0.786851950828434,
"eval_dev_f1_threshold": 0.6886965036392212,
"eval_dev_precision": 0.7681310654283627,
"eval_dev_recall": 0.8065081625944999,
"eval_loss": 0.26056790351867676,
"eval_runtime": 524.9198,
"eval_samples_per_second": 252.7,
"eval_steps_per_second": 7.898,
"step": 52000
},
{
"epoch": 0.6942316148546911,
"grad_norm": 5.1444525718688965,
"learning_rate": 2.1305796135909398e-06,
"loss": 0.2213,
"step": 52100
},
{
"epoch": 0.6955641131557557,
"grad_norm": 0.18948954343795776,
"learning_rate": 2.2638241172551636e-06,
"loss": 0.2055,
"step": 52200
},
{
"epoch": 0.6968966114568204,
"grad_norm": 13.482624053955078,
"learning_rate": 2.3970686209193873e-06,
"loss": 0.2321,
"step": 52300
},
{
"epoch": 0.6982291097578851,
"grad_norm": 0.6994342803955078,
"learning_rate": 2.530313124583611e-06,
"loss": 0.257,
"step": 52400
},
{
"epoch": 0.6995616080589497,
"grad_norm": 0.9283449053764343,
"learning_rate": 2.663557628247835e-06,
"loss": 0.2398,
"step": 52500
},
{
"epoch": 0.6995616080589497,
"eval_dev_accuracy": 0.9711112953930356,
"eval_dev_accuracy_threshold": 0.9353954195976257,
"eval_dev_average_precision": 0.854664598144776,
"eval_dev_f1": 0.789044289044289,
"eval_dev_f1_threshold": 0.7551745176315308,
"eval_dev_precision": 0.7638732177659248,
"eval_dev_recall": 0.8159307549030349,
"eval_loss": 0.2365955263376236,
"eval_runtime": 526.0912,
"eval_samples_per_second": 252.137,
"eval_steps_per_second": 7.881,
"step": 52500
},
{
"epoch": 0.7008941063600144,
"grad_norm": 69.95816040039062,
"learning_rate": 2.7968021319120587e-06,
"loss": 0.2168,
"step": 52600
},
{
"epoch": 0.7022266046610791,
"grad_norm": 13.763835906982422,
"learning_rate": 2.930046635576283e-06,
"loss": 0.2066,
"step": 52700
},
{
"epoch": 0.7035591029621437,
"grad_norm": 2.3356781005859375,
"learning_rate": 3.0632911392405066e-06,
"loss": 0.222,
"step": 52800
},
{
"epoch": 0.7048916012632084,
"grad_norm": 4.479837417602539,
"learning_rate": 3.1965356429047304e-06,
"loss": 0.269,
"step": 52900
},
{
"epoch": 0.706224099564273,
"grad_norm": 15.155440330505371,
"learning_rate": 3.3297801465689546e-06,
"loss": 0.2327,
"step": 53000
},
{
"epoch": 0.706224099564273,
"eval_dev_accuracy": 0.971005752108981,
"eval_dev_accuracy_threshold": 0.9340351819992065,
"eval_dev_average_precision": 0.8546100599663748,
"eval_dev_f1": 0.7908306421726932,
"eval_dev_f1_threshold": 0.7827771306037903,
"eval_dev_precision": 0.7651096086867445,
"eval_dev_recall": 0.8183411854935905,
"eval_loss": 0.24670535326004028,
"eval_runtime": 524.1368,
"eval_samples_per_second": 253.077,
"eval_steps_per_second": 7.91,
"step": 53000
},
{
"epoch": 0.7075565978653378,
"grad_norm": 30.88198471069336,
"learning_rate": 3.4630246502331784e-06,
"loss": 0.2274,
"step": 53100
},
{
"epoch": 0.7088890961664024,
"grad_norm": 19.670501708984375,
"learning_rate": 3.596269153897402e-06,
"loss": 0.1619,
"step": 53200
},
{
"epoch": 0.710221594467467,
"grad_norm": 1.817409873008728,
"learning_rate": 3.729513657561626e-06,
"loss": 0.2105,
"step": 53300
},
{
"epoch": 0.7115540927685318,
"grad_norm": 7.859726428985596,
"learning_rate": 3.862758161225849e-06,
"loss": 0.2314,
"step": 53400
},
{
"epoch": 0.7128865910695964,
"grad_norm": 1.2846513986587524,
"learning_rate": 3.996002664890073e-06,
"loss": 0.2118,
"step": 53500
},
{
"epoch": 0.7128865910695964,
"eval_dev_accuracy": 0.9711263730050435,
"eval_dev_accuracy_threshold": 0.956214189529419,
"eval_dev_average_precision": 0.8529387869562187,
"eval_dev_f1": 0.7885323513940031,
"eval_dev_f1_threshold": 0.7215464115142822,
"eval_dev_precision": 0.7583729636749975,
"eval_dev_recall": 0.8211898761915196,
"eval_loss": 0.251621812582016,
"eval_runtime": 524.6134,
"eval_samples_per_second": 252.847,
"eval_steps_per_second": 7.903,
"step": 53500
},
{
"epoch": 0.714219089370661,
"grad_norm": 29.144947052001953,
"learning_rate": 4.129247168554298e-06,
"loss": 0.223,
"step": 53600
},
{
"epoch": 0.7155515876717257,
"grad_norm": 1.1121717691421509,
"learning_rate": 4.2624916722185215e-06,
"loss": 0.2177,
"step": 53700
},
{
"epoch": 0.7168840859727904,
"grad_norm": 20.09768295288086,
"learning_rate": 4.395736175882745e-06,
"loss": 0.2092,
"step": 53800
},
{
"epoch": 0.718216584273855,
"grad_norm": 0.34697094559669495,
"learning_rate": 4.528980679546969e-06,
"loss": 0.2112,
"step": 53900
},
{
"epoch": 0.7195490825749197,
"grad_norm": 27.53289222717285,
"learning_rate": 4.662225183211193e-06,
"loss": 0.2188,
"step": 54000
},
{
"epoch": 0.7195490825749197,
"eval_dev_accuracy": 0.9717445550973637,
"eval_dev_accuracy_threshold": 0.9209288358688354,
"eval_dev_average_precision": 0.8572864419695019,
"eval_dev_f1": 0.7925902130849127,
"eval_dev_f1_threshold": 0.5230389833450317,
"eval_dev_precision": 0.7749973824730395,
"eval_dev_recall": 0.8110003286950805,
"eval_loss": 0.2652234435081482,
"eval_runtime": 524.205,
"eval_samples_per_second": 253.044,
"eval_steps_per_second": 7.909,
"step": 54000
},
{
"epoch": 0.7208815808759844,
"grad_norm": 0.12331326305866241,
"learning_rate": 4.795469686875417e-06,
"loss": 0.1995,
"step": 54100
},
{
"epoch": 0.722214079177049,
"grad_norm": 26.130399703979492,
"learning_rate": 4.92871419053964e-06,
"loss": 0.1863,
"step": 54200
},
{
"epoch": 0.7235465774781137,
"grad_norm": 63.348262786865234,
"learning_rate": 5.061958694203864e-06,
"loss": 0.1885,
"step": 54300
},
{
"epoch": 0.7248790757791784,
"grad_norm": 4.434421539306641,
"learning_rate": 5.195203197868088e-06,
"loss": 0.2059,
"step": 54400
},
{
"epoch": 0.7262115740802431,
"grad_norm": 1.5990498065948486,
"learning_rate": 5.328447701532313e-06,
"loss": 0.1944,
"step": 54500
},
{
"epoch": 0.7262115740802431,
"eval_dev_accuracy": 0.9710962177810278,
"eval_dev_accuracy_threshold": 0.938183069229126,
"eval_dev_average_precision": 0.8581458729833185,
"eval_dev_f1": 0.79388743943347,
"eval_dev_f1_threshold": 0.692324697971344,
"eval_dev_precision": 0.7722187694220013,
"eval_dev_recall": 0.8168072751177824,
"eval_loss": 0.24563372135162354,
"eval_runtime": 524.5271,
"eval_samples_per_second": 252.889,
"eval_steps_per_second": 7.904,
"step": 54500
},
{
"epoch": 0.7275440723813077,
"grad_norm": 13.777716636657715,
"learning_rate": 5.461692205196536e-06,
"loss": 0.2015,
"step": 54600
},
{
"epoch": 0.7288765706823723,
"grad_norm": 0.40915578603744507,
"learning_rate": 5.59493670886076e-06,
"loss": 0.1804,
"step": 54700
},
{
"epoch": 0.7302090689834371,
"grad_norm": 2.3663179874420166,
"learning_rate": 5.728181212524984e-06,
"loss": 0.2424,
"step": 54800
},
{
"epoch": 0.7315415672845017,
"grad_norm": 19.617507934570312,
"learning_rate": 5.861425716189208e-06,
"loss": 0.2331,
"step": 54900
},
{
"epoch": 0.7328740655855663,
"grad_norm": 1.4067281484603882,
"learning_rate": 5.9946702198534315e-06,
"loss": 0.197,
"step": 55000
},
{
"epoch": 0.7328740655855663,
"eval_dev_accuracy": 0.9715711625592739,
"eval_dev_accuracy_threshold": 0.9351357221603394,
"eval_dev_average_precision": 0.8584440513483999,
"eval_dev_f1": 0.7944548676255994,
"eval_dev_f1_threshold": 0.3239399194717407,
"eval_dev_precision": 0.7575787695060133,
"eval_dev_recall": 0.8351046346006354,
"eval_loss": 0.271222859621048,
"eval_runtime": 526.0703,
"eval_samples_per_second": 252.147,
"eval_steps_per_second": 7.881,
"step": 55000
},
{
"epoch": 0.7342065638866311,
"grad_norm": 0.45710641145706177,
"learning_rate": 6.127914723517655e-06,
"loss": 0.2503,
"step": 55100
},
{
"epoch": 0.7355390621876957,
"grad_norm": 0.6267761588096619,
"learning_rate": 6.261159227181879e-06,
"loss": 0.2421,
"step": 55200
},
{
"epoch": 0.7368715604887603,
"grad_norm": 11.160945892333984,
"learning_rate": 6.394403730846103e-06,
"loss": 0.2169,
"step": 55300
},
{
"epoch": 0.7382040587898251,
"grad_norm": 0.22500374913215637,
"learning_rate": 6.527648234510327e-06,
"loss": 0.1801,
"step": 55400
},
{
"epoch": 0.7395365570908897,
"grad_norm": 0.34952008724212646,
"learning_rate": 6.660892738174551e-06,
"loss": 0.2168,
"step": 55500
},
{
"epoch": 0.7395365570908897,
"eval_dev_accuracy": 0.9718576371874222,
"eval_dev_accuracy_threshold": 0.9311728477478027,
"eval_dev_average_precision": 0.8606955219787713,
"eval_dev_f1": 0.7966432680635458,
"eval_dev_f1_threshold": 0.3317277133464813,
"eval_dev_precision": 0.7685336048879837,
"eval_dev_recall": 0.8268872575873781,
"eval_loss": 0.24974019825458527,
"eval_runtime": 524.3487,
"eval_samples_per_second": 252.975,
"eval_steps_per_second": 7.907,
"step": 55500
},
{
"epoch": 0.7408690553919544,
"grad_norm": 13.866408348083496,
"learning_rate": 6.794137241838775e-06,
"loss": 0.2266,
"step": 55600
},
{
"epoch": 0.742201553693019,
"grad_norm": 9.584277153015137,
"learning_rate": 6.927381745502999e-06,
"loss": 0.1882,
"step": 55700
},
{
"epoch": 0.7435340519940837,
"grad_norm": 52.4222297668457,
"learning_rate": 7.0606262491672225e-06,
"loss": 0.2214,
"step": 55800
},
{
"epoch": 0.7448665502951484,
"grad_norm": 15.216498374938965,
"learning_rate": 7.193870752831446e-06,
"loss": 0.23,
"step": 55900
},
{
"epoch": 0.746199048596213,
"grad_norm": 21.095590591430664,
"learning_rate": 7.32711525649567e-06,
"loss": 0.2355,
"step": 56000
},
{
"epoch": 0.746199048596213,
"eval_dev_accuracy": 0.9719631804714769,
"eval_dev_accuracy_threshold": 0.9183558821678162,
"eval_dev_average_precision": 0.8589405860687593,
"eval_dev_f1": 0.7974690109434157,
"eval_dev_f1_threshold": 0.33763912320137024,
"eval_dev_precision": 0.7571400433326768,
"eval_dev_recall": 0.8423359263723019,
"eval_loss": 0.24558140337467194,
"eval_runtime": 523.2596,
"eval_samples_per_second": 253.501,
"eval_steps_per_second": 7.923,
"step": 56000
},
{
"epoch": 0.7475315468972777,
"grad_norm": 87.9457778930664,
"learning_rate": 7.460359760159894e-06,
"loss": 0.2105,
"step": 56100
},
{
"epoch": 0.7488640451983424,
"grad_norm": 1.1765731573104858,
"learning_rate": 7.593604263824118e-06,
"loss": 0.1608,
"step": 56200
},
{
"epoch": 0.750196543499407,
"grad_norm": 12.082050323486328,
"learning_rate": 7.726848767488342e-06,
"loss": 0.214,
"step": 56300
},
{
"epoch": 0.7515290418004718,
"grad_norm": 17.673494338989258,
"learning_rate": 7.860093271152565e-06,
"loss": 0.2531,
"step": 56400
},
{
"epoch": 0.7528615401015364,
"grad_norm": 4.850943565368652,
"learning_rate": 7.99333777481679e-06,
"loss": 0.2641,
"step": 56500
},
{
"epoch": 0.7528615401015364,
"eval_dev_accuracy": 0.9718953312174418,
"eval_dev_accuracy_threshold": 0.9289690852165222,
"eval_dev_average_precision": 0.8607199959963239,
"eval_dev_f1": 0.7934619562406249,
"eval_dev_f1_threshold": 0.2598855793476105,
"eval_dev_precision": 0.7515187144816774,
"eval_dev_recall": 0.8403637558891202,
"eval_loss": 0.24914328753948212,
"eval_runtime": 526.4308,
"eval_samples_per_second": 251.974,
"eval_steps_per_second": 7.876,
"step": 56500
},
{
"epoch": 0.754194038402601,
"grad_norm": 21.872079849243164,
"learning_rate": 8.126582278481013e-06,
"loss": 0.2002,
"step": 56600
},
{
"epoch": 0.7555265367036657,
"grad_norm": 0.3463062345981598,
"learning_rate": 8.259826782145237e-06,
"loss": 0.1727,
"step": 56700
},
{
"epoch": 0.7568590350047304,
"grad_norm": 4.641270637512207,
"learning_rate": 8.39307128580946e-06,
"loss": 0.2135,
"step": 56800
},
{
"epoch": 0.758191533305795,
"grad_norm": 1.456807017326355,
"learning_rate": 8.526315789473685e-06,
"loss": 0.1694,
"step": 56900
},
{
"epoch": 0.7595240316068597,
"grad_norm": 0.2848343551158905,
"learning_rate": 8.659560293137908e-06,
"loss": 0.1969,
"step": 57000
},
{
"epoch": 0.7595240316068597,
"eval_dev_accuracy": 0.9716917834553364,
"eval_dev_accuracy_threshold": 0.9249356389045715,
"eval_dev_average_precision": 0.8628574223791167,
"eval_dev_f1": 0.7945488333677474,
"eval_dev_f1_threshold": 0.2702260911464691,
"eval_dev_precision": 0.7511957052220596,
"eval_dev_recall": 0.8432124465870494,
"eval_loss": 0.2667163014411926,
"eval_runtime": 523.0471,
"eval_samples_per_second": 253.604,
"eval_steps_per_second": 7.927,
"step": 57000
},
{
"epoch": 1.521692783285364,
"grad_norm": 680.6102294921875,
"learning_rate": 1.1723219044235212e-05,
"loss": 0.1989,
"step": 57100
},
{
"epoch": 1.524357744376932,
"grad_norm": 555.5462036132812,
"learning_rate": 1.1900870492094512e-05,
"loss": 0.1823,
"step": 57200
},
{
"epoch": 1.5270227054685002,
"grad_norm": 19347.361328125,
"learning_rate": 1.207852193995381e-05,
"loss": 0.2099,
"step": 57300
},
{
"epoch": 1.5296876665600683,
"grad_norm": 28487.04296875,
"learning_rate": 1.225617338781311e-05,
"loss": 0.2007,
"step": 57400
},
{
"epoch": 1.5323526276516364,
"grad_norm": 33787.03515625,
"learning_rate": 1.2433824835672413e-05,
"loss": 0.1893,
"step": 57500
},
{
"epoch": 1.5323526276516364,
"eval_dev_accuracy": 0.9712469939011059,
"eval_dev_accuracy_threshold": 0.930076539516449,
"eval_dev_average_precision": 0.8589126571915907,
"eval_dev_f1": 0.788643194504079,
"eval_dev_f1_threshold": 0.8417924642562866,
"eval_dev_precision": 0.7729615991583377,
"eval_dev_recall": 0.8049742522186918,
"eval_loss": 0.22310471534729004,
"eval_runtime": 911.6835,
"eval_samples_per_second": 145.497,
"eval_steps_per_second": 2.274,
"step": 57500
},
{
"epoch": 1.5350175887432043,
"grad_norm": 10426.8994140625,
"learning_rate": 1.2611476283531711e-05,
"loss": 0.1941,
"step": 57600
},
{
"epoch": 1.5376825498347724,
"grad_norm": 20932.927734375,
"learning_rate": 1.2789127731391012e-05,
"loss": 0.1917,
"step": 57700
},
{
"epoch": 1.5403475109263405,
"grad_norm": 19958.53125,
"learning_rate": 1.2966779179250314e-05,
"loss": 0.1704,
"step": 57800
},
{
"epoch": 1.5430124720179086,
"grad_norm": 4519.30517578125,
"learning_rate": 1.3144430627109612e-05,
"loss": 0.1769,
"step": 57900
},
{
"epoch": 1.5456774331094767,
"grad_norm": 1185.6409912109375,
"learning_rate": 1.3322082074968912e-05,
"loss": 0.1917,
"step": 58000
},
{
"epoch": 1.5456774331094767,
"eval_dev_accuracy": 0.971314843155141,
"eval_dev_accuracy_threshold": 0.9302895069122314,
"eval_dev_average_precision": 0.8581921137101376,
"eval_dev_f1": 0.7902556259558663,
"eval_dev_f1_threshold": 0.9142668843269348,
"eval_dev_precision": 0.7879315978651563,
"eval_dev_recall": 0.792593404185384,
"eval_loss": 0.21683622896671295,
"eval_runtime": 910.3929,
"eval_samples_per_second": 145.703,
"eval_steps_per_second": 2.277,
"step": 58000
},
{
"epoch": 1.5483423942010446,
"grad_norm": 10156.921875,
"learning_rate": 1.3499733522828211e-05,
"loss": 0.156,
"step": 58100
},
{
"epoch": 1.5510073552926127,
"grad_norm": 20830.22265625,
"learning_rate": 1.3677384970687513e-05,
"loss": 0.1882,
"step": 58200
},
{
"epoch": 1.5536723163841808,
"grad_norm": 10158.1328125,
"learning_rate": 1.3855036418546812e-05,
"loss": 0.1914,
"step": 58300
},
{
"epoch": 1.556337277475749,
"grad_norm": 12550.0205078125,
"learning_rate": 1.4032687866406112e-05,
"loss": 0.1859,
"step": 58400
},
{
"epoch": 1.559002238567317,
"grad_norm": 25116.525390625,
"learning_rate": 1.4210339314265414e-05,
"loss": 0.1915,
"step": 58500
},
{
"epoch": 1.559002238567317,
"eval_dev_accuracy": 0.9707343550928405,
"eval_dev_accuracy_threshold": 0.9600124359130859,
"eval_dev_average_precision": 0.8552104699335599,
"eval_dev_f1": 0.788252996419862,
"eval_dev_f1_threshold": 0.6280207633972168,
"eval_dev_precision": 0.7486694263749261,
"eval_dev_recall": 0.8322559439027063,
"eval_loss": 0.22474558651447296,
"eval_runtime": 912.547,
"eval_samples_per_second": 145.359,
"eval_steps_per_second": 2.272,
"step": 58500
},
{
"epoch": 1.561667199658885,
"grad_norm": 1747.8248291015625,
"learning_rate": 1.4387990762124712e-05,
"loss": 0.1658,
"step": 58600
},
{
"epoch": 1.564332160750453,
"grad_norm": 10528.990234375,
"learning_rate": 1.4565642209984013e-05,
"loss": 0.1877,
"step": 58700
},
{
"epoch": 1.5669971218420211,
"grad_norm": 14108.591796875,
"learning_rate": 1.4743293657843311e-05,
"loss": 0.1972,
"step": 58800
},
{
"epoch": 1.5696620829335892,
"grad_norm": 33609.73828125,
"learning_rate": 1.4920945105702613e-05,
"loss": 0.1915,
"step": 58900
},
{
"epoch": 1.5723270440251573,
"grad_norm": 14393.123046875,
"learning_rate": 1.5098596553561913e-05,
"loss": 0.1982,
"step": 59000
},
{
"epoch": 1.5723270440251573,
"eval_dev_accuracy": 0.9714354640512036,
"eval_dev_accuracy_threshold": 0.861323356628418,
"eval_dev_average_precision": 0.8617997355004788,
"eval_dev_f1": 0.792690745885873,
"eval_dev_f1_threshold": 0.5087981224060059,
"eval_dev_precision": 0.7735947439774742,
"eval_dev_recall": 0.8127533691245754,
"eval_loss": 0.27619487047195435,
"eval_runtime": 912.722,
"eval_samples_per_second": 145.331,
"eval_steps_per_second": 2.271,
"step": 59000
},
{
"epoch": 1.5749920051167252,
"grad_norm": 2650.031982421875,
"learning_rate": 1.5276248001421212e-05,
"loss": 0.1977,
"step": 59100
},
{
"epoch": 1.5776569662082933,
"grad_norm": 21126.404296875,
"learning_rate": 1.5453899449280514e-05,
"loss": 0.1646,
"step": 59200
},
{
"epoch": 1.5803219272998614,
"grad_norm": 1604.2296142578125,
"learning_rate": 1.5631550897139813e-05,
"loss": 0.1855,
"step": 59300
},
{
"epoch": 1.5829868883914295,
"grad_norm": 9624.1689453125,
"learning_rate": 1.580920234499911e-05,
"loss": 0.1809,
"step": 59400
},
{
"epoch": 1.5856518494829976,
"grad_norm": 4949.5078125,
"learning_rate": 1.5986853792858413e-05,
"loss": 0.185,
"step": 59500
},
{
"epoch": 1.5856518494829976,
"eval_dev_accuracy": 0.9717068610673442,
"eval_dev_accuracy_threshold": 0.9281443357467651,
"eval_dev_average_precision": 0.8651298435899648,
"eval_dev_f1": 0.7949938492806332,
"eval_dev_f1_threshold": 0.7204960584640503,
"eval_dev_precision": 0.7765935214211076,
"eval_dev_recall": 0.8142872795003835,
"eval_loss": 0.23017099499702454,
"eval_runtime": 912.0946,
"eval_samples_per_second": 145.431,
"eval_steps_per_second": 2.273,
"step": 59500
},
{
"epoch": 1.5883168105745655,
"grad_norm": 4366.28125,
"learning_rate": 1.6164505240717715e-05,
"loss": 0.1524,
"step": 59600
},
{
"epoch": 1.5909817716661336,
"grad_norm": 6088.126953125,
"learning_rate": 1.6342156688577014e-05,
"loss": 0.1626,
"step": 59700
},
{
"epoch": 1.5936467327577017,
"grad_norm": 41741.02734375,
"learning_rate": 1.6519808136436312e-05,
"loss": 0.1855,
"step": 59800
},
{
"epoch": 1.5963116938492699,
"grad_norm": 6351.677734375,
"learning_rate": 1.6697459584295614e-05,
"loss": 0.1777,
"step": 59900
},
{
"epoch": 1.598976654940838,
"grad_norm": 667.612548828125,
"learning_rate": 1.6875111032154913e-05,
"loss": 0.1519,
"step": 60000
},
{
"epoch": 1.598976654940838,
"eval_dev_accuracy": 0.9702217162845749,
"eval_dev_accuracy_threshold": 0.9527369737625122,
"eval_dev_average_precision": 0.8599004250878434,
"eval_dev_f1": 0.7857490403849272,
"eval_dev_f1_threshold": 0.9123563170433044,
"eval_dev_precision": 0.7755602988260406,
"eval_dev_recall": 0.7962090500712172,
"eval_loss": 0.23386961221694946,
"eval_runtime": 912.5307,
"eval_samples_per_second": 145.362,
"eval_steps_per_second": 2.272,
"step": 60000
},
{
"epoch": 1.6016416160324058,
"grad_norm": 74362.328125,
"learning_rate": 1.7052762480014215e-05,
"loss": 0.1705,
"step": 60100
},
{
"epoch": 1.604306577123974,
"grad_norm": 41024.45703125,
"learning_rate": 1.7230413927873513e-05,
"loss": 0.1868,
"step": 60200
},
{
"epoch": 1.606971538215542,
"grad_norm": 10907.779296875,
"learning_rate": 1.7408065375732815e-05,
"loss": 0.1801,
"step": 60300
},
{
"epoch": 1.6096364993071102,
"grad_norm": 17233.494140625,
"learning_rate": 1.7585716823592114e-05,
"loss": 0.1672,
"step": 60400
},
{
"epoch": 1.6123014603986783,
"grad_norm": 6108.4228515625,
"learning_rate": 1.7763368271451412e-05,
"loss": 0.1619,
"step": 60500
},
{
"epoch": 1.6123014603986783,
"eval_dev_accuracy": 0.9701990998665632,
"eval_dev_accuracy_threshold": 0.9653939604759216,
"eval_dev_average_precision": 0.8583701139769879,
"eval_dev_f1": 0.7852786105654916,
"eval_dev_f1_threshold": 0.4483921527862549,
"eval_dev_precision": 0.7433212643115765,
"eval_dev_recall": 0.8322559439027063,
"eval_loss": 0.2841331958770752,
"eval_runtime": 912.3726,
"eval_samples_per_second": 145.387,
"eval_steps_per_second": 2.272,
"step": 60500
},
{
"epoch": 1.6149664214902462,
"grad_norm": 753.2778930664062,
"learning_rate": 1.7941019719310714e-05,
"loss": 0.1775,
"step": 60600
},
{
"epoch": 1.6176313825818143,
"grad_norm": 7861.2724609375,
"learning_rate": 1.8118671167170013e-05,
"loss": 0.1539,
"step": 60700
},
{
"epoch": 1.6202963436733824,
"grad_norm": 4606.5625,
"learning_rate": 1.8296322615029315e-05,
"loss": 0.1984,
"step": 60800
},
{
"epoch": 1.6229613047649505,
"grad_norm": 3256.729248046875,
"learning_rate": 1.8473974062888614e-05,
"loss": 0.1936,
"step": 60900
},
{
"epoch": 1.6256262658565186,
"grad_norm": 16788.51953125,
"learning_rate": 1.8651625510747916e-05,
"loss": 0.1928,
"step": 61000
},
{
"epoch": 1.6256262658565186,
"eval_dev_accuracy": 0.9702820267326061,
"eval_dev_accuracy_threshold": 0.9588229656219482,
"eval_dev_average_precision": 0.8578683942622316,
"eval_dev_f1": 0.78329335697153,
"eval_dev_f1_threshold": 0.8013461232185364,
"eval_dev_precision": 0.7472888269823899,
"eval_dev_recall": 0.8229429166210146,
"eval_loss": 0.21942387521266937,
"eval_runtime": 911.7434,
"eval_samples_per_second": 145.487,
"eval_steps_per_second": 2.274,
"step": 61000
},
{
"epoch": 1.6282912269480865,
"grad_norm": 1664.8751220703125,
"learning_rate": 1.8829276958607214e-05,
"loss": 0.166,
"step": 61100
},
{
"epoch": 1.6309561880396546,
"grad_norm": 21448.6796875,
"learning_rate": 1.9006928406466513e-05,
"loss": 0.1774,
"step": 61200
},
{
"epoch": 1.6336211491312227,
"grad_norm": 18060.765625,
"learning_rate": 1.9184579854325815e-05,
"loss": 0.1319,
"step": 61300
},
{
"epoch": 1.6362861102227908,
"grad_norm": 7385.87353515625,
"learning_rate": 1.9362231302185113e-05,
"loss": 0.1971,
"step": 61400
},
{
"epoch": 1.638951071314359,
"grad_norm": 5024.80078125,
"learning_rate": 1.9539882750044415e-05,
"loss": 0.1728,
"step": 61500
},
{
"epoch": 1.638951071314359,
"eval_dev_accuracy": 0.9713073043491371,
"eval_dev_accuracy_threshold": 0.9408199787139893,
"eval_dev_average_precision": 0.8671213714406215,
"eval_dev_f1": 0.7911789297658863,
"eval_dev_f1_threshold": 0.6503252983093262,
"eval_dev_precision": 0.7563193126186433,
"eval_dev_recall": 0.829407253204777,
"eval_loss": 0.23295743763446808,
"eval_runtime": 911.9086,
"eval_samples_per_second": 145.461,
"eval_steps_per_second": 2.273,
"step": 61500
},
{
"epoch": 1.6416160324059268,
"grad_norm": 8569.271484375,
"learning_rate": 1.9717534197903714e-05,
"loss": 0.1703,
"step": 61600
},
{
"epoch": 1.644280993497495,
"grad_norm": 20367.513671875,
"learning_rate": 1.9895185645763016e-05,
"loss": 0.1624,
"step": 61700
},
{
"epoch": 1.646945954589063,
"grad_norm": 1712.7371826171875,
"learning_rate": 1.9991906350553724e-05,
"loss": 0.1526,
"step": 61800
},
{
"epoch": 1.6496109156806311,
"grad_norm": 408.11163330078125,
"learning_rate": 1.9972165742148174e-05,
"loss": 0.1611,
"step": 61900
},
{
"epoch": 1.6522758767721992,
"grad_norm": 6086.27587890625,
"learning_rate": 1.9952425133742624e-05,
"loss": 0.1603,
"step": 62000
},
{
"epoch": 1.6522758767721992,
"eval_dev_accuracy": 0.971164067035063,
"eval_dev_accuracy_threshold": 0.9526249170303345,
"eval_dev_average_precision": 0.8575865091547995,
"eval_dev_f1": 0.7882105728821057,
"eval_dev_f1_threshold": 0.9510890245437622,
"eval_dev_precision": 0.7956905213799264,
"eval_dev_recall": 0.7808699463131369,
"eval_loss": 0.2797718644142151,
"eval_runtime": 911.0244,
"eval_samples_per_second": 145.602,
"eval_steps_per_second": 2.275,
"step": 62000
},
{
"epoch": 1.6549408378637671,
"grad_norm": 448.80615234375,
"learning_rate": 1.993268452533707e-05,
"loss": 0.1546,
"step": 62100
},
{
"epoch": 1.6576057989553352,
"grad_norm": 31734.08984375,
"learning_rate": 1.991294391693152e-05,
"loss": 0.2302,
"step": 62200
},
{
"epoch": 1.6602707600469033,
"grad_norm": 18211.0,
"learning_rate": 1.989320330852597e-05,
"loss": 0.1694,
"step": 62300
},
{
"epoch": 1.6629357211384712,
"grad_norm": 8841.400390625,
"learning_rate": 1.9873462700120417e-05,
"loss": 0.1705,
"step": 62400
},
{
"epoch": 1.6656006822300395,
"grad_norm": 24008.82421875,
"learning_rate": 1.985372209171487e-05,
"loss": 0.1606,
"step": 62500
},
{
"epoch": 1.6656006822300395,
"eval_dev_accuracy": 0.9708549759889029,
"eval_dev_accuracy_threshold": 0.9542537927627563,
"eval_dev_average_precision": 0.841409192198319,
"eval_dev_f1": 0.7890381515314348,
"eval_dev_f1_threshold": 0.8909753561019897,
"eval_dev_precision": 0.7742275651165244,
"eval_dev_recall": 0.8044264270844746,
"eval_loss": 0.2822663486003876,
"eval_runtime": 911.9083,
"eval_samples_per_second": 145.461,
"eval_steps_per_second": 2.273,
"step": 62500
},
{
"epoch": 1.6682656433216074,
"grad_norm": 2424.1279296875,
"learning_rate": 1.9833981483309317e-05,
"loss": 0.1887,
"step": 62600
},
{
"epoch": 1.6709306044131755,
"grad_norm": 45195.04296875,
"learning_rate": 1.9814240874903764e-05,
"loss": 0.1918,
"step": 62700
},
{
"epoch": 1.6735955655047436,
"grad_norm": 2223.521728515625,
"learning_rate": 1.9794500266498217e-05,
"loss": 0.1475,
"step": 62800
},
{
"epoch": 1.6762605265963115,
"grad_norm": 2829.02099609375,
"learning_rate": 1.9774759658092664e-05,
"loss": 0.1995,
"step": 62900
},
{
"epoch": 1.6789254876878799,
"grad_norm": 11702.283203125,
"learning_rate": 1.975501904968711e-05,
"loss": 0.1648,
"step": 63000
},
{
"epoch": 1.6789254876878799,
"eval_dev_accuracy": 0.9710359073329966,
"eval_dev_accuracy_threshold": 0.910698652267456,
"eval_dev_average_precision": 0.849610869643878,
"eval_dev_f1": 0.7900427192658614,
"eval_dev_f1_threshold": 0.4727928936481476,
"eval_dev_precision": 0.7616432784218019,
"eval_dev_recall": 0.8206420510573025,
"eval_loss": 0.25969284772872925,
"eval_runtime": 910.5015,
"eval_samples_per_second": 145.686,
"eval_steps_per_second": 2.277,
"step": 63000
},
{
"epoch": 1.6815904487794477,
"grad_norm": 21649.341796875,
"learning_rate": 1.9735278441281564e-05,
"loss": 0.1788,
"step": 63100
},
{
"epoch": 1.6842554098710159,
"grad_norm": 86422.7421875,
"learning_rate": 1.971553783287601e-05,
"loss": 0.2286,
"step": 63200
},
{
"epoch": 1.686920370962584,
"grad_norm": 45808.265625,
"learning_rate": 1.969579722447046e-05,
"loss": 0.1611,
"step": 63300
},
{
"epoch": 1.6895853320541518,
"grad_norm": 13495.0380859375,
"learning_rate": 1.967605661606491e-05,
"loss": 0.1962,
"step": 63400
},
{
"epoch": 1.6922502931457202,
"grad_norm": 22458.46484375,
"learning_rate": 1.9656316007659357e-05,
"loss": 0.1825,
"step": 63500
},
{
"epoch": 1.6922502931457202,
"eval_dev_accuracy": 0.9710283685269927,
"eval_dev_accuracy_threshold": 0.932883620262146,
"eval_dev_average_precision": 0.8574042822114104,
"eval_dev_f1": 0.7900720576461169,
"eval_dev_f1_threshold": 0.9062104225158691,
"eval_dev_precision": 0.7702955870108243,
"eval_dev_recall": 0.8108907636682371,
"eval_loss": 0.20927684009075165,
"eval_runtime": 911.9738,
"eval_samples_per_second": 145.45,
"eval_steps_per_second": 2.273,
"step": 63500
},
{
"epoch": 1.694915254237288,
"grad_norm": 4333.6484375,
"learning_rate": 1.9636575399253807e-05,
"loss": 0.1795,
"step": 63600
},
{
"epoch": 1.6975802153288562,
"grad_norm": 51141.83203125,
"learning_rate": 1.9616834790848257e-05,
"loss": 0.1944,
"step": 63700
},
{
"epoch": 1.7002451764204243,
"grad_norm": 24413.966796875,
"learning_rate": 1.9597094182442704e-05,
"loss": 0.196,
"step": 63800
},
{
"epoch": 1.7029101375119922,
"grad_norm": 11386.5224609375,
"learning_rate": 1.9577353574037154e-05,
"loss": 0.1851,
"step": 63900
},
{
"epoch": 1.7055750986035605,
"grad_norm": 1291.42236328125,
"learning_rate": 1.9557612965631604e-05,
"loss": 0.1787,
"step": 64000
},
{
"epoch": 1.7055750986035605,
"eval_dev_accuracy": 0.9717747103213793,
"eval_dev_accuracy_threshold": 0.9631803035736084,
"eval_dev_average_precision": 0.8630868871875782,
"eval_dev_f1": 0.7981506777345803,
"eval_dev_f1_threshold": 0.9355161786079407,
"eval_dev_precision": 0.7667305945291208,
"eval_dev_recall": 0.8322559439027063,
"eval_loss": 0.23051400482654572,
"eval_runtime": 912.7714,
"eval_samples_per_second": 145.323,
"eval_steps_per_second": 2.271,
"step": 64000
},
{
"epoch": 1.7082400596951284,
"grad_norm": 689.987060546875,
"learning_rate": 1.953787235722605e-05,
"loss": 0.1924,
"step": 64100
},
{
"epoch": 1.7109050207866965,
"grad_norm": 11370.0517578125,
"learning_rate": 1.95181317488205e-05,
"loss": 0.1611,
"step": 64200
},
{
"epoch": 1.7135699818782646,
"grad_norm": 15404.4140625,
"learning_rate": 1.949839114041495e-05,
"loss": 0.1799,
"step": 64300
},
{
"epoch": 1.7162349429698325,
"grad_norm": 14026.65234375,
"learning_rate": 1.9478650532009397e-05,
"loss": 0.1977,
"step": 64400
},
{
"epoch": 1.7188999040614008,
"grad_norm": 1225.2841796875,
"learning_rate": 1.9458909923603847e-05,
"loss": 0.1672,
"step": 64500
},
{
"epoch": 1.7188999040614008,
"eval_dev_accuracy": 0.9720687237555315,
"eval_dev_accuracy_threshold": 0.920991063117981,
"eval_dev_average_precision": 0.845700489229083,
"eval_dev_f1": 0.7995607383778697,
"eval_dev_f1_threshold": 0.6048256158828735,
"eval_dev_precision": 0.7648059223689476,
"eval_dev_recall": 0.8376246302180343,
"eval_loss": 0.21997055411338806,
"eval_runtime": 912.741,
"eval_samples_per_second": 145.328,
"eval_steps_per_second": 2.271,
"step": 64500
},
{
"epoch": 1.7215648651529687,
"grad_norm": 18876.72265625,
"learning_rate": 1.9439169315198297e-05,
"loss": 0.1812,
"step": 64600
},
{
"epoch": 1.7242298262445368,
"grad_norm": 44768.2578125,
"learning_rate": 1.9419428706792744e-05,
"loss": 0.1641,
"step": 64700
},
{
"epoch": 1.726894787336105,
"grad_norm": 1987.0482177734375,
"learning_rate": 1.9399688098387194e-05,
"loss": 0.1526,
"step": 64800
},
{
"epoch": 1.7295597484276728,
"grad_norm": 1468.9228515625,
"learning_rate": 1.9379947489981644e-05,
"loss": 0.1745,
"step": 64900
},
{
"epoch": 1.7322247095192411,
"grad_norm": 2461.248291015625,
"learning_rate": 1.936020688157609e-05,
"loss": 0.2017,
"step": 65000
},
{
"epoch": 1.7322247095192411,
"eval_dev_accuracy": 0.9716842446493325,
"eval_dev_accuracy_threshold": 0.8851553201675415,
"eval_dev_average_precision": 0.8642482817005424,
"eval_dev_f1": 0.7979695431472081,
"eval_dev_f1_threshold": 0.674056887626648,
"eval_dev_precision": 0.7787859824780976,
"eval_dev_recall": 0.8181220554399036,
"eval_loss": 0.25105008482933044,
"eval_runtime": 933.219,
"eval_samples_per_second": 142.139,
"eval_steps_per_second": 2.221,
"step": 65000
},
{
"epoch": 1.734889670610809,
"grad_norm": 1901.36474609375,
"learning_rate": 1.934046627317054e-05,
"loss": 0.2092,
"step": 65100
},
{
"epoch": 1.7375546317023771,
"grad_norm": 25123.84375,
"learning_rate": 1.932072566476499e-05,
"loss": 0.1807,
"step": 65200
},
{
"epoch": 1.7402195927939452,
"grad_norm": 21136.314453125,
"learning_rate": 1.9300985056359437e-05,
"loss": 0.1627,
"step": 65300
},
{
"epoch": 1.742884553885513,
"grad_norm": 14610.0068359375,
"learning_rate": 1.9281244447953887e-05,
"loss": 0.1809,
"step": 65400
},
{
"epoch": 1.7455495149770814,
"grad_norm": 5105.17529296875,
"learning_rate": 1.9261503839548337e-05,
"loss": 0.1774,
"step": 65500
},
{
"epoch": 1.7455495149770814,
"eval_dev_accuracy": 0.9722270386816136,
"eval_dev_accuracy_threshold": 0.9311126470565796,
"eval_dev_average_precision": 0.8672414882858807,
"eval_dev_f1": 0.801593625498008,
"eval_dev_f1_threshold": 0.841367244720459,
"eval_dev_precision": 0.7779954629820581,
"eval_dev_recall": 0.8266681275336912,
"eval_loss": 0.2049088478088379,
"eval_runtime": 933.0893,
"eval_samples_per_second": 142.159,
"eval_steps_per_second": 2.222,
"step": 65500
},
{
"epoch": 1.7482144760686493,
"grad_norm": 52553.86328125,
"learning_rate": 1.9241763231142784e-05,
"loss": 0.154,
"step": 65600
},
{
"epoch": 1.7508794371602174,
"grad_norm": 8918.7666015625,
"learning_rate": 1.9222022622737234e-05,
"loss": 0.1871,
"step": 65700
},
{
"epoch": 1.7535443982517855,
"grad_norm": 1728.83984375,
"learning_rate": 1.9202282014331684e-05,
"loss": 0.1929,
"step": 65800
},
{
"epoch": 1.7562093593433534,
"grad_norm": 8542.5439453125,
"learning_rate": 1.918254140592613e-05,
"loss": 0.1519,
"step": 65900
},
{
"epoch": 1.7588743204349218,
"grad_norm": 40360.875,
"learning_rate": 1.916280079752058e-05,
"loss": 0.2105,
"step": 66000
},
{
"epoch": 1.7588743204349218,
"eval_dev_accuracy": 0.9723099655476566,
"eval_dev_accuracy_threshold": 0.82029128074646,
"eval_dev_average_precision": 0.8667448997071003,
"eval_dev_f1": 0.801227852873068,
"eval_dev_f1_threshold": 0.5722821354866028,
"eval_dev_precision": 0.7878627409447151,
"eval_dev_recall": 0.8150542346882875,
"eval_loss": 0.2998444736003876,
"eval_runtime": 935.7172,
"eval_samples_per_second": 141.76,
"eval_steps_per_second": 2.215,
"step": 66000
},
{
"epoch": 1.7615392815264896,
"grad_norm": 46394.6875,
"learning_rate": 1.914306018911503e-05,
"loss": 0.1664,
"step": 66100
},
{
"epoch": 1.7642042426180577,
"grad_norm": 3412.559814453125,
"learning_rate": 1.9123319580709477e-05,
"loss": 0.1806,
"step": 66200
},
{
"epoch": 1.7668692037096259,
"grad_norm": 5545.865234375,
"learning_rate": 1.910357897230393e-05,
"loss": 0.1881,
"step": 66300
},
{
"epoch": 1.7695341648011937,
"grad_norm": 85940.0234375,
"learning_rate": 1.9083838363898377e-05,
"loss": 0.1881,
"step": 66400
},
{
"epoch": 1.772199125892762,
"grad_norm": 15622.53125,
"learning_rate": 1.9064097755492824e-05,
"loss": 0.1889,
"step": 66500
},
{
"epoch": 1.772199125892762,
"eval_dev_accuracy": 0.9718651759934261,
"eval_dev_accuracy_threshold": 0.9094328880310059,
"eval_dev_average_precision": 0.8682256601471484,
"eval_dev_f1": 0.7982062780269058,
"eval_dev_f1_threshold": 0.6328648328781128,
"eval_dev_precision": 0.7697395197395197,
"eval_dev_recall": 0.8288594280705599,
"eval_loss": 0.19647949934005737,
"eval_runtime": 933.2357,
"eval_samples_per_second": 142.137,
"eval_steps_per_second": 2.221,
"step": 66500
},
{
"epoch": 1.77486408698433,
"grad_norm": 1002.01220703125,
"learning_rate": 1.9044357147087277e-05,
"loss": 0.1722,
"step": 66600
},
{
"epoch": 1.777529048075898,
"grad_norm": 45076.7421875,
"learning_rate": 1.9024616538681724e-05,
"loss": 0.1999,
"step": 66700
},
{
"epoch": 1.7801940091674662,
"grad_norm": 2053.866455078125,
"learning_rate": 1.900487593027617e-05,
"loss": 0.1894,
"step": 66800
},
{
"epoch": 1.782858970259034,
"grad_norm": 3085.87451171875,
"learning_rate": 1.8985135321870624e-05,
"loss": 0.1702,
"step": 66900
},
{
"epoch": 1.7855239313506024,
"grad_norm": 1689.106201171875,
"learning_rate": 1.896539471346507e-05,
"loss": 0.1905,
"step": 67000
},
{
"epoch": 1.7855239313506024,
"eval_dev_accuracy": 0.97235519838368,
"eval_dev_accuracy_threshold": 0.8816102743148804,
"eval_dev_average_precision": 0.8719513025342801,
"eval_dev_f1": 0.8005663642561224,
"eval_dev_f1_threshold": 0.595874547958374,
"eval_dev_precision": 0.7677529672098169,
"eval_dev_recall": 0.8363098498959132,
"eval_loss": 0.22260619699954987,
"eval_runtime": 935.4859,
"eval_samples_per_second": 141.795,
"eval_steps_per_second": 2.216,
"step": 67000
},
{
"epoch": 1.7881888924421703,
"grad_norm": 24842.880859375,
"learning_rate": 1.8945654105059517e-05,
"loss": 0.1809,
"step": 67100
},
{
"epoch": 1.7908538535337384,
"grad_norm": 60853.56640625,
"learning_rate": 1.892591349665397e-05,
"loss": 0.1825,
"step": 67200
},
{
"epoch": 1.7935188146253065,
"grad_norm": 6448.2060546875,
"learning_rate": 1.8906172888248417e-05,
"loss": 0.1912,
"step": 67300
},
{
"epoch": 1.7961837757168744,
"grad_norm": 28209.67578125,
"learning_rate": 1.8886432279842867e-05,
"loss": 0.1849,
"step": 67400
},
{
"epoch": 1.7988487368084427,
"grad_norm": 1441.7255859375,
"learning_rate": 1.8866691671437317e-05,
"loss": 0.1812,
"step": 67500
},
{
"epoch": 1.7988487368084427,
"eval_dev_accuracy": 0.9728678371919456,
"eval_dev_accuracy_threshold": 0.8422494530677795,
"eval_dev_average_precision": 0.8713666080730428,
"eval_dev_f1": 0.8021557531662624,
"eval_dev_f1_threshold": 0.6560682058334351,
"eval_dev_precision": 0.7893508697496818,
"eval_dev_recall": 0.8153829297688178,
"eval_loss": 0.21360942721366882,
"eval_runtime": 935.6466,
"eval_samples_per_second": 141.77,
"eval_steps_per_second": 2.216,
"step": 67500
},
{
"epoch": 1.8015136979000106,
"grad_norm": 19593.896484375,
"learning_rate": 1.8846951063031764e-05,
"loss": 0.1729,
"step": 67600
},
{
"epoch": 1.8041786589915787,
"grad_norm": 49532.5390625,
"learning_rate": 1.8827210454626214e-05,
"loss": 0.1981,
"step": 67700
},
{
"epoch": 1.8068436200831468,
"grad_norm": 2939.565185546875,
"learning_rate": 1.8807469846220664e-05,
"loss": 0.172,
"step": 67800
},
{
"epoch": 1.8095085811747147,
"grad_norm": 18294.060546875,
"learning_rate": 1.878772923781511e-05,
"loss": 0.1609,
"step": 67900
},
{
"epoch": 1.812173542266283,
"grad_norm": 67081.5234375,
"learning_rate": 1.876798862940956e-05,
"loss": 0.18,
"step": 68000
},
{
"epoch": 1.812173542266283,
"eval_dev_accuracy": 0.972196883457598,
"eval_dev_accuracy_threshold": 0.9536248445510864,
"eval_dev_average_precision": 0.8677887820499237,
"eval_dev_f1": 0.7948606271777002,
"eval_dev_f1_threshold": 0.8924222588539124,
"eval_dev_precision": 0.7899577967752408,
"eval_dev_recall": 0.7998246959570505,
"eval_loss": 0.22794483602046967,
"eval_runtime": 934.1595,
"eval_samples_per_second": 141.996,
"eval_steps_per_second": 2.219,
"step": 68000
},
{
"epoch": 1.814838503357851,
"grad_norm": 3441.131103515625,
"learning_rate": 1.874824802100401e-05,
"loss": 0.1618,
"step": 68100
},
{
"epoch": 1.817503464449419,
"grad_norm": 40774.67578125,
"learning_rate": 1.8728507412598457e-05,
"loss": 0.1673,
"step": 68200
},
{
"epoch": 1.8201684255409871,
"grad_norm": 23139.685546875,
"learning_rate": 1.8708766804192907e-05,
"loss": 0.1793,
"step": 68300
},
{
"epoch": 1.822833386632555,
"grad_norm": 8400.26171875,
"learning_rate": 1.8689026195787357e-05,
"loss": 0.219,
"step": 68400
},
{
"epoch": 1.8254983477241233,
"grad_norm": 874.6626586914062,
"learning_rate": 1.8669285587381804e-05,
"loss": 0.1714,
"step": 68500
},
{
"epoch": 1.8254983477241233,
"eval_dev_accuracy": 0.9731241565960783,
"eval_dev_accuracy_threshold": 0.939326286315918,
"eval_dev_average_precision": 0.872717385393903,
"eval_dev_f1": 0.803395225464191,
"eval_dev_f1_threshold": 0.7294609546661377,
"eval_dev_precision": 0.7787719839555692,
"eval_dev_recall": 0.8296263832584639,
"eval_loss": 0.22690728306770325,
"eval_runtime": 931.9708,
"eval_samples_per_second": 142.33,
"eval_steps_per_second": 2.224,
"step": 68500
},
{
"epoch": 1.8281633088156912,
"grad_norm": 339.2591552734375,
"learning_rate": 1.8649544978976254e-05,
"loss": 0.1969,
"step": 68600
},
{
"epoch": 1.8308282699072593,
"grad_norm": 48369.09375,
"learning_rate": 1.8629804370570704e-05,
"loss": 0.1715,
"step": 68700
},
{
"epoch": 1.8334932309988274,
"grad_norm": 1295.3619384765625,
"learning_rate": 1.861006376216515e-05,
"loss": 0.1728,
"step": 68800
},
{
"epoch": 1.8361581920903953,
"grad_norm": 13706.5322265625,
"learning_rate": 1.85903231537596e-05,
"loss": 0.1768,
"step": 68900
},
{
"epoch": 1.8388231531819637,
"grad_norm": 36329.11328125,
"learning_rate": 1.857058254535405e-05,
"loss": 0.1821,
"step": 69000
},
{
"epoch": 1.8388231531819637,
"eval_dev_accuracy": 0.9732900103281642,
"eval_dev_accuracy_threshold": 0.9531596899032593,
"eval_dev_average_precision": 0.8750664616109699,
"eval_dev_f1": 0.8036220816059348,
"eval_dev_f1_threshold": 0.925843358039856,
"eval_dev_precision": 0.8002172732210755,
"eval_dev_recall": 0.807055987728717,
"eval_loss": 0.22201138734817505,
"eval_runtime": 933.9653,
"eval_samples_per_second": 142.026,
"eval_steps_per_second": 2.22,
"step": 69000
},
{
"epoch": 1.8414881142735315,
"grad_norm": 21184.15234375,
"learning_rate": 1.8550841936948497e-05,
"loss": 0.1925,
"step": 69100
},
{
"epoch": 1.8441530753650996,
"grad_norm": 1523.7003173828125,
"learning_rate": 1.8531101328542947e-05,
"loss": 0.1761,
"step": 69200
},
{
"epoch": 1.8468180364566678,
"grad_norm": 18345.251953125,
"learning_rate": 1.8511360720137397e-05,
"loss": 0.1656,
"step": 69300
},
{
"epoch": 1.8494829975482356,
"grad_norm": 3282.25830078125,
"learning_rate": 1.8491620111731844e-05,
"loss": 0.2208,
"step": 69400
},
{
"epoch": 1.852147958639804,
"grad_norm": 10842.587890625,
"learning_rate": 1.8471879503326294e-05,
"loss": 0.1579,
"step": 69500
},
{
"epoch": 1.852147958639804,
"eval_dev_accuracy": 0.9733653983882032,
"eval_dev_accuracy_threshold": 0.9553133249282837,
"eval_dev_average_precision": 0.8719043392702807,
"eval_dev_f1": 0.8058681249342727,
"eval_dev_f1_threshold": 0.8481921553611755,
"eval_dev_precision": 0.7747447174198766,
"eval_dev_recall": 0.8395968007012161,
"eval_loss": 0.2123890370130539,
"eval_runtime": 932.9373,
"eval_samples_per_second": 142.182,
"eval_steps_per_second": 2.222,
"step": 69500
},
{
"epoch": 1.8548129197313719,
"grad_norm": 14122.6357421875,
"learning_rate": 1.8452138894920744e-05,
"loss": 0.1684,
"step": 69600
},
{
"epoch": 1.85747788082294,
"grad_norm": 22713.14453125,
"learning_rate": 1.843239828651519e-05,
"loss": 0.207,
"step": 69700
},
{
"epoch": 1.860142841914508,
"grad_norm": 21279.48828125,
"learning_rate": 1.841265767810964e-05,
"loss": 0.1679,
"step": 69800
},
{
"epoch": 1.862807803006076,
"grad_norm": 1724.1683349609375,
"learning_rate": 1.839291706970409e-05,
"loss": 0.1658,
"step": 69900
},
{
"epoch": 1.8654727640976443,
"grad_norm": 25310.3359375,
"learning_rate": 1.8373176461298537e-05,
"loss": 0.2035,
"step": 70000
},
{
"epoch": 1.8654727640976443,
"eval_dev_accuracy": 0.9732146222681252,
"eval_dev_accuracy_threshold": 0.9318354725837708,
"eval_dev_average_precision": 0.8761383143347535,
"eval_dev_f1": 0.8027572731220147,
"eval_dev_f1_threshold": 0.7808271646499634,
"eval_dev_precision": 0.7954178767344304,
"eval_dev_recall": 0.8102333735071765,
"eval_loss": 0.2241707593202591,
"eval_runtime": 934.0769,
"eval_samples_per_second": 142.009,
"eval_steps_per_second": 2.219,
"step": 70000
},
{
"epoch": 1.8681377251892122,
"grad_norm": 1192.052978515625,
"learning_rate": 1.8353435852892987e-05,
"loss": 0.1671,
"step": 70100
},
{
"epoch": 1.8708026862807803,
"grad_norm": 3381.109375,
"learning_rate": 1.8333695244487437e-05,
"loss": 0.1777,
"step": 70200
},
{
"epoch": 1.8734676473723484,
"grad_norm": 2287.74267578125,
"learning_rate": 1.8313954636081884e-05,
"loss": 0.1894,
"step": 70300
},
{
"epoch": 1.8761326084639163,
"grad_norm": 5671.9111328125,
"learning_rate": 1.8294214027676334e-05,
"loss": 0.2227,
"step": 70400
},
{
"epoch": 1.8787975695554846,
"grad_norm": 8669.9560546875,
"learning_rate": 1.8274473419270784e-05,
"loss": 0.1754,
"step": 70500
},
{
"epoch": 1.8787975695554846,
"eval_dev_accuracy": 0.9732221610741291,
"eval_dev_accuracy_threshold": 0.9238910675048828,
"eval_dev_average_precision": 0.8787022531852614,
"eval_dev_f1": 0.8059863355384449,
"eval_dev_f1_threshold": 0.760931134223938,
"eval_dev_precision": 0.7978529253891573,
"eval_dev_recall": 0.8142872795003835,
"eval_loss": 0.1879546046257019,
"eval_runtime": 934.0481,
"eval_samples_per_second": 142.013,
"eval_steps_per_second": 2.219,
"step": 70500
},
{
"epoch": 1.8814625306470525,
"grad_norm": 21830.791015625,
"learning_rate": 1.825473281086523e-05,
"loss": 0.1683,
"step": 70600
},
{
"epoch": 1.8841274917386206,
"grad_norm": 5870.6396484375,
"learning_rate": 1.823499220245968e-05,
"loss": 0.1618,
"step": 70700
},
{
"epoch": 1.8867924528301887,
"grad_norm": 9237.384765625,
"learning_rate": 1.821525159405413e-05,
"loss": 0.1806,
"step": 70800
},
{
"epoch": 1.8894574139217566,
"grad_norm": 5946.40380859375,
"learning_rate": 1.8195510985648577e-05,
"loss": 0.1701,
"step": 70900
},
{
"epoch": 1.892122375013325,
"grad_norm": 4265.1650390625,
"learning_rate": 1.8175770377243027e-05,
"loss": 0.1752,
"step": 71000
},
{
"epoch": 1.892122375013325,
"eval_dev_accuracy": 0.9730336909240315,
"eval_dev_accuracy_threshold": 0.9348808526992798,
"eval_dev_average_precision": 0.8700561831987852,
"eval_dev_f1": 0.8034291366708798,
"eval_dev_f1_threshold": 0.9348808526992798,
"eval_dev_precision": 0.8059536934950385,
"eval_dev_recall": 0.8009203462254848,
"eval_loss": 0.201664537191391,
"eval_runtime": 931.3245,
"eval_samples_per_second": 142.428,
"eval_steps_per_second": 2.226,
"step": 71000
},
{
"epoch": 1.8947873361048928,
"grad_norm": 2272.4169921875,
"learning_rate": 1.8156029768837477e-05,
"loss": 0.1688,
"step": 71100
},
{
"epoch": 1.897452297196461,
"grad_norm": 11893.5654296875,
"learning_rate": 1.8136289160431924e-05,
"loss": 0.184,
"step": 71200
},
{
"epoch": 1.900117258288029,
"grad_norm": 3861.369384765625,
"learning_rate": 1.8116548552026374e-05,
"loss": 0.1665,
"step": 71300
},
{
"epoch": 1.902782219379597,
"grad_norm": 35609.0,
"learning_rate": 1.8096807943620824e-05,
"loss": 0.1749,
"step": 71400
},
{
"epoch": 1.9054471804711652,
"grad_norm": 11618.3125,
"learning_rate": 1.8077067335215274e-05,
"loss": 0.1899,
"step": 71500
},
{
"epoch": 1.9054471804711652,
"eval_dev_accuracy": 0.9734483252542462,
"eval_dev_accuracy_threshold": 0.943538248538971,
"eval_dev_average_precision": 0.8746432264035248,
"eval_dev_f1": 0.8067354698533405,
"eval_dev_f1_threshold": 0.9360702037811279,
"eval_dev_precision": 0.7999569104815254,
"eval_dev_recall": 0.8136298893393229,
"eval_loss": 0.20475232601165771,
"eval_runtime": 860.459,
"eval_samples_per_second": 154.158,
"eval_steps_per_second": 2.409,
"step": 71500
},
{
"epoch": 1.9081121415627331,
"grad_norm": 8260.7607421875,
"learning_rate": 1.805732672680972e-05,
"loss": 0.1886,
"step": 71600
},
{
"epoch": 1.9107771026543012,
"grad_norm": 47676.78125,
"learning_rate": 1.803758611840417e-05,
"loss": 0.1858,
"step": 71700
},
{
"epoch": 1.9134420637458693,
"grad_norm": 554.1092529296875,
"learning_rate": 1.801784550999862e-05,
"loss": 0.165,
"step": 71800
},
{
"epoch": 1.9161070248374372,
"grad_norm": 12699.4365234375,
"learning_rate": 1.7998104901593067e-05,
"loss": 0.1784,
"step": 71900
},
{
"epoch": 1.9187719859290056,
"grad_norm": 4534.798828125,
"learning_rate": 1.7978364293187517e-05,
"loss": 0.1767,
"step": 72000
},
{
"epoch": 1.9187719859290056,
"eval_dev_accuracy": 0.9739835804805235,
"eval_dev_accuracy_threshold": 0.9395354986190796,
"eval_dev_average_precision": 0.8772612012666982,
"eval_dev_f1": 0.8093941820122765,
"eval_dev_f1_threshold": 0.875823974609375,
"eval_dev_precision": 0.7891340549542049,
"eval_dev_recall": 0.8307220335268982,
"eval_loss": 0.20605036616325378,
"eval_runtime": 861.3232,
"eval_samples_per_second": 154.004,
"eval_steps_per_second": 2.407,
"step": 72000
},
{
"epoch": 1.9214369470205734,
"grad_norm": 65605.9375,
"learning_rate": 1.7958623684781968e-05,
"loss": 0.1687,
"step": 72100
},
{
"epoch": 1.9241019081121415,
"grad_norm": 11532.1455078125,
"learning_rate": 1.7938883076376414e-05,
"loss": 0.1664,
"step": 72200
},
{
"epoch": 1.9267668692037097,
"grad_norm": 11916.1513671875,
"learning_rate": 1.7919142467970864e-05,
"loss": 0.1669,
"step": 72300
},
{
"epoch": 1.9294318302952775,
"grad_norm": 2029.2286376953125,
"learning_rate": 1.7899401859565314e-05,
"loss": 0.1787,
"step": 72400
},
{
"epoch": 1.9320967913868459,
"grad_norm": 6753.46142578125,
"learning_rate": 1.787966125115976e-05,
"loss": 0.1728,
"step": 72500
},
{
"epoch": 1.9320967913868459,
"eval_dev_accuracy": 0.9743379043627071,
"eval_dev_accuracy_threshold": 0.8970457315444946,
"eval_dev_average_precision": 0.8806920275415929,
"eval_dev_f1": 0.8153239556692241,
"eval_dev_f1_threshold": 0.7824004888534546,
"eval_dev_precision": 0.7935898765688206,
"eval_dev_recall": 0.8382820203790949,
"eval_loss": 0.19223952293395996,
"eval_runtime": 862.5657,
"eval_samples_per_second": 153.782,
"eval_steps_per_second": 2.403,
"step": 72500
},
{
"epoch": 1.9347617524784138,
"grad_norm": 27343.193359375,
"learning_rate": 1.785992064275421e-05,
"loss": 0.1443,
"step": 72600
},
{
"epoch": 1.9374267135699819,
"grad_norm": 13309.6455078125,
"learning_rate": 1.784018003434866e-05,
"loss": 0.1569,
"step": 72700
},
{
"epoch": 1.94009167466155,
"grad_norm": 1874.899169921875,
"learning_rate": 1.7820439425943108e-05,
"loss": 0.1931,
"step": 72800
},
{
"epoch": 1.9427566357531179,
"grad_norm": 31156.685546875,
"learning_rate": 1.7800698817537558e-05,
"loss": 0.1811,
"step": 72900
},
{
"epoch": 1.9454215968446862,
"grad_norm": 4346.09912109375,
"learning_rate": 1.7780958209132008e-05,
"loss": 0.1836,
"step": 73000
},
{
"epoch": 1.9454215968446862,
"eval_dev_accuracy": 0.9730563073420432,
"eval_dev_accuracy_threshold": 0.9250275492668152,
"eval_dev_average_precision": 0.8743046594125137,
"eval_dev_f1": 0.8057607880929436,
"eval_dev_f1_threshold": 0.8426618576049805,
"eval_dev_precision": 0.7878756151188357,
"eval_dev_recall": 0.8244768269968226,
"eval_loss": 0.207134410738945,
"eval_runtime": 861.6487,
"eval_samples_per_second": 153.946,
"eval_steps_per_second": 2.406,
"step": 73000
},
{
"epoch": 1.948086557936254,
"grad_norm": 5061.1884765625,
"learning_rate": 1.7761217600726454e-05,
"loss": 0.1739,
"step": 73100
},
{
"epoch": 1.9507515190278222,
"grad_norm": 103200.015625,
"learning_rate": 1.7741476992320904e-05,
"loss": 0.1966,
"step": 73200
},
{
"epoch": 1.9534164801193903,
"grad_norm": 18783.486328125,
"learning_rate": 1.7721736383915354e-05,
"loss": 0.1723,
"step": 73300
},
{
"epoch": 1.9560814412109582,
"grad_norm": 13243.9150390625,
"learning_rate": 1.7701995775509804e-05,
"loss": 0.1698,
"step": 73400
},
{
"epoch": 1.9587464023025265,
"grad_norm": 4332.658203125,
"learning_rate": 1.768225516710425e-05,
"loss": 0.1801,
"step": 73500
},
{
"epoch": 1.9587464023025265,
"eval_dev_accuracy": 0.972988458088008,
"eval_dev_accuracy_threshold": 0.9180799126625061,
"eval_dev_average_precision": 0.8762342719828209,
"eval_dev_f1": 0.8045175392942646,
"eval_dev_f1_threshold": 0.7035636901855469,
"eval_dev_precision": 0.7662337662337663,
"eval_dev_recall": 0.8468280924728827,
"eval_loss": 0.18561449646949768,
"eval_runtime": 862.9273,
"eval_samples_per_second": 153.717,
"eval_steps_per_second": 2.402,
"step": 73500
},
{
"epoch": 1.9614113633940944,
"grad_norm": 13960.3876953125,
"learning_rate": 1.76625145586987e-05,
"loss": 0.1599,
"step": 73600
},
{
"epoch": 1.9640763244856625,
"grad_norm": 12248.2890625,
"learning_rate": 1.764277395029315e-05,
"loss": 0.1722,
"step": 73700
},
{
"epoch": 1.9667412855772306,
"grad_norm": 20745.55859375,
"learning_rate": 1.7623033341887598e-05,
"loss": 0.1708,
"step": 73800
},
{
"epoch": 1.9694062466687985,
"grad_norm": 13722.9697265625,
"learning_rate": 1.7603292733482048e-05,
"loss": 0.1662,
"step": 73900
},
{
"epoch": 1.9720712077603668,
"grad_norm": 18372.69140625,
"learning_rate": 1.7583552125076498e-05,
"loss": 0.1716,
"step": 74000
},
{
"epoch": 1.9720712077603668,
"eval_dev_accuracy": 0.9739232700324922,
"eval_dev_accuracy_threshold": 0.8308413624763489,
"eval_dev_average_precision": 0.8841492699463087,
"eval_dev_f1": 0.8137931034482759,
"eval_dev_f1_threshold": 0.6751728057861328,
"eval_dev_precision": 0.7832387515200648,
"eval_dev_recall": 0.8468280924728827,
"eval_loss": 0.2016657292842865,
"eval_runtime": 862.1524,
"eval_samples_per_second": 153.856,
"eval_steps_per_second": 2.404,
"step": 74000
},
{
"epoch": 1.9747361688519347,
"grad_norm": 22373.701171875,
"learning_rate": 1.7563811516670944e-05,
"loss": 0.1741,
"step": 74100
},
{
"epoch": 1.9774011299435028,
"grad_norm": 1855.767822265625,
"learning_rate": 1.7544070908265394e-05,
"loss": 0.1318,
"step": 74200
},
{
"epoch": 1.980066091035071,
"grad_norm": 20893.662109375,
"learning_rate": 1.7524330299859844e-05,
"loss": 0.1782,
"step": 74300
},
{
"epoch": 1.9827310521266388,
"grad_norm": 1626.1358642578125,
"learning_rate": 1.750458969145429e-05,
"loss": 0.1842,
"step": 74400
},
{
"epoch": 1.9853960132182071,
"grad_norm": 8638.869140625,
"learning_rate": 1.748484908304874e-05,
"loss": 0.1545,
"step": 74500
},
{
"epoch": 1.9853960132182071,
"eval_dev_accuracy": 0.9740815849585742,
"eval_dev_accuracy_threshold": 0.7622551918029785,
"eval_dev_average_precision": 0.8838940929517627,
"eval_dev_f1": 0.8130659767141011,
"eval_dev_f1_threshold": 0.6812475919723511,
"eval_dev_precision": 0.800212201591512,
"eval_dev_recall": 0.826339432453161,
"eval_loss": 0.21240267157554626,
"eval_runtime": 862.2203,
"eval_samples_per_second": 153.844,
"eval_steps_per_second": 2.404,
"step": 74500
},
{
"epoch": 1.988060974309775,
"grad_norm": 12036.10546875,
"learning_rate": 1.746510847464319e-05,
"loss": 0.1786,
"step": 74600
},
{
"epoch": 1.9907259354013431,
"grad_norm": 3197.989013671875,
"learning_rate": 1.7445367866237638e-05,
"loss": 0.1589,
"step": 74700
},
{
"epoch": 1.9933908964929112,
"grad_norm": 2326.903564453125,
"learning_rate": 1.7425627257832088e-05,
"loss": 0.1712,
"step": 74800
},
{
"epoch": 1.9960558575844791,
"grad_norm": 13623.826171875,
"learning_rate": 1.7405886649426538e-05,
"loss": 0.1761,
"step": 74900
},
{
"epoch": 1.9987208186760475,
"grad_norm": 7701.57861328125,
"learning_rate": 1.7386146041020984e-05,
"loss": 0.1958,
"step": 75000
},
{
"epoch": 1.9987208186760475,
"eval_dev_accuracy": 0.9734558640602501,
"eval_dev_accuracy_threshold": 0.957332968711853,
"eval_dev_average_precision": 0.8773248937578426,
"eval_dev_f1": 0.8058651661075641,
"eval_dev_f1_threshold": 0.763139009475708,
"eval_dev_precision": 0.796044895777659,
"eval_dev_recall": 0.8159307549030349,
"eval_loss": 0.25920844078063965,
"eval_runtime": 862.3734,
"eval_samples_per_second": 153.816,
"eval_steps_per_second": 2.404,
"step": 75000
},
{
"epoch": 2.0013857797676153,
"grad_norm": 19200.3828125,
"learning_rate": 1.7366405432615434e-05,
"loss": 0.1859,
"step": 75100
},
{
"epoch": 2.0040507408591837,
"grad_norm": 27715.55859375,
"learning_rate": 1.7346664824209884e-05,
"loss": 0.215,
"step": 75200
},
{
"epoch": 2.0067157019507516,
"grad_norm": 14230.0625,
"learning_rate": 1.7326924215804334e-05,
"loss": 0.1883,
"step": 75300
},
{
"epoch": 2.0093806630423194,
"grad_norm": 214.24032592773438,
"learning_rate": 1.730718360739878e-05,
"loss": 0.1771,
"step": 75400
},
{
"epoch": 2.0120456241338878,
"grad_norm": 11949.2451171875,
"learning_rate": 1.728744299899323e-05,
"loss": 0.1568,
"step": 75500
},
{
"epoch": 2.0120456241338878,
"eval_dev_accuracy": 0.9732749327161564,
"eval_dev_accuracy_threshold": 0.9531142115592957,
"eval_dev_average_precision": 0.8772400052614694,
"eval_dev_f1": 0.8078490242333263,
"eval_dev_f1_threshold": 0.9034242630004883,
"eval_dev_precision": 0.7909711286089239,
"eval_dev_recall": 0.8254629122384135,
"eval_loss": 0.27995508909225464,
"eval_runtime": 861.968,
"eval_samples_per_second": 153.889,
"eval_steps_per_second": 2.405,
"step": 75500
},
{
"epoch": 2.0147105852254557,
"grad_norm": 1409.49951171875,
"learning_rate": 1.726770239058768e-05,
"loss": 0.1797,
"step": 75600
},
{
"epoch": 2.017375546317024,
"grad_norm": 5395.6484375,
"learning_rate": 1.7247961782182128e-05,
"loss": 0.1659,
"step": 75700
},
{
"epoch": 2.020040507408592,
"grad_norm": 49720.015625,
"learning_rate": 1.7228221173776578e-05,
"loss": 0.1519,
"step": 75800
},
{
"epoch": 2.0227054685001598,
"grad_norm": 39423.91015625,
"learning_rate": 1.7208480565371028e-05,
"loss": 0.1366,
"step": 75900
},
{
"epoch": 2.025370429591728,
"grad_norm": 1205.4697265625,
"learning_rate": 1.7188739956965474e-05,
"loss": 0.1641,
"step": 76000
},
{
"epoch": 2.025370429591728,
"eval_dev_accuracy": 0.9739760416745196,
"eval_dev_accuracy_threshold": 0.9528675079345703,
"eval_dev_average_precision": 0.8829642344114682,
"eval_dev_f1": 0.8102727032036007,
"eval_dev_f1_threshold": 0.8193379640579224,
"eval_dev_precision": 0.7840746054519369,
"eval_dev_recall": 0.8382820203790949,
"eval_loss": 0.22183284163475037,
"eval_runtime": 861.7487,
"eval_samples_per_second": 153.928,
"eval_steps_per_second": 2.406,
"step": 76000
},
{
"epoch": 2.028035390683296,
"grad_norm": 143011.90625,
"learning_rate": 1.7168999348559924e-05,
"loss": 0.1551,
"step": 76100
},
{
"epoch": 2.0307003517748643,
"grad_norm": 3733.740234375,
"learning_rate": 1.7149258740154374e-05,
"loss": 0.1612,
"step": 76200
},
{
"epoch": 2.033365312866432,
"grad_norm": 13346.1015625,
"learning_rate": 1.712951813174882e-05,
"loss": 0.1643,
"step": 76300
},
{
"epoch": 2.036030273958,
"grad_norm": 10167.767578125,
"learning_rate": 1.710977752334327e-05,
"loss": 0.1692,
"step": 76400
},
{
"epoch": 2.0386952350495684,
"grad_norm": 26428.076171875,
"learning_rate": 1.709003691493772e-05,
"loss": 0.1708,
"step": 76500
},
{
"epoch": 2.0386952350495684,
"eval_dev_accuracy": 0.9733277043581837,
"eval_dev_accuracy_threshold": 0.9573899507522583,
"eval_dev_average_precision": 0.8690568245333676,
"eval_dev_f1": 0.8137024870952604,
"eval_dev_f1_threshold": 0.8371973037719727,
"eval_dev_precision": 0.7762634301631516,
"eval_dev_recall": 0.8549359044592966,
"eval_loss": 0.21817246079444885,
"eval_runtime": 861.7458,
"eval_samples_per_second": 153.928,
"eval_steps_per_second": 2.406,
"step": 76500
},
{
"epoch": 2.0413601961411363,
"grad_norm": 22541.1796875,
"learning_rate": 1.7070296306532168e-05,
"loss": 0.165,
"step": 76600
},
{
"epoch": 2.0440251572327046,
"grad_norm": 49104.6015625,
"learning_rate": 1.7050555698126618e-05,
"loss": 0.1445,
"step": 76700
},
{
"epoch": 2.0466901183242725,
"grad_norm": 47796.04296875,
"learning_rate": 1.7030815089721068e-05,
"loss": 0.1354,
"step": 76800
},
{
"epoch": 2.0493550794158404,
"grad_norm": 21167.962890625,
"learning_rate": 1.7011074481315514e-05,
"loss": 0.1787,
"step": 76900
},
{
"epoch": 2.0520200405074087,
"grad_norm": 75447.2890625,
"learning_rate": 1.6991333872909964e-05,
"loss": 0.1626,
"step": 77000
},
{
"epoch": 2.0520200405074087,
"eval_dev_accuracy": 0.9745339133188086,
"eval_dev_accuracy_threshold": 0.9593422412872314,
"eval_dev_average_precision": 0.8806603026145806,
"eval_dev_f1": 0.8148537765621713,
"eval_dev_f1_threshold": 0.781623363494873,
"eval_dev_precision": 0.7836115326251897,
"eval_dev_recall": 0.848690697929221,
"eval_loss": 0.2216637134552002,
"eval_runtime": 862.3061,
"eval_samples_per_second": 153.828,
"eval_steps_per_second": 2.404,
"step": 77000
},
{
"epoch": 2.0546850015989766,
"grad_norm": 4420.5458984375,
"learning_rate": 1.6971593264504414e-05,
"loss": 0.1418,
"step": 77100
},
{
"epoch": 2.057349962690545,
"grad_norm": 14327.546875,
"learning_rate": 1.695185265609886e-05,
"loss": 0.2011,
"step": 77200
},
{
"epoch": 2.060014923782113,
"grad_norm": 19713.06640625,
"learning_rate": 1.693211204769331e-05,
"loss": 0.1593,
"step": 77300
},
{
"epoch": 2.0626798848736807,
"grad_norm": 5675.8125,
"learning_rate": 1.691237143928776e-05,
"loss": 0.1546,
"step": 77400
},
{
"epoch": 2.065344845965249,
"grad_norm": 7002.0654296875,
"learning_rate": 1.6892630830882208e-05,
"loss": 0.177,
"step": 77500
},
{
"epoch": 2.065344845965249,
"eval_dev_accuracy": 0.9752048670531561,
"eval_dev_accuracy_threshold": 0.8865873217582703,
"eval_dev_average_precision": 0.8890707955101652,
"eval_dev_f1": 0.8212508115126596,
"eval_dev_f1_threshold": 0.8439962863922119,
"eval_dev_precision": 0.8111574222507214,
"eval_dev_recall": 0.8315985537416457,
"eval_loss": 0.21185144782066345,
"eval_runtime": 860.1662,
"eval_samples_per_second": 154.211,
"eval_steps_per_second": 2.41,
"step": 77500
},
{
"epoch": 2.068009807056817,
"grad_norm": 418.3937683105469,
"learning_rate": 1.6872890222476658e-05,
"loss": 0.1546,
"step": 77600
},
{
"epoch": 2.0706747681483852,
"grad_norm": 47829.74609375,
"learning_rate": 1.6853149614071108e-05,
"loss": 0.1766,
"step": 77700
},
{
"epoch": 2.073339729239953,
"grad_norm": 395.5926208496094,
"learning_rate": 1.6833409005665554e-05,
"loss": 0.1879,
"step": 77800
},
{
"epoch": 2.076004690331521,
"grad_norm": 13378.1806640625,
"learning_rate": 1.6813668397260004e-05,
"loss": 0.1694,
"step": 77900
},
{
"epoch": 2.0786696514230893,
"grad_norm": 4878.7451171875,
"learning_rate": 1.6793927788854454e-05,
"loss": 0.1546,
"step": 78000
},
{
"epoch": 2.0786696514230893,
"eval_dev_accuracy": 0.9736971058523751,
"eval_dev_accuracy_threshold": 0.9617332220077515,
"eval_dev_average_precision": 0.8737670860803924,
"eval_dev_f1": 0.8101625374783019,
"eval_dev_f1_threshold": 0.8637624979019165,
"eval_dev_precision": 0.7791380008093889,
"eval_dev_recall": 0.8437602717212666,
"eval_loss": 0.24948453903198242,
"eval_runtime": 861.1759,
"eval_samples_per_second": 154.03,
"eval_steps_per_second": 2.407,
"step": 78000
},
{
"epoch": 2.0813346125146572,
"grad_norm": 26331.390625,
"learning_rate": 1.67741871804489e-05,
"loss": 0.1742,
"step": 78100
},
{
"epoch": 2.0839995736062256,
"grad_norm": 5203.9365234375,
"learning_rate": 1.675444657204335e-05,
"loss": 0.2024,
"step": 78200
},
{
"epoch": 2.0866645346977934,
"grad_norm": 27641.3671875,
"learning_rate": 1.67347059636378e-05,
"loss": 0.2126,
"step": 78300
},
{
"epoch": 2.0893294957893613,
"grad_norm": 3783.3671875,
"learning_rate": 1.6714965355232248e-05,
"loss": 0.1747,
"step": 78400
},
{
"epoch": 2.0919944568809297,
"grad_norm": 20038.98046875,
"learning_rate": 1.6695224746826698e-05,
"loss": 0.1807,
"step": 78500
},
{
"epoch": 2.0919944568809297,
"eval_dev_accuracy": 0.9743303655567032,
"eval_dev_accuracy_threshold": 0.9270470142364502,
"eval_dev_average_precision": 0.8818386397835865,
"eval_dev_f1": 0.816217350257002,
"eval_dev_f1_threshold": 0.7469815015792847,
"eval_dev_precision": 0.7828755407988731,
"eval_dev_recall": 0.8525254738687411,
"eval_loss": 0.21055419743061066,
"eval_runtime": 861.287,
"eval_samples_per_second": 154.01,
"eval_steps_per_second": 2.407,
"step": 78500
},
{
"epoch": 2.0946594179724976,
"grad_norm": 18032.57421875,
"learning_rate": 1.6675484138421148e-05,
"loss": 0.1805,
"step": 78600
},
{
"epoch": 2.097324379064066,
"grad_norm": 13172.416015625,
"learning_rate": 1.6655743530015594e-05,
"loss": 0.1498,
"step": 78700
},
{
"epoch": 2.0999893401556338,
"grad_norm": 10491.02734375,
"learning_rate": 1.6636002921610045e-05,
"loss": 0.1899,
"step": 78800
},
{
"epoch": 2.1026543012472017,
"grad_norm": 3893.85107421875,
"learning_rate": 1.6616262313204495e-05,
"loss": 0.1924,
"step": 78900
},
{
"epoch": 2.10531926233877,
"grad_norm": 1639.23486328125,
"learning_rate": 1.659652170479894e-05,
"loss": 0.1521,
"step": 79000
},
{
"epoch": 2.10531926233877,
"eval_dev_accuracy": 0.9743982148107383,
"eval_dev_accuracy_threshold": 0.9525002837181091,
"eval_dev_average_precision": 0.883524287942099,
"eval_dev_f1": 0.8129610403803071,
"eval_dev_f1_threshold": 0.9087203145027161,
"eval_dev_precision": 0.8108785698713756,
"eval_dev_recall": 0.8150542346882875,
"eval_loss": 0.24836769700050354,
"eval_runtime": 952.3381,
"eval_samples_per_second": 139.286,
"eval_steps_per_second": 2.177,
"step": 79000
},
{
"epoch": 2.107984223430338,
"grad_norm": 7783.5283203125,
"learning_rate": 1.657678109639339e-05,
"loss": 0.1988,
"step": 79100
},
{
"epoch": 2.1106491845219058,
"grad_norm": 1583.300537109375,
"learning_rate": 1.655704048798784e-05,
"loss": 0.1702,
"step": 79200
},
{
"epoch": 2.113314145613474,
"grad_norm": 1492.0706787109375,
"learning_rate": 1.6537299879582288e-05,
"loss": 0.1824,
"step": 79300
},
{
"epoch": 2.115979106705042,
"grad_norm": 18683.794921875,
"learning_rate": 1.651755927117674e-05,
"loss": 0.1688,
"step": 79400
},
{
"epoch": 2.1186440677966103,
"grad_norm": 8736.2275390625,
"learning_rate": 1.6497818662771188e-05,
"loss": 0.1809,
"step": 79500
},
{
"epoch": 2.1186440677966103,
"eval_dev_accuracy": 0.9739685028685157,
"eval_dev_accuracy_threshold": 0.9717953205108643,
"eval_dev_average_precision": 0.8798479877006415,
"eval_dev_f1": 0.8135902528044657,
"eval_dev_f1_threshold": 0.9465633630752563,
"eval_dev_precision": 0.7974537037037037,
"eval_dev_recall": 0.8303933384463679,
"eval_loss": 0.22024385631084442,
"eval_runtime": 951.2023,
"eval_samples_per_second": 139.452,
"eval_steps_per_second": 2.179,
"step": 79500
},
{
"epoch": 2.121309028888178,
"grad_norm": 54950.51953125,
"learning_rate": 1.6478078054365635e-05,
"loss": 0.1858,
"step": 79600
},
{
"epoch": 2.1239739899797465,
"grad_norm": 19716.146484375,
"learning_rate": 1.6458337445960088e-05,
"loss": 0.1642,
"step": 79700
},
{
"epoch": 2.1266389510713144,
"grad_norm": 18239.75,
"learning_rate": 1.6438596837554535e-05,
"loss": 0.191,
"step": 79800
},
{
"epoch": 2.1293039121628823,
"grad_norm": 41301.21875,
"learning_rate": 1.641885622914898e-05,
"loss": 0.1655,
"step": 79900
},
{
"epoch": 2.1319688732544506,
"grad_norm": 1119.526123046875,
"learning_rate": 1.6399115620743435e-05,
"loss": 0.1789,
"step": 80000
},
{
"epoch": 2.1319688732544506,
"eval_dev_accuracy": 0.9743152879446954,
"eval_dev_accuracy_threshold": 0.8854852914810181,
"eval_dev_average_precision": 0.8771901487923467,
"eval_dev_f1": 0.813726025900224,
"eval_dev_f1_threshold": 0.8826526403427124,
"eval_dev_precision": 0.8116415958142577,
"eval_dev_recall": 0.8158211898761916,
"eval_loss": 0.1959654837846756,
"eval_runtime": 952.4132,
"eval_samples_per_second": 139.275,
"eval_steps_per_second": 2.177,
"step": 80000
},
{
"epoch": 2.1346338343460185,
"grad_norm": 3469.789794921875,
"learning_rate": 1.637937501233788e-05,
"loss": 0.2002,
"step": 80100
},
{
"epoch": 2.1372987954375864,
"grad_norm": 15840.623046875,
"learning_rate": 1.635963440393233e-05,
"loss": 0.2139,
"step": 80200
},
{
"epoch": 2.1399637565291547,
"grad_norm": 24576.1328125,
"learning_rate": 1.633989379552678e-05,
"loss": 0.199,
"step": 80300
},
{
"epoch": 2.1426287176207226,
"grad_norm": 9852.4111328125,
"learning_rate": 1.6320153187121228e-05,
"loss": 0.165,
"step": 80400
},
{
"epoch": 2.145293678712291,
"grad_norm": 280.64031982421875,
"learning_rate": 1.6300412578715678e-05,
"loss": 0.1848,
"step": 80500
},
{
"epoch": 2.145293678712291,
"eval_dev_accuracy": 0.9742022058546368,
"eval_dev_accuracy_threshold": 0.9753606915473938,
"eval_dev_average_precision": 0.8782336024461705,
"eval_dev_f1": 0.8096592433592701,
"eval_dev_f1_threshold": 0.8103638887405396,
"eval_dev_precision": 0.7934371055952881,
"eval_dev_recall": 0.8265585625068478,
"eval_loss": 0.26615819334983826,
"eval_runtime": 951.0255,
"eval_samples_per_second": 139.478,
"eval_steps_per_second": 2.18,
"step": 80500
},
{
"epoch": 2.147958639803859,
"grad_norm": 3749.137939453125,
"learning_rate": 1.6280671970310128e-05,
"loss": 0.2118,
"step": 80600
},
{
"epoch": 2.150623600895427,
"grad_norm": 16408.94140625,
"learning_rate": 1.6260931361904575e-05,
"loss": 0.177,
"step": 80700
},
{
"epoch": 2.153288561986995,
"grad_norm": 122466.71875,
"learning_rate": 1.6241190753499025e-05,
"loss": 0.169,
"step": 80800
},
{
"epoch": 2.155953523078563,
"grad_norm": 35088.30078125,
"learning_rate": 1.6221450145093475e-05,
"loss": 0.1748,
"step": 80900
},
{
"epoch": 2.1586184841701312,
"grad_norm": 2193.1103515625,
"learning_rate": 1.620170953668792e-05,
"loss": 0.1532,
"step": 81000
},
{
"epoch": 2.1586184841701312,
"eval_dev_accuracy": 0.9747299222749101,
"eval_dev_accuracy_threshold": 0.7087757587432861,
"eval_dev_average_precision": 0.8839240203558189,
"eval_dev_f1": 0.8178559791463017,
"eval_dev_f1_threshold": 0.6686054468154907,
"eval_dev_precision": 0.8108108108108109,
"eval_dev_recall": 0.8250246521310398,
"eval_loss": 0.2607557475566864,
"eval_runtime": 952.9522,
"eval_samples_per_second": 139.196,
"eval_steps_per_second": 2.175,
"step": 81000
},
{
"epoch": 2.161283445261699,
"grad_norm": 2420.868896484375,
"learning_rate": 1.618196892828237e-05,
"loss": 0.1618,
"step": 81100
},
{
"epoch": 2.163948406353267,
"grad_norm": 706.0858764648438,
"learning_rate": 1.616222831987682e-05,
"loss": 0.1679,
"step": 81200
},
{
"epoch": 2.1666133674448353,
"grad_norm": 23174.521484375,
"learning_rate": 1.6142487711471268e-05,
"loss": 0.1808,
"step": 81300
},
{
"epoch": 2.1692783285364032,
"grad_norm": 15347.12890625,
"learning_rate": 1.6122747103065718e-05,
"loss": 0.1685,
"step": 81400
},
{
"epoch": 2.1719432896279716,
"grad_norm": 19526.70703125,
"learning_rate": 1.6103006494660168e-05,
"loss": 0.1901,
"step": 81500
},
{
"epoch": 2.1719432896279716,
"eval_dev_accuracy": 0.9744434476467617,
"eval_dev_accuracy_threshold": 0.9750630855560303,
"eval_dev_average_precision": 0.8830413621285588,
"eval_dev_f1": 0.8129956790461085,
"eval_dev_f1_threshold": 0.9695107936859131,
"eval_dev_precision": 0.8117081695063346,
"eval_dev_recall": 0.8142872795003835,
"eval_loss": 0.23483458161354065,
"eval_runtime": 950.8404,
"eval_samples_per_second": 139.505,
"eval_steps_per_second": 2.18,
"step": 81500
},
{
"epoch": 2.1746082507195394,
"grad_norm": 1307.5916748046875,
"learning_rate": 1.6083265886254615e-05,
"loss": 0.184,
"step": 81600
},
{
"epoch": 2.177273211811108,
"grad_norm": 40642.421875,
"learning_rate": 1.6063525277849065e-05,
"loss": 0.1667,
"step": 81700
},
{
"epoch": 2.1799381729026757,
"grad_norm": 1084.0020751953125,
"learning_rate": 1.6043784669443515e-05,
"loss": 0.1816,
"step": 81800
},
{
"epoch": 2.1826031339942435,
"grad_norm": 14024.021484375,
"learning_rate": 1.602404406103796e-05,
"loss": 0.159,
"step": 81900
},
{
"epoch": 2.185268095085812,
"grad_norm": 8854.5498046875,
"learning_rate": 1.600430345263241e-05,
"loss": 0.1553,
"step": 82000
},
{
"epoch": 2.185268095085812,
"eval_dev_accuracy": 0.9748957760069961,
"eval_dev_accuracy_threshold": 0.8661369681358337,
"eval_dev_average_precision": 0.8869519261803035,
"eval_dev_f1": 0.8157429896224332,
"eval_dev_f1_threshold": 0.8445290327072144,
"eval_dev_precision": 0.8220046723773501,
"eval_dev_recall": 0.8095759833461159,
"eval_loss": 0.23748071491718292,
"eval_runtime": 951.5083,
"eval_samples_per_second": 139.407,
"eval_steps_per_second": 2.179,
"step": 82000
},
{
"epoch": 2.1879330561773798,
"grad_norm": 44325.265625,
"learning_rate": 1.598456284422686e-05,
"loss": 0.1572,
"step": 82100
},
{
"epoch": 2.1905980172689477,
"grad_norm": 1203.1580810546875,
"learning_rate": 1.5964822235821308e-05,
"loss": 0.1629,
"step": 82200
},
{
"epoch": 2.193262978360516,
"grad_norm": 745.87353515625,
"learning_rate": 1.5945081627415758e-05,
"loss": 0.194,
"step": 82300
},
{
"epoch": 2.195927939452084,
"grad_norm": 17854.037109375,
"learning_rate": 1.5925341019010208e-05,
"loss": 0.1685,
"step": 82400
},
{
"epoch": 2.198592900543652,
"grad_norm": 44721.08203125,
"learning_rate": 1.5905600410604655e-05,
"loss": 0.1859,
"step": 82500
},
{
"epoch": 2.198592900543652,
"eval_dev_accuracy": 0.974345443168711,
"eval_dev_accuracy_threshold": 0.9739015102386475,
"eval_dev_average_precision": 0.8797686946603407,
"eval_dev_f1": 0.8160733549083065,
"eval_dev_f1_threshold": 0.9577875137329102,
"eval_dev_precision": 0.8036757675555083,
"eval_dev_recall": 0.8288594280705599,
"eval_loss": 0.2292918860912323,
"eval_runtime": 950.1815,
"eval_samples_per_second": 139.602,
"eval_steps_per_second": 2.182,
"step": 82500
},
{
"epoch": 2.20125786163522,
"grad_norm": 170.60641479492188,
"learning_rate": 1.5885859802199105e-05,
"loss": 0.1483,
"step": 82600
},
{
"epoch": 2.2039228227267884,
"grad_norm": 27626.072265625,
"learning_rate": 1.5866119193793555e-05,
"loss": 0.2056,
"step": 82700
},
{
"epoch": 2.2065877838183563,
"grad_norm": 731.1361083984375,
"learning_rate": 1.5846378585388e-05,
"loss": 0.1799,
"step": 82800
},
{
"epoch": 2.209252744909924,
"grad_norm": 36164.07421875,
"learning_rate": 1.582663797698245e-05,
"loss": 0.1645,
"step": 82900
},
{
"epoch": 2.2119177060014925,
"grad_norm": 6034.74853515625,
"learning_rate": 1.58068973685769e-05,
"loss": 0.1633,
"step": 83000
},
{
"epoch": 2.2119177060014925,
"eval_dev_accuracy": 0.9737423386883985,
"eval_dev_accuracy_threshold": 0.950665295124054,
"eval_dev_average_precision": 0.883874392367785,
"eval_dev_f1": 0.8089262330859885,
"eval_dev_f1_threshold": 0.9107600450515747,
"eval_dev_precision": 0.8056732963808282,
"eval_dev_recall": 0.8122055439903583,
"eval_loss": 0.23654605448246002,
"eval_runtime": 951.9974,
"eval_samples_per_second": 139.335,
"eval_steps_per_second": 2.178,
"step": 83000
},
{
"epoch": 2.2145826670930604,
"grad_norm": 10695.716796875,
"learning_rate": 1.5787156760171348e-05,
"loss": 0.1714,
"step": 83100
},
{
"epoch": 2.2172476281846283,
"grad_norm": 63246.39453125,
"learning_rate": 1.5767416151765798e-05,
"loss": 0.1793,
"step": 83200
},
{
"epoch": 2.2199125892761966,
"grad_norm": 1381.2412109375,
"learning_rate": 1.5747675543360248e-05,
"loss": 0.154,
"step": 83300
},
{
"epoch": 2.2225775503677645,
"grad_norm": 31067.8828125,
"learning_rate": 1.5727934934954695e-05,
"loss": 0.151,
"step": 83400
},
{
"epoch": 2.225242511459333,
"grad_norm": 33396.78125,
"learning_rate": 1.5708194326549148e-05,
"loss": 0.1841,
"step": 83500
},
{
"epoch": 2.225242511459333,
"eval_dev_accuracy": 0.9747902327229413,
"eval_dev_accuracy_threshold": 0.9669053554534912,
"eval_dev_average_precision": 0.8854411022874333,
"eval_dev_f1": 0.8149101635827299,
"eval_dev_f1_threshold": 0.9264481067657471,
"eval_dev_precision": 0.7981718848497583,
"eval_dev_recall": 0.8323655089295496,
"eval_loss": 0.20306049287319183,
"eval_runtime": 951.0541,
"eval_samples_per_second": 139.474,
"eval_steps_per_second": 2.18,
"step": 83500
},
{
"epoch": 2.2279074725509007,
"grad_norm": 1647.4901123046875,
"learning_rate": 1.5688453718143595e-05,
"loss": 0.1709,
"step": 83600
},
{
"epoch": 2.230572433642469,
"grad_norm": 310.0802307128906,
"learning_rate": 1.566871310973804e-05,
"loss": 0.1875,
"step": 83700
},
{
"epoch": 2.233237394734037,
"grad_norm": 14275.015625,
"learning_rate": 1.5648972501332495e-05,
"loss": 0.2041,
"step": 83800
},
{
"epoch": 2.235902355825605,
"grad_norm": 28323.603515625,
"learning_rate": 1.562923189292694e-05,
"loss": 0.1812,
"step": 83900
},
{
"epoch": 2.238567316917173,
"grad_norm": 25161.5546875,
"learning_rate": 1.5609491284521388e-05,
"loss": 0.1779,
"step": 84000
},
{
"epoch": 2.238567316917173,
"eval_dev_accuracy": 0.975137017799121,
"eval_dev_accuracy_threshold": 0.9797601699829102,
"eval_dev_average_precision": 0.8889116324411686,
"eval_dev_f1": 0.8158041179744018,
"eval_dev_f1_threshold": 0.9772592782974243,
"eval_dev_precision": 0.8289042180255569,
"eval_dev_recall": 0.8031116467623535,
"eval_loss": 0.2351406365633011,
"eval_runtime": 953.0513,
"eval_samples_per_second": 139.181,
"eval_steps_per_second": 2.175,
"step": 84000
},
{
"epoch": 2.241232278008741,
"grad_norm": 1789.634033203125,
"learning_rate": 1.558975067611584e-05,
"loss": 0.224,
"step": 84100
},
{
"epoch": 2.243897239100309,
"grad_norm": 5931.00048828125,
"learning_rate": 1.5570010067710288e-05,
"loss": 0.1624,
"step": 84200
},
{
"epoch": 2.2465622001918772,
"grad_norm": 18578.33203125,
"learning_rate": 1.5550269459304738e-05,
"loss": 0.1361,
"step": 84300
},
{
"epoch": 2.249227161283445,
"grad_norm": 1247.7115478515625,
"learning_rate": 1.5530528850899188e-05,
"loss": 0.1371,
"step": 84400
},
{
"epoch": 2.2518921223750135,
"grad_norm": 713.0791625976562,
"learning_rate": 1.5510788242493635e-05,
"loss": 0.2314,
"step": 84500
},
{
"epoch": 2.2518921223750135,
"eval_dev_accuracy": 0.974737461080914,
"eval_dev_accuracy_threshold": 0.9574118256568909,
"eval_dev_average_precision": 0.8861569751582977,
"eval_dev_f1": 0.8148996509598603,
"eval_dev_f1_threshold": 0.832693338394165,
"eval_dev_precision": 0.8112715821478987,
"eval_dev_recall": 0.8185603155472773,
"eval_loss": 0.2932807505130768,
"eval_runtime": 951.2089,
"eval_samples_per_second": 139.451,
"eval_steps_per_second": 2.179,
"step": 84500
},
{
"epoch": 2.2545570834665813,
"grad_norm": 2987.9375,
"learning_rate": 1.5491047634088085e-05,
"loss": 0.1889,
"step": 84600
},
{
"epoch": 2.2572220445581497,
"grad_norm": 2279.125,
"learning_rate": 1.5471307025682535e-05,
"loss": 0.2079,
"step": 84700
},
{
"epoch": 2.2598870056497176,
"grad_norm": 1106.462890625,
"learning_rate": 1.545156641727698e-05,
"loss": 0.1783,
"step": 84800
},
{
"epoch": 2.2625519667412854,
"grad_norm": 7212.99560546875,
"learning_rate": 1.543182580887143e-05,
"loss": 0.1551,
"step": 84900
},
{
"epoch": 2.2652169278328538,
"grad_norm": 22761.849609375,
"learning_rate": 1.541208520046588e-05,
"loss": 0.1606,
"step": 85000
},
{
"epoch": 2.2652169278328538,
"eval_dev_accuracy": 0.9748203879469569,
"eval_dev_accuracy_threshold": 0.9453166723251343,
"eval_dev_average_precision": 0.8915161607864528,
"eval_dev_f1": 0.8188866156993647,
"eval_dev_f1_threshold": 0.8840415477752686,
"eval_dev_precision": 0.8050174658621785,
"eval_dev_recall": 0.8332420291442971,
"eval_loss": 0.2096114605665207,
"eval_runtime": 952.4626,
"eval_samples_per_second": 139.267,
"eval_steps_per_second": 2.176,
"step": 85000
},
{
"epoch": 2.2678818889244217,
"grad_norm": 661.853271484375,
"learning_rate": 1.5392344592060328e-05,
"loss": 0.1887,
"step": 85100
},
{
"epoch": 2.2705468500159895,
"grad_norm": 26199.923828125,
"learning_rate": 1.5372603983654778e-05,
"loss": 0.1829,
"step": 85200
},
{
"epoch": 2.273211811107558,
"grad_norm": 11920.501953125,
"learning_rate": 1.5352863375249228e-05,
"loss": 0.18,
"step": 85300
},
{
"epoch": 2.2758767721991258,
"grad_norm": 13859.2724609375,
"learning_rate": 1.5333122766843675e-05,
"loss": 0.1935,
"step": 85400
},
{
"epoch": 2.278541733290694,
"grad_norm": 476.45367431640625,
"learning_rate": 1.5313382158438125e-05,
"loss": 0.1934,
"step": 85500
},
{
"epoch": 2.278541733290694,
"eval_dev_accuracy": 0.9749636252610312,
"eval_dev_accuracy_threshold": 0.9518921375274658,
"eval_dev_average_precision": 0.8866670870419442,
"eval_dev_f1": 0.8237035470740602,
"eval_dev_f1_threshold": 0.6820048093795776,
"eval_dev_precision": 0.7958120531154239,
"eval_dev_recall": 0.8536211241371754,
"eval_loss": 0.23012706637382507,
"eval_runtime": 952.604,
"eval_samples_per_second": 139.247,
"eval_steps_per_second": 2.176,
"step": 85500
}
],
"logging_steps": 100,
"max_steps": 112572,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}