{ "best_global_step": 69500, "best_metric": 0.9733653983882032, "best_model_checkpoint": "./results/checkpoint-69500", "epoch": 2.278541733290694, "eval_steps": 500, "global_step": 85500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.332498301064666e-05, "grad_norm": 2.171241283416748, "learning_rate": 0.0, "loss": 1.1419, "step": 1 }, { "epoch": 0.0013324983010646661, "grad_norm": 3.923346757888794, "learning_rate": 1.319120586275816e-07, "loss": 0.9369, "step": 100 }, { "epoch": 0.0026649966021293323, "grad_norm": 5.213994026184082, "learning_rate": 2.651565622918055e-07, "loss": 0.9031, "step": 200 }, { "epoch": 0.003997494903193999, "grad_norm": 3.4589016437530518, "learning_rate": 3.984010659560293e-07, "loss": 0.8309, "step": 300 }, { "epoch": 0.0053299932042586646, "grad_norm": 2.302459239959717, "learning_rate": 5.316455696202532e-07, "loss": 0.7406, "step": 400 }, { "epoch": 0.0066624915053233305, "grad_norm": 2.8590707778930664, "learning_rate": 6.64890073284477e-07, "loss": 0.7311, "step": 500 }, { "epoch": 0.0066624915053233305, "eval_dev_accuracy": 0.9312310116323776, "eval_dev_accuracy_threshold": 0.48881804943084717, "eval_dev_average_precision": 0.1090500582180461, "eval_dev_f1": 0.1867953275774505, "eval_dev_f1_threshold": 0.31155017018318176, "eval_dev_precision": 0.17409826753763136, "eval_dev_recall": 0.20149008436507068, "eval_loss": 0.7330209612846375, "eval_runtime": 567.4178, "eval_samples_per_second": 233.773, "eval_steps_per_second": 7.307, "step": 500 }, { "epoch": 0.007994989806387997, "grad_norm": 2.162013530731201, "learning_rate": 7.981345769487009e-07, "loss": 0.725, "step": 600 }, { "epoch": 0.009327488107452663, "grad_norm": 3.412961959838867, "learning_rate": 9.313790806129248e-07, "loss": 0.6892, "step": 700 }, { "epoch": 0.010659986408517329, "grad_norm": 3.037612199783325, "learning_rate": 1.0646235842771487e-06, "loss": 0.74, "step": 800 }, { "epoch": 0.011992484709581995, "grad_norm": 3.178318977355957, "learning_rate": 1.1978680879413725e-06, "loss": 0.6857, "step": 900 }, { "epoch": 0.013324983010646661, "grad_norm": 3.9319422245025635, "learning_rate": 1.3311125916055965e-06, "loss": 0.6784, "step": 1000 }, { "epoch": 0.013324983010646661, "eval_dev_accuracy": 0.9312913220804089, "eval_dev_accuracy_threshold": 0.5571334362030029, "eval_dev_average_precision": 0.2217988734731733, "eval_dev_f1": 0.3029693004529441, "eval_dev_f1_threshold": 0.3375406265258789, "eval_dev_precision": 0.2600942655145326, "eval_dev_recall": 0.36276980387860197, "eval_loss": 0.6632949113845825, "eval_runtime": 567.4434, "eval_samples_per_second": 233.763, "eval_steps_per_second": 7.306, "step": 1000 }, { "epoch": 0.014657481311711327, "grad_norm": 3.4704461097717285, "learning_rate": 1.4643570952698202e-06, "loss": 0.6753, "step": 1100 }, { "epoch": 0.015989979612775995, "grad_norm": 5.541119575500488, "learning_rate": 1.597601598934044e-06, "loss": 0.6707, "step": 1200 }, { "epoch": 0.01732247791384066, "grad_norm": 4.9743475914001465, "learning_rate": 1.7308461025982678e-06, "loss": 0.6679, "step": 1300 }, { "epoch": 0.018654976214905326, "grad_norm": 7.222622394561768, "learning_rate": 1.864090606262492e-06, "loss": 0.5831, "step": 1400 }, { "epoch": 0.01998747451596999, "grad_norm": 3.6720590591430664, "learning_rate": 1.9973351099267156e-06, "loss": 0.5589, "step": 1500 }, { "epoch": 0.01998747451596999, "eval_dev_accuracy": 0.9353095056804903, "eval_dev_accuracy_threshold": 0.5631594657897949, "eval_dev_average_precision": 0.338116839920073, "eval_dev_f1": 0.39591571740541814, "eval_dev_f1_threshold": 0.4508041739463806, "eval_dev_precision": 0.4291197543500512, "eval_dev_recall": 0.3674811000328695, "eval_loss": 0.6119648814201355, "eval_runtime": 567.5553, "eval_samples_per_second": 233.716, "eval_steps_per_second": 7.305, "step": 1500 }, { "epoch": 0.021319972817034658, "grad_norm": 10.199407577514648, "learning_rate": 2.1305796135909398e-06, "loss": 0.6065, "step": 1600 }, { "epoch": 0.022652471118099326, "grad_norm": 6.087101459503174, "learning_rate": 2.2638241172551636e-06, "loss": 0.5724, "step": 1700 }, { "epoch": 0.02398496941916399, "grad_norm": 16.529647827148438, "learning_rate": 2.3970686209193873e-06, "loss": 0.5568, "step": 1800 }, { "epoch": 0.025317467720228658, "grad_norm": 14.971884727478027, "learning_rate": 2.530313124583611e-06, "loss": 0.5603, "step": 1900 }, { "epoch": 0.026649966021293322, "grad_norm": 4.663777828216553, "learning_rate": 2.663557628247835e-06, "loss": 0.5553, "step": 2000 }, { "epoch": 0.026649966021293322, "eval_dev_accuracy": 0.9353848937405294, "eval_dev_accuracy_threshold": 0.6437499523162842, "eval_dev_average_precision": 0.38323093653846474, "eval_dev_f1": 0.4514054443643622, "eval_dev_f1_threshold": 0.6111855506896973, "eval_dev_precision": 0.4569023569023569, "eval_dev_recall": 0.44603922427960996, "eval_loss": 0.5613667964935303, "eval_runtime": 568.0344, "eval_samples_per_second": 233.519, "eval_steps_per_second": 7.299, "step": 2000 }, { "epoch": 0.02798246432235799, "grad_norm": 5.051695823669434, "learning_rate": 2.7968021319120587e-06, "loss": 0.5506, "step": 2100 }, { "epoch": 0.029314962623422654, "grad_norm": 12.604368209838867, "learning_rate": 2.930046635576283e-06, "loss": 0.5446, "step": 2200 }, { "epoch": 0.03064746092448732, "grad_norm": 3.9183976650238037, "learning_rate": 3.0632911392405066e-06, "loss": 0.5432, "step": 2300 }, { "epoch": 0.03197995922555199, "grad_norm": 5.165050983428955, "learning_rate": 3.1965356429047304e-06, "loss": 0.5091, "step": 2400 }, { "epoch": 0.03331245752661666, "grad_norm": 10.820756912231445, "learning_rate": 3.3297801465689546e-06, "loss": 0.5099, "step": 2500 }, { "epoch": 0.03331245752661666, "eval_dev_accuracy": 0.9395613922666928, "eval_dev_accuracy_threshold": 0.700435996055603, "eval_dev_average_precision": 0.4498892909951412, "eval_dev_f1": 0.4910784423745932, "eval_dev_f1_threshold": 0.5620608925819397, "eval_dev_precision": 0.5032777458309373, "eval_dev_recall": 0.47945655746685656, "eval_loss": 0.5332435369491577, "eval_runtime": 567.4907, "eval_samples_per_second": 233.743, "eval_steps_per_second": 7.306, "step": 2500 }, { "epoch": 0.03464495582768132, "grad_norm": 5.984354496002197, "learning_rate": 3.4630246502331784e-06, "loss": 0.5168, "step": 2600 }, { "epoch": 0.035977454128745985, "grad_norm": 11.091134071350098, "learning_rate": 3.596269153897402e-06, "loss": 0.4763, "step": 2700 }, { "epoch": 0.03730995242981065, "grad_norm": 25.33905601501465, "learning_rate": 3.729513657561626e-06, "loss": 0.4916, "step": 2800 }, { "epoch": 0.03864245073087532, "grad_norm": 7.44692325592041, "learning_rate": 3.862758161225849e-06, "loss": 0.4842, "step": 2900 }, { "epoch": 0.03997494903193998, "grad_norm": 11.449934005737305, "learning_rate": 3.996002664890073e-06, "loss": 0.5246, "step": 3000 }, { "epoch": 0.03997494903193998, "eval_dev_accuracy": 0.9320828967108189, "eval_dev_accuracy_threshold": 0.7955138683319092, "eval_dev_average_precision": 0.40640646661588725, "eval_dev_f1": 0.5273073175258689, "eval_dev_f1_threshold": 0.7331215143203735, "eval_dev_precision": 0.45734063103670314, "eval_dev_recall": 0.6225484825243782, "eval_loss": 0.4692871868610382, "eval_runtime": 565.8018, "eval_samples_per_second": 234.441, "eval_steps_per_second": 7.328, "step": 3000 }, { "epoch": 0.04130744733300465, "grad_norm": 2.739481210708618, "learning_rate": 4.129247168554298e-06, "loss": 0.4399, "step": 3100 }, { "epoch": 0.042639945634069316, "grad_norm": 18.604293823242188, "learning_rate": 4.2624916722185215e-06, "loss": 0.4532, "step": 3200 }, { "epoch": 0.043972443935133984, "grad_norm": 2.9506380558013916, "learning_rate": 4.395736175882745e-06, "loss": 0.5107, "step": 3300 }, { "epoch": 0.04530494223619865, "grad_norm": 6.515221118927002, "learning_rate": 4.528980679546969e-06, "loss": 0.4249, "step": 3400 }, { "epoch": 0.04663744053726331, "grad_norm": 8.708155632019043, "learning_rate": 4.662225183211193e-06, "loss": 0.4526, "step": 3500 }, { "epoch": 0.04663744053726331, "eval_dev_accuracy": 0.9425166042202237, "eval_dev_accuracy_threshold": 0.8969273567199707, "eval_dev_average_precision": 0.5181562757024104, "eval_dev_f1": 0.5669769324160259, "eval_dev_f1_threshold": 0.7522543668746948, "eval_dev_precision": 0.5266422328728503, "eval_dev_recall": 0.6140024104305906, "eval_loss": 0.44245800375938416, "eval_runtime": 566.66, "eval_samples_per_second": 234.086, "eval_steps_per_second": 7.317, "step": 3500 }, { "epoch": 0.04796993883832798, "grad_norm": 20.27404022216797, "learning_rate": 4.795469686875417e-06, "loss": 0.4791, "step": 3600 }, { "epoch": 0.04930243713939265, "grad_norm": 26.697437286376953, "learning_rate": 4.92871419053964e-06, "loss": 0.4151, "step": 3700 }, { "epoch": 0.050634935440457315, "grad_norm": 29.9031982421875, "learning_rate": 5.061958694203864e-06, "loss": 0.4842, "step": 3800 }, { "epoch": 0.051967433741521976, "grad_norm": 33.03110885620117, "learning_rate": 5.195203197868088e-06, "loss": 0.4062, "step": 3900 }, { "epoch": 0.053299932042586644, "grad_norm": 18.199092864990234, "learning_rate": 5.328447701532313e-06, "loss": 0.4491, "step": 4000 }, { "epoch": 0.053299932042586644, "eval_dev_accuracy": 0.9464367833422542, "eval_dev_accuracy_threshold": 0.8940709829330444, "eval_dev_average_precision": 0.5665972703365837, "eval_dev_f1": 0.5783120410421486, "eval_dev_f1_threshold": 0.532160758972168, "eval_dev_precision": 0.5345420734542073, "eval_dev_recall": 0.6298893393228882, "eval_loss": 0.4243237376213074, "eval_runtime": 565.9608, "eval_samples_per_second": 234.375, "eval_steps_per_second": 7.326, "step": 4000 }, { "epoch": 0.05463243034365131, "grad_norm": 2.3968331813812256, "learning_rate": 5.461692205196536e-06, "loss": 0.3937, "step": 4100 }, { "epoch": 0.05596492864471598, "grad_norm": 3.501485586166382, "learning_rate": 5.59493670886076e-06, "loss": 0.4806, "step": 4200 }, { "epoch": 0.05729742694578065, "grad_norm": 20.607412338256836, "learning_rate": 5.728181212524984e-06, "loss": 0.4355, "step": 4300 }, { "epoch": 0.05862992524684531, "grad_norm": 11.288957595825195, "learning_rate": 5.861425716189208e-06, "loss": 0.4579, "step": 4400 }, { "epoch": 0.059962423547909975, "grad_norm": 23.52041244506836, "learning_rate": 5.9946702198534315e-06, "loss": 0.4232, "step": 4500 }, { "epoch": 0.059962423547909975, "eval_dev_accuracy": 0.9480651654390978, "eval_dev_accuracy_threshold": 0.925118088722229, "eval_dev_average_precision": 0.5978901727391869, "eval_dev_f1": 0.5897354160025502, "eval_dev_f1_threshold": 0.8643622994422913, "eval_dev_precision": 0.5724600309437855, "eval_dev_recall": 0.6080858989810453, "eval_loss": 0.4087965786457062, "eval_runtime": 564.9476, "eval_samples_per_second": 234.795, "eval_steps_per_second": 7.339, "step": 4500 }, { "epoch": 0.06129492184897464, "grad_norm": 8.910244941711426, "learning_rate": 6.127914723517655e-06, "loss": 0.4195, "step": 4600 }, { "epoch": 0.0626274201500393, "grad_norm": 10.253131866455078, "learning_rate": 6.261159227181879e-06, "loss": 0.4332, "step": 4700 }, { "epoch": 0.06395991845110398, "grad_norm": 6.71283483505249, "learning_rate": 6.394403730846103e-06, "loss": 0.433, "step": 4800 }, { "epoch": 0.06529241675216864, "grad_norm": 13.018428802490234, "learning_rate": 6.527648234510327e-06, "loss": 0.3978, "step": 4900 }, { "epoch": 0.06662491505323331, "grad_norm": 5.483168601989746, "learning_rate": 6.660892738174551e-06, "loss": 0.4165, "step": 5000 }, { "epoch": 0.06662491505323331, "eval_dev_accuracy": 0.9488642788755117, "eval_dev_accuracy_threshold": 0.9351813793182373, "eval_dev_average_precision": 0.611806667891919, "eval_dev_f1": 0.594213494881972, "eval_dev_f1_threshold": 0.8715409636497498, "eval_dev_precision": 0.5677078135914579, "eval_dev_recall": 0.6233154377122823, "eval_loss": 0.4015994369983673, "eval_runtime": 565.0203, "eval_samples_per_second": 234.765, "eval_steps_per_second": 7.338, "step": 5000 }, { "epoch": 0.06795741335429797, "grad_norm": 44.895931243896484, "learning_rate": 6.794137241838775e-06, "loss": 0.3173, "step": 5100 }, { "epoch": 0.06928991165536263, "grad_norm": 19.51112937927246, "learning_rate": 6.927381745502999e-06, "loss": 0.4279, "step": 5200 }, { "epoch": 0.07062240995642731, "grad_norm": 11.284177780151367, "learning_rate": 7.0606262491672225e-06, "loss": 0.4278, "step": 5300 }, { "epoch": 0.07195490825749197, "grad_norm": 12.088862419128418, "learning_rate": 7.193870752831446e-06, "loss": 0.394, "step": 5400 }, { "epoch": 0.07328740655855664, "grad_norm": 5.778110504150391, "learning_rate": 7.32711525649567e-06, "loss": 0.4033, "step": 5500 }, { "epoch": 0.07328740655855664, "eval_dev_accuracy": 0.9500780266421405, "eval_dev_accuracy_threshold": 0.9353994131088257, "eval_dev_average_precision": 0.6400665115573199, "eval_dev_f1": 0.6073723716004319, "eval_dev_f1_threshold": 0.8007456064224243, "eval_dev_precision": 0.5721619527314994, "eval_dev_recall": 0.6472006135641504, "eval_loss": 0.3888355791568756, "eval_runtime": 566.2917, "eval_samples_per_second": 234.238, "eval_steps_per_second": 7.321, "step": 5500 }, { "epoch": 0.0746199048596213, "grad_norm": 7.1619439125061035, "learning_rate": 7.460359760159894e-06, "loss": 0.3775, "step": 5600 }, { "epoch": 0.07595240316068597, "grad_norm": 12.566367149353027, "learning_rate": 7.593604263824118e-06, "loss": 0.3944, "step": 5700 }, { "epoch": 0.07728490146175064, "grad_norm": 10.173190116882324, "learning_rate": 7.726848767488342e-06, "loss": 0.4256, "step": 5800 }, { "epoch": 0.0786173997628153, "grad_norm": 1.7395318746566772, "learning_rate": 7.860093271152565e-06, "loss": 0.3984, "step": 5900 }, { "epoch": 0.07994989806387996, "grad_norm": 3.9586873054504395, "learning_rate": 7.99333777481679e-06, "loss": 0.3545, "step": 6000 }, { "epoch": 0.07994989806387996, "eval_dev_accuracy": 0.949716163953953, "eval_dev_accuracy_threshold": 0.9251655340194702, "eval_dev_average_precision": 0.6562730620483952, "eval_dev_f1": 0.6157150706828513, "eval_dev_f1_threshold": 0.4972879886627197, "eval_dev_precision": 0.5658281307381564, "eval_dev_recall": 0.6752492604360688, "eval_loss": 0.38393494486808777, "eval_runtime": 567.4221, "eval_samples_per_second": 233.771, "eval_steps_per_second": 7.307, "step": 6000 }, { "epoch": 0.08128239636494464, "grad_norm": 4.9177398681640625, "learning_rate": 8.126582278481013e-06, "loss": 0.4551, "step": 6100 }, { "epoch": 0.0826148946660093, "grad_norm": 14.003257751464844, "learning_rate": 8.259826782145237e-06, "loss": 0.3817, "step": 6200 }, { "epoch": 0.08394739296707397, "grad_norm": 7.29791259765625, "learning_rate": 8.39307128580946e-06, "loss": 0.408, "step": 6300 }, { "epoch": 0.08527989126813863, "grad_norm": 26.11504554748535, "learning_rate": 8.526315789473685e-06, "loss": 0.4176, "step": 6400 }, { "epoch": 0.0866123895692033, "grad_norm": 21.16114616394043, "learning_rate": 8.659560293137908e-06, "loss": 0.4128, "step": 6500 }, { "epoch": 0.0866123895692033, "eval_dev_accuracy": 0.9528523072515775, "eval_dev_accuracy_threshold": 0.9102756977081299, "eval_dev_average_precision": 0.6708215133735886, "eval_dev_f1": 0.6232578397212545, "eval_dev_f1_threshold": 0.6869294047355652, "eval_dev_precision": 0.6194134833892436, "eval_dev_recall": 0.6271502136518023, "eval_loss": 0.35909053683280945, "eval_runtime": 568.2468, "eval_samples_per_second": 233.432, "eval_steps_per_second": 7.296, "step": 6500 }, { "epoch": 0.08794488787026797, "grad_norm": 2.0384860038757324, "learning_rate": 8.792804796802133e-06, "loss": 0.3837, "step": 6600 }, { "epoch": 0.08927738617133263, "grad_norm": 9.94750690460205, "learning_rate": 8.926049300466355e-06, "loss": 0.3618, "step": 6700 }, { "epoch": 0.0906098844723973, "grad_norm": 3.8198211193084717, "learning_rate": 9.05929380413058e-06, "loss": 0.3643, "step": 6800 }, { "epoch": 0.09194238277346196, "grad_norm": 14.838878631591797, "learning_rate": 9.192538307794803e-06, "loss": 0.3409, "step": 6900 }, { "epoch": 0.09327488107452662, "grad_norm": 25.42053985595703, "learning_rate": 9.325782811459028e-06, "loss": 0.4001, "step": 7000 }, { "epoch": 0.09327488107452662, "eval_dev_accuracy": 0.9507640579884958, "eval_dev_accuracy_threshold": 0.7285012006759644, "eval_dev_average_precision": 0.6341417194028635, "eval_dev_f1": 0.6159875449616148, "eval_dev_f1_threshold": 0.3488144874572754, "eval_dev_precision": 0.6038947368421053, "eval_dev_recall": 0.628574559000767, "eval_loss": 0.46753522753715515, "eval_runtime": 566.0195, "eval_samples_per_second": 234.351, "eval_steps_per_second": 7.325, "step": 7000 }, { "epoch": 0.0946073793755913, "grad_norm": 21.88748550415039, "learning_rate": 9.459027315123252e-06, "loss": 0.4186, "step": 7100 }, { "epoch": 0.09593987767665596, "grad_norm": 5.960207939147949, "learning_rate": 9.592271818787475e-06, "loss": 0.3478, "step": 7200 }, { "epoch": 0.09727237597772063, "grad_norm": 16.917625427246094, "learning_rate": 9.7255163224517e-06, "loss": 0.3492, "step": 7300 }, { "epoch": 0.0986048742787853, "grad_norm": 14.463135719299316, "learning_rate": 9.858760826115924e-06, "loss": 0.3522, "step": 7400 }, { "epoch": 0.09993737257984996, "grad_norm": 3.8919215202331543, "learning_rate": 9.992005329780147e-06, "loss": 0.3445, "step": 7500 }, { "epoch": 0.09993737257984996, "eval_dev_accuracy": 0.9513897788868199, "eval_dev_accuracy_threshold": 0.933416485786438, "eval_dev_average_precision": 0.672072308065407, "eval_dev_f1": 0.6241289651586063, "eval_dev_f1_threshold": 0.4514094591140747, "eval_dev_precision": 0.5939819855488468, "eval_dev_recall": 0.6574997260874329, "eval_loss": 0.4463006556034088, "eval_runtime": 565.9962, "eval_samples_per_second": 234.36, "eval_steps_per_second": 7.325, "step": 7500 }, { "epoch": 0.10126987088091463, "grad_norm": 5.6276421546936035, "learning_rate": 1.012524983344437e-05, "loss": 0.3611, "step": 7600 }, { "epoch": 0.10260236918197929, "grad_norm": 25.222440719604492, "learning_rate": 1.0258494337108595e-05, "loss": 0.3694, "step": 7700 }, { "epoch": 0.10393486748304395, "grad_norm": 10.44590950012207, "learning_rate": 1.0391738840772818e-05, "loss": 0.34, "step": 7800 }, { "epoch": 0.10526736578410863, "grad_norm": 15.12126350402832, "learning_rate": 1.0524983344437042e-05, "loss": 0.3839, "step": 7900 }, { "epoch": 0.10659986408517329, "grad_norm": 10.425951957702637, "learning_rate": 1.0658227848101265e-05, "loss": 0.3408, "step": 8000 }, { "epoch": 0.10659986408517329, "eval_dev_accuracy": 0.947168047524633, "eval_dev_accuracy_threshold": 0.9196346402168274, "eval_dev_average_precision": 0.6278546311695713, "eval_dev_f1": 0.5864126161957174, "eval_dev_f1_threshold": 0.5887953042984009, "eval_dev_precision": 0.5073623559539052, "eval_dev_recall": 0.6946422701873562, "eval_loss": 0.36468541622161865, "eval_runtime": 566.1553, "eval_samples_per_second": 234.294, "eval_steps_per_second": 7.323, "step": 8000 }, { "epoch": 0.10793236238623796, "grad_norm": 19.852497100830078, "learning_rate": 1.079147235176549e-05, "loss": 0.4011, "step": 8100 }, { "epoch": 0.10926486068730262, "grad_norm": 66.98611450195312, "learning_rate": 1.0924716855429713e-05, "loss": 0.3037, "step": 8200 }, { "epoch": 0.11059735898836728, "grad_norm": 2.033569812774658, "learning_rate": 1.1057961359093938e-05, "loss": 0.3632, "step": 8300 }, { "epoch": 0.11192985728943196, "grad_norm": 1.7951024770736694, "learning_rate": 1.1191205862758164e-05, "loss": 0.3878, "step": 8400 }, { "epoch": 0.11326235559049662, "grad_norm": 3.2986645698547363, "learning_rate": 1.1324450366422385e-05, "loss": 0.3849, "step": 8500 }, { "epoch": 0.11326235559049662, "eval_dev_accuracy": 0.9516611759029605, "eval_dev_accuracy_threshold": 0.9289531707763672, "eval_dev_average_precision": 0.6795165568410317, "eval_dev_f1": 0.6335993534700474, "eval_dev_f1_threshold": 0.7476029396057129, "eval_dev_precision": 0.5877612220035611, "eval_dev_recall": 0.6871918483620029, "eval_loss": 0.3556542694568634, "eval_runtime": 568.4381, "eval_samples_per_second": 233.353, "eval_steps_per_second": 7.294, "step": 8500 }, { "epoch": 0.1145948538915613, "grad_norm": 1.4321446418762207, "learning_rate": 1.1457694870086611e-05, "loss": 0.3833, "step": 8600 }, { "epoch": 0.11592735219262595, "grad_norm": 54.76650619506836, "learning_rate": 1.1590939373750833e-05, "loss": 0.3797, "step": 8700 }, { "epoch": 0.11725985049369061, "grad_norm": 31.644800186157227, "learning_rate": 1.1724183877415059e-05, "loss": 0.3705, "step": 8800 }, { "epoch": 0.11859234879475529, "grad_norm": 10.417598724365234, "learning_rate": 1.1857428381079282e-05, "loss": 0.3556, "step": 8900 }, { "epoch": 0.11992484709581995, "grad_norm": 9.85118579864502, "learning_rate": 1.1990672884743507e-05, "loss": 0.3771, "step": 9000 }, { "epoch": 0.11992484709581995, "eval_dev_accuracy": 0.955196875918792, "eval_dev_accuracy_threshold": 0.9099207520484924, "eval_dev_average_precision": 0.7086261480764138, "eval_dev_f1": 0.6543492478744277, "eval_dev_f1_threshold": 0.8123365640640259, "eval_dev_precision": 0.6510139898058779, "eval_dev_recall": 0.6577188561411198, "eval_loss": 0.33506301045417786, "eval_runtime": 567.679, "eval_samples_per_second": 233.665, "eval_steps_per_second": 7.303, "step": 9000 }, { "epoch": 0.12125734539688462, "grad_norm": 22.609596252441406, "learning_rate": 1.212391738840773e-05, "loss": 0.3649, "step": 9100 }, { "epoch": 0.12258984369794929, "grad_norm": 13.67054271697998, "learning_rate": 1.2257161892071954e-05, "loss": 0.3687, "step": 9200 }, { "epoch": 0.12392234199901395, "grad_norm": 11.858447074890137, "learning_rate": 1.2390406395736177e-05, "loss": 0.406, "step": 9300 }, { "epoch": 0.1252548403000786, "grad_norm": 22.195842742919922, "learning_rate": 1.2523650899400402e-05, "loss": 0.3362, "step": 9400 }, { "epoch": 0.1265873386011433, "grad_norm": 1.6114740371704102, "learning_rate": 1.2656895403064625e-05, "loss": 0.2749, "step": 9500 }, { "epoch": 0.1265873386011433, "eval_dev_accuracy": 0.9535986490459641, "eval_dev_accuracy_threshold": 0.9009051322937012, "eval_dev_average_precision": 0.6983138303648807, "eval_dev_f1": 0.6486718540381003, "eval_dev_f1_threshold": 0.7954304218292236, "eval_dev_precision": 0.6356752208666386, "eval_dev_recall": 0.6622110222417005, "eval_loss": 0.35359007120132446, "eval_runtime": 567.3028, "eval_samples_per_second": 233.82, "eval_steps_per_second": 7.308, "step": 9500 }, { "epoch": 0.12791983690220796, "grad_norm": 15.079890251159668, "learning_rate": 1.279013990672885e-05, "loss": 0.3038, "step": 9600 }, { "epoch": 0.12925233520327262, "grad_norm": 19.459815979003906, "learning_rate": 1.2923384410393072e-05, "loss": 0.3273, "step": 9700 }, { "epoch": 0.13058483350433728, "grad_norm": 21.132827758789062, "learning_rate": 1.3056628914057297e-05, "loss": 0.3823, "step": 9800 }, { "epoch": 0.13191733180540194, "grad_norm": 4.1918158531188965, "learning_rate": 1.318987341772152e-05, "loss": 0.3406, "step": 9900 }, { "epoch": 0.13324983010646663, "grad_norm": 22.806039810180664, "learning_rate": 1.3323117921385744e-05, "loss": 0.4069, "step": 10000 }, { "epoch": 0.13324983010646663, "eval_dev_accuracy": 0.9545711550204679, "eval_dev_accuracy_threshold": 0.9562267065048218, "eval_dev_average_precision": 0.7100922283297543, "eval_dev_f1": 0.6518728053062817, "eval_dev_f1_threshold": 0.8285595178604126, "eval_dev_precision": 0.5874132020743605, "eval_dev_recall": 0.7322230743946532, "eval_loss": 0.3286122977733612, "eval_runtime": 566.8451, "eval_samples_per_second": 234.009, "eval_steps_per_second": 7.314, "step": 10000 }, { "epoch": 0.1345823284075313, "grad_norm": 12.236410140991211, "learning_rate": 1.3456362425049967e-05, "loss": 0.3618, "step": 10100 }, { "epoch": 0.13591482670859595, "grad_norm": 5.4430060386657715, "learning_rate": 1.3589606928714192e-05, "loss": 0.3619, "step": 10200 }, { "epoch": 0.1372473250096606, "grad_norm": 13.798270225524902, "learning_rate": 1.3722851432378415e-05, "loss": 0.3413, "step": 10300 }, { "epoch": 0.13857982331072527, "grad_norm": 3.899458169937134, "learning_rate": 1.385609593604264e-05, "loss": 0.3374, "step": 10400 }, { "epoch": 0.13991232161178996, "grad_norm": 6.147464752197266, "learning_rate": 1.3989340439706862e-05, "loss": 0.3725, "step": 10500 }, { "epoch": 0.13991232161178996, "eval_dev_accuracy": 0.9561316878632762, "eval_dev_accuracy_threshold": 0.9145029187202454, "eval_dev_average_precision": 0.7194237355423562, "eval_dev_f1": 0.6639100398366194, "eval_dev_f1_threshold": 0.7489595413208008, "eval_dev_precision": 0.6150037369207773, "eval_dev_recall": 0.7212665717103101, "eval_loss": 0.30797863006591797, "eval_runtime": 567.191, "eval_samples_per_second": 233.867, "eval_steps_per_second": 7.31, "step": 10500 }, { "epoch": 0.14124481991285462, "grad_norm": 0.32248708605766296, "learning_rate": 1.4122584943371087e-05, "loss": 0.3289, "step": 10600 }, { "epoch": 0.14257731821391928, "grad_norm": 3.342273235321045, "learning_rate": 1.4255829447035312e-05, "loss": 0.335, "step": 10700 }, { "epoch": 0.14390981651498394, "grad_norm": 5.640665531158447, "learning_rate": 1.4389073950699535e-05, "loss": 0.3298, "step": 10800 }, { "epoch": 0.1452423148160486, "grad_norm": 1.3349778652191162, "learning_rate": 1.452231845436376e-05, "loss": 0.3805, "step": 10900 }, { "epoch": 0.1465748131171133, "grad_norm": 8.876007080078125, "learning_rate": 1.4655562958027982e-05, "loss": 0.3545, "step": 11000 }, { "epoch": 0.1465748131171133, "eval_dev_accuracy": 0.9554607341289286, "eval_dev_accuracy_threshold": 0.8000156283378601, "eval_dev_average_precision": 0.706945036170435, "eval_dev_f1": 0.6470619459631616, "eval_dev_f1_threshold": 0.35837632417678833, "eval_dev_precision": 0.6275741350906096, "eval_dev_recall": 0.6677988386107154, "eval_loss": 0.40842413902282715, "eval_runtime": 566.601, "eval_samples_per_second": 234.11, "eval_steps_per_second": 7.317, "step": 11000 }, { "epoch": 0.14790731141817795, "grad_norm": 1.4081681966781616, "learning_rate": 1.4788807461692207e-05, "loss": 0.3591, "step": 11100 }, { "epoch": 0.1492398097192426, "grad_norm": 15.024413108825684, "learning_rate": 1.492205196535643e-05, "loss": 0.3523, "step": 11200 }, { "epoch": 0.15057230802030727, "grad_norm": 18.281108856201172, "learning_rate": 1.5055296469020654e-05, "loss": 0.3601, "step": 11300 }, { "epoch": 0.15190480632137193, "grad_norm": 6.56211519241333, "learning_rate": 1.5188540972684877e-05, "loss": 0.3365, "step": 11400 }, { "epoch": 0.1532373046224366, "grad_norm": 43.26646041870117, "learning_rate": 1.5321785476349102e-05, "loss": 0.3859, "step": 11500 }, { "epoch": 0.1532373046224366, "eval_dev_accuracy": 0.9578354580201588, "eval_dev_accuracy_threshold": 0.7837315797805786, "eval_dev_average_precision": 0.7318177300477213, "eval_dev_f1": 0.6784168212739641, "eval_dev_f1_threshold": 0.5022754669189453, "eval_dev_precision": 0.6404592779994162, "eval_dev_recall": 0.7211570066834666, "eval_loss": 0.3321084976196289, "eval_runtime": 566.5331, "eval_samples_per_second": 234.138, "eval_steps_per_second": 7.318, "step": 11500 }, { "epoch": 0.15456980292350128, "grad_norm": 28.52861785888672, "learning_rate": 1.5455029980013325e-05, "loss": 0.3581, "step": 11600 }, { "epoch": 0.15590230122456594, "grad_norm": 7.024416923522949, "learning_rate": 1.558827448367755e-05, "loss": 0.3133, "step": 11700 }, { "epoch": 0.1572347995256306, "grad_norm": 0.6226129531860352, "learning_rate": 1.5721518987341774e-05, "loss": 0.295, "step": 11800 }, { "epoch": 0.15856729782669526, "grad_norm": 1.0621097087860107, "learning_rate": 1.5854763491005997e-05, "loss": 0.3027, "step": 11900 }, { "epoch": 0.15989979612775992, "grad_norm": 1.318295955657959, "learning_rate": 1.598800799467022e-05, "loss": 0.3216, "step": 12000 }, { "epoch": 0.15989979612775992, "eval_dev_accuracy": 0.9555587386069794, "eval_dev_accuracy_threshold": 0.8190538287162781, "eval_dev_average_precision": 0.7133029701747852, "eval_dev_f1": 0.6531785971038309, "eval_dev_f1_threshold": 0.3298466205596924, "eval_dev_precision": 0.6146709191069876, "eval_dev_recall": 0.6968335707242248, "eval_loss": 0.3916049897670746, "eval_runtime": 554.3199, "eval_samples_per_second": 239.297, "eval_steps_per_second": 7.479, "step": 12000 }, { "epoch": 0.1612322944288246, "grad_norm": 9.81628704071045, "learning_rate": 1.6121252498334446e-05, "loss": 0.3522, "step": 12100 }, { "epoch": 0.16256479272988927, "grad_norm": 4.447005271911621, "learning_rate": 1.625449700199867e-05, "loss": 0.3266, "step": 12200 }, { "epoch": 0.16389729103095393, "grad_norm": 14.646246910095215, "learning_rate": 1.6387741505662892e-05, "loss": 0.3292, "step": 12300 }, { "epoch": 0.1652297893320186, "grad_norm": 16.482669830322266, "learning_rate": 1.6520986009327115e-05, "loss": 0.3446, "step": 12400 }, { "epoch": 0.16656228763308326, "grad_norm": 7.77319860458374, "learning_rate": 1.665423051299134e-05, "loss": 0.3236, "step": 12500 }, { "epoch": 0.16656228763308326, "eval_dev_accuracy": 0.9569684953297097, "eval_dev_accuracy_threshold": 0.9383260011672974, "eval_dev_average_precision": 0.7292398003419696, "eval_dev_f1": 0.66701062841812, "eval_dev_f1_threshold": 0.8000765442848206, "eval_dev_precision": 0.6303266699171136, "eval_dev_recall": 0.7082283335159417, "eval_loss": 0.35466820001602173, "eval_runtime": 561.336, "eval_samples_per_second": 236.306, "eval_steps_per_second": 7.386, "step": 12500 }, { "epoch": 0.16789478593414794, "grad_norm": 6.646021366119385, "learning_rate": 1.6787475016655564e-05, "loss": 0.3134, "step": 12600 }, { "epoch": 0.1692272842352126, "grad_norm": 87.47698211669922, "learning_rate": 1.6920719520319787e-05, "loss": 0.3249, "step": 12700 }, { "epoch": 0.17055978253627727, "grad_norm": 17.500768661499023, "learning_rate": 1.705396402398401e-05, "loss": 0.3811, "step": 12800 }, { "epoch": 0.17189228083734193, "grad_norm": 7.166949272155762, "learning_rate": 1.7187208527648237e-05, "loss": 0.3127, "step": 12900 }, { "epoch": 0.1732247791384066, "grad_norm": 4.106062889099121, "learning_rate": 1.732045303131246e-05, "loss": 0.3219, "step": 13000 }, { "epoch": 0.1732247791384066, "eval_dev_accuracy": 0.9564407789094364, "eval_dev_accuracy_threshold": 0.8795315623283386, "eval_dev_average_precision": 0.726842832162545, "eval_dev_f1": 0.6643535054597408, "eval_dev_f1_threshold": 0.4876420497894287, "eval_dev_precision": 0.621717123483908, "eval_dev_recall": 0.7132683247507395, "eval_loss": 0.3564859926700592, "eval_runtime": 558.1535, "eval_samples_per_second": 237.653, "eval_steps_per_second": 7.428, "step": 13000 }, { "epoch": 0.17455727743947128, "grad_norm": 1.3857766389846802, "learning_rate": 1.7453697534976682e-05, "loss": 0.3526, "step": 13100 }, { "epoch": 0.17588977574053594, "grad_norm": 20.39262580871582, "learning_rate": 1.758694203864091e-05, "loss": 0.3299, "step": 13200 }, { "epoch": 0.1772222740416006, "grad_norm": 18.849407196044922, "learning_rate": 1.772018654230513e-05, "loss": 0.3303, "step": 13300 }, { "epoch": 0.17855477234266526, "grad_norm": 42.82183837890625, "learning_rate": 1.7853431045969355e-05, "loss": 0.3739, "step": 13400 }, { "epoch": 0.17988727064372992, "grad_norm": 4.524885654449463, "learning_rate": 1.7986675549633577e-05, "loss": 0.3544, "step": 13500 }, { "epoch": 0.17988727064372992, "eval_dev_accuracy": 0.9572021983158308, "eval_dev_accuracy_threshold": 0.9545025825500488, "eval_dev_average_precision": 0.7351364884979171, "eval_dev_f1": 0.6692303640099035, "eval_dev_f1_threshold": 0.7856150269508362, "eval_dev_precision": 0.6444805194805194, "eval_dev_recall": 0.6959570505094774, "eval_loss": 0.33416542410850525, "eval_runtime": 558.7974, "eval_samples_per_second": 237.379, "eval_steps_per_second": 7.42, "step": 13500 }, { "epoch": 0.1812197689447946, "grad_norm": 2.0763137340545654, "learning_rate": 1.8119920053297804e-05, "loss": 0.3584, "step": 13600 }, { "epoch": 0.18255226724585927, "grad_norm": 4.722475051879883, "learning_rate": 1.8253164556962027e-05, "loss": 0.341, "step": 13700 }, { "epoch": 0.18388476554692393, "grad_norm": 4.084864139556885, "learning_rate": 1.838640906062625e-05, "loss": 0.3371, "step": 13800 }, { "epoch": 0.1852172638479886, "grad_norm": 0.32559067010879517, "learning_rate": 1.8519653564290473e-05, "loss": 0.3322, "step": 13900 }, { "epoch": 0.18654976214905325, "grad_norm": 9.505677223205566, "learning_rate": 1.86528980679547e-05, "loss": 0.3493, "step": 14000 }, { "epoch": 0.18654976214905325, "eval_dev_accuracy": 0.9565010893574676, "eval_dev_accuracy_threshold": 0.9463940858840942, "eval_dev_average_precision": 0.7367044841602028, "eval_dev_f1": 0.6669865642994243, "eval_dev_f1_threshold": 0.8662494421005249, "eval_dev_precision": 0.6496001661647107, "eval_dev_recall": 0.6853292429056645, "eval_loss": 0.29620230197906494, "eval_runtime": 559.2923, "eval_samples_per_second": 237.169, "eval_steps_per_second": 7.413, "step": 14000 }, { "epoch": 0.18788226045011794, "grad_norm": 19.357847213745117, "learning_rate": 1.8786142571618922e-05, "loss": 0.3021, "step": 14100 }, { "epoch": 0.1892147587511826, "grad_norm": 0.8998715281486511, "learning_rate": 1.8919387075283148e-05, "loss": 0.3249, "step": 14200 }, { "epoch": 0.19054725705224726, "grad_norm": 17.16973304748535, "learning_rate": 1.9052631578947368e-05, "loss": 0.3389, "step": 14300 }, { "epoch": 0.19187975535331192, "grad_norm": 1.553682565689087, "learning_rate": 1.9185876082611594e-05, "loss": 0.3547, "step": 14400 }, { "epoch": 0.19321225365437658, "grad_norm": 10.778045654296875, "learning_rate": 1.9319120586275817e-05, "loss": 0.3205, "step": 14500 }, { "epoch": 0.19321225365437658, "eval_dev_accuracy": 0.9589587401147406, "eval_dev_accuracy_threshold": 0.9295341968536377, "eval_dev_average_precision": 0.7537310584722261, "eval_dev_f1": 0.6830523319465732, "eval_dev_f1_threshold": 0.7479926347732544, "eval_dev_precision": 0.6825292637567005, "eval_dev_recall": 0.6835762024761696, "eval_loss": 0.3468180298805237, "eval_runtime": 558.8191, "eval_samples_per_second": 237.37, "eval_steps_per_second": 7.419, "step": 14500 }, { "epoch": 0.19454475195544127, "grad_norm": 2.241529941558838, "learning_rate": 1.9452365089940043e-05, "loss": 0.3168, "step": 14600 }, { "epoch": 0.19587725025650593, "grad_norm": 1.1848278045654297, "learning_rate": 1.9585609593604263e-05, "loss": 0.3348, "step": 14700 }, { "epoch": 0.1972097485575706, "grad_norm": 16.031787872314453, "learning_rate": 1.971885409726849e-05, "loss": 0.3237, "step": 14800 }, { "epoch": 0.19854224685863525, "grad_norm": 12.078638076782227, "learning_rate": 1.9852098600932712e-05, "loss": 0.3428, "step": 14900 }, { "epoch": 0.1998747451596999, "grad_norm": 5.735422134399414, "learning_rate": 1.998534310459694e-05, "loss": 0.3179, "step": 15000 }, { "epoch": 0.1998747451596999, "eval_dev_accuracy": 0.9547219311405459, "eval_dev_accuracy_threshold": 0.9576058387756348, "eval_dev_average_precision": 0.6908515568536635, "eval_dev_f1": 0.6764229341974599, "eval_dev_f1_threshold": 0.926771879196167, "eval_dev_precision": 0.6467119728163102, "eval_dev_recall": 0.7089952887038458, "eval_loss": 0.34104466438293457, "eval_runtime": 561.385, "eval_samples_per_second": 236.285, "eval_steps_per_second": 7.385, "step": 15000 }, { "epoch": 0.2012072434607646, "grad_norm": 19.35861587524414, "learning_rate": 1.9986823013828432e-05, "loss": 0.3409, "step": 15100 }, { "epoch": 0.20253974176182926, "grad_norm": 35.545223236083984, "learning_rate": 1.997201741138847e-05, "loss": 0.331, "step": 15200 }, { "epoch": 0.20387224006289392, "grad_norm": 17.14919662475586, "learning_rate": 1.9957211808948506e-05, "loss": 0.3493, "step": 15300 }, { "epoch": 0.20520473836395858, "grad_norm": 2.735530138015747, "learning_rate": 1.9942406206508544e-05, "loss": 0.3205, "step": 15400 }, { "epoch": 0.20653723666502324, "grad_norm": 1.0762556791305542, "learning_rate": 1.9927600604068582e-05, "loss": 0.3307, "step": 15500 }, { "epoch": 0.20653723666502324, "eval_dev_accuracy": 0.9532292475517727, "eval_dev_accuracy_threshold": 0.8670874238014221, "eval_dev_average_precision": 0.6865319564110608, "eval_dev_f1": 0.660230457801308, "eval_dev_f1_threshold": 0.6122031211853027, "eval_dev_precision": 0.6272807969227735, "eval_dev_recall": 0.6968335707242248, "eval_loss": 0.3867639899253845, "eval_runtime": 563.4002, "eval_samples_per_second": 235.44, "eval_steps_per_second": 7.359, "step": 15500 }, { "epoch": 0.2078697349660879, "grad_norm": 8.77493953704834, "learning_rate": 1.9912795001628617e-05, "loss": 0.3683, "step": 15600 }, { "epoch": 0.2092022332671526, "grad_norm": 0.7768261432647705, "learning_rate": 1.9897989399188656e-05, "loss": 0.318, "step": 15700 }, { "epoch": 0.21053473156821725, "grad_norm": 12.180807113647461, "learning_rate": 1.988318379674869e-05, "loss": 0.3498, "step": 15800 }, { "epoch": 0.21186722986928191, "grad_norm": 4.719166278839111, "learning_rate": 1.986837819430873e-05, "loss": 0.3043, "step": 15900 }, { "epoch": 0.21319972817034658, "grad_norm": 6.112349987030029, "learning_rate": 1.9853572591868764e-05, "loss": 0.3393, "step": 16000 }, { "epoch": 0.21319972817034658, "eval_dev_accuracy": 0.9534780281499016, "eval_dev_accuracy_threshold": 0.9171842336654663, "eval_dev_average_precision": 0.7089238013031434, "eval_dev_f1": 0.6696855863736944, "eval_dev_f1_threshold": 0.8700560331344604, "eval_dev_precision": 0.6313797787696488, "eval_dev_recall": 0.7129396296702093, "eval_loss": 0.33279770612716675, "eval_runtime": 559.9008, "eval_samples_per_second": 236.912, "eval_steps_per_second": 7.405, "step": 16000 }, { "epoch": 0.21453222647141124, "grad_norm": 47.89255905151367, "learning_rate": 1.9838766989428802e-05, "loss": 0.3326, "step": 16100 }, { "epoch": 0.21586472477247592, "grad_norm": 6.826938152313232, "learning_rate": 1.982396138698884e-05, "loss": 0.3094, "step": 16200 }, { "epoch": 0.21719722307354059, "grad_norm": 13.803174018859863, "learning_rate": 1.9809155784548875e-05, "loss": 0.3572, "step": 16300 }, { "epoch": 0.21852972137460525, "grad_norm": 7.402415752410889, "learning_rate": 1.9794350182108914e-05, "loss": 0.3447, "step": 16400 }, { "epoch": 0.2198622196756699, "grad_norm": 28.723724365234375, "learning_rate": 1.977954457966895e-05, "loss": 0.3484, "step": 16500 }, { "epoch": 0.2198622196756699, "eval_dev_accuracy": 0.9592753699669047, "eval_dev_accuracy_threshold": 0.8685222864151001, "eval_dev_average_precision": 0.7462578581871518, "eval_dev_f1": 0.6837169650468883, "eval_dev_f1_threshold": 0.35429614782333374, "eval_dev_precision": 0.6654911316253501, "eval_dev_recall": 0.702969212227457, "eval_loss": 0.3938925862312317, "eval_runtime": 560.3292, "eval_samples_per_second": 236.73, "eval_steps_per_second": 7.399, "step": 16500 }, { "epoch": 0.22119471797673457, "grad_norm": 16.560897827148438, "learning_rate": 1.9764738977228987e-05, "loss": 0.3182, "step": 16600 }, { "epoch": 0.22252721627779926, "grad_norm": 1.3701841831207275, "learning_rate": 1.9749933374789022e-05, "loss": 0.283, "step": 16700 }, { "epoch": 0.22385971457886392, "grad_norm": 12.799971580505371, "learning_rate": 1.973512777234906e-05, "loss": 0.3134, "step": 16800 }, { "epoch": 0.22519221287992858, "grad_norm": 4.794546127319336, "learning_rate": 1.9720322169909095e-05, "loss": 0.3454, "step": 16900 }, { "epoch": 0.22652471118099324, "grad_norm": 21.59016990661621, "learning_rate": 1.970551656746913e-05, "loss": 0.3485, "step": 17000 }, { "epoch": 0.22652471118099324, "eval_dev_accuracy": 0.9593356804149359, "eval_dev_accuracy_threshold": 0.9145892858505249, "eval_dev_average_precision": 0.747695886787644, "eval_dev_f1": 0.6768515829218704, "eval_dev_f1_threshold": 0.8686491847038269, "eval_dev_precision": 0.7203264094955489, "eval_dev_recall": 0.6383258463898324, "eval_loss": 0.34743377566337585, "eval_runtime": 563.0506, "eval_samples_per_second": 235.586, "eval_steps_per_second": 7.363, "step": 17000 }, { "epoch": 0.2278572094820579, "grad_norm": 14.225733757019043, "learning_rate": 1.969071096502917e-05, "loss": 0.2942, "step": 17100 }, { "epoch": 0.2291897077831226, "grad_norm": 7.983681678771973, "learning_rate": 1.9675905362589203e-05, "loss": 0.2865, "step": 17200 }, { "epoch": 0.23052220608418725, "grad_norm": 2.1385481357574463, "learning_rate": 1.9661099760149242e-05, "loss": 0.3489, "step": 17300 }, { "epoch": 0.2318547043852519, "grad_norm": 12.413968086242676, "learning_rate": 1.9646294157709277e-05, "loss": 0.2959, "step": 17400 }, { "epoch": 0.23318720268631657, "grad_norm": 15.191158294677734, "learning_rate": 1.9631488555269315e-05, "loss": 0.3626, "step": 17500 }, { "epoch": 0.23318720268631657, "eval_dev_accuracy": 0.959637232655092, "eval_dev_accuracy_threshold": 0.9243228435516357, "eval_dev_average_precision": 0.7598083206267562, "eval_dev_f1": 0.6903569873748368, "eval_dev_f1_threshold": 0.7939244508743286, "eval_dev_precision": 0.6858038706887231, "eval_dev_recall": 0.6949709652678865, "eval_loss": 0.28921985626220703, "eval_runtime": 562.3661, "eval_samples_per_second": 235.873, "eval_steps_per_second": 7.372, "step": 17500 }, { "epoch": 0.23451970098738123, "grad_norm": 2.7491862773895264, "learning_rate": 1.961668295282935e-05, "loss": 0.3107, "step": 17600 }, { "epoch": 0.23585219928844592, "grad_norm": 52.241886138916016, "learning_rate": 1.960187735038939e-05, "loss": 0.2914, "step": 17700 }, { "epoch": 0.23718469758951058, "grad_norm": 11.401723861694336, "learning_rate": 1.9587071747949427e-05, "loss": 0.3298, "step": 17800 }, { "epoch": 0.23851719589057524, "grad_norm": 4.170936107635498, "learning_rate": 1.957226614550946e-05, "loss": 0.3315, "step": 17900 }, { "epoch": 0.2398496941916399, "grad_norm": 5.668073654174805, "learning_rate": 1.95574605430695e-05, "loss": 0.3097, "step": 18000 }, { "epoch": 0.2398496941916399, "eval_dev_accuracy": 0.9607454371376661, "eval_dev_accuracy_threshold": 0.9465240836143494, "eval_dev_average_precision": 0.7633624538366494, "eval_dev_f1": 0.6969561824060653, "eval_dev_f1_threshold": 0.8802664279937744, "eval_dev_precision": 0.7094540914765634, "eval_dev_recall": 0.6848909827982907, "eval_loss": 0.2974649667739868, "eval_runtime": 565.9458, "eval_samples_per_second": 234.381, "eval_steps_per_second": 7.326, "step": 18000 }, { "epoch": 0.24118219249270456, "grad_norm": 18.609146118164062, "learning_rate": 1.9542654940629535e-05, "loss": 0.3474, "step": 18100 }, { "epoch": 0.24251469079376925, "grad_norm": 5.154010772705078, "learning_rate": 1.9527849338189573e-05, "loss": 0.2917, "step": 18200 }, { "epoch": 0.2438471890948339, "grad_norm": 9.549324035644531, "learning_rate": 1.9513043735749608e-05, "loss": 0.3557, "step": 18300 }, { "epoch": 0.24517968739589857, "grad_norm": 1.6343746185302734, "learning_rate": 1.9498238133309647e-05, "loss": 0.3394, "step": 18400 }, { "epoch": 0.24651218569696323, "grad_norm": 9.207841873168945, "learning_rate": 1.9483432530869685e-05, "loss": 0.2891, "step": 18500 }, { "epoch": 0.24651218569696323, "eval_dev_accuracy": 0.9630900058048806, "eval_dev_accuracy_threshold": 0.9168897271156311, "eval_dev_average_precision": 0.7806456437261002, "eval_dev_f1": 0.7187227550130775, "eval_dev_f1_threshold": 0.6090723276138306, "eval_dev_precision": 0.7149051490514905, "eval_dev_recall": 0.7225813520324312, "eval_loss": 0.3342207372188568, "eval_runtime": 562.3653, "eval_samples_per_second": 235.873, "eval_steps_per_second": 7.372, "step": 18500 }, { "epoch": 0.2478446839980279, "grad_norm": 30.978008270263672, "learning_rate": 1.946862692842972e-05, "loss": 0.2514, "step": 18600 }, { "epoch": 0.24917718229909258, "grad_norm": 26.20627784729004, "learning_rate": 1.9453821325989758e-05, "loss": 0.3139, "step": 18700 }, { "epoch": 0.2505096806001572, "grad_norm": 29.30525779724121, "learning_rate": 1.9439015723549793e-05, "loss": 0.2896, "step": 18800 }, { "epoch": 0.2518421789012219, "grad_norm": 1.5062270164489746, "learning_rate": 1.942421012110983e-05, "loss": 0.3161, "step": 18900 }, { "epoch": 0.2531746772022866, "grad_norm": 5.7221784591674805, "learning_rate": 1.9409404518669866e-05, "loss": 0.3331, "step": 19000 }, { "epoch": 0.2531746772022866, "eval_dev_accuracy": 0.9605946610175881, "eval_dev_accuracy_threshold": 0.8986555337905884, "eval_dev_average_precision": 0.7733706796615292, "eval_dev_f1": 0.7069406003832233, "eval_dev_f1_threshold": 0.5075786113739014, "eval_dev_precision": 0.6874029603560708, "eval_dev_recall": 0.727621343267229, "eval_loss": 0.333689421415329, "eval_runtime": 564.7555, "eval_samples_per_second": 234.875, "eval_steps_per_second": 7.341, "step": 19000 }, { "epoch": 0.2545071755033512, "grad_norm": 0.41423532366752625, "learning_rate": 1.9394598916229905e-05, "loss": 0.2959, "step": 19100 }, { "epoch": 0.2558396738044159, "grad_norm": 18.290435791015625, "learning_rate": 1.937979331378994e-05, "loss": 0.3236, "step": 19200 }, { "epoch": 0.25717217210548055, "grad_norm": 1.2307929992675781, "learning_rate": 1.9364987711349975e-05, "loss": 0.3527, "step": 19300 }, { "epoch": 0.25850467040654523, "grad_norm": 1.151492714881897, "learning_rate": 1.9350182108910013e-05, "loss": 0.3106, "step": 19400 }, { "epoch": 0.2598371687076099, "grad_norm": 1.676810383796692, "learning_rate": 1.9335376506470048e-05, "loss": 0.3271, "step": 19500 }, { "epoch": 0.2598371687076099, "eval_dev_accuracy": 0.9614239296780176, "eval_dev_accuracy_threshold": 0.9547422528266907, "eval_dev_average_precision": 0.7587239294098156, "eval_dev_f1": 0.7068855932203391, "eval_dev_f1_threshold": 0.7505875825881958, "eval_dev_precision": 0.6841997334153593, "eval_dev_recall": 0.7311274241262189, "eval_loss": 0.3253738582134247, "eval_runtime": 562.4242, "eval_samples_per_second": 235.849, "eval_steps_per_second": 7.372, "step": 19500 }, { "epoch": 0.26116966700867456, "grad_norm": 18.608806610107422, "learning_rate": 1.9320570904030086e-05, "loss": 0.3504, "step": 19600 }, { "epoch": 0.26250216530973924, "grad_norm": 20.453174591064453, "learning_rate": 1.930576530159012e-05, "loss": 0.303, "step": 19700 }, { "epoch": 0.2638346636108039, "grad_norm": 5.1661248207092285, "learning_rate": 1.929095969915016e-05, "loss": 0.2478, "step": 19800 }, { "epoch": 0.26516716191186857, "grad_norm": 1.2466572523117065, "learning_rate": 1.9276154096710194e-05, "loss": 0.3309, "step": 19900 }, { "epoch": 0.26649966021293325, "grad_norm": 2.0681653022766113, "learning_rate": 1.9261348494270233e-05, "loss": 0.3063, "step": 20000 }, { "epoch": 0.26649966021293325, "eval_dev_accuracy": 0.9621928878904159, "eval_dev_accuracy_threshold": 0.5716267228126526, "eval_dev_average_precision": 0.7784750233173614, "eval_dev_f1": 0.7154299699632884, "eval_dev_f1_threshold": 0.2591094672679901, "eval_dev_precision": 0.7265845667156253, "eval_dev_recall": 0.7046126876301084, "eval_loss": 0.45214059948921204, "eval_runtime": 560.4518, "eval_samples_per_second": 236.679, "eval_steps_per_second": 7.398, "step": 20000 }, { "epoch": 0.2678321585139979, "grad_norm": 18.87665557861328, "learning_rate": 1.924654289183027e-05, "loss": 0.3482, "step": 20100 }, { "epoch": 0.2691646568150626, "grad_norm": 1.1184475421905518, "learning_rate": 1.9231737289390306e-05, "loss": 0.3033, "step": 20200 }, { "epoch": 0.2704971551161272, "grad_norm": 13.190022468566895, "learning_rate": 1.9216931686950344e-05, "loss": 0.288, "step": 20300 }, { "epoch": 0.2718296534171919, "grad_norm": 5.855016231536865, "learning_rate": 1.920212608451038e-05, "loss": 0.3609, "step": 20400 }, { "epoch": 0.2731621517182566, "grad_norm": 0.26388707756996155, "learning_rate": 1.9187320482070418e-05, "loss": 0.3071, "step": 20500 }, { "epoch": 0.2731621517182566, "eval_dev_accuracy": 0.9630975446108845, "eval_dev_accuracy_threshold": 0.9040592908859253, "eval_dev_average_precision": 0.7844185876274975, "eval_dev_f1": 0.7150979850952249, "eval_dev_f1_threshold": 0.6517728567123413, "eval_dev_precision": 0.7206275033377837, "eval_dev_recall": 0.7096526788649063, "eval_loss": 0.3398449718952179, "eval_runtime": 559.7754, "eval_samples_per_second": 236.965, "eval_steps_per_second": 7.407, "step": 20500 }, { "epoch": 0.2744946500193212, "grad_norm": 4.928101539611816, "learning_rate": 1.9172514879630453e-05, "loss": 0.3778, "step": 20600 }, { "epoch": 0.2758271483203859, "grad_norm": 32.13788604736328, "learning_rate": 1.915770927719049e-05, "loss": 0.2681, "step": 20700 }, { "epoch": 0.27715964662145054, "grad_norm": 4.934467792510986, "learning_rate": 1.914290367475053e-05, "loss": 0.3358, "step": 20800 }, { "epoch": 0.27849214492251523, "grad_norm": 20.491180419921875, "learning_rate": 1.9128098072310564e-05, "loss": 0.2964, "step": 20900 }, { "epoch": 0.2798246432235799, "grad_norm": 1.0770193338394165, "learning_rate": 1.9113292469870603e-05, "loss": 0.2193, "step": 21000 }, { "epoch": 0.2798246432235799, "eval_dev_accuracy": 0.9630221565508454, "eval_dev_accuracy_threshold": 0.9147968292236328, "eval_dev_average_precision": 0.7911413496458403, "eval_dev_f1": 0.7163220463124683, "eval_dev_f1_threshold": 0.7818174362182617, "eval_dev_precision": 0.7372999304105776, "eval_dev_recall": 0.6965048756436946, "eval_loss": 0.3126268982887268, "eval_runtime": 558.7817, "eval_samples_per_second": 237.386, "eval_steps_per_second": 7.42, "step": 21000 }, { "epoch": 0.28115714152464455, "grad_norm": 24.751399993896484, "learning_rate": 1.9098486867430638e-05, "loss": 0.3136, "step": 21100 }, { "epoch": 0.28248963982570924, "grad_norm": 38.034759521484375, "learning_rate": 1.9083681264990676e-05, "loss": 0.3121, "step": 21200 }, { "epoch": 0.28382213812677387, "grad_norm": 22.520530700683594, "learning_rate": 1.906887566255071e-05, "loss": 0.2893, "step": 21300 }, { "epoch": 0.28515463642783856, "grad_norm": 13.158409118652344, "learning_rate": 1.905407006011075e-05, "loss": 0.2987, "step": 21400 }, { "epoch": 0.28648713472890325, "grad_norm": 2.2500672340393066, "learning_rate": 1.9039264457670784e-05, "loss": 0.2781, "step": 21500 }, { "epoch": 0.28648713472890325, "eval_dev_accuracy": 0.96250951774258, "eval_dev_accuracy_threshold": 0.9360392093658447, "eval_dev_average_precision": 0.7809073848360293, "eval_dev_f1": 0.724827056110684, "eval_dev_f1_threshold": 0.9214021563529968, "eval_dev_precision": 0.7264223616154947, "eval_dev_recall": 0.7232387421934918, "eval_loss": 0.32232773303985596, "eval_runtime": 558.841, "eval_samples_per_second": 237.361, "eval_steps_per_second": 7.419, "step": 21500 }, { "epoch": 0.2878196330299679, "grad_norm": 7.364509582519531, "learning_rate": 1.902445885523082e-05, "loss": 0.2444, "step": 21600 }, { "epoch": 0.28915213133103257, "grad_norm": 14.986044883728027, "learning_rate": 1.9009653252790857e-05, "loss": 0.2917, "step": 21700 }, { "epoch": 0.2904846296320972, "grad_norm": 1.4703857898712158, "learning_rate": 1.8994847650350892e-05, "loss": 0.32, "step": 21800 }, { "epoch": 0.2918171279331619, "grad_norm": 4.144439220428467, "learning_rate": 1.898004204791093e-05, "loss": 0.2873, "step": 21900 }, { "epoch": 0.2931496262342266, "grad_norm": 3.1540684700012207, "learning_rate": 1.8965236445470966e-05, "loss": 0.2877, "step": 22000 }, { "epoch": 0.2931496262342266, "eval_dev_accuracy": 0.9638212699872594, "eval_dev_accuracy_threshold": 0.8727903366088867, "eval_dev_average_precision": 0.7927687696111004, "eval_dev_f1": 0.7246392958609548, "eval_dev_f1_threshold": 0.7912191152572632, "eval_dev_precision": 0.7370806890299184, "eval_dev_recall": 0.712610934589679, "eval_loss": 0.30881205201148987, "eval_runtime": 559.5919, "eval_samples_per_second": 237.042, "eval_steps_per_second": 7.409, "step": 22000 }, { "epoch": 0.2944821245352912, "grad_norm": 6.18324613571167, "learning_rate": 1.8950430843031004e-05, "loss": 0.2947, "step": 22100 }, { "epoch": 0.2958146228363559, "grad_norm": 12.850146293640137, "learning_rate": 1.893562524059104e-05, "loss": 0.2619, "step": 22200 }, { "epoch": 0.29714712113742053, "grad_norm": 5.986371040344238, "learning_rate": 1.8920819638151077e-05, "loss": 0.3143, "step": 22300 }, { "epoch": 0.2984796194384852, "grad_norm": 6.889712810516357, "learning_rate": 1.8906014035711116e-05, "loss": 0.3585, "step": 22400 }, { "epoch": 0.29981211773954985, "grad_norm": 14.721301078796387, "learning_rate": 1.889120843327115e-05, "loss": 0.28, "step": 22500 }, { "epoch": 0.29981211773954985, "eval_dev_accuracy": 0.9627809147587205, "eval_dev_accuracy_threshold": 0.949242889881134, "eval_dev_average_precision": 0.7827527934861719, "eval_dev_f1": 0.7189280438911163, "eval_dev_f1_threshold": 0.41740649938583374, "eval_dev_precision": 0.6932546545935497, "eval_dev_recall": 0.7465760929111428, "eval_loss": 0.3353007137775421, "eval_runtime": 558.6719, "eval_samples_per_second": 237.433, "eval_steps_per_second": 7.421, "step": 22500 }, { "epoch": 0.30114461604061454, "grad_norm": 1.0338587760925293, "learning_rate": 1.887640283083119e-05, "loss": 0.284, "step": 22600 }, { "epoch": 0.30247711434167923, "grad_norm": 0.5249596834182739, "learning_rate": 1.8861597228391224e-05, "loss": 0.2821, "step": 22700 }, { "epoch": 0.30380961264274386, "grad_norm": 2.10871958732605, "learning_rate": 1.8846791625951262e-05, "loss": 0.2762, "step": 22800 }, { "epoch": 0.30514211094380855, "grad_norm": 8.820456504821777, "learning_rate": 1.8831986023511297e-05, "loss": 0.3152, "step": 22900 }, { "epoch": 0.3064746092448732, "grad_norm": 0.5152029395103455, "learning_rate": 1.8817180421071335e-05, "loss": 0.2879, "step": 23000 }, { "epoch": 0.3064746092448732, "eval_dev_accuracy": 0.9637911147632438, "eval_dev_accuracy_threshold": 0.8721863627433777, "eval_dev_average_precision": 0.7944006641896747, "eval_dev_f1": 0.7282656663724625, "eval_dev_f1_threshold": 0.7819468975067139, "eval_dev_precision": 0.7333629596711476, "eval_dev_recall": 0.7232387421934918, "eval_loss": 0.27257823944091797, "eval_runtime": 559.9281, "eval_samples_per_second": 236.9, "eval_steps_per_second": 7.405, "step": 23000 }, { "epoch": 0.3078071075459379, "grad_norm": 7.670559406280518, "learning_rate": 1.8802374818631374e-05, "loss": 0.2738, "step": 23100 }, { "epoch": 0.30913960584700256, "grad_norm": 1.2862569093704224, "learning_rate": 1.878756921619141e-05, "loss": 0.2624, "step": 23200 }, { "epoch": 0.3104721041480672, "grad_norm": 6.1086249351501465, "learning_rate": 1.8772763613751447e-05, "loss": 0.2698, "step": 23300 }, { "epoch": 0.3118046024491319, "grad_norm": 2.7864394187927246, "learning_rate": 1.8757958011311482e-05, "loss": 0.278, "step": 23400 }, { "epoch": 0.3131371007501965, "grad_norm": 0.4662020206451416, "learning_rate": 1.874315240887152e-05, "loss": 0.3024, "step": 23500 }, { "epoch": 0.3131371007501965, "eval_dev_accuracy": 0.9640398953613727, "eval_dev_accuracy_threshold": 0.9022700786590576, "eval_dev_average_precision": 0.789665459307555, "eval_dev_f1": 0.7233386555084511, "eval_dev_f1_threshold": 0.49045658111572266, "eval_dev_precision": 0.7269809650287737, "eval_dev_recall": 0.719732661334502, "eval_loss": 0.3414628207683563, "eval_runtime": 561.5298, "eval_samples_per_second": 236.224, "eval_steps_per_second": 7.383, "step": 23500 }, { "epoch": 0.3144695990512612, "grad_norm": 16.328683853149414, "learning_rate": 1.8728346806431555e-05, "loss": 0.3255, "step": 23600 }, { "epoch": 0.3158020973523259, "grad_norm": 6.683753490447998, "learning_rate": 1.8713541203991594e-05, "loss": 0.3298, "step": 23700 }, { "epoch": 0.3171345956533905, "grad_norm": 14.66252613067627, "learning_rate": 1.869873560155163e-05, "loss": 0.2902, "step": 23800 }, { "epoch": 0.3184670939544552, "grad_norm": 1.7640432119369507, "learning_rate": 1.8683929999111664e-05, "loss": 0.283, "step": 23900 }, { "epoch": 0.31979959225551985, "grad_norm": 20.055587768554688, "learning_rate": 1.8669124396671702e-05, "loss": 0.3098, "step": 24000 }, { "epoch": 0.31979959225551985, "eval_dev_accuracy": 0.9638288087932633, "eval_dev_accuracy_threshold": 0.9375428557395935, "eval_dev_average_precision": 0.7947515841312096, "eval_dev_f1": 0.731536653364675, "eval_dev_f1_threshold": 0.8831270337104797, "eval_dev_precision": 0.7299803622081605, "eval_dev_recall": 0.7330995946094007, "eval_loss": 0.2945517897605896, "eval_runtime": 562.1643, "eval_samples_per_second": 235.958, "eval_steps_per_second": 7.375, "step": 24000 }, { "epoch": 0.32113209055658454, "grad_norm": 40.83311080932617, "learning_rate": 1.8654318794231737e-05, "loss": 0.2592, "step": 24100 }, { "epoch": 0.3224645888576492, "grad_norm": 5.973490238189697, "learning_rate": 1.8639513191791775e-05, "loss": 0.27, "step": 24200 }, { "epoch": 0.32379708715871386, "grad_norm": 8.698867797851562, "learning_rate": 1.862470758935181e-05, "loss": 0.2738, "step": 24300 }, { "epoch": 0.32512958545977855, "grad_norm": 8.795327186584473, "learning_rate": 1.860990198691185e-05, "loss": 0.2528, "step": 24400 }, { "epoch": 0.3264620837608432, "grad_norm": 0.2583109438419342, "learning_rate": 1.8595096384471883e-05, "loss": 0.2694, "step": 24500 }, { "epoch": 0.3264620837608432, "eval_dev_accuracy": 0.9628261475947439, "eval_dev_accuracy_threshold": 0.9562203884124756, "eval_dev_average_precision": 0.7884777296034856, "eval_dev_f1": 0.7260596117035821, "eval_dev_f1_threshold": 0.9503564834594727, "eval_dev_precision": 0.7248307490718497, "eval_dev_recall": 0.7272926481866988, "eval_loss": 0.3025730550289154, "eval_runtime": 561.6413, "eval_samples_per_second": 236.177, "eval_steps_per_second": 7.382, "step": 24500 }, { "epoch": 0.32779458206190787, "grad_norm": 2.1876091957092285, "learning_rate": 1.8580290782031922e-05, "loss": 0.2288, "step": 24600 }, { "epoch": 0.32912708036297256, "grad_norm": 7.1153459548950195, "learning_rate": 1.856548517959196e-05, "loss": 0.2966, "step": 24700 }, { "epoch": 0.3304595786640372, "grad_norm": 0.5204883217811584, "learning_rate": 1.8550679577151995e-05, "loss": 0.3103, "step": 24800 }, { "epoch": 0.3317920769651019, "grad_norm": 0.5321233868598938, "learning_rate": 1.8535873974712033e-05, "loss": 0.2403, "step": 24900 }, { "epoch": 0.3331245752661665, "grad_norm": 0.5437518358230591, "learning_rate": 1.8521068372272068e-05, "loss": 0.2986, "step": 25000 }, { "epoch": 0.3331245752661665, "eval_dev_accuracy": 0.964250981929482, "eval_dev_accuracy_threshold": 0.8822938203811646, "eval_dev_average_precision": 0.7923663060740336, "eval_dev_f1": 0.7238444852327716, "eval_dev_f1_threshold": 0.3350263833999634, "eval_dev_precision": 0.7350881156800723, "eval_dev_recall": 0.7129396296702093, "eval_loss": 0.39168474078178406, "eval_runtime": 559.9637, "eval_samples_per_second": 236.885, "eval_steps_per_second": 7.404, "step": 25000 }, { "epoch": 0.3344570735672312, "grad_norm": 10.434455871582031, "learning_rate": 1.8506262769832107e-05, "loss": 0.2954, "step": 25100 }, { "epoch": 0.3357895718682959, "grad_norm": 29.660995483398438, "learning_rate": 1.849145716739214e-05, "loss": 0.2778, "step": 25200 }, { "epoch": 0.3371220701693605, "grad_norm": 17.967578887939453, "learning_rate": 1.847665156495218e-05, "loss": 0.2522, "step": 25300 }, { "epoch": 0.3384545684704252, "grad_norm": 16.963655471801758, "learning_rate": 1.8461845962512218e-05, "loss": 0.3071, "step": 25400 }, { "epoch": 0.33978706677148984, "grad_norm": 3.178967237472534, "learning_rate": 1.8447040360072253e-05, "loss": 0.3088, "step": 25500 }, { "epoch": 0.33978706677148984, "eval_dev_accuracy": 0.9653064147700288, "eval_dev_accuracy_threshold": 0.9469561576843262, "eval_dev_average_precision": 0.8090508028224602, "eval_dev_f1": 0.7406513872135102, "eval_dev_f1_threshold": 0.9149296879768372, "eval_dev_precision": 0.7413017231917463, "eval_dev_recall": 0.7400021913005369, "eval_loss": 0.28880587220191956, "eval_runtime": 559.426, "eval_samples_per_second": 237.113, "eval_steps_per_second": 7.411, "step": 25500 }, { "epoch": 0.34111956507255453, "grad_norm": 31.83365821838379, "learning_rate": 1.843223475763229e-05, "loss": 0.3328, "step": 25600 }, { "epoch": 0.3424520633736192, "grad_norm": 73.58321380615234, "learning_rate": 1.8417429155192326e-05, "loss": 0.249, "step": 25700 }, { "epoch": 0.34378456167468385, "grad_norm": 31.073486328125, "learning_rate": 1.8402623552752365e-05, "loss": 0.248, "step": 25800 }, { "epoch": 0.34511705997574854, "grad_norm": 2.6796510219573975, "learning_rate": 1.83878179503124e-05, "loss": 0.2735, "step": 25900 }, { "epoch": 0.3464495582768132, "grad_norm": 19.556621551513672, "learning_rate": 1.8373012347872438e-05, "loss": 0.3087, "step": 26000 }, { "epoch": 0.3464495582768132, "eval_dev_accuracy": 0.9655853505921732, "eval_dev_accuracy_threshold": 0.9267855882644653, "eval_dev_average_precision": 0.8095342358911112, "eval_dev_f1": 0.7389250472391351, "eval_dev_f1_threshold": 0.6512651443481445, "eval_dev_precision": 0.7092191435768262, "eval_dev_recall": 0.7712282239509148, "eval_loss": 0.26777184009552, "eval_runtime": 562.5407, "eval_samples_per_second": 235.8, "eval_steps_per_second": 7.37, "step": 26000 }, { "epoch": 0.34778205657787786, "grad_norm": 10.894082069396973, "learning_rate": 1.8358206745432473e-05, "loss": 0.2852, "step": 26100 }, { "epoch": 0.34911455487894255, "grad_norm": 43.44607162475586, "learning_rate": 1.8343401142992508e-05, "loss": 0.255, "step": 26200 }, { "epoch": 0.3504470531800072, "grad_norm": 0.060168083757162094, "learning_rate": 1.8328595540552546e-05, "loss": 0.27, "step": 26300 }, { "epoch": 0.3517795514810719, "grad_norm": 0.13352444767951965, "learning_rate": 1.831378993811258e-05, "loss": 0.3315, "step": 26400 }, { "epoch": 0.3531120497821365, "grad_norm": 2.8769795894622803, "learning_rate": 1.829898433567262e-05, "loss": 0.2548, "step": 26500 }, { "epoch": 0.3531120497821365, "eval_dev_accuracy": 0.9633161699849978, "eval_dev_accuracy_threshold": 0.9598461389541626, "eval_dev_average_precision": 0.7967367390804647, "eval_dev_f1": 0.7211769095463995, "eval_dev_f1_threshold": 0.9407143592834473, "eval_dev_precision": 0.7022005397550343, "eval_dev_recall": 0.7412074065958146, "eval_loss": 0.26967185735702515, "eval_runtime": 561.1397, "eval_samples_per_second": 236.389, "eval_steps_per_second": 7.389, "step": 26500 }, { "epoch": 0.3544445480832012, "grad_norm": 6.555627822875977, "learning_rate": 1.8284178733232655e-05, "loss": 0.2967, "step": 26600 }, { "epoch": 0.3557770463842659, "grad_norm": 18.727455139160156, "learning_rate": 1.8269373130792693e-05, "loss": 0.2907, "step": 26700 }, { "epoch": 0.3571095446853305, "grad_norm": 16.004812240600586, "learning_rate": 1.825456752835273e-05, "loss": 0.2871, "step": 26800 }, { "epoch": 0.3584420429863952, "grad_norm": 0.3446504771709442, "learning_rate": 1.8239761925912766e-05, "loss": 0.287, "step": 26900 }, { "epoch": 0.35977454128745984, "grad_norm": 1.3801554441452026, "learning_rate": 1.8224956323472805e-05, "loss": 0.2461, "step": 27000 }, { "epoch": 0.35977454128745984, "eval_dev_accuracy": 0.9659321356683529, "eval_dev_accuracy_threshold": 0.9563218355178833, "eval_dev_average_precision": 0.8166384763438364, "eval_dev_f1": 0.7424130273871207, "eval_dev_f1_threshold": 0.5458764433860779, "eval_dev_precision": 0.7173801982221314, "eval_dev_recall": 0.7692560534677331, "eval_loss": 0.3241870701313019, "eval_runtime": 560.7697, "eval_samples_per_second": 236.545, "eval_steps_per_second": 7.393, "step": 27000 }, { "epoch": 0.3611070395885245, "grad_norm": 11.259644508361816, "learning_rate": 1.821015072103284e-05, "loss": 0.3134, "step": 27100 }, { "epoch": 0.3624395378895892, "grad_norm": 15.958681106567383, "learning_rate": 1.8195345118592878e-05, "loss": 0.229, "step": 27200 }, { "epoch": 0.36377203619065385, "grad_norm": 3.471926689147949, "learning_rate": 1.8180539516152913e-05, "loss": 0.2318, "step": 27300 }, { "epoch": 0.36510453449171854, "grad_norm": 57.36378479003906, "learning_rate": 1.816573391371295e-05, "loss": 0.2584, "step": 27400 }, { "epoch": 0.36643703279278317, "grad_norm": 15.649163246154785, "learning_rate": 1.8150928311272986e-05, "loss": 0.3092, "step": 27500 }, { "epoch": 0.36643703279278317, "eval_dev_accuracy": 0.9644696073035952, "eval_dev_accuracy_threshold": 0.9345089793205261, "eval_dev_average_precision": 0.8036883896122946, "eval_dev_f1": 0.7414679756895747, "eval_dev_f1_threshold": 0.8182344436645508, "eval_dev_precision": 0.7049585144211774, "eval_dev_recall": 0.7819655965815712, "eval_loss": 0.2574635446071625, "eval_runtime": 562.3168, "eval_samples_per_second": 235.894, "eval_steps_per_second": 7.373, "step": 27500 }, { "epoch": 0.36776953109384786, "grad_norm": 31.03179931640625, "learning_rate": 1.8136122708833024e-05, "loss": 0.2781, "step": 27600 }, { "epoch": 0.36910202939491255, "grad_norm": 32.65872573852539, "learning_rate": 1.8121317106393063e-05, "loss": 0.2411, "step": 27700 }, { "epoch": 0.3704345276959772, "grad_norm": 10.414048194885254, "learning_rate": 1.8106511503953098e-05, "loss": 0.2768, "step": 27800 }, { "epoch": 0.37176702599704187, "grad_norm": 0.27181100845336914, "learning_rate": 1.8091705901513136e-05, "loss": 0.256, "step": 27900 }, { "epoch": 0.3730995242981065, "grad_norm": 15.69724178314209, "learning_rate": 1.807690029907317e-05, "loss": 0.3024, "step": 28000 }, { "epoch": 0.3730995242981065, "eval_dev_accuracy": 0.9660904505944349, "eval_dev_accuracy_threshold": 0.960444450378418, "eval_dev_average_precision": 0.8143885872198954, "eval_dev_f1": 0.7409103007718926, "eval_dev_f1_threshold": 0.8899838328361511, "eval_dev_precision": 0.7205425553944916, "eval_dev_recall": 0.7624630218034404, "eval_loss": 0.2652537524700165, "eval_runtime": 560.0512, "eval_samples_per_second": 236.848, "eval_steps_per_second": 7.403, "step": 28000 }, { "epoch": 0.3744320225991712, "grad_norm": 4.027531623840332, "learning_rate": 1.806209469663321e-05, "loss": 0.2676, "step": 28100 }, { "epoch": 0.3757645209002359, "grad_norm": 6.543447494506836, "learning_rate": 1.8047289094193244e-05, "loss": 0.2384, "step": 28200 }, { "epoch": 0.3770970192013005, "grad_norm": 35.99159622192383, "learning_rate": 1.8032483491753283e-05, "loss": 0.2586, "step": 28300 }, { "epoch": 0.3784295175023652, "grad_norm": 1.3943774700164795, "learning_rate": 1.8017677889313318e-05, "loss": 0.2663, "step": 28400 }, { "epoch": 0.37976201580342983, "grad_norm": 0.43371257185935974, "learning_rate": 1.8002872286873352e-05, "loss": 0.3077, "step": 28500 }, { "epoch": 0.37976201580342983, "eval_dev_accuracy": 0.9662638431325247, "eval_dev_accuracy_threshold": 0.9389976263046265, "eval_dev_average_precision": 0.8185963813825948, "eval_dev_f1": 0.7529551465428834, "eval_dev_f1_threshold": 0.8002798557281494, "eval_dev_precision": 0.7420212765957447, "eval_dev_recall": 0.7642160622329353, "eval_loss": 0.2862532138824463, "eval_runtime": 562.8872, "eval_samples_per_second": 235.655, "eval_steps_per_second": 7.366, "step": 28500 }, { "epoch": 0.3810945141044945, "grad_norm": 7.868191719055176, "learning_rate": 1.798806668443339e-05, "loss": 0.2609, "step": 28600 }, { "epoch": 0.3824270124055592, "grad_norm": 0.37841853499412537, "learning_rate": 1.7973261081993426e-05, "loss": 0.277, "step": 28700 }, { "epoch": 0.38375951070662384, "grad_norm": 1.237690806388855, "learning_rate": 1.7958455479553464e-05, "loss": 0.2635, "step": 28800 }, { "epoch": 0.38509200900768853, "grad_norm": 14.932636260986328, "learning_rate": 1.79436498771135e-05, "loss": 0.2518, "step": 28900 }, { "epoch": 0.38642450730875316, "grad_norm": 7.698137283325195, "learning_rate": 1.7928844274673537e-05, "loss": 0.2686, "step": 29000 }, { "epoch": 0.38642450730875316, "eval_dev_accuracy": 0.9663693864165793, "eval_dev_accuracy_threshold": 0.9125785231590271, "eval_dev_average_precision": 0.8194613717227588, "eval_dev_f1": 0.7500950931913275, "eval_dev_f1_threshold": 0.7369703054428101, "eval_dev_precision": 0.7440707201379905, "eval_dev_recall": 0.7562178152733647, "eval_loss": 0.25516369938850403, "eval_runtime": 561.2432, "eval_samples_per_second": 236.345, "eval_steps_per_second": 7.387, "step": 29000 }, { "epoch": 0.38775700560981785, "grad_norm": 11.858484268188477, "learning_rate": 1.7914038672233576e-05, "loss": 0.2419, "step": 29100 }, { "epoch": 0.38908950391088254, "grad_norm": 1.3223813772201538, "learning_rate": 1.789923306979361e-05, "loss": 0.268, "step": 29200 }, { "epoch": 0.3904220022119472, "grad_norm": 1.3486851453781128, "learning_rate": 1.788442746735365e-05, "loss": 0.2851, "step": 29300 }, { "epoch": 0.39175450051301186, "grad_norm": 4.85157585144043, "learning_rate": 1.7869621864913684e-05, "loss": 0.2212, "step": 29400 }, { "epoch": 0.3930869988140765, "grad_norm": 6.538160800933838, "learning_rate": 1.7854816262473722e-05, "loss": 0.2571, "step": 29500 }, { "epoch": 0.3930869988140765, "eval_dev_accuracy": 0.9645676117816461, "eval_dev_accuracy_threshold": 0.8994825482368469, "eval_dev_average_precision": 0.8082227405172548, "eval_dev_f1": 0.7435443565181175, "eval_dev_f1_threshold": 0.609738826751709, "eval_dev_precision": 0.7083622656482492, "eval_dev_recall": 0.7824038566889449, "eval_loss": 0.2665890157222748, "eval_runtime": 562.6368, "eval_samples_per_second": 235.76, "eval_steps_per_second": 7.369, "step": 29500 }, { "epoch": 0.3944194971151412, "grad_norm": 10.298799514770508, "learning_rate": 1.7840010660033757e-05, "loss": 0.2803, "step": 29600 }, { "epoch": 0.39575199541620587, "grad_norm": 46.07704162597656, "learning_rate": 1.7825205057593796e-05, "loss": 0.3034, "step": 29700 }, { "epoch": 0.3970844937172705, "grad_norm": 12.525829315185547, "learning_rate": 1.781039945515383e-05, "loss": 0.2332, "step": 29800 }, { "epoch": 0.3984169920183352, "grad_norm": 3.9645519256591797, "learning_rate": 1.779559385271387e-05, "loss": 0.2444, "step": 29900 }, { "epoch": 0.3997494903193998, "grad_norm": 18.388866424560547, "learning_rate": 1.7780788250273907e-05, "loss": 0.247, "step": 30000 }, { "epoch": 0.3997494903193998, "eval_dev_accuracy": 0.9654571908901068, "eval_dev_accuracy_threshold": 0.9365599155426025, "eval_dev_average_precision": 0.8171252302464322, "eval_dev_f1": 0.747335818153184, "eval_dev_f1_threshold": 0.8443748354911804, "eval_dev_precision": 0.7173956863535578, "eval_dev_recall": 0.779883861071546, "eval_loss": 0.267426073551178, "eval_runtime": 564.1091, "eval_samples_per_second": 235.144, "eval_steps_per_second": 7.35, "step": 30000 }, { "epoch": 0.4010819886204645, "grad_norm": 23.66806411743164, "learning_rate": 1.7765982647833942e-05, "loss": 0.2861, "step": 30100 }, { "epoch": 0.4024144869215292, "grad_norm": 3.966848611831665, "learning_rate": 1.775117704539398e-05, "loss": 0.2409, "step": 30200 }, { "epoch": 0.40374698522259383, "grad_norm": 14.780499458312988, "learning_rate": 1.7736371442954015e-05, "loss": 0.2658, "step": 30300 }, { "epoch": 0.4050794835236585, "grad_norm": 30.90425682067871, "learning_rate": 1.7721565840514054e-05, "loss": 0.3114, "step": 30400 }, { "epoch": 0.40641198182472316, "grad_norm": 5.639667987823486, "learning_rate": 1.770676023807409e-05, "loss": 0.2685, "step": 30500 }, { "epoch": 0.40641198182472316, "eval_dev_accuracy": 0.9670704953749425, "eval_dev_accuracy_threshold": 0.9521620869636536, "eval_dev_average_precision": 0.8255021501170436, "eval_dev_f1": 0.7578924800343035, "eval_dev_f1_threshold": 0.8574447631835938, "eval_dev_precision": 0.7418677859391396, "eval_dev_recall": 0.7746247397830612, "eval_loss": 0.27643173933029175, "eval_runtime": 561.8887, "eval_samples_per_second": 236.073, "eval_steps_per_second": 7.379, "step": 30500 }, { "epoch": 0.40774448012578784, "grad_norm": 0.6215185523033142, "learning_rate": 1.7691954635634127e-05, "loss": 0.2354, "step": 30600 }, { "epoch": 0.40907697842685253, "grad_norm": 4.660243034362793, "learning_rate": 1.7677149033194162e-05, "loss": 0.2576, "step": 30700 }, { "epoch": 0.41040947672791717, "grad_norm": 0.37590527534484863, "learning_rate": 1.7662343430754197e-05, "loss": 0.2647, "step": 30800 }, { "epoch": 0.41174197502898185, "grad_norm": 0.8927075862884521, "learning_rate": 1.7647537828314235e-05, "loss": 0.2175, "step": 30900 }, { "epoch": 0.4130744733300465, "grad_norm": 3.024475336074829, "learning_rate": 1.763273222587427e-05, "loss": 0.3085, "step": 31000 }, { "epoch": 0.4130744733300465, "eval_dev_accuracy": 0.9660376789524076, "eval_dev_accuracy_threshold": 0.9548216462135315, "eval_dev_average_precision": 0.8156242337854964, "eval_dev_f1": 0.7478032096816627, "eval_dev_f1_threshold": 0.6426188945770264, "eval_dev_precision": 0.7193763919821826, "eval_dev_recall": 0.7785690807494248, "eval_loss": 0.26265889406204224, "eval_runtime": 565.3292, "eval_samples_per_second": 234.637, "eval_steps_per_second": 7.334, "step": 31000 }, { "epoch": 0.4144069716311112, "grad_norm": 0.6045613884925842, "learning_rate": 1.761792662343431e-05, "loss": 0.2637, "step": 31100 }, { "epoch": 0.4157394699321758, "grad_norm": 0.6080629229545593, "learning_rate": 1.7603121020994344e-05, "loss": 0.2567, "step": 31200 }, { "epoch": 0.4170719682332405, "grad_norm": 0.933800995349884, "learning_rate": 1.7588315418554382e-05, "loss": 0.2906, "step": 31300 }, { "epoch": 0.4184044665343052, "grad_norm": 3.305546522140503, "learning_rate": 1.757350981611442e-05, "loss": 0.2516, "step": 31400 }, { "epoch": 0.4197369648353698, "grad_norm": 9.856147766113281, "learning_rate": 1.7558704213674455e-05, "loss": 0.2342, "step": 31500 }, { "epoch": 0.4197369648353698, "eval_dev_accuracy": 0.9664523132826223, "eval_dev_accuracy_threshold": 0.6949450373649597, "eval_dev_average_precision": 0.8198951977617771, "eval_dev_f1": 0.752799668187474, "eval_dev_f1_threshold": 0.14068716764450073, "eval_dev_precision": 0.7144966046648952, "eval_dev_recall": 0.7954420948833133, "eval_loss": 0.3560490906238556, "eval_runtime": 566.4442, "eval_samples_per_second": 234.175, "eval_steps_per_second": 7.319, "step": 31500 }, { "epoch": 0.4210694631364345, "grad_norm": 6.468503952026367, "learning_rate": 1.7543898611234493e-05, "loss": 0.2595, "step": 31600 }, { "epoch": 0.42240196143749914, "grad_norm": 2.2248482704162598, "learning_rate": 1.752909300879453e-05, "loss": 0.259, "step": 31700 }, { "epoch": 0.42373445973856383, "grad_norm": 2.2780916690826416, "learning_rate": 1.7514287406354567e-05, "loss": 0.2563, "step": 31800 }, { "epoch": 0.4250669580396285, "grad_norm": 5.997177600860596, "learning_rate": 1.74994818039146e-05, "loss": 0.2504, "step": 31900 }, { "epoch": 0.42639945634069315, "grad_norm": 5.018893241882324, "learning_rate": 1.748467620147464e-05, "loss": 0.2751, "step": 32000 }, { "epoch": 0.42639945634069315, "eval_dev_accuracy": 0.9660979894004388, "eval_dev_accuracy_threshold": 0.9447215795516968, "eval_dev_average_precision": 0.818149670082586, "eval_dev_f1": 0.7564001884718078, "eval_dev_f1_threshold": 0.7197975516319275, "eval_dev_precision": 0.7242831361540004, "eval_dev_recall": 0.7914977539169498, "eval_loss": 0.23995983600616455, "eval_runtime": 559.4727, "eval_samples_per_second": 237.093, "eval_steps_per_second": 7.411, "step": 32000 }, { "epoch": 0.42773195464175784, "grad_norm": 9.826861381530762, "learning_rate": 1.7469870599034675e-05, "loss": 0.2521, "step": 32100 }, { "epoch": 0.42906445294282247, "grad_norm": 7.288123607635498, "learning_rate": 1.7455064996594713e-05, "loss": 0.2406, "step": 32200 }, { "epoch": 0.43039695124388716, "grad_norm": 11.257208824157715, "learning_rate": 1.744025939415475e-05, "loss": 0.3026, "step": 32300 }, { "epoch": 0.43172944954495185, "grad_norm": 0.21672357618808746, "learning_rate": 1.7425453791714787e-05, "loss": 0.234, "step": 32400 }, { "epoch": 0.4330619478460165, "grad_norm": 1.5854872465133667, "learning_rate": 1.7410648189274825e-05, "loss": 0.2639, "step": 32500 }, { "epoch": 0.4330619478460165, "eval_dev_accuracy": 0.9651707162619584, "eval_dev_accuracy_threshold": 0.8978205919265747, "eval_dev_average_precision": 0.8087336536278384, "eval_dev_f1": 0.740958788898234, "eval_dev_f1_threshold": 0.7787094712257385, "eval_dev_precision": 0.7121349904011317, "eval_dev_recall": 0.7722143091925058, "eval_loss": 0.2519395053386688, "eval_runtime": 557.1987, "eval_samples_per_second": 238.061, "eval_steps_per_second": 7.441, "step": 32500 }, { "epoch": 0.43439444614708117, "grad_norm": 5.898445129394531, "learning_rate": 1.739584258683486e-05, "loss": 0.2321, "step": 32600 }, { "epoch": 0.4357269444481458, "grad_norm": 0.27915239334106445, "learning_rate": 1.7381036984394898e-05, "loss": 0.1894, "step": 32700 }, { "epoch": 0.4370594427492105, "grad_norm": 0.3429672122001648, "learning_rate": 1.7366231381954933e-05, "loss": 0.3076, "step": 32800 }, { "epoch": 0.4383919410502752, "grad_norm": 0.6808755397796631, "learning_rate": 1.735142577951497e-05, "loss": 0.2392, "step": 32900 }, { "epoch": 0.4397244393513398, "grad_norm": 36.33818435668945, "learning_rate": 1.7336620177075006e-05, "loss": 0.2742, "step": 33000 }, { "epoch": 0.4397244393513398, "eval_dev_accuracy": 0.9674248192571261, "eval_dev_accuracy_threshold": 0.9071935415267944, "eval_dev_average_precision": 0.8132130323917695, "eval_dev_f1": 0.7505652677438923, "eval_dev_f1_threshold": 0.5399670600891113, "eval_dev_precision": 0.7556073728625361, "eval_dev_recall": 0.7455900076695519, "eval_loss": 0.30597466230392456, "eval_runtime": 519.866, "eval_samples_per_second": 255.156, "eval_steps_per_second": 7.975, "step": 33000 }, { "epoch": 0.4410569376524045, "grad_norm": 6.550230503082275, "learning_rate": 1.732181457463504e-05, "loss": 0.2624, "step": 33100 }, { "epoch": 0.44238943595346913, "grad_norm": 15.728365898132324, "learning_rate": 1.730700897219508e-05, "loss": 0.2481, "step": 33200 }, { "epoch": 0.4437219342545338, "grad_norm": 1.1476960182189941, "learning_rate": 1.7292203369755115e-05, "loss": 0.2289, "step": 33300 }, { "epoch": 0.4450544325555985, "grad_norm": 89.61054992675781, "learning_rate": 1.7277397767315153e-05, "loss": 0.2854, "step": 33400 }, { "epoch": 0.44638693085666314, "grad_norm": 4.351845741271973, "learning_rate": 1.7262592164875188e-05, "loss": 0.2733, "step": 33500 }, { "epoch": 0.44638693085666314, "eval_dev_accuracy": 0.9650500953658959, "eval_dev_accuracy_threshold": 0.9060708284378052, "eval_dev_average_precision": 0.8133536713572236, "eval_dev_f1": 0.744153082919915, "eval_dev_f1_threshold": 0.8223495483398438, "eval_dev_precision": 0.7405598958333334, "eval_dev_recall": 0.7477813082064205, "eval_loss": 0.272208571434021, "eval_runtime": 521.749, "eval_samples_per_second": 254.235, "eval_steps_per_second": 7.946, "step": 33500 }, { "epoch": 0.44771942915772783, "grad_norm": 6.246555805206299, "learning_rate": 1.7247786562435226e-05, "loss": 0.2759, "step": 33600 }, { "epoch": 0.44905192745879247, "grad_norm": 52.076377868652344, "learning_rate": 1.7232980959995265e-05, "loss": 0.2588, "step": 33700 }, { "epoch": 0.45038442575985715, "grad_norm": 5.682718276977539, "learning_rate": 1.72181753575553e-05, "loss": 0.2106, "step": 33800 }, { "epoch": 0.45171692406092184, "grad_norm": 2.271516799926758, "learning_rate": 1.7203369755115338e-05, "loss": 0.2631, "step": 33900 }, { "epoch": 0.4530494223619865, "grad_norm": 1.0763822793960571, "learning_rate": 1.7188564152675373e-05, "loss": 0.304, "step": 34000 }, { "epoch": 0.4530494223619865, "eval_dev_accuracy": 0.9669197192548644, "eval_dev_accuracy_threshold": 0.8872429132461548, "eval_dev_average_precision": 0.8222864572131344, "eval_dev_f1": 0.7534934497816593, "eval_dev_f1_threshold": 0.4772883951663971, "eval_dev_precision": 0.750788643533123, "eval_dev_recall": 0.7562178152733647, "eval_loss": 0.2554573118686676, "eval_runtime": 520.4082, "eval_samples_per_second": 254.89, "eval_steps_per_second": 7.967, "step": 34000 }, { "epoch": 0.45438192066305116, "grad_norm": 0.5738760828971863, "learning_rate": 1.717375855023541e-05, "loss": 0.2513, "step": 34100 }, { "epoch": 0.4557144189641158, "grad_norm": 2.8462681770324707, "learning_rate": 1.7158952947795446e-05, "loss": 0.2507, "step": 34200 }, { "epoch": 0.4570469172651805, "grad_norm": 8.60177993774414, "learning_rate": 1.7144147345355484e-05, "loss": 0.2417, "step": 34300 }, { "epoch": 0.4583794155662452, "grad_norm": 1.3673675060272217, "learning_rate": 1.712934174291552e-05, "loss": 0.239, "step": 34400 }, { "epoch": 0.4597119138673098, "grad_norm": 36.5560188293457, "learning_rate": 1.7114536140475558e-05, "loss": 0.2527, "step": 34500 }, { "epoch": 0.4597119138673098, "eval_dev_accuracy": 0.9667614043287824, "eval_dev_accuracy_threshold": 0.9581319093704224, "eval_dev_average_precision": 0.818866417573704, "eval_dev_f1": 0.7523900039134568, "eval_dev_f1_threshold": 0.9470370411872864, "eval_dev_precision": 0.7681506849315068, "eval_dev_recall": 0.7372630656294511, "eval_loss": 0.2984105348587036, "eval_runtime": 519.7969, "eval_samples_per_second": 255.19, "eval_steps_per_second": 7.976, "step": 34500 }, { "epoch": 0.4610444121683745, "grad_norm": 17.973974227905273, "learning_rate": 1.7099730538035596e-05, "loss": 0.2807, "step": 34600 }, { "epoch": 0.46237691046943913, "grad_norm": 9.143497467041016, "learning_rate": 1.708492493559563e-05, "loss": 0.2304, "step": 34700 }, { "epoch": 0.4637094087705038, "grad_norm": 8.447179794311523, "learning_rate": 1.707011933315567e-05, "loss": 0.2707, "step": 34800 }, { "epoch": 0.4650419070715685, "grad_norm": 0.18045054376125336, "learning_rate": 1.7055313730715704e-05, "loss": 0.2202, "step": 34900 }, { "epoch": 0.46637440537263314, "grad_norm": 18.00141716003418, "learning_rate": 1.7040508128275743e-05, "loss": 0.2802, "step": 35000 }, { "epoch": 0.46637440537263314, "eval_dev_accuracy": 0.9667387879107707, "eval_dev_accuracy_threshold": 0.922869086265564, "eval_dev_average_precision": 0.8248757172419965, "eval_dev_f1": 0.7573180276545787, "eval_dev_f1_threshold": 0.618488073348999, "eval_dev_precision": 0.7229527794381351, "eval_dev_recall": 0.795113399802783, "eval_loss": 0.2512986958026886, "eval_runtime": 520.5462, "eval_samples_per_second": 254.823, "eval_steps_per_second": 7.965, "step": 35000 }, { "epoch": 0.4677069036736978, "grad_norm": 0.6688315868377686, "learning_rate": 1.7025702525835778e-05, "loss": 0.2375, "step": 35100 }, { "epoch": 0.46903940197476246, "grad_norm": 17.023473739624023, "learning_rate": 1.7010896923395816e-05, "loss": 0.2058, "step": 35200 }, { "epoch": 0.47037190027582715, "grad_norm": 0.3867310881614685, "learning_rate": 1.699609132095585e-05, "loss": 0.2419, "step": 35300 }, { "epoch": 0.47170439857689184, "grad_norm": 13.710586547851562, "learning_rate": 1.6981285718515886e-05, "loss": 0.2232, "step": 35400 }, { "epoch": 0.47303689687795647, "grad_norm": 14.513033866882324, "learning_rate": 1.6966480116075924e-05, "loss": 0.316, "step": 35500 }, { "epoch": 0.47303689687795647, "eval_dev_accuracy": 0.9672815819430518, "eval_dev_accuracy_threshold": 0.9403676986694336, "eval_dev_average_precision": 0.821150327893476, "eval_dev_f1": 0.7596174282678001, "eval_dev_f1_threshold": 0.7547413110733032, "eval_dev_precision": 0.7374393892499742, "eval_dev_recall": 0.7831708118768489, "eval_loss": 0.2614619731903076, "eval_runtime": 520.6856, "eval_samples_per_second": 254.755, "eval_steps_per_second": 7.963, "step": 35500 }, { "epoch": 0.47436939517902116, "grad_norm": 2.2954721450805664, "learning_rate": 1.695167451363596e-05, "loss": 0.2527, "step": 35600 }, { "epoch": 0.4757018934800858, "grad_norm": 2.294912338256836, "learning_rate": 1.6936868911195997e-05, "loss": 0.2732, "step": 35700 }, { "epoch": 0.4770343917811505, "grad_norm": 100.57258605957031, "learning_rate": 1.6922063308756032e-05, "loss": 0.2806, "step": 35800 }, { "epoch": 0.47836689008221517, "grad_norm": 13.040018081665039, "learning_rate": 1.690725770631607e-05, "loss": 0.25, "step": 35900 }, { "epoch": 0.4796993883832798, "grad_norm": 0.7189066410064697, "learning_rate": 1.689245210387611e-05, "loss": 0.2173, "step": 36000 }, { "epoch": 0.4796993883832798, "eval_dev_accuracy": 0.9675982117952159, "eval_dev_accuracy_threshold": 0.9232138395309448, "eval_dev_average_precision": 0.8271432363427305, "eval_dev_f1": 0.7561493449329397, "eval_dev_f1_threshold": 0.49452510476112366, "eval_dev_precision": 0.7169088766692852, "eval_dev_recall": 0.799934260983894, "eval_loss": 0.2864265441894531, "eval_runtime": 519.7616, "eval_samples_per_second": 255.207, "eval_steps_per_second": 7.977, "step": 36000 }, { "epoch": 0.4810318866843445, "grad_norm": 0.35153084993362427, "learning_rate": 1.6877646501436144e-05, "loss": 0.2576, "step": 36100 }, { "epoch": 0.4823643849854091, "grad_norm": 0.3834153413772583, "learning_rate": 1.6862840898996182e-05, "loss": 0.2087, "step": 36200 }, { "epoch": 0.4836968832864738, "grad_norm": 0.9096924066543579, "learning_rate": 1.6848035296556217e-05, "loss": 0.2581, "step": 36300 }, { "epoch": 0.4850293815875385, "grad_norm": 17.327335357666016, "learning_rate": 1.6833229694116256e-05, "loss": 0.265, "step": 36400 }, { "epoch": 0.48636187988860313, "grad_norm": 3.3336431980133057, "learning_rate": 1.681842409167629e-05, "loss": 0.2404, "step": 36500 }, { "epoch": 0.48636187988860313, "eval_dev_accuracy": 0.9678545311993486, "eval_dev_accuracy_threshold": 0.6843677163124084, "eval_dev_average_precision": 0.8368359833153991, "eval_dev_f1": 0.7613580982292738, "eval_dev_f1_threshold": 0.3513629138469696, "eval_dev_precision": 0.7526766595289079, "eval_dev_recall": 0.770242138709324, "eval_loss": 0.3165341913700104, "eval_runtime": 520.0004, "eval_samples_per_second": 255.09, "eval_steps_per_second": 7.973, "step": 36500 }, { "epoch": 0.4876943781896678, "grad_norm": 22.69322395324707, "learning_rate": 1.680361848923633e-05, "loss": 0.2525, "step": 36600 }, { "epoch": 0.48902687649073245, "grad_norm": 4.788589954376221, "learning_rate": 1.6788812886796367e-05, "loss": 0.2262, "step": 36700 }, { "epoch": 0.49035937479179714, "grad_norm": 48.63047409057617, "learning_rate": 1.6774007284356402e-05, "loss": 0.2572, "step": 36800 }, { "epoch": 0.49169187309286183, "grad_norm": 8.924850463867188, "learning_rate": 1.675920168191644e-05, "loss": 0.2608, "step": 36900 }, { "epoch": 0.49302437139392646, "grad_norm": 0.28982293605804443, "learning_rate": 1.6744396079476476e-05, "loss": 0.2212, "step": 37000 }, { "epoch": 0.49302437139392646, "eval_dev_accuracy": 0.9689250416519032, "eval_dev_accuracy_threshold": 0.9271968603134155, "eval_dev_average_precision": 0.8366709399417354, "eval_dev_f1": 0.7682220970137786, "eval_dev_f1_threshold": 0.2581200897693634, "eval_dev_precision": 0.7213351288957291, "eval_dev_recall": 0.8216281362988934, "eval_loss": 0.29943621158599854, "eval_runtime": 519.5326, "eval_samples_per_second": 255.32, "eval_steps_per_second": 7.98, "step": 37000 }, { "epoch": 0.49435686969499115, "grad_norm": 8.631064414978027, "learning_rate": 1.6729590477036514e-05, "loss": 0.2236, "step": 37100 }, { "epoch": 0.4956893679960558, "grad_norm": 0.3893554210662842, "learning_rate": 1.671478487459655e-05, "loss": 0.2542, "step": 37200 }, { "epoch": 0.4970218662971205, "grad_norm": 11.258530616760254, "learning_rate": 1.6699979272156587e-05, "loss": 0.2775, "step": 37300 }, { "epoch": 0.49835436459818516, "grad_norm": 23.54794692993164, "learning_rate": 1.6685173669716622e-05, "loss": 0.2437, "step": 37400 }, { "epoch": 0.4996868628992498, "grad_norm": 15.748093605041504, "learning_rate": 1.667036806727666e-05, "loss": 0.3368, "step": 37500 }, { "epoch": 0.4996868628992498, "eval_dev_accuracy": 0.9686687222477705, "eval_dev_accuracy_threshold": 0.9627949595451355, "eval_dev_average_precision": 0.8345873786652108, "eval_dev_f1": 0.7644562041783806, "eval_dev_f1_threshold": 0.6956943869590759, "eval_dev_precision": 0.7692478366984691, "eval_dev_recall": 0.7597238961323546, "eval_loss": 0.2797723412513733, "eval_runtime": 520.4988, "eval_samples_per_second": 254.846, "eval_steps_per_second": 7.965, "step": 37500 }, { "epoch": 0.5010193612003144, "grad_norm": 2.5468738079071045, "learning_rate": 1.6655562464836695e-05, "loss": 0.2806, "step": 37600 }, { "epoch": 0.5023518595013792, "grad_norm": 2.1441900730133057, "learning_rate": 1.664075686239673e-05, "loss": 0.2576, "step": 37700 }, { "epoch": 0.5036843578024438, "grad_norm": 1.2568778991699219, "learning_rate": 1.662595125995677e-05, "loss": 0.2848, "step": 37800 }, { "epoch": 0.5050168561035084, "grad_norm": 3.095561981201172, "learning_rate": 1.6611145657516804e-05, "loss": 0.215, "step": 37900 }, { "epoch": 0.5063493544045732, "grad_norm": 1.0205029249191284, "learning_rate": 1.6596340055076842e-05, "loss": 0.2331, "step": 38000 }, { "epoch": 0.5063493544045732, "eval_dev_accuracy": 0.9679223804533837, "eval_dev_accuracy_threshold": 0.9669108390808105, "eval_dev_average_precision": 0.8269638058273905, "eval_dev_f1": 0.7641839204087119, "eval_dev_f1_threshold": 0.8454810380935669, "eval_dev_precision": 0.7504224757076469, "eval_dev_recall": 0.7784595157225813, "eval_loss": 0.27937838435173035, "eval_runtime": 519.5927, "eval_samples_per_second": 255.29, "eval_steps_per_second": 7.979, "step": 38000 }, { "epoch": 0.5076818527056378, "grad_norm": 31.32097053527832, "learning_rate": 1.6581534452636877e-05, "loss": 0.27, "step": 38100 }, { "epoch": 0.5090143510067024, "grad_norm": 0.6534382104873657, "learning_rate": 1.6566728850196915e-05, "loss": 0.2522, "step": 38200 }, { "epoch": 0.5103468493077671, "grad_norm": 23.841657638549805, "learning_rate": 1.6551923247756954e-05, "loss": 0.251, "step": 38300 }, { "epoch": 0.5116793476088318, "grad_norm": 11.927959442138672, "learning_rate": 1.653711764531699e-05, "loss": 0.2299, "step": 38400 }, { "epoch": 0.5130118459098965, "grad_norm": 1.765657663345337, "learning_rate": 1.6522312042877027e-05, "loss": 0.2543, "step": 38500 }, { "epoch": 0.5130118459098965, "eval_dev_accuracy": 0.9683596312016103, "eval_dev_accuracy_threshold": 0.9189764261245728, "eval_dev_average_precision": 0.8251932254713443, "eval_dev_f1": 0.7703276368781975, "eval_dev_f1_threshold": 0.8014627695083618, "eval_dev_precision": 0.765329295987888, "eval_dev_recall": 0.7753916949709653, "eval_loss": 0.2911910116672516, "eval_runtime": 519.1529, "eval_samples_per_second": 255.507, "eval_steps_per_second": 7.986, "step": 38500 }, { "epoch": 0.5143443442109611, "grad_norm": 6.26005220413208, "learning_rate": 1.6507506440437062e-05, "loss": 0.203, "step": 38600 }, { "epoch": 0.5156768425120258, "grad_norm": 3.370025157928467, "learning_rate": 1.64927008379971e-05, "loss": 0.2621, "step": 38700 }, { "epoch": 0.5170093408130905, "grad_norm": 29.85224151611328, "learning_rate": 1.6477895235557135e-05, "loss": 0.2677, "step": 38800 }, { "epoch": 0.5183418391141551, "grad_norm": 13.099495887756348, "learning_rate": 1.6463089633117173e-05, "loss": 0.2377, "step": 38900 }, { "epoch": 0.5196743374152198, "grad_norm": 17.140789031982422, "learning_rate": 1.6448284030677212e-05, "loss": 0.265, "step": 39000 }, { "epoch": 0.5196743374152198, "eval_dev_accuracy": 0.9675529789591925, "eval_dev_accuracy_threshold": 0.9323844909667969, "eval_dev_average_precision": 0.818988116595722, "eval_dev_f1": 0.7656208525773743, "eval_dev_f1_threshold": 0.8410446643829346, "eval_dev_precision": 0.7426364572605562, "eval_dev_recall": 0.7900734085679851, "eval_loss": 0.28379642963409424, "eval_runtime": 518.2405, "eval_samples_per_second": 255.956, "eval_steps_per_second": 8.0, "step": 39000 }, { "epoch": 0.5210068357162845, "grad_norm": 2.0083911418914795, "learning_rate": 1.319120586275816e-07, "loss": 0.2108, "step": 39100 }, { "epoch": 0.5223393340173491, "grad_norm": 0.4948272705078125, "learning_rate": 2.651565622918055e-07, "loss": 0.227, "step": 39200 }, { "epoch": 0.5236718323184137, "grad_norm": 11.525949478149414, "learning_rate": 3.984010659560293e-07, "loss": 0.2081, "step": 39300 }, { "epoch": 0.5250043306194785, "grad_norm": 18.18743133544922, "learning_rate": 5.316455696202532e-07, "loss": 0.2782, "step": 39400 }, { "epoch": 0.5263368289205431, "grad_norm": 30.067602157592773, "learning_rate": 6.64890073284477e-07, "loss": 0.2357, "step": 39500 }, { "epoch": 0.5263368289205431, "eval_dev_accuracy": 0.9679374580653916, "eval_dev_accuracy_threshold": 0.8992660045623779, "eval_dev_average_precision": 0.8239503903565419, "eval_dev_f1": 0.768843413510473, "eval_dev_f1_threshold": 0.8412591814994812, "eval_dev_precision": 0.7522012578616353, "eval_dev_recall": 0.7862386326284649, "eval_loss": 0.27902960777282715, "eval_runtime": 522.9572, "eval_samples_per_second": 253.648, "eval_steps_per_second": 7.928, "step": 39500 }, { "epoch": 0.5276693272216078, "grad_norm": 1.496453881263733, "learning_rate": 7.981345769487009e-07, "loss": 0.2654, "step": 39600 }, { "epoch": 0.5290018255226725, "grad_norm": 2.676929473876953, "learning_rate": 9.313790806129248e-07, "loss": 0.2572, "step": 39700 }, { "epoch": 0.5303343238237371, "grad_norm": 1.3355958461761475, "learning_rate": 1.0646235842771487e-06, "loss": 0.2452, "step": 39800 }, { "epoch": 0.5316668221248018, "grad_norm": 24.94687843322754, "learning_rate": 1.1978680879413725e-06, "loss": 0.2412, "step": 39900 }, { "epoch": 0.5329993204258665, "grad_norm": 16.272785186767578, "learning_rate": 1.3311125916055965e-06, "loss": 0.2656, "step": 40000 }, { "epoch": 0.5329993204258665, "eval_dev_accuracy": 0.9683219371715908, "eval_dev_accuracy_threshold": 0.8796899914741516, "eval_dev_average_precision": 0.8334902875069624, "eval_dev_f1": 0.7711174542763505, "eval_dev_f1_threshold": 0.6210243701934814, "eval_dev_precision": 0.7449698702890409, "eval_dev_recall": 0.7991673057959899, "eval_loss": 0.2717488408088684, "eval_runtime": 523.9326, "eval_samples_per_second": 253.176, "eval_steps_per_second": 7.913, "step": 40000 }, { "epoch": 0.5343318187269311, "grad_norm": 38.643516540527344, "learning_rate": 1.4643570952698202e-06, "loss": 0.2558, "step": 40100 }, { "epoch": 0.5356643170279958, "grad_norm": 0.41367307305336, "learning_rate": 1.597601598934044e-06, "loss": 0.2445, "step": 40200 }, { "epoch": 0.5369968153290604, "grad_norm": 0.5968548655509949, "learning_rate": 1.7308461025982678e-06, "loss": 0.225, "step": 40300 }, { "epoch": 0.5383293136301252, "grad_norm": 3.6407761573791504, "learning_rate": 1.864090606262492e-06, "loss": 0.1996, "step": 40400 }, { "epoch": 0.5396618119311898, "grad_norm": 4.504887580871582, "learning_rate": 1.9973351099267156e-06, "loss": 0.244, "step": 40500 }, { "epoch": 0.5396618119311898, "eval_dev_accuracy": 0.9687214938897978, "eval_dev_accuracy_threshold": 0.9278361797332764, "eval_dev_average_precision": 0.8391958373486473, "eval_dev_f1": 0.772467364332722, "eval_dev_f1_threshold": 0.8639750480651855, "eval_dev_precision": 0.7608118159600468, "eval_dev_recall": 0.7844855921989701, "eval_loss": 0.2598799467086792, "eval_runtime": 524.043, "eval_samples_per_second": 253.122, "eval_steps_per_second": 7.912, "step": 40500 }, { "epoch": 0.5409943102322544, "grad_norm": 102.69219970703125, "learning_rate": 2.1305796135909398e-06, "loss": 0.2261, "step": 40600 }, { "epoch": 0.5423268085333192, "grad_norm": 0.4366992115974426, "learning_rate": 2.2638241172551636e-06, "loss": 0.2146, "step": 40700 }, { "epoch": 0.5436593068343838, "grad_norm": 0.5195454955101013, "learning_rate": 2.3970686209193873e-06, "loss": 0.2287, "step": 40800 }, { "epoch": 0.5449918051354484, "grad_norm": 0.5551161170005798, "learning_rate": 2.530313124583611e-06, "loss": 0.2278, "step": 40900 }, { "epoch": 0.5463243034365132, "grad_norm": 0.49544551968574524, "learning_rate": 2.663557628247835e-06, "loss": 0.2482, "step": 41000 }, { "epoch": 0.5463243034365132, "eval_dev_accuracy": 0.9691587446380242, "eval_dev_accuracy_threshold": 0.9283666610717773, "eval_dev_average_precision": 0.8431961837252191, "eval_dev_f1": 0.7750185715801761, "eval_dev_f1_threshold": 0.6344282627105713, "eval_dev_precision": 0.7514147546043831, "eval_dev_recall": 0.8001533910375808, "eval_loss": 0.275828093290329, "eval_runtime": 522.4079, "eval_samples_per_second": 253.915, "eval_steps_per_second": 7.936, "step": 41000 }, { "epoch": 0.5476568017375778, "grad_norm": 0.10281296074390411, "learning_rate": 2.7968021319120587e-06, "loss": 0.2163, "step": 41100 }, { "epoch": 0.5489893000386424, "grad_norm": 1.15056312084198, "learning_rate": 2.930046635576283e-06, "loss": 0.2284, "step": 41200 }, { "epoch": 0.5503217983397071, "grad_norm": 0.4747524559497833, "learning_rate": 3.0632911392405066e-06, "loss": 0.2382, "step": 41300 }, { "epoch": 0.5516542966407718, "grad_norm": 0.4341018795967102, "learning_rate": 3.1965356429047304e-06, "loss": 0.2355, "step": 41400 }, { "epoch": 0.5529867949418364, "grad_norm": 14.61008071899414, "learning_rate": 3.3297801465689546e-06, "loss": 0.2247, "step": 41500 }, { "epoch": 0.5529867949418364, "eval_dev_accuracy": 0.9692039774740476, "eval_dev_accuracy_threshold": 0.9339917302131653, "eval_dev_average_precision": 0.8436933951228754, "eval_dev_f1": 0.7787227299138979, "eval_dev_f1_threshold": 0.5835311412811279, "eval_dev_precision": 0.7518359853121175, "eval_dev_recall": 0.8076038128629341, "eval_loss": 0.2721947729587555, "eval_runtime": 523.5606, "eval_samples_per_second": 253.356, "eval_steps_per_second": 7.919, "step": 41500 }, { "epoch": 0.5543192932429011, "grad_norm": 0.17993593215942383, "learning_rate": 3.4630246502331784e-06, "loss": 0.2731, "step": 41600 }, { "epoch": 0.5556517915439658, "grad_norm": 0.47082406282424927, "learning_rate": 3.596269153897402e-06, "loss": 0.2493, "step": 41700 }, { "epoch": 0.5569842898450305, "grad_norm": 3.0138349533081055, "learning_rate": 3.729513657561626e-06, "loss": 0.2002, "step": 41800 }, { "epoch": 0.5583167881460951, "grad_norm": 15.761974334716797, "learning_rate": 3.862758161225849e-06, "loss": 0.2301, "step": 41900 }, { "epoch": 0.5596492864471598, "grad_norm": 0.34038063883781433, "learning_rate": 3.996002664890073e-06, "loss": 0.2136, "step": 42000 }, { "epoch": 0.5596492864471598, "eval_dev_accuracy": 0.9691587446380242, "eval_dev_accuracy_threshold": 0.9421218633651733, "eval_dev_average_precision": 0.8475633374819089, "eval_dev_f1": 0.7781878671310496, "eval_dev_f1_threshold": 0.3623931407928467, "eval_dev_precision": 0.7403560830860534, "eval_dev_recall": 0.8200942259230853, "eval_loss": 0.2632051110267639, "eval_runtime": 523.1078, "eval_samples_per_second": 253.575, "eval_steps_per_second": 7.926, "step": 42000 }, { "epoch": 0.5609817847482245, "grad_norm": 0.8982422351837158, "learning_rate": 4.129247168554298e-06, "loss": 0.2323, "step": 42100 }, { "epoch": 0.5623142830492891, "grad_norm": 3.004122495651245, "learning_rate": 4.2624916722185215e-06, "loss": 0.2274, "step": 42200 }, { "epoch": 0.5636467813503537, "grad_norm": 7.217723846435547, "learning_rate": 4.395736175882745e-06, "loss": 0.2233, "step": 42300 }, { "epoch": 0.5649792796514185, "grad_norm": 1.1566057205200195, "learning_rate": 4.528980679546969e-06, "loss": 0.2819, "step": 42400 }, { "epoch": 0.5663117779524831, "grad_norm": 0.2774888575077057, "learning_rate": 4.662225183211193e-06, "loss": 0.2002, "step": 42500 }, { "epoch": 0.5663117779524831, "eval_dev_accuracy": 0.9700181685224694, "eval_dev_accuracy_threshold": 0.9420008063316345, "eval_dev_average_precision": 0.8490166145203218, "eval_dev_f1": 0.7794501933730532, "eval_dev_f1_threshold": 0.41960281133651733, "eval_dev_precision": 0.7451783751374038, "eval_dev_recall": 0.8170264051714693, "eval_loss": 0.2606056034564972, "eval_runtime": 524.5455, "eval_samples_per_second": 252.88, "eval_steps_per_second": 7.904, "step": 42500 }, { "epoch": 0.5676442762535477, "grad_norm": 13.932589530944824, "learning_rate": 4.795469686875417e-06, "loss": 0.2599, "step": 42600 }, { "epoch": 0.5689767745546125, "grad_norm": 10.140316009521484, "learning_rate": 4.92871419053964e-06, "loss": 0.2478, "step": 42700 }, { "epoch": 0.5703092728556771, "grad_norm": 13.381287574768066, "learning_rate": 5.061958694203864e-06, "loss": 0.2151, "step": 42800 }, { "epoch": 0.5716417711567418, "grad_norm": 3.821155548095703, "learning_rate": 5.195203197868088e-06, "loss": 0.2207, "step": 42900 }, { "epoch": 0.5729742694578065, "grad_norm": 0.3303406834602356, "learning_rate": 5.328447701532313e-06, "loss": 0.2683, "step": 43000 }, { "epoch": 0.5729742694578065, "eval_dev_accuracy": 0.9702820267326061, "eval_dev_accuracy_threshold": 0.9166876673698425, "eval_dev_average_precision": 0.8539072755077529, "eval_dev_f1": 0.7817631806395852, "eval_dev_f1_threshold": 0.4148586690425873, "eval_dev_precision": 0.7710175812466702, "eval_dev_recall": 0.7928125342390709, "eval_loss": 0.2761251628398895, "eval_runtime": 522.8877, "eval_samples_per_second": 253.682, "eval_steps_per_second": 7.929, "step": 43000 }, { "epoch": 0.5743067677588711, "grad_norm": 2.869353771209717, "learning_rate": 5.461692205196536e-06, "loss": 0.2233, "step": 43100 }, { "epoch": 0.5756392660599358, "grad_norm": 1.4524685144424438, "learning_rate": 5.59493670886076e-06, "loss": 0.2473, "step": 43200 }, { "epoch": 0.5769717643610004, "grad_norm": 0.838426411151886, "learning_rate": 5.728181212524984e-06, "loss": 0.2289, "step": 43300 }, { "epoch": 0.5783042626620651, "grad_norm": 33.507659912109375, "learning_rate": 5.861425716189208e-06, "loss": 0.2757, "step": 43400 }, { "epoch": 0.5796367609631298, "grad_norm": 10.75368595123291, "learning_rate": 5.9946702198534315e-06, "loss": 0.2489, "step": 43500 }, { "epoch": 0.5796367609631298, "eval_dev_accuracy": 0.9702367938965827, "eval_dev_accuracy_threshold": 0.9455279111862183, "eval_dev_average_precision": 0.8513893973961074, "eval_dev_f1": 0.7795382036446223, "eval_dev_f1_threshold": 0.6581396460533142, "eval_dev_precision": 0.7695921417894512, "eval_dev_recall": 0.7897447134874548, "eval_loss": 0.24530762434005737, "eval_runtime": 523.1112, "eval_samples_per_second": 253.573, "eval_steps_per_second": 7.926, "step": 43500 }, { "epoch": 0.5809692592641944, "grad_norm": 4.178175449371338, "learning_rate": 6.127914723517655e-06, "loss": 0.2238, "step": 43600 }, { "epoch": 0.5823017575652591, "grad_norm": 7.612859725952148, "learning_rate": 6.261159227181879e-06, "loss": 0.2342, "step": 43700 }, { "epoch": 0.5836342558663238, "grad_norm": 19.10555648803711, "learning_rate": 6.394403730846103e-06, "loss": 0.2209, "step": 43800 }, { "epoch": 0.5849667541673884, "grad_norm": 0.2660426199436188, "learning_rate": 6.527648234510327e-06, "loss": 0.1982, "step": 43900 }, { "epoch": 0.5862992524684532, "grad_norm": 4.176153659820557, "learning_rate": 6.660892738174551e-06, "loss": 0.2577, "step": 44000 }, { "epoch": 0.5862992524684532, "eval_dev_accuracy": 0.9705006521067193, "eval_dev_accuracy_threshold": 0.9348860383033752, "eval_dev_average_precision": 0.8544433474182094, "eval_dev_f1": 0.7824561403508773, "eval_dev_f1_threshold": 0.41301047801971436, "eval_dev_precision": 0.759991738097697, "eval_dev_recall": 0.806289032540813, "eval_loss": 0.268686980009079, "eval_runtime": 525.8935, "eval_samples_per_second": 252.232, "eval_steps_per_second": 7.884, "step": 44000 }, { "epoch": 0.5876317507695178, "grad_norm": 2.451788902282715, "learning_rate": 6.794137241838775e-06, "loss": 0.1872, "step": 44100 }, { "epoch": 0.5889642490705824, "grad_norm": 0.2053864449262619, "learning_rate": 6.927381745502999e-06, "loss": 0.2132, "step": 44200 }, { "epoch": 0.5902967473716471, "grad_norm": 2.7442498207092285, "learning_rate": 7.0606262491672225e-06, "loss": 0.1735, "step": 44300 }, { "epoch": 0.5916292456727118, "grad_norm": 14.928565979003906, "learning_rate": 7.193870752831446e-06, "loss": 0.2907, "step": 44400 }, { "epoch": 0.5929617439737764, "grad_norm": 1.0581625699996948, "learning_rate": 7.32711525649567e-06, "loss": 0.2109, "step": 44500 }, { "epoch": 0.5929617439737764, "eval_dev_accuracy": 0.9710132909149849, "eval_dev_accuracy_threshold": 0.9184995889663696, "eval_dev_average_precision": 0.8564900386871592, "eval_dev_f1": 0.7874429836329488, "eval_dev_f1_threshold": 0.42533212900161743, "eval_dev_precision": 0.7716659655027346, "eval_dev_recall": 0.8038786019502575, "eval_loss": 0.2596043348312378, "eval_runtime": 521.1196, "eval_samples_per_second": 254.542, "eval_steps_per_second": 7.956, "step": 44500 }, { "epoch": 0.5942942422748411, "grad_norm": 7.90291166305542, "learning_rate": 7.460359760159894e-06, "loss": 0.2621, "step": 44600 }, { "epoch": 0.5956267405759058, "grad_norm": 27.323461532592773, "learning_rate": 7.593604263824118e-06, "loss": 0.21, "step": 44700 }, { "epoch": 0.5969592388769704, "grad_norm": 0.3570970296859741, "learning_rate": 7.726848767488342e-06, "loss": 0.216, "step": 44800 }, { "epoch": 0.5982917371780351, "grad_norm": 0.6491680145263672, "learning_rate": 7.860093271152565e-06, "loss": 0.2136, "step": 44900 }, { "epoch": 0.5996242354790997, "grad_norm": 20.47812271118164, "learning_rate": 7.99333777481679e-06, "loss": 0.2099, "step": 45000 }, { "epoch": 0.5996242354790997, "eval_dev_accuracy": 0.9701463282245358, "eval_dev_accuracy_threshold": 0.7721706628799438, "eval_dev_average_precision": 0.8515314890810202, "eval_dev_f1": 0.7854063375727528, "eval_dev_f1_threshold": 0.46630430221557617, "eval_dev_precision": 0.7728285077951003, "eval_dev_recall": 0.7984003506080859, "eval_loss": 0.27925005555152893, "eval_runtime": 528.9897, "eval_samples_per_second": 250.755, "eval_steps_per_second": 7.838, "step": 45000 }, { "epoch": 0.6009567337801645, "grad_norm": 0.4902491867542267, "learning_rate": 8.126582278481013e-06, "loss": 0.2536, "step": 45100 }, { "epoch": 0.6022892320812291, "grad_norm": 0.5637998580932617, "learning_rate": 8.259826782145237e-06, "loss": 0.2247, "step": 45200 }, { "epoch": 0.6036217303822937, "grad_norm": 1.9175264835357666, "learning_rate": 8.39307128580946e-06, "loss": 0.2349, "step": 45300 }, { "epoch": 0.6049542286833585, "grad_norm": 76.62299346923828, "learning_rate": 8.526315789473685e-06, "loss": 0.1836, "step": 45400 }, { "epoch": 0.6062867269844231, "grad_norm": 1.5868983268737793, "learning_rate": 8.659560293137908e-06, "loss": 0.2635, "step": 45500 }, { "epoch": 0.6062867269844231, "eval_dev_accuracy": 0.9700709401644968, "eval_dev_accuracy_threshold": 0.9073478579521179, "eval_dev_average_precision": 0.85367208401453, "eval_dev_f1": 0.7835151777033597, "eval_dev_f1_threshold": 0.5480349659919739, "eval_dev_precision": 0.7726643229998935, "eval_dev_recall": 0.7946751396954093, "eval_loss": 0.27641019225120544, "eval_runtime": 535.7653, "eval_samples_per_second": 247.584, "eval_steps_per_second": 7.738, "step": 45500 }, { "epoch": 0.6076192252854877, "grad_norm": 0.3646801710128784, "learning_rate": 8.792804796802133e-06, "loss": 0.2259, "step": 45600 }, { "epoch": 0.6089517235865525, "grad_norm": 0.1534300446510315, "learning_rate": 8.926049300466355e-06, "loss": 0.1824, "step": 45700 }, { "epoch": 0.6102842218876171, "grad_norm": 4.515030384063721, "learning_rate": 9.05929380413058e-06, "loss": 0.2108, "step": 45800 }, { "epoch": 0.6116167201886817, "grad_norm": 27.513139724731445, "learning_rate": 9.192538307794803e-06, "loss": 0.1652, "step": 45900 }, { "epoch": 0.6129492184897464, "grad_norm": 0.3283866345882416, "learning_rate": 9.325782811459028e-06, "loss": 0.2599, "step": 46000 }, { "epoch": 0.6129492184897464, "eval_dev_accuracy": 0.9699880132984537, "eval_dev_accuracy_threshold": 0.9482549428939819, "eval_dev_average_precision": 0.8491188703823201, "eval_dev_f1": 0.7826180027828322, "eval_dev_f1_threshold": 0.9011486768722534, "eval_dev_precision": 0.7649335704571608, "eval_dev_recall": 0.8011394762791717, "eval_loss": 0.2594774067401886, "eval_runtime": 527.4017, "eval_samples_per_second": 251.51, "eval_steps_per_second": 7.861, "step": 46000 }, { "epoch": 0.6142817167908111, "grad_norm": 0.6060785055160522, "learning_rate": 9.459027315123252e-06, "loss": 0.231, "step": 46100 }, { "epoch": 0.6156142150918757, "grad_norm": 1.9709681272506714, "learning_rate": 9.592271818787475e-06, "loss": 0.2364, "step": 46200 }, { "epoch": 0.6169467133929404, "grad_norm": 0.13106560707092285, "learning_rate": 9.7255163224517e-06, "loss": 0.1774, "step": 46300 }, { "epoch": 0.6182792116940051, "grad_norm": 53.972103118896484, "learning_rate": 9.858760826115924e-06, "loss": 0.2322, "step": 46400 }, { "epoch": 0.6196117099950698, "grad_norm": 12.795185089111328, "learning_rate": 9.992005329780147e-06, "loss": 0.2283, "step": 46500 }, { "epoch": 0.6196117099950698, "eval_dev_accuracy": 0.9702518715085905, "eval_dev_accuracy_threshold": 0.8647300004959106, "eval_dev_average_precision": 0.8569022880485853, "eval_dev_f1": 0.7869809918232983, "eval_dev_f1_threshold": 0.43426772952079773, "eval_dev_precision": 0.7634696610693315, "eval_dev_recall": 0.8119864139366714, "eval_loss": 0.26569852232933044, "eval_runtime": 528.8452, "eval_samples_per_second": 250.824, "eval_steps_per_second": 7.84, "step": 46500 }, { "epoch": 0.6209442082961344, "grad_norm": 6.9099507331848145, "learning_rate": 1.012524983344437e-05, "loss": 0.2275, "step": 46600 }, { "epoch": 0.6222767065971991, "grad_norm": 3.897141456604004, "learning_rate": 1.0258494337108595e-05, "loss": 0.1867, "step": 46700 }, { "epoch": 0.6236092048982638, "grad_norm": 1.8539767265319824, "learning_rate": 1.0391738840772818e-05, "loss": 0.276, "step": 46800 }, { "epoch": 0.6249417031993284, "grad_norm": 17.823284149169922, "learning_rate": 1.0524983344437042e-05, "loss": 0.2208, "step": 46900 }, { "epoch": 0.626274201500393, "grad_norm": 0.8377816081047058, "learning_rate": 1.0658227848101265e-05, "loss": 0.2644, "step": 47000 }, { "epoch": 0.626274201500393, "eval_dev_accuracy": 0.9708625147949068, "eval_dev_accuracy_threshold": 0.8370188474655151, "eval_dev_average_precision": 0.8568328618718613, "eval_dev_f1": 0.7867207514944491, "eval_dev_f1_threshold": 0.3532576858997345, "eval_dev_precision": 0.766989280882506, "eval_dev_recall": 0.8074942478360907, "eval_loss": 0.2608221769332886, "eval_runtime": 528.9364, "eval_samples_per_second": 250.781, "eval_steps_per_second": 7.838, "step": 47000 }, { "epoch": 0.6276066998014578, "grad_norm": 23.196794509887695, "learning_rate": 1.079147235176549e-05, "loss": 0.1944, "step": 47100 }, { "epoch": 0.6289391981025224, "grad_norm": 0.2909054458141327, "learning_rate": 1.0924716855429713e-05, "loss": 0.2221, "step": 47200 }, { "epoch": 0.630271696403587, "grad_norm": 15.759045600891113, "learning_rate": 1.1057961359093938e-05, "loss": 0.2392, "step": 47300 }, { "epoch": 0.6316041947046518, "grad_norm": 4.435680866241455, "learning_rate": 1.1191205862758164e-05, "loss": 0.1809, "step": 47400 }, { "epoch": 0.6329366930057164, "grad_norm": 3.936431646347046, "learning_rate": 1.1324450366422385e-05, "loss": 0.1708, "step": 47500 }, { "epoch": 0.6329366930057164, "eval_dev_accuracy": 0.9702594103145944, "eval_dev_accuracy_threshold": 0.9633700847625732, "eval_dev_average_precision": 0.8539832745264263, "eval_dev_f1": 0.7859069988890653, "eval_dev_f1_threshold": 0.7301878929138184, "eval_dev_precision": 0.7598199672667758, "eval_dev_recall": 0.8138490193930098, "eval_loss": 0.2916560173034668, "eval_runtime": 524.8425, "eval_samples_per_second": 252.737, "eval_steps_per_second": 7.9, "step": 47500 }, { "epoch": 0.634269191306781, "grad_norm": 1.574413776397705, "learning_rate": 1.1457694870086611e-05, "loss": 0.2181, "step": 47600 }, { "epoch": 0.6356016896078458, "grad_norm": 4.340725421905518, "learning_rate": 1.1590939373750833e-05, "loss": 0.2258, "step": 47700 }, { "epoch": 0.6369341879089104, "grad_norm": 5.916915416717529, "learning_rate": 1.1724183877415059e-05, "loss": 0.2808, "step": 47800 }, { "epoch": 0.6382666862099751, "grad_norm": 15.759284019470215, "learning_rate": 1.1857428381079282e-05, "loss": 0.2394, "step": 47900 }, { "epoch": 0.6395991845110397, "grad_norm": 14.555028915405273, "learning_rate": 1.1990672884743507e-05, "loss": 0.2267, "step": 48000 }, { "epoch": 0.6395991845110397, "eval_dev_accuracy": 0.9713826924091762, "eval_dev_accuracy_threshold": 0.9341762065887451, "eval_dev_average_precision": 0.8563315677126753, "eval_dev_f1": 0.7862142099681866, "eval_dev_f1_threshold": 0.4216569662094116, "eval_dev_precision": 0.7617384156991678, "eval_dev_recall": 0.8123151090172017, "eval_loss": 0.26165512204170227, "eval_runtime": 525.2884, "eval_samples_per_second": 252.522, "eval_steps_per_second": 7.893, "step": 48000 }, { "epoch": 0.6409316828121044, "grad_norm": 8.00622844696045, "learning_rate": 1.212391738840773e-05, "loss": 0.2583, "step": 48100 }, { "epoch": 0.6422641811131691, "grad_norm": 13.320343017578125, "learning_rate": 1.2257161892071954e-05, "loss": 0.2188, "step": 48200 }, { "epoch": 0.6435966794142337, "grad_norm": 2.9494426250457764, "learning_rate": 1.2390406395736177e-05, "loss": 0.1877, "step": 48300 }, { "epoch": 0.6449291777152985, "grad_norm": 0.39628902077674866, "learning_rate": 1.2523650899400402e-05, "loss": 0.2324, "step": 48400 }, { "epoch": 0.6462616760163631, "grad_norm": 0.1506374627351761, "learning_rate": 1.2656895403064625e-05, "loss": 0.2239, "step": 48500 }, { "epoch": 0.6462616760163631, "eval_dev_accuracy": 0.9706514282267974, "eval_dev_accuracy_threshold": 0.8615503311157227, "eval_dev_average_precision": 0.8586570982605375, "eval_dev_f1": 0.7870691958322201, "eval_dev_f1_threshold": 0.24849581718444824, "eval_dev_precision": 0.7681476846057572, "eval_dev_recall": 0.8069464227018736, "eval_loss": 0.28418707847595215, "eval_runtime": 533.0754, "eval_samples_per_second": 248.833, "eval_steps_per_second": 7.778, "step": 48500 }, { "epoch": 0.6475941743174277, "grad_norm": 0.48906368017196655, "learning_rate": 1.279013990672885e-05, "loss": 0.22, "step": 48600 }, { "epoch": 0.6489266726184925, "grad_norm": 71.81077575683594, "learning_rate": 1.2923384410393072e-05, "loss": 0.2079, "step": 48700 }, { "epoch": 0.6502591709195571, "grad_norm": 17.413375854492188, "learning_rate": 1.3056628914057297e-05, "loss": 0.2212, "step": 48800 }, { "epoch": 0.6515916692206217, "grad_norm": 0.7448732852935791, "learning_rate": 1.318987341772152e-05, "loss": 0.2106, "step": 48900 }, { "epoch": 0.6529241675216864, "grad_norm": 0.6357948780059814, "learning_rate": 1.3323117921385744e-05, "loss": 0.2095, "step": 49000 }, { "epoch": 0.6529241675216864, "eval_dev_accuracy": 0.971164067035063, "eval_dev_accuracy_threshold": 0.925714373588562, "eval_dev_average_precision": 0.8570638757463108, "eval_dev_f1": 0.7913554743365645, "eval_dev_f1_threshold": 0.5317444801330566, "eval_dev_precision": 0.7659967186218212, "eval_dev_recall": 0.8184507505204339, "eval_loss": 0.2687513828277588, "eval_runtime": 529.1402, "eval_samples_per_second": 250.684, "eval_steps_per_second": 7.835, "step": 49000 }, { "epoch": 0.6542566658227511, "grad_norm": 12.15365982055664, "learning_rate": 1.3456362425049967e-05, "loss": 0.2359, "step": 49100 }, { "epoch": 0.6555891641238157, "grad_norm": 12.457159996032715, "learning_rate": 1.3589606928714192e-05, "loss": 0.2392, "step": 49200 }, { "epoch": 0.6569216624248804, "grad_norm": 0.6378312110900879, "learning_rate": 1.3722851432378415e-05, "loss": 0.2185, "step": 49300 }, { "epoch": 0.6582541607259451, "grad_norm": 10.198519706726074, "learning_rate": 1.385609593604264e-05, "loss": 0.2497, "step": 49400 }, { "epoch": 0.6595866590270097, "grad_norm": 0.6230494976043701, "learning_rate": 1.3989340439706862e-05, "loss": 0.2357, "step": 49500 }, { "epoch": 0.6595866590270097, "eval_dev_accuracy": 0.9700030909104616, "eval_dev_accuracy_threshold": 0.5345156192779541, "eval_dev_average_precision": 0.8443688741553218, "eval_dev_f1": 0.785516801361123, "eval_dev_f1_threshold": 0.39208123087882996, "eval_dev_precision": 0.7630410081603141, "eval_dev_recall": 0.809356853292429, "eval_loss": 0.270622581243515, "eval_runtime": 527.7067, "eval_samples_per_second": 251.365, "eval_steps_per_second": 7.857, "step": 49500 }, { "epoch": 0.6609191573280744, "grad_norm": 6.028562068939209, "learning_rate": 1.4122584943371087e-05, "loss": 0.2147, "step": 49600 }, { "epoch": 0.6622516556291391, "grad_norm": 7.488621711730957, "learning_rate": 1.4255829447035312e-05, "loss": 0.2252, "step": 49700 }, { "epoch": 0.6635841539302038, "grad_norm": 3.221320152282715, "learning_rate": 1.4389073950699535e-05, "loss": 0.2296, "step": 49800 }, { "epoch": 0.6649166522312684, "grad_norm": 33.004817962646484, "learning_rate": 1.452231845436376e-05, "loss": 0.2434, "step": 49900 }, { "epoch": 0.666249150532333, "grad_norm": 6.759824752807617, "learning_rate": 1.4655562958027982e-05, "loss": 0.2449, "step": 50000 }, { "epoch": 0.666249150532333, "eval_dev_accuracy": 0.9705534237487466, "eval_dev_accuracy_threshold": 0.9030373096466064, "eval_dev_average_precision": 0.8517374123261313, "eval_dev_f1": 0.7881202847731378, "eval_dev_f1_threshold": 0.5092203617095947, "eval_dev_precision": 0.7650335224342445, "eval_dev_recall": 0.812643804097732, "eval_loss": 0.24229487776756287, "eval_runtime": 528.3673, "eval_samples_per_second": 251.051, "eval_steps_per_second": 7.847, "step": 50000 }, { "epoch": 0.6675816488333978, "grad_norm": 0.4978267252445221, "learning_rate": 1.4788807461692207e-05, "loss": 0.3087, "step": 50100 }, { "epoch": 0.6689141471344624, "grad_norm": 17.420612335205078, "learning_rate": 1.492205196535643e-05, "loss": 0.2188, "step": 50200 }, { "epoch": 0.670246645435527, "grad_norm": 0.26254966855049133, "learning_rate": 1.5055296469020654e-05, "loss": 0.2214, "step": 50300 }, { "epoch": 0.6715791437365918, "grad_norm": 16.93143653869629, "learning_rate": 1.5188540972684877e-05, "loss": 0.2141, "step": 50400 }, { "epoch": 0.6729116420376564, "grad_norm": 5.481032848358154, "learning_rate": 1.5321785476349102e-05, "loss": 0.2534, "step": 50500 }, { "epoch": 0.6729116420376564, "eval_dev_accuracy": 0.9701538670305397, "eval_dev_accuracy_threshold": 0.9412756562232971, "eval_dev_average_precision": 0.8418413944064206, "eval_dev_f1": 0.78390731292517, "eval_dev_f1_threshold": 0.8259508013725281, "eval_dev_precision": 0.7611724636185365, "eval_dev_recall": 0.8080420729703078, "eval_loss": 0.28124794363975525, "eval_runtime": 528.2314, "eval_samples_per_second": 251.115, "eval_steps_per_second": 7.849, "step": 50500 }, { "epoch": 0.674244140338721, "grad_norm": 0.13247288763523102, "learning_rate": 1.319120586275816e-07, "loss": 0.2242, "step": 50600 }, { "epoch": 0.6755766386397858, "grad_norm": 50.61308670043945, "learning_rate": 2.651565622918055e-07, "loss": 0.199, "step": 50700 }, { "epoch": 0.6769091369408504, "grad_norm": 9.46574592590332, "learning_rate": 3.984010659560293e-07, "loss": 0.2019, "step": 50800 }, { "epoch": 0.678241635241915, "grad_norm": 0.4613121449947357, "learning_rate": 5.316455696202532e-07, "loss": 0.2324, "step": 50900 }, { "epoch": 0.6795741335429797, "grad_norm": 0.06632626801729202, "learning_rate": 6.64890073284477e-07, "loss": 0.2095, "step": 51000 }, { "epoch": 0.6795741335429797, "eval_dev_accuracy": 0.9702820267326061, "eval_dev_accuracy_threshold": 0.9349472522735596, "eval_dev_average_precision": 0.8438139930773977, "eval_dev_f1": 0.7839174599797903, "eval_dev_f1_threshold": 0.7425632476806641, "eval_dev_precision": 0.7616783794956593, "eval_dev_recall": 0.8074942478360907, "eval_loss": 0.2824593782424927, "eval_runtime": 534.9937, "eval_samples_per_second": 247.941, "eval_steps_per_second": 7.75, "step": 51000 }, { "epoch": 0.6809066318440444, "grad_norm": 0.5744990706443787, "learning_rate": 7.981345769487009e-07, "loss": 0.2757, "step": 51100 }, { "epoch": 0.6822391301451091, "grad_norm": 44.8016471862793, "learning_rate": 9.313790806129248e-07, "loss": 0.2954, "step": 51200 }, { "epoch": 0.6835716284461737, "grad_norm": 18.677654266357422, "learning_rate": 1.0646235842771487e-06, "loss": 0.2051, "step": 51300 }, { "epoch": 0.6849041267472384, "grad_norm": 7.698785305023193, "learning_rate": 1.1978680879413725e-06, "loss": 0.2575, "step": 51400 }, { "epoch": 0.6862366250483031, "grad_norm": 1.6236628293991089, "learning_rate": 1.3311125916055965e-06, "loss": 0.1763, "step": 51500 }, { "epoch": 0.6862366250483031, "eval_dev_accuracy": 0.9702669491205983, "eval_dev_accuracy_threshold": 0.9349033832550049, "eval_dev_average_precision": 0.8468881842158165, "eval_dev_f1": 0.783245178180264, "eval_dev_f1_threshold": 0.763167142868042, "eval_dev_precision": 0.7643378519290928, "eval_dev_recall": 0.8031116467623535, "eval_loss": 0.2643745541572571, "eval_runtime": 526.1955, "eval_samples_per_second": 252.087, "eval_steps_per_second": 7.879, "step": 51500 }, { "epoch": 0.6875691233493677, "grad_norm": 28.033424377441406, "learning_rate": 1.4643570952698202e-06, "loss": 0.2108, "step": 51600 }, { "epoch": 0.6889016216504324, "grad_norm": 19.735244750976562, "learning_rate": 1.597601598934044e-06, "loss": 0.2313, "step": 51700 }, { "epoch": 0.6902341199514971, "grad_norm": 2.9967164993286133, "learning_rate": 1.7308461025982678e-06, "loss": 0.2344, "step": 51800 }, { "epoch": 0.6915666182525617, "grad_norm": 1.428648591041565, "learning_rate": 1.864090606262492e-06, "loss": 0.1968, "step": 51900 }, { "epoch": 0.6928991165536263, "grad_norm": 0.3774360418319702, "learning_rate": 1.9973351099267156e-06, "loss": 0.2222, "step": 52000 }, { "epoch": 0.6928991165536263, "eval_dev_accuracy": 0.9705609625547506, "eval_dev_accuracy_threshold": 0.9212765693664551, "eval_dev_average_precision": 0.8504652727472383, "eval_dev_f1": 0.786851950828434, "eval_dev_f1_threshold": 0.6886965036392212, "eval_dev_precision": 0.7681310654283627, "eval_dev_recall": 0.8065081625944999, "eval_loss": 0.26056790351867676, "eval_runtime": 524.9198, "eval_samples_per_second": 252.7, "eval_steps_per_second": 7.898, "step": 52000 }, { "epoch": 0.6942316148546911, "grad_norm": 5.1444525718688965, "learning_rate": 2.1305796135909398e-06, "loss": 0.2213, "step": 52100 }, { "epoch": 0.6955641131557557, "grad_norm": 0.18948954343795776, "learning_rate": 2.2638241172551636e-06, "loss": 0.2055, "step": 52200 }, { "epoch": 0.6968966114568204, "grad_norm": 13.482624053955078, "learning_rate": 2.3970686209193873e-06, "loss": 0.2321, "step": 52300 }, { "epoch": 0.6982291097578851, "grad_norm": 0.6994342803955078, "learning_rate": 2.530313124583611e-06, "loss": 0.257, "step": 52400 }, { "epoch": 0.6995616080589497, "grad_norm": 0.9283449053764343, "learning_rate": 2.663557628247835e-06, "loss": 0.2398, "step": 52500 }, { "epoch": 0.6995616080589497, "eval_dev_accuracy": 0.9711112953930356, "eval_dev_accuracy_threshold": 0.9353954195976257, "eval_dev_average_precision": 0.854664598144776, "eval_dev_f1": 0.789044289044289, "eval_dev_f1_threshold": 0.7551745176315308, "eval_dev_precision": 0.7638732177659248, "eval_dev_recall": 0.8159307549030349, "eval_loss": 0.2365955263376236, "eval_runtime": 526.0912, "eval_samples_per_second": 252.137, "eval_steps_per_second": 7.881, "step": 52500 }, { "epoch": 0.7008941063600144, "grad_norm": 69.95816040039062, "learning_rate": 2.7968021319120587e-06, "loss": 0.2168, "step": 52600 }, { "epoch": 0.7022266046610791, "grad_norm": 13.763835906982422, "learning_rate": 2.930046635576283e-06, "loss": 0.2066, "step": 52700 }, { "epoch": 0.7035591029621437, "grad_norm": 2.3356781005859375, "learning_rate": 3.0632911392405066e-06, "loss": 0.222, "step": 52800 }, { "epoch": 0.7048916012632084, "grad_norm": 4.479837417602539, "learning_rate": 3.1965356429047304e-06, "loss": 0.269, "step": 52900 }, { "epoch": 0.706224099564273, "grad_norm": 15.155440330505371, "learning_rate": 3.3297801465689546e-06, "loss": 0.2327, "step": 53000 }, { "epoch": 0.706224099564273, "eval_dev_accuracy": 0.971005752108981, "eval_dev_accuracy_threshold": 0.9340351819992065, "eval_dev_average_precision": 0.8546100599663748, "eval_dev_f1": 0.7908306421726932, "eval_dev_f1_threshold": 0.7827771306037903, "eval_dev_precision": 0.7651096086867445, "eval_dev_recall": 0.8183411854935905, "eval_loss": 0.24670535326004028, "eval_runtime": 524.1368, "eval_samples_per_second": 253.077, "eval_steps_per_second": 7.91, "step": 53000 }, { "epoch": 0.7075565978653378, "grad_norm": 30.88198471069336, "learning_rate": 3.4630246502331784e-06, "loss": 0.2274, "step": 53100 }, { "epoch": 0.7088890961664024, "grad_norm": 19.670501708984375, "learning_rate": 3.596269153897402e-06, "loss": 0.1619, "step": 53200 }, { "epoch": 0.710221594467467, "grad_norm": 1.817409873008728, "learning_rate": 3.729513657561626e-06, "loss": 0.2105, "step": 53300 }, { "epoch": 0.7115540927685318, "grad_norm": 7.859726428985596, "learning_rate": 3.862758161225849e-06, "loss": 0.2314, "step": 53400 }, { "epoch": 0.7128865910695964, "grad_norm": 1.2846513986587524, "learning_rate": 3.996002664890073e-06, "loss": 0.2118, "step": 53500 }, { "epoch": 0.7128865910695964, "eval_dev_accuracy": 0.9711263730050435, "eval_dev_accuracy_threshold": 0.956214189529419, "eval_dev_average_precision": 0.8529387869562187, "eval_dev_f1": 0.7885323513940031, "eval_dev_f1_threshold": 0.7215464115142822, "eval_dev_precision": 0.7583729636749975, "eval_dev_recall": 0.8211898761915196, "eval_loss": 0.251621812582016, "eval_runtime": 524.6134, "eval_samples_per_second": 252.847, "eval_steps_per_second": 7.903, "step": 53500 }, { "epoch": 0.714219089370661, "grad_norm": 29.144947052001953, "learning_rate": 4.129247168554298e-06, "loss": 0.223, "step": 53600 }, { "epoch": 0.7155515876717257, "grad_norm": 1.1121717691421509, "learning_rate": 4.2624916722185215e-06, "loss": 0.2177, "step": 53700 }, { "epoch": 0.7168840859727904, "grad_norm": 20.09768295288086, "learning_rate": 4.395736175882745e-06, "loss": 0.2092, "step": 53800 }, { "epoch": 0.718216584273855, "grad_norm": 0.34697094559669495, "learning_rate": 4.528980679546969e-06, "loss": 0.2112, "step": 53900 }, { "epoch": 0.7195490825749197, "grad_norm": 27.53289222717285, "learning_rate": 4.662225183211193e-06, "loss": 0.2188, "step": 54000 }, { "epoch": 0.7195490825749197, "eval_dev_accuracy": 0.9717445550973637, "eval_dev_accuracy_threshold": 0.9209288358688354, "eval_dev_average_precision": 0.8572864419695019, "eval_dev_f1": 0.7925902130849127, "eval_dev_f1_threshold": 0.5230389833450317, "eval_dev_precision": 0.7749973824730395, "eval_dev_recall": 0.8110003286950805, "eval_loss": 0.2652234435081482, "eval_runtime": 524.205, "eval_samples_per_second": 253.044, "eval_steps_per_second": 7.909, "step": 54000 }, { "epoch": 0.7208815808759844, "grad_norm": 0.12331326305866241, "learning_rate": 4.795469686875417e-06, "loss": 0.1995, "step": 54100 }, { "epoch": 0.722214079177049, "grad_norm": 26.130399703979492, "learning_rate": 4.92871419053964e-06, "loss": 0.1863, "step": 54200 }, { "epoch": 0.7235465774781137, "grad_norm": 63.348262786865234, "learning_rate": 5.061958694203864e-06, "loss": 0.1885, "step": 54300 }, { "epoch": 0.7248790757791784, "grad_norm": 4.434421539306641, "learning_rate": 5.195203197868088e-06, "loss": 0.2059, "step": 54400 }, { "epoch": 0.7262115740802431, "grad_norm": 1.5990498065948486, "learning_rate": 5.328447701532313e-06, "loss": 0.1944, "step": 54500 }, { "epoch": 0.7262115740802431, "eval_dev_accuracy": 0.9710962177810278, "eval_dev_accuracy_threshold": 0.938183069229126, "eval_dev_average_precision": 0.8581458729833185, "eval_dev_f1": 0.79388743943347, "eval_dev_f1_threshold": 0.692324697971344, "eval_dev_precision": 0.7722187694220013, "eval_dev_recall": 0.8168072751177824, "eval_loss": 0.24563372135162354, "eval_runtime": 524.5271, "eval_samples_per_second": 252.889, "eval_steps_per_second": 7.904, "step": 54500 }, { "epoch": 0.7275440723813077, "grad_norm": 13.777716636657715, "learning_rate": 5.461692205196536e-06, "loss": 0.2015, "step": 54600 }, { "epoch": 0.7288765706823723, "grad_norm": 0.40915578603744507, "learning_rate": 5.59493670886076e-06, "loss": 0.1804, "step": 54700 }, { "epoch": 0.7302090689834371, "grad_norm": 2.3663179874420166, "learning_rate": 5.728181212524984e-06, "loss": 0.2424, "step": 54800 }, { "epoch": 0.7315415672845017, "grad_norm": 19.617507934570312, "learning_rate": 5.861425716189208e-06, "loss": 0.2331, "step": 54900 }, { "epoch": 0.7328740655855663, "grad_norm": 1.4067281484603882, "learning_rate": 5.9946702198534315e-06, "loss": 0.197, "step": 55000 }, { "epoch": 0.7328740655855663, "eval_dev_accuracy": 0.9715711625592739, "eval_dev_accuracy_threshold": 0.9351357221603394, "eval_dev_average_precision": 0.8584440513483999, "eval_dev_f1": 0.7944548676255994, "eval_dev_f1_threshold": 0.3239399194717407, "eval_dev_precision": 0.7575787695060133, "eval_dev_recall": 0.8351046346006354, "eval_loss": 0.271222859621048, "eval_runtime": 526.0703, "eval_samples_per_second": 252.147, "eval_steps_per_second": 7.881, "step": 55000 }, { "epoch": 0.7342065638866311, "grad_norm": 0.45710641145706177, "learning_rate": 6.127914723517655e-06, "loss": 0.2503, "step": 55100 }, { "epoch": 0.7355390621876957, "grad_norm": 0.6267761588096619, "learning_rate": 6.261159227181879e-06, "loss": 0.2421, "step": 55200 }, { "epoch": 0.7368715604887603, "grad_norm": 11.160945892333984, "learning_rate": 6.394403730846103e-06, "loss": 0.2169, "step": 55300 }, { "epoch": 0.7382040587898251, "grad_norm": 0.22500374913215637, "learning_rate": 6.527648234510327e-06, "loss": 0.1801, "step": 55400 }, { "epoch": 0.7395365570908897, "grad_norm": 0.34952008724212646, "learning_rate": 6.660892738174551e-06, "loss": 0.2168, "step": 55500 }, { "epoch": 0.7395365570908897, "eval_dev_accuracy": 0.9718576371874222, "eval_dev_accuracy_threshold": 0.9311728477478027, "eval_dev_average_precision": 0.8606955219787713, "eval_dev_f1": 0.7966432680635458, "eval_dev_f1_threshold": 0.3317277133464813, "eval_dev_precision": 0.7685336048879837, "eval_dev_recall": 0.8268872575873781, "eval_loss": 0.24974019825458527, "eval_runtime": 524.3487, "eval_samples_per_second": 252.975, "eval_steps_per_second": 7.907, "step": 55500 }, { "epoch": 0.7408690553919544, "grad_norm": 13.866408348083496, "learning_rate": 6.794137241838775e-06, "loss": 0.2266, "step": 55600 }, { "epoch": 0.742201553693019, "grad_norm": 9.584277153015137, "learning_rate": 6.927381745502999e-06, "loss": 0.1882, "step": 55700 }, { "epoch": 0.7435340519940837, "grad_norm": 52.4222297668457, "learning_rate": 7.0606262491672225e-06, "loss": 0.2214, "step": 55800 }, { "epoch": 0.7448665502951484, "grad_norm": 15.216498374938965, "learning_rate": 7.193870752831446e-06, "loss": 0.23, "step": 55900 }, { "epoch": 0.746199048596213, "grad_norm": 21.095590591430664, "learning_rate": 7.32711525649567e-06, "loss": 0.2355, "step": 56000 }, { "epoch": 0.746199048596213, "eval_dev_accuracy": 0.9719631804714769, "eval_dev_accuracy_threshold": 0.9183558821678162, "eval_dev_average_precision": 0.8589405860687593, "eval_dev_f1": 0.7974690109434157, "eval_dev_f1_threshold": 0.33763912320137024, "eval_dev_precision": 0.7571400433326768, "eval_dev_recall": 0.8423359263723019, "eval_loss": 0.24558140337467194, "eval_runtime": 523.2596, "eval_samples_per_second": 253.501, "eval_steps_per_second": 7.923, "step": 56000 }, { "epoch": 0.7475315468972777, "grad_norm": 87.9457778930664, "learning_rate": 7.460359760159894e-06, "loss": 0.2105, "step": 56100 }, { "epoch": 0.7488640451983424, "grad_norm": 1.1765731573104858, "learning_rate": 7.593604263824118e-06, "loss": 0.1608, "step": 56200 }, { "epoch": 0.750196543499407, "grad_norm": 12.082050323486328, "learning_rate": 7.726848767488342e-06, "loss": 0.214, "step": 56300 }, { "epoch": 0.7515290418004718, "grad_norm": 17.673494338989258, "learning_rate": 7.860093271152565e-06, "loss": 0.2531, "step": 56400 }, { "epoch": 0.7528615401015364, "grad_norm": 4.850943565368652, "learning_rate": 7.99333777481679e-06, "loss": 0.2641, "step": 56500 }, { "epoch": 0.7528615401015364, "eval_dev_accuracy": 0.9718953312174418, "eval_dev_accuracy_threshold": 0.9289690852165222, "eval_dev_average_precision": 0.8607199959963239, "eval_dev_f1": 0.7934619562406249, "eval_dev_f1_threshold": 0.2598855793476105, "eval_dev_precision": 0.7515187144816774, "eval_dev_recall": 0.8403637558891202, "eval_loss": 0.24914328753948212, "eval_runtime": 526.4308, "eval_samples_per_second": 251.974, "eval_steps_per_second": 7.876, "step": 56500 }, { "epoch": 0.754194038402601, "grad_norm": 21.872079849243164, "learning_rate": 8.126582278481013e-06, "loss": 0.2002, "step": 56600 }, { "epoch": 0.7555265367036657, "grad_norm": 0.3463062345981598, "learning_rate": 8.259826782145237e-06, "loss": 0.1727, "step": 56700 }, { "epoch": 0.7568590350047304, "grad_norm": 4.641270637512207, "learning_rate": 8.39307128580946e-06, "loss": 0.2135, "step": 56800 }, { "epoch": 0.758191533305795, "grad_norm": 1.456807017326355, "learning_rate": 8.526315789473685e-06, "loss": 0.1694, "step": 56900 }, { "epoch": 0.7595240316068597, "grad_norm": 0.2848343551158905, "learning_rate": 8.659560293137908e-06, "loss": 0.1969, "step": 57000 }, { "epoch": 0.7595240316068597, "eval_dev_accuracy": 0.9716917834553364, "eval_dev_accuracy_threshold": 0.9249356389045715, "eval_dev_average_precision": 0.8628574223791167, "eval_dev_f1": 0.7945488333677474, "eval_dev_f1_threshold": 0.2702260911464691, "eval_dev_precision": 0.7511957052220596, "eval_dev_recall": 0.8432124465870494, "eval_loss": 0.2667163014411926, "eval_runtime": 523.0471, "eval_samples_per_second": 253.604, "eval_steps_per_second": 7.927, "step": 57000 }, { "epoch": 1.521692783285364, "grad_norm": 680.6102294921875, "learning_rate": 1.1723219044235212e-05, "loss": 0.1989, "step": 57100 }, { "epoch": 1.524357744376932, "grad_norm": 555.5462036132812, "learning_rate": 1.1900870492094512e-05, "loss": 0.1823, "step": 57200 }, { "epoch": 1.5270227054685002, "grad_norm": 19347.361328125, "learning_rate": 1.207852193995381e-05, "loss": 0.2099, "step": 57300 }, { "epoch": 1.5296876665600683, "grad_norm": 28487.04296875, "learning_rate": 1.225617338781311e-05, "loss": 0.2007, "step": 57400 }, { "epoch": 1.5323526276516364, "grad_norm": 33787.03515625, "learning_rate": 1.2433824835672413e-05, "loss": 0.1893, "step": 57500 }, { "epoch": 1.5323526276516364, "eval_dev_accuracy": 0.9712469939011059, "eval_dev_accuracy_threshold": 0.930076539516449, "eval_dev_average_precision": 0.8589126571915907, "eval_dev_f1": 0.788643194504079, "eval_dev_f1_threshold": 0.8417924642562866, "eval_dev_precision": 0.7729615991583377, "eval_dev_recall": 0.8049742522186918, "eval_loss": 0.22310471534729004, "eval_runtime": 911.6835, "eval_samples_per_second": 145.497, "eval_steps_per_second": 2.274, "step": 57500 }, { "epoch": 1.5350175887432043, "grad_norm": 10426.8994140625, "learning_rate": 1.2611476283531711e-05, "loss": 0.1941, "step": 57600 }, { "epoch": 1.5376825498347724, "grad_norm": 20932.927734375, "learning_rate": 1.2789127731391012e-05, "loss": 0.1917, "step": 57700 }, { "epoch": 1.5403475109263405, "grad_norm": 19958.53125, "learning_rate": 1.2966779179250314e-05, "loss": 0.1704, "step": 57800 }, { "epoch": 1.5430124720179086, "grad_norm": 4519.30517578125, "learning_rate": 1.3144430627109612e-05, "loss": 0.1769, "step": 57900 }, { "epoch": 1.5456774331094767, "grad_norm": 1185.6409912109375, "learning_rate": 1.3322082074968912e-05, "loss": 0.1917, "step": 58000 }, { "epoch": 1.5456774331094767, "eval_dev_accuracy": 0.971314843155141, "eval_dev_accuracy_threshold": 0.9302895069122314, "eval_dev_average_precision": 0.8581921137101376, "eval_dev_f1": 0.7902556259558663, "eval_dev_f1_threshold": 0.9142668843269348, "eval_dev_precision": 0.7879315978651563, "eval_dev_recall": 0.792593404185384, "eval_loss": 0.21683622896671295, "eval_runtime": 910.3929, "eval_samples_per_second": 145.703, "eval_steps_per_second": 2.277, "step": 58000 }, { "epoch": 1.5483423942010446, "grad_norm": 10156.921875, "learning_rate": 1.3499733522828211e-05, "loss": 0.156, "step": 58100 }, { "epoch": 1.5510073552926127, "grad_norm": 20830.22265625, "learning_rate": 1.3677384970687513e-05, "loss": 0.1882, "step": 58200 }, { "epoch": 1.5536723163841808, "grad_norm": 10158.1328125, "learning_rate": 1.3855036418546812e-05, "loss": 0.1914, "step": 58300 }, { "epoch": 1.556337277475749, "grad_norm": 12550.0205078125, "learning_rate": 1.4032687866406112e-05, "loss": 0.1859, "step": 58400 }, { "epoch": 1.559002238567317, "grad_norm": 25116.525390625, "learning_rate": 1.4210339314265414e-05, "loss": 0.1915, "step": 58500 }, { "epoch": 1.559002238567317, "eval_dev_accuracy": 0.9707343550928405, "eval_dev_accuracy_threshold": 0.9600124359130859, "eval_dev_average_precision": 0.8552104699335599, "eval_dev_f1": 0.788252996419862, "eval_dev_f1_threshold": 0.6280207633972168, "eval_dev_precision": 0.7486694263749261, "eval_dev_recall": 0.8322559439027063, "eval_loss": 0.22474558651447296, "eval_runtime": 912.547, "eval_samples_per_second": 145.359, "eval_steps_per_second": 2.272, "step": 58500 }, { "epoch": 1.561667199658885, "grad_norm": 1747.8248291015625, "learning_rate": 1.4387990762124712e-05, "loss": 0.1658, "step": 58600 }, { "epoch": 1.564332160750453, "grad_norm": 10528.990234375, "learning_rate": 1.4565642209984013e-05, "loss": 0.1877, "step": 58700 }, { "epoch": 1.5669971218420211, "grad_norm": 14108.591796875, "learning_rate": 1.4743293657843311e-05, "loss": 0.1972, "step": 58800 }, { "epoch": 1.5696620829335892, "grad_norm": 33609.73828125, "learning_rate": 1.4920945105702613e-05, "loss": 0.1915, "step": 58900 }, { "epoch": 1.5723270440251573, "grad_norm": 14393.123046875, "learning_rate": 1.5098596553561913e-05, "loss": 0.1982, "step": 59000 }, { "epoch": 1.5723270440251573, "eval_dev_accuracy": 0.9714354640512036, "eval_dev_accuracy_threshold": 0.861323356628418, "eval_dev_average_precision": 0.8617997355004788, "eval_dev_f1": 0.792690745885873, "eval_dev_f1_threshold": 0.5087981224060059, "eval_dev_precision": 0.7735947439774742, "eval_dev_recall": 0.8127533691245754, "eval_loss": 0.27619487047195435, "eval_runtime": 912.722, "eval_samples_per_second": 145.331, "eval_steps_per_second": 2.271, "step": 59000 }, { "epoch": 1.5749920051167252, "grad_norm": 2650.031982421875, "learning_rate": 1.5276248001421212e-05, "loss": 0.1977, "step": 59100 }, { "epoch": 1.5776569662082933, "grad_norm": 21126.404296875, "learning_rate": 1.5453899449280514e-05, "loss": 0.1646, "step": 59200 }, { "epoch": 1.5803219272998614, "grad_norm": 1604.2296142578125, "learning_rate": 1.5631550897139813e-05, "loss": 0.1855, "step": 59300 }, { "epoch": 1.5829868883914295, "grad_norm": 9624.1689453125, "learning_rate": 1.580920234499911e-05, "loss": 0.1809, "step": 59400 }, { "epoch": 1.5856518494829976, "grad_norm": 4949.5078125, "learning_rate": 1.5986853792858413e-05, "loss": 0.185, "step": 59500 }, { "epoch": 1.5856518494829976, "eval_dev_accuracy": 0.9717068610673442, "eval_dev_accuracy_threshold": 0.9281443357467651, "eval_dev_average_precision": 0.8651298435899648, "eval_dev_f1": 0.7949938492806332, "eval_dev_f1_threshold": 0.7204960584640503, "eval_dev_precision": 0.7765935214211076, "eval_dev_recall": 0.8142872795003835, "eval_loss": 0.23017099499702454, "eval_runtime": 912.0946, "eval_samples_per_second": 145.431, "eval_steps_per_second": 2.273, "step": 59500 }, { "epoch": 1.5883168105745655, "grad_norm": 4366.28125, "learning_rate": 1.6164505240717715e-05, "loss": 0.1524, "step": 59600 }, { "epoch": 1.5909817716661336, "grad_norm": 6088.126953125, "learning_rate": 1.6342156688577014e-05, "loss": 0.1626, "step": 59700 }, { "epoch": 1.5936467327577017, "grad_norm": 41741.02734375, "learning_rate": 1.6519808136436312e-05, "loss": 0.1855, "step": 59800 }, { "epoch": 1.5963116938492699, "grad_norm": 6351.677734375, "learning_rate": 1.6697459584295614e-05, "loss": 0.1777, "step": 59900 }, { "epoch": 1.598976654940838, "grad_norm": 667.612548828125, "learning_rate": 1.6875111032154913e-05, "loss": 0.1519, "step": 60000 }, { "epoch": 1.598976654940838, "eval_dev_accuracy": 0.9702217162845749, "eval_dev_accuracy_threshold": 0.9527369737625122, "eval_dev_average_precision": 0.8599004250878434, "eval_dev_f1": 0.7857490403849272, "eval_dev_f1_threshold": 0.9123563170433044, "eval_dev_precision": 0.7755602988260406, "eval_dev_recall": 0.7962090500712172, "eval_loss": 0.23386961221694946, "eval_runtime": 912.5307, "eval_samples_per_second": 145.362, "eval_steps_per_second": 2.272, "step": 60000 }, { "epoch": 1.6016416160324058, "grad_norm": 74362.328125, "learning_rate": 1.7052762480014215e-05, "loss": 0.1705, "step": 60100 }, { "epoch": 1.604306577123974, "grad_norm": 41024.45703125, "learning_rate": 1.7230413927873513e-05, "loss": 0.1868, "step": 60200 }, { "epoch": 1.606971538215542, "grad_norm": 10907.779296875, "learning_rate": 1.7408065375732815e-05, "loss": 0.1801, "step": 60300 }, { "epoch": 1.6096364993071102, "grad_norm": 17233.494140625, "learning_rate": 1.7585716823592114e-05, "loss": 0.1672, "step": 60400 }, { "epoch": 1.6123014603986783, "grad_norm": 6108.4228515625, "learning_rate": 1.7763368271451412e-05, "loss": 0.1619, "step": 60500 }, { "epoch": 1.6123014603986783, "eval_dev_accuracy": 0.9701990998665632, "eval_dev_accuracy_threshold": 0.9653939604759216, "eval_dev_average_precision": 0.8583701139769879, "eval_dev_f1": 0.7852786105654916, "eval_dev_f1_threshold": 0.4483921527862549, "eval_dev_precision": 0.7433212643115765, "eval_dev_recall": 0.8322559439027063, "eval_loss": 0.2841331958770752, "eval_runtime": 912.3726, "eval_samples_per_second": 145.387, "eval_steps_per_second": 2.272, "step": 60500 }, { "epoch": 1.6149664214902462, "grad_norm": 753.2778930664062, "learning_rate": 1.7941019719310714e-05, "loss": 0.1775, "step": 60600 }, { "epoch": 1.6176313825818143, "grad_norm": 7861.2724609375, "learning_rate": 1.8118671167170013e-05, "loss": 0.1539, "step": 60700 }, { "epoch": 1.6202963436733824, "grad_norm": 4606.5625, "learning_rate": 1.8296322615029315e-05, "loss": 0.1984, "step": 60800 }, { "epoch": 1.6229613047649505, "grad_norm": 3256.729248046875, "learning_rate": 1.8473974062888614e-05, "loss": 0.1936, "step": 60900 }, { "epoch": 1.6256262658565186, "grad_norm": 16788.51953125, "learning_rate": 1.8651625510747916e-05, "loss": 0.1928, "step": 61000 }, { "epoch": 1.6256262658565186, "eval_dev_accuracy": 0.9702820267326061, "eval_dev_accuracy_threshold": 0.9588229656219482, "eval_dev_average_precision": 0.8578683942622316, "eval_dev_f1": 0.78329335697153, "eval_dev_f1_threshold": 0.8013461232185364, "eval_dev_precision": 0.7472888269823899, "eval_dev_recall": 0.8229429166210146, "eval_loss": 0.21942387521266937, "eval_runtime": 911.7434, "eval_samples_per_second": 145.487, "eval_steps_per_second": 2.274, "step": 61000 }, { "epoch": 1.6282912269480865, "grad_norm": 1664.8751220703125, "learning_rate": 1.8829276958607214e-05, "loss": 0.166, "step": 61100 }, { "epoch": 1.6309561880396546, "grad_norm": 21448.6796875, "learning_rate": 1.9006928406466513e-05, "loss": 0.1774, "step": 61200 }, { "epoch": 1.6336211491312227, "grad_norm": 18060.765625, "learning_rate": 1.9184579854325815e-05, "loss": 0.1319, "step": 61300 }, { "epoch": 1.6362861102227908, "grad_norm": 7385.87353515625, "learning_rate": 1.9362231302185113e-05, "loss": 0.1971, "step": 61400 }, { "epoch": 1.638951071314359, "grad_norm": 5024.80078125, "learning_rate": 1.9539882750044415e-05, "loss": 0.1728, "step": 61500 }, { "epoch": 1.638951071314359, "eval_dev_accuracy": 0.9713073043491371, "eval_dev_accuracy_threshold": 0.9408199787139893, "eval_dev_average_precision": 0.8671213714406215, "eval_dev_f1": 0.7911789297658863, "eval_dev_f1_threshold": 0.6503252983093262, "eval_dev_precision": 0.7563193126186433, "eval_dev_recall": 0.829407253204777, "eval_loss": 0.23295743763446808, "eval_runtime": 911.9086, "eval_samples_per_second": 145.461, "eval_steps_per_second": 2.273, "step": 61500 }, { "epoch": 1.6416160324059268, "grad_norm": 8569.271484375, "learning_rate": 1.9717534197903714e-05, "loss": 0.1703, "step": 61600 }, { "epoch": 1.644280993497495, "grad_norm": 20367.513671875, "learning_rate": 1.9895185645763016e-05, "loss": 0.1624, "step": 61700 }, { "epoch": 1.646945954589063, "grad_norm": 1712.7371826171875, "learning_rate": 1.9991906350553724e-05, "loss": 0.1526, "step": 61800 }, { "epoch": 1.6496109156806311, "grad_norm": 408.11163330078125, "learning_rate": 1.9972165742148174e-05, "loss": 0.1611, "step": 61900 }, { "epoch": 1.6522758767721992, "grad_norm": 6086.27587890625, "learning_rate": 1.9952425133742624e-05, "loss": 0.1603, "step": 62000 }, { "epoch": 1.6522758767721992, "eval_dev_accuracy": 0.971164067035063, "eval_dev_accuracy_threshold": 0.9526249170303345, "eval_dev_average_precision": 0.8575865091547995, "eval_dev_f1": 0.7882105728821057, "eval_dev_f1_threshold": 0.9510890245437622, "eval_dev_precision": 0.7956905213799264, "eval_dev_recall": 0.7808699463131369, "eval_loss": 0.2797718644142151, "eval_runtime": 911.0244, "eval_samples_per_second": 145.602, "eval_steps_per_second": 2.275, "step": 62000 }, { "epoch": 1.6549408378637671, "grad_norm": 448.80615234375, "learning_rate": 1.993268452533707e-05, "loss": 0.1546, "step": 62100 }, { "epoch": 1.6576057989553352, "grad_norm": 31734.08984375, "learning_rate": 1.991294391693152e-05, "loss": 0.2302, "step": 62200 }, { "epoch": 1.6602707600469033, "grad_norm": 18211.0, "learning_rate": 1.989320330852597e-05, "loss": 0.1694, "step": 62300 }, { "epoch": 1.6629357211384712, "grad_norm": 8841.400390625, "learning_rate": 1.9873462700120417e-05, "loss": 0.1705, "step": 62400 }, { "epoch": 1.6656006822300395, "grad_norm": 24008.82421875, "learning_rate": 1.985372209171487e-05, "loss": 0.1606, "step": 62500 }, { "epoch": 1.6656006822300395, "eval_dev_accuracy": 0.9708549759889029, "eval_dev_accuracy_threshold": 0.9542537927627563, "eval_dev_average_precision": 0.841409192198319, "eval_dev_f1": 0.7890381515314348, "eval_dev_f1_threshold": 0.8909753561019897, "eval_dev_precision": 0.7742275651165244, "eval_dev_recall": 0.8044264270844746, "eval_loss": 0.2822663486003876, "eval_runtime": 911.9083, "eval_samples_per_second": 145.461, "eval_steps_per_second": 2.273, "step": 62500 }, { "epoch": 1.6682656433216074, "grad_norm": 2424.1279296875, "learning_rate": 1.9833981483309317e-05, "loss": 0.1887, "step": 62600 }, { "epoch": 1.6709306044131755, "grad_norm": 45195.04296875, "learning_rate": 1.9814240874903764e-05, "loss": 0.1918, "step": 62700 }, { "epoch": 1.6735955655047436, "grad_norm": 2223.521728515625, "learning_rate": 1.9794500266498217e-05, "loss": 0.1475, "step": 62800 }, { "epoch": 1.6762605265963115, "grad_norm": 2829.02099609375, "learning_rate": 1.9774759658092664e-05, "loss": 0.1995, "step": 62900 }, { "epoch": 1.6789254876878799, "grad_norm": 11702.283203125, "learning_rate": 1.975501904968711e-05, "loss": 0.1648, "step": 63000 }, { "epoch": 1.6789254876878799, "eval_dev_accuracy": 0.9710359073329966, "eval_dev_accuracy_threshold": 0.910698652267456, "eval_dev_average_precision": 0.849610869643878, "eval_dev_f1": 0.7900427192658614, "eval_dev_f1_threshold": 0.4727928936481476, "eval_dev_precision": 0.7616432784218019, "eval_dev_recall": 0.8206420510573025, "eval_loss": 0.25969284772872925, "eval_runtime": 910.5015, "eval_samples_per_second": 145.686, "eval_steps_per_second": 2.277, "step": 63000 }, { "epoch": 1.6815904487794477, "grad_norm": 21649.341796875, "learning_rate": 1.9735278441281564e-05, "loss": 0.1788, "step": 63100 }, { "epoch": 1.6842554098710159, "grad_norm": 86422.7421875, "learning_rate": 1.971553783287601e-05, "loss": 0.2286, "step": 63200 }, { "epoch": 1.686920370962584, "grad_norm": 45808.265625, "learning_rate": 1.969579722447046e-05, "loss": 0.1611, "step": 63300 }, { "epoch": 1.6895853320541518, "grad_norm": 13495.0380859375, "learning_rate": 1.967605661606491e-05, "loss": 0.1962, "step": 63400 }, { "epoch": 1.6922502931457202, "grad_norm": 22458.46484375, "learning_rate": 1.9656316007659357e-05, "loss": 0.1825, "step": 63500 }, { "epoch": 1.6922502931457202, "eval_dev_accuracy": 0.9710283685269927, "eval_dev_accuracy_threshold": 0.932883620262146, "eval_dev_average_precision": 0.8574042822114104, "eval_dev_f1": 0.7900720576461169, "eval_dev_f1_threshold": 0.9062104225158691, "eval_dev_precision": 0.7702955870108243, "eval_dev_recall": 0.8108907636682371, "eval_loss": 0.20927684009075165, "eval_runtime": 911.9738, "eval_samples_per_second": 145.45, "eval_steps_per_second": 2.273, "step": 63500 }, { "epoch": 1.694915254237288, "grad_norm": 4333.6484375, "learning_rate": 1.9636575399253807e-05, "loss": 0.1795, "step": 63600 }, { "epoch": 1.6975802153288562, "grad_norm": 51141.83203125, "learning_rate": 1.9616834790848257e-05, "loss": 0.1944, "step": 63700 }, { "epoch": 1.7002451764204243, "grad_norm": 24413.966796875, "learning_rate": 1.9597094182442704e-05, "loss": 0.196, "step": 63800 }, { "epoch": 1.7029101375119922, "grad_norm": 11386.5224609375, "learning_rate": 1.9577353574037154e-05, "loss": 0.1851, "step": 63900 }, { "epoch": 1.7055750986035605, "grad_norm": 1291.42236328125, "learning_rate": 1.9557612965631604e-05, "loss": 0.1787, "step": 64000 }, { "epoch": 1.7055750986035605, "eval_dev_accuracy": 0.9717747103213793, "eval_dev_accuracy_threshold": 0.9631803035736084, "eval_dev_average_precision": 0.8630868871875782, "eval_dev_f1": 0.7981506777345803, "eval_dev_f1_threshold": 0.9355161786079407, "eval_dev_precision": 0.7667305945291208, "eval_dev_recall": 0.8322559439027063, "eval_loss": 0.23051400482654572, "eval_runtime": 912.7714, "eval_samples_per_second": 145.323, "eval_steps_per_second": 2.271, "step": 64000 }, { "epoch": 1.7082400596951284, "grad_norm": 689.987060546875, "learning_rate": 1.953787235722605e-05, "loss": 0.1924, "step": 64100 }, { "epoch": 1.7109050207866965, "grad_norm": 11370.0517578125, "learning_rate": 1.95181317488205e-05, "loss": 0.1611, "step": 64200 }, { "epoch": 1.7135699818782646, "grad_norm": 15404.4140625, "learning_rate": 1.949839114041495e-05, "loss": 0.1799, "step": 64300 }, { "epoch": 1.7162349429698325, "grad_norm": 14026.65234375, "learning_rate": 1.9478650532009397e-05, "loss": 0.1977, "step": 64400 }, { "epoch": 1.7188999040614008, "grad_norm": 1225.2841796875, "learning_rate": 1.9458909923603847e-05, "loss": 0.1672, "step": 64500 }, { "epoch": 1.7188999040614008, "eval_dev_accuracy": 0.9720687237555315, "eval_dev_accuracy_threshold": 0.920991063117981, "eval_dev_average_precision": 0.845700489229083, "eval_dev_f1": 0.7995607383778697, "eval_dev_f1_threshold": 0.6048256158828735, "eval_dev_precision": 0.7648059223689476, "eval_dev_recall": 0.8376246302180343, "eval_loss": 0.21997055411338806, "eval_runtime": 912.741, "eval_samples_per_second": 145.328, "eval_steps_per_second": 2.271, "step": 64500 }, { "epoch": 1.7215648651529687, "grad_norm": 18876.72265625, "learning_rate": 1.9439169315198297e-05, "loss": 0.1812, "step": 64600 }, { "epoch": 1.7242298262445368, "grad_norm": 44768.2578125, "learning_rate": 1.9419428706792744e-05, "loss": 0.1641, "step": 64700 }, { "epoch": 1.726894787336105, "grad_norm": 1987.0482177734375, "learning_rate": 1.9399688098387194e-05, "loss": 0.1526, "step": 64800 }, { "epoch": 1.7295597484276728, "grad_norm": 1468.9228515625, "learning_rate": 1.9379947489981644e-05, "loss": 0.1745, "step": 64900 }, { "epoch": 1.7322247095192411, "grad_norm": 2461.248291015625, "learning_rate": 1.936020688157609e-05, "loss": 0.2017, "step": 65000 }, { "epoch": 1.7322247095192411, "eval_dev_accuracy": 0.9716842446493325, "eval_dev_accuracy_threshold": 0.8851553201675415, "eval_dev_average_precision": 0.8642482817005424, "eval_dev_f1": 0.7979695431472081, "eval_dev_f1_threshold": 0.674056887626648, "eval_dev_precision": 0.7787859824780976, "eval_dev_recall": 0.8181220554399036, "eval_loss": 0.25105008482933044, "eval_runtime": 933.219, "eval_samples_per_second": 142.139, "eval_steps_per_second": 2.221, "step": 65000 }, { "epoch": 1.734889670610809, "grad_norm": 1901.36474609375, "learning_rate": 1.934046627317054e-05, "loss": 0.2092, "step": 65100 }, { "epoch": 1.7375546317023771, "grad_norm": 25123.84375, "learning_rate": 1.932072566476499e-05, "loss": 0.1807, "step": 65200 }, { "epoch": 1.7402195927939452, "grad_norm": 21136.314453125, "learning_rate": 1.9300985056359437e-05, "loss": 0.1627, "step": 65300 }, { "epoch": 1.742884553885513, "grad_norm": 14610.0068359375, "learning_rate": 1.9281244447953887e-05, "loss": 0.1809, "step": 65400 }, { "epoch": 1.7455495149770814, "grad_norm": 5105.17529296875, "learning_rate": 1.9261503839548337e-05, "loss": 0.1774, "step": 65500 }, { "epoch": 1.7455495149770814, "eval_dev_accuracy": 0.9722270386816136, "eval_dev_accuracy_threshold": 0.9311126470565796, "eval_dev_average_precision": 0.8672414882858807, "eval_dev_f1": 0.801593625498008, "eval_dev_f1_threshold": 0.841367244720459, "eval_dev_precision": 0.7779954629820581, "eval_dev_recall": 0.8266681275336912, "eval_loss": 0.2049088478088379, "eval_runtime": 933.0893, "eval_samples_per_second": 142.159, "eval_steps_per_second": 2.222, "step": 65500 }, { "epoch": 1.7482144760686493, "grad_norm": 52553.86328125, "learning_rate": 1.9241763231142784e-05, "loss": 0.154, "step": 65600 }, { "epoch": 1.7508794371602174, "grad_norm": 8918.7666015625, "learning_rate": 1.9222022622737234e-05, "loss": 0.1871, "step": 65700 }, { "epoch": 1.7535443982517855, "grad_norm": 1728.83984375, "learning_rate": 1.9202282014331684e-05, "loss": 0.1929, "step": 65800 }, { "epoch": 1.7562093593433534, "grad_norm": 8542.5439453125, "learning_rate": 1.918254140592613e-05, "loss": 0.1519, "step": 65900 }, { "epoch": 1.7588743204349218, "grad_norm": 40360.875, "learning_rate": 1.916280079752058e-05, "loss": 0.2105, "step": 66000 }, { "epoch": 1.7588743204349218, "eval_dev_accuracy": 0.9723099655476566, "eval_dev_accuracy_threshold": 0.82029128074646, "eval_dev_average_precision": 0.8667448997071003, "eval_dev_f1": 0.801227852873068, "eval_dev_f1_threshold": 0.5722821354866028, "eval_dev_precision": 0.7878627409447151, "eval_dev_recall": 0.8150542346882875, "eval_loss": 0.2998444736003876, "eval_runtime": 935.7172, "eval_samples_per_second": 141.76, "eval_steps_per_second": 2.215, "step": 66000 }, { "epoch": 1.7615392815264896, "grad_norm": 46394.6875, "learning_rate": 1.914306018911503e-05, "loss": 0.1664, "step": 66100 }, { "epoch": 1.7642042426180577, "grad_norm": 3412.559814453125, "learning_rate": 1.9123319580709477e-05, "loss": 0.1806, "step": 66200 }, { "epoch": 1.7668692037096259, "grad_norm": 5545.865234375, "learning_rate": 1.910357897230393e-05, "loss": 0.1881, "step": 66300 }, { "epoch": 1.7695341648011937, "grad_norm": 85940.0234375, "learning_rate": 1.9083838363898377e-05, "loss": 0.1881, "step": 66400 }, { "epoch": 1.772199125892762, "grad_norm": 15622.53125, "learning_rate": 1.9064097755492824e-05, "loss": 0.1889, "step": 66500 }, { "epoch": 1.772199125892762, "eval_dev_accuracy": 0.9718651759934261, "eval_dev_accuracy_threshold": 0.9094328880310059, "eval_dev_average_precision": 0.8682256601471484, "eval_dev_f1": 0.7982062780269058, "eval_dev_f1_threshold": 0.6328648328781128, "eval_dev_precision": 0.7697395197395197, "eval_dev_recall": 0.8288594280705599, "eval_loss": 0.19647949934005737, "eval_runtime": 933.2357, "eval_samples_per_second": 142.137, "eval_steps_per_second": 2.221, "step": 66500 }, { "epoch": 1.77486408698433, "grad_norm": 1002.01220703125, "learning_rate": 1.9044357147087277e-05, "loss": 0.1722, "step": 66600 }, { "epoch": 1.777529048075898, "grad_norm": 45076.7421875, "learning_rate": 1.9024616538681724e-05, "loss": 0.1999, "step": 66700 }, { "epoch": 1.7801940091674662, "grad_norm": 2053.866455078125, "learning_rate": 1.900487593027617e-05, "loss": 0.1894, "step": 66800 }, { "epoch": 1.782858970259034, "grad_norm": 3085.87451171875, "learning_rate": 1.8985135321870624e-05, "loss": 0.1702, "step": 66900 }, { "epoch": 1.7855239313506024, "grad_norm": 1689.106201171875, "learning_rate": 1.896539471346507e-05, "loss": 0.1905, "step": 67000 }, { "epoch": 1.7855239313506024, "eval_dev_accuracy": 0.97235519838368, "eval_dev_accuracy_threshold": 0.8816102743148804, "eval_dev_average_precision": 0.8719513025342801, "eval_dev_f1": 0.8005663642561224, "eval_dev_f1_threshold": 0.595874547958374, "eval_dev_precision": 0.7677529672098169, "eval_dev_recall": 0.8363098498959132, "eval_loss": 0.22260619699954987, "eval_runtime": 935.4859, "eval_samples_per_second": 141.795, "eval_steps_per_second": 2.216, "step": 67000 }, { "epoch": 1.7881888924421703, "grad_norm": 24842.880859375, "learning_rate": 1.8945654105059517e-05, "loss": 0.1809, "step": 67100 }, { "epoch": 1.7908538535337384, "grad_norm": 60853.56640625, "learning_rate": 1.892591349665397e-05, "loss": 0.1825, "step": 67200 }, { "epoch": 1.7935188146253065, "grad_norm": 6448.2060546875, "learning_rate": 1.8906172888248417e-05, "loss": 0.1912, "step": 67300 }, { "epoch": 1.7961837757168744, "grad_norm": 28209.67578125, "learning_rate": 1.8886432279842867e-05, "loss": 0.1849, "step": 67400 }, { "epoch": 1.7988487368084427, "grad_norm": 1441.7255859375, "learning_rate": 1.8866691671437317e-05, "loss": 0.1812, "step": 67500 }, { "epoch": 1.7988487368084427, "eval_dev_accuracy": 0.9728678371919456, "eval_dev_accuracy_threshold": 0.8422494530677795, "eval_dev_average_precision": 0.8713666080730428, "eval_dev_f1": 0.8021557531662624, "eval_dev_f1_threshold": 0.6560682058334351, "eval_dev_precision": 0.7893508697496818, "eval_dev_recall": 0.8153829297688178, "eval_loss": 0.21360942721366882, "eval_runtime": 935.6466, "eval_samples_per_second": 141.77, "eval_steps_per_second": 2.216, "step": 67500 }, { "epoch": 1.8015136979000106, "grad_norm": 19593.896484375, "learning_rate": 1.8846951063031764e-05, "loss": 0.1729, "step": 67600 }, { "epoch": 1.8041786589915787, "grad_norm": 49532.5390625, "learning_rate": 1.8827210454626214e-05, "loss": 0.1981, "step": 67700 }, { "epoch": 1.8068436200831468, "grad_norm": 2939.565185546875, "learning_rate": 1.8807469846220664e-05, "loss": 0.172, "step": 67800 }, { "epoch": 1.8095085811747147, "grad_norm": 18294.060546875, "learning_rate": 1.878772923781511e-05, "loss": 0.1609, "step": 67900 }, { "epoch": 1.812173542266283, "grad_norm": 67081.5234375, "learning_rate": 1.876798862940956e-05, "loss": 0.18, "step": 68000 }, { "epoch": 1.812173542266283, "eval_dev_accuracy": 0.972196883457598, "eval_dev_accuracy_threshold": 0.9536248445510864, "eval_dev_average_precision": 0.8677887820499237, "eval_dev_f1": 0.7948606271777002, "eval_dev_f1_threshold": 0.8924222588539124, "eval_dev_precision": 0.7899577967752408, "eval_dev_recall": 0.7998246959570505, "eval_loss": 0.22794483602046967, "eval_runtime": 934.1595, "eval_samples_per_second": 141.996, "eval_steps_per_second": 2.219, "step": 68000 }, { "epoch": 1.814838503357851, "grad_norm": 3441.131103515625, "learning_rate": 1.874824802100401e-05, "loss": 0.1618, "step": 68100 }, { "epoch": 1.817503464449419, "grad_norm": 40774.67578125, "learning_rate": 1.8728507412598457e-05, "loss": 0.1673, "step": 68200 }, { "epoch": 1.8201684255409871, "grad_norm": 23139.685546875, "learning_rate": 1.8708766804192907e-05, "loss": 0.1793, "step": 68300 }, { "epoch": 1.822833386632555, "grad_norm": 8400.26171875, "learning_rate": 1.8689026195787357e-05, "loss": 0.219, "step": 68400 }, { "epoch": 1.8254983477241233, "grad_norm": 874.6626586914062, "learning_rate": 1.8669285587381804e-05, "loss": 0.1714, "step": 68500 }, { "epoch": 1.8254983477241233, "eval_dev_accuracy": 0.9731241565960783, "eval_dev_accuracy_threshold": 0.939326286315918, "eval_dev_average_precision": 0.872717385393903, "eval_dev_f1": 0.803395225464191, "eval_dev_f1_threshold": 0.7294609546661377, "eval_dev_precision": 0.7787719839555692, "eval_dev_recall": 0.8296263832584639, "eval_loss": 0.22690728306770325, "eval_runtime": 931.9708, "eval_samples_per_second": 142.33, "eval_steps_per_second": 2.224, "step": 68500 }, { "epoch": 1.8281633088156912, "grad_norm": 339.2591552734375, "learning_rate": 1.8649544978976254e-05, "loss": 0.1969, "step": 68600 }, { "epoch": 1.8308282699072593, "grad_norm": 48369.09375, "learning_rate": 1.8629804370570704e-05, "loss": 0.1715, "step": 68700 }, { "epoch": 1.8334932309988274, "grad_norm": 1295.3619384765625, "learning_rate": 1.861006376216515e-05, "loss": 0.1728, "step": 68800 }, { "epoch": 1.8361581920903953, "grad_norm": 13706.5322265625, "learning_rate": 1.85903231537596e-05, "loss": 0.1768, "step": 68900 }, { "epoch": 1.8388231531819637, "grad_norm": 36329.11328125, "learning_rate": 1.857058254535405e-05, "loss": 0.1821, "step": 69000 }, { "epoch": 1.8388231531819637, "eval_dev_accuracy": 0.9732900103281642, "eval_dev_accuracy_threshold": 0.9531596899032593, "eval_dev_average_precision": 0.8750664616109699, "eval_dev_f1": 0.8036220816059348, "eval_dev_f1_threshold": 0.925843358039856, "eval_dev_precision": 0.8002172732210755, "eval_dev_recall": 0.807055987728717, "eval_loss": 0.22201138734817505, "eval_runtime": 933.9653, "eval_samples_per_second": 142.026, "eval_steps_per_second": 2.22, "step": 69000 }, { "epoch": 1.8414881142735315, "grad_norm": 21184.15234375, "learning_rate": 1.8550841936948497e-05, "loss": 0.1925, "step": 69100 }, { "epoch": 1.8441530753650996, "grad_norm": 1523.7003173828125, "learning_rate": 1.8531101328542947e-05, "loss": 0.1761, "step": 69200 }, { "epoch": 1.8468180364566678, "grad_norm": 18345.251953125, "learning_rate": 1.8511360720137397e-05, "loss": 0.1656, "step": 69300 }, { "epoch": 1.8494829975482356, "grad_norm": 3282.25830078125, "learning_rate": 1.8491620111731844e-05, "loss": 0.2208, "step": 69400 }, { "epoch": 1.852147958639804, "grad_norm": 10842.587890625, "learning_rate": 1.8471879503326294e-05, "loss": 0.1579, "step": 69500 }, { "epoch": 1.852147958639804, "eval_dev_accuracy": 0.9733653983882032, "eval_dev_accuracy_threshold": 0.9553133249282837, "eval_dev_average_precision": 0.8719043392702807, "eval_dev_f1": 0.8058681249342727, "eval_dev_f1_threshold": 0.8481921553611755, "eval_dev_precision": 0.7747447174198766, "eval_dev_recall": 0.8395968007012161, "eval_loss": 0.2123890370130539, "eval_runtime": 932.9373, "eval_samples_per_second": 142.182, "eval_steps_per_second": 2.222, "step": 69500 }, { "epoch": 1.8548129197313719, "grad_norm": 14122.6357421875, "learning_rate": 1.8452138894920744e-05, "loss": 0.1684, "step": 69600 }, { "epoch": 1.85747788082294, "grad_norm": 22713.14453125, "learning_rate": 1.843239828651519e-05, "loss": 0.207, "step": 69700 }, { "epoch": 1.860142841914508, "grad_norm": 21279.48828125, "learning_rate": 1.841265767810964e-05, "loss": 0.1679, "step": 69800 }, { "epoch": 1.862807803006076, "grad_norm": 1724.1683349609375, "learning_rate": 1.839291706970409e-05, "loss": 0.1658, "step": 69900 }, { "epoch": 1.8654727640976443, "grad_norm": 25310.3359375, "learning_rate": 1.8373176461298537e-05, "loss": 0.2035, "step": 70000 }, { "epoch": 1.8654727640976443, "eval_dev_accuracy": 0.9732146222681252, "eval_dev_accuracy_threshold": 0.9318354725837708, "eval_dev_average_precision": 0.8761383143347535, "eval_dev_f1": 0.8027572731220147, "eval_dev_f1_threshold": 0.7808271646499634, "eval_dev_precision": 0.7954178767344304, "eval_dev_recall": 0.8102333735071765, "eval_loss": 0.2241707593202591, "eval_runtime": 934.0769, "eval_samples_per_second": 142.009, "eval_steps_per_second": 2.219, "step": 70000 }, { "epoch": 1.8681377251892122, "grad_norm": 1192.052978515625, "learning_rate": 1.8353435852892987e-05, "loss": 0.1671, "step": 70100 }, { "epoch": 1.8708026862807803, "grad_norm": 3381.109375, "learning_rate": 1.8333695244487437e-05, "loss": 0.1777, "step": 70200 }, { "epoch": 1.8734676473723484, "grad_norm": 2287.74267578125, "learning_rate": 1.8313954636081884e-05, "loss": 0.1894, "step": 70300 }, { "epoch": 1.8761326084639163, "grad_norm": 5671.9111328125, "learning_rate": 1.8294214027676334e-05, "loss": 0.2227, "step": 70400 }, { "epoch": 1.8787975695554846, "grad_norm": 8669.9560546875, "learning_rate": 1.8274473419270784e-05, "loss": 0.1754, "step": 70500 }, { "epoch": 1.8787975695554846, "eval_dev_accuracy": 0.9732221610741291, "eval_dev_accuracy_threshold": 0.9238910675048828, "eval_dev_average_precision": 0.8787022531852614, "eval_dev_f1": 0.8059863355384449, "eval_dev_f1_threshold": 0.760931134223938, "eval_dev_precision": 0.7978529253891573, "eval_dev_recall": 0.8142872795003835, "eval_loss": 0.1879546046257019, "eval_runtime": 934.0481, "eval_samples_per_second": 142.013, "eval_steps_per_second": 2.219, "step": 70500 }, { "epoch": 1.8814625306470525, "grad_norm": 21830.791015625, "learning_rate": 1.825473281086523e-05, "loss": 0.1683, "step": 70600 }, { "epoch": 1.8841274917386206, "grad_norm": 5870.6396484375, "learning_rate": 1.823499220245968e-05, "loss": 0.1618, "step": 70700 }, { "epoch": 1.8867924528301887, "grad_norm": 9237.384765625, "learning_rate": 1.821525159405413e-05, "loss": 0.1806, "step": 70800 }, { "epoch": 1.8894574139217566, "grad_norm": 5946.40380859375, "learning_rate": 1.8195510985648577e-05, "loss": 0.1701, "step": 70900 }, { "epoch": 1.892122375013325, "grad_norm": 4265.1650390625, "learning_rate": 1.8175770377243027e-05, "loss": 0.1752, "step": 71000 }, { "epoch": 1.892122375013325, "eval_dev_accuracy": 0.9730336909240315, "eval_dev_accuracy_threshold": 0.9348808526992798, "eval_dev_average_precision": 0.8700561831987852, "eval_dev_f1": 0.8034291366708798, "eval_dev_f1_threshold": 0.9348808526992798, "eval_dev_precision": 0.8059536934950385, "eval_dev_recall": 0.8009203462254848, "eval_loss": 0.201664537191391, "eval_runtime": 931.3245, "eval_samples_per_second": 142.428, "eval_steps_per_second": 2.226, "step": 71000 }, { "epoch": 1.8947873361048928, "grad_norm": 2272.4169921875, "learning_rate": 1.8156029768837477e-05, "loss": 0.1688, "step": 71100 }, { "epoch": 1.897452297196461, "grad_norm": 11893.5654296875, "learning_rate": 1.8136289160431924e-05, "loss": 0.184, "step": 71200 }, { "epoch": 1.900117258288029, "grad_norm": 3861.369384765625, "learning_rate": 1.8116548552026374e-05, "loss": 0.1665, "step": 71300 }, { "epoch": 1.902782219379597, "grad_norm": 35609.0, "learning_rate": 1.8096807943620824e-05, "loss": 0.1749, "step": 71400 }, { "epoch": 1.9054471804711652, "grad_norm": 11618.3125, "learning_rate": 1.8077067335215274e-05, "loss": 0.1899, "step": 71500 }, { "epoch": 1.9054471804711652, "eval_dev_accuracy": 0.9734483252542462, "eval_dev_accuracy_threshold": 0.943538248538971, "eval_dev_average_precision": 0.8746432264035248, "eval_dev_f1": 0.8067354698533405, "eval_dev_f1_threshold": 0.9360702037811279, "eval_dev_precision": 0.7999569104815254, "eval_dev_recall": 0.8136298893393229, "eval_loss": 0.20475232601165771, "eval_runtime": 860.459, "eval_samples_per_second": 154.158, "eval_steps_per_second": 2.409, "step": 71500 }, { "epoch": 1.9081121415627331, "grad_norm": 8260.7607421875, "learning_rate": 1.805732672680972e-05, "loss": 0.1886, "step": 71600 }, { "epoch": 1.9107771026543012, "grad_norm": 47676.78125, "learning_rate": 1.803758611840417e-05, "loss": 0.1858, "step": 71700 }, { "epoch": 1.9134420637458693, "grad_norm": 554.1092529296875, "learning_rate": 1.801784550999862e-05, "loss": 0.165, "step": 71800 }, { "epoch": 1.9161070248374372, "grad_norm": 12699.4365234375, "learning_rate": 1.7998104901593067e-05, "loss": 0.1784, "step": 71900 }, { "epoch": 1.9187719859290056, "grad_norm": 4534.798828125, "learning_rate": 1.7978364293187517e-05, "loss": 0.1767, "step": 72000 }, { "epoch": 1.9187719859290056, "eval_dev_accuracy": 0.9739835804805235, "eval_dev_accuracy_threshold": 0.9395354986190796, "eval_dev_average_precision": 0.8772612012666982, "eval_dev_f1": 0.8093941820122765, "eval_dev_f1_threshold": 0.875823974609375, "eval_dev_precision": 0.7891340549542049, "eval_dev_recall": 0.8307220335268982, "eval_loss": 0.20605036616325378, "eval_runtime": 861.3232, "eval_samples_per_second": 154.004, "eval_steps_per_second": 2.407, "step": 72000 }, { "epoch": 1.9214369470205734, "grad_norm": 65605.9375, "learning_rate": 1.7958623684781968e-05, "loss": 0.1687, "step": 72100 }, { "epoch": 1.9241019081121415, "grad_norm": 11532.1455078125, "learning_rate": 1.7938883076376414e-05, "loss": 0.1664, "step": 72200 }, { "epoch": 1.9267668692037097, "grad_norm": 11916.1513671875, "learning_rate": 1.7919142467970864e-05, "loss": 0.1669, "step": 72300 }, { "epoch": 1.9294318302952775, "grad_norm": 2029.2286376953125, "learning_rate": 1.7899401859565314e-05, "loss": 0.1787, "step": 72400 }, { "epoch": 1.9320967913868459, "grad_norm": 6753.46142578125, "learning_rate": 1.787966125115976e-05, "loss": 0.1728, "step": 72500 }, { "epoch": 1.9320967913868459, "eval_dev_accuracy": 0.9743379043627071, "eval_dev_accuracy_threshold": 0.8970457315444946, "eval_dev_average_precision": 0.8806920275415929, "eval_dev_f1": 0.8153239556692241, "eval_dev_f1_threshold": 0.7824004888534546, "eval_dev_precision": 0.7935898765688206, "eval_dev_recall": 0.8382820203790949, "eval_loss": 0.19223952293395996, "eval_runtime": 862.5657, "eval_samples_per_second": 153.782, "eval_steps_per_second": 2.403, "step": 72500 }, { "epoch": 1.9347617524784138, "grad_norm": 27343.193359375, "learning_rate": 1.785992064275421e-05, "loss": 0.1443, "step": 72600 }, { "epoch": 1.9374267135699819, "grad_norm": 13309.6455078125, "learning_rate": 1.784018003434866e-05, "loss": 0.1569, "step": 72700 }, { "epoch": 1.94009167466155, "grad_norm": 1874.899169921875, "learning_rate": 1.7820439425943108e-05, "loss": 0.1931, "step": 72800 }, { "epoch": 1.9427566357531179, "grad_norm": 31156.685546875, "learning_rate": 1.7800698817537558e-05, "loss": 0.1811, "step": 72900 }, { "epoch": 1.9454215968446862, "grad_norm": 4346.09912109375, "learning_rate": 1.7780958209132008e-05, "loss": 0.1836, "step": 73000 }, { "epoch": 1.9454215968446862, "eval_dev_accuracy": 0.9730563073420432, "eval_dev_accuracy_threshold": 0.9250275492668152, "eval_dev_average_precision": 0.8743046594125137, "eval_dev_f1": 0.8057607880929436, "eval_dev_f1_threshold": 0.8426618576049805, "eval_dev_precision": 0.7878756151188357, "eval_dev_recall": 0.8244768269968226, "eval_loss": 0.207134410738945, "eval_runtime": 861.6487, "eval_samples_per_second": 153.946, "eval_steps_per_second": 2.406, "step": 73000 }, { "epoch": 1.948086557936254, "grad_norm": 5061.1884765625, "learning_rate": 1.7761217600726454e-05, "loss": 0.1739, "step": 73100 }, { "epoch": 1.9507515190278222, "grad_norm": 103200.015625, "learning_rate": 1.7741476992320904e-05, "loss": 0.1966, "step": 73200 }, { "epoch": 1.9534164801193903, "grad_norm": 18783.486328125, "learning_rate": 1.7721736383915354e-05, "loss": 0.1723, "step": 73300 }, { "epoch": 1.9560814412109582, "grad_norm": 13243.9150390625, "learning_rate": 1.7701995775509804e-05, "loss": 0.1698, "step": 73400 }, { "epoch": 1.9587464023025265, "grad_norm": 4332.658203125, "learning_rate": 1.768225516710425e-05, "loss": 0.1801, "step": 73500 }, { "epoch": 1.9587464023025265, "eval_dev_accuracy": 0.972988458088008, "eval_dev_accuracy_threshold": 0.9180799126625061, "eval_dev_average_precision": 0.8762342719828209, "eval_dev_f1": 0.8045175392942646, "eval_dev_f1_threshold": 0.7035636901855469, "eval_dev_precision": 0.7662337662337663, "eval_dev_recall": 0.8468280924728827, "eval_loss": 0.18561449646949768, "eval_runtime": 862.9273, "eval_samples_per_second": 153.717, "eval_steps_per_second": 2.402, "step": 73500 }, { "epoch": 1.9614113633940944, "grad_norm": 13960.3876953125, "learning_rate": 1.76625145586987e-05, "loss": 0.1599, "step": 73600 }, { "epoch": 1.9640763244856625, "grad_norm": 12248.2890625, "learning_rate": 1.764277395029315e-05, "loss": 0.1722, "step": 73700 }, { "epoch": 1.9667412855772306, "grad_norm": 20745.55859375, "learning_rate": 1.7623033341887598e-05, "loss": 0.1708, "step": 73800 }, { "epoch": 1.9694062466687985, "grad_norm": 13722.9697265625, "learning_rate": 1.7603292733482048e-05, "loss": 0.1662, "step": 73900 }, { "epoch": 1.9720712077603668, "grad_norm": 18372.69140625, "learning_rate": 1.7583552125076498e-05, "loss": 0.1716, "step": 74000 }, { "epoch": 1.9720712077603668, "eval_dev_accuracy": 0.9739232700324922, "eval_dev_accuracy_threshold": 0.8308413624763489, "eval_dev_average_precision": 0.8841492699463087, "eval_dev_f1": 0.8137931034482759, "eval_dev_f1_threshold": 0.6751728057861328, "eval_dev_precision": 0.7832387515200648, "eval_dev_recall": 0.8468280924728827, "eval_loss": 0.2016657292842865, "eval_runtime": 862.1524, "eval_samples_per_second": 153.856, "eval_steps_per_second": 2.404, "step": 74000 }, { "epoch": 1.9747361688519347, "grad_norm": 22373.701171875, "learning_rate": 1.7563811516670944e-05, "loss": 0.1741, "step": 74100 }, { "epoch": 1.9774011299435028, "grad_norm": 1855.767822265625, "learning_rate": 1.7544070908265394e-05, "loss": 0.1318, "step": 74200 }, { "epoch": 1.980066091035071, "grad_norm": 20893.662109375, "learning_rate": 1.7524330299859844e-05, "loss": 0.1782, "step": 74300 }, { "epoch": 1.9827310521266388, "grad_norm": 1626.1358642578125, "learning_rate": 1.750458969145429e-05, "loss": 0.1842, "step": 74400 }, { "epoch": 1.9853960132182071, "grad_norm": 8638.869140625, "learning_rate": 1.748484908304874e-05, "loss": 0.1545, "step": 74500 }, { "epoch": 1.9853960132182071, "eval_dev_accuracy": 0.9740815849585742, "eval_dev_accuracy_threshold": 0.7622551918029785, "eval_dev_average_precision": 0.8838940929517627, "eval_dev_f1": 0.8130659767141011, "eval_dev_f1_threshold": 0.6812475919723511, "eval_dev_precision": 0.800212201591512, "eval_dev_recall": 0.826339432453161, "eval_loss": 0.21240267157554626, "eval_runtime": 862.2203, "eval_samples_per_second": 153.844, "eval_steps_per_second": 2.404, "step": 74500 }, { "epoch": 1.988060974309775, "grad_norm": 12036.10546875, "learning_rate": 1.746510847464319e-05, "loss": 0.1786, "step": 74600 }, { "epoch": 1.9907259354013431, "grad_norm": 3197.989013671875, "learning_rate": 1.7445367866237638e-05, "loss": 0.1589, "step": 74700 }, { "epoch": 1.9933908964929112, "grad_norm": 2326.903564453125, "learning_rate": 1.7425627257832088e-05, "loss": 0.1712, "step": 74800 }, { "epoch": 1.9960558575844791, "grad_norm": 13623.826171875, "learning_rate": 1.7405886649426538e-05, "loss": 0.1761, "step": 74900 }, { "epoch": 1.9987208186760475, "grad_norm": 7701.57861328125, "learning_rate": 1.7386146041020984e-05, "loss": 0.1958, "step": 75000 }, { "epoch": 1.9987208186760475, "eval_dev_accuracy": 0.9734558640602501, "eval_dev_accuracy_threshold": 0.957332968711853, "eval_dev_average_precision": 0.8773248937578426, "eval_dev_f1": 0.8058651661075641, "eval_dev_f1_threshold": 0.763139009475708, "eval_dev_precision": 0.796044895777659, "eval_dev_recall": 0.8159307549030349, "eval_loss": 0.25920844078063965, "eval_runtime": 862.3734, "eval_samples_per_second": 153.816, "eval_steps_per_second": 2.404, "step": 75000 }, { "epoch": 2.0013857797676153, "grad_norm": 19200.3828125, "learning_rate": 1.7366405432615434e-05, "loss": 0.1859, "step": 75100 }, { "epoch": 2.0040507408591837, "grad_norm": 27715.55859375, "learning_rate": 1.7346664824209884e-05, "loss": 0.215, "step": 75200 }, { "epoch": 2.0067157019507516, "grad_norm": 14230.0625, "learning_rate": 1.7326924215804334e-05, "loss": 0.1883, "step": 75300 }, { "epoch": 2.0093806630423194, "grad_norm": 214.24032592773438, "learning_rate": 1.730718360739878e-05, "loss": 0.1771, "step": 75400 }, { "epoch": 2.0120456241338878, "grad_norm": 11949.2451171875, "learning_rate": 1.728744299899323e-05, "loss": 0.1568, "step": 75500 }, { "epoch": 2.0120456241338878, "eval_dev_accuracy": 0.9732749327161564, "eval_dev_accuracy_threshold": 0.9531142115592957, "eval_dev_average_precision": 0.8772400052614694, "eval_dev_f1": 0.8078490242333263, "eval_dev_f1_threshold": 0.9034242630004883, "eval_dev_precision": 0.7909711286089239, "eval_dev_recall": 0.8254629122384135, "eval_loss": 0.27995508909225464, "eval_runtime": 861.968, "eval_samples_per_second": 153.889, "eval_steps_per_second": 2.405, "step": 75500 }, { "epoch": 2.0147105852254557, "grad_norm": 1409.49951171875, "learning_rate": 1.726770239058768e-05, "loss": 0.1797, "step": 75600 }, { "epoch": 2.017375546317024, "grad_norm": 5395.6484375, "learning_rate": 1.7247961782182128e-05, "loss": 0.1659, "step": 75700 }, { "epoch": 2.020040507408592, "grad_norm": 49720.015625, "learning_rate": 1.7228221173776578e-05, "loss": 0.1519, "step": 75800 }, { "epoch": 2.0227054685001598, "grad_norm": 39423.91015625, "learning_rate": 1.7208480565371028e-05, "loss": 0.1366, "step": 75900 }, { "epoch": 2.025370429591728, "grad_norm": 1205.4697265625, "learning_rate": 1.7188739956965474e-05, "loss": 0.1641, "step": 76000 }, { "epoch": 2.025370429591728, "eval_dev_accuracy": 0.9739760416745196, "eval_dev_accuracy_threshold": 0.9528675079345703, "eval_dev_average_precision": 0.8829642344114682, "eval_dev_f1": 0.8102727032036007, "eval_dev_f1_threshold": 0.8193379640579224, "eval_dev_precision": 0.7840746054519369, "eval_dev_recall": 0.8382820203790949, "eval_loss": 0.22183284163475037, "eval_runtime": 861.7487, "eval_samples_per_second": 153.928, "eval_steps_per_second": 2.406, "step": 76000 }, { "epoch": 2.028035390683296, "grad_norm": 143011.90625, "learning_rate": 1.7168999348559924e-05, "loss": 0.1551, "step": 76100 }, { "epoch": 2.0307003517748643, "grad_norm": 3733.740234375, "learning_rate": 1.7149258740154374e-05, "loss": 0.1612, "step": 76200 }, { "epoch": 2.033365312866432, "grad_norm": 13346.1015625, "learning_rate": 1.712951813174882e-05, "loss": 0.1643, "step": 76300 }, { "epoch": 2.036030273958, "grad_norm": 10167.767578125, "learning_rate": 1.710977752334327e-05, "loss": 0.1692, "step": 76400 }, { "epoch": 2.0386952350495684, "grad_norm": 26428.076171875, "learning_rate": 1.709003691493772e-05, "loss": 0.1708, "step": 76500 }, { "epoch": 2.0386952350495684, "eval_dev_accuracy": 0.9733277043581837, "eval_dev_accuracy_threshold": 0.9573899507522583, "eval_dev_average_precision": 0.8690568245333676, "eval_dev_f1": 0.8137024870952604, "eval_dev_f1_threshold": 0.8371973037719727, "eval_dev_precision": 0.7762634301631516, "eval_dev_recall": 0.8549359044592966, "eval_loss": 0.21817246079444885, "eval_runtime": 861.7458, "eval_samples_per_second": 153.928, "eval_steps_per_second": 2.406, "step": 76500 }, { "epoch": 2.0413601961411363, "grad_norm": 22541.1796875, "learning_rate": 1.7070296306532168e-05, "loss": 0.165, "step": 76600 }, { "epoch": 2.0440251572327046, "grad_norm": 49104.6015625, "learning_rate": 1.7050555698126618e-05, "loss": 0.1445, "step": 76700 }, { "epoch": 2.0466901183242725, "grad_norm": 47796.04296875, "learning_rate": 1.7030815089721068e-05, "loss": 0.1354, "step": 76800 }, { "epoch": 2.0493550794158404, "grad_norm": 21167.962890625, "learning_rate": 1.7011074481315514e-05, "loss": 0.1787, "step": 76900 }, { "epoch": 2.0520200405074087, "grad_norm": 75447.2890625, "learning_rate": 1.6991333872909964e-05, "loss": 0.1626, "step": 77000 }, { "epoch": 2.0520200405074087, "eval_dev_accuracy": 0.9745339133188086, "eval_dev_accuracy_threshold": 0.9593422412872314, "eval_dev_average_precision": 0.8806603026145806, "eval_dev_f1": 0.8148537765621713, "eval_dev_f1_threshold": 0.781623363494873, "eval_dev_precision": 0.7836115326251897, "eval_dev_recall": 0.848690697929221, "eval_loss": 0.2216637134552002, "eval_runtime": 862.3061, "eval_samples_per_second": 153.828, "eval_steps_per_second": 2.404, "step": 77000 }, { "epoch": 2.0546850015989766, "grad_norm": 4420.5458984375, "learning_rate": 1.6971593264504414e-05, "loss": 0.1418, "step": 77100 }, { "epoch": 2.057349962690545, "grad_norm": 14327.546875, "learning_rate": 1.695185265609886e-05, "loss": 0.2011, "step": 77200 }, { "epoch": 2.060014923782113, "grad_norm": 19713.06640625, "learning_rate": 1.693211204769331e-05, "loss": 0.1593, "step": 77300 }, { "epoch": 2.0626798848736807, "grad_norm": 5675.8125, "learning_rate": 1.691237143928776e-05, "loss": 0.1546, "step": 77400 }, { "epoch": 2.065344845965249, "grad_norm": 7002.0654296875, "learning_rate": 1.6892630830882208e-05, "loss": 0.177, "step": 77500 }, { "epoch": 2.065344845965249, "eval_dev_accuracy": 0.9752048670531561, "eval_dev_accuracy_threshold": 0.8865873217582703, "eval_dev_average_precision": 0.8890707955101652, "eval_dev_f1": 0.8212508115126596, "eval_dev_f1_threshold": 0.8439962863922119, "eval_dev_precision": 0.8111574222507214, "eval_dev_recall": 0.8315985537416457, "eval_loss": 0.21185144782066345, "eval_runtime": 860.1662, "eval_samples_per_second": 154.211, "eval_steps_per_second": 2.41, "step": 77500 }, { "epoch": 2.068009807056817, "grad_norm": 418.3937683105469, "learning_rate": 1.6872890222476658e-05, "loss": 0.1546, "step": 77600 }, { "epoch": 2.0706747681483852, "grad_norm": 47829.74609375, "learning_rate": 1.6853149614071108e-05, "loss": 0.1766, "step": 77700 }, { "epoch": 2.073339729239953, "grad_norm": 395.5926208496094, "learning_rate": 1.6833409005665554e-05, "loss": 0.1879, "step": 77800 }, { "epoch": 2.076004690331521, "grad_norm": 13378.1806640625, "learning_rate": 1.6813668397260004e-05, "loss": 0.1694, "step": 77900 }, { "epoch": 2.0786696514230893, "grad_norm": 4878.7451171875, "learning_rate": 1.6793927788854454e-05, "loss": 0.1546, "step": 78000 }, { "epoch": 2.0786696514230893, "eval_dev_accuracy": 0.9736971058523751, "eval_dev_accuracy_threshold": 0.9617332220077515, "eval_dev_average_precision": 0.8737670860803924, "eval_dev_f1": 0.8101625374783019, "eval_dev_f1_threshold": 0.8637624979019165, "eval_dev_precision": 0.7791380008093889, "eval_dev_recall": 0.8437602717212666, "eval_loss": 0.24948453903198242, "eval_runtime": 861.1759, "eval_samples_per_second": 154.03, "eval_steps_per_second": 2.407, "step": 78000 }, { "epoch": 2.0813346125146572, "grad_norm": 26331.390625, "learning_rate": 1.67741871804489e-05, "loss": 0.1742, "step": 78100 }, { "epoch": 2.0839995736062256, "grad_norm": 5203.9365234375, "learning_rate": 1.675444657204335e-05, "loss": 0.2024, "step": 78200 }, { "epoch": 2.0866645346977934, "grad_norm": 27641.3671875, "learning_rate": 1.67347059636378e-05, "loss": 0.2126, "step": 78300 }, { "epoch": 2.0893294957893613, "grad_norm": 3783.3671875, "learning_rate": 1.6714965355232248e-05, "loss": 0.1747, "step": 78400 }, { "epoch": 2.0919944568809297, "grad_norm": 20038.98046875, "learning_rate": 1.6695224746826698e-05, "loss": 0.1807, "step": 78500 }, { "epoch": 2.0919944568809297, "eval_dev_accuracy": 0.9743303655567032, "eval_dev_accuracy_threshold": 0.9270470142364502, "eval_dev_average_precision": 0.8818386397835865, "eval_dev_f1": 0.816217350257002, "eval_dev_f1_threshold": 0.7469815015792847, "eval_dev_precision": 0.7828755407988731, "eval_dev_recall": 0.8525254738687411, "eval_loss": 0.21055419743061066, "eval_runtime": 861.287, "eval_samples_per_second": 154.01, "eval_steps_per_second": 2.407, "step": 78500 }, { "epoch": 2.0946594179724976, "grad_norm": 18032.57421875, "learning_rate": 1.6675484138421148e-05, "loss": 0.1805, "step": 78600 }, { "epoch": 2.097324379064066, "grad_norm": 13172.416015625, "learning_rate": 1.6655743530015594e-05, "loss": 0.1498, "step": 78700 }, { "epoch": 2.0999893401556338, "grad_norm": 10491.02734375, "learning_rate": 1.6636002921610045e-05, "loss": 0.1899, "step": 78800 }, { "epoch": 2.1026543012472017, "grad_norm": 3893.85107421875, "learning_rate": 1.6616262313204495e-05, "loss": 0.1924, "step": 78900 }, { "epoch": 2.10531926233877, "grad_norm": 1639.23486328125, "learning_rate": 1.659652170479894e-05, "loss": 0.1521, "step": 79000 }, { "epoch": 2.10531926233877, "eval_dev_accuracy": 0.9743982148107383, "eval_dev_accuracy_threshold": 0.9525002837181091, "eval_dev_average_precision": 0.883524287942099, "eval_dev_f1": 0.8129610403803071, "eval_dev_f1_threshold": 0.9087203145027161, "eval_dev_precision": 0.8108785698713756, "eval_dev_recall": 0.8150542346882875, "eval_loss": 0.24836769700050354, "eval_runtime": 952.3381, "eval_samples_per_second": 139.286, "eval_steps_per_second": 2.177, "step": 79000 }, { "epoch": 2.107984223430338, "grad_norm": 7783.5283203125, "learning_rate": 1.657678109639339e-05, "loss": 0.1988, "step": 79100 }, { "epoch": 2.1106491845219058, "grad_norm": 1583.300537109375, "learning_rate": 1.655704048798784e-05, "loss": 0.1702, "step": 79200 }, { "epoch": 2.113314145613474, "grad_norm": 1492.0706787109375, "learning_rate": 1.6537299879582288e-05, "loss": 0.1824, "step": 79300 }, { "epoch": 2.115979106705042, "grad_norm": 18683.794921875, "learning_rate": 1.651755927117674e-05, "loss": 0.1688, "step": 79400 }, { "epoch": 2.1186440677966103, "grad_norm": 8736.2275390625, "learning_rate": 1.6497818662771188e-05, "loss": 0.1809, "step": 79500 }, { "epoch": 2.1186440677966103, "eval_dev_accuracy": 0.9739685028685157, "eval_dev_accuracy_threshold": 0.9717953205108643, "eval_dev_average_precision": 0.8798479877006415, "eval_dev_f1": 0.8135902528044657, "eval_dev_f1_threshold": 0.9465633630752563, "eval_dev_precision": 0.7974537037037037, "eval_dev_recall": 0.8303933384463679, "eval_loss": 0.22024385631084442, "eval_runtime": 951.2023, "eval_samples_per_second": 139.452, "eval_steps_per_second": 2.179, "step": 79500 }, { "epoch": 2.121309028888178, "grad_norm": 54950.51953125, "learning_rate": 1.6478078054365635e-05, "loss": 0.1858, "step": 79600 }, { "epoch": 2.1239739899797465, "grad_norm": 19716.146484375, "learning_rate": 1.6458337445960088e-05, "loss": 0.1642, "step": 79700 }, { "epoch": 2.1266389510713144, "grad_norm": 18239.75, "learning_rate": 1.6438596837554535e-05, "loss": 0.191, "step": 79800 }, { "epoch": 2.1293039121628823, "grad_norm": 41301.21875, "learning_rate": 1.641885622914898e-05, "loss": 0.1655, "step": 79900 }, { "epoch": 2.1319688732544506, "grad_norm": 1119.526123046875, "learning_rate": 1.6399115620743435e-05, "loss": 0.1789, "step": 80000 }, { "epoch": 2.1319688732544506, "eval_dev_accuracy": 0.9743152879446954, "eval_dev_accuracy_threshold": 0.8854852914810181, "eval_dev_average_precision": 0.8771901487923467, "eval_dev_f1": 0.813726025900224, "eval_dev_f1_threshold": 0.8826526403427124, "eval_dev_precision": 0.8116415958142577, "eval_dev_recall": 0.8158211898761916, "eval_loss": 0.1959654837846756, "eval_runtime": 952.4132, "eval_samples_per_second": 139.275, "eval_steps_per_second": 2.177, "step": 80000 }, { "epoch": 2.1346338343460185, "grad_norm": 3469.789794921875, "learning_rate": 1.637937501233788e-05, "loss": 0.2002, "step": 80100 }, { "epoch": 2.1372987954375864, "grad_norm": 15840.623046875, "learning_rate": 1.635963440393233e-05, "loss": 0.2139, "step": 80200 }, { "epoch": 2.1399637565291547, "grad_norm": 24576.1328125, "learning_rate": 1.633989379552678e-05, "loss": 0.199, "step": 80300 }, { "epoch": 2.1426287176207226, "grad_norm": 9852.4111328125, "learning_rate": 1.6320153187121228e-05, "loss": 0.165, "step": 80400 }, { "epoch": 2.145293678712291, "grad_norm": 280.64031982421875, "learning_rate": 1.6300412578715678e-05, "loss": 0.1848, "step": 80500 }, { "epoch": 2.145293678712291, "eval_dev_accuracy": 0.9742022058546368, "eval_dev_accuracy_threshold": 0.9753606915473938, "eval_dev_average_precision": 0.8782336024461705, "eval_dev_f1": 0.8096592433592701, "eval_dev_f1_threshold": 0.8103638887405396, "eval_dev_precision": 0.7934371055952881, "eval_dev_recall": 0.8265585625068478, "eval_loss": 0.26615819334983826, "eval_runtime": 951.0255, "eval_samples_per_second": 139.478, "eval_steps_per_second": 2.18, "step": 80500 }, { "epoch": 2.147958639803859, "grad_norm": 3749.137939453125, "learning_rate": 1.6280671970310128e-05, "loss": 0.2118, "step": 80600 }, { "epoch": 2.150623600895427, "grad_norm": 16408.94140625, "learning_rate": 1.6260931361904575e-05, "loss": 0.177, "step": 80700 }, { "epoch": 2.153288561986995, "grad_norm": 122466.71875, "learning_rate": 1.6241190753499025e-05, "loss": 0.169, "step": 80800 }, { "epoch": 2.155953523078563, "grad_norm": 35088.30078125, "learning_rate": 1.6221450145093475e-05, "loss": 0.1748, "step": 80900 }, { "epoch": 2.1586184841701312, "grad_norm": 2193.1103515625, "learning_rate": 1.620170953668792e-05, "loss": 0.1532, "step": 81000 }, { "epoch": 2.1586184841701312, "eval_dev_accuracy": 0.9747299222749101, "eval_dev_accuracy_threshold": 0.7087757587432861, "eval_dev_average_precision": 0.8839240203558189, "eval_dev_f1": 0.8178559791463017, "eval_dev_f1_threshold": 0.6686054468154907, "eval_dev_precision": 0.8108108108108109, "eval_dev_recall": 0.8250246521310398, "eval_loss": 0.2607557475566864, "eval_runtime": 952.9522, "eval_samples_per_second": 139.196, "eval_steps_per_second": 2.175, "step": 81000 }, { "epoch": 2.161283445261699, "grad_norm": 2420.868896484375, "learning_rate": 1.618196892828237e-05, "loss": 0.1618, "step": 81100 }, { "epoch": 2.163948406353267, "grad_norm": 706.0858764648438, "learning_rate": 1.616222831987682e-05, "loss": 0.1679, "step": 81200 }, { "epoch": 2.1666133674448353, "grad_norm": 23174.521484375, "learning_rate": 1.6142487711471268e-05, "loss": 0.1808, "step": 81300 }, { "epoch": 2.1692783285364032, "grad_norm": 15347.12890625, "learning_rate": 1.6122747103065718e-05, "loss": 0.1685, "step": 81400 }, { "epoch": 2.1719432896279716, "grad_norm": 19526.70703125, "learning_rate": 1.6103006494660168e-05, "loss": 0.1901, "step": 81500 }, { "epoch": 2.1719432896279716, "eval_dev_accuracy": 0.9744434476467617, "eval_dev_accuracy_threshold": 0.9750630855560303, "eval_dev_average_precision": 0.8830413621285588, "eval_dev_f1": 0.8129956790461085, "eval_dev_f1_threshold": 0.9695107936859131, "eval_dev_precision": 0.8117081695063346, "eval_dev_recall": 0.8142872795003835, "eval_loss": 0.23483458161354065, "eval_runtime": 950.8404, "eval_samples_per_second": 139.505, "eval_steps_per_second": 2.18, "step": 81500 }, { "epoch": 2.1746082507195394, "grad_norm": 1307.5916748046875, "learning_rate": 1.6083265886254615e-05, "loss": 0.184, "step": 81600 }, { "epoch": 2.177273211811108, "grad_norm": 40642.421875, "learning_rate": 1.6063525277849065e-05, "loss": 0.1667, "step": 81700 }, { "epoch": 2.1799381729026757, "grad_norm": 1084.0020751953125, "learning_rate": 1.6043784669443515e-05, "loss": 0.1816, "step": 81800 }, { "epoch": 2.1826031339942435, "grad_norm": 14024.021484375, "learning_rate": 1.602404406103796e-05, "loss": 0.159, "step": 81900 }, { "epoch": 2.185268095085812, "grad_norm": 8854.5498046875, "learning_rate": 1.600430345263241e-05, "loss": 0.1553, "step": 82000 }, { "epoch": 2.185268095085812, "eval_dev_accuracy": 0.9748957760069961, "eval_dev_accuracy_threshold": 0.8661369681358337, "eval_dev_average_precision": 0.8869519261803035, "eval_dev_f1": 0.8157429896224332, "eval_dev_f1_threshold": 0.8445290327072144, "eval_dev_precision": 0.8220046723773501, "eval_dev_recall": 0.8095759833461159, "eval_loss": 0.23748071491718292, "eval_runtime": 951.5083, "eval_samples_per_second": 139.407, "eval_steps_per_second": 2.179, "step": 82000 }, { "epoch": 2.1879330561773798, "grad_norm": 44325.265625, "learning_rate": 1.598456284422686e-05, "loss": 0.1572, "step": 82100 }, { "epoch": 2.1905980172689477, "grad_norm": 1203.1580810546875, "learning_rate": 1.5964822235821308e-05, "loss": 0.1629, "step": 82200 }, { "epoch": 2.193262978360516, "grad_norm": 745.87353515625, "learning_rate": 1.5945081627415758e-05, "loss": 0.194, "step": 82300 }, { "epoch": 2.195927939452084, "grad_norm": 17854.037109375, "learning_rate": 1.5925341019010208e-05, "loss": 0.1685, "step": 82400 }, { "epoch": 2.198592900543652, "grad_norm": 44721.08203125, "learning_rate": 1.5905600410604655e-05, "loss": 0.1859, "step": 82500 }, { "epoch": 2.198592900543652, "eval_dev_accuracy": 0.974345443168711, "eval_dev_accuracy_threshold": 0.9739015102386475, "eval_dev_average_precision": 0.8797686946603407, "eval_dev_f1": 0.8160733549083065, "eval_dev_f1_threshold": 0.9577875137329102, "eval_dev_precision": 0.8036757675555083, "eval_dev_recall": 0.8288594280705599, "eval_loss": 0.2292918860912323, "eval_runtime": 950.1815, "eval_samples_per_second": 139.602, "eval_steps_per_second": 2.182, "step": 82500 }, { "epoch": 2.20125786163522, "grad_norm": 170.60641479492188, "learning_rate": 1.5885859802199105e-05, "loss": 0.1483, "step": 82600 }, { "epoch": 2.2039228227267884, "grad_norm": 27626.072265625, "learning_rate": 1.5866119193793555e-05, "loss": 0.2056, "step": 82700 }, { "epoch": 2.2065877838183563, "grad_norm": 731.1361083984375, "learning_rate": 1.5846378585388e-05, "loss": 0.1799, "step": 82800 }, { "epoch": 2.209252744909924, "grad_norm": 36164.07421875, "learning_rate": 1.582663797698245e-05, "loss": 0.1645, "step": 82900 }, { "epoch": 2.2119177060014925, "grad_norm": 6034.74853515625, "learning_rate": 1.58068973685769e-05, "loss": 0.1633, "step": 83000 }, { "epoch": 2.2119177060014925, "eval_dev_accuracy": 0.9737423386883985, "eval_dev_accuracy_threshold": 0.950665295124054, "eval_dev_average_precision": 0.883874392367785, "eval_dev_f1": 0.8089262330859885, "eval_dev_f1_threshold": 0.9107600450515747, "eval_dev_precision": 0.8056732963808282, "eval_dev_recall": 0.8122055439903583, "eval_loss": 0.23654605448246002, "eval_runtime": 951.9974, "eval_samples_per_second": 139.335, "eval_steps_per_second": 2.178, "step": 83000 }, { "epoch": 2.2145826670930604, "grad_norm": 10695.716796875, "learning_rate": 1.5787156760171348e-05, "loss": 0.1714, "step": 83100 }, { "epoch": 2.2172476281846283, "grad_norm": 63246.39453125, "learning_rate": 1.5767416151765798e-05, "loss": 0.1793, "step": 83200 }, { "epoch": 2.2199125892761966, "grad_norm": 1381.2412109375, "learning_rate": 1.5747675543360248e-05, "loss": 0.154, "step": 83300 }, { "epoch": 2.2225775503677645, "grad_norm": 31067.8828125, "learning_rate": 1.5727934934954695e-05, "loss": 0.151, "step": 83400 }, { "epoch": 2.225242511459333, "grad_norm": 33396.78125, "learning_rate": 1.5708194326549148e-05, "loss": 0.1841, "step": 83500 }, { "epoch": 2.225242511459333, "eval_dev_accuracy": 0.9747902327229413, "eval_dev_accuracy_threshold": 0.9669053554534912, "eval_dev_average_precision": 0.8854411022874333, "eval_dev_f1": 0.8149101635827299, "eval_dev_f1_threshold": 0.9264481067657471, "eval_dev_precision": 0.7981718848497583, "eval_dev_recall": 0.8323655089295496, "eval_loss": 0.20306049287319183, "eval_runtime": 951.0541, "eval_samples_per_second": 139.474, "eval_steps_per_second": 2.18, "step": 83500 }, { "epoch": 2.2279074725509007, "grad_norm": 1647.4901123046875, "learning_rate": 1.5688453718143595e-05, "loss": 0.1709, "step": 83600 }, { "epoch": 2.230572433642469, "grad_norm": 310.0802307128906, "learning_rate": 1.566871310973804e-05, "loss": 0.1875, "step": 83700 }, { "epoch": 2.233237394734037, "grad_norm": 14275.015625, "learning_rate": 1.5648972501332495e-05, "loss": 0.2041, "step": 83800 }, { "epoch": 2.235902355825605, "grad_norm": 28323.603515625, "learning_rate": 1.562923189292694e-05, "loss": 0.1812, "step": 83900 }, { "epoch": 2.238567316917173, "grad_norm": 25161.5546875, "learning_rate": 1.5609491284521388e-05, "loss": 0.1779, "step": 84000 }, { "epoch": 2.238567316917173, "eval_dev_accuracy": 0.975137017799121, "eval_dev_accuracy_threshold": 0.9797601699829102, "eval_dev_average_precision": 0.8889116324411686, "eval_dev_f1": 0.8158041179744018, "eval_dev_f1_threshold": 0.9772592782974243, "eval_dev_precision": 0.8289042180255569, "eval_dev_recall": 0.8031116467623535, "eval_loss": 0.2351406365633011, "eval_runtime": 953.0513, "eval_samples_per_second": 139.181, "eval_steps_per_second": 2.175, "step": 84000 }, { "epoch": 2.241232278008741, "grad_norm": 1789.634033203125, "learning_rate": 1.558975067611584e-05, "loss": 0.224, "step": 84100 }, { "epoch": 2.243897239100309, "grad_norm": 5931.00048828125, "learning_rate": 1.5570010067710288e-05, "loss": 0.1624, "step": 84200 }, { "epoch": 2.2465622001918772, "grad_norm": 18578.33203125, "learning_rate": 1.5550269459304738e-05, "loss": 0.1361, "step": 84300 }, { "epoch": 2.249227161283445, "grad_norm": 1247.7115478515625, "learning_rate": 1.5530528850899188e-05, "loss": 0.1371, "step": 84400 }, { "epoch": 2.2518921223750135, "grad_norm": 713.0791625976562, "learning_rate": 1.5510788242493635e-05, "loss": 0.2314, "step": 84500 }, { "epoch": 2.2518921223750135, "eval_dev_accuracy": 0.974737461080914, "eval_dev_accuracy_threshold": 0.9574118256568909, "eval_dev_average_precision": 0.8861569751582977, "eval_dev_f1": 0.8148996509598603, "eval_dev_f1_threshold": 0.832693338394165, "eval_dev_precision": 0.8112715821478987, "eval_dev_recall": 0.8185603155472773, "eval_loss": 0.2932807505130768, "eval_runtime": 951.2089, "eval_samples_per_second": 139.451, "eval_steps_per_second": 2.179, "step": 84500 }, { "epoch": 2.2545570834665813, "grad_norm": 2987.9375, "learning_rate": 1.5491047634088085e-05, "loss": 0.1889, "step": 84600 }, { "epoch": 2.2572220445581497, "grad_norm": 2279.125, "learning_rate": 1.5471307025682535e-05, "loss": 0.2079, "step": 84700 }, { "epoch": 2.2598870056497176, "grad_norm": 1106.462890625, "learning_rate": 1.545156641727698e-05, "loss": 0.1783, "step": 84800 }, { "epoch": 2.2625519667412854, "grad_norm": 7212.99560546875, "learning_rate": 1.543182580887143e-05, "loss": 0.1551, "step": 84900 }, { "epoch": 2.2652169278328538, "grad_norm": 22761.849609375, "learning_rate": 1.541208520046588e-05, "loss": 0.1606, "step": 85000 }, { "epoch": 2.2652169278328538, "eval_dev_accuracy": 0.9748203879469569, "eval_dev_accuracy_threshold": 0.9453166723251343, "eval_dev_average_precision": 0.8915161607864528, "eval_dev_f1": 0.8188866156993647, "eval_dev_f1_threshold": 0.8840415477752686, "eval_dev_precision": 0.8050174658621785, "eval_dev_recall": 0.8332420291442971, "eval_loss": 0.2096114605665207, "eval_runtime": 952.4626, "eval_samples_per_second": 139.267, "eval_steps_per_second": 2.176, "step": 85000 }, { "epoch": 2.2678818889244217, "grad_norm": 661.853271484375, "learning_rate": 1.5392344592060328e-05, "loss": 0.1887, "step": 85100 }, { "epoch": 2.2705468500159895, "grad_norm": 26199.923828125, "learning_rate": 1.5372603983654778e-05, "loss": 0.1829, "step": 85200 }, { "epoch": 2.273211811107558, "grad_norm": 11920.501953125, "learning_rate": 1.5352863375249228e-05, "loss": 0.18, "step": 85300 }, { "epoch": 2.2758767721991258, "grad_norm": 13859.2724609375, "learning_rate": 1.5333122766843675e-05, "loss": 0.1935, "step": 85400 }, { "epoch": 2.278541733290694, "grad_norm": 476.45367431640625, "learning_rate": 1.5313382158438125e-05, "loss": 0.1934, "step": 85500 }, { "epoch": 2.278541733290694, "eval_dev_accuracy": 0.9749636252610312, "eval_dev_accuracy_threshold": 0.9518921375274658, "eval_dev_average_precision": 0.8866670870419442, "eval_dev_f1": 0.8237035470740602, "eval_dev_f1_threshold": 0.6820048093795776, "eval_dev_precision": 0.7958120531154239, "eval_dev_recall": 0.8536211241371754, "eval_loss": 0.23012706637382507, "eval_runtime": 952.604, "eval_samples_per_second": 139.247, "eval_steps_per_second": 2.176, "step": 85500 } ], "logging_steps": 100, "max_steps": 112572, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }