diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4171 @@ +{ + "best_global_step": 55000, + "best_metric": 0.8768783517240833, + "best_model_checkpoint": "./lang-ner-xlmr/checkpoint-55000", + "epoch": 2.0, + "eval_steps": 2500, + "global_step": 55278, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036180759072325336, + "grad_norm": 5.75448751449585, + "learning_rate": 4.9910452621295995e-05, + "loss": 4.179392395019531, + "step": 100 + }, + { + "epoch": 0.007236151814465067, + "grad_norm": 2.6520659923553467, + "learning_rate": 4.9820000723615186e-05, + "loss": 0.6058632278442383, + "step": 200 + }, + { + "epoch": 0.010854227721697602, + "grad_norm": 3.474226951599121, + "learning_rate": 4.972954882593437e-05, + "loss": 0.3028737449645996, + "step": 300 + }, + { + "epoch": 0.014472303628930134, + "grad_norm": 1.4948221445083618, + "learning_rate": 4.963909692825356e-05, + "loss": 0.18973339080810547, + "step": 400 + }, + { + "epoch": 0.01809037953616267, + "grad_norm": 1.389740228652954, + "learning_rate": 4.9548645030572745e-05, + "loss": 0.15398676872253417, + "step": 500 + }, + { + "epoch": 0.021708455443395204, + "grad_norm": 1.4510504007339478, + "learning_rate": 4.945819313289193e-05, + "loss": 0.13108017921447754, + "step": 600 + }, + { + "epoch": 0.025326531350627735, + "grad_norm": 1.4420865774154663, + "learning_rate": 4.936774123521112e-05, + "loss": 0.12688090324401854, + "step": 700 + }, + { + "epoch": 0.02894460725786027, + "grad_norm": 0.9447225332260132, + "learning_rate": 4.92772893375303e-05, + "loss": 0.11376466751098632, + "step": 800 + }, + { + "epoch": 0.0325626831650928, + "grad_norm": 1.9140123128890991, + "learning_rate": 4.9186837439849494e-05, + "loss": 0.10734249114990234, + "step": 900 + }, + { + "epoch": 0.03618075907232534, + "grad_norm": 1.2182528972625732, + "learning_rate": 4.909638554216868e-05, + "loss": 0.09950636863708497, + "step": 1000 + }, + { + "epoch": 0.03979883497955787, + "grad_norm": 1.5587440729141235, + "learning_rate": 4.900593364448786e-05, + "loss": 0.08896804809570312, + "step": 1100 + }, + { + "epoch": 0.04341691088679041, + "grad_norm": 2.021667242050171, + "learning_rate": 4.891548174680705e-05, + "loss": 0.09553884506225586, + "step": 1200 + }, + { + "epoch": 0.04703498679402294, + "grad_norm": 3.561288595199585, + "learning_rate": 4.882502984912624e-05, + "loss": 0.0916118335723877, + "step": 1300 + }, + { + "epoch": 0.05065306270125547, + "grad_norm": 2.239180088043213, + "learning_rate": 4.873457795144543e-05, + "loss": 0.08524966239929199, + "step": 1400 + }, + { + "epoch": 0.054271138608488007, + "grad_norm": 1.880850076675415, + "learning_rate": 4.864412605376461e-05, + "loss": 0.08407029151916504, + "step": 1500 + }, + { + "epoch": 0.05788921451572054, + "grad_norm": 2.365021228790283, + "learning_rate": 4.8553674156083796e-05, + "loss": 0.09083961486816407, + "step": 1600 + }, + { + "epoch": 0.061507290422953075, + "grad_norm": 1.8810335397720337, + "learning_rate": 4.8463222258402987e-05, + "loss": 0.0841958236694336, + "step": 1700 + }, + { + "epoch": 0.0651253663301856, + "grad_norm": 1.7592241764068604, + "learning_rate": 4.837277036072217e-05, + "loss": 0.08484026908874512, + "step": 1800 + }, + { + "epoch": 0.06874344223741814, + "grad_norm": 1.4012072086334229, + "learning_rate": 4.828231846304136e-05, + "loss": 0.07917069911956787, + "step": 1900 + }, + { + "epoch": 0.07236151814465068, + "grad_norm": 1.6757310628890991, + "learning_rate": 4.8191866565360545e-05, + "loss": 0.0806041145324707, + "step": 2000 + }, + { + "epoch": 0.0759795940518832, + "grad_norm": 0.6598155498504639, + "learning_rate": 4.810141466767973e-05, + "loss": 0.07851210594177246, + "step": 2100 + }, + { + "epoch": 0.07959766995911574, + "grad_norm": 1.5423673391342163, + "learning_rate": 4.801096276999892e-05, + "loss": 0.08287395477294922, + "step": 2200 + }, + { + "epoch": 0.08321574586634828, + "grad_norm": 0.4928501546382904, + "learning_rate": 4.7920510872318104e-05, + "loss": 0.07287377834320069, + "step": 2300 + }, + { + "epoch": 0.08683382177358082, + "grad_norm": 1.8151744604110718, + "learning_rate": 4.7830058974637295e-05, + "loss": 0.06640945911407471, + "step": 2400 + }, + { + "epoch": 0.09045189768081334, + "grad_norm": 1.1932594776153564, + "learning_rate": 4.773960707695648e-05, + "loss": 0.07295094966888428, + "step": 2500 + }, + { + "epoch": 0.09045189768081334, + "eval_accuracy": 0.975962734636331, + "eval_f1": 0.7717093579748968, + "eval_loss": 0.10806787014007568, + "eval_precision": 0.7241184528264584, + "eval_recall": 0.8259959084392468, + "eval_runtime": 117.8075, + "eval_samples_per_second": 169.768, + "eval_steps_per_second": 4.72, + "step": 2500 + }, + { + "epoch": 0.09406997358804588, + "grad_norm": 1.0983343124389648, + "learning_rate": 4.764915517927566e-05, + "loss": 0.06925168514251709, + "step": 2600 + }, + { + "epoch": 0.09768804949527841, + "grad_norm": 0.8816857933998108, + "learning_rate": 4.7558703281594854e-05, + "loss": 0.06958985328674316, + "step": 2700 + }, + { + "epoch": 0.10130612540251094, + "grad_norm": 0.8671173453330994, + "learning_rate": 4.746825138391404e-05, + "loss": 0.07468698024749756, + "step": 2800 + }, + { + "epoch": 0.10492420130974348, + "grad_norm": 0.27838993072509766, + "learning_rate": 4.737779948623322e-05, + "loss": 0.07403119087219238, + "step": 2900 + }, + { + "epoch": 0.10854227721697601, + "grad_norm": 0.4557673931121826, + "learning_rate": 4.728734758855241e-05, + "loss": 0.07262114524841308, + "step": 3000 + }, + { + "epoch": 0.11216035312420855, + "grad_norm": 0.8267778158187866, + "learning_rate": 4.71968956908716e-05, + "loss": 0.07057662963867188, + "step": 3100 + }, + { + "epoch": 0.11577842903144107, + "grad_norm": 1.401780128479004, + "learning_rate": 4.710644379319079e-05, + "loss": 0.06252509117126465, + "step": 3200 + }, + { + "epoch": 0.11939650493867361, + "grad_norm": 1.7423473596572876, + "learning_rate": 4.701599189550997e-05, + "loss": 0.06425057411193848, + "step": 3300 + }, + { + "epoch": 0.12301458084590615, + "grad_norm": 0.7547276616096497, + "learning_rate": 4.6925539997829156e-05, + "loss": 0.06438188076019287, + "step": 3400 + }, + { + "epoch": 0.12663265675313867, + "grad_norm": 0.4259902238845825, + "learning_rate": 4.6835088100148346e-05, + "loss": 0.0666530466079712, + "step": 3500 + }, + { + "epoch": 0.1302507326603712, + "grad_norm": 0.42786452174186707, + "learning_rate": 4.674463620246753e-05, + "loss": 0.05976760864257812, + "step": 3600 + }, + { + "epoch": 0.13386880856760375, + "grad_norm": 1.1275266408920288, + "learning_rate": 4.665418430478672e-05, + "loss": 0.06228343009948731, + "step": 3700 + }, + { + "epoch": 0.13748688447483629, + "grad_norm": 1.345894455909729, + "learning_rate": 4.6563732407105905e-05, + "loss": 0.0695729398727417, + "step": 3800 + }, + { + "epoch": 0.14110496038206882, + "grad_norm": 0.5640186071395874, + "learning_rate": 4.647328050942509e-05, + "loss": 0.06416056156158448, + "step": 3900 + }, + { + "epoch": 0.14472303628930136, + "grad_norm": 1.5667623281478882, + "learning_rate": 4.638282861174428e-05, + "loss": 0.06927279949188232, + "step": 4000 + }, + { + "epoch": 0.14834111219653387, + "grad_norm": 0.4014199674129486, + "learning_rate": 4.6292376714063464e-05, + "loss": 0.060500779151916505, + "step": 4100 + }, + { + "epoch": 0.1519591881037664, + "grad_norm": 0.8349173069000244, + "learning_rate": 4.6201924816382655e-05, + "loss": 0.05734441757202149, + "step": 4200 + }, + { + "epoch": 0.15557726401099894, + "grad_norm": 0.48946359753608704, + "learning_rate": 4.611147291870184e-05, + "loss": 0.0637766456604004, + "step": 4300 + }, + { + "epoch": 0.15919533991823148, + "grad_norm": 0.44791749119758606, + "learning_rate": 4.602102102102102e-05, + "loss": 0.0613397216796875, + "step": 4400 + }, + { + "epoch": 0.16281341582546402, + "grad_norm": 1.0726768970489502, + "learning_rate": 4.5930569123340214e-05, + "loss": 0.07220725536346435, + "step": 4500 + }, + { + "epoch": 0.16643149173269656, + "grad_norm": 0.48238834738731384, + "learning_rate": 4.58401172256594e-05, + "loss": 0.05229937076568603, + "step": 4600 + }, + { + "epoch": 0.1700495676399291, + "grad_norm": 0.4427547752857208, + "learning_rate": 4.574966532797859e-05, + "loss": 0.06027111530303955, + "step": 4700 + }, + { + "epoch": 0.17366764354716163, + "grad_norm": 0.44010627269744873, + "learning_rate": 4.565921343029777e-05, + "loss": 0.06117689609527588, + "step": 4800 + }, + { + "epoch": 0.17728571945439414, + "grad_norm": 0.26065585017204285, + "learning_rate": 4.5568761532616956e-05, + "loss": 0.060817084312438964, + "step": 4900 + }, + { + "epoch": 0.18090379536162668, + "grad_norm": 0.41624584794044495, + "learning_rate": 4.547830963493615e-05, + "loss": 0.06215104579925537, + "step": 5000 + }, + { + "epoch": 0.18090379536162668, + "eval_accuracy": 0.9724426137358435, + "eval_f1": 0.741559979115958, + "eval_loss": 0.12759321928024292, + "eval_precision": 0.6822080909213909, + "eval_recall": 0.8122231350376133, + "eval_runtime": 63.257, + "eval_samples_per_second": 316.17, + "eval_steps_per_second": 8.79, + "step": 5000 + }, + { + "epoch": 0.18452187126885922, + "grad_norm": 1.1262469291687012, + "learning_rate": 4.538785773725533e-05, + "loss": 0.056777148246765136, + "step": 5100 + }, + { + "epoch": 0.18813994717609175, + "grad_norm": 0.44265300035476685, + "learning_rate": 4.5297405839574515e-05, + "loss": 0.05986386775970459, + "step": 5200 + }, + { + "epoch": 0.1917580230833243, + "grad_norm": 0.5468171238899231, + "learning_rate": 4.5206953941893706e-05, + "loss": 0.05671721935272217, + "step": 5300 + }, + { + "epoch": 0.19537609899055683, + "grad_norm": 0.3858329653739929, + "learning_rate": 4.511650204421289e-05, + "loss": 0.05604006290435791, + "step": 5400 + }, + { + "epoch": 0.19899417489778937, + "grad_norm": 1.0813618898391724, + "learning_rate": 4.502605014653208e-05, + "loss": 0.05299887180328369, + "step": 5500 + }, + { + "epoch": 0.20261225080502188, + "grad_norm": 0.7834122776985168, + "learning_rate": 4.4935598248851265e-05, + "loss": 0.0669465970993042, + "step": 5600 + }, + { + "epoch": 0.2062303267122544, + "grad_norm": 0.8666114211082458, + "learning_rate": 4.484514635117045e-05, + "loss": 0.06568387985229492, + "step": 5700 + }, + { + "epoch": 0.20984840261948695, + "grad_norm": 0.7354055643081665, + "learning_rate": 4.475469445348964e-05, + "loss": 0.06354703903198242, + "step": 5800 + }, + { + "epoch": 0.2134664785267195, + "grad_norm": 0.3984626829624176, + "learning_rate": 4.4664242555808824e-05, + "loss": 0.05610593318939209, + "step": 5900 + }, + { + "epoch": 0.21708455443395203, + "grad_norm": 0.5307297110557556, + "learning_rate": 4.4573790658128014e-05, + "loss": 0.058310718536376954, + "step": 6000 + }, + { + "epoch": 0.22070263034118456, + "grad_norm": 0.23685064911842346, + "learning_rate": 4.44833387604472e-05, + "loss": 0.0474505615234375, + "step": 6100 + }, + { + "epoch": 0.2243207062484171, + "grad_norm": 0.6271052360534668, + "learning_rate": 4.439288686276638e-05, + "loss": 0.05871774673461914, + "step": 6200 + }, + { + "epoch": 0.22793878215564964, + "grad_norm": 0.6762889623641968, + "learning_rate": 4.430243496508557e-05, + "loss": 0.05517944812774658, + "step": 6300 + }, + { + "epoch": 0.23155685806288215, + "grad_norm": 0.9603418111801147, + "learning_rate": 4.421198306740476e-05, + "loss": 0.05483291625976563, + "step": 6400 + }, + { + "epoch": 0.23517493397011469, + "grad_norm": 0.6032853126525879, + "learning_rate": 4.412153116972395e-05, + "loss": 0.05903904914855957, + "step": 6500 + }, + { + "epoch": 0.23879300987734722, + "grad_norm": 0.40814077854156494, + "learning_rate": 4.403107927204313e-05, + "loss": 0.05642669677734375, + "step": 6600 + }, + { + "epoch": 0.24241108578457976, + "grad_norm": 0.5799020528793335, + "learning_rate": 4.3940627374362316e-05, + "loss": 0.055092153549194334, + "step": 6700 + }, + { + "epoch": 0.2460291616918123, + "grad_norm": 1.0993859767913818, + "learning_rate": 4.385017547668151e-05, + "loss": 0.054167227745056154, + "step": 6800 + }, + { + "epoch": 0.24964723759904484, + "grad_norm": 1.9801974296569824, + "learning_rate": 4.375972357900069e-05, + "loss": 0.057117671966552735, + "step": 6900 + }, + { + "epoch": 0.25326531350627735, + "grad_norm": 0.4046414792537689, + "learning_rate": 4.366927168131988e-05, + "loss": 0.054672832489013674, + "step": 7000 + }, + { + "epoch": 0.2568833894135099, + "grad_norm": 0.41931968927383423, + "learning_rate": 4.3578819783639066e-05, + "loss": 0.05668231964111328, + "step": 7100 + }, + { + "epoch": 0.2605014653207424, + "grad_norm": 0.5075521469116211, + "learning_rate": 4.348836788595825e-05, + "loss": 0.05900467395782471, + "step": 7200 + }, + { + "epoch": 0.264119541227975, + "grad_norm": 1.0615949630737305, + "learning_rate": 4.339791598827744e-05, + "loss": 0.060022168159484864, + "step": 7300 + }, + { + "epoch": 0.2677376171352075, + "grad_norm": 0.6786783337593079, + "learning_rate": 4.3307464090596625e-05, + "loss": 0.053788251876831054, + "step": 7400 + }, + { + "epoch": 0.27135569304244, + "grad_norm": 0.7518507838249207, + "learning_rate": 4.321701219291581e-05, + "loss": 0.05555037975311279, + "step": 7500 + }, + { + "epoch": 0.27135569304244, + "eval_accuracy": 0.9812751684036897, + "eval_f1": 0.8064070486745359, + "eval_loss": 0.08261791616678238, + "eval_precision": 0.7701385325808107, + "eval_recall": 0.8462604101225857, + "eval_runtime": 62.4561, + "eval_samples_per_second": 320.225, + "eval_steps_per_second": 8.902, + "step": 7500 + }, + { + "epoch": 0.27497376894967257, + "grad_norm": 0.8300764560699463, + "learning_rate": 4.3126560295235e-05, + "loss": 0.051460466384887694, + "step": 7600 + }, + { + "epoch": 0.2785918448569051, + "grad_norm": 1.0100982189178467, + "learning_rate": 4.303610839755418e-05, + "loss": 0.05660095691680908, + "step": 7700 + }, + { + "epoch": 0.28220992076413765, + "grad_norm": 0.5547285676002502, + "learning_rate": 4.2945656499873374e-05, + "loss": 0.05661679267883301, + "step": 7800 + }, + { + "epoch": 0.28582799667137015, + "grad_norm": 0.49258002638816833, + "learning_rate": 4.285520460219256e-05, + "loss": 0.04981692790985107, + "step": 7900 + }, + { + "epoch": 0.2894460725786027, + "grad_norm": 2.1518049240112305, + "learning_rate": 4.276475270451174e-05, + "loss": 0.04876615524291992, + "step": 8000 + }, + { + "epoch": 0.29306414848583523, + "grad_norm": 0.973175048828125, + "learning_rate": 4.267430080683093e-05, + "loss": 0.0555543327331543, + "step": 8100 + }, + { + "epoch": 0.29668222439306774, + "grad_norm": 2.2509944438934326, + "learning_rate": 4.258384890915012e-05, + "loss": 0.05133993148803711, + "step": 8200 + }, + { + "epoch": 0.3003003003003003, + "grad_norm": 1.938225507736206, + "learning_rate": 4.249339701146931e-05, + "loss": 0.05030904769897461, + "step": 8300 + }, + { + "epoch": 0.3039183762075328, + "grad_norm": 0.5656659007072449, + "learning_rate": 4.240294511378849e-05, + "loss": 0.05507714748382568, + "step": 8400 + }, + { + "epoch": 0.3075364521147654, + "grad_norm": 0.7741718888282776, + "learning_rate": 4.2312493216107676e-05, + "loss": 0.05459506511688232, + "step": 8500 + }, + { + "epoch": 0.3111545280219979, + "grad_norm": 0.547379195690155, + "learning_rate": 4.2222041318426867e-05, + "loss": 0.050563540458679196, + "step": 8600 + }, + { + "epoch": 0.31477260392923045, + "grad_norm": 0.5133877396583557, + "learning_rate": 4.213158942074605e-05, + "loss": 0.05503926753997803, + "step": 8700 + }, + { + "epoch": 0.31839067983646296, + "grad_norm": 0.4732136130332947, + "learning_rate": 4.204113752306524e-05, + "loss": 0.04883493423461914, + "step": 8800 + }, + { + "epoch": 0.32200875574369553, + "grad_norm": 0.7309387922286987, + "learning_rate": 4.1950685625384425e-05, + "loss": 0.0464065933227539, + "step": 8900 + }, + { + "epoch": 0.32562683165092804, + "grad_norm": 0.9696952104568481, + "learning_rate": 4.186023372770361e-05, + "loss": 0.05353004455566406, + "step": 9000 + }, + { + "epoch": 0.32924490755816055, + "grad_norm": 0.6350353956222534, + "learning_rate": 4.17697818300228e-05, + "loss": 0.05357151508331299, + "step": 9100 + }, + { + "epoch": 0.3328629834653931, + "grad_norm": 0.5927383899688721, + "learning_rate": 4.1679329932341984e-05, + "loss": 0.0496389102935791, + "step": 9200 + }, + { + "epoch": 0.3364810593726256, + "grad_norm": 0.555016040802002, + "learning_rate": 4.1588878034661175e-05, + "loss": 0.048683485984802245, + "step": 9300 + }, + { + "epoch": 0.3400991352798582, + "grad_norm": 0.33153098821640015, + "learning_rate": 4.149842613698036e-05, + "loss": 0.049552416801452635, + "step": 9400 + }, + { + "epoch": 0.3437172111870907, + "grad_norm": 0.7421421408653259, + "learning_rate": 4.140797423929954e-05, + "loss": 0.050444388389587404, + "step": 9500 + }, + { + "epoch": 0.34733528709432326, + "grad_norm": 0.7501067519187927, + "learning_rate": 4.1317522341618734e-05, + "loss": 0.05306045532226562, + "step": 9600 + }, + { + "epoch": 0.3509533630015558, + "grad_norm": 0.9074022173881531, + "learning_rate": 4.122707044393792e-05, + "loss": 0.04894153594970703, + "step": 9700 + }, + { + "epoch": 0.3545714389087883, + "grad_norm": 0.6082141399383545, + "learning_rate": 4.11366185462571e-05, + "loss": 0.05211612224578857, + "step": 9800 + }, + { + "epoch": 0.35818951481602085, + "grad_norm": 0.6638932824134827, + "learning_rate": 4.104616664857629e-05, + "loss": 0.05089833736419678, + "step": 9900 + }, + { + "epoch": 0.36180759072325336, + "grad_norm": 0.8939893841743469, + "learning_rate": 4.095571475089548e-05, + "loss": 0.05038036823272705, + "step": 10000 + }, + { + "epoch": 0.36180759072325336, + "eval_accuracy": 0.9821651815196725, + "eval_f1": 0.8226399325197526, + "eval_loss": 0.07629744708538055, + "eval_precision": 0.7916120576671035, + "eval_recall": 0.8561993588814253, + "eval_runtime": 62.5369, + "eval_samples_per_second": 319.811, + "eval_steps_per_second": 8.891, + "step": 10000 + }, + { + "epoch": 0.3654256666304859, + "grad_norm": 0.3776226043701172, + "learning_rate": 4.086526285321467e-05, + "loss": 0.05038893222808838, + "step": 10100 + }, + { + "epoch": 0.36904374253771843, + "grad_norm": 0.29007160663604736, + "learning_rate": 4.077481095553385e-05, + "loss": 0.05022284507751465, + "step": 10200 + }, + { + "epoch": 0.372661818444951, + "grad_norm": 0.2021007239818573, + "learning_rate": 4.0684359057853036e-05, + "loss": 0.049036202430725095, + "step": 10300 + }, + { + "epoch": 0.3762798943521835, + "grad_norm": 0.2728661894798279, + "learning_rate": 4.0593907160172226e-05, + "loss": 0.05147543907165528, + "step": 10400 + }, + { + "epoch": 0.379897970259416, + "grad_norm": 0.6017497181892395, + "learning_rate": 4.050345526249141e-05, + "loss": 0.052560653686523434, + "step": 10500 + }, + { + "epoch": 0.3835160461666486, + "grad_norm": 0.5500878095626831, + "learning_rate": 4.0413003364810594e-05, + "loss": 0.0445310115814209, + "step": 10600 + }, + { + "epoch": 0.3871341220738811, + "grad_norm": 1.6260461807250977, + "learning_rate": 4.0322551467129785e-05, + "loss": 0.04827467441558838, + "step": 10700 + }, + { + "epoch": 0.39075219798111366, + "grad_norm": 1.0797089338302612, + "learning_rate": 4.023209956944897e-05, + "loss": 0.0508196496963501, + "step": 10800 + }, + { + "epoch": 0.39437027388834617, + "grad_norm": 0.33457517623901367, + "learning_rate": 4.014164767176816e-05, + "loss": 0.04953153133392334, + "step": 10900 + }, + { + "epoch": 0.39798834979557873, + "grad_norm": 0.5582904815673828, + "learning_rate": 4.0051195774087344e-05, + "loss": 0.04928678035736084, + "step": 11000 + }, + { + "epoch": 0.40160642570281124, + "grad_norm": 0.21949921548366547, + "learning_rate": 3.996074387640653e-05, + "loss": 0.05192047119140625, + "step": 11100 + }, + { + "epoch": 0.40522450161004375, + "grad_norm": 0.7574787139892578, + "learning_rate": 3.987029197872572e-05, + "loss": 0.049414234161376955, + "step": 11200 + }, + { + "epoch": 0.4088425775172763, + "grad_norm": 1.8344570398330688, + "learning_rate": 3.97798400810449e-05, + "loss": 0.05043137550354004, + "step": 11300 + }, + { + "epoch": 0.4124606534245088, + "grad_norm": 0.618725061416626, + "learning_rate": 3.968938818336409e-05, + "loss": 0.04852957248687744, + "step": 11400 + }, + { + "epoch": 0.4160787293317414, + "grad_norm": 0.6515002250671387, + "learning_rate": 3.959893628568328e-05, + "loss": 0.051465816497802734, + "step": 11500 + }, + { + "epoch": 0.4196968052389739, + "grad_norm": 0.6772841215133667, + "learning_rate": 3.950848438800246e-05, + "loss": 0.05751809120178222, + "step": 11600 + }, + { + "epoch": 0.42331488114620647, + "grad_norm": 0.3189091384410858, + "learning_rate": 3.941803249032165e-05, + "loss": 0.047155842781066895, + "step": 11700 + }, + { + "epoch": 0.426932957053439, + "grad_norm": 0.2367490977048874, + "learning_rate": 3.9327580592640836e-05, + "loss": 0.043431487083435055, + "step": 11800 + }, + { + "epoch": 0.43055103296067154, + "grad_norm": 0.38205036520957947, + "learning_rate": 3.923712869496002e-05, + "loss": 0.04606367588043213, + "step": 11900 + }, + { + "epoch": 0.43416910886790405, + "grad_norm": 0.539438009262085, + "learning_rate": 3.914667679727921e-05, + "loss": 0.04509395122528076, + "step": 12000 + }, + { + "epoch": 0.43778718477513656, + "grad_norm": 1.1849830150604248, + "learning_rate": 3.9056224899598395e-05, + "loss": 0.045330324172973634, + "step": 12100 + }, + { + "epoch": 0.4414052606823691, + "grad_norm": 0.6970862746238708, + "learning_rate": 3.896577300191758e-05, + "loss": 0.04937627792358398, + "step": 12200 + }, + { + "epoch": 0.44502333658960164, + "grad_norm": 0.3145708739757538, + "learning_rate": 3.887532110423677e-05, + "loss": 0.04958348274230957, + "step": 12300 + }, + { + "epoch": 0.4486414124968342, + "grad_norm": 1.822594404220581, + "learning_rate": 3.8784869206555954e-05, + "loss": 0.05177441120147705, + "step": 12400 + }, + { + "epoch": 0.4522594884040667, + "grad_norm": 0.3980540335178375, + "learning_rate": 3.8694417308875145e-05, + "loss": 0.04803945064544678, + "step": 12500 + }, + { + "epoch": 0.4522594884040667, + "eval_accuracy": 0.9839402163062075, + "eval_f1": 0.8303541577576488, + "eval_loss": 0.07028726488351822, + "eval_precision": 0.8025429842491283, + "eval_recall": 0.8601620515794391, + "eval_runtime": 61.9616, + "eval_samples_per_second": 322.781, + "eval_steps_per_second": 8.973, + "step": 12500 + }, + { + "epoch": 0.4558775643112993, + "grad_norm": 2.3516685962677, + "learning_rate": 3.860396541119433e-05, + "loss": 0.04993240833282471, + "step": 12600 + }, + { + "epoch": 0.4594956402185318, + "grad_norm": 0.9219645857810974, + "learning_rate": 3.851351351351351e-05, + "loss": 0.04464954853057861, + "step": 12700 + }, + { + "epoch": 0.4631137161257643, + "grad_norm": 0.7087405920028687, + "learning_rate": 3.8423061615832704e-05, + "loss": 0.041380634307861326, + "step": 12800 + }, + { + "epoch": 0.46673179203299686, + "grad_norm": 0.3233760893344879, + "learning_rate": 3.833260971815189e-05, + "loss": 0.05234696865081787, + "step": 12900 + }, + { + "epoch": 0.47034986794022937, + "grad_norm": 0.31167057156562805, + "learning_rate": 3.824215782047107e-05, + "loss": 0.04531662464141846, + "step": 13000 + }, + { + "epoch": 0.47396794384746194, + "grad_norm": 0.9034203886985779, + "learning_rate": 3.815170592279026e-05, + "loss": 0.04655809879302979, + "step": 13100 + }, + { + "epoch": 0.47758601975469445, + "grad_norm": 0.3943072259426117, + "learning_rate": 3.8061254025109447e-05, + "loss": 0.0500339937210083, + "step": 13200 + }, + { + "epoch": 0.481204095661927, + "grad_norm": 0.9143586158752441, + "learning_rate": 3.797080212742864e-05, + "loss": 0.04793615818023682, + "step": 13300 + }, + { + "epoch": 0.4848221715691595, + "grad_norm": 1.2170947790145874, + "learning_rate": 3.788035022974782e-05, + "loss": 0.04486670970916748, + "step": 13400 + }, + { + "epoch": 0.48844024747639203, + "grad_norm": 0.4851992130279541, + "learning_rate": 3.7789898332067005e-05, + "loss": 0.0455370569229126, + "step": 13500 + }, + { + "epoch": 0.4920583233836246, + "grad_norm": 0.3209129273891449, + "learning_rate": 3.7699446434386196e-05, + "loss": 0.04612759113311768, + "step": 13600 + }, + { + "epoch": 0.4956763992908571, + "grad_norm": 0.6042996644973755, + "learning_rate": 3.760899453670538e-05, + "loss": 0.04637802600860596, + "step": 13700 + }, + { + "epoch": 0.49929447519808967, + "grad_norm": 0.422635018825531, + "learning_rate": 3.751854263902457e-05, + "loss": 0.050551199913024904, + "step": 13800 + }, + { + "epoch": 0.5029125511053222, + "grad_norm": 0.9524370431900024, + "learning_rate": 3.7428090741343755e-05, + "loss": 0.04804905891418457, + "step": 13900 + }, + { + "epoch": 0.5065306270125547, + "grad_norm": 0.8618633151054382, + "learning_rate": 3.733763884366294e-05, + "loss": 0.0453568172454834, + "step": 14000 + }, + { + "epoch": 0.5101487029197873, + "grad_norm": 0.8186506032943726, + "learning_rate": 3.724718694598213e-05, + "loss": 0.04810242176055908, + "step": 14100 + }, + { + "epoch": 0.5137667788270198, + "grad_norm": 0.4649534225463867, + "learning_rate": 3.7156735048301314e-05, + "loss": 0.041149930953979494, + "step": 14200 + }, + { + "epoch": 0.5173848547342523, + "grad_norm": 1.2224235534667969, + "learning_rate": 3.70662831506205e-05, + "loss": 0.0440573263168335, + "step": 14300 + }, + { + "epoch": 0.5210029306414848, + "grad_norm": 1.2368969917297363, + "learning_rate": 3.697583125293969e-05, + "loss": 0.045858840942382816, + "step": 14400 + }, + { + "epoch": 0.5246210065487174, + "grad_norm": 1.4308712482452393, + "learning_rate": 3.688537935525887e-05, + "loss": 0.0431610631942749, + "step": 14500 + }, + { + "epoch": 0.52823908245595, + "grad_norm": 1.7747290134429932, + "learning_rate": 3.6794927457578063e-05, + "loss": 0.04555936813354492, + "step": 14600 + }, + { + "epoch": 0.5318571583631825, + "grad_norm": 0.6626078486442566, + "learning_rate": 3.670447555989725e-05, + "loss": 0.04809264183044434, + "step": 14700 + }, + { + "epoch": 0.535475234270415, + "grad_norm": 0.49305254220962524, + "learning_rate": 3.661402366221643e-05, + "loss": 0.044796910285949704, + "step": 14800 + }, + { + "epoch": 0.5390933101776475, + "grad_norm": 0.5383502840995789, + "learning_rate": 3.652357176453562e-05, + "loss": 0.04197264194488525, + "step": 14900 + }, + { + "epoch": 0.54271138608488, + "grad_norm": 0.9339898824691772, + "learning_rate": 3.6433119866854806e-05, + "loss": 0.04077723026275635, + "step": 15000 + }, + { + "epoch": 0.54271138608488, + "eval_accuracy": 0.9837071542003397, + "eval_f1": 0.8344733667950663, + "eval_loss": 0.0750078409910202, + "eval_precision": 0.8071688796555565, + "eval_recall": 0.8636898145910855, + "eval_runtime": 62.6857, + "eval_samples_per_second": 319.052, + "eval_steps_per_second": 8.87, + "step": 15000 + }, + { + "epoch": 0.5463294619921126, + "grad_norm": 0.7692775130271912, + "learning_rate": 3.634266796917399e-05, + "loss": 0.04739581108093262, + "step": 15100 + }, + { + "epoch": 0.5499475378993451, + "grad_norm": 1.047753095626831, + "learning_rate": 3.625221607149318e-05, + "loss": 0.04375821590423584, + "step": 15200 + }, + { + "epoch": 0.5535656138065776, + "grad_norm": 0.9720122218132019, + "learning_rate": 3.6161764173812365e-05, + "loss": 0.0421258020401001, + "step": 15300 + }, + { + "epoch": 0.5571836897138102, + "grad_norm": 0.3475571274757385, + "learning_rate": 3.6071312276131556e-05, + "loss": 0.04756541728973389, + "step": 15400 + }, + { + "epoch": 0.5608017656210428, + "grad_norm": 0.8692478537559509, + "learning_rate": 3.598086037845074e-05, + "loss": 0.04661733150482178, + "step": 15500 + }, + { + "epoch": 0.5644198415282753, + "grad_norm": 1.0307046175003052, + "learning_rate": 3.5890408480769924e-05, + "loss": 0.044859604835510256, + "step": 15600 + }, + { + "epoch": 0.5680379174355078, + "grad_norm": 0.654683530330658, + "learning_rate": 3.5799956583089115e-05, + "loss": 0.04575653076171875, + "step": 15700 + }, + { + "epoch": 0.5716559933427403, + "grad_norm": 2.222489356994629, + "learning_rate": 3.57095046854083e-05, + "loss": 0.04321366310119629, + "step": 15800 + }, + { + "epoch": 0.5752740692499728, + "grad_norm": 1.1416321992874146, + "learning_rate": 3.561905278772748e-05, + "loss": 0.043632102012634275, + "step": 15900 + }, + { + "epoch": 0.5788921451572054, + "grad_norm": 1.0366028547286987, + "learning_rate": 3.5528600890046673e-05, + "loss": 0.04524300575256348, + "step": 16000 + }, + { + "epoch": 0.582510221064438, + "grad_norm": 0.7538347840309143, + "learning_rate": 3.543814899236586e-05, + "loss": 0.04251582622528076, + "step": 16100 + }, + { + "epoch": 0.5861282969716705, + "grad_norm": 0.2561816871166229, + "learning_rate": 3.534769709468505e-05, + "loss": 0.04683804512023926, + "step": 16200 + }, + { + "epoch": 0.589746372878903, + "grad_norm": 0.9383835196495056, + "learning_rate": 3.525724519700423e-05, + "loss": 0.0412297248840332, + "step": 16300 + }, + { + "epoch": 0.5933644487861355, + "grad_norm": 0.5518015623092651, + "learning_rate": 3.5166793299323416e-05, + "loss": 0.0455796480178833, + "step": 16400 + }, + { + "epoch": 0.5969825246933681, + "grad_norm": 0.5094241499900818, + "learning_rate": 3.507634140164261e-05, + "loss": 0.04736936569213867, + "step": 16500 + }, + { + "epoch": 0.6006006006006006, + "grad_norm": 0.2816466987133026, + "learning_rate": 3.498588950396179e-05, + "loss": 0.042105512619018556, + "step": 16600 + }, + { + "epoch": 0.6042186765078331, + "grad_norm": 0.4187323749065399, + "learning_rate": 3.489543760628098e-05, + "loss": 0.044366950988769534, + "step": 16700 + }, + { + "epoch": 0.6078367524150656, + "grad_norm": 0.28667891025543213, + "learning_rate": 3.4804985708600166e-05, + "loss": 0.03723037719726562, + "step": 16800 + }, + { + "epoch": 0.6114548283222982, + "grad_norm": 0.3902330994606018, + "learning_rate": 3.471453381091935e-05, + "loss": 0.042644596099853514, + "step": 16900 + }, + { + "epoch": 0.6150729042295308, + "grad_norm": 0.465101033449173, + "learning_rate": 3.462408191323854e-05, + "loss": 0.04263707160949707, + "step": 17000 + }, + { + "epoch": 0.6186909801367633, + "grad_norm": 1.1710171699523926, + "learning_rate": 3.4533630015557725e-05, + "loss": 0.044122686386108396, + "step": 17100 + }, + { + "epoch": 0.6223090560439958, + "grad_norm": 0.4717200696468353, + "learning_rate": 3.444317811787691e-05, + "loss": 0.042054853439331054, + "step": 17200 + }, + { + "epoch": 0.6259271319512283, + "grad_norm": 0.18602319061756134, + "learning_rate": 3.43527262201961e-05, + "loss": 0.03980276823043823, + "step": 17300 + }, + { + "epoch": 0.6295452078584609, + "grad_norm": 2.258084535598755, + "learning_rate": 3.4262274322515284e-05, + "loss": 0.043924779891967775, + "step": 17400 + }, + { + "epoch": 0.6331632837656934, + "grad_norm": 0.5568512082099915, + "learning_rate": 3.4171822424834474e-05, + "loss": 0.04432165145874024, + "step": 17500 + }, + { + "epoch": 0.6331632837656934, + "eval_accuracy": 0.9848981898715126, + "eval_f1": 0.8395063656955402, + "eval_loss": 0.06519697606563568, + "eval_precision": 0.8148625494685449, + "eval_recall": 0.8656872694469949, + "eval_runtime": 61.9341, + "eval_samples_per_second": 322.924, + "eval_steps_per_second": 8.977, + "step": 17500 + }, + { + "epoch": 0.6367813596729259, + "grad_norm": 0.302276611328125, + "learning_rate": 3.408137052715366e-05, + "loss": 0.04175849914550781, + "step": 17600 + }, + { + "epoch": 0.6403994355801584, + "grad_norm": 0.20687709748744965, + "learning_rate": 3.399091862947284e-05, + "loss": 0.042713408470153806, + "step": 17700 + }, + { + "epoch": 0.6440175114873911, + "grad_norm": 0.5285593271255493, + "learning_rate": 3.390046673179203e-05, + "loss": 0.041079201698303223, + "step": 17800 + }, + { + "epoch": 0.6476355873946236, + "grad_norm": 0.359951913356781, + "learning_rate": 3.381001483411122e-05, + "loss": 0.047190561294555664, + "step": 17900 + }, + { + "epoch": 0.6512536633018561, + "grad_norm": 0.5516379475593567, + "learning_rate": 3.371956293643041e-05, + "loss": 0.049062256813049314, + "step": 18000 + }, + { + "epoch": 0.6548717392090886, + "grad_norm": 0.2408919632434845, + "learning_rate": 3.362911103874959e-05, + "loss": 0.041800622940063474, + "step": 18100 + }, + { + "epoch": 0.6584898151163211, + "grad_norm": 0.5572479963302612, + "learning_rate": 3.3538659141068776e-05, + "loss": 0.04303212165832519, + "step": 18200 + }, + { + "epoch": 0.6621078910235537, + "grad_norm": 1.1610311269760132, + "learning_rate": 3.344820724338797e-05, + "loss": 0.04213200092315674, + "step": 18300 + }, + { + "epoch": 0.6657259669307862, + "grad_norm": 0.945891797542572, + "learning_rate": 3.335775534570715e-05, + "loss": 0.0419348955154419, + "step": 18400 + }, + { + "epoch": 0.6693440428380187, + "grad_norm": 0.40828007459640503, + "learning_rate": 3.326730344802634e-05, + "loss": 0.039156782627105716, + "step": 18500 + }, + { + "epoch": 0.6729621187452512, + "grad_norm": 2.0386905670166016, + "learning_rate": 3.3176851550345526e-05, + "loss": 0.042091598510742186, + "step": 18600 + }, + { + "epoch": 0.6765801946524838, + "grad_norm": 2.043750762939453, + "learning_rate": 3.308639965266471e-05, + "loss": 0.04341127872467041, + "step": 18700 + }, + { + "epoch": 0.6801982705597164, + "grad_norm": 1.103946328163147, + "learning_rate": 3.29959477549839e-05, + "loss": 0.04109795570373535, + "step": 18800 + }, + { + "epoch": 0.6838163464669489, + "grad_norm": 1.6356172561645508, + "learning_rate": 3.2905495857303084e-05, + "loss": 0.04152417182922363, + "step": 18900 + }, + { + "epoch": 0.6874344223741814, + "grad_norm": 0.5166067481040955, + "learning_rate": 3.2815043959622275e-05, + "loss": 0.03941408634185791, + "step": 19000 + }, + { + "epoch": 0.6910524982814139, + "grad_norm": 0.341791570186615, + "learning_rate": 3.272459206194146e-05, + "loss": 0.04008223056793213, + "step": 19100 + }, + { + "epoch": 0.6946705741886465, + "grad_norm": 0.2977801263332367, + "learning_rate": 3.263414016426064e-05, + "loss": 0.046716113090515134, + "step": 19200 + }, + { + "epoch": 0.698288650095879, + "grad_norm": 1.640602707862854, + "learning_rate": 3.2543688266579834e-05, + "loss": 0.043398504257202146, + "step": 19300 + }, + { + "epoch": 0.7019067260031115, + "grad_norm": 0.3690544366836548, + "learning_rate": 3.245323636889902e-05, + "loss": 0.03948961734771728, + "step": 19400 + }, + { + "epoch": 0.7055248019103441, + "grad_norm": 2.460749387741089, + "learning_rate": 3.236278447121821e-05, + "loss": 0.04185768127441406, + "step": 19500 + }, + { + "epoch": 0.7091428778175766, + "grad_norm": 0.5380750894546509, + "learning_rate": 3.227233257353739e-05, + "loss": 0.040400395393371584, + "step": 19600 + }, + { + "epoch": 0.7127609537248092, + "grad_norm": 0.44135797023773193, + "learning_rate": 3.218188067585658e-05, + "loss": 0.04154191017150879, + "step": 19700 + }, + { + "epoch": 0.7163790296320417, + "grad_norm": 0.5789956450462341, + "learning_rate": 3.209142877817577e-05, + "loss": 0.0443493127822876, + "step": 19800 + }, + { + "epoch": 0.7199971055392742, + "grad_norm": 0.32769912481307983, + "learning_rate": 3.200097688049495e-05, + "loss": 0.03976017475128174, + "step": 19900 + }, + { + "epoch": 0.7236151814465067, + "grad_norm": 0.6033921837806702, + "learning_rate": 3.1910524982814136e-05, + "loss": 0.04033390522003174, + "step": 20000 + }, + { + "epoch": 0.7236151814465067, + "eval_accuracy": 0.9859394821797719, + "eval_f1": 0.8507431047883741, + "eval_loss": 0.064690500497818, + "eval_precision": 0.8298106965631318, + "eval_recall": 0.8727589039771904, + "eval_runtime": 62.6781, + "eval_samples_per_second": 319.091, + "eval_steps_per_second": 8.871, + "step": 20000 + }, + { + "epoch": 0.7272332573537392, + "grad_norm": 0.21106982231140137, + "learning_rate": 3.1820073085133327e-05, + "loss": 0.0368848705291748, + "step": 20100 + }, + { + "epoch": 0.7308513332609718, + "grad_norm": 0.8279436826705933, + "learning_rate": 3.172962118745251e-05, + "loss": 0.040103306770324705, + "step": 20200 + }, + { + "epoch": 0.7344694091682044, + "grad_norm": 0.21994882822036743, + "learning_rate": 3.16391692897717e-05, + "loss": 0.037559795379638675, + "step": 20300 + }, + { + "epoch": 0.7380874850754369, + "grad_norm": 1.8766059875488281, + "learning_rate": 3.1548717392090885e-05, + "loss": 0.04059103012084961, + "step": 20400 + }, + { + "epoch": 0.7417055609826694, + "grad_norm": 0.6307962536811829, + "learning_rate": 3.145826549441007e-05, + "loss": 0.03980612993240357, + "step": 20500 + }, + { + "epoch": 0.745323636889902, + "grad_norm": 0.33936986327171326, + "learning_rate": 3.136781359672926e-05, + "loss": 0.043472270965576175, + "step": 20600 + }, + { + "epoch": 0.7489417127971345, + "grad_norm": 0.7730916738510132, + "learning_rate": 3.1277361699048444e-05, + "loss": 0.040565075874328616, + "step": 20700 + }, + { + "epoch": 0.752559788704367, + "grad_norm": 0.3246110677719116, + "learning_rate": 3.1186909801367635e-05, + "loss": 0.04017134189605713, + "step": 20800 + }, + { + "epoch": 0.7561778646115995, + "grad_norm": 0.8956949710845947, + "learning_rate": 3.109645790368682e-05, + "loss": 0.04045989513397217, + "step": 20900 + }, + { + "epoch": 0.759795940518832, + "grad_norm": 2.5085365772247314, + "learning_rate": 3.1006006006006e-05, + "loss": 0.0404241943359375, + "step": 21000 + }, + { + "epoch": 0.7634140164260647, + "grad_norm": 0.1668255627155304, + "learning_rate": 3.0915554108325194e-05, + "loss": 0.039553046226501465, + "step": 21100 + }, + { + "epoch": 0.7670320923332972, + "grad_norm": 0.39517688751220703, + "learning_rate": 3.082510221064438e-05, + "loss": 0.04120331764221191, + "step": 21200 + }, + { + "epoch": 0.7706501682405297, + "grad_norm": 0.6607240438461304, + "learning_rate": 3.073465031296357e-05, + "loss": 0.03997873306274414, + "step": 21300 + }, + { + "epoch": 0.7742682441477622, + "grad_norm": 0.44018736481666565, + "learning_rate": 3.064419841528275e-05, + "loss": 0.041695055961608884, + "step": 21400 + }, + { + "epoch": 0.7778863200549948, + "grad_norm": 0.15856041014194489, + "learning_rate": 3.055374651760194e-05, + "loss": 0.04077398300170899, + "step": 21500 + }, + { + "epoch": 0.7815043959622273, + "grad_norm": 0.39261528849601746, + "learning_rate": 3.0463294619921127e-05, + "loss": 0.041572155952453616, + "step": 21600 + }, + { + "epoch": 0.7851224718694598, + "grad_norm": 0.28265002369880676, + "learning_rate": 3.0372842722240315e-05, + "loss": 0.045727620124816896, + "step": 21700 + }, + { + "epoch": 0.7887405477766923, + "grad_norm": 0.6709412336349487, + "learning_rate": 3.0282390824559502e-05, + "loss": 0.04259458065032959, + "step": 21800 + }, + { + "epoch": 0.7923586236839248, + "grad_norm": 0.24202914535999298, + "learning_rate": 3.0191938926878686e-05, + "loss": 0.03839920997619629, + "step": 21900 + }, + { + "epoch": 0.7959766995911575, + "grad_norm": 0.4965508282184601, + "learning_rate": 3.0101487029197874e-05, + "loss": 0.03700316905975342, + "step": 22000 + }, + { + "epoch": 0.79959477549839, + "grad_norm": 0.596442461013794, + "learning_rate": 3.001103513151706e-05, + "loss": 0.04116812229156494, + "step": 22100 + }, + { + "epoch": 0.8032128514056225, + "grad_norm": 0.5273512601852417, + "learning_rate": 2.992058323383625e-05, + "loss": 0.04079509735107422, + "step": 22200 + }, + { + "epoch": 0.806830927312855, + "grad_norm": 0.24124516546726227, + "learning_rate": 2.9830131336155432e-05, + "loss": 0.03795903921127319, + "step": 22300 + }, + { + "epoch": 0.8104490032200875, + "grad_norm": 0.46343305706977844, + "learning_rate": 2.973967943847462e-05, + "loss": 0.038403522968292234, + "step": 22400 + }, + { + "epoch": 0.8140670791273201, + "grad_norm": 0.2311462014913559, + "learning_rate": 2.9649227540793807e-05, + "loss": 0.04132327079772949, + "step": 22500 + }, + { + "epoch": 0.8140670791273201, + "eval_accuracy": 0.9865150342336365, + "eval_f1": 0.8464219002621376, + "eval_loss": 0.05898759886622429, + "eval_precision": 0.8253309864544272, + "eval_recall": 0.8686190177032491, + "eval_runtime": 62.4843, + "eval_samples_per_second": 320.08, + "eval_steps_per_second": 8.898, + "step": 22500 + }, + { + "epoch": 0.8176851550345526, + "grad_norm": 0.6530361175537109, + "learning_rate": 2.9558775643112995e-05, + "loss": 0.04163932323455811, + "step": 22600 + }, + { + "epoch": 0.8213032309417851, + "grad_norm": 1.38533353805542, + "learning_rate": 2.946832374543218e-05, + "loss": 0.03626733779907226, + "step": 22700 + }, + { + "epoch": 0.8249213068490177, + "grad_norm": 1.6181460618972778, + "learning_rate": 2.9377871847751366e-05, + "loss": 0.03692409038543701, + "step": 22800 + }, + { + "epoch": 0.8285393827562503, + "grad_norm": 6.322599411010742, + "learning_rate": 2.9287419950070554e-05, + "loss": 0.03785946369171143, + "step": 22900 + }, + { + "epoch": 0.8321574586634828, + "grad_norm": 0.24266965687274933, + "learning_rate": 2.919696805238974e-05, + "loss": 0.03527719974517822, + "step": 23000 + }, + { + "epoch": 0.8357755345707153, + "grad_norm": 0.41426071524620056, + "learning_rate": 2.910651615470893e-05, + "loss": 0.0348510479927063, + "step": 23100 + }, + { + "epoch": 0.8393936104779478, + "grad_norm": 0.3566010892391205, + "learning_rate": 2.9016064257028112e-05, + "loss": 0.03639560461044312, + "step": 23200 + }, + { + "epoch": 0.8430116863851803, + "grad_norm": 0.14937593042850494, + "learning_rate": 2.89256123593473e-05, + "loss": 0.033641955852508544, + "step": 23300 + }, + { + "epoch": 0.8466297622924129, + "grad_norm": 0.5473237037658691, + "learning_rate": 2.8835160461666487e-05, + "loss": 0.03712946176528931, + "step": 23400 + }, + { + "epoch": 0.8502478381996454, + "grad_norm": 0.3679254949092865, + "learning_rate": 2.874470856398567e-05, + "loss": 0.03785475969314575, + "step": 23500 + }, + { + "epoch": 0.853865914106878, + "grad_norm": 0.20851418375968933, + "learning_rate": 2.8654256666304862e-05, + "loss": 0.04206960201263428, + "step": 23600 + }, + { + "epoch": 0.8574839900141105, + "grad_norm": 0.22139862179756165, + "learning_rate": 2.8563804768624046e-05, + "loss": 0.03989522218704224, + "step": 23700 + }, + { + "epoch": 0.8611020659213431, + "grad_norm": 0.14680643379688263, + "learning_rate": 2.8473352870943233e-05, + "loss": 0.03717276811599732, + "step": 23800 + }, + { + "epoch": 0.8647201418285756, + "grad_norm": 0.2279856950044632, + "learning_rate": 2.838290097326242e-05, + "loss": 0.039047441482543944, + "step": 23900 + }, + { + "epoch": 0.8683382177358081, + "grad_norm": 1.1088160276412964, + "learning_rate": 2.8292449075581605e-05, + "loss": 0.03408738613128662, + "step": 24000 + }, + { + "epoch": 0.8719562936430406, + "grad_norm": 0.8532550930976868, + "learning_rate": 2.8201997177900796e-05, + "loss": 0.036566758155822755, + "step": 24100 + }, + { + "epoch": 0.8755743695502731, + "grad_norm": 0.1683458536863327, + "learning_rate": 2.811154528021998e-05, + "loss": 0.0397763442993164, + "step": 24200 + }, + { + "epoch": 0.8791924454575057, + "grad_norm": 0.3468044102191925, + "learning_rate": 2.8021093382539164e-05, + "loss": 0.036167433261871336, + "step": 24300 + }, + { + "epoch": 0.8828105213647383, + "grad_norm": 1.5043731927871704, + "learning_rate": 2.7930641484858354e-05, + "loss": 0.04083109855651856, + "step": 24400 + }, + { + "epoch": 0.8864285972719708, + "grad_norm": 2.7504560947418213, + "learning_rate": 2.784018958717754e-05, + "loss": 0.039477238655090334, + "step": 24500 + }, + { + "epoch": 0.8900466731792033, + "grad_norm": 0.27413201332092285, + "learning_rate": 2.7749737689496726e-05, + "loss": 0.03859598875045776, + "step": 24600 + }, + { + "epoch": 0.8936647490864358, + "grad_norm": 0.4622710645198822, + "learning_rate": 2.7659285791815913e-05, + "loss": 0.03455983877182007, + "step": 24700 + }, + { + "epoch": 0.8972828249936684, + "grad_norm": 1.0147453546524048, + "learning_rate": 2.7568833894135097e-05, + "loss": 0.03525468587875366, + "step": 24800 + }, + { + "epoch": 0.9009009009009009, + "grad_norm": 0.34606319665908813, + "learning_rate": 2.7478381996454288e-05, + "loss": 0.03580186367034912, + "step": 24900 + }, + { + "epoch": 0.9045189768081334, + "grad_norm": 0.3202800750732422, + "learning_rate": 2.7387930098773472e-05, + "loss": 0.03665663719177246, + "step": 25000 + }, + { + "epoch": 0.9045189768081334, + "eval_accuracy": 0.986656714492393, + "eval_f1": 0.8509657594381035, + "eval_loss": 0.05820872634649277, + "eval_precision": 0.8288109453496006, + "eval_recall": 0.8743375376536349, + "eval_runtime": 62.5862, + "eval_samples_per_second": 319.559, + "eval_steps_per_second": 8.884, + "step": 25000 + }, + { + "epoch": 0.9081370527153659, + "grad_norm": 0.557600736618042, + "learning_rate": 2.7297478201092656e-05, + "loss": 0.03967963457107544, + "step": 25100 + }, + { + "epoch": 0.9117551286225986, + "grad_norm": 0.4092039465904236, + "learning_rate": 2.7207026303411847e-05, + "loss": 0.03797311782836914, + "step": 25200 + }, + { + "epoch": 0.9153732045298311, + "grad_norm": 0.40534520149230957, + "learning_rate": 2.711657440573103e-05, + "loss": 0.036147847175598144, + "step": 25300 + }, + { + "epoch": 0.9189912804370636, + "grad_norm": 0.4325968623161316, + "learning_rate": 2.702612250805022e-05, + "loss": 0.03767855882644653, + "step": 25400 + }, + { + "epoch": 0.9226093563442961, + "grad_norm": 0.25961676239967346, + "learning_rate": 2.6935670610369406e-05, + "loss": 0.03738126039505005, + "step": 25500 + }, + { + "epoch": 0.9262274322515286, + "grad_norm": 0.2495643049478531, + "learning_rate": 2.684521871268859e-05, + "loss": 0.03809333562850952, + "step": 25600 + }, + { + "epoch": 0.9298455081587612, + "grad_norm": 0.20810630917549133, + "learning_rate": 2.675476681500778e-05, + "loss": 0.03803467035293579, + "step": 25700 + }, + { + "epoch": 0.9334635840659937, + "grad_norm": 0.3630845844745636, + "learning_rate": 2.6664314917326964e-05, + "loss": 0.04232705593109131, + "step": 25800 + }, + { + "epoch": 0.9370816599732262, + "grad_norm": 0.6230679154396057, + "learning_rate": 2.6573863019646155e-05, + "loss": 0.03966914892196655, + "step": 25900 + }, + { + "epoch": 0.9406997358804587, + "grad_norm": 0.6846088767051697, + "learning_rate": 2.648341112196534e-05, + "loss": 0.03988933086395264, + "step": 26000 + }, + { + "epoch": 0.9443178117876913, + "grad_norm": 0.29151585698127747, + "learning_rate": 2.6392959224284523e-05, + "loss": 0.036113507747650146, + "step": 26100 + }, + { + "epoch": 0.9479358876949239, + "grad_norm": 0.3652597963809967, + "learning_rate": 2.6302507326603714e-05, + "loss": 0.03595402717590332, + "step": 26200 + }, + { + "epoch": 0.9515539636021564, + "grad_norm": 0.3763394355773926, + "learning_rate": 2.6212055428922898e-05, + "loss": 0.03632761478424072, + "step": 26300 + }, + { + "epoch": 0.9551720395093889, + "grad_norm": 0.16137683391571045, + "learning_rate": 2.612160353124209e-05, + "loss": 0.03010902166366577, + "step": 26400 + }, + { + "epoch": 0.9587901154166214, + "grad_norm": 0.5310078859329224, + "learning_rate": 2.6031151633561273e-05, + "loss": 0.034855997562408446, + "step": 26500 + }, + { + "epoch": 0.962408191323854, + "grad_norm": 0.4904273748397827, + "learning_rate": 2.5940699735880457e-05, + "loss": 0.03756725311279297, + "step": 26600 + }, + { + "epoch": 0.9660262672310865, + "grad_norm": 0.7692480087280273, + "learning_rate": 2.5850247838199648e-05, + "loss": 0.03645958185195923, + "step": 26700 + }, + { + "epoch": 0.969644343138319, + "grad_norm": 0.45624640583992004, + "learning_rate": 2.5759795940518832e-05, + "loss": 0.037951292991638186, + "step": 26800 + }, + { + "epoch": 0.9732624190455516, + "grad_norm": 0.41989752650260925, + "learning_rate": 2.5669344042838023e-05, + "loss": 0.03396618366241455, + "step": 26900 + }, + { + "epoch": 0.9768804949527841, + "grad_norm": 0.5218580961227417, + "learning_rate": 2.5578892145157207e-05, + "loss": 0.034535303115844726, + "step": 27000 + }, + { + "epoch": 0.9804985708600167, + "grad_norm": 0.24635274708271027, + "learning_rate": 2.548844024747639e-05, + "loss": 0.034599866867065426, + "step": 27100 + }, + { + "epoch": 0.9841166467672492, + "grad_norm": 0.8805984258651733, + "learning_rate": 2.539798834979558e-05, + "loss": 0.0382379937171936, + "step": 27200 + }, + { + "epoch": 0.9877347226744817, + "grad_norm": 0.4743868410587311, + "learning_rate": 2.5307536452114765e-05, + "loss": 0.03450409173965454, + "step": 27300 + }, + { + "epoch": 0.9913527985817142, + "grad_norm": 0.4024532735347748, + "learning_rate": 2.521708455443395e-05, + "loss": 0.032371597290039064, + "step": 27400 + }, + { + "epoch": 0.9949708744889468, + "grad_norm": 1.2098551988601685, + "learning_rate": 2.512663265675314e-05, + "loss": 0.03947657585144043, + "step": 27500 + }, + { + "epoch": 0.9949708744889468, + "eval_accuracy": 0.9862055646169487, + "eval_f1": 0.8529879572824359, + "eval_loss": 0.05825402960181236, + "eval_precision": 0.8304042715484363, + "eval_recall": 0.8768343562235217, + "eval_runtime": 62.2283, + "eval_samples_per_second": 321.397, + "eval_steps_per_second": 8.935, + "step": 27500 + }, + { + "epoch": 0.9985889503961793, + "grad_norm": 0.3243059515953064, + "learning_rate": 2.5036180759072324e-05, + "loss": 0.03721761703491211, + "step": 27600 + }, + { + "epoch": 1.0022070263034117, + "grad_norm": 0.5898327231407166, + "learning_rate": 2.494572886139151e-05, + "loss": 0.03310096025466919, + "step": 27700 + }, + { + "epoch": 1.0058251022106444, + "grad_norm": 0.30443838238716125, + "learning_rate": 2.48552769637107e-05, + "loss": 0.033098301887512206, + "step": 27800 + }, + { + "epoch": 1.009443178117877, + "grad_norm": 0.7985163331031799, + "learning_rate": 2.4764825066029886e-05, + "loss": 0.031821844577789304, + "step": 27900 + }, + { + "epoch": 1.0130612540251094, + "grad_norm": 0.6274137496948242, + "learning_rate": 2.4674373168349074e-05, + "loss": 0.03217078447341919, + "step": 28000 + }, + { + "epoch": 1.016679329932342, + "grad_norm": 0.744652271270752, + "learning_rate": 2.4583921270668258e-05, + "loss": 0.030337939262390135, + "step": 28100 + }, + { + "epoch": 1.0202974058395746, + "grad_norm": 0.20680102705955505, + "learning_rate": 2.4493469372987445e-05, + "loss": 0.03135863780975342, + "step": 28200 + }, + { + "epoch": 1.023915481746807, + "grad_norm": 0.5819505453109741, + "learning_rate": 2.4403017475306633e-05, + "loss": 0.030997350215911865, + "step": 28300 + }, + { + "epoch": 1.0275335576540396, + "grad_norm": 0.8105890154838562, + "learning_rate": 2.431256557762582e-05, + "loss": 0.029717042446136474, + "step": 28400 + }, + { + "epoch": 1.031151633561272, + "grad_norm": 0.4248642325401306, + "learning_rate": 2.4222113679945007e-05, + "loss": 0.02956360101699829, + "step": 28500 + }, + { + "epoch": 1.0347697094685047, + "grad_norm": 0.17442703247070312, + "learning_rate": 2.413166178226419e-05, + "loss": 0.03415003776550293, + "step": 28600 + }, + { + "epoch": 1.0383877853757373, + "grad_norm": 0.3765491843223572, + "learning_rate": 2.404120988458338e-05, + "loss": 0.03359386682510376, + "step": 28700 + }, + { + "epoch": 1.0420058612829697, + "grad_norm": 0.2846165895462036, + "learning_rate": 2.3950757986902566e-05, + "loss": 0.03219552993774414, + "step": 28800 + }, + { + "epoch": 1.0456239371902023, + "grad_norm": 0.6828330755233765, + "learning_rate": 2.3860306089221754e-05, + "loss": 0.028468940258026123, + "step": 28900 + }, + { + "epoch": 1.0492420130974347, + "grad_norm": 0.24457824230194092, + "learning_rate": 2.3769854191540938e-05, + "loss": 0.03526209592819214, + "step": 29000 + }, + { + "epoch": 1.0528600890046673, + "grad_norm": 0.4728795886039734, + "learning_rate": 2.3679402293860125e-05, + "loss": 0.027564334869384765, + "step": 29100 + }, + { + "epoch": 1.0564781649119, + "grad_norm": 0.34912073612213135, + "learning_rate": 2.3588950396179312e-05, + "loss": 0.03199338912963867, + "step": 29200 + }, + { + "epoch": 1.0600962408191323, + "grad_norm": 0.7076539993286133, + "learning_rate": 2.34984984984985e-05, + "loss": 0.02838871717453003, + "step": 29300 + }, + { + "epoch": 1.063714316726365, + "grad_norm": 0.22086426615715027, + "learning_rate": 2.3408046600817687e-05, + "loss": 0.03132739543914795, + "step": 29400 + }, + { + "epoch": 1.0673323926335974, + "grad_norm": 0.4026763439178467, + "learning_rate": 2.331759470313687e-05, + "loss": 0.030288333892822265, + "step": 29500 + }, + { + "epoch": 1.07095046854083, + "grad_norm": 0.6986600160598755, + "learning_rate": 2.322714280545606e-05, + "loss": 0.027701468467712403, + "step": 29600 + }, + { + "epoch": 1.0745685444480626, + "grad_norm": 0.3440704047679901, + "learning_rate": 2.3136690907775246e-05, + "loss": 0.03199631690979004, + "step": 29700 + }, + { + "epoch": 1.078186620355295, + "grad_norm": 0.5154510736465454, + "learning_rate": 2.3046239010094434e-05, + "loss": 0.03085195779800415, + "step": 29800 + }, + { + "epoch": 1.0818046962625276, + "grad_norm": 1.2285401821136475, + "learning_rate": 2.295578711241362e-05, + "loss": 0.031190474033355713, + "step": 29900 + }, + { + "epoch": 1.08542277216976, + "grad_norm": 0.3479061722755432, + "learning_rate": 2.2865335214732805e-05, + "loss": 0.03375990152359009, + "step": 30000 + }, + { + "epoch": 1.08542277216976, + "eval_accuracy": 0.9868820974514447, + "eval_f1": 0.8562118190241375, + "eval_loss": 0.05674006789922714, + "eval_precision": 0.8352508617387974, + "eval_recall": 0.8782519048309412, + "eval_runtime": 63.2356, + "eval_samples_per_second": 316.278, + "eval_steps_per_second": 8.793, + "step": 30000 + }, + { + "epoch": 1.0890408480769926, + "grad_norm": 0.18956594169139862, + "learning_rate": 2.2774883317051992e-05, + "loss": 0.027218008041381837, + "step": 30100 + }, + { + "epoch": 1.0926589239842253, + "grad_norm": 0.24030227959156036, + "learning_rate": 2.268443141937118e-05, + "loss": 0.03073176145553589, + "step": 30200 + }, + { + "epoch": 1.0962769998914577, + "grad_norm": 0.1687329262495041, + "learning_rate": 2.2593979521690367e-05, + "loss": 0.033424663543701175, + "step": 30300 + }, + { + "epoch": 1.0998950757986903, + "grad_norm": 1.2173426151275635, + "learning_rate": 2.250352762400955e-05, + "loss": 0.03079766035079956, + "step": 30400 + }, + { + "epoch": 1.103513151705923, + "grad_norm": 0.35310184955596924, + "learning_rate": 2.241307572632874e-05, + "loss": 0.03289975881576538, + "step": 30500 + }, + { + "epoch": 1.1071312276131553, + "grad_norm": 0.14718961715698242, + "learning_rate": 2.2322623828647926e-05, + "loss": 0.03266577005386353, + "step": 30600 + }, + { + "epoch": 1.110749303520388, + "grad_norm": 0.29442161321640015, + "learning_rate": 2.2232171930967113e-05, + "loss": 0.02883612871170044, + "step": 30700 + }, + { + "epoch": 1.1143673794276203, + "grad_norm": 0.36244460940361023, + "learning_rate": 2.21417200332863e-05, + "loss": 0.030666334629058836, + "step": 30800 + }, + { + "epoch": 1.117985455334853, + "grad_norm": 0.2421630471944809, + "learning_rate": 2.2051268135605485e-05, + "loss": 0.02931546211242676, + "step": 30900 + }, + { + "epoch": 1.1216035312420856, + "grad_norm": 0.5055842995643616, + "learning_rate": 2.1960816237924672e-05, + "loss": 0.030934171676635744, + "step": 31000 + }, + { + "epoch": 1.125221607149318, + "grad_norm": 0.27207571268081665, + "learning_rate": 2.187036434024386e-05, + "loss": 0.03155987024307251, + "step": 31100 + }, + { + "epoch": 1.1288396830565506, + "grad_norm": 0.5190430879592896, + "learning_rate": 2.1779912442563047e-05, + "loss": 0.030766298770904543, + "step": 31200 + }, + { + "epoch": 1.132457758963783, + "grad_norm": 0.5578451156616211, + "learning_rate": 2.168946054488223e-05, + "loss": 0.030352199077606203, + "step": 31300 + }, + { + "epoch": 1.1360758348710156, + "grad_norm": 0.775244951248169, + "learning_rate": 2.159900864720142e-05, + "loss": 0.027431459426879884, + "step": 31400 + }, + { + "epoch": 1.1396939107782482, + "grad_norm": 0.17452310025691986, + "learning_rate": 2.1508556749520606e-05, + "loss": 0.02899331569671631, + "step": 31500 + }, + { + "epoch": 1.1433119866854806, + "grad_norm": 1.0152820348739624, + "learning_rate": 2.1418104851839793e-05, + "loss": 0.02969914197921753, + "step": 31600 + }, + { + "epoch": 1.1469300625927132, + "grad_norm": 0.21474546194076538, + "learning_rate": 2.132765295415898e-05, + "loss": 0.03098618268966675, + "step": 31700 + }, + { + "epoch": 1.1505481384999456, + "grad_norm": 0.27076786756515503, + "learning_rate": 2.1237201056478165e-05, + "loss": 0.026145567893981935, + "step": 31800 + }, + { + "epoch": 1.1541662144071783, + "grad_norm": 0.20778276026248932, + "learning_rate": 2.1146749158797352e-05, + "loss": 0.030465993881225586, + "step": 31900 + }, + { + "epoch": 1.1577842903144109, + "grad_norm": 0.2573922276496887, + "learning_rate": 2.105629726111654e-05, + "loss": 0.031988742351531985, + "step": 32000 + }, + { + "epoch": 1.1614023662216433, + "grad_norm": 0.33712247014045715, + "learning_rate": 2.0965845363435727e-05, + "loss": 0.031969892978668216, + "step": 32100 + }, + { + "epoch": 1.165020442128876, + "grad_norm": 0.5677493214607239, + "learning_rate": 2.0875393465754914e-05, + "loss": 0.02892348051071167, + "step": 32200 + }, + { + "epoch": 1.1686385180361083, + "grad_norm": 0.19627009332180023, + "learning_rate": 2.0784941568074098e-05, + "loss": 0.02890573740005493, + "step": 32300 + }, + { + "epoch": 1.172256593943341, + "grad_norm": 0.2041957825422287, + "learning_rate": 2.0694489670393286e-05, + "loss": 0.02606424331665039, + "step": 32400 + }, + { + "epoch": 1.1758746698505735, + "grad_norm": 0.36798298358917236, + "learning_rate": 2.0604037772712473e-05, + "loss": 0.029083385467529296, + "step": 32500 + }, + { + "epoch": 1.1758746698505735, + "eval_accuracy": 0.9877625116339074, + "eval_f1": 0.8611236096967975, + "eval_loss": 0.05370509624481201, + "eval_precision": 0.8443082257515248, + "eval_recall": 0.8786224004896986, + "eval_runtime": 62.1854, + "eval_samples_per_second": 321.619, + "eval_steps_per_second": 8.941, + "step": 32500 + }, + { + "epoch": 1.179492745757806, + "grad_norm": 0.2152443379163742, + "learning_rate": 2.051358587503166e-05, + "loss": 0.028284170627593995, + "step": 32600 + }, + { + "epoch": 1.1831108216650386, + "grad_norm": 0.2933087646961212, + "learning_rate": 2.0423133977350845e-05, + "loss": 0.034238841533660885, + "step": 32700 + }, + { + "epoch": 1.1867288975722712, + "grad_norm": 0.36995938420295715, + "learning_rate": 2.0332682079670032e-05, + "loss": 0.03170938491821289, + "step": 32800 + }, + { + "epoch": 1.1903469734795036, + "grad_norm": 0.7478405833244324, + "learning_rate": 2.024223018198922e-05, + "loss": 0.029751029014587402, + "step": 32900 + }, + { + "epoch": 1.1939650493867362, + "grad_norm": 0.44457152485847473, + "learning_rate": 2.0151778284308407e-05, + "loss": 0.02949444770812988, + "step": 33000 + }, + { + "epoch": 1.1975831252939686, + "grad_norm": 0.4324032664299011, + "learning_rate": 2.0061326386627594e-05, + "loss": 0.030652081966400145, + "step": 33100 + }, + { + "epoch": 1.2012012012012012, + "grad_norm": 1.3409758806228638, + "learning_rate": 1.9970874488946778e-05, + "loss": 0.02934673547744751, + "step": 33200 + }, + { + "epoch": 1.2048192771084336, + "grad_norm": 0.3867700397968292, + "learning_rate": 1.9880422591265966e-05, + "loss": 0.02774231195449829, + "step": 33300 + }, + { + "epoch": 1.2084373530156662, + "grad_norm": 0.1256304383277893, + "learning_rate": 1.9789970693585153e-05, + "loss": 0.030440127849578856, + "step": 33400 + }, + { + "epoch": 1.2120554289228989, + "grad_norm": 0.574845552444458, + "learning_rate": 1.969951879590434e-05, + "loss": 0.030182530879974367, + "step": 33500 + }, + { + "epoch": 1.2156735048301313, + "grad_norm": 0.501304566860199, + "learning_rate": 1.9609066898223528e-05, + "loss": 0.03053757667541504, + "step": 33600 + }, + { + "epoch": 1.2192915807373639, + "grad_norm": 0.1869884878396988, + "learning_rate": 1.9518615000542712e-05, + "loss": 0.02801114559173584, + "step": 33700 + }, + { + "epoch": 1.2229096566445965, + "grad_norm": 0.44489210844039917, + "learning_rate": 1.94281631028619e-05, + "loss": 0.02709296464920044, + "step": 33800 + }, + { + "epoch": 1.226527732551829, + "grad_norm": 0.2928631007671356, + "learning_rate": 1.9337711205181087e-05, + "loss": 0.033639376163482664, + "step": 33900 + }, + { + "epoch": 1.2301458084590615, + "grad_norm": 0.2070285826921463, + "learning_rate": 1.9247259307500274e-05, + "loss": 0.03141526222229004, + "step": 34000 + }, + { + "epoch": 1.233763884366294, + "grad_norm": 0.4693046510219574, + "learning_rate": 1.9156807409819458e-05, + "loss": 0.029341881275177003, + "step": 34100 + }, + { + "epoch": 1.2373819602735265, + "grad_norm": 0.187980055809021, + "learning_rate": 1.9066355512138645e-05, + "loss": 0.033849341869354246, + "step": 34200 + }, + { + "epoch": 1.2410000361807592, + "grad_norm": 0.7411011457443237, + "learning_rate": 1.8975903614457833e-05, + "loss": 0.027842617034912108, + "step": 34300 + }, + { + "epoch": 1.2446181120879916, + "grad_norm": 0.4449065327644348, + "learning_rate": 1.888545171677702e-05, + "loss": 0.031680150032043455, + "step": 34400 + }, + { + "epoch": 1.2482361879952242, + "grad_norm": 0.7327262759208679, + "learning_rate": 1.8794999819096208e-05, + "loss": 0.02651881694793701, + "step": 34500 + }, + { + "epoch": 1.2518542639024566, + "grad_norm": 0.41838428378105164, + "learning_rate": 1.870454792141539e-05, + "loss": 0.032553679943084717, + "step": 34600 + }, + { + "epoch": 1.2554723398096892, + "grad_norm": 0.3279021382331848, + "learning_rate": 1.861409602373458e-05, + "loss": 0.02605849742889404, + "step": 34700 + }, + { + "epoch": 1.2590904157169218, + "grad_norm": 0.23042799532413483, + "learning_rate": 1.8523644126053766e-05, + "loss": 0.02857684135437012, + "step": 34800 + }, + { + "epoch": 1.2627084916241542, + "grad_norm": 0.14856815338134766, + "learning_rate": 1.8433192228372954e-05, + "loss": 0.030806925296783447, + "step": 34900 + }, + { + "epoch": 1.2663265675313868, + "grad_norm": 0.48354101181030273, + "learning_rate": 1.8342740330692138e-05, + "loss": 0.030027375221252442, + "step": 35000 + }, + { + "epoch": 1.2663265675313868, + "eval_accuracy": 0.9877813255436068, + "eval_f1": 0.8615969042346098, + "eval_loss": 0.05214959755539894, + "eval_precision": 0.8434818838343312, + "eval_recall": 0.8805070957972906, + "eval_runtime": 62.9193, + "eval_samples_per_second": 317.867, + "eval_steps_per_second": 8.837, + "step": 35000 + }, + { + "epoch": 1.2699446434386195, + "grad_norm": 0.13334180414676666, + "learning_rate": 1.8252288433011325e-05, + "loss": 0.027159340381622314, + "step": 35100 + }, + { + "epoch": 1.2735627193458519, + "grad_norm": 0.7394197583198547, + "learning_rate": 1.8161836535330513e-05, + "loss": 0.03075253963470459, + "step": 35200 + }, + { + "epoch": 1.2771807952530845, + "grad_norm": 0.2870982587337494, + "learning_rate": 1.80713846376497e-05, + "loss": 0.030658049583435057, + "step": 35300 + }, + { + "epoch": 1.2807988711603169, + "grad_norm": 0.9762187004089355, + "learning_rate": 1.7980932739968887e-05, + "loss": 0.031029996871948243, + "step": 35400 + }, + { + "epoch": 1.2844169470675495, + "grad_norm": 0.44388410449028015, + "learning_rate": 1.789048084228807e-05, + "loss": 0.03051720142364502, + "step": 35500 + }, + { + "epoch": 1.288035022974782, + "grad_norm": 0.7785915732383728, + "learning_rate": 1.780002894460726e-05, + "loss": 0.02536651849746704, + "step": 35600 + }, + { + "epoch": 1.2916530988820145, + "grad_norm": 0.1702079176902771, + "learning_rate": 1.7709577046926446e-05, + "loss": 0.030427489280700683, + "step": 35700 + }, + { + "epoch": 1.2952711747892471, + "grad_norm": 0.4802360236644745, + "learning_rate": 1.7619125149245634e-05, + "loss": 0.03049640417098999, + "step": 35800 + }, + { + "epoch": 1.2988892506964795, + "grad_norm": 0.40013861656188965, + "learning_rate": 1.752867325156482e-05, + "loss": 0.030040171146392822, + "step": 35900 + }, + { + "epoch": 1.3025073266037122, + "grad_norm": 0.34162065386772156, + "learning_rate": 1.7438221353884005e-05, + "loss": 0.031596968173980715, + "step": 36000 + }, + { + "epoch": 1.3061254025109448, + "grad_norm": 0.34575241804122925, + "learning_rate": 1.7347769456203193e-05, + "loss": 0.03362387895584106, + "step": 36100 + }, + { + "epoch": 1.3097434784181772, + "grad_norm": 0.4098789691925049, + "learning_rate": 1.725731755852238e-05, + "loss": 0.027526361942291258, + "step": 36200 + }, + { + "epoch": 1.3133615543254098, + "grad_norm": 0.35067400336265564, + "learning_rate": 1.7166865660841567e-05, + "loss": 0.02835451364517212, + "step": 36300 + }, + { + "epoch": 1.3169796302326424, + "grad_norm": 0.1685800403356552, + "learning_rate": 1.707641376316075e-05, + "loss": 0.028891866207122804, + "step": 36400 + }, + { + "epoch": 1.3205977061398748, + "grad_norm": 0.32651832699775696, + "learning_rate": 1.698596186547994e-05, + "loss": 0.026589181423187256, + "step": 36500 + }, + { + "epoch": 1.3242157820471072, + "grad_norm": 0.3153350353240967, + "learning_rate": 1.6895509967799126e-05, + "loss": 0.031108696460723877, + "step": 36600 + }, + { + "epoch": 1.3278338579543398, + "grad_norm": 0.4476368725299835, + "learning_rate": 1.6805058070118314e-05, + "loss": 0.030014872550964355, + "step": 36700 + }, + { + "epoch": 1.3314519338615725, + "grad_norm": 0.1972656548023224, + "learning_rate": 1.67146061724375e-05, + "loss": 0.029410278797149657, + "step": 36800 + }, + { + "epoch": 1.3350700097688049, + "grad_norm": 0.7246927618980408, + "learning_rate": 1.6624154274756685e-05, + "loss": 0.03080254316329956, + "step": 36900 + }, + { + "epoch": 1.3386880856760375, + "grad_norm": 0.3670811355113983, + "learning_rate": 1.6533702377075872e-05, + "loss": 0.02861506223678589, + "step": 37000 + }, + { + "epoch": 1.34230616158327, + "grad_norm": 0.22275477647781372, + "learning_rate": 1.644325047939506e-05, + "loss": 0.0255238938331604, + "step": 37100 + }, + { + "epoch": 1.3459242374905025, + "grad_norm": 0.3272339999675751, + "learning_rate": 1.6352798581714247e-05, + "loss": 0.028979463577270506, + "step": 37200 + }, + { + "epoch": 1.3495423133977351, + "grad_norm": 0.5552839040756226, + "learning_rate": 1.626234668403343e-05, + "loss": 0.028283817768096922, + "step": 37300 + }, + { + "epoch": 1.3531603893049677, + "grad_norm": 0.33792686462402344, + "learning_rate": 1.617189478635262e-05, + "loss": 0.03224069595336914, + "step": 37400 + }, + { + "epoch": 1.3567784652122001, + "grad_norm": 1.0481899976730347, + "learning_rate": 1.6081442888671806e-05, + "loss": 0.02690179109573364, + "step": 37500 + }, + { + "epoch": 1.3567784652122001, + "eval_accuracy": 0.9878715555186957, + "eval_f1": 0.8683487542236398, + "eval_loss": 0.05309534817934036, + "eval_precision": 0.851476257567078, + "eval_recall": 0.8859034456096264, + "eval_runtime": 62.1337, + "eval_samples_per_second": 321.887, + "eval_steps_per_second": 8.948, + "step": 37500 + }, + { + "epoch": 1.3603965411194328, + "grad_norm": 0.20256465673446655, + "learning_rate": 1.5990990990990993e-05, + "loss": 0.027432169914245606, + "step": 37600 + }, + { + "epoch": 1.3640146170266652, + "grad_norm": 0.3237811028957367, + "learning_rate": 1.590053909331018e-05, + "loss": 0.030464730262756347, + "step": 37700 + }, + { + "epoch": 1.3676326929338978, + "grad_norm": 0.31953930854797363, + "learning_rate": 1.5810087195629365e-05, + "loss": 0.027273902893066405, + "step": 37800 + }, + { + "epoch": 1.3712507688411302, + "grad_norm": 0.38057664036750793, + "learning_rate": 1.5719635297948552e-05, + "loss": 0.0259963059425354, + "step": 37900 + }, + { + "epoch": 1.3748688447483628, + "grad_norm": 0.6410769820213318, + "learning_rate": 1.562918340026774e-05, + "loss": 0.031271641254425046, + "step": 38000 + }, + { + "epoch": 1.3784869206555954, + "grad_norm": 0.8330540060997009, + "learning_rate": 1.5538731502586927e-05, + "loss": 0.02934875011444092, + "step": 38100 + }, + { + "epoch": 1.3821049965628278, + "grad_norm": 1.1677355766296387, + "learning_rate": 1.5448279604906114e-05, + "loss": 0.02971445083618164, + "step": 38200 + }, + { + "epoch": 1.3857230724700604, + "grad_norm": 0.4667145609855652, + "learning_rate": 1.53578277072253e-05, + "loss": 0.02775926113128662, + "step": 38300 + }, + { + "epoch": 1.389341148377293, + "grad_norm": 0.4434032440185547, + "learning_rate": 1.5267375809544486e-05, + "loss": 0.026833882331848146, + "step": 38400 + }, + { + "epoch": 1.3929592242845255, + "grad_norm": 0.2564474642276764, + "learning_rate": 1.5176923911863672e-05, + "loss": 0.02980698347091675, + "step": 38500 + }, + { + "epoch": 1.396577300191758, + "grad_norm": 0.43813377618789673, + "learning_rate": 1.5086472014182859e-05, + "loss": 0.028636832237243653, + "step": 38600 + }, + { + "epoch": 1.4001953760989905, + "grad_norm": 0.928669810295105, + "learning_rate": 1.4996020116502043e-05, + "loss": 0.02784595012664795, + "step": 38700 + }, + { + "epoch": 1.403813452006223, + "grad_norm": 1.0816453695297241, + "learning_rate": 1.490556821882123e-05, + "loss": 0.031624915599823, + "step": 38800 + }, + { + "epoch": 1.4074315279134555, + "grad_norm": 1.6790099143981934, + "learning_rate": 1.4815116321140418e-05, + "loss": 0.02443223476409912, + "step": 38900 + }, + { + "epoch": 1.4110496038206881, + "grad_norm": 0.39879387617111206, + "learning_rate": 1.4724664423459605e-05, + "loss": 0.02753525972366333, + "step": 39000 + }, + { + "epoch": 1.4146676797279207, + "grad_norm": 0.6372315883636475, + "learning_rate": 1.4634212525778793e-05, + "loss": 0.02859419822692871, + "step": 39100 + }, + { + "epoch": 1.4182857556351531, + "grad_norm": 0.4357219934463501, + "learning_rate": 1.4543760628097977e-05, + "loss": 0.02929396152496338, + "step": 39200 + }, + { + "epoch": 1.4219038315423858, + "grad_norm": 0.8673311471939087, + "learning_rate": 1.4453308730417164e-05, + "loss": 0.027733774185180665, + "step": 39300 + }, + { + "epoch": 1.4255219074496184, + "grad_norm": 0.31178081035614014, + "learning_rate": 1.4362856832736351e-05, + "loss": 0.029380517005920412, + "step": 39400 + }, + { + "epoch": 1.4291399833568508, + "grad_norm": 0.9862114191055298, + "learning_rate": 1.4272404935055539e-05, + "loss": 0.02801510810852051, + "step": 39500 + }, + { + "epoch": 1.4327580592640834, + "grad_norm": 0.3226287364959717, + "learning_rate": 1.4181953037374726e-05, + "loss": 0.02600921630859375, + "step": 39600 + }, + { + "epoch": 1.436376135171316, + "grad_norm": 1.0932515859603882, + "learning_rate": 1.409150113969391e-05, + "loss": 0.027818257808685302, + "step": 39700 + }, + { + "epoch": 1.4399942110785484, + "grad_norm": 0.4064158797264099, + "learning_rate": 1.4001049242013098e-05, + "loss": 0.030927972793579103, + "step": 39800 + }, + { + "epoch": 1.443612286985781, + "grad_norm": 0.6574753522872925, + "learning_rate": 1.3910597344332285e-05, + "loss": 0.028972697257995606, + "step": 39900 + }, + { + "epoch": 1.4472303628930134, + "grad_norm": 0.24314340949058533, + "learning_rate": 1.3820145446651472e-05, + "loss": 0.029455924034118654, + "step": 40000 + }, + { + "epoch": 1.4472303628930134, + "eval_accuracy": 0.9882140454666924, + "eval_f1": 0.8711891990109102, + "eval_loss": 0.05167451128363609, + "eval_precision": 0.8548262069393198, + "eval_recall": 0.8881908535897808, + "eval_runtime": 62.5842, + "eval_samples_per_second": 319.57, + "eval_steps_per_second": 8.884, + "step": 40000 + }, + { + "epoch": 1.450848438800246, + "grad_norm": 0.28122034668922424, + "learning_rate": 1.3729693548970656e-05, + "loss": 0.029821088314056398, + "step": 40100 + }, + { + "epoch": 1.4544665147074785, + "grad_norm": 0.45019853115081787, + "learning_rate": 1.3639241651289844e-05, + "loss": 0.027684724330902098, + "step": 40200 + }, + { + "epoch": 1.458084590614711, + "grad_norm": 0.6584652066230774, + "learning_rate": 1.3548789753609031e-05, + "loss": 0.026381478309631348, + "step": 40300 + }, + { + "epoch": 1.4617026665219437, + "grad_norm": 2.1259236335754395, + "learning_rate": 1.3458337855928219e-05, + "loss": 0.02868267774581909, + "step": 40400 + }, + { + "epoch": 1.465320742429176, + "grad_norm": 0.9566027522087097, + "learning_rate": 1.3367885958247406e-05, + "loss": 0.027485811710357667, + "step": 40500 + }, + { + "epoch": 1.4689388183364087, + "grad_norm": 0.9289085268974304, + "learning_rate": 1.327743406056659e-05, + "loss": 0.030939743518829346, + "step": 40600 + }, + { + "epoch": 1.4725568942436413, + "grad_norm": 0.6716954112052917, + "learning_rate": 1.3186982162885778e-05, + "loss": 0.026526257991790772, + "step": 40700 + }, + { + "epoch": 1.4761749701508737, + "grad_norm": 0.26186442375183105, + "learning_rate": 1.3096530265204965e-05, + "loss": 0.027606160640716554, + "step": 40800 + }, + { + "epoch": 1.4797930460581064, + "grad_norm": 0.5962882041931152, + "learning_rate": 1.3006078367524152e-05, + "loss": 0.03013371229171753, + "step": 40900 + }, + { + "epoch": 1.4834111219653388, + "grad_norm": 0.28622719645500183, + "learning_rate": 1.2915626469843336e-05, + "loss": 0.026788763999938965, + "step": 41000 + }, + { + "epoch": 1.4870291978725714, + "grad_norm": 0.2146042138338089, + "learning_rate": 1.2825174572162524e-05, + "loss": 0.026920742988586426, + "step": 41100 + }, + { + "epoch": 1.4906472737798038, + "grad_norm": 0.30449753999710083, + "learning_rate": 1.2734722674481711e-05, + "loss": 0.028757052421569826, + "step": 41200 + }, + { + "epoch": 1.4942653496870364, + "grad_norm": 0.11651007831096649, + "learning_rate": 1.2644270776800899e-05, + "loss": 0.029123516082763673, + "step": 41300 + }, + { + "epoch": 1.497883425594269, + "grad_norm": 3.1146299839019775, + "learning_rate": 1.2553818879120086e-05, + "loss": 0.028435797691345216, + "step": 41400 + }, + { + "epoch": 1.5015015015015014, + "grad_norm": 0.2705380916595459, + "learning_rate": 1.2463366981439272e-05, + "loss": 0.03229628562927246, + "step": 41500 + }, + { + "epoch": 1.505119577408734, + "grad_norm": 0.5641364455223083, + "learning_rate": 1.2372915083758457e-05, + "loss": 0.02912388801574707, + "step": 41600 + }, + { + "epoch": 1.5087376533159667, + "grad_norm": 0.4726872444152832, + "learning_rate": 1.2282463186077645e-05, + "loss": 0.028761823177337647, + "step": 41700 + }, + { + "epoch": 1.512355729223199, + "grad_norm": 2.5604758262634277, + "learning_rate": 1.2192011288396832e-05, + "loss": 0.02635906219482422, + "step": 41800 + }, + { + "epoch": 1.5159738051304317, + "grad_norm": 0.3598019778728485, + "learning_rate": 1.2101559390716018e-05, + "loss": 0.026577677726745606, + "step": 41900 + }, + { + "epoch": 1.5195918810376643, + "grad_norm": 0.31742435693740845, + "learning_rate": 1.2011107493035205e-05, + "loss": 0.02479785919189453, + "step": 42000 + }, + { + "epoch": 1.5232099569448967, + "grad_norm": 1.0102005004882812, + "learning_rate": 1.1920655595354391e-05, + "loss": 0.028279991149902345, + "step": 42100 + }, + { + "epoch": 1.526828032852129, + "grad_norm": 0.4230172038078308, + "learning_rate": 1.1830203697673578e-05, + "loss": 0.027808871269226074, + "step": 42200 + }, + { + "epoch": 1.530446108759362, + "grad_norm": 0.35221824049949646, + "learning_rate": 1.1739751799992764e-05, + "loss": 0.02666907787322998, + "step": 42300 + }, + { + "epoch": 1.5340641846665943, + "grad_norm": 0.37867021560668945, + "learning_rate": 1.1649299902311952e-05, + "loss": 0.028237838745117188, + "step": 42400 + }, + { + "epoch": 1.5376822605738267, + "grad_norm": 1.1692699193954468, + "learning_rate": 1.1558848004631137e-05, + "loss": 0.027906298637390137, + "step": 42500 + }, + { + "epoch": 1.5376822605738267, + "eval_accuracy": 0.9883852904406909, + "eval_f1": 0.8713540843735187, + "eval_loss": 0.048916082829236984, + "eval_precision": 0.8549944962093611, + "eval_recall": 0.8883519386588057, + "eval_runtime": 62.2278, + "eval_samples_per_second": 321.4, + "eval_steps_per_second": 8.935, + "step": 42500 + }, + { + "epoch": 1.5413003364810594, + "grad_norm": 0.273318886756897, + "learning_rate": 1.1468396106950325e-05, + "loss": 0.031116650104522706, + "step": 42600 + }, + { + "epoch": 1.544918412388292, + "grad_norm": 0.48087653517723083, + "learning_rate": 1.1377944209269512e-05, + "loss": 0.026544408798217775, + "step": 42700 + }, + { + "epoch": 1.5485364882955244, + "grad_norm": 0.7746985554695129, + "learning_rate": 1.1287492311588698e-05, + "loss": 0.026500403881072998, + "step": 42800 + }, + { + "epoch": 1.552154564202757, + "grad_norm": 0.1549975574016571, + "learning_rate": 1.1197040413907885e-05, + "loss": 0.026587300300598145, + "step": 42900 + }, + { + "epoch": 1.5557726401099896, + "grad_norm": 1.972495198249817, + "learning_rate": 1.110658851622707e-05, + "loss": 0.029258613586425782, + "step": 43000 + }, + { + "epoch": 1.559390716017222, + "grad_norm": 0.6956634521484375, + "learning_rate": 1.1016136618546258e-05, + "loss": 0.026978886127471922, + "step": 43100 + }, + { + "epoch": 1.5630087919244544, + "grad_norm": 0.16629020869731903, + "learning_rate": 1.0925684720865444e-05, + "loss": 0.03226327657699585, + "step": 43200 + }, + { + "epoch": 1.5666268678316873, + "grad_norm": 0.37136366963386536, + "learning_rate": 1.0835232823184631e-05, + "loss": 0.028375396728515623, + "step": 43300 + }, + { + "epoch": 1.5702449437389197, + "grad_norm": 0.2561453580856323, + "learning_rate": 1.0744780925503819e-05, + "loss": 0.027073240280151366, + "step": 43400 + }, + { + "epoch": 1.573863019646152, + "grad_norm": 0.42630210518836975, + "learning_rate": 1.0654329027823004e-05, + "loss": 0.026704757213592528, + "step": 43500 + }, + { + "epoch": 1.5774810955533847, + "grad_norm": 0.4090301990509033, + "learning_rate": 1.0563877130142192e-05, + "loss": 0.02855618476867676, + "step": 43600 + }, + { + "epoch": 1.5810991714606173, + "grad_norm": 0.24324025213718414, + "learning_rate": 1.0473425232461378e-05, + "loss": 0.025224699974060058, + "step": 43700 + }, + { + "epoch": 1.5847172473678497, + "grad_norm": 0.4220653772354126, + "learning_rate": 1.0382973334780565e-05, + "loss": 0.029145328998565673, + "step": 43800 + }, + { + "epoch": 1.5883353232750823, + "grad_norm": 0.4333362281322479, + "learning_rate": 1.029252143709975e-05, + "loss": 0.025774214267730713, + "step": 43900 + }, + { + "epoch": 1.591953399182315, + "grad_norm": 0.15959997475147247, + "learning_rate": 1.0202069539418938e-05, + "loss": 0.026988446712493896, + "step": 44000 + }, + { + "epoch": 1.5955714750895473, + "grad_norm": 0.2643369138240814, + "learning_rate": 1.0111617641738126e-05, + "loss": 0.0258998441696167, + "step": 44100 + }, + { + "epoch": 1.59918955099678, + "grad_norm": 0.8528566360473633, + "learning_rate": 1.0021165744057311e-05, + "loss": 0.02746238708496094, + "step": 44200 + }, + { + "epoch": 1.6028076269040126, + "grad_norm": 0.999005138874054, + "learning_rate": 9.930713846376499e-06, + "loss": 0.028600902557373048, + "step": 44300 + }, + { + "epoch": 1.606425702811245, + "grad_norm": 0.6834824681282043, + "learning_rate": 9.840261948695684e-06, + "loss": 0.028850455284118653, + "step": 44400 + }, + { + "epoch": 1.6100437787184774, + "grad_norm": 0.3043724298477173, + "learning_rate": 9.749810051014872e-06, + "loss": 0.0262698769569397, + "step": 44500 + }, + { + "epoch": 1.6136618546257102, + "grad_norm": 0.8399735689163208, + "learning_rate": 9.659358153334057e-06, + "loss": 0.02827603816986084, + "step": 44600 + }, + { + "epoch": 1.6172799305329426, + "grad_norm": 0.9611870646476746, + "learning_rate": 9.568906255653245e-06, + "loss": 0.02755260467529297, + "step": 44700 + }, + { + "epoch": 1.620898006440175, + "grad_norm": 0.23461508750915527, + "learning_rate": 9.47845435797243e-06, + "loss": 0.0311501145362854, + "step": 44800 + }, + { + "epoch": 1.6245160823474076, + "grad_norm": 2.882127046585083, + "learning_rate": 9.388002460291618e-06, + "loss": 0.029984614849090575, + "step": 44900 + }, + { + "epoch": 1.6281341582546403, + "grad_norm": 0.32786279916763306, + "learning_rate": 9.297550562610804e-06, + "loss": 0.028132951259613036, + "step": 45000 + }, + { + "epoch": 1.6281341582546403, + "eval_accuracy": 0.9886770980197016, + "eval_f1": 0.8710388819944511, + "eval_loss": 0.047967541962862015, + "eval_precision": 0.855134094859697, + "eval_recall": 0.887546513313681, + "eval_runtime": 62.7107, + "eval_samples_per_second": 318.925, + "eval_steps_per_second": 8.866, + "step": 45000 + }, + { + "epoch": 1.6317522341618727, + "grad_norm": 1.6328613758087158, + "learning_rate": 9.207098664929991e-06, + "loss": 0.028099877834320067, + "step": 45100 + }, + { + "epoch": 1.6353703100691053, + "grad_norm": 1.1488419771194458, + "learning_rate": 9.116646767249177e-06, + "loss": 0.025699715614318847, + "step": 45200 + }, + { + "epoch": 1.638988385976338, + "grad_norm": 1.2527875900268555, + "learning_rate": 9.026194869568364e-06, + "loss": 0.02980081081390381, + "step": 45300 + }, + { + "epoch": 1.6426064618835703, + "grad_norm": 0.25659850239753723, + "learning_rate": 8.93574297188755e-06, + "loss": 0.02849080801010132, + "step": 45400 + }, + { + "epoch": 1.6462245377908027, + "grad_norm": 0.24858339130878448, + "learning_rate": 8.845291074206737e-06, + "loss": 0.02909574508666992, + "step": 45500 + }, + { + "epoch": 1.6498426136980355, + "grad_norm": 0.35774946212768555, + "learning_rate": 8.754839176525923e-06, + "loss": 0.028034112453460693, + "step": 45600 + }, + { + "epoch": 1.653460689605268, + "grad_norm": 0.28512680530548096, + "learning_rate": 8.66438727884511e-06, + "loss": 0.029735114574432373, + "step": 45700 + }, + { + "epoch": 1.6570787655125003, + "grad_norm": 0.12049074470996857, + "learning_rate": 8.573935381164296e-06, + "loss": 0.03128848075866699, + "step": 45800 + }, + { + "epoch": 1.660696841419733, + "grad_norm": 0.5767261385917664, + "learning_rate": 8.483483483483484e-06, + "loss": 0.02762418031692505, + "step": 45900 + }, + { + "epoch": 1.6643149173269656, + "grad_norm": 0.12318204343318939, + "learning_rate": 8.39303158580267e-06, + "loss": 0.026004743576049805, + "step": 46000 + }, + { + "epoch": 1.667932993234198, + "grad_norm": 0.311279833316803, + "learning_rate": 8.302579688121857e-06, + "loss": 0.024458692073822022, + "step": 46100 + }, + { + "epoch": 1.6715510691414306, + "grad_norm": 0.2753770351409912, + "learning_rate": 8.212127790441042e-06, + "loss": 0.026231870651245118, + "step": 46200 + }, + { + "epoch": 1.6751691450486632, + "grad_norm": 0.8421895503997803, + "learning_rate": 8.12167589276023e-06, + "loss": 0.02496417760848999, + "step": 46300 + }, + { + "epoch": 1.6787872209558956, + "grad_norm": 0.6493498086929321, + "learning_rate": 8.031223995079417e-06, + "loss": 0.026742682456970215, + "step": 46400 + }, + { + "epoch": 1.6824052968631282, + "grad_norm": 0.3029896318912506, + "learning_rate": 7.940772097398603e-06, + "loss": 0.024227650165557862, + "step": 46500 + }, + { + "epoch": 1.6860233727703609, + "grad_norm": 0.34622183442115784, + "learning_rate": 7.85032019971779e-06, + "loss": 0.025336668491363526, + "step": 46600 + }, + { + "epoch": 1.6896414486775932, + "grad_norm": 1.1520912647247314, + "learning_rate": 7.759868302036976e-06, + "loss": 0.028549084663391112, + "step": 46700 + }, + { + "epoch": 1.6932595245848256, + "grad_norm": 0.11390261352062225, + "learning_rate": 7.669416404356163e-06, + "loss": 0.025614957809448242, + "step": 46800 + }, + { + "epoch": 1.6968776004920583, + "grad_norm": 0.20818683505058289, + "learning_rate": 7.57896450667535e-06, + "loss": 0.02624866247177124, + "step": 46900 + }, + { + "epoch": 1.700495676399291, + "grad_norm": 0.11861401051282883, + "learning_rate": 7.488512608994537e-06, + "loss": 0.029836065769195556, + "step": 47000 + }, + { + "epoch": 1.7041137523065233, + "grad_norm": 0.21509072184562683, + "learning_rate": 7.398060711313724e-06, + "loss": 0.02764824151992798, + "step": 47100 + }, + { + "epoch": 1.707731828213756, + "grad_norm": 0.09410534054040909, + "learning_rate": 7.3076088136329105e-06, + "loss": 0.026358423233032228, + "step": 47200 + }, + { + "epoch": 1.7113499041209885, + "grad_norm": 0.4441370666027069, + "learning_rate": 7.217156915952097e-06, + "loss": 0.028589205741882326, + "step": 47300 + }, + { + "epoch": 1.714967980028221, + "grad_norm": 0.301600843667984, + "learning_rate": 7.1267050182712836e-06, + "loss": 0.02586300849914551, + "step": 47400 + }, + { + "epoch": 1.7185860559354535, + "grad_norm": 0.2969602942466736, + "learning_rate": 7.03625312059047e-06, + "loss": 0.027719602584838868, + "step": 47500 + }, + { + "epoch": 1.7185860559354535, + "eval_accuracy": 0.9887869098191715, + "eval_f1": 0.8751810891473175, + "eval_loss": 0.04670108109712601, + "eval_precision": 0.8604607721046077, + "eval_recall": 0.8904138275423251, + "eval_runtime": 62.4542, + "eval_samples_per_second": 320.234, + "eval_steps_per_second": 8.903, + "step": 47500 + }, + { + "epoch": 1.7222041318426862, + "grad_norm": 2.922269582748413, + "learning_rate": 6.945801222909657e-06, + "loss": 0.026613037586212158, + "step": 47600 + }, + { + "epoch": 1.7258222077499186, + "grad_norm": 0.3603607714176178, + "learning_rate": 6.855349325228843e-06, + "loss": 0.02875258445739746, + "step": 47700 + }, + { + "epoch": 1.729440283657151, + "grad_norm": 0.17424313724040985, + "learning_rate": 6.764897427548031e-06, + "loss": 0.028092458248138427, + "step": 47800 + }, + { + "epoch": 1.7330583595643838, + "grad_norm": 0.39376911520957947, + "learning_rate": 6.674445529867217e-06, + "loss": 0.029860684871673582, + "step": 47900 + }, + { + "epoch": 1.7366764354716162, + "grad_norm": 0.30766257643699646, + "learning_rate": 6.583993632186404e-06, + "loss": 0.027765181064605713, + "step": 48000 + }, + { + "epoch": 1.7402945113788486, + "grad_norm": 0.4809003472328186, + "learning_rate": 6.49354173450559e-06, + "loss": 0.025850486755371094, + "step": 48100 + }, + { + "epoch": 1.7439125872860812, + "grad_norm": 0.31469446420669556, + "learning_rate": 6.403089836824777e-06, + "loss": 0.024390408992767332, + "step": 48200 + }, + { + "epoch": 1.7475306631933138, + "grad_norm": 0.1946684867143631, + "learning_rate": 6.312637939143963e-06, + "loss": 0.02534383535385132, + "step": 48300 + }, + { + "epoch": 1.7511487391005462, + "grad_norm": 0.31097686290740967, + "learning_rate": 6.22218604146315e-06, + "loss": 0.02695645809173584, + "step": 48400 + }, + { + "epoch": 1.7547668150077789, + "grad_norm": 0.7921291589736938, + "learning_rate": 6.1317341437823365e-06, + "loss": 0.023772099018096925, + "step": 48500 + }, + { + "epoch": 1.7583848909150115, + "grad_norm": 0.3385520577430725, + "learning_rate": 6.041282246101523e-06, + "loss": 0.024593567848205565, + "step": 48600 + }, + { + "epoch": 1.7620029668222439, + "grad_norm": 0.23133955895900726, + "learning_rate": 5.95083034842071e-06, + "loss": 0.025404906272888182, + "step": 48700 + }, + { + "epoch": 1.7656210427294765, + "grad_norm": 0.17175310850143433, + "learning_rate": 5.860378450739896e-06, + "loss": 0.024191346168518067, + "step": 48800 + }, + { + "epoch": 1.7692391186367091, + "grad_norm": 1.453963041305542, + "learning_rate": 5.769926553059084e-06, + "loss": 0.023371386528015136, + "step": 48900 + }, + { + "epoch": 1.7728571945439415, + "grad_norm": 0.4487530291080475, + "learning_rate": 5.67947465537827e-06, + "loss": 0.024376935958862304, + "step": 49000 + }, + { + "epoch": 1.776475270451174, + "grad_norm": 0.17453834414482117, + "learning_rate": 5.589022757697457e-06, + "loss": 0.027640838623046875, + "step": 49100 + }, + { + "epoch": 1.7800933463584065, + "grad_norm": 0.24941837787628174, + "learning_rate": 5.498570860016643e-06, + "loss": 0.02413508415222168, + "step": 49200 + }, + { + "epoch": 1.7837114222656392, + "grad_norm": 0.3545306622982025, + "learning_rate": 5.40811896233583e-06, + "loss": 0.025269722938537596, + "step": 49300 + }, + { + "epoch": 1.7873294981728716, + "grad_norm": 0.21222856640815735, + "learning_rate": 5.317667064655016e-06, + "loss": 0.02443007230758667, + "step": 49400 + }, + { + "epoch": 1.7909475740801042, + "grad_norm": 0.5955353379249573, + "learning_rate": 5.227215166974203e-06, + "loss": 0.027793030738830566, + "step": 49500 + }, + { + "epoch": 1.7945656499873368, + "grad_norm": 1.0362492799758911, + "learning_rate": 5.13676326929339e-06, + "loss": 0.02576704978942871, + "step": 49600 + }, + { + "epoch": 1.7981837258945692, + "grad_norm": 0.2961190938949585, + "learning_rate": 5.046311371612577e-06, + "loss": 0.027634003162384034, + "step": 49700 + }, + { + "epoch": 1.8018018018018018, + "grad_norm": 0.2701990604400635, + "learning_rate": 4.9558594739317635e-06, + "loss": 0.026762216091156005, + "step": 49800 + }, + { + "epoch": 1.8054198777090344, + "grad_norm": 0.3419773280620575, + "learning_rate": 4.86540757625095e-06, + "loss": 0.028021221160888673, + "step": 49900 + }, + { + "epoch": 1.8090379536162668, + "grad_norm": 0.3847455680370331, + "learning_rate": 4.7749556785701366e-06, + "loss": 0.028925769329071045, + "step": 50000 + }, + { + "epoch": 1.8090379536162668, + "eval_accuracy": 0.9891697152879526, + "eval_f1": 0.8756019071264223, + "eval_loss": 0.04578976333141327, + "eval_precision": 0.8598627201292046, + "eval_recall": 0.8919280271911596, + "eval_runtime": 62.7397, + "eval_samples_per_second": 318.777, + "eval_steps_per_second": 8.862, + "step": 50000 + }, + { + "epoch": 1.8126560295234992, + "grad_norm": 0.12807752192020416, + "learning_rate": 4.684503780889323e-06, + "loss": 0.024477434158325196, + "step": 50100 + }, + { + "epoch": 1.816274105430732, + "grad_norm": 0.5839409828186035, + "learning_rate": 4.59405188320851e-06, + "loss": 0.029098427295684813, + "step": 50200 + }, + { + "epoch": 1.8198921813379645, + "grad_norm": 0.1988334357738495, + "learning_rate": 4.503599985527696e-06, + "loss": 0.027852838039398194, + "step": 50300 + }, + { + "epoch": 1.8235102572451969, + "grad_norm": 1.1250760555267334, + "learning_rate": 4.413148087846884e-06, + "loss": 0.025283007621765136, + "step": 50400 + }, + { + "epoch": 1.8271283331524295, + "grad_norm": 0.3275587558746338, + "learning_rate": 4.32269619016607e-06, + "loss": 0.0253476619720459, + "step": 50500 + }, + { + "epoch": 1.8307464090596621, + "grad_norm": 0.2422463297843933, + "learning_rate": 4.232244292485257e-06, + "loss": 0.025618109703063965, + "step": 50600 + }, + { + "epoch": 1.8343644849668945, + "grad_norm": 0.6434578895568848, + "learning_rate": 4.141792394804443e-06, + "loss": 0.026464188098907472, + "step": 50700 + }, + { + "epoch": 1.8379825608741271, + "grad_norm": 0.16934601962566376, + "learning_rate": 4.05134049712363e-06, + "loss": 0.025098586082458497, + "step": 50800 + }, + { + "epoch": 1.8416006367813598, + "grad_norm": 0.21844395995140076, + "learning_rate": 3.9608885994428164e-06, + "loss": 0.023906781673431396, + "step": 50900 + }, + { + "epoch": 1.8452187126885922, + "grad_norm": 0.2674906253814697, + "learning_rate": 3.870436701762003e-06, + "loss": 0.026905314922332765, + "step": 51000 + }, + { + "epoch": 1.8488367885958248, + "grad_norm": 0.4344836473464966, + "learning_rate": 3.77998480408119e-06, + "loss": 0.026017348766326904, + "step": 51100 + }, + { + "epoch": 1.8524548645030574, + "grad_norm": 0.5953734517097473, + "learning_rate": 3.6895329064003765e-06, + "loss": 0.02634397745132446, + "step": 51200 + }, + { + "epoch": 1.8560729404102898, + "grad_norm": 0.14901016652584076, + "learning_rate": 3.599081008719563e-06, + "loss": 0.02832331895828247, + "step": 51300 + }, + { + "epoch": 1.8596910163175222, + "grad_norm": 0.7816808223724365, + "learning_rate": 3.5086291110387496e-06, + "loss": 0.026141095161437988, + "step": 51400 + }, + { + "epoch": 1.8633090922247548, + "grad_norm": 0.5734632015228271, + "learning_rate": 3.418177213357936e-06, + "loss": 0.02372182607650757, + "step": 51500 + }, + { + "epoch": 1.8669271681319874, + "grad_norm": 0.9664448499679565, + "learning_rate": 3.3277253156771227e-06, + "loss": 0.024712865352630616, + "step": 51600 + }, + { + "epoch": 1.8705452440392198, + "grad_norm": 0.390066921710968, + "learning_rate": 3.2372734179963093e-06, + "loss": 0.026522459983825682, + "step": 51700 + }, + { + "epoch": 1.8741633199464525, + "grad_norm": 0.6472379565238953, + "learning_rate": 3.146821520315496e-06, + "loss": 0.024525246620178222, + "step": 51800 + }, + { + "epoch": 1.877781395853685, + "grad_norm": 0.4985784888267517, + "learning_rate": 3.056369622634683e-06, + "loss": 0.02446552038192749, + "step": 51900 + }, + { + "epoch": 1.8813994717609175, + "grad_norm": 0.22120802104473114, + "learning_rate": 2.9659177249538694e-06, + "loss": 0.025269200801849367, + "step": 52000 + }, + { + "epoch": 1.88501754766815, + "grad_norm": 0.3579547703266144, + "learning_rate": 2.8754658272730564e-06, + "loss": 0.025214505195617676, + "step": 52100 + }, + { + "epoch": 1.8886356235753827, + "grad_norm": 0.7338326573371887, + "learning_rate": 2.785013929592243e-06, + "loss": 0.02668466329574585, + "step": 52200 + }, + { + "epoch": 1.8922536994826151, + "grad_norm": 0.3315567970275879, + "learning_rate": 2.6945620319114295e-06, + "loss": 0.030078487396240236, + "step": 52300 + }, + { + "epoch": 1.8958717753898475, + "grad_norm": 0.35072797536849976, + "learning_rate": 2.6041101342306165e-06, + "loss": 0.02516920804977417, + "step": 52400 + }, + { + "epoch": 1.8994898512970804, + "grad_norm": 0.43289047479629517, + "learning_rate": 2.513658236549803e-06, + "loss": 0.026839351654052733, + "step": 52500 + }, + { + "epoch": 1.8994898512970804, + "eval_accuracy": 0.9891036746253344, + "eval_f1": 0.876242095754291, + "eval_loss": 0.045680414885282516, + "eval_precision": 0.8623029055350209, + "eval_recall": 0.89063934663896, + "eval_runtime": 62.307, + "eval_samples_per_second": 320.991, + "eval_steps_per_second": 8.924, + "step": 52500 + }, + { + "epoch": 1.9031079272043128, + "grad_norm": 0.4170491099357605, + "learning_rate": 2.4232063388689896e-06, + "loss": 0.027149310111999513, + "step": 52600 + }, + { + "epoch": 1.9067260031115452, + "grad_norm": 0.33568137884140015, + "learning_rate": 2.332754441188176e-06, + "loss": 0.024306225776672363, + "step": 52700 + }, + { + "epoch": 1.9103440790187778, + "grad_norm": 0.831928551197052, + "learning_rate": 2.242302543507363e-06, + "loss": 0.025090248584747316, + "step": 52800 + }, + { + "epoch": 1.9139621549260104, + "grad_norm": 0.2261083424091339, + "learning_rate": 2.1518506458265497e-06, + "loss": 0.02992173671722412, + "step": 52900 + }, + { + "epoch": 1.9175802308332428, + "grad_norm": 0.36420953273773193, + "learning_rate": 2.0613987481457362e-06, + "loss": 0.026374735832214356, + "step": 53000 + }, + { + "epoch": 1.9211983067404754, + "grad_norm": 0.3849758207798004, + "learning_rate": 1.970946850464923e-06, + "loss": 0.024311881065368652, + "step": 53100 + }, + { + "epoch": 1.924816382647708, + "grad_norm": 0.1625661551952362, + "learning_rate": 1.8804949527841096e-06, + "loss": 0.028159475326538085, + "step": 53200 + }, + { + "epoch": 1.9284344585549404, + "grad_norm": 0.10745652765035629, + "learning_rate": 1.7900430551032961e-06, + "loss": 0.028279855251312255, + "step": 53300 + }, + { + "epoch": 1.932052534462173, + "grad_norm": 0.3585937023162842, + "learning_rate": 1.6995911574224827e-06, + "loss": 0.025097475051879883, + "step": 53400 + }, + { + "epoch": 1.9356706103694057, + "grad_norm": 0.3355402648448944, + "learning_rate": 1.6091392597416697e-06, + "loss": 0.0232719612121582, + "step": 53500 + }, + { + "epoch": 1.939288686276638, + "grad_norm": 0.6301077604293823, + "learning_rate": 1.5186873620608562e-06, + "loss": 0.023976569175720216, + "step": 53600 + }, + { + "epoch": 1.9429067621838705, + "grad_norm": 1.720951795578003, + "learning_rate": 1.4282354643800428e-06, + "loss": 0.027393877506256104, + "step": 53700 + }, + { + "epoch": 1.946524838091103, + "grad_norm": 1.0819095373153687, + "learning_rate": 1.3377835666992295e-06, + "loss": 0.028527204990386964, + "step": 53800 + }, + { + "epoch": 1.9501429139983357, + "grad_norm": 0.4960351884365082, + "learning_rate": 1.247331669018416e-06, + "loss": 0.023636491298675538, + "step": 53900 + }, + { + "epoch": 1.9537609899055681, + "grad_norm": 0.6555366516113281, + "learning_rate": 1.1568797713376029e-06, + "loss": 0.02606668949127197, + "step": 54000 + }, + { + "epoch": 1.9573790658128007, + "grad_norm": 0.17520390450954437, + "learning_rate": 1.0664278736567894e-06, + "loss": 0.024348812103271486, + "step": 54100 + }, + { + "epoch": 1.9609971417200334, + "grad_norm": 0.2867375612258911, + "learning_rate": 9.75975975975976e-07, + "loss": 0.024609763622283936, + "step": 54200 + }, + { + "epoch": 1.9646152176272658, + "grad_norm": 0.11981488019227982, + "learning_rate": 8.855240782951626e-07, + "loss": 0.02563744068145752, + "step": 54300 + }, + { + "epoch": 1.9682332935344984, + "grad_norm": 0.25503483414649963, + "learning_rate": 7.950721806143494e-07, + "loss": 0.026204137802124022, + "step": 54400 + }, + { + "epoch": 1.971851369441731, + "grad_norm": 0.23244522511959076, + "learning_rate": 7.04620282933536e-07, + "loss": 0.0256950044631958, + "step": 54500 + }, + { + "epoch": 1.9754694453489634, + "grad_norm": 0.20025278627872467, + "learning_rate": 6.141683852527226e-07, + "loss": 0.025686397552490234, + "step": 54600 + }, + { + "epoch": 1.9790875212561958, + "grad_norm": 0.4756115972995758, + "learning_rate": 5.237164875719093e-07, + "loss": 0.02578796148300171, + "step": 54700 + }, + { + "epoch": 1.9827055971634286, + "grad_norm": 0.27420374751091003, + "learning_rate": 4.3326458989109595e-07, + "loss": 0.023311092853546142, + "step": 54800 + }, + { + "epoch": 1.986323673070661, + "grad_norm": 0.19387075304985046, + "learning_rate": 3.4281269221028255e-07, + "loss": 0.02670889377593994, + "step": 54900 + }, + { + "epoch": 1.9899417489778934, + "grad_norm": 0.726769745349884, + "learning_rate": 2.523607945294692e-07, + "loss": 0.03058022975921631, + "step": 55000 + }, + { + "epoch": 1.9899417489778934, + "eval_accuracy": 0.9892449709267501, + "eval_f1": 0.8768783517240833, + "eval_loss": 0.0451948419213295, + "eval_precision": 0.8626445559677067, + "eval_recall": 0.8915897485462072, + "eval_runtime": 62.8103, + "eval_samples_per_second": 318.419, + "eval_steps_per_second": 8.852, + "step": 55000 + }, + { + "epoch": 1.993559824885126, + "grad_norm": 0.22022511065006256, + "learning_rate": 1.6190889684865588e-07, + "loss": 0.026084864139556886, + "step": 55100 + }, + { + "epoch": 1.9971779007923587, + "grad_norm": 0.5684672594070435, + "learning_rate": 7.145699916784254e-08, + "loss": 0.027587156295776367, + "step": 55200 + }, + { + "epoch": 2.0, + "step": 55278, + "total_flos": 1.9407141577440333e+18, + "train_loss": 0.04855243214227653, + "train_runtime": 26239.1933, + "train_samples_per_second": 303.363, + "train_steps_per_second": 2.107 + } + ], + "logging_steps": 100, + "max_steps": 55278, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.9407141577440333e+18, + "train_batch_size": 72, + "trial_name": null, + "trial_params": null +}