diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4220 @@ +{ + "best_global_step": 55000, + "best_metric": 0.9045753492836575, + "best_model_checkpoint": "./lang-ner-xlmr/checkpoint-55000", + "epoch": 2.0, + "eval_steps": 2500, + "global_step": 55938, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0035753870356466087, + "grad_norm": 4.756625652313232, + "learning_rate": 4.991150917086775e-05, + "loss": 4.384464111328125, + "step": 100 + }, + { + "epoch": 0.007150774071293217, + "grad_norm": 2.1916704177856445, + "learning_rate": 4.982212449497658e-05, + "loss": 0.6957355499267578, + "step": 200 + }, + { + "epoch": 0.010726161106939826, + "grad_norm": 3.089632987976074, + "learning_rate": 4.973273981908542e-05, + "loss": 0.3664897537231445, + "step": 300 + }, + { + "epoch": 0.014301548142586435, + "grad_norm": 3.1561801433563232, + "learning_rate": 4.964335514319425e-05, + "loss": 0.24268556594848634, + "step": 400 + }, + { + "epoch": 0.017876935178233043, + "grad_norm": 4.541702747344971, + "learning_rate": 4.955397046730309e-05, + "loss": 0.18832412719726563, + "step": 500 + }, + { + "epoch": 0.021452322213879653, + "grad_norm": 3.958519220352173, + "learning_rate": 4.946458579141192e-05, + "loss": 0.1659502410888672, + "step": 600 + }, + { + "epoch": 0.025027709249526263, + "grad_norm": 2.7777926921844482, + "learning_rate": 4.9375201115520756e-05, + "loss": 0.1568641757965088, + "step": 700 + }, + { + "epoch": 0.02860309628517287, + "grad_norm": 2.150230646133423, + "learning_rate": 4.928581643962959e-05, + "loss": 0.14989984512329102, + "step": 800 + }, + { + "epoch": 0.032178483320819476, + "grad_norm": 1.7589229345321655, + "learning_rate": 4.9196431763738424e-05, + "loss": 0.13244229316711426, + "step": 900 + }, + { + "epoch": 0.035753870356466086, + "grad_norm": 1.2541024684906006, + 
"learning_rate": 4.910704708784726e-05, + "loss": 0.1411085033416748, + "step": 1000 + }, + { + "epoch": 0.039329257392112696, + "grad_norm": 1.043690800666809, + "learning_rate": 4.901766241195609e-05, + "loss": 0.12802630424499511, + "step": 1100 + }, + { + "epoch": 0.042904644427759306, + "grad_norm": 1.5866156816482544, + "learning_rate": 4.892827773606493e-05, + "loss": 0.11472611427307129, + "step": 1200 + }, + { + "epoch": 0.046480031463405916, + "grad_norm": 2.9468393325805664, + "learning_rate": 4.883889306017377e-05, + "loss": 0.11635817527770996, + "step": 1300 + }, + { + "epoch": 0.050055418499052526, + "grad_norm": 2.623593330383301, + "learning_rate": 4.87495083842826e-05, + "loss": 0.11469986915588379, + "step": 1400 + }, + { + "epoch": 0.05363080553469913, + "grad_norm": 1.0270402431488037, + "learning_rate": 4.8660123708391436e-05, + "loss": 0.10951638221740723, + "step": 1500 + }, + { + "epoch": 0.05720619257034574, + "grad_norm": 0.6011027693748474, + "learning_rate": 4.8570739032500274e-05, + "loss": 0.1056843090057373, + "step": 1600 + }, + { + "epoch": 0.06078157960599235, + "grad_norm": 1.5310850143432617, + "learning_rate": 4.8481354356609104e-05, + "loss": 0.10531362533569336, + "step": 1700 + }, + { + "epoch": 0.06435696664163895, + "grad_norm": 1.9218846559524536, + "learning_rate": 4.839196968071794e-05, + "loss": 0.10761914253234864, + "step": 1800 + }, + { + "epoch": 0.06793235367728556, + "grad_norm": 0.9941307306289673, + "learning_rate": 4.830258500482677e-05, + "loss": 0.10573001861572266, + "step": 1900 + }, + { + "epoch": 0.07150774071293217, + "grad_norm": 1.5511739253997803, + "learning_rate": 4.821320032893561e-05, + "loss": 0.09843612670898437, + "step": 2000 + }, + { + "epoch": 0.07508312774857878, + "grad_norm": 3.8423593044281006, + "learning_rate": 4.812381565304445e-05, + "loss": 0.09764796257019043, + "step": 2100 + }, + { + "epoch": 0.07865851478422539, + "grad_norm": 2.3102476596832275, + "learning_rate": 
4.803443097715328e-05, + "loss": 0.0996187973022461, + "step": 2200 + }, + { + "epoch": 0.082233901819872, + "grad_norm": 0.8750975131988525, + "learning_rate": 4.7945046301262116e-05, + "loss": 0.09662745475769043, + "step": 2300 + }, + { + "epoch": 0.08580928885551861, + "grad_norm": 1.4319772720336914, + "learning_rate": 4.7855661625370954e-05, + "loss": 0.09479823112487792, + "step": 2400 + }, + { + "epoch": 0.08938467589116522, + "grad_norm": 1.1493583917617798, + "learning_rate": 4.7766276949479785e-05, + "loss": 0.09187865257263184, + "step": 2500 + }, + { + "epoch": 0.08938467589116522, + "eval_accuracy": 0.97115720940899, + "eval_f1": 0.7833422259443614, + "eval_loss": 0.12425321340560913, + "eval_precision": 0.7387798259806108, + "eval_recall": 0.8336256217117557, + "eval_runtime": 30.7236, + "eval_samples_per_second": 732.336, + "eval_steps_per_second": 20.343, + "step": 2500 + }, + { + "epoch": 0.09296006292681183, + "grad_norm": 0.4979458749294281, + "learning_rate": 4.767689227358862e-05, + "loss": 0.0913974666595459, + "step": 2600 + }, + { + "epoch": 0.09653544996245844, + "grad_norm": 0.9877503514289856, + "learning_rate": 4.758750759769745e-05, + "loss": 0.09267548561096191, + "step": 2700 + }, + { + "epoch": 0.10011083699810505, + "grad_norm": 1.4115008115768433, + "learning_rate": 4.749812292180629e-05, + "loss": 0.09185708045959473, + "step": 2800 + }, + { + "epoch": 0.10368622403375165, + "grad_norm": 1.2117033004760742, + "learning_rate": 4.740873824591512e-05, + "loss": 0.09133506774902343, + "step": 2900 + }, + { + "epoch": 0.10726161106939826, + "grad_norm": 0.527315080165863, + "learning_rate": 4.731935357002396e-05, + "loss": 0.08854376792907714, + "step": 3000 + }, + { + "epoch": 0.11083699810504487, + "grad_norm": 0.5725809931755066, + "learning_rate": 4.722996889413279e-05, + "loss": 0.08516644477844239, + "step": 3100 + }, + { + "epoch": 0.11441238514069148, + "grad_norm": 1.4227476119995117, + "learning_rate": 4.714058421824163e-05, 
+ "loss": 0.08871203422546386, + "step": 3200 + }, + { + "epoch": 0.11798777217633809, + "grad_norm": 1.2104847431182861, + "learning_rate": 4.705119954235046e-05, + "loss": 0.0874100399017334, + "step": 3300 + }, + { + "epoch": 0.1215631592119847, + "grad_norm": 1.4136381149291992, + "learning_rate": 4.6961814866459295e-05, + "loss": 0.09060199737548828, + "step": 3400 + }, + { + "epoch": 0.1251385462476313, + "grad_norm": 1.7565488815307617, + "learning_rate": 4.687243019056813e-05, + "loss": 0.09233291625976563, + "step": 3500 + }, + { + "epoch": 0.1287139332832779, + "grad_norm": 1.2004791498184204, + "learning_rate": 4.6783045514676964e-05, + "loss": 0.08300934791564941, + "step": 3600 + }, + { + "epoch": 0.1322893203189245, + "grad_norm": 1.7836707830429077, + "learning_rate": 4.66936608387858e-05, + "loss": 0.09250588417053222, + "step": 3700 + }, + { + "epoch": 0.13586470735457112, + "grad_norm": 1.83432137966156, + "learning_rate": 4.660427616289463e-05, + "loss": 0.08511058807373047, + "step": 3800 + }, + { + "epoch": 0.13944009439021773, + "grad_norm": 1.1962814331054688, + "learning_rate": 4.651489148700347e-05, + "loss": 0.07956169128417968, + "step": 3900 + }, + { + "epoch": 0.14301548142586434, + "grad_norm": 1.145377278327942, + "learning_rate": 4.642550681111231e-05, + "loss": 0.08179279327392579, + "step": 4000 + }, + { + "epoch": 0.14659086846151095, + "grad_norm": 4.283623218536377, + "learning_rate": 4.633612213522114e-05, + "loss": 0.09118062019348144, + "step": 4100 + }, + { + "epoch": 0.15016625549715756, + "grad_norm": 2.0267841815948486, + "learning_rate": 4.6246737459329975e-05, + "loss": 0.0859706974029541, + "step": 4200 + }, + { + "epoch": 0.15374164253280417, + "grad_norm": 1.3412806987762451, + "learning_rate": 4.615735278343881e-05, + "loss": 0.07687939643859863, + "step": 4300 + }, + { + "epoch": 0.15731702956845078, + "grad_norm": 1.2748081684112549, + "learning_rate": 4.6067968107547644e-05, + "loss": 0.0789797306060791, + 
"step": 4400 + }, + { + "epoch": 0.1608924166040974, + "grad_norm": 0.8491079807281494, + "learning_rate": 4.597858343165648e-05, + "loss": 0.07809987068176269, + "step": 4500 + }, + { + "epoch": 0.164467803639744, + "grad_norm": 1.1583634614944458, + "learning_rate": 4.588919875576531e-05, + "loss": 0.07350683212280273, + "step": 4600 + }, + { + "epoch": 0.1680431906753906, + "grad_norm": 0.6579107642173767, + "learning_rate": 4.579981407987415e-05, + "loss": 0.07875243663787841, + "step": 4700 + }, + { + "epoch": 0.17161857771103722, + "grad_norm": 0.9742094278335571, + "learning_rate": 4.571042940398299e-05, + "loss": 0.08122955322265625, + "step": 4800 + }, + { + "epoch": 0.17519396474668383, + "grad_norm": 0.7365472912788391, + "learning_rate": 4.562104472809182e-05, + "loss": 0.07848617553710938, + "step": 4900 + }, + { + "epoch": 0.17876935178233044, + "grad_norm": 4.312972545623779, + "learning_rate": 4.5531660052200655e-05, + "loss": 0.07981382846832276, + "step": 5000 + }, + { + "epoch": 0.17876935178233044, + "eval_accuracy": 0.9774074625381929, + "eval_f1": 0.8253539377731214, + "eval_loss": 0.09500592201948166, + "eval_precision": 0.7928386037396048, + "eval_recall": 0.8606503176839261, + "eval_runtime": 27.5737, + "eval_samples_per_second": 815.994, + "eval_steps_per_second": 22.666, + "step": 5000 + }, + { + "epoch": 0.18234473881797705, + "grad_norm": 0.7737888097763062, + "learning_rate": 4.544227537630949e-05, + "loss": 0.07826550960540772, + "step": 5100 + }, + { + "epoch": 0.18592012585362366, + "grad_norm": 1.3171430826187134, + "learning_rate": 4.5352890700418324e-05, + "loss": 0.07463918685913086, + "step": 5200 + }, + { + "epoch": 0.18949551288927027, + "grad_norm": 1.445436716079712, + "learning_rate": 4.526350602452716e-05, + "loss": 0.07105834484100342, + "step": 5300 + }, + { + "epoch": 0.19307089992491688, + "grad_norm": 1.4572588205337524, + "learning_rate": 4.517412134863599e-05, + "loss": 0.07838897705078125, + "step": 5400 + }, + { 
+ "epoch": 0.1966462869605635, + "grad_norm": 0.940371572971344, + "learning_rate": 4.508473667274482e-05, + "loss": 0.07499915599822998, + "step": 5500 + }, + { + "epoch": 0.2002216739962101, + "grad_norm": 0.6899816393852234, + "learning_rate": 4.499535199685366e-05, + "loss": 0.07030135154724121, + "step": 5600 + }, + { + "epoch": 0.20379706103185669, + "grad_norm": 1.0485793352127075, + "learning_rate": 4.490596732096249e-05, + "loss": 0.07988662719726562, + "step": 5700 + }, + { + "epoch": 0.2073724480675033, + "grad_norm": 1.5643068552017212, + "learning_rate": 4.481658264507133e-05, + "loss": 0.06960040092468261, + "step": 5800 + }, + { + "epoch": 0.2109478351031499, + "grad_norm": 0.5016098022460938, + "learning_rate": 4.4727197969180166e-05, + "loss": 0.07297846794128418, + "step": 5900 + }, + { + "epoch": 0.21452322213879652, + "grad_norm": 1.0210011005401611, + "learning_rate": 4.4637813293289e-05, + "loss": 0.07059600353240966, + "step": 6000 + }, + { + "epoch": 0.21809860917444313, + "grad_norm": 1.0845718383789062, + "learning_rate": 4.4548428617397835e-05, + "loss": 0.06686034202575683, + "step": 6100 + }, + { + "epoch": 0.22167399621008974, + "grad_norm": 0.8528701663017273, + "learning_rate": 4.445904394150667e-05, + "loss": 0.06841277122497559, + "step": 6200 + }, + { + "epoch": 0.22524938324573635, + "grad_norm": 1.8847771883010864, + "learning_rate": 4.43696592656155e-05, + "loss": 0.0757840919494629, + "step": 6300 + }, + { + "epoch": 0.22882477028138296, + "grad_norm": 2.079796075820923, + "learning_rate": 4.428027458972434e-05, + "loss": 0.06774754524230957, + "step": 6400 + }, + { + "epoch": 0.23240015731702957, + "grad_norm": 1.0023269653320312, + "learning_rate": 4.419088991383317e-05, + "loss": 0.07408513069152832, + "step": 6500 + }, + { + "epoch": 0.23597554435267618, + "grad_norm": 1.2481714487075806, + "learning_rate": 4.410150523794201e-05, + "loss": 0.07167030811309814, + "step": 6600 + }, + { + "epoch": 0.23955093138832279, + 
"grad_norm": 1.2812612056732178, + "learning_rate": 4.4012120562050846e-05, + "loss": 0.07096508502960205, + "step": 6700 + }, + { + "epoch": 0.2431263184239694, + "grad_norm": 1.1778594255447388, + "learning_rate": 4.392273588615968e-05, + "loss": 0.06785487651824951, + "step": 6800 + }, + { + "epoch": 0.246701705459616, + "grad_norm": 1.0923346281051636, + "learning_rate": 4.3833351210268515e-05, + "loss": 0.07081903457641602, + "step": 6900 + }, + { + "epoch": 0.2502770924952626, + "grad_norm": 0.9252088069915771, + "learning_rate": 4.374396653437735e-05, + "loss": 0.0647373390197754, + "step": 7000 + }, + { + "epoch": 0.2538524795309092, + "grad_norm": 2.192573070526123, + "learning_rate": 4.365458185848618e-05, + "loss": 0.0676526165008545, + "step": 7100 + }, + { + "epoch": 0.2574278665665558, + "grad_norm": 1.6381704807281494, + "learning_rate": 4.356519718259502e-05, + "loss": 0.07402976512908936, + "step": 7200 + }, + { + "epoch": 0.26100325360220245, + "grad_norm": 0.9283214807510376, + "learning_rate": 4.347581250670385e-05, + "loss": 0.06920474052429199, + "step": 7300 + }, + { + "epoch": 0.264578640637849, + "grad_norm": 0.8774147033691406, + "learning_rate": 4.338642783081269e-05, + "loss": 0.07280929565429688, + "step": 7400 + }, + { + "epoch": 0.26815402767349567, + "grad_norm": 1.8515883684158325, + "learning_rate": 4.3297043154921526e-05, + "loss": 0.07380086898803712, + "step": 7500 + }, + { + "epoch": 0.26815402767349567, + "eval_accuracy": 0.9785046625741701, + "eval_f1": 0.8438232328500399, + "eval_loss": 0.08568981289863586, + "eval_precision": 0.8172658575681245, + "eval_recall": 0.8721645631486645, + "eval_runtime": 27.0275, + "eval_samples_per_second": 832.487, + "eval_steps_per_second": 23.125, + "step": 7500 + }, + { + "epoch": 0.27172941470914225, + "grad_norm": 0.7200068235397339, + "learning_rate": 4.320765847903036e-05, + "loss": 0.0730604887008667, + "step": 7600 + }, + { + "epoch": 0.2753048017447889, + "grad_norm": 
0.915267288684845, + "learning_rate": 4.3118273803139195e-05, + "loss": 0.07050428867340088, + "step": 7700 + }, + { + "epoch": 0.27888018878043547, + "grad_norm": 0.7131528854370117, + "learning_rate": 4.3028889127248025e-05, + "loss": 0.0710810136795044, + "step": 7800 + }, + { + "epoch": 0.2824555758160821, + "grad_norm": 1.1002038717269897, + "learning_rate": 4.293950445135686e-05, + "loss": 0.07342493057250976, + "step": 7900 + }, + { + "epoch": 0.2860309628517287, + "grad_norm": 0.9407269954681396, + "learning_rate": 4.2850119775465694e-05, + "loss": 0.0673301601409912, + "step": 8000 + }, + { + "epoch": 0.2896063498873753, + "grad_norm": 2.832193613052368, + "learning_rate": 4.276073509957453e-05, + "loss": 0.06240209102630615, + "step": 8100 + }, + { + "epoch": 0.2931817369230219, + "grad_norm": 0.8768466114997864, + "learning_rate": 4.267135042368336e-05, + "loss": 0.06878421783447265, + "step": 8200 + }, + { + "epoch": 0.29675712395866854, + "grad_norm": 2.6219418048858643, + "learning_rate": 4.25819657477922e-05, + "loss": 0.06775379657745362, + "step": 8300 + }, + { + "epoch": 0.3003325109943151, + "grad_norm": 1.4696264266967773, + "learning_rate": 4.249258107190103e-05, + "loss": 0.06918183803558349, + "step": 8400 + }, + { + "epoch": 0.30390789802996176, + "grad_norm": 0.3726998269557953, + "learning_rate": 4.240319639600987e-05, + "loss": 0.0662617588043213, + "step": 8500 + }, + { + "epoch": 0.30748328506560835, + "grad_norm": 0.7445316314697266, + "learning_rate": 4.2313811720118706e-05, + "loss": 0.06442654609680176, + "step": 8600 + }, + { + "epoch": 0.311058672101255, + "grad_norm": 1.971909761428833, + "learning_rate": 4.2224427044227536e-05, + "loss": 0.0726364278793335, + "step": 8700 + }, + { + "epoch": 0.31463405913690157, + "grad_norm": 1.5563815832138062, + "learning_rate": 4.2135042368336374e-05, + "loss": 0.06712177753448487, + "step": 8800 + }, + { + "epoch": 0.3182094461725482, + "grad_norm": 0.7900974154472351, + "learning_rate": 
4.204565769244521e-05, + "loss": 0.058818936347961426, + "step": 8900 + }, + { + "epoch": 0.3217848332081948, + "grad_norm": 1.3865457773208618, + "learning_rate": 4.195627301655404e-05, + "loss": 0.06370719909667968, + "step": 9000 + }, + { + "epoch": 0.32536022024384137, + "grad_norm": 0.34235015511512756, + "learning_rate": 4.186688834066288e-05, + "loss": 0.06904962062835693, + "step": 9100 + }, + { + "epoch": 0.328935607279488, + "grad_norm": 2.1909384727478027, + "learning_rate": 4.177750366477171e-05, + "loss": 0.06057620048522949, + "step": 9200 + }, + { + "epoch": 0.3325109943151346, + "grad_norm": 1.308127760887146, + "learning_rate": 4.168811898888055e-05, + "loss": 0.06866058826446533, + "step": 9300 + }, + { + "epoch": 0.3360863813507812, + "grad_norm": 0.6863975524902344, + "learning_rate": 4.1598734312989386e-05, + "loss": 0.06358649730682372, + "step": 9400 + }, + { + "epoch": 0.3396617683864278, + "grad_norm": 1.1869947910308838, + "learning_rate": 4.1509349637098216e-05, + "loss": 0.06475292205810547, + "step": 9500 + }, + { + "epoch": 0.34323715542207445, + "grad_norm": 1.4386121034622192, + "learning_rate": 4.1419964961207054e-05, + "loss": 0.06661314010620117, + "step": 9600 + }, + { + "epoch": 0.34681254245772103, + "grad_norm": 0.48181113600730896, + "learning_rate": 4.133058028531589e-05, + "loss": 0.060897083282470704, + "step": 9700 + }, + { + "epoch": 0.35038792949336767, + "grad_norm": 0.8885261416435242, + "learning_rate": 4.124119560942472e-05, + "loss": 0.06239647388458252, + "step": 9800 + }, + { + "epoch": 0.35396331652901425, + "grad_norm": 1.2147257328033447, + "learning_rate": 4.115181093353356e-05, + "loss": 0.06007009029388428, + "step": 9900 + }, + { + "epoch": 0.3575387035646609, + "grad_norm": 3.1831276416778564, + "learning_rate": 4.106242625764239e-05, + "loss": 0.06108261108398438, + "step": 10000 + }, + { + "epoch": 0.3575387035646609, + "eval_accuracy": 0.9812439807847978, + "eval_f1": 0.8498944390638173, + "eval_loss": 
0.07966496795415878, + "eval_precision": 0.8246859491839411, + "eval_recall": 0.8766926372078314, + "eval_runtime": 27.6584, + "eval_samples_per_second": 813.495, + "eval_steps_per_second": 22.597, + "step": 10000 + }, + { + "epoch": 0.36111409060030747, + "grad_norm": 1.2144405841827393, + "learning_rate": 4.097304158175123e-05, + "loss": 0.06152146816253662, + "step": 10100 + }, + { + "epoch": 0.3646894776359541, + "grad_norm": 0.777988076210022, + "learning_rate": 4.0883656905860066e-05, + "loss": 0.06342405319213867, + "step": 10200 + }, + { + "epoch": 0.3682648646716007, + "grad_norm": 0.6419842839241028, + "learning_rate": 4.0794272229968896e-05, + "loss": 0.055976643562316894, + "step": 10300 + }, + { + "epoch": 0.3718402517072473, + "grad_norm": 0.45166343450546265, + "learning_rate": 4.0704887554077734e-05, + "loss": 0.07191664695739747, + "step": 10400 + }, + { + "epoch": 0.3754156387428939, + "grad_norm": 0.5005468726158142, + "learning_rate": 4.0615502878186565e-05, + "loss": 0.06205938339233399, + "step": 10500 + }, + { + "epoch": 0.37899102577854055, + "grad_norm": 0.6201260089874268, + "learning_rate": 4.0526118202295396e-05, + "loss": 0.061759543418884275, + "step": 10600 + }, + { + "epoch": 0.38256641281418713, + "grad_norm": 0.4341242015361786, + "learning_rate": 4.043673352640423e-05, + "loss": 0.06618201732635498, + "step": 10700 + }, + { + "epoch": 0.38614179984983377, + "grad_norm": 0.4003482460975647, + "learning_rate": 4.034734885051307e-05, + "loss": 0.06178065299987793, + "step": 10800 + }, + { + "epoch": 0.38971718688548035, + "grad_norm": 1.0296162366867065, + "learning_rate": 4.02579641746219e-05, + "loss": 0.06249929904937744, + "step": 10900 + }, + { + "epoch": 0.393292573921127, + "grad_norm": 1.362121820449829, + "learning_rate": 4.016857949873074e-05, + "loss": 0.05500233173370361, + "step": 11000 + }, + { + "epoch": 0.39686796095677357, + "grad_norm": 0.7699733376502991, + "learning_rate": 4.007919482283957e-05, + "loss": 
0.060595006942749025, + "step": 11100 + }, + { + "epoch": 0.4004433479924202, + "grad_norm": 1.3927844762802124, + "learning_rate": 3.998981014694841e-05, + "loss": 0.05860278129577637, + "step": 11200 + }, + { + "epoch": 0.4040187350280668, + "grad_norm": 0.5842928290367126, + "learning_rate": 3.9900425471057245e-05, + "loss": 0.062330193519592285, + "step": 11300 + }, + { + "epoch": 0.40759412206371337, + "grad_norm": 1.231602430343628, + "learning_rate": 3.9811040795166076e-05, + "loss": 0.05743512153625488, + "step": 11400 + }, + { + "epoch": 0.41116950909936, + "grad_norm": 0.33235710859298706, + "learning_rate": 3.972165611927491e-05, + "loss": 0.059948296546936036, + "step": 11500 + }, + { + "epoch": 0.4147448961350066, + "grad_norm": 0.812560498714447, + "learning_rate": 3.963227144338375e-05, + "loss": 0.06013148784637451, + "step": 11600 + }, + { + "epoch": 0.41832028317065323, + "grad_norm": 0.7160065174102783, + "learning_rate": 3.954288676749258e-05, + "loss": 0.0654984951019287, + "step": 11700 + }, + { + "epoch": 0.4218956702062998, + "grad_norm": 0.959859311580658, + "learning_rate": 3.945350209160142e-05, + "loss": 0.061361746788024904, + "step": 11800 + }, + { + "epoch": 0.42547105724194645, + "grad_norm": 0.661882758140564, + "learning_rate": 3.936411741571025e-05, + "loss": 0.05800935268402099, + "step": 11900 + }, + { + "epoch": 0.42904644427759303, + "grad_norm": 1.3494808673858643, + "learning_rate": 3.927473273981909e-05, + "loss": 0.058743157386779786, + "step": 12000 + }, + { + "epoch": 0.43262183131323967, + "grad_norm": 0.3964793384075165, + "learning_rate": 3.9185348063927925e-05, + "loss": 0.05860978603363037, + "step": 12100 + }, + { + "epoch": 0.43619721834888625, + "grad_norm": 0.6984548568725586, + "learning_rate": 3.9095963388036756e-05, + "loss": 0.05355045795440674, + "step": 12200 + }, + { + "epoch": 0.4397726053845329, + "grad_norm": 0.9193189144134521, + "learning_rate": 3.900657871214559e-05, + "loss": 0.05985400676727295, + 
"step": 12300 + }, + { + "epoch": 0.44334799242017947, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.891719403625443e-05, + "loss": 0.06027592182159424, + "step": 12400 + }, + { + "epoch": 0.4469233794558261, + "grad_norm": 2.280050754547119, + "learning_rate": 3.882780936036326e-05, + "loss": 0.05881267070770264, + "step": 12500 + }, + { + "epoch": 0.4469233794558261, + "eval_accuracy": 0.9822486078551317, + "eval_f1": 0.8582031250000001, + "eval_loss": 0.07316970080137253, + "eval_precision": 0.8336179093151205, + "eval_recall": 0.884282551821292, + "eval_runtime": 27.7811, + "eval_samples_per_second": 809.904, + "eval_steps_per_second": 22.497, + "step": 12500 + }, + { + "epoch": 0.4504987664914727, + "grad_norm": 0.9513980746269226, + "learning_rate": 3.87384246844721e-05, + "loss": 0.05991718769073486, + "step": 12600 + }, + { + "epoch": 0.45407415352711933, + "grad_norm": 0.8513447046279907, + "learning_rate": 3.8649040008580937e-05, + "loss": 0.059436683654785154, + "step": 12700 + }, + { + "epoch": 0.4576495405627659, + "grad_norm": 3.6959080696105957, + "learning_rate": 3.855965533268977e-05, + "loss": 0.06146327018737793, + "step": 12800 + }, + { + "epoch": 0.46122492759841255, + "grad_norm": 0.4215289056301117, + "learning_rate": 3.8470270656798605e-05, + "loss": 0.051028499603271486, + "step": 12900 + }, + { + "epoch": 0.46480031463405913, + "grad_norm": 0.553249716758728, + "learning_rate": 3.8380885980907436e-05, + "loss": 0.05888910293579101, + "step": 13000 + }, + { + "epoch": 0.46837570166970577, + "grad_norm": 0.534638524055481, + "learning_rate": 3.8291501305016266e-05, + "loss": 0.056477956771850586, + "step": 13100 + }, + { + "epoch": 0.47195108870535235, + "grad_norm": 0.5859609842300415, + "learning_rate": 3.8202116629125104e-05, + "loss": 0.05791654109954834, + "step": 13200 + }, + { + "epoch": 0.475526475740999, + "grad_norm": 0.6610586047172546, + "learning_rate": 3.8112731953233935e-05, + "loss": 0.05362565040588379, + "step": 
13300 + }, + { + "epoch": 0.47910186277664557, + "grad_norm": 0.6048291325569153, + "learning_rate": 3.802334727734277e-05, + "loss": 0.057788271903991696, + "step": 13400 + }, + { + "epoch": 0.4826772498122922, + "grad_norm": 0.7319697141647339, + "learning_rate": 3.793396260145161e-05, + "loss": 0.05477115631103516, + "step": 13500 + }, + { + "epoch": 0.4862526368479388, + "grad_norm": 0.5771811008453369, + "learning_rate": 3.784457792556044e-05, + "loss": 0.059410476684570314, + "step": 13600 + }, + { + "epoch": 0.4898280238835854, + "grad_norm": 1.9499260187149048, + "learning_rate": 3.775519324966928e-05, + "loss": 0.052494893074035646, + "step": 13700 + }, + { + "epoch": 0.493403410919232, + "grad_norm": 0.8795179128646851, + "learning_rate": 3.766580857377811e-05, + "loss": 0.05528387546539307, + "step": 13800 + }, + { + "epoch": 0.4969787979548786, + "grad_norm": 0.5892202258110046, + "learning_rate": 3.7576423897886947e-05, + "loss": 0.05600544452667236, + "step": 13900 + }, + { + "epoch": 0.5005541849905252, + "grad_norm": 0.6402941346168518, + "learning_rate": 3.7487039221995784e-05, + "loss": 0.05357628345489502, + "step": 14000 + }, + { + "epoch": 0.5041295720261718, + "grad_norm": 0.5255988836288452, + "learning_rate": 3.7397654546104615e-05, + "loss": 0.057103352546691896, + "step": 14100 + }, + { + "epoch": 0.5077049590618185, + "grad_norm": 0.8301808834075928, + "learning_rate": 3.730826987021345e-05, + "loss": 0.0532010555267334, + "step": 14200 + }, + { + "epoch": 0.5112803460974651, + "grad_norm": 0.6901052594184875, + "learning_rate": 3.721888519432229e-05, + "loss": 0.05516294002532959, + "step": 14300 + }, + { + "epoch": 0.5148557331331116, + "grad_norm": 0.9628658890724182, + "learning_rate": 3.712950051843112e-05, + "loss": 0.06214995384216308, + "step": 14400 + }, + { + "epoch": 0.5184311201687583, + "grad_norm": 1.3679792881011963, + "learning_rate": 3.704011584253996e-05, + "loss": 0.05541347503662109, + "step": 14500 + }, + { + "epoch": 
0.5220065072044049, + "grad_norm": 0.23267334699630737, + "learning_rate": 3.695073116664879e-05, + "loss": 0.0589248275756836, + "step": 14600 + }, + { + "epoch": 0.5255818942400515, + "grad_norm": 0.6239579319953918, + "learning_rate": 3.6861346490757627e-05, + "loss": 0.053284521102905276, + "step": 14700 + }, + { + "epoch": 0.529157281275698, + "grad_norm": 0.7674051523208618, + "learning_rate": 3.6771961814866464e-05, + "loss": 0.05738714218139648, + "step": 14800 + }, + { + "epoch": 0.5327326683113447, + "grad_norm": 0.8594136834144592, + "learning_rate": 3.6682577138975295e-05, + "loss": 0.055074062347412106, + "step": 14900 + }, + { + "epoch": 0.5363080553469913, + "grad_norm": 1.3505005836486816, + "learning_rate": 3.659319246308413e-05, + "loss": 0.05417671680450439, + "step": 15000 + }, + { + "epoch": 0.5363080553469913, + "eval_accuracy": 0.9838496993745539, + "eval_f1": 0.8737497800457504, + "eval_loss": 0.06651480495929718, + "eval_precision": 0.8560178736432719, + "eval_recall": 0.8922318373918293, + "eval_runtime": 27.3392, + "eval_samples_per_second": 822.994, + "eval_steps_per_second": 22.861, + "step": 15000 + }, + { + "epoch": 0.539883442382638, + "grad_norm": 0.7868797779083252, + "learning_rate": 3.650380778719297e-05, + "loss": 0.060230064392089847, + "step": 15100 + }, + { + "epoch": 0.5434588294182845, + "grad_norm": 0.3154486119747162, + "learning_rate": 3.64144231113018e-05, + "loss": 0.05918198108673096, + "step": 15200 + }, + { + "epoch": 0.5470342164539311, + "grad_norm": 0.5093942284584045, + "learning_rate": 3.632503843541064e-05, + "loss": 0.05554147720336914, + "step": 15300 + }, + { + "epoch": 0.5506096034895778, + "grad_norm": 1.080651044845581, + "learning_rate": 3.623565375951947e-05, + "loss": 0.05167547702789307, + "step": 15400 + }, + { + "epoch": 0.5541849905252244, + "grad_norm": 1.2834564447402954, + "learning_rate": 3.614626908362831e-05, + "loss": 0.05269266128540039, + "step": 15500 + }, + { + "epoch": 
0.5577603775608709, + "grad_norm": 0.9456666707992554, + "learning_rate": 3.605688440773714e-05, + "loss": 0.05228121280670166, + "step": 15600 + }, + { + "epoch": 0.5613357645965176, + "grad_norm": 1.931270718574524, + "learning_rate": 3.5967499731845975e-05, + "loss": 0.05532039642333984, + "step": 15700 + }, + { + "epoch": 0.5649111516321642, + "grad_norm": 1.9416167736053467, + "learning_rate": 3.5878115055954806e-05, + "loss": 0.05132888793945312, + "step": 15800 + }, + { + "epoch": 0.5684865386678107, + "grad_norm": 0.2992418110370636, + "learning_rate": 3.578873038006364e-05, + "loss": 0.05806799411773682, + "step": 15900 + }, + { + "epoch": 0.5720619257034574, + "grad_norm": 0.7173650860786438, + "learning_rate": 3.5699345704172474e-05, + "loss": 0.05833985805511475, + "step": 16000 + }, + { + "epoch": 0.575637312739104, + "grad_norm": 1.0283321142196655, + "learning_rate": 3.560996102828131e-05, + "loss": 0.05651096820831299, + "step": 16100 + }, + { + "epoch": 0.5792126997747507, + "grad_norm": 0.43172529339790344, + "learning_rate": 3.552057635239015e-05, + "loss": 0.05330658435821533, + "step": 16200 + }, + { + "epoch": 0.5827880868103972, + "grad_norm": 0.6333898901939392, + "learning_rate": 3.543119167649898e-05, + "loss": 0.053462224006652834, + "step": 16300 + }, + { + "epoch": 0.5863634738460438, + "grad_norm": 0.8817270994186401, + "learning_rate": 3.534180700060782e-05, + "loss": 0.05549070358276367, + "step": 16400 + }, + { + "epoch": 0.5899388608816905, + "grad_norm": 4.280094146728516, + "learning_rate": 3.525242232471665e-05, + "loss": 0.05985762119293213, + "step": 16500 + }, + { + "epoch": 0.5935142479173371, + "grad_norm": 0.62297523021698, + "learning_rate": 3.5163037648825486e-05, + "loss": 0.05666534423828125, + "step": 16600 + }, + { + "epoch": 0.5970896349529836, + "grad_norm": 0.29738688468933105, + "learning_rate": 3.507365297293432e-05, + "loss": 0.053336749076843264, + "step": 16700 + }, + { + "epoch": 0.6006650219886303, + 
"grad_norm": 1.139436960220337, + "learning_rate": 3.4984268297043154e-05, + "loss": 0.05532379150390625, + "step": 16800 + }, + { + "epoch": 0.6042404090242769, + "grad_norm": 0.37320244312286377, + "learning_rate": 3.489488362115199e-05, + "loss": 0.05435383796691894, + "step": 16900 + }, + { + "epoch": 0.6078157960599235, + "grad_norm": 0.5908817648887634, + "learning_rate": 3.480549894526083e-05, + "loss": 0.052842388153076174, + "step": 17000 + }, + { + "epoch": 0.6113911830955701, + "grad_norm": 0.4973529279232025, + "learning_rate": 3.471611426936966e-05, + "loss": 0.05500569343566895, + "step": 17100 + }, + { + "epoch": 0.6149665701312167, + "grad_norm": 1.438362717628479, + "learning_rate": 3.46267295934785e-05, + "loss": 0.04875383853912354, + "step": 17200 + }, + { + "epoch": 0.6185419571668633, + "grad_norm": 1.1460702419281006, + "learning_rate": 3.4537344917587335e-05, + "loss": 0.05489758968353271, + "step": 17300 + }, + { + "epoch": 0.62211734420251, + "grad_norm": 0.359030157327652, + "learning_rate": 3.4447960241696166e-05, + "loss": 0.0537039852142334, + "step": 17400 + }, + { + "epoch": 0.6256927312381565, + "grad_norm": 1.0160428285598755, + "learning_rate": 3.4358575565805e-05, + "loss": 0.05569758415222168, + "step": 17500 + }, + { + "epoch": 0.6256927312381565, + "eval_accuracy": 0.9845403147375635, + "eval_f1": 0.877477096546864, + "eval_loss": 0.0613168403506279, + "eval_precision": 0.8606879199270053, + "eval_recall": 0.8949343069890464, + "eval_runtime": 27.832, + "eval_samples_per_second": 808.423, + "eval_steps_per_second": 22.456, + "step": 17500 + }, + { + "epoch": 0.6292681182738031, + "grad_norm": 0.9637561440467834, + "learning_rate": 3.4269190889913834e-05, + "loss": 0.05049953460693359, + "step": 17600 + }, + { + "epoch": 0.6328435053094498, + "grad_norm": 0.4047839343547821, + "learning_rate": 3.417980621402267e-05, + "loss": 0.051105165481567384, + "step": 17700 + }, + { + "epoch": 0.6364188923450964, + "grad_norm": 
0.5562448501586914, + "learning_rate": 3.409042153813151e-05, + "loss": 0.04887496471405029, + "step": 17800 + }, + { + "epoch": 0.6399942793807429, + "grad_norm": 0.7675971984863281, + "learning_rate": 3.400103686224034e-05, + "loss": 0.05429211139678955, + "step": 17900 + }, + { + "epoch": 0.6435696664163896, + "grad_norm": 0.44871142506599426, + "learning_rate": 3.391165218634918e-05, + "loss": 0.04755040645599365, + "step": 18000 + }, + { + "epoch": 0.6471450534520362, + "grad_norm": 0.4453502297401428, + "learning_rate": 3.382226751045801e-05, + "loss": 0.05987214088439941, + "step": 18100 + }, + { + "epoch": 0.6507204404876827, + "grad_norm": 0.4004403352737427, + "learning_rate": 3.373288283456684e-05, + "loss": 0.054094972610473635, + "step": 18200 + }, + { + "epoch": 0.6542958275233294, + "grad_norm": 0.8362923264503479, + "learning_rate": 3.364349815867568e-05, + "loss": 0.04843898296356201, + "step": 18300 + }, + { + "epoch": 0.657871214558976, + "grad_norm": 0.6269751787185669, + "learning_rate": 3.355411348278451e-05, + "loss": 0.05007925033569336, + "step": 18400 + }, + { + "epoch": 0.6614466015946227, + "grad_norm": 0.7181591987609863, + "learning_rate": 3.3464728806893345e-05, + "loss": 0.05742511749267578, + "step": 18500 + }, + { + "epoch": 0.6650219886302692, + "grad_norm": 2.8255951404571533, + "learning_rate": 3.337534413100218e-05, + "loss": 0.050363807678222655, + "step": 18600 + }, + { + "epoch": 0.6685973756659158, + "grad_norm": 1.1854428052902222, + "learning_rate": 3.328595945511101e-05, + "loss": 0.05580689430236816, + "step": 18700 + }, + { + "epoch": 0.6721727627015625, + "grad_norm": 0.3564029335975647, + "learning_rate": 3.319657477921985e-05, + "loss": 0.04986191749572754, + "step": 18800 + }, + { + "epoch": 0.6757481497372091, + "grad_norm": 0.9392517805099487, + "learning_rate": 3.310719010332869e-05, + "loss": 0.05029686450958252, + "step": 18900 + }, + { + "epoch": 0.6793235367728556, + "grad_norm": 0.9811071157455444, + 
"learning_rate": 3.301780542743752e-05, + "loss": 0.05468404293060303, + "step": 19000 + }, + { + "epoch": 0.6828989238085023, + "grad_norm": 2.1979386806488037, + "learning_rate": 3.292842075154636e-05, + "loss": 0.04795463562011719, + "step": 19100 + }, + { + "epoch": 0.6864743108441489, + "grad_norm": 4.135185241699219, + "learning_rate": 3.2839036075655194e-05, + "loss": 0.051746668815612795, + "step": 19200 + }, + { + "epoch": 0.6900496978797955, + "grad_norm": 0.611629843711853, + "learning_rate": 3.2749651399764025e-05, + "loss": 0.05136622428894043, + "step": 19300 + }, + { + "epoch": 0.6936250849154421, + "grad_norm": 0.7905089259147644, + "learning_rate": 3.266026672387286e-05, + "loss": 0.0534757661819458, + "step": 19400 + }, + { + "epoch": 0.6972004719510887, + "grad_norm": 0.3704472482204437, + "learning_rate": 3.257088204798169e-05, + "loss": 0.05190816879272461, + "step": 19500 + }, + { + "epoch": 0.7007758589867353, + "grad_norm": 0.41257503628730774, + "learning_rate": 3.248149737209053e-05, + "loss": 0.05314404487609863, + "step": 19600 + }, + { + "epoch": 0.704351246022382, + "grad_norm": 1.0130038261413574, + "learning_rate": 3.239211269619937e-05, + "loss": 0.051221070289611814, + "step": 19700 + }, + { + "epoch": 0.7079266330580285, + "grad_norm": 0.44306495785713196, + "learning_rate": 3.23027280203082e-05, + "loss": 0.05151443004608154, + "step": 19800 + }, + { + "epoch": 0.7115020200936751, + "grad_norm": 1.3375622034072876, + "learning_rate": 3.221334334441704e-05, + "loss": 0.051753206253051756, + "step": 19900 + }, + { + "epoch": 0.7150774071293218, + "grad_norm": 0.48512154817581177, + "learning_rate": 3.2123958668525874e-05, + "loss": 0.04863485813140869, + "step": 20000 + }, + { + "epoch": 0.7150774071293218, + "eval_accuracy": 0.9851494027728759, + "eval_f1": 0.8755350929603205, + "eval_loss": 0.05904531106352806, + "eval_precision": 0.856669280182671, + "eval_recall": 0.8952505534312739, + "eval_runtime": 27.4851, + 
"eval_samples_per_second": 818.624, + "eval_steps_per_second": 22.74, + "step": 20000 + }, + { + "epoch": 0.7186527941649684, + "grad_norm": 0.45322614908218384, + "learning_rate": 3.2034573992634705e-05, + "loss": 0.05499778270721436, + "step": 20100 + }, + { + "epoch": 0.7222281812006149, + "grad_norm": 0.4665698707103729, + "learning_rate": 3.194518931674354e-05, + "loss": 0.05100120544433594, + "step": 20200 + }, + { + "epoch": 0.7258035682362616, + "grad_norm": 0.7074053883552551, + "learning_rate": 3.185580464085237e-05, + "loss": 0.04919565200805664, + "step": 20300 + }, + { + "epoch": 0.7293789552719082, + "grad_norm": 1.2581121921539307, + "learning_rate": 3.176641996496121e-05, + "loss": 0.05387771606445312, + "step": 20400 + }, + { + "epoch": 0.7329543423075547, + "grad_norm": 0.3161942660808563, + "learning_rate": 3.167703528907004e-05, + "loss": 0.04680909633636474, + "step": 20500 + }, + { + "epoch": 0.7365297293432014, + "grad_norm": 0.8641468286514282, + "learning_rate": 3.158765061317888e-05, + "loss": 0.04961400508880615, + "step": 20600 + }, + { + "epoch": 0.740105116378848, + "grad_norm": 0.6563690304756165, + "learning_rate": 3.149826593728771e-05, + "loss": 0.05145148754119873, + "step": 20700 + }, + { + "epoch": 0.7436805034144947, + "grad_norm": 0.3394390940666199, + "learning_rate": 3.140888126139655e-05, + "loss": 0.048502054214477536, + "step": 20800 + }, + { + "epoch": 0.7472558904501412, + "grad_norm": 0.5382287502288818, + "learning_rate": 3.131949658550538e-05, + "loss": 0.052634720802307126, + "step": 20900 + }, + { + "epoch": 0.7508312774857878, + "grad_norm": 0.5506078004837036, + "learning_rate": 3.1230111909614216e-05, + "loss": 0.05615939140319824, + "step": 21000 + }, + { + "epoch": 0.7544066645214345, + "grad_norm": 0.4533487558364868, + "learning_rate": 3.114072723372305e-05, + "loss": 0.0571517276763916, + "step": 21100 + }, + { + "epoch": 0.7579820515570811, + "grad_norm": 1.2659982442855835, + "learning_rate": 
3.1051342557831884e-05, + "loss": 0.05127411842346191, + "step": 21200 + }, + { + "epoch": 0.7615574385927276, + "grad_norm": 0.38378211855888367, + "learning_rate": 3.096195788194072e-05, + "loss": 0.04847681522369385, + "step": 21300 + }, + { + "epoch": 0.7651328256283743, + "grad_norm": 0.2992658317089081, + "learning_rate": 3.087257320604955e-05, + "loss": 0.05205928325653076, + "step": 21400 + }, + { + "epoch": 0.7687082126640209, + "grad_norm": 0.5818284749984741, + "learning_rate": 3.078318853015839e-05, + "loss": 0.04922466278076172, + "step": 21500 + }, + { + "epoch": 0.7722835996996675, + "grad_norm": 0.41028082370758057, + "learning_rate": 3.069380385426723e-05, + "loss": 0.04695847034454346, + "step": 21600 + }, + { + "epoch": 0.7758589867353141, + "grad_norm": 0.31596678495407104, + "learning_rate": 3.060441917837606e-05, + "loss": 0.049401440620422364, + "step": 21700 + }, + { + "epoch": 0.7794343737709607, + "grad_norm": 0.39899763464927673, + "learning_rate": 3.0515034502484896e-05, + "loss": 0.0458904504776001, + "step": 21800 + }, + { + "epoch": 0.7830097608066073, + "grad_norm": 4.016449928283691, + "learning_rate": 3.0425649826593733e-05, + "loss": 0.04808720588684082, + "step": 21900 + }, + { + "epoch": 0.786585147842254, + "grad_norm": 1.8184044361114502, + "learning_rate": 3.0336265150702564e-05, + "loss": 0.050203371047973636, + "step": 22000 + }, + { + "epoch": 0.7901605348779005, + "grad_norm": 0.47340500354766846, + "learning_rate": 3.0246880474811402e-05, + "loss": 0.04804760932922363, + "step": 22100 + }, + { + "epoch": 0.7937359219135471, + "grad_norm": 1.306254506111145, + "learning_rate": 3.0157495798920233e-05, + "loss": 0.04765232563018799, + "step": 22200 + }, + { + "epoch": 0.7973113089491938, + "grad_norm": 0.6133173704147339, + "learning_rate": 3.006811112302907e-05, + "loss": 0.04909511566162109, + "step": 22300 + }, + { + "epoch": 0.8008866959848404, + "grad_norm": 1.063022494316101, + "learning_rate": 2.9978726447137904e-05, 
+ "loss": 0.048132557868957516, + "step": 22400 + }, + { + "epoch": 0.8044620830204869, + "grad_norm": 0.4442903697490692, + "learning_rate": 2.988934177124674e-05, + "loss": 0.04739914894104004, + "step": 22500 + }, + { + "epoch": 0.8044620830204869, + "eval_accuracy": 0.9853803092042249, + "eval_f1": 0.8812850838481906, + "eval_loss": 0.060121480375528336, + "eval_precision": 0.8660403280645027, + "eval_recall": 0.8970761578932237, + "eval_runtime": 27.7438, + "eval_samples_per_second": 810.991, + "eval_steps_per_second": 22.528, + "step": 22500 + }, + { + "epoch": 0.8080374700561336, + "grad_norm": 0.8813098073005676, + "learning_rate": 2.9799957095355573e-05, + "loss": 0.05161878108978271, + "step": 22600 + }, + { + "epoch": 0.8116128570917802, + "grad_norm": 0.7460477948188782, + "learning_rate": 2.971057241946441e-05, + "loss": 0.0515793514251709, + "step": 22700 + }, + { + "epoch": 0.8151882441274267, + "grad_norm": 0.5062021613121033, + "learning_rate": 2.962118774357324e-05, + "loss": 0.04754622936248779, + "step": 22800 + }, + { + "epoch": 0.8187636311630734, + "grad_norm": 0.7567230463027954, + "learning_rate": 2.953180306768208e-05, + "loss": 0.05149875164031983, + "step": 22900 + }, + { + "epoch": 0.82233901819872, + "grad_norm": 0.7439789772033691, + "learning_rate": 2.944241839179091e-05, + "loss": 0.04974982738494873, + "step": 23000 + }, + { + "epoch": 0.8259144052343667, + "grad_norm": 0.669979453086853, + "learning_rate": 2.9353033715899747e-05, + "loss": 0.04604334354400635, + "step": 23100 + }, + { + "epoch": 0.8294897922700132, + "grad_norm": 1.005071759223938, + "learning_rate": 2.9263649040008584e-05, + "loss": 0.04706980228424072, + "step": 23200 + }, + { + "epoch": 0.8330651793056598, + "grad_norm": 0.31772536039352417, + "learning_rate": 2.9174264364117415e-05, + "loss": 0.05056349754333496, + "step": 23300 + }, + { + "epoch": 0.8366405663413065, + "grad_norm": 0.32514145970344543, + "learning_rate": 2.9084879688226253e-05, + "loss": 
0.04744285106658935, + "step": 23400 + }, + { + "epoch": 0.8402159533769531, + "grad_norm": 1.0965938568115234, + "learning_rate": 2.899549501233509e-05, + "loss": 0.04618396759033203, + "step": 23500 + }, + { + "epoch": 0.8437913404125996, + "grad_norm": 0.6312568783760071, + "learning_rate": 2.890611033644392e-05, + "loss": 0.04719692230224609, + "step": 23600 + }, + { + "epoch": 0.8473667274482463, + "grad_norm": 0.5469244122505188, + "learning_rate": 2.881672566055276e-05, + "loss": 0.04657519817352295, + "step": 23700 + }, + { + "epoch": 0.8509421144838929, + "grad_norm": 0.9338961839675903, + "learning_rate": 2.8727340984661593e-05, + "loss": 0.04994749069213867, + "step": 23800 + }, + { + "epoch": 0.8545175015195395, + "grad_norm": 0.6873934268951416, + "learning_rate": 2.8637956308770423e-05, + "loss": 0.04766389846801758, + "step": 23900 + }, + { + "epoch": 0.8580928885551861, + "grad_norm": 1.3465129137039185, + "learning_rate": 2.854857163287926e-05, + "loss": 0.04612489223480225, + "step": 24000 + }, + { + "epoch": 0.8616682755908327, + "grad_norm": 0.3835633397102356, + "learning_rate": 2.8459186956988092e-05, + "loss": 0.048950729370117185, + "step": 24100 + }, + { + "epoch": 0.8652436626264793, + "grad_norm": 0.7884401082992554, + "learning_rate": 2.836980228109693e-05, + "loss": 0.046166911125183105, + "step": 24200 + }, + { + "epoch": 0.868819049662126, + "grad_norm": 0.49389323592185974, + "learning_rate": 2.8280417605205767e-05, + "loss": 0.046818752288818356, + "step": 24300 + }, + { + "epoch": 0.8723944366977725, + "grad_norm": 0.6339199542999268, + "learning_rate": 2.8191032929314598e-05, + "loss": 0.04933880805969238, + "step": 24400 + }, + { + "epoch": 0.8759698237334191, + "grad_norm": 0.5761122703552246, + "learning_rate": 2.8101648253423435e-05, + "loss": 0.044534187316894534, + "step": 24500 + }, + { + "epoch": 0.8795452107690658, + "grad_norm": 0.45685720443725586, + "learning_rate": 2.8012263577532273e-05, + "loss": 
0.051560683250427244, + "step": 24600 + }, + { + "epoch": 0.8831205978047124, + "grad_norm": 0.4419282078742981, + "learning_rate": 2.7922878901641104e-05, + "loss": 0.043671913146972656, + "step": 24700 + }, + { + "epoch": 0.8866959848403589, + "grad_norm": 0.734449028968811, + "learning_rate": 2.783349422574994e-05, + "loss": 0.05153060913085938, + "step": 24800 + }, + { + "epoch": 0.8902713718760056, + "grad_norm": 1.0401020050048828, + "learning_rate": 2.7744109549858772e-05, + "loss": 0.04694102287292481, + "step": 24900 + }, + { + "epoch": 0.8938467589116522, + "grad_norm": 0.646715521812439, + "learning_rate": 2.765472487396761e-05, + "loss": 0.054542098045349124, + "step": 25000 + }, + { + "epoch": 0.8938467589116522, + "eval_accuracy": 0.9856964248425865, + "eval_f1": 0.8835749303424683, + "eval_loss": 0.05743265897035599, + "eval_precision": 0.8674635382761534, + "eval_recall": 0.9002961216686313, + "eval_runtime": 27.4328, + "eval_samples_per_second": 820.186, + "eval_steps_per_second": 22.783, + "step": 25000 + }, + { + "epoch": 0.8974221459472987, + "grad_norm": 0.3341001570224762, + "learning_rate": 2.7565340198076444e-05, + "loss": 0.04484391689300537, + "step": 25100 + }, + { + "epoch": 0.9009975329829454, + "grad_norm": 0.700167715549469, + "learning_rate": 2.7475955522185278e-05, + "loss": 0.04423677921295166, + "step": 25200 + }, + { + "epoch": 0.904572920018592, + "grad_norm": 1.2379734516143799, + "learning_rate": 2.7386570846294112e-05, + "loss": 0.04488907337188721, + "step": 25300 + }, + { + "epoch": 0.9081483070542387, + "grad_norm": 0.4145027697086334, + "learning_rate": 2.729718617040295e-05, + "loss": 0.04520434856414795, + "step": 25400 + }, + { + "epoch": 0.9117236940898852, + "grad_norm": 0.3579607605934143, + "learning_rate": 2.720780149451178e-05, + "loss": 0.04551751613616943, + "step": 25500 + }, + { + "epoch": 0.9152990811255318, + "grad_norm": 0.5503469705581665, + "learning_rate": 2.7118416818620618e-05, + "loss": 
0.04752420425415039, + "step": 25600 + }, + { + "epoch": 0.9188744681611785, + "grad_norm": 0.41558948159217834, + "learning_rate": 2.702903214272945e-05, + "loss": 0.05269415855407715, + "step": 25700 + }, + { + "epoch": 0.9224498551968251, + "grad_norm": 1.5605533123016357, + "learning_rate": 2.6939647466838286e-05, + "loss": 0.0499528169631958, + "step": 25800 + }, + { + "epoch": 0.9260252422324716, + "grad_norm": 0.6252946853637695, + "learning_rate": 2.6850262790947124e-05, + "loss": 0.04681193351745606, + "step": 25900 + }, + { + "epoch": 0.9296006292681183, + "grad_norm": 0.4643714427947998, + "learning_rate": 2.6760878115055954e-05, + "loss": 0.04491585254669189, + "step": 26000 + }, + { + "epoch": 0.9331760163037649, + "grad_norm": 1.0552211999893188, + "learning_rate": 2.6671493439164792e-05, + "loss": 0.050134167671203614, + "step": 26100 + }, + { + "epoch": 0.9367514033394115, + "grad_norm": 0.2919712960720062, + "learning_rate": 2.6582108763273626e-05, + "loss": 0.045297045707702634, + "step": 26200 + }, + { + "epoch": 0.9403267903750581, + "grad_norm": 0.5062688589096069, + "learning_rate": 2.649272408738246e-05, + "loss": 0.04247344017028808, + "step": 26300 + }, + { + "epoch": 0.9439021774107047, + "grad_norm": 0.4406910538673401, + "learning_rate": 2.6403339411491294e-05, + "loss": 0.0437799072265625, + "step": 26400 + }, + { + "epoch": 0.9474775644463513, + "grad_norm": 0.41486886143684387, + "learning_rate": 2.6313954735600132e-05, + "loss": 0.04669870376586914, + "step": 26500 + }, + { + "epoch": 0.951052951481998, + "grad_norm": 0.6877465844154358, + "learning_rate": 2.6224570059708963e-05, + "loss": 0.04583415985107422, + "step": 26600 + }, + { + "epoch": 0.9546283385176445, + "grad_norm": 0.6501809358596802, + "learning_rate": 2.61351853838178e-05, + "loss": 0.04593777179718018, + "step": 26700 + }, + { + "epoch": 0.9582037255532911, + "grad_norm": 0.7312682271003723, + "learning_rate": 2.604580070792663e-05, + "loss": 0.050377216339111325, + 
"step": 26800 + }, + { + "epoch": 0.9617791125889378, + "grad_norm": 0.8844775557518005, + "learning_rate": 2.595641603203547e-05, + "loss": 0.04860093593597412, + "step": 26900 + }, + { + "epoch": 0.9653544996245844, + "grad_norm": 0.4647756814956665, + "learning_rate": 2.5867031356144306e-05, + "loss": 0.0445063066482544, + "step": 27000 + }, + { + "epoch": 0.9689298866602309, + "grad_norm": 0.20223687589168549, + "learning_rate": 2.5777646680253137e-05, + "loss": 0.04691956520080567, + "step": 27100 + }, + { + "epoch": 0.9725052736958776, + "grad_norm": 0.9210941195487976, + "learning_rate": 2.5688262004361974e-05, + "loss": 0.049297604560852054, + "step": 27200 + }, + { + "epoch": 0.9760806607315242, + "grad_norm": 0.35992079973220825, + "learning_rate": 2.5598877328470812e-05, + "loss": 0.04701284408569336, + "step": 27300 + }, + { + "epoch": 0.9796560477671707, + "grad_norm": 0.6507813334465027, + "learning_rate": 2.5509492652579643e-05, + "loss": 0.04716668605804444, + "step": 27400 + }, + { + "epoch": 0.9832314348028174, + "grad_norm": 0.5909741520881653, + "learning_rate": 2.542010797668848e-05, + "loss": 0.048493666648864744, + "step": 27500 + }, + { + "epoch": 0.9832314348028174, + "eval_accuracy": 0.9858305504462175, + "eval_f1": 0.8868203247033212, + "eval_loss": 0.05660928413271904, + "eval_precision": 0.8723250413671315, + "eval_recall": 0.9018054796883535, + "eval_runtime": 27.7774, + "eval_samples_per_second": 810.012, + "eval_steps_per_second": 22.5, + "step": 27500 + }, + { + "epoch": 0.986806821838464, + "grad_norm": 0.47291576862335205, + "learning_rate": 2.533072330079731e-05, + "loss": 0.04355491161346436, + "step": 27600 + }, + { + "epoch": 0.9903822088741107, + "grad_norm": 0.4872467815876007, + "learning_rate": 2.5241338624906145e-05, + "loss": 0.0435347318649292, + "step": 27700 + }, + { + "epoch": 0.9939575959097572, + "grad_norm": 1.711300015449524, + "learning_rate": 2.5151953949014983e-05, + "loss": 0.04561484336853027, + "step": 
27800 + }, + { + "epoch": 0.9975329829454038, + "grad_norm": 0.2917760908603668, + "learning_rate": 2.5062569273123814e-05, + "loss": 0.047463297843933105, + "step": 27900 + }, + { + "epoch": 1.0011083699810504, + "grad_norm": 0.2678261697292328, + "learning_rate": 2.497318459723265e-05, + "loss": 0.04366901874542237, + "step": 28000 + }, + { + "epoch": 1.004683757016697, + "grad_norm": 0.3751468062400818, + "learning_rate": 2.4883799921341485e-05, + "loss": 0.03846597194671631, + "step": 28100 + }, + { + "epoch": 1.0082591440523436, + "grad_norm": 0.41662493348121643, + "learning_rate": 2.4794415245450323e-05, + "loss": 0.03653419733047485, + "step": 28200 + }, + { + "epoch": 1.0118345310879904, + "grad_norm": 0.6062248945236206, + "learning_rate": 2.4705030569559157e-05, + "loss": 0.037252871990203856, + "step": 28300 + }, + { + "epoch": 1.015409918123637, + "grad_norm": 0.7458221316337585, + "learning_rate": 2.461564589366799e-05, + "loss": 0.03445641756057739, + "step": 28400 + }, + { + "epoch": 1.0189853051592834, + "grad_norm": 0.13679973781108856, + "learning_rate": 2.4526261217776825e-05, + "loss": 0.03599729061126709, + "step": 28500 + }, + { + "epoch": 1.0225606921949302, + "grad_norm": 1.258949637413025, + "learning_rate": 2.4436876541885663e-05, + "loss": 0.037976634502410886, + "step": 28600 + }, + { + "epoch": 1.0261360792305767, + "grad_norm": 0.27776288986206055, + "learning_rate": 2.4347491865994497e-05, + "loss": 0.03968371391296387, + "step": 28700 + }, + { + "epoch": 1.0297114662662232, + "grad_norm": 0.34287697076797485, + "learning_rate": 2.425810719010333e-05, + "loss": 0.03572561502456665, + "step": 28800 + }, + { + "epoch": 1.03328685330187, + "grad_norm": 0.5158637166023254, + "learning_rate": 2.4168722514212165e-05, + "loss": 0.036703295707702636, + "step": 28900 + }, + { + "epoch": 1.0368622403375165, + "grad_norm": 0.8635151982307434, + "learning_rate": 2.4079337838321e-05, + "loss": 0.035954997539520264, + "step": 29000 + }, + { + 
"epoch": 1.040437627373163, + "grad_norm": 0.6386840343475342, + "learning_rate": 2.3989953162429834e-05, + "loss": 0.039990205764770505, + "step": 29100 + }, + { + "epoch": 1.0440130144088098, + "grad_norm": 0.2795710861682892, + "learning_rate": 2.3900568486538668e-05, + "loss": 0.03744415760040283, + "step": 29200 + }, + { + "epoch": 1.0475884014444563, + "grad_norm": 0.674773097038269, + "learning_rate": 2.3811183810647502e-05, + "loss": 0.038765432834625246, + "step": 29300 + }, + { + "epoch": 1.051163788480103, + "grad_norm": 0.5345519185066223, + "learning_rate": 2.372179913475634e-05, + "loss": 0.03793670177459717, + "step": 29400 + }, + { + "epoch": 1.0547391755157496, + "grad_norm": 0.19475312530994415, + "learning_rate": 2.3632414458865174e-05, + "loss": 0.03510812759399414, + "step": 29500 + }, + { + "epoch": 1.058314562551396, + "grad_norm": 0.6469267010688782, + "learning_rate": 2.3543029782974008e-05, + "loss": 0.03977480411529541, + "step": 29600 + }, + { + "epoch": 1.0618899495870429, + "grad_norm": 0.3818305432796478, + "learning_rate": 2.3453645107082842e-05, + "loss": 0.03915615558624268, + "step": 29700 + }, + { + "epoch": 1.0654653366226894, + "grad_norm": 0.7031393051147461, + "learning_rate": 2.336426043119168e-05, + "loss": 0.03701666355133057, + "step": 29800 + }, + { + "epoch": 1.069040723658336, + "grad_norm": 0.34952452778816223, + "learning_rate": 2.3274875755300514e-05, + "loss": 0.03564514398574829, + "step": 29900 + }, + { + "epoch": 1.0726161106939827, + "grad_norm": 0.5351042747497559, + "learning_rate": 2.3185491079409348e-05, + "loss": 0.04400619983673096, + "step": 30000 + }, + { + "epoch": 1.0726161106939827, + "eval_accuracy": 0.9867462864302234, + "eval_f1": 0.8903530810550676, + "eval_loss": 0.05216454714536667, + "eval_precision": 0.8769046324564705, + "eval_recall": 0.9042204525199091, + "eval_runtime": 27.3869, + "eval_samples_per_second": 821.559, + "eval_steps_per_second": 22.821, + "step": 30000 + }, + { + "epoch": 
1.0761914977296292, + "grad_norm": 0.6395847201347351, + "learning_rate": 2.3096106403518182e-05, + "loss": 0.03795994281768799, + "step": 30100 + }, + { + "epoch": 1.079766884765276, + "grad_norm": 0.2738804221153259, + "learning_rate": 2.3006721727627016e-05, + "loss": 0.034112286567687986, + "step": 30200 + }, + { + "epoch": 1.0833422718009225, + "grad_norm": 0.36416754126548767, + "learning_rate": 2.291733705173585e-05, + "loss": 0.03839835166931152, + "step": 30300 + }, + { + "epoch": 1.086917658836569, + "grad_norm": 0.8902291059494019, + "learning_rate": 2.2827952375844684e-05, + "loss": 0.04109617233276367, + "step": 30400 + }, + { + "epoch": 1.0904930458722157, + "grad_norm": 0.47186803817749023, + "learning_rate": 2.2738567699953522e-05, + "loss": 0.03920984029769897, + "step": 30500 + }, + { + "epoch": 1.0940684329078623, + "grad_norm": 3.810819625854492, + "learning_rate": 2.2649183024062356e-05, + "loss": 0.0391163420677185, + "step": 30600 + }, + { + "epoch": 1.0976438199435088, + "grad_norm": 0.8752216696739197, + "learning_rate": 2.255979834817119e-05, + "loss": 0.038404548168182374, + "step": 30700 + }, + { + "epoch": 1.1012192069791555, + "grad_norm": 0.2776939570903778, + "learning_rate": 2.2470413672280025e-05, + "loss": 0.037470765113830566, + "step": 30800 + }, + { + "epoch": 1.104794594014802, + "grad_norm": 0.549679160118103, + "learning_rate": 2.2381028996388862e-05, + "loss": 0.03804266691207886, + "step": 30900 + }, + { + "epoch": 1.1083699810504486, + "grad_norm": 0.7605739235877991, + "learning_rate": 2.2291644320497696e-05, + "loss": 0.03416654348373413, + "step": 31000 + }, + { + "epoch": 1.1119453680860953, + "grad_norm": 0.16704197227954865, + "learning_rate": 2.220225964460653e-05, + "loss": 0.034537038803100585, + "step": 31100 + }, + { + "epoch": 1.1155207551217419, + "grad_norm": 0.5772648453712463, + "learning_rate": 2.2112874968715365e-05, + "loss": 0.03786729097366333, + "step": 31200 + }, + { + "epoch": 1.1190961421573886, + 
"grad_norm": 0.3576936423778534, + "learning_rate": 2.2023490292824202e-05, + "loss": 0.04146803379058838, + "step": 31300 + }, + { + "epoch": 1.1226715291930351, + "grad_norm": 0.24434928596019745, + "learning_rate": 2.1934105616933033e-05, + "loss": 0.03837924718856812, + "step": 31400 + }, + { + "epoch": 1.1262469162286817, + "grad_norm": 0.8151653409004211, + "learning_rate": 2.1844720941041867e-05, + "loss": 0.03402991771697998, + "step": 31500 + }, + { + "epoch": 1.1298223032643284, + "grad_norm": 0.803303062915802, + "learning_rate": 2.17553362651507e-05, + "loss": 0.03701550483703613, + "step": 31600 + }, + { + "epoch": 1.133397690299975, + "grad_norm": 0.5276838541030884, + "learning_rate": 2.166595158925954e-05, + "loss": 0.037687735557556154, + "step": 31700 + }, + { + "epoch": 1.1369730773356217, + "grad_norm": 1.2563331127166748, + "learning_rate": 2.1576566913368373e-05, + "loss": 0.04105483055114746, + "step": 31800 + }, + { + "epoch": 1.1405484643712682, + "grad_norm": 2.2794508934020996, + "learning_rate": 2.1487182237477207e-05, + "loss": 0.03871995687484741, + "step": 31900 + }, + { + "epoch": 1.1441238514069147, + "grad_norm": 0.5270197987556458, + "learning_rate": 2.139779756158604e-05, + "loss": 0.03748847007751465, + "step": 32000 + }, + { + "epoch": 1.1476992384425615, + "grad_norm": 0.4776967763900757, + "learning_rate": 2.130841288569488e-05, + "loss": 0.04054388523101807, + "step": 32100 + }, + { + "epoch": 1.151274625478208, + "grad_norm": 0.281562864780426, + "learning_rate": 2.1219028209803713e-05, + "loss": 0.03565767288208008, + "step": 32200 + }, + { + "epoch": 1.1548500125138546, + "grad_norm": 0.986331582069397, + "learning_rate": 2.1129643533912547e-05, + "loss": 0.03515695333480835, + "step": 32300 + }, + { + "epoch": 1.1584253995495013, + "grad_norm": 1.0339690446853638, + "learning_rate": 2.104025885802138e-05, + "loss": 0.03650200843811035, + "step": 32400 + }, + { + "epoch": 1.1620007865851478, + "grad_norm": 
0.6622812747955322, + "learning_rate": 2.095087418213022e-05, + "loss": 0.03963910102844238, + "step": 32500 + }, + { + "epoch": 1.1620007865851478, + "eval_accuracy": 0.9872691132930045, + "eval_f1": 0.8901178950048444, + "eval_loss": 0.050942763686180115, + "eval_precision": 0.8760841419442859, + "eval_recall": 0.904608573153552, + "eval_runtime": 27.7471, + "eval_samples_per_second": 810.896, + "eval_steps_per_second": 22.525, + "step": 32500 + }, + { + "epoch": 1.1655761736207944, + "grad_norm": 0.4157122075557709, + "learning_rate": 2.0861489506239053e-05, + "loss": 0.03528056383132935, + "step": 32600 + }, + { + "epoch": 1.169151560656441, + "grad_norm": 1.0833650827407837, + "learning_rate": 2.0772104830347887e-05, + "loss": 0.0338432765007019, + "step": 32700 + }, + { + "epoch": 1.1727269476920876, + "grad_norm": 0.6234818696975708, + "learning_rate": 2.068272015445672e-05, + "loss": 0.03545186996459961, + "step": 32800 + }, + { + "epoch": 1.1763023347277342, + "grad_norm": 0.46430152654647827, + "learning_rate": 2.0593335478565555e-05, + "loss": 0.03938552379608154, + "step": 32900 + }, + { + "epoch": 1.179877721763381, + "grad_norm": 0.32441213726997375, + "learning_rate": 2.050395080267439e-05, + "loss": 0.03765884399414063, + "step": 33000 + }, + { + "epoch": 1.1834531087990274, + "grad_norm": 0.5149340033531189, + "learning_rate": 2.0414566126783224e-05, + "loss": 0.04111374378204346, + "step": 33100 + }, + { + "epoch": 1.1870284958346742, + "grad_norm": 0.6311440467834473, + "learning_rate": 2.032518145089206e-05, + "loss": 0.03235443115234375, + "step": 33200 + }, + { + "epoch": 1.1906038828703207, + "grad_norm": 0.41769224405288696, + "learning_rate": 2.0235796775000895e-05, + "loss": 0.03542477607727051, + "step": 33300 + }, + { + "epoch": 1.1941792699059672, + "grad_norm": 1.399487853050232, + "learning_rate": 2.014641209910973e-05, + "loss": 0.03973909854888916, + "step": 33400 + }, + { + "epoch": 1.197754656941614, + "grad_norm": 
0.44740626215934753, + "learning_rate": 2.0057027423218564e-05, + "loss": 0.03419320821762085, + "step": 33500 + }, + { + "epoch": 1.2013300439772605, + "grad_norm": 0.7771443128585815, + "learning_rate": 1.99676427473274e-05, + "loss": 0.03688163042068481, + "step": 33600 + }, + { + "epoch": 1.2049054310129073, + "grad_norm": 0.33263227343559265, + "learning_rate": 1.9878258071436235e-05, + "loss": 0.0361082911491394, + "step": 33700 + }, + { + "epoch": 1.2084808180485538, + "grad_norm": 0.586033821105957, + "learning_rate": 1.978887339554507e-05, + "loss": 0.037032432556152343, + "step": 33800 + }, + { + "epoch": 1.2120562050842003, + "grad_norm": 0.17661893367767334, + "learning_rate": 1.9699488719653904e-05, + "loss": 0.03797416687011719, + "step": 33900 + }, + { + "epoch": 1.215631592119847, + "grad_norm": 0.6682581305503845, + "learning_rate": 1.9610104043762738e-05, + "loss": 0.03688710927963257, + "step": 34000 + }, + { + "epoch": 1.2192069791554936, + "grad_norm": 0.33618828654289246, + "learning_rate": 1.9520719367871572e-05, + "loss": 0.03531041145324707, + "step": 34100 + }, + { + "epoch": 1.2227823661911401, + "grad_norm": 0.2299039363861084, + "learning_rate": 1.9431334691980406e-05, + "loss": 0.037303669452667235, + "step": 34200 + }, + { + "epoch": 1.2263577532267869, + "grad_norm": 0.38670745491981506, + "learning_rate": 1.934195001608924e-05, + "loss": 0.03624207735061646, + "step": 34300 + }, + { + "epoch": 1.2299331402624334, + "grad_norm": 0.28273847699165344, + "learning_rate": 1.9252565340198078e-05, + "loss": 0.03737942218780518, + "step": 34400 + }, + { + "epoch": 1.23350852729808, + "grad_norm": 0.1840369552373886, + "learning_rate": 1.9163180664306912e-05, + "loss": 0.04193697929382324, + "step": 34500 + }, + { + "epoch": 1.2370839143337267, + "grad_norm": 0.3581949770450592, + "learning_rate": 1.9073795988415746e-05, + "loss": 0.03538564205169678, + "step": 34600 + }, + { + "epoch": 1.2406593013693732, + "grad_norm": 0.47306036949157715, 
+ "learning_rate": 1.898441131252458e-05, + "loss": 0.03894999265670776, + "step": 34700 + }, + { + "epoch": 1.24423468840502, + "grad_norm": 0.961359977722168, + "learning_rate": 1.8895026636633418e-05, + "loss": 0.03768787622451782, + "step": 34800 + }, + { + "epoch": 1.2478100754406665, + "grad_norm": 0.873396098613739, + "learning_rate": 1.8805641960742252e-05, + "loss": 0.03798648834228516, + "step": 34900 + }, + { + "epoch": 1.251385462476313, + "grad_norm": 0.27755600214004517, + "learning_rate": 1.8716257284851086e-05, + "loss": 0.03826235771179199, + "step": 35000 + }, + { + "epoch": 1.251385462476313, + "eval_accuracy": 0.9878845131214289, + "eval_f1": 0.8920698296733638, + "eval_loss": 0.04892827197909355, + "eval_precision": 0.8788428276516208, + "eval_recall": 0.9057010608630653, + "eval_runtime": 27.2527, + "eval_samples_per_second": 825.607, + "eval_steps_per_second": 22.934, + "step": 35000 + }, + { + "epoch": 1.2549608495119597, + "grad_norm": 0.19469444453716278, + "learning_rate": 1.862687260895992e-05, + "loss": 0.037444868087768556, + "step": 35100 + }, + { + "epoch": 1.2585362365476063, + "grad_norm": 0.7563005685806274, + "learning_rate": 1.8537487933068755e-05, + "loss": 0.03585953950881958, + "step": 35200 + }, + { + "epoch": 1.262111623583253, + "grad_norm": 0.748693585395813, + "learning_rate": 1.844810325717759e-05, + "loss": 0.036836111545562746, + "step": 35300 + }, + { + "epoch": 1.2656870106188995, + "grad_norm": 0.2749057114124298, + "learning_rate": 1.8358718581286423e-05, + "loss": 0.035653345584869385, + "step": 35400 + }, + { + "epoch": 1.269262397654546, + "grad_norm": 0.46990424394607544, + "learning_rate": 1.826933390539526e-05, + "loss": 0.038926541805267334, + "step": 35500 + }, + { + "epoch": 1.2728377846901928, + "grad_norm": 0.5694590210914612, + "learning_rate": 1.8179949229504095e-05, + "loss": 0.041200418472290036, + "step": 35600 + }, + { + "epoch": 1.2764131717258393, + "grad_norm": 0.44198593497276306, + 
"learning_rate": 1.809056455361293e-05, + "loss": 0.03306173086166382, + "step": 35700 + }, + { + "epoch": 1.2799885587614859, + "grad_norm": 1.5265918970108032, + "learning_rate": 1.8001179877721763e-05, + "loss": 0.03929618358612061, + "step": 35800 + }, + { + "epoch": 1.2835639457971326, + "grad_norm": 0.568000078201294, + "learning_rate": 1.79117952018306e-05, + "loss": 0.035215189456939695, + "step": 35900 + }, + { + "epoch": 1.2871393328327791, + "grad_norm": 0.3256838619709015, + "learning_rate": 1.7822410525939435e-05, + "loss": 0.03693248510360718, + "step": 36000 + }, + { + "epoch": 1.2907147198684257, + "grad_norm": 0.37276744842529297, + "learning_rate": 1.773302585004827e-05, + "loss": 0.038254330158233645, + "step": 36100 + }, + { + "epoch": 1.2942901069040724, + "grad_norm": 0.9104180335998535, + "learning_rate": 1.7643641174157103e-05, + "loss": 0.03706887722015381, + "step": 36200 + }, + { + "epoch": 1.297865493939719, + "grad_norm": 0.855074942111969, + "learning_rate": 1.755425649826594e-05, + "loss": 0.039341244697570804, + "step": 36300 + }, + { + "epoch": 1.3014408809753655, + "grad_norm": 1.0919744968414307, + "learning_rate": 1.7464871822374775e-05, + "loss": 0.03796007394790649, + "step": 36400 + }, + { + "epoch": 1.3050162680110122, + "grad_norm": 0.4765317142009735, + "learning_rate": 1.737548714648361e-05, + "loss": 0.03425301790237427, + "step": 36500 + }, + { + "epoch": 1.3085916550466588, + "grad_norm": 0.28184378147125244, + "learning_rate": 1.728610247059244e-05, + "loss": 0.03511073589324951, + "step": 36600 + }, + { + "epoch": 1.3121670420823053, + "grad_norm": 0.26926326751708984, + "learning_rate": 1.7196717794701277e-05, + "loss": 0.03917940616607666, + "step": 36700 + }, + { + "epoch": 1.315742429117952, + "grad_norm": 2.2863128185272217, + "learning_rate": 1.710733311881011e-05, + "loss": 0.03748450517654419, + "step": 36800 + }, + { + "epoch": 1.3193178161535986, + "grad_norm": 0.47158753871917725, + "learning_rate": 
1.7017948442918946e-05, + "loss": 0.034841620922088624, + "step": 36900 + }, + { + "epoch": 1.3228932031892453, + "grad_norm": 0.3611966371536255, + "learning_rate": 1.692856376702778e-05, + "loss": 0.03597846508026123, + "step": 37000 + }, + { + "epoch": 1.3264685902248918, + "grad_norm": 0.19897930324077606, + "learning_rate": 1.6839179091136617e-05, + "loss": 0.0384373140335083, + "step": 37100 + }, + { + "epoch": 1.3300439772605386, + "grad_norm": 0.4929654002189636, + "learning_rate": 1.674979441524545e-05, + "loss": 0.03474846363067627, + "step": 37200 + }, + { + "epoch": 1.333619364296185, + "grad_norm": 1.4330233335494995, + "learning_rate": 1.6660409739354286e-05, + "loss": 0.03804588317871094, + "step": 37300 + }, + { + "epoch": 1.3371947513318316, + "grad_norm": 0.7935028076171875, + "learning_rate": 1.6571025063463123e-05, + "loss": 0.036091580390930175, + "step": 37400 + }, + { + "epoch": 1.3407701383674784, + "grad_norm": 0.6093057990074158, + "learning_rate": 1.6481640387571957e-05, + "loss": 0.036958491802215575, + "step": 37500 + }, + { + "epoch": 1.3407701383674784, + "eval_accuracy": 0.9876767499314908, + "eval_f1": 0.8963199795830114, + "eval_loss": 0.048646602779626846, + "eval_precision": 0.8842404151455387, + "eval_recall": 0.9087341517407929, + "eval_runtime": 27.7954, + "eval_samples_per_second": 809.486, + "eval_steps_per_second": 22.486, + "step": 37500 + }, + { + "epoch": 1.344345525403125, + "grad_norm": 0.530693531036377, + "learning_rate": 1.639225571168079e-05, + "loss": 0.040179696083068844, + "step": 37600 + }, + { + "epoch": 1.3479209124387714, + "grad_norm": 0.70650714635849, + "learning_rate": 1.6302871035789626e-05, + "loss": 0.03599003076553345, + "step": 37700 + }, + { + "epoch": 1.3514962994744182, + "grad_norm": 0.673740029335022, + "learning_rate": 1.621348635989846e-05, + "loss": 0.03707956552505493, + "step": 37800 + }, + { + "epoch": 1.3550716865100647, + "grad_norm": 0.28047823905944824, + "learning_rate": 
1.6124101684007294e-05, + "loss": 0.034383256435394284, + "step": 37900 + }, + { + "epoch": 1.3586470735457112, + "grad_norm": 0.4644497036933899, + "learning_rate": 1.6034717008116128e-05, + "loss": 0.039096081256866456, + "step": 38000 + }, + { + "epoch": 1.362222460581358, + "grad_norm": 0.2905023992061615, + "learning_rate": 1.5945332332224962e-05, + "loss": 0.031935737133026124, + "step": 38100 + }, + { + "epoch": 1.3657978476170045, + "grad_norm": 0.519289493560791, + "learning_rate": 1.58559476563338e-05, + "loss": 0.03160768747329712, + "step": 38200 + }, + { + "epoch": 1.369373234652651, + "grad_norm": 0.4803026616573334, + "learning_rate": 1.5766562980442634e-05, + "loss": 0.03475278615951538, + "step": 38300 + }, + { + "epoch": 1.3729486216882978, + "grad_norm": 0.2219659686088562, + "learning_rate": 1.5677178304551468e-05, + "loss": 0.03382747411727905, + "step": 38400 + }, + { + "epoch": 1.3765240087239443, + "grad_norm": 0.9020390510559082, + "learning_rate": 1.5587793628660302e-05, + "loss": 0.03778740644454956, + "step": 38500 + }, + { + "epoch": 1.3800993957595908, + "grad_norm": 0.4074041247367859, + "learning_rate": 1.549840895276914e-05, + "loss": 0.03417648077011108, + "step": 38600 + }, + { + "epoch": 1.3836747827952376, + "grad_norm": 0.2950891852378845, + "learning_rate": 1.5409024276877974e-05, + "loss": 0.037335121631622316, + "step": 38700 + }, + { + "epoch": 1.3872501698308841, + "grad_norm": 0.5112789869308472, + "learning_rate": 1.5319639600986808e-05, + "loss": 0.03443581342697143, + "step": 38800 + }, + { + "epoch": 1.3908255568665309, + "grad_norm": 0.6883418560028076, + "learning_rate": 1.523025492509564e-05, + "loss": 0.03647557497024536, + "step": 38900 + }, + { + "epoch": 1.3944009439021774, + "grad_norm": 0.22857694327831268, + "learning_rate": 1.5140870249204478e-05, + "loss": 0.03520648956298828, + "step": 39000 + }, + { + "epoch": 1.3979763309378241, + "grad_norm": 1.4312663078308105, + "learning_rate": 
1.5051485573313312e-05, + "loss": 0.031425106525421145, + "step": 39100 + }, + { + "epoch": 1.4015517179734707, + "grad_norm": 0.7821195125579834, + "learning_rate": 1.4962100897422146e-05, + "loss": 0.03315335750579834, + "step": 39200 + }, + { + "epoch": 1.4051271050091172, + "grad_norm": 0.27848535776138306, + "learning_rate": 1.487271622153098e-05, + "loss": 0.033316426277160645, + "step": 39300 + }, + { + "epoch": 1.408702492044764, + "grad_norm": 0.6713240146636963, + "learning_rate": 1.4783331545639816e-05, + "loss": 0.033266935348510746, + "step": 39400 + }, + { + "epoch": 1.4122778790804105, + "grad_norm": 3.596701145172119, + "learning_rate": 1.469394686974865e-05, + "loss": 0.03419414281845093, + "step": 39500 + }, + { + "epoch": 1.415853266116057, + "grad_norm": 1.069840908050537, + "learning_rate": 1.4604562193857485e-05, + "loss": 0.035397300720214846, + "step": 39600 + }, + { + "epoch": 1.4194286531517037, + "grad_norm": 0.2466162145137787, + "learning_rate": 1.4515177517966322e-05, + "loss": 0.0358107590675354, + "step": 39700 + }, + { + "epoch": 1.4230040401873503, + "grad_norm": 0.5182567834854126, + "learning_rate": 1.4425792842075156e-05, + "loss": 0.03377439260482788, + "step": 39800 + }, + { + "epoch": 1.4265794272229968, + "grad_norm": 0.8782963752746582, + "learning_rate": 1.433640816618399e-05, + "loss": 0.03737137794494629, + "step": 39900 + }, + { + "epoch": 1.4301548142586435, + "grad_norm": 0.2662527561187744, + "learning_rate": 1.4247023490292825e-05, + "loss": 0.035046143531799315, + "step": 40000 + }, + { + "epoch": 1.4301548142586435, + "eval_accuracy": 0.9874205963276935, + "eval_f1": 0.8909038185431681, + "eval_loss": 0.048918217420578, + "eval_precision": 0.8769057265778372, + "eval_recall": 0.9053560647442717, + "eval_runtime": 27.308, + "eval_samples_per_second": 823.934, + "eval_steps_per_second": 22.887, + "step": 40000 + }, + { + "epoch": 1.43373020129429, + "grad_norm": 4.632917404174805, + "learning_rate": 
1.415763881440166e-05, + "loss": 0.03318638563156128, + "step": 40100 + }, + { + "epoch": 1.4373055883299366, + "grad_norm": 0.34400591254234314, + "learning_rate": 1.4068254138510495e-05, + "loss": 0.03380630970001221, + "step": 40200 + }, + { + "epoch": 1.4408809753655833, + "grad_norm": 0.3949352204799652, + "learning_rate": 1.3978869462619329e-05, + "loss": 0.035887646675109866, + "step": 40300 + }, + { + "epoch": 1.4444563624012299, + "grad_norm": 0.21083228290081024, + "learning_rate": 1.3889484786728163e-05, + "loss": 0.02981067180633545, + "step": 40400 + }, + { + "epoch": 1.4480317494368766, + "grad_norm": 0.5403398871421814, + "learning_rate": 1.3800100110836999e-05, + "loss": 0.03951683759689331, + "step": 40500 + }, + { + "epoch": 1.4516071364725232, + "grad_norm": 0.37334415316581726, + "learning_rate": 1.3710715434945833e-05, + "loss": 0.03376241683959961, + "step": 40600 + }, + { + "epoch": 1.4551825235081697, + "grad_norm": 0.6374111771583557, + "learning_rate": 1.3621330759054667e-05, + "loss": 0.035758087635040285, + "step": 40700 + }, + { + "epoch": 1.4587579105438164, + "grad_norm": 0.4704621434211731, + "learning_rate": 1.3531946083163501e-05, + "loss": 0.03579946041107178, + "step": 40800 + }, + { + "epoch": 1.462333297579463, + "grad_norm": 0.31890979409217834, + "learning_rate": 1.3442561407272339e-05, + "loss": 0.036891818046569824, + "step": 40900 + }, + { + "epoch": 1.4659086846151097, + "grad_norm": 0.36003023386001587, + "learning_rate": 1.3353176731381173e-05, + "loss": 0.03722346544265747, + "step": 41000 + }, + { + "epoch": 1.4694840716507562, + "grad_norm": 0.3868881165981293, + "learning_rate": 1.3263792055490007e-05, + "loss": 0.03188649654388428, + "step": 41100 + }, + { + "epoch": 1.4730594586864028, + "grad_norm": 0.1989583820104599, + "learning_rate": 1.3174407379598841e-05, + "loss": 0.03385810136795044, + "step": 41200 + }, + { + "epoch": 1.4766348457220495, + "grad_norm": 1.653865933418274, + "learning_rate": 
1.3085022703707677e-05, + "loss": 0.033811585903167726, + "step": 41300 + }, + { + "epoch": 1.480210232757696, + "grad_norm": 0.4005359709262848, + "learning_rate": 1.2995638027816512e-05, + "loss": 0.034179413318634035, + "step": 41400 + }, + { + "epoch": 1.4837856197933426, + "grad_norm": 0.40698060393333435, + "learning_rate": 1.2906253351925346e-05, + "loss": 0.0344992733001709, + "step": 41500 + }, + { + "epoch": 1.4873610068289893, + "grad_norm": 0.23063120245933533, + "learning_rate": 1.281686867603418e-05, + "loss": 0.036886801719665525, + "step": 41600 + }, + { + "epoch": 1.4909363938646358, + "grad_norm": 0.36372461915016174, + "learning_rate": 1.2727484000143017e-05, + "loss": 0.03418808460235596, + "step": 41700 + }, + { + "epoch": 1.4945117809002824, + "grad_norm": 3.4656498432159424, + "learning_rate": 1.2638099324251852e-05, + "loss": 0.035131211280822756, + "step": 41800 + }, + { + "epoch": 1.498087167935929, + "grad_norm": 0.5397525429725647, + "learning_rate": 1.2548714648360686e-05, + "loss": 0.032310936450958255, + "step": 41900 + }, + { + "epoch": 1.5016625549715756, + "grad_norm": 0.803663969039917, + "learning_rate": 1.245932997246952e-05, + "loss": 0.034068484306335446, + "step": 42000 + }, + { + "epoch": 1.5052379420072222, + "grad_norm": 0.44578149914741516, + "learning_rate": 1.2369945296578356e-05, + "loss": 0.033568575382232665, + "step": 42100 + }, + { + "epoch": 1.508813329042869, + "grad_norm": 0.3740385174751282, + "learning_rate": 1.228056062068719e-05, + "loss": 0.0316014552116394, + "step": 42200 + }, + { + "epoch": 1.5123887160785157, + "grad_norm": 0.7885581254959106, + "learning_rate": 1.2191175944796026e-05, + "loss": 0.036092112064361574, + "step": 42300 + }, + { + "epoch": 1.515964103114162, + "grad_norm": 0.2616823613643646, + "learning_rate": 1.210179126890486e-05, + "loss": 0.0364843225479126, + "step": 42400 + }, + { + "epoch": 1.5195394901498087, + "grad_norm": 1.1933097839355469, + "learning_rate": 
1.2012406593013694e-05, + "loss": 0.032956657409667967, + "step": 42500 + }, + { + "epoch": 1.5195394901498087, + "eval_accuracy": 0.9878976626904123, + "eval_f1": 0.8964901338171921, + "eval_loss": 0.04782980680465698, + "eval_precision": 0.8842314252957132, + "eval_recall": 0.9090935226978697, + "eval_runtime": 27.8912, + "eval_samples_per_second": 806.706, + "eval_steps_per_second": 22.409, + "step": 42500 + }, + { + "epoch": 1.5231148771854555, + "grad_norm": 1.002236247062683, + "learning_rate": 1.1923021917122528e-05, + "loss": 0.03267708301544189, + "step": 42600 + }, + { + "epoch": 1.526690264221102, + "grad_norm": 0.2965432405471802, + "learning_rate": 1.1833637241231364e-05, + "loss": 0.03969228982925415, + "step": 42700 + }, + { + "epoch": 1.5302656512567485, + "grad_norm": 0.35980096459388733, + "learning_rate": 1.1744252565340198e-05, + "loss": 0.033807692527770994, + "step": 42800 + }, + { + "epoch": 1.5338410382923953, + "grad_norm": 0.4036603271961212, + "learning_rate": 1.1654867889449034e-05, + "loss": 0.036050264835357664, + "step": 42900 + }, + { + "epoch": 1.5374164253280418, + "grad_norm": 0.4341689348220825, + "learning_rate": 1.1565483213557868e-05, + "loss": 0.03344399690628052, + "step": 43000 + }, + { + "epoch": 1.5409918123636883, + "grad_norm": 0.35666847229003906, + "learning_rate": 1.1476098537666702e-05, + "loss": 0.035790588855743405, + "step": 43100 + }, + { + "epoch": 1.544567199399335, + "grad_norm": 2.009552001953125, + "learning_rate": 1.1386713861775537e-05, + "loss": 0.03580213069915771, + "step": 43200 + }, + { + "epoch": 1.5481425864349816, + "grad_norm": 0.9199197888374329, + "learning_rate": 1.1297329185884372e-05, + "loss": 0.035557851791381836, + "step": 43300 + }, + { + "epoch": 1.5517179734706281, + "grad_norm": 0.3379763662815094, + "learning_rate": 1.1207944509993207e-05, + "loss": 0.037502107620239256, + "step": 43400 + }, + { + "epoch": 1.5552933605062749, + "grad_norm": 0.4002296030521393, + "learning_rate": 
1.1118559834102042e-05, + "loss": 0.03514168262481689, + "step": 43500 + }, + { + "epoch": 1.5588687475419214, + "grad_norm": 0.44335803389549255, + "learning_rate": 1.1029175158210877e-05, + "loss": 0.03210949659347534, + "step": 43600 + }, + { + "epoch": 1.562444134577568, + "grad_norm": 0.3367313742637634, + "learning_rate": 1.0939790482319712e-05, + "loss": 0.03381946325302124, + "step": 43700 + }, + { + "epoch": 1.5660195216132147, + "grad_norm": 0.3180839419364929, + "learning_rate": 1.0850405806428547e-05, + "loss": 0.033136572837829587, + "step": 43800 + }, + { + "epoch": 1.5695949086488612, + "grad_norm": 0.49929025769233704, + "learning_rate": 1.076102113053738e-05, + "loss": 0.03284239530563354, + "step": 43900 + }, + { + "epoch": 1.5731702956845077, + "grad_norm": 0.36956411600112915, + "learning_rate": 1.0671636454646217e-05, + "loss": 0.032391068935394285, + "step": 44000 + }, + { + "epoch": 1.5767456827201545, + "grad_norm": 0.3806305527687073, + "learning_rate": 1.058225177875505e-05, + "loss": 0.03159698247909546, + "step": 44100 + }, + { + "epoch": 1.5803210697558012, + "grad_norm": 0.24886535108089447, + "learning_rate": 1.0492867102863887e-05, + "loss": 0.03376968622207641, + "step": 44200 + }, + { + "epoch": 1.5838964567914475, + "grad_norm": 0.8062007427215576, + "learning_rate": 1.040348242697272e-05, + "loss": 0.031125342845916747, + "step": 44300 + }, + { + "epoch": 1.5874718438270943, + "grad_norm": 0.32632651925086975, + "learning_rate": 1.0314097751081555e-05, + "loss": 0.032405462265014645, + "step": 44400 + }, + { + "epoch": 1.591047230862741, + "grad_norm": 0.9697968363761902, + "learning_rate": 1.0224713075190389e-05, + "loss": 0.0316835880279541, + "step": 44500 + }, + { + "epoch": 1.5946226178983876, + "grad_norm": 0.7041149735450745, + "learning_rate": 1.0135328399299225e-05, + "loss": 0.03227449417114258, + "step": 44600 + }, + { + "epoch": 1.598198004934034, + "grad_norm": 1.0169494152069092, + "learning_rate": 
1.0045943723408059e-05, + "loss": 0.03636837244033814, + "step": 44700 + }, + { + "epoch": 1.6017733919696808, + "grad_norm": 1.4278594255447388, + "learning_rate": 9.956559047516895e-06, + "loss": 0.036050994396209714, + "step": 44800 + }, + { + "epoch": 1.6053487790053274, + "grad_norm": 0.21218614280223846, + "learning_rate": 9.867174371625729e-06, + "loss": 0.03155009746551514, + "step": 44900 + }, + { + "epoch": 1.6089241660409739, + "grad_norm": 0.2901414930820465, + "learning_rate": 9.777789695734563e-06, + "loss": 0.030832624435424803, + "step": 45000 + }, + { + "epoch": 1.6089241660409739, + "eval_accuracy": 0.9888076128640655, + "eval_f1": 0.9007828635915198, + "eval_loss": 0.04577971622347832, + "eval_precision": 0.8896631009295218, + "eval_recall": 0.9121841129287296, + "eval_runtime": 27.4639, + "eval_samples_per_second": 819.259, + "eval_steps_per_second": 22.757, + "step": 45000 + }, + { + "epoch": 1.6124995530766206, + "grad_norm": 0.6742628812789917, + "learning_rate": 9.688405019843397e-06, + "loss": 0.03396400213241577, + "step": 45100 + }, + { + "epoch": 1.6160749401122672, + "grad_norm": 0.30497708916664124, + "learning_rate": 9.599020343952233e-06, + "loss": 0.030751326084136964, + "step": 45200 + }, + { + "epoch": 1.6196503271479137, + "grad_norm": 0.33833158016204834, + "learning_rate": 9.509635668061067e-06, + "loss": 0.03044323444366455, + "step": 45300 + }, + { + "epoch": 1.6232257141835604, + "grad_norm": 0.35390418767929077, + "learning_rate": 9.420250992169903e-06, + "loss": 0.03425618410110474, + "step": 45400 + }, + { + "epoch": 1.626801101219207, + "grad_norm": 0.6008805632591248, + "learning_rate": 9.330866316278737e-06, + "loss": 0.03422411203384399, + "step": 45500 + }, + { + "epoch": 1.6303764882548535, + "grad_norm": 0.7057814598083496, + "learning_rate": 9.241481640387573e-06, + "loss": 0.03580734968185425, + "step": 45600 + }, + { + "epoch": 1.6339518752905002, + "grad_norm": 0.6222581267356873, + "learning_rate": 
9.152096964496407e-06, + "loss": 0.03245258092880249, + "step": 45700 + }, + { + "epoch": 1.6375272623261468, + "grad_norm": 0.19113455712795258, + "learning_rate": 9.062712288605242e-06, + "loss": 0.03314180135726929, + "step": 45800 + }, + { + "epoch": 1.6411026493617933, + "grad_norm": 0.35139983892440796, + "learning_rate": 8.973327612714076e-06, + "loss": 0.03314854860305786, + "step": 45900 + }, + { + "epoch": 1.64467803639744, + "grad_norm": 2.3638358116149902, + "learning_rate": 8.883942936822912e-06, + "loss": 0.03374920845031738, + "step": 46000 + }, + { + "epoch": 1.6482534234330868, + "grad_norm": 0.3906150162220001, + "learning_rate": 8.794558260931746e-06, + "loss": 0.030902385711669922, + "step": 46100 + }, + { + "epoch": 1.651828810468733, + "grad_norm": 1.5684771537780762, + "learning_rate": 8.705173585040582e-06, + "loss": 0.03261609077453613, + "step": 46200 + }, + { + "epoch": 1.6554041975043798, + "grad_norm": 0.5489705801010132, + "learning_rate": 8.615788909149416e-06, + "loss": 0.032475869655609134, + "step": 46300 + }, + { + "epoch": 1.6589795845400266, + "grad_norm": 0.4629211127758026, + "learning_rate": 8.52640423325825e-06, + "loss": 0.0343438458442688, + "step": 46400 + }, + { + "epoch": 1.6625549715756731, + "grad_norm": 0.35416728258132935, + "learning_rate": 8.437019557367086e-06, + "loss": 0.030291988849639892, + "step": 46500 + }, + { + "epoch": 1.6661303586113196, + "grad_norm": 0.3730672597885132, + "learning_rate": 8.34763488147592e-06, + "loss": 0.03114586353302002, + "step": 46600 + }, + { + "epoch": 1.6697057456469664, + "grad_norm": 0.8023098111152649, + "learning_rate": 8.258250205584756e-06, + "loss": 0.031157519817352295, + "step": 46700 + }, + { + "epoch": 1.673281132682613, + "grad_norm": 0.3616831600666046, + "learning_rate": 8.16886552969359e-06, + "loss": 0.03633548498153687, + "step": 46800 + }, + { + "epoch": 1.6768565197182594, + "grad_norm": 0.2969978451728821, + "learning_rate": 8.079480853802424e-06, + "loss": 
0.030888726711273195, + "step": 46900 + }, + { + "epoch": 1.6804319067539062, + "grad_norm": 0.5954911708831787, + "learning_rate": 7.990096177911258e-06, + "loss": 0.02800543785095215, + "step": 47000 + }, + { + "epoch": 1.6840072937895527, + "grad_norm": 0.28519004583358765, + "learning_rate": 7.900711502020094e-06, + "loss": 0.0348360013961792, + "step": 47100 + }, + { + "epoch": 1.6875826808251992, + "grad_norm": 3.0812149047851562, + "learning_rate": 7.811326826128928e-06, + "loss": 0.03429551839828491, + "step": 47200 + }, + { + "epoch": 1.691158067860846, + "grad_norm": 0.3664245903491974, + "learning_rate": 7.721942150237764e-06, + "loss": 0.03342988014221191, + "step": 47300 + }, + { + "epoch": 1.6947334548964925, + "grad_norm": 0.4746117889881134, + "learning_rate": 7.632557474346598e-06, + "loss": 0.031046552658081053, + "step": 47400 + }, + { + "epoch": 1.698308841932139, + "grad_norm": 0.26298218965530396, + "learning_rate": 7.543172798455433e-06, + "loss": 0.03168731689453125, + "step": 47500 + }, + { + "epoch": 1.698308841932139, + "eval_accuracy": 0.9886813770018247, + "eval_f1": 0.8991618091307493, + "eval_loss": 0.04538652300834656, + "eval_precision": 0.8872949672507418, + "eval_recall": 0.9113503723083115, + "eval_runtime": 27.7648, + "eval_samples_per_second": 810.377, + "eval_steps_per_second": 22.51, + "step": 47500 + }, + { + "epoch": 1.7018842289677858, + "grad_norm": 1.6149009466171265, + "learning_rate": 7.4537881225642675e-06, + "loss": 0.035295097827911376, + "step": 47600 + }, + { + "epoch": 1.7054596160034323, + "grad_norm": 0.37669169902801514, + "learning_rate": 7.3644034466731025e-06, + "loss": 0.03364665269851685, + "step": 47700 + }, + { + "epoch": 1.7090350030390788, + "grad_norm": 0.5029271841049194, + "learning_rate": 7.275018770781937e-06, + "loss": 0.032778596878051756, + "step": 47800 + }, + { + "epoch": 1.7126103900747256, + "grad_norm": 0.265184611082077, + "learning_rate": 7.1856340948907725e-06, + "loss": 
0.033551807403564456, + "step": 47900 + }, + { + "epoch": 1.7161857771103723, + "grad_norm": 0.5929502248764038, + "learning_rate": 7.096249418999607e-06, + "loss": 0.033371658325195314, + "step": 48000 + }, + { + "epoch": 1.7197611641460187, + "grad_norm": 0.6151393055915833, + "learning_rate": 7.006864743108442e-06, + "loss": 0.034622840881347657, + "step": 48100 + }, + { + "epoch": 1.7233365511816654, + "grad_norm": null, + "learning_rate": 6.917480067217276e-06, + "loss": 0.032550268173217774, + "step": 48200 + }, + { + "epoch": 1.7269119382173121, + "grad_norm": 3.7852137088775635, + "learning_rate": 6.828095391326112e-06, + "loss": 0.03138866424560547, + "step": 48300 + }, + { + "epoch": 1.7304873252529587, + "grad_norm": 0.1753600835800171, + "learning_rate": 6.738710715434946e-06, + "loss": 0.03186697244644165, + "step": 48400 + }, + { + "epoch": 1.7340627122886052, + "grad_norm": 6.609533786773682, + "learning_rate": 6.649326039543781e-06, + "loss": 0.031199581623077392, + "step": 48500 + }, + { + "epoch": 1.737638099324252, + "grad_norm": 1.9689279794692993, + "learning_rate": 6.559941363652617e-06, + "loss": 0.03473323583602905, + "step": 48600 + }, + { + "epoch": 1.7412134863598985, + "grad_norm": 1.0971671342849731, + "learning_rate": 6.47055668776145e-06, + "loss": 0.031001167297363283, + "step": 48700 + }, + { + "epoch": 1.744788873395545, + "grad_norm": 0.5941652655601501, + "learning_rate": 6.381172011870286e-06, + "loss": 0.03148573875427246, + "step": 48800 + }, + { + "epoch": 1.7483642604311918, + "grad_norm": 1.0142033100128174, + "learning_rate": 6.29178733597912e-06, + "loss": 0.03321949720382691, + "step": 48900 + }, + { + "epoch": 1.7519396474668383, + "grad_norm": 1.1377204656600952, + "learning_rate": 6.202402660087954e-06, + "loss": 0.03343360424041748, + "step": 49000 + }, + { + "epoch": 1.7555150345024848, + "grad_norm": 0.5484851002693176, + "learning_rate": 6.113017984196789e-06, + "loss": 0.03009215831756592, + "step": 49100 + 
}, + { + "epoch": 1.7590904215381316, + "grad_norm": 0.4845998287200928, + "learning_rate": 6.023633308305624e-06, + "loss": 0.03416025161743164, + "step": 49200 + }, + { + "epoch": 1.762665808573778, + "grad_norm": 2.4999592304229736, + "learning_rate": 5.934248632414459e-06, + "loss": 0.03311382532119751, + "step": 49300 + }, + { + "epoch": 1.7662411956094246, + "grad_norm": 0.8577232956886292, + "learning_rate": 5.844863956523293e-06, + "loss": 0.030206308364868165, + "step": 49400 + }, + { + "epoch": 1.7698165826450714, + "grad_norm": 0.90534508228302, + "learning_rate": 5.755479280632128e-06, + "loss": 0.03304917335510254, + "step": 49500 + }, + { + "epoch": 1.7733919696807179, + "grad_norm": 0.4702795445919037, + "learning_rate": 5.666094604740963e-06, + "loss": 0.03289535760879517, + "step": 49600 + }, + { + "epoch": 1.7769673567163644, + "grad_norm": 0.3340344727039337, + "learning_rate": 5.5767099288497984e-06, + "loss": 0.03143750667572021, + "step": 49700 + }, + { + "epoch": 1.7805427437520112, + "grad_norm": 0.8033680319786072, + "learning_rate": 5.4873252529586334e-06, + "loss": 0.03799154043197632, + "step": 49800 + }, + { + "epoch": 1.784118130787658, + "grad_norm": 0.3498431444168091, + "learning_rate": 5.3979405770674684e-06, + "loss": 0.032227945327758786, + "step": 49900 + }, + { + "epoch": 1.7876935178233042, + "grad_norm": 0.5044463276863098, + "learning_rate": 5.308555901176303e-06, + "loss": 0.03224561214447021, + "step": 50000 + }, + { + "epoch": 1.7876935178233042, + "eval_accuracy": 0.9888107687606216, + "eval_f1": 0.9007455797770362, + "eval_loss": 0.04468328878283501, + "eval_precision": 0.890014593623709, + "eval_recall": 0.9117384929419544, + "eval_runtime": 27.446, + "eval_samples_per_second": 819.793, + "eval_steps_per_second": 22.772, + "step": 50000 + }, + { + "epoch": 1.791268904858951, + "grad_norm": 0.2692296504974365, + "learning_rate": 5.219171225285138e-06, + "loss": 0.03332348108291626, + "step": 50100 + }, + { + "epoch": 
1.7948442918945977, + "grad_norm": 0.29106396436691284, + "learning_rate": 5.129786549393973e-06, + "loss": 0.032147047519683836, + "step": 50200 + }, + { + "epoch": 1.7984196789302442, + "grad_norm": 0.20724542438983917, + "learning_rate": 5.040401873502807e-06, + "loss": 0.02886124849319458, + "step": 50300 + }, + { + "epoch": 1.8019950659658908, + "grad_norm": 0.7092130184173584, + "learning_rate": 4.951017197611642e-06, + "loss": 0.033159823417663575, + "step": 50400 + }, + { + "epoch": 1.8055704530015375, + "grad_norm": 0.432674765586853, + "learning_rate": 4.861632521720477e-06, + "loss": 0.03299700260162353, + "step": 50500 + }, + { + "epoch": 1.809145840037184, + "grad_norm": 0.9785314798355103, + "learning_rate": 4.772247845829311e-06, + "loss": 0.03019791841506958, + "step": 50600 + }, + { + "epoch": 1.8127212270728306, + "grad_norm": 0.5002002120018005, + "learning_rate": 4.682863169938146e-06, + "loss": 0.035624983310699465, + "step": 50700 + }, + { + "epoch": 1.8162966141084773, + "grad_norm": 0.765285313129425, + "learning_rate": 4.593478494046981e-06, + "loss": 0.02971407175064087, + "step": 50800 + }, + { + "epoch": 1.8198720011441238, + "grad_norm": 0.534965991973877, + "learning_rate": 4.504093818155815e-06, + "loss": 0.03354018688201904, + "step": 50900 + }, + { + "epoch": 1.8234473881797704, + "grad_norm": 0.7223150134086609, + "learning_rate": 4.41470914226465e-06, + "loss": 0.02953230619430542, + "step": 51000 + }, + { + "epoch": 1.8270227752154171, + "grad_norm": 0.38850611448287964, + "learning_rate": 4.325324466373485e-06, + "loss": 0.030534558296203614, + "step": 51100 + }, + { + "epoch": 1.8305981622510636, + "grad_norm": 0.36119019985198975, + "learning_rate": 4.23593979048232e-06, + "loss": 0.030811927318572997, + "step": 51200 + }, + { + "epoch": 1.8341735492867102, + "grad_norm": 0.4112676978111267, + "learning_rate": 4.146555114591154e-06, + "loss": 0.036168689727783206, + "step": 51300 + }, + { + "epoch": 1.837748936322357, + 
"grad_norm": 0.38200223445892334, + "learning_rate": 4.057170438699989e-06, + "loss": 0.03023934841156006, + "step": 51400 + }, + { + "epoch": 1.8413243233580037, + "grad_norm": 0.22987698018550873, + "learning_rate": 3.967785762808824e-06, + "loss": 0.03280112981796265, + "step": 51500 + }, + { + "epoch": 1.84489971039365, + "grad_norm": 0.5126951336860657, + "learning_rate": 3.8784010869176585e-06, + "loss": 0.032214133739471434, + "step": 51600 + }, + { + "epoch": 1.8484750974292967, + "grad_norm": 0.3394624888896942, + "learning_rate": 3.7890164110264935e-06, + "loss": 0.0288789963722229, + "step": 51700 + }, + { + "epoch": 1.8520504844649435, + "grad_norm": 0.8338372111320496, + "learning_rate": 3.699631735135328e-06, + "loss": 0.03252574443817138, + "step": 51800 + }, + { + "epoch": 1.8556258715005898, + "grad_norm": 0.2515293061733246, + "learning_rate": 3.6102470592441635e-06, + "loss": 0.029772815704345704, + "step": 51900 + }, + { + "epoch": 1.8592012585362365, + "grad_norm": 0.5206916332244873, + "learning_rate": 3.5208623833529985e-06, + "loss": 0.030335335731506347, + "step": 52000 + }, + { + "epoch": 1.8627766455718833, + "grad_norm": 2.3129968643188477, + "learning_rate": 3.431477707461833e-06, + "loss": 0.032417423725128174, + "step": 52100 + }, + { + "epoch": 1.8663520326075298, + "grad_norm": 1.627025842666626, + "learning_rate": 3.342093031570668e-06, + "loss": 0.03170029640197754, + "step": 52200 + }, + { + "epoch": 1.8699274196431763, + "grad_norm": 1.4574371576309204, + "learning_rate": 3.2527083556795027e-06, + "loss": 0.03141381978988647, + "step": 52300 + }, + { + "epoch": 1.873502806678823, + "grad_norm": 0.3863239288330078, + "learning_rate": 3.1633236797883373e-06, + "loss": 0.031075146198272705, + "step": 52400 + }, + { + "epoch": 1.8770781937144696, + "grad_norm": 0.4181801676750183, + "learning_rate": 3.0739390038971723e-06, + "loss": 0.031000993251800536, + "step": 52500 + }, + { + "epoch": 1.8770781937144696, + "eval_accuracy": 
0.9888013010709535, + "eval_f1": 0.9016717087789566, + "eval_loss": 0.04389448091387749, + "eval_precision": 0.8910285200988098, + "eval_recall": 0.9125722335623724, + "eval_runtime": 27.8666, + "eval_samples_per_second": 807.418, + "eval_steps_per_second": 22.428, + "step": 52500 + }, + { + "epoch": 1.8806535807501161, + "grad_norm": 0.2707064151763916, + "learning_rate": 2.984554328006007e-06, + "loss": 0.03185615539550781, + "step": 52600 + }, + { + "epoch": 1.8842289677857629, + "grad_norm": 0.5553069710731506, + "learning_rate": 2.895169652114842e-06, + "loss": 0.03002817392349243, + "step": 52700 + }, + { + "epoch": 1.8878043548214094, + "grad_norm": 0.3491911292076111, + "learning_rate": 2.8057849762236764e-06, + "loss": 0.028789632320404053, + "step": 52800 + }, + { + "epoch": 1.891379741857056, + "grad_norm": 0.25187739729881287, + "learning_rate": 2.716400300332511e-06, + "loss": 0.030605175495147706, + "step": 52900 + }, + { + "epoch": 1.8949551288927027, + "grad_norm": 0.9672222137451172, + "learning_rate": 2.627015624441346e-06, + "loss": 0.026704788208007812, + "step": 53000 + }, + { + "epoch": 1.8985305159283492, + "grad_norm": 0.20565390586853027, + "learning_rate": 2.5376309485501806e-06, + "loss": 0.03059121608734131, + "step": 53100 + }, + { + "epoch": 1.9021059029639957, + "grad_norm": 0.28167805075645447, + "learning_rate": 2.448246272659015e-06, + "loss": 0.03177599668502808, + "step": 53200 + }, + { + "epoch": 1.9056812899996425, + "grad_norm": 0.24386221170425415, + "learning_rate": 2.35886159676785e-06, + "loss": 0.029768753051757812, + "step": 53300 + }, + { + "epoch": 1.9092566770352892, + "grad_norm": 3.4795925617218018, + "learning_rate": 2.2694769208766848e-06, + "loss": 0.030632736682891844, + "step": 53400 + }, + { + "epoch": 1.9128320640709355, + "grad_norm": 0.28710371255874634, + "learning_rate": 2.1800922449855198e-06, + "loss": 0.03532270431518555, + "step": 53500 + }, + { + "epoch": 1.9164074511065823, + "grad_norm": 
1.0009117126464844, + "learning_rate": 2.090707569094355e-06, + "loss": 0.030157883167266846, + "step": 53600 + }, + { + "epoch": 1.919982838142229, + "grad_norm": 0.8986654877662659, + "learning_rate": 2.0013228932031894e-06, + "loss": 0.02968831777572632, + "step": 53700 + }, + { + "epoch": 1.9235582251778756, + "grad_norm": 0.4408089518547058, + "learning_rate": 1.9119382173120244e-06, + "loss": 0.031650230884552, + "step": 53800 + }, + { + "epoch": 1.927133612213522, + "grad_norm": 0.44061407446861267, + "learning_rate": 1.822553541420859e-06, + "loss": 0.03314239501953125, + "step": 53900 + }, + { + "epoch": 1.9307089992491688, + "grad_norm": 0.31529247760772705, + "learning_rate": 1.7331688655296938e-06, + "loss": 0.028174445629119874, + "step": 54000 + }, + { + "epoch": 1.9342843862848154, + "grad_norm": 0.46949172019958496, + "learning_rate": 1.6437841896385283e-06, + "loss": 0.03205679178237915, + "step": 54100 + }, + { + "epoch": 1.9378597733204619, + "grad_norm": 0.42985737323760986, + "learning_rate": 1.5543995137473631e-06, + "loss": 0.03423054218292236, + "step": 54200 + }, + { + "epoch": 1.9414351603561086, + "grad_norm": 0.3582230806350708, + "learning_rate": 1.465014837856198e-06, + "loss": 0.036082537174224855, + "step": 54300 + }, + { + "epoch": 1.9450105473917552, + "grad_norm": 0.2743465304374695, + "learning_rate": 1.375630161965033e-06, + "loss": 0.03133800745010376, + "step": 54400 + }, + { + "epoch": 1.9485859344274017, + "grad_norm": 0.3252977728843689, + "learning_rate": 1.2862454860738675e-06, + "loss": 0.029351208209991455, + "step": 54500 + }, + { + "epoch": 1.9521613214630484, + "grad_norm": 0.7166300415992737, + "learning_rate": 1.1968608101827023e-06, + "loss": 0.03286364078521729, + "step": 54600 + }, + { + "epoch": 1.955736708498695, + "grad_norm": 0.4002815783023834, + "learning_rate": 1.1074761342915371e-06, + "loss": 0.03330163955688477, + "step": 54700 + }, + { + "epoch": 1.9593120955343415, + "grad_norm": 0.6636976003646851, 
+ "learning_rate": 1.018091458400372e-06, + "loss": 0.03203016996383667, + "step": 54800 + }, + { + "epoch": 1.9628874825699882, + "grad_norm": 0.9583289623260498, + "learning_rate": 9.287067825092066e-07, + "loss": 0.03129979610443115, + "step": 54900 + }, + { + "epoch": 1.9664628696056348, + "grad_norm": 0.31978148221969604, + "learning_rate": 8.393221066180415e-07, + "loss": 0.029429452419281008, + "step": 55000 + }, + { + "epoch": 1.9664628696056348, + "eval_accuracy": 0.9891878983990663, + "eval_f1": 0.9045753492836575, + "eval_loss": 0.04267999157309532, + "eval_precision": 0.8949478748997595, + "eval_recall": 0.9144122128626053, + "eval_runtime": 27.5433, + "eval_samples_per_second": 816.897, + "eval_steps_per_second": 22.692, + "step": 55000 + }, + { + "epoch": 1.9700382566412813, + "grad_norm": 2.8054332733154297, + "learning_rate": 7.499374307268763e-07, + "loss": 0.03312858819961548, + "step": 55100 + }, + { + "epoch": 1.973613643676928, + "grad_norm": 0.5224851369857788, + "learning_rate": 6.60552754835711e-07, + "loss": 0.028790268898010254, + "step": 55200 + }, + { + "epoch": 1.9771890307125748, + "grad_norm": 0.26614582538604736, + "learning_rate": 5.711680789445458e-07, + "loss": 0.028711328506469725, + "step": 55300 + }, + { + "epoch": 1.980764417748221, + "grad_norm": 0.7065221667289734, + "learning_rate": 4.817834030533806e-07, + "loss": 0.03409520626068115, + "step": 55400 + }, + { + "epoch": 1.9843398047838678, + "grad_norm": 0.5520646572113037, + "learning_rate": 3.923987271622153e-07, + "loss": 0.030617287158966066, + "step": 55500 + }, + { + "epoch": 1.9879151918195146, + "grad_norm": 0.8152151703834534, + "learning_rate": 3.030140512710501e-07, + "loss": 0.034760825634002686, + "step": 55600 + }, + { + "epoch": 1.9914905788551611, + "grad_norm": 0.7719851136207581, + "learning_rate": 2.136293753798849e-07, + "loss": 0.033238520622253416, + "step": 55700 + }, + { + "epoch": 1.9950659658908076, + "grad_norm": 0.3627885580062866, + 
"learning_rate": 1.2424469948871967e-07, + "loss": 0.029695370197296143, + "step": 55800 + }, + { + "epoch": 1.9986413529264544, + "grad_norm": 1.9493422508239746, + "learning_rate": 3.4860023597554434e-08, + "loss": 0.032883105278015134, + "step": 55900 + }, + { + "epoch": 2.0, + "step": 55938, + "total_flos": 1.889848580814228e+18, + "train_loss": 0.058781605476671404, + "train_runtime": 18337.3113, + "train_samples_per_second": 439.268, + "train_steps_per_second": 3.051 + } + ], + "logging_steps": 100, + "max_steps": 55938, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.889848580814228e+18, + "train_batch_size": 72, + "trial_name": null, + "trial_params": null +}