diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5694 @@ +{ + "best_global_step": 7998, + "best_metric": 1.0033386945724487, + "best_model_checkpoint": "./../../../models/LedgerBERT-SciBERT-base-v3-News-Class/2025-10-15_00-24-07/market_direction/checkpoint-7998", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 7998, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_accuracy": 0.37046413502109704, + "eval_f1_macro": 0.31648220525898246, + "eval_f1_weighted": 0.3428571794493407, + "eval_loss": 1.0956553220748901, + "eval_precision_macro": 0.3404493817232522, + "eval_precision_weighted": 0.3541600044961222, + "eval_recall_macro": 0.3400488233349732, + "eval_recall_weighted": 0.37046413502109704, + "eval_runtime": 5.1696, + "eval_samples_per_second": 458.454, + "eval_steps_per_second": 57.452, + "step": 0 + }, + { + "epoch": 0.00037509377344336085, + "grad_norm": 6.950562000274658, + "learning_rate": 0.0, + "loss": 1.0664, + "step": 1 + }, + { + "epoch": 0.0037509377344336083, + "grad_norm": 7.2157673835754395, + "learning_rate": 3.6e-07, + "loss": 1.1053, + "step": 10 + }, + { + "epoch": 0.007501875468867217, + "grad_norm": 7.101637840270996, + "learning_rate": 7.6e-07, + "loss": 1.081, + "step": 20 + }, + { + "epoch": 0.011252813203300824, + "grad_norm": 7.503627777099609, + "learning_rate": 1.1600000000000001e-06, + "loss": 1.1167, + "step": 30 + }, + { + "epoch": 0.015003750937734433, + "grad_norm": 6.733654975891113, + "learning_rate": 1.56e-06, + "loss": 1.1038, + "step": 40 + }, + { + "epoch": 0.018754688672168042, + "grad_norm": 6.987666130065918, + "learning_rate": 1.9600000000000003e-06, + "loss": 1.1025, + "step": 50 + }, + { + "epoch": 0.02250562640660165, + "grad_norm": 9.358382225036621, + "learning_rate": 2.3600000000000003e-06, + "loss": 1.1166, + "step": 60 + }, + { + "epoch": 0.02625656414103526, + "grad_norm": 6.5409040451049805, + "learning_rate": 2.7600000000000003e-06, + "loss": 1.0841, + "step": 70 + }, + { + "epoch": 0.030007501875468866, + "grad_norm": 7.030813217163086, + "learning_rate": 3.1600000000000002e-06, + "loss": 1.0583, + "step": 80 + }, + { + "epoch": 0.03375843960990248, + "grad_norm": 6.986401081085205, + "learning_rate": 3.5600000000000002e-06, + "loss": 1.1288, + "step": 90 + }, + { + "epoch": 0.037509377344336084, + "grad_norm": 5.53237247467041, + "learning_rate": 3.96e-06, + "loss": 1.0573, + "step": 100 + }, + { + "epoch": 0.04126031507876969, + "grad_norm": 14.836161613464355, + "learning_rate": 4.360000000000001e-06, + "loss": 1.0754, + "step": 110 + }, + { + "epoch": 0.0450112528132033, + "grad_norm": 8.877525329589844, + "learning_rate": 4.76e-06, + "loss": 1.0753, + "step": 120 + }, + { + "epoch": 0.04876219054763691, + "grad_norm": 10.311164855957031, + "learning_rate": 5.1600000000000006e-06, + "loss": 1.1246, + "step": 130 + }, + { + "epoch": 0.05251312828207052, + "grad_norm": 5.360109329223633, + "learning_rate": 5.560000000000001e-06, + "loss": 1.0218, + "step": 140 + }, + { + "epoch": 0.056264066016504126, + "grad_norm": 10.249133110046387, + "learning_rate": 5.9600000000000005e-06, + "loss": 1.0723, + "step": 150 + }, + { + "epoch": 0.06001500375093773, + "grad_norm": 7.881443500518799, + "learning_rate": 6.360000000000001e-06, + "loss": 1.0727, + "step": 160 + }, + { + "epoch": 0.06376594148537135, + "grad_norm": 5.892578601837158, + "learning_rate": 6.760000000000001e-06, + "loss": 1.0498, + "step": 170 + }, + { + "epoch": 0.06751687921980495, + "grad_norm": 6.164844512939453, + "learning_rate": 7.16e-06, + "loss": 1.1182, + "step": 180 + }, + { + "epoch": 0.07126781695423856, + "grad_norm": 6.351868629455566, + "learning_rate": 7.5600000000000005e-06, + "loss": 1.0735, + "step": 190 + }, + { + "epoch": 0.07501875468867217, + "grad_norm": 6.895458698272705, + "learning_rate": 7.960000000000002e-06, + "loss": 1.0503, + "step": 200 + }, + { + "epoch": 0.07876969242310577, + "grad_norm": 8.486842155456543, + "learning_rate": 8.36e-06, + "loss": 1.0965, + "step": 210 + }, + { + "epoch": 0.08252063015753938, + "grad_norm": 8.301511764526367, + "learning_rate": 8.76e-06, + "loss": 1.1157, + "step": 220 + }, + { + "epoch": 0.08627156789197299, + "grad_norm": 11.515487670898438, + "learning_rate": 9.16e-06, + "loss": 1.0854, + "step": 230 + }, + { + "epoch": 0.0900225056264066, + "grad_norm": 6.189631938934326, + "learning_rate": 9.56e-06, + "loss": 1.054, + "step": 240 + }, + { + "epoch": 0.09377344336084022, + "grad_norm": 4.8885393142700195, + "learning_rate": 9.960000000000001e-06, + "loss": 1.0693, + "step": 250 + }, + { + "epoch": 0.09752438109527382, + "grad_norm": 6.190073490142822, + "learning_rate": 1.036e-05, + "loss": 1.0786, + "step": 260 + }, + { + "epoch": 0.10127531882970743, + "grad_norm": 8.178174018859863, + "learning_rate": 1.0760000000000002e-05, + "loss": 1.0374, + "step": 270 + }, + { + "epoch": 0.10502625656414104, + "grad_norm": 5.824592113494873, + "learning_rate": 1.1160000000000002e-05, + "loss": 1.0829, + "step": 280 + }, + { + "epoch": 0.10877719429857464, + "grad_norm": 7.339807033538818, + "learning_rate": 1.156e-05, + "loss": 1.1085, + "step": 290 + }, + { + "epoch": 0.11252813203300825, + "grad_norm": 6.39154577255249, + "learning_rate": 1.196e-05, + "loss": 1.0505, + "step": 300 + }, + { + "epoch": 0.11627906976744186, + "grad_norm": 7.54710054397583, + "learning_rate": 1.236e-05, + "loss": 1.049, + "step": 310 + }, + { + "epoch": 0.12003000750187547, + "grad_norm": 10.610452651977539, + "learning_rate": 1.2760000000000001e-05, + "loss": 1.1105, + "step": 320 + }, + { + "epoch": 0.12378094523630907, + "grad_norm": 6.961548328399658, + "learning_rate": 1.3160000000000001e-05, + "loss": 1.0392, + "step": 330 + }, + { + "epoch": 0.1275318829707427, + "grad_norm": 8.800139427185059, + "learning_rate": 1.3560000000000002e-05, + "loss": 1.1473, + "step": 340 + }, + { + "epoch": 0.1312828207051763, + "grad_norm": 7.540011405944824, + "learning_rate": 1.396e-05, + "loss": 1.0891, + "step": 350 + }, + { + "epoch": 0.1350337584396099, + "grad_norm": 11.337075233459473, + "learning_rate": 1.4360000000000001e-05, + "loss": 1.0715, + "step": 360 + }, + { + "epoch": 0.13878469617404351, + "grad_norm": 5.6576457023620605, + "learning_rate": 1.4760000000000001e-05, + "loss": 1.0702, + "step": 370 + }, + { + "epoch": 0.14253563390847712, + "grad_norm": 8.98009967803955, + "learning_rate": 1.516e-05, + "loss": 1.0752, + "step": 380 + }, + { + "epoch": 0.14628657164291073, + "grad_norm": 4.932474613189697, + "learning_rate": 1.556e-05, + "loss": 1.0641, + "step": 390 + }, + { + "epoch": 0.15003750937734434, + "grad_norm": 6.130215644836426, + "learning_rate": 1.5960000000000003e-05, + "loss": 1.0133, + "step": 400 + }, + { + "epoch": 0.15378844711177794, + "grad_norm": 16.0273380279541, + "learning_rate": 1.636e-05, + "loss": 1.0442, + "step": 410 + }, + { + "epoch": 0.15753938484621155, + "grad_norm": 12.93301010131836, + "learning_rate": 1.6760000000000002e-05, + "loss": 1.1161, + "step": 420 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 9.27346420288086, + "learning_rate": 1.7160000000000002e-05, + "loss": 1.0539, + "step": 430 + }, + { + "epoch": 0.16504126031507876, + "grad_norm": 5.5671186447143555, + "learning_rate": 1.756e-05, + "loss": 0.9452, + "step": 440 + }, + { + "epoch": 0.16879219804951237, + "grad_norm": 7.939000606536865, + "learning_rate": 1.796e-05, + "loss": 1.0522, + "step": 450 + }, + { + "epoch": 0.17254313578394598, + "grad_norm": 9.265899658203125, + "learning_rate": 1.8360000000000004e-05, + "loss": 1.0866, + "step": 460 + }, + { + "epoch": 0.17629407351837958, + "grad_norm": 6.934913158416748, + "learning_rate": 1.876e-05, + "loss": 0.9723, + "step": 470 + }, + { + "epoch": 0.1800450112528132, + "grad_norm": 6.007977485656738, + "learning_rate": 1.916e-05, + "loss": 0.9742, + "step": 480 + }, + { + "epoch": 0.1837959489872468, + "grad_norm": 7.842029094696045, + "learning_rate": 1.9560000000000002e-05, + "loss": 0.9334, + "step": 490 + }, + { + "epoch": 0.18754688672168043, + "grad_norm": 10.53432559967041, + "learning_rate": 1.9960000000000002e-05, + "loss": 0.8706, + "step": 500 + }, + { + "epoch": 0.19129782445611404, + "grad_norm": 9.365771293640137, + "learning_rate": 1.997599359829288e-05, + "loss": 1.0399, + "step": 510 + }, + { + "epoch": 0.19504876219054765, + "grad_norm": 9.351228713989258, + "learning_rate": 1.99493198186183e-05, + "loss": 0.9525, + "step": 520 + }, + { + "epoch": 0.19879969992498125, + "grad_norm": 12.21917724609375, + "learning_rate": 1.992264603894372e-05, + "loss": 0.9793, + "step": 530 + }, + { + "epoch": 0.20255063765941486, + "grad_norm": 17.076719284057617, + "learning_rate": 1.9895972259269142e-05, + "loss": 1.0403, + "step": 540 + }, + { + "epoch": 0.20630157539384847, + "grad_norm": 6.928652286529541, + "learning_rate": 1.9869298479594562e-05, + "loss": 0.9047, + "step": 550 + }, + { + "epoch": 0.21005251312828208, + "grad_norm": 6.858879089355469, + "learning_rate": 1.984262469991998e-05, + "loss": 1.012, + "step": 560 + }, + { + "epoch": 0.21380345086271568, + "grad_norm": 5.987520217895508, + "learning_rate": 1.98159509202454e-05, + "loss": 0.9345, + "step": 570 + }, + { + "epoch": 0.2175543885971493, + "grad_norm": 12.161517143249512, + "learning_rate": 1.978927714057082e-05, + "loss": 0.9955, + "step": 580 + }, + { + "epoch": 0.2213053263315829, + "grad_norm": 9.229764938354492, + "learning_rate": 1.976260336089624e-05, + "loss": 0.998, + "step": 590 + }, + { + "epoch": 0.2250562640660165, + "grad_norm": 9.257465362548828, + "learning_rate": 1.973592958122166e-05, + "loss": 0.9882, + "step": 600 + }, + { + "epoch": 0.2288072018004501, + "grad_norm": 11.260259628295898, + "learning_rate": 1.970925580154708e-05, + "loss": 0.9727, + "step": 610 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 5.7551984786987305, + "learning_rate": 1.96825820218725e-05, + "loss": 0.9139, + "step": 620 + }, + { + "epoch": 0.23630907726931732, + "grad_norm": 7.264505863189697, + "learning_rate": 1.9655908242197922e-05, + "loss": 0.8718, + "step": 630 + }, + { + "epoch": 0.24006001500375093, + "grad_norm": 13.518917083740234, + "learning_rate": 1.9629234462523342e-05, + "loss": 1.0478, + "step": 640 + }, + { + "epoch": 0.24381095273818454, + "grad_norm": 7.133944034576416, + "learning_rate": 1.960256068284876e-05, + "loss": 0.951, + "step": 650 + }, + { + "epoch": 0.24756189047261815, + "grad_norm": 10.491629600524902, + "learning_rate": 1.957588690317418e-05, + "loss": 0.9271, + "step": 660 + }, + { + "epoch": 0.25131282820705175, + "grad_norm": 6.807431697845459, + "learning_rate": 1.95492131234996e-05, + "loss": 1.0804, + "step": 670 + }, + { + "epoch": 0.2550637659414854, + "grad_norm": 9.180730819702148, + "learning_rate": 1.9522539343825024e-05, + "loss": 0.9079, + "step": 680 + }, + { + "epoch": 0.25881470367591897, + "grad_norm": 6.459209442138672, + "learning_rate": 1.9495865564150443e-05, + "loss": 0.9989, + "step": 690 + }, + { + "epoch": 0.2625656414103526, + "grad_norm": 5.8546929359436035, + "learning_rate": 1.9469191784475863e-05, + "loss": 0.951, + "step": 700 + }, + { + "epoch": 0.2663165791447862, + "grad_norm": 10.301909446716309, + "learning_rate": 1.9442518004801282e-05, + "loss": 0.8549, + "step": 710 + }, + { + "epoch": 0.2700675168792198, + "grad_norm": 17.759777069091797, + "learning_rate": 1.9415844225126702e-05, + "loss": 1.1818, + "step": 720 + }, + { + "epoch": 0.2738184546136534, + "grad_norm": 7.105804920196533, + "learning_rate": 1.938917044545212e-05, + "loss": 1.034, + "step": 730 + }, + { + "epoch": 0.27756939234808703, + "grad_norm": 8.125602722167969, + "learning_rate": 1.936249666577754e-05, + "loss": 0.9509, + "step": 740 + }, + { + "epoch": 0.2813203300825206, + "grad_norm": 6.968907833099365, + "learning_rate": 1.933582288610296e-05, + "loss": 0.9292, + "step": 750 + }, + { + "epoch": 0.28507126781695424, + "grad_norm": 9.841052055358887, + "learning_rate": 1.930914910642838e-05, + "loss": 1.0401, + "step": 760 + }, + { + "epoch": 0.2888222055513878, + "grad_norm": 6.7177910804748535, + "learning_rate": 1.9282475326753804e-05, + "loss": 1.0079, + "step": 770 + }, + { + "epoch": 0.29257314328582146, + "grad_norm": 8.652711868286133, + "learning_rate": 1.9255801547079223e-05, + "loss": 0.8986, + "step": 780 + }, + { + "epoch": 0.29632408102025504, + "grad_norm": 7.266161918640137, + "learning_rate": 1.9229127767404643e-05, + "loss": 0.9805, + "step": 790 + }, + { + "epoch": 0.30007501875468867, + "grad_norm": 7.372107982635498, + "learning_rate": 1.9202453987730062e-05, + "loss": 1.0254, + "step": 800 + }, + { + "epoch": 0.3038259564891223, + "grad_norm": 6.467881202697754, + "learning_rate": 1.9175780208055482e-05, + "loss": 0.9931, + "step": 810 + }, + { + "epoch": 0.3075768942235559, + "grad_norm": 8.692418098449707, + "learning_rate": 1.9149106428380905e-05, + "loss": 0.8585, + "step": 820 + }, + { + "epoch": 0.3113278319579895, + "grad_norm": 7.981175422668457, + "learning_rate": 1.9122432648706325e-05, + "loss": 0.9164, + "step": 830 + }, + { + "epoch": 0.3150787696924231, + "grad_norm": 11.882697105407715, + "learning_rate": 1.9095758869031744e-05, + "loss": 1.0325, + "step": 840 + }, + { + "epoch": 0.31882970742685673, + "grad_norm": 10.736306190490723, + "learning_rate": 1.9069085089357164e-05, + "loss": 0.9888, + "step": 850 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 5.334744453430176, + "learning_rate": 1.9042411309682583e-05, + "loss": 0.9364, + "step": 860 + }, + { + "epoch": 0.32633158289572395, + "grad_norm": 6.579550743103027, + "learning_rate": 1.9015737530008003e-05, + "loss": 0.9395, + "step": 870 + }, + { + "epoch": 0.3300825206301575, + "grad_norm": 7.336994171142578, + "learning_rate": 1.8989063750333423e-05, + "loss": 0.9363, + "step": 880 + }, + { + "epoch": 0.33383345836459116, + "grad_norm": 9.523600578308105, + "learning_rate": 1.8962389970658842e-05, + "loss": 0.9405, + "step": 890 + }, + { + "epoch": 0.33758439609902474, + "grad_norm": 9.350625038146973, + "learning_rate": 1.8935716190984262e-05, + "loss": 1.0351, + "step": 900 + }, + { + "epoch": 0.3413353338334584, + "grad_norm": 9.00391674041748, + "learning_rate": 1.8909042411309685e-05, + "loss": 0.9721, + "step": 910 + }, + { + "epoch": 0.34508627156789196, + "grad_norm": 5.69331693649292, + "learning_rate": 1.8882368631635105e-05, + "loss": 0.8811, + "step": 920 + }, + { + "epoch": 0.3488372093023256, + "grad_norm": 6.127689361572266, + "learning_rate": 1.8855694851960524e-05, + "loss": 1.0079, + "step": 930 + }, + { + "epoch": 0.35258814703675917, + "grad_norm": 11.653777122497559, + "learning_rate": 1.8829021072285944e-05, + "loss": 0.9518, + "step": 940 + }, + { + "epoch": 0.3563390847711928, + "grad_norm": 7.30828332901001, + "learning_rate": 1.8802347292611363e-05, + "loss": 0.8464, + "step": 950 + }, + { + "epoch": 0.3600900225056264, + "grad_norm": 9.21927547454834, + "learning_rate": 1.8775673512936786e-05, + "loss": 1.0584, + "step": 960 + }, + { + "epoch": 0.36384096024006, + "grad_norm": 6.939789772033691, + "learning_rate": 1.8748999733262206e-05, + "loss": 0.9, + "step": 970 + }, + { + "epoch": 0.3675918979744936, + "grad_norm": 12.434165954589844, + "learning_rate": 1.8722325953587626e-05, + "loss": 1.015, + "step": 980 + }, + { + "epoch": 0.37134283570892723, + "grad_norm": 11.779828071594238, + "learning_rate": 1.8695652173913045e-05, + "loss": 0.9725, + "step": 990 + }, + { + "epoch": 0.37509377344336087, + "grad_norm": 12.166790962219238, + "learning_rate": 1.8668978394238465e-05, + "loss": 1.0591, + "step": 1000 + }, + { + "epoch": 0.37884471117779445, + "grad_norm": 8.87903881072998, + "learning_rate": 1.8642304614563884e-05, + "loss": 0.9767, + "step": 1010 + }, + { + "epoch": 0.3825956489122281, + "grad_norm": 5.176930904388428, + "learning_rate": 1.8615630834889304e-05, + "loss": 0.8934, + "step": 1020 + }, + { + "epoch": 0.38634658664666166, + "grad_norm": 7.772132396697998, + "learning_rate": 1.8588957055214724e-05, + "loss": 0.9488, + "step": 1030 + }, + { + "epoch": 0.3900975243810953, + "grad_norm": 10.097055435180664, + "learning_rate": 1.8562283275540143e-05, + "loss": 0.9725, + "step": 1040 + }, + { + "epoch": 0.3938484621155289, + "grad_norm": 10.014994621276855, + "learning_rate": 1.8535609495865566e-05, + "loss": 0.9432, + "step": 1050 + }, + { + "epoch": 0.3975993998499625, + "grad_norm": 10.885961532592773, + "learning_rate": 1.8508935716190986e-05, + "loss": 1.0393, + "step": 1060 + }, + { + "epoch": 0.4013503375843961, + "grad_norm": 7.621641635894775, + "learning_rate": 1.8482261936516406e-05, + "loss": 0.9801, + "step": 1070 + }, + { + "epoch": 0.4051012753188297, + "grad_norm": 6.268519878387451, + "learning_rate": 1.8455588156841825e-05, + "loss": 0.9922, + "step": 1080 + }, + { + "epoch": 0.4088522130532633, + "grad_norm": 6.714245796203613, + "learning_rate": 1.8428914377167245e-05, + "loss": 1.0355, + "step": 1090 + }, + { + "epoch": 0.41260315078769694, + "grad_norm": 11.643074035644531, + "learning_rate": 1.8402240597492668e-05, + "loss": 1.0575, + "step": 1100 + }, + { + "epoch": 0.4163540885221305, + "grad_norm": 6.439828395843506, + "learning_rate": 1.8375566817818087e-05, + "loss": 0.9101, + "step": 1110 + }, + { + "epoch": 0.42010502625656415, + "grad_norm": 6.833279609680176, + "learning_rate": 1.8348893038143507e-05, + "loss": 0.935, + "step": 1120 + }, + { + "epoch": 0.42385596399099773, + "grad_norm": 7.262381553649902, + "learning_rate": 1.8322219258468927e-05, + "loss": 0.977, + "step": 1130 + }, + { + "epoch": 0.42760690172543137, + "grad_norm": 5.480360984802246, + "learning_rate": 1.8295545478794346e-05, + "loss": 0.8673, + "step": 1140 + }, + { + "epoch": 0.43135783945986494, + "grad_norm": 8.4745454788208, + "learning_rate": 1.8268871699119766e-05, + "loss": 0.88, + "step": 1150 + }, + { + "epoch": 0.4351087771942986, + "grad_norm": 16.769878387451172, + "learning_rate": 1.8242197919445185e-05, + "loss": 0.9576, + "step": 1160 + }, + { + "epoch": 0.43885971492873216, + "grad_norm": 7.4179582595825195, + "learning_rate": 1.8215524139770605e-05, + "loss": 0.9263, + "step": 1170 + }, + { + "epoch": 0.4426106526631658, + "grad_norm": 11.899470329284668, + "learning_rate": 1.8188850360096028e-05, + "loss": 0.9328, + "step": 1180 + }, + { + "epoch": 0.4463615903975994, + "grad_norm": 8.113855361938477, + "learning_rate": 1.8162176580421448e-05, + "loss": 0.9684, + "step": 1190 + }, + { + "epoch": 0.450112528132033, + "grad_norm": 7.619154453277588, + "learning_rate": 1.8135502800746867e-05, + "loss": 0.918, + "step": 1200 + }, + { + "epoch": 0.45386346586646664, + "grad_norm": 7.7961602210998535, + "learning_rate": 1.8108829021072287e-05, + "loss": 0.8574, + "step": 1210 + }, + { + "epoch": 0.4576144036009002, + "grad_norm": 8.734787940979004, + "learning_rate": 1.8082155241397707e-05, + "loss": 0.9009, + "step": 1220 + }, + { + "epoch": 0.46136534133533386, + "grad_norm": 5.773232936859131, + "learning_rate": 1.8055481461723126e-05, + "loss": 1.0554, + "step": 1230 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 7.872585773468018, + "learning_rate": 1.802880768204855e-05, + "loss": 0.8688, + "step": 1240 + }, + { + "epoch": 0.46886721680420107, + "grad_norm": 7.2498602867126465, + "learning_rate": 1.800213390237397e-05, + "loss": 0.9726, + "step": 1250 + }, + { + "epoch": 0.47261815453863465, + "grad_norm": 11.007004737854004, + "learning_rate": 1.797546012269939e-05, + "loss": 0.9338, + "step": 1260 + }, + { + "epoch": 0.4763690922730683, + "grad_norm": 10.418313980102539, + "learning_rate": 1.7948786343024808e-05, + "loss": 0.9217, + "step": 1270 + }, + { + "epoch": 0.48012003000750186, + "grad_norm": 11.935880661010742, + "learning_rate": 1.7922112563350228e-05, + "loss": 0.8656, + "step": 1280 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 10.331807136535645, + "learning_rate": 1.789543878367565e-05, + "loss": 0.9948, + "step": 1290 + }, + { + "epoch": 0.4876219054763691, + "grad_norm": 7.979977607727051, + "learning_rate": 1.7868765004001067e-05, + "loss": 0.9068, + "step": 1300 + }, + { + "epoch": 0.4913728432108027, + "grad_norm": 7.865904808044434, + "learning_rate": 1.7842091224326486e-05, + "loss": 0.8362, + "step": 1310 + }, + { + "epoch": 0.4951237809452363, + "grad_norm": 11.6406888961792, + "learning_rate": 1.781541744465191e-05, + "loss": 1.0061, + "step": 1320 + }, + { + "epoch": 0.4988747186796699, + "grad_norm": 9.274069786071777, + "learning_rate": 1.778874366497733e-05, + "loss": 0.9448, + "step": 1330 + }, + { + "epoch": 0.5026256564141035, + "grad_norm": 9.999556541442871, + "learning_rate": 1.776206988530275e-05, + "loss": 0.9188, + "step": 1340 + }, + { + "epoch": 0.5063765941485371, + "grad_norm": 10.032958984375, + "learning_rate": 1.773539610562817e-05, + "loss": 0.9794, + "step": 1350 + }, + { + "epoch": 0.5101275318829708, + "grad_norm": 5.453114032745361, + "learning_rate": 1.7708722325953588e-05, + "loss": 1.0102, + "step": 1360 + }, + { + "epoch": 0.5138784696174044, + "grad_norm": 13.257373809814453, + "learning_rate": 1.7682048546279008e-05, + "loss": 0.9801, + "step": 1370 + }, + { + "epoch": 0.5176294073518379, + "grad_norm": 5.355706691741943, + "learning_rate": 1.765537476660443e-05, + "loss": 0.9126, + "step": 1380 + }, + { + "epoch": 0.5213803450862715, + "grad_norm": 9.768399238586426, + "learning_rate": 1.762870098692985e-05, + "loss": 0.9423, + "step": 1390 + }, + { + "epoch": 0.5251312828207052, + "grad_norm": 8.362143516540527, + "learning_rate": 1.760202720725527e-05, + "loss": 1.0289, + "step": 1400 + }, + { + "epoch": 0.5288822205551388, + "grad_norm": 10.58354377746582, + "learning_rate": 1.757535342758069e-05, + "loss": 0.9593, + "step": 1410 + }, + { + "epoch": 0.5326331582895724, + "grad_norm": 8.964977264404297, + "learning_rate": 1.754867964790611e-05, + "loss": 1.1002, + "step": 1420 + }, + { + "epoch": 0.536384096024006, + "grad_norm": 11.886764526367188, + "learning_rate": 1.7522005868231532e-05, + "loss": 0.842, + "step": 1430 + }, + { + "epoch": 0.5401350337584396, + "grad_norm": 9.155001640319824, + "learning_rate": 1.7495332088556948e-05, + "loss": 1.0402, + "step": 1440 + }, + { + "epoch": 0.5438859714928732, + "grad_norm": 7.865649223327637, + "learning_rate": 1.7468658308882368e-05, + "loss": 0.9502, + "step": 1450 + }, + { + "epoch": 0.5476369092273068, + "grad_norm": 8.232137680053711, + "learning_rate": 1.744198452920779e-05, + "loss": 0.9042, + "step": 1460 + }, + { + "epoch": 0.5513878469617405, + "grad_norm": 7.428460597991943, + "learning_rate": 1.741531074953321e-05, + "loss": 0.8664, + "step": 1470 + }, + { + "epoch": 0.5551387846961741, + "grad_norm": 6.769949913024902, + "learning_rate": 1.738863696985863e-05, + "loss": 0.9676, + "step": 1480 + }, + { + "epoch": 0.5588897224306076, + "grad_norm": 7.262323379516602, + "learning_rate": 1.736196319018405e-05, + "loss": 0.9461, + "step": 1490 + }, + { + "epoch": 0.5626406601650412, + "grad_norm": 7.46332311630249, + "learning_rate": 1.733528941050947e-05, + "loss": 0.9928, + "step": 1500 + }, + { + "epoch": 0.5663915978994749, + "grad_norm": 13.346348762512207, + "learning_rate": 1.7308615630834892e-05, + "loss": 0.9645, + "step": 1510 + }, + { + "epoch": 0.5701425356339085, + "grad_norm": 7.057946681976318, + "learning_rate": 1.7281941851160312e-05, + "loss": 0.872, + "step": 1520 + }, + { + "epoch": 0.5738934733683421, + "grad_norm": 11.920793533325195, + "learning_rate": 1.725526807148573e-05, + "loss": 0.9084, + "step": 1530 + }, + { + "epoch": 0.5776444111027756, + "grad_norm": 4.696298122406006, + "learning_rate": 1.722859429181115e-05, + "loss": 0.9184, + "step": 1540 + }, + { + "epoch": 0.5813953488372093, + "grad_norm": 9.623963356018066, + "learning_rate": 1.720192051213657e-05, + "loss": 0.8924, + "step": 1550 + }, + { + "epoch": 0.5851462865716429, + "grad_norm": 10.262091636657715, + "learning_rate": 1.717524673246199e-05, + "loss": 0.9476, + "step": 1560 + }, + { + "epoch": 0.5888972243060765, + "grad_norm": 10.587578773498535, + "learning_rate": 1.7148572952787413e-05, + "loss": 0.9443, + "step": 1570 + }, + { + "epoch": 0.5926481620405101, + "grad_norm": 8.189558029174805, + "learning_rate": 1.7121899173112833e-05, + "loss": 0.9245, + "step": 1580 + }, + { + "epoch": 0.5963990997749438, + "grad_norm": 7.582670211791992, + "learning_rate": 1.709522539343825e-05, + "loss": 0.8533, + "step": 1590 + }, + { + "epoch": 0.6001500375093773, + "grad_norm": 8.973713874816895, + "learning_rate": 1.7068551613763672e-05, + "loss": 0.9197, + "step": 1600 + }, + { + "epoch": 0.6039009752438109, + "grad_norm": 7.140238285064697, + "learning_rate": 1.7041877834089092e-05, + "loss": 0.8815, + "step": 1610 + }, + { + "epoch": 0.6076519129782446, + "grad_norm": 7.83927059173584, + "learning_rate": 1.701520405441451e-05, + "loss": 0.9782, + "step": 1620 + }, + { + "epoch": 0.6114028507126782, + "grad_norm": 6.876523494720459, + "learning_rate": 1.698853027473993e-05, + "loss": 0.9575, + "step": 1630 + }, + { + "epoch": 0.6151537884471118, + "grad_norm": 10.362568855285645, + "learning_rate": 1.696185649506535e-05, + "loss": 0.8977, + "step": 1640 + }, + { + "epoch": 0.6189047261815454, + "grad_norm": 9.509383201599121, + "learning_rate": 1.6935182715390774e-05, + "loss": 0.996, + "step": 1650 + }, + { + "epoch": 0.622655663915979, + "grad_norm": 5.023642539978027, + "learning_rate": 1.6908508935716193e-05, + "loss": 0.9131, + "step": 1660 + }, + { + "epoch": 0.6264066016504126, + "grad_norm": 6.320276260375977, + "learning_rate": 1.6881835156041613e-05, + "loss": 0.9765, + "step": 1670 + }, + { + "epoch": 0.6301575393848462, + "grad_norm": 10.261762619018555, + "learning_rate": 1.6855161376367033e-05, + "loss": 0.9057, + "step": 1680 + }, + { + "epoch": 0.6339084771192798, + "grad_norm": 8.115468978881836, + "learning_rate": 1.6828487596692452e-05, + "loss": 0.8892, + "step": 1690 + }, + { + "epoch": 0.6376594148537135, + "grad_norm": 10.657661437988281, + "learning_rate": 1.6801813817017875e-05, + "loss": 0.9186, + "step": 1700 + }, + { + "epoch": 0.641410352588147, + "grad_norm": 7.065814018249512, + "learning_rate": 1.6775140037343295e-05, + "loss": 0.8878, + "step": 1710 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 8.048439979553223, + "learning_rate": 1.6748466257668714e-05, + "loss": 0.946, + "step": 1720 + }, + { + "epoch": 0.6489122280570142, + "grad_norm": 10.228202819824219, + "learning_rate": 1.672179247799413e-05, + "loss": 0.838, + "step": 1730 + }, + { + "epoch": 0.6526631657914479, + "grad_norm": 10.011300086975098, + "learning_rate": 1.6695118698319554e-05, + "loss": 1.0565, + "step": 1740 + }, + { + "epoch": 0.6564141035258815, + "grad_norm": 8.266985893249512, + "learning_rate": 1.6668444918644973e-05, + "loss": 0.9523, + "step": 1750 + }, + { + "epoch": 0.660165041260315, + "grad_norm": 7.511131763458252, + "learning_rate": 1.6641771138970393e-05, + "loss": 1.0325, + "step": 1760 + }, + { + "epoch": 0.6639159789947486, + "grad_norm": 7.235232830047607, + "learning_rate": 1.6615097359295813e-05, + "loss": 0.9197, + "step": 1770 + }, + { + "epoch": 0.6676669167291823, + "grad_norm": 8.137916564941406, + "learning_rate": 1.6588423579621232e-05, + "loss": 0.886, + "step": 1780 + }, + { + "epoch": 0.6714178544636159, + "grad_norm": 7.320621013641357, + "learning_rate": 1.6561749799946655e-05, + "loss": 0.8866, + "step": 1790 + }, + { + "epoch": 0.6751687921980495, + "grad_norm": 8.104268074035645, + "learning_rate": 1.6535076020272075e-05, + "loss": 0.9554, + "step": 1800 + }, + { + "epoch": 0.6789197299324832, + "grad_norm": 8.669350624084473, + "learning_rate": 1.6508402240597494e-05, + "loss": 0.907, + "step": 1810 + }, + { + "epoch": 0.6826706676669168, + "grad_norm": 7.718722820281982, + "learning_rate": 1.6481728460922914e-05, + "loss": 0.9931, + "step": 1820 + }, + { + "epoch": 0.6864216054013503, + "grad_norm": 6.479692459106445, + "learning_rate": 1.6455054681248334e-05, + "loss": 0.9669, + "step": 1830 + }, + { + "epoch": 0.6901725431357839, + "grad_norm": 5.159636497497559, + "learning_rate": 1.6428380901573757e-05, + "loss": 1.0003, + "step": 1840 + }, + { + "epoch": 0.6939234808702176, + "grad_norm": 6.043707847595215, + "learning_rate": 1.6401707121899176e-05, + "loss": 0.894, + "step": 1850 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 8.509610176086426, + "learning_rate": 1.6375033342224596e-05, + "loss": 1.0656, + "step": 1860 + }, + { + "epoch": 0.7014253563390848, + "grad_norm": 10.496292114257812, + "learning_rate": 1.6348359562550015e-05, + "loss": 0.9162, + "step": 1870 + }, + { + "epoch": 0.7051762940735183, + "grad_norm": 9.357151985168457, + "learning_rate": 1.6321685782875435e-05, + "loss": 0.8575, + "step": 1880 + }, + { + "epoch": 0.708927231807952, + "grad_norm": 7.78256368637085, + "learning_rate": 1.6295012003200855e-05, + "loss": 0.7904, + "step": 1890 + }, + { + "epoch": 0.7126781695423856, + "grad_norm": 6.14832067489624, + "learning_rate": 1.6268338223526274e-05, + "loss": 0.8348, + "step": 1900 + }, + { + "epoch": 0.7164291072768192, + "grad_norm": 7.879366874694824, + "learning_rate": 1.6241664443851694e-05, + "loss": 0.8826, + "step": 1910 + }, + { + "epoch": 0.7201800450112528, + "grad_norm": 6.204752445220947, + "learning_rate": 1.6214990664177114e-05, + "loss": 0.9157, + "step": 1920 + }, + { + "epoch": 0.7239309827456865, + "grad_norm": 7.274019241333008, + "learning_rate": 1.6188316884502537e-05, + "loss": 0.8869, + "step": 1930 + }, + { + "epoch": 0.72768192048012, + "grad_norm": 5.929676055908203, + "learning_rate": 1.6161643104827956e-05, + "loss": 0.9372, + "step": 1940 + }, + { + "epoch": 0.7314328582145536, + "grad_norm": 9.161755561828613, + "learning_rate": 1.6134969325153376e-05, + "loss": 0.9211, + "step": 1950 + }, + { + "epoch": 0.7351837959489872, + "grad_norm": 5.079675674438477, + "learning_rate": 1.6108295545478795e-05, + "loss": 0.8084, + "step": 1960 + }, + { + "epoch": 0.7389347336834209, + "grad_norm": 8.15173053741455, + "learning_rate": 1.6081621765804215e-05, + "loss": 1.0033, + "step": 1970 + }, + { + "epoch": 0.7426856714178545, + "grad_norm": 6.805727005004883, + "learning_rate": 1.6054947986129638e-05, + "loss": 1.0074, + "step": 1980 + }, + { + "epoch": 0.746436609152288, + "grad_norm": 8.05391788482666, + "learning_rate": 1.6028274206455058e-05, + "loss": 0.9942, + "step": 1990 + }, + { + "epoch": 0.7501875468867217, + "grad_norm": 6.02817440032959, + "learning_rate": 1.6001600426780477e-05, + "loss": 1.0494, + "step": 2000 + }, + { + "epoch": 0.7539384846211553, + "grad_norm": 9.404801368713379, + "learning_rate": 1.5974926647105897e-05, + "loss": 0.9451, + "step": 2010 + }, + { + "epoch": 0.7576894223555889, + "grad_norm": 5.526783466339111, + "learning_rate": 1.5948252867431316e-05, + "loss": 0.9378, + "step": 2020 + }, + { + "epoch": 0.7614403600900225, + "grad_norm": 8.972588539123535, + "learning_rate": 1.5921579087756736e-05, + "loss": 0.9808, + "step": 2030 + }, + { + "epoch": 0.7651912978244562, + "grad_norm": 4.961981296539307, + "learning_rate": 1.5894905308082156e-05, + "loss": 0.9078, + "step": 2040 + }, + { + "epoch": 0.7689422355588897, + "grad_norm": 3.8509440422058105, + "learning_rate": 1.5868231528407575e-05, + "loss": 1.0518, + "step": 2050 + }, + { + "epoch": 0.7726931732933233, + "grad_norm": 7.673577785491943, + "learning_rate": 1.5841557748732995e-05, + "loss": 0.9075, + "step": 2060 + }, + { + "epoch": 0.7764441110277569, + "grad_norm": 8.731016159057617, + "learning_rate": 1.5814883969058418e-05, + "loss": 0.9208, + "step": 2070 + }, + { + "epoch": 0.7801950487621906, + "grad_norm": 6.979492664337158, + "learning_rate": 1.5788210189383838e-05, + "loss": 0.8977, + "step": 2080 + }, + { + "epoch": 0.7839459864966242, + "grad_norm": 8.666240692138672, + "learning_rate": 1.5761536409709257e-05, + "loss": 0.899, + "step": 2090 + }, + { + "epoch": 0.7876969242310577, + "grad_norm": 6.528694152832031, + "learning_rate": 1.5734862630034677e-05, + "loss": 0.844, + "step": 2100 + }, + { + "epoch": 0.7914478619654913, + "grad_norm": 7.253232479095459, + "learning_rate": 1.5708188850360096e-05, + "loss": 0.7766, + "step": 2110 + }, + { + "epoch": 0.795198799699925, + "grad_norm": 6.888519287109375, + "learning_rate": 1.568151507068552e-05, + "loss": 0.9393, + "step": 2120 + }, + { + "epoch": 0.7989497374343586, + "grad_norm": 6.408233165740967, + "learning_rate": 1.565484129101094e-05, + "loss": 1.0171, + "step": 2130 + }, + { + "epoch": 0.8027006751687922, + "grad_norm": 9.36056137084961, + "learning_rate": 1.562816751133636e-05, + "loss": 0.9127, + "step": 2140 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 11.695134162902832, + "learning_rate": 1.5601493731661778e-05, + "loss": 1.0232, + "step": 2150 + }, + { + "epoch": 0.8102025506376594, + "grad_norm": 6.716568470001221, + "learning_rate": 1.5574819951987198e-05, + "loss": 0.9904, + "step": 2160 + }, + { + "epoch": 0.813953488372093, + "grad_norm": 5.994268417358398, + "learning_rate": 1.5548146172312617e-05, + "loss": 0.8897, + "step": 2170 + }, + { + "epoch": 0.8177044261065266, + "grad_norm": 8.419204711914062, + "learning_rate": 1.5521472392638037e-05, + "loss": 0.8315, + "step": 2180 + }, + { + "epoch": 0.8214553638409603, + "grad_norm": 6.702762603759766, + "learning_rate": 1.5494798612963457e-05, + "loss": 0.9393, + "step": 2190 + }, + { + "epoch": 0.8252063015753939, + "grad_norm": 9.53264045715332, + "learning_rate": 1.5468124833288876e-05, + "loss": 1.0074, + "step": 2200 + }, + { + "epoch": 0.8289572393098275, + "grad_norm": 5.6720476150512695, + "learning_rate": 1.54414510536143e-05, + "loss": 0.7935, + "step": 2210 + }, + { + "epoch": 0.832708177044261, + "grad_norm": 7.338003158569336, + "learning_rate": 1.541477727393972e-05, + "loss": 0.898, + "step": 2220 + }, + { + "epoch": 0.8364591147786947, + "grad_norm": 6.529892444610596, + "learning_rate": 1.538810349426514e-05, + "loss": 0.8197, + "step": 2230 + }, + { + "epoch": 0.8402100525131283, + "grad_norm": 9.971487045288086, + "learning_rate": 1.5361429714590558e-05, + "loss": 0.9551, + "step": 2240 + }, + { + "epoch": 0.8439609902475619, + "grad_norm": 5.594128608703613, + "learning_rate": 1.5334755934915978e-05, + "loss": 1.1114, + "step": 2250 + }, + { + "epoch": 0.8477119279819955, + "grad_norm": 5.723794460296631, + "learning_rate": 1.53080821552414e-05, + "loss": 0.9341, + "step": 2260 + }, + { + "epoch": 0.8514628657164292, + "grad_norm": 5.728211879730225, + "learning_rate": 1.528140837556682e-05, + "loss": 0.9961, + "step": 2270 + }, + { + "epoch": 0.8552138034508627, + "grad_norm": 7.517919063568115, + "learning_rate": 1.525473459589224e-05, + "loss": 0.8542, + "step": 2280 + }, + { + "epoch": 0.8589647411852963, + "grad_norm": 4.70159387588501, + "learning_rate": 1.522806081621766e-05, + "loss": 1.0348, + "step": 2290 + }, + { + "epoch": 0.8627156789197299, + "grad_norm": 5.308437347412109, + "learning_rate": 1.5201387036543081e-05, + "loss": 0.9645, + "step": 2300 + }, + { + "epoch": 0.8664666166541636, + "grad_norm": 5.659054756164551, + "learning_rate": 1.5174713256868499e-05, + "loss": 0.8317, + "step": 2310 + }, + { + "epoch": 0.8702175543885972, + "grad_norm": 5.970462799072266, + "learning_rate": 1.5148039477193918e-05, + "loss": 0.9889, + "step": 2320 + }, + { + "epoch": 0.8739684921230307, + "grad_norm": 5.605343818664551, + "learning_rate": 1.512136569751934e-05, + "loss": 0.8545, + "step": 2330 + }, + { + "epoch": 0.8777194298574643, + "grad_norm": 9.641878128051758, + "learning_rate": 1.509469191784476e-05, + "loss": 1.0026, + "step": 2340 + }, + { + "epoch": 0.881470367591898, + "grad_norm": 9.36474323272705, + "learning_rate": 1.5068018138170179e-05, + "loss": 0.927, + "step": 2350 + }, + { + "epoch": 0.8852213053263316, + "grad_norm": 8.28822135925293, + "learning_rate": 1.50413443584956e-05, + "loss": 0.9955, + "step": 2360 + }, + { + "epoch": 0.8889722430607652, + "grad_norm": 7.714781284332275, + "learning_rate": 1.501467057882102e-05, + "loss": 0.9366, + "step": 2370 + }, + { + "epoch": 0.8927231807951987, + "grad_norm": 3.879307508468628, + "learning_rate": 1.498799679914644e-05, + "loss": 0.9002, + "step": 2380 + }, + { + "epoch": 0.8964741185296324, + "grad_norm": 5.898133754730225, + "learning_rate": 1.4961323019471861e-05, + "loss": 0.8564, + "step": 2390 + }, + { + "epoch": 0.900225056264066, + "grad_norm": 6.275933265686035, + "learning_rate": 1.493464923979728e-05, + "loss": 0.9471, + "step": 2400 + }, + { + "epoch": 0.9039759939984996, + "grad_norm": 6.680263519287109, + "learning_rate": 1.4907975460122702e-05, + "loss": 0.8609, + "step": 2410 + }, + { + "epoch": 0.9077269317329333, + "grad_norm": 7.0698676109313965, + "learning_rate": 1.4881301680448121e-05, + "loss": 0.7758, + "step": 2420 + }, + { + "epoch": 0.9114778694673669, + "grad_norm": 10.66848373413086, + "learning_rate": 1.4854627900773541e-05, + "loss": 0.8225, + "step": 2430 + }, + { + "epoch": 0.9152288072018004, + "grad_norm": 8.714693069458008, + "learning_rate": 1.4827954121098962e-05, + "loss": 0.8777, + "step": 2440 + }, + { + "epoch": 0.918979744936234, + "grad_norm": 31.062232971191406, + "learning_rate": 1.480128034142438e-05, + "loss": 1.0204, + "step": 2450 + }, + { + "epoch": 0.9227306826706677, + "grad_norm": 11.140453338623047, + "learning_rate": 1.47746065617498e-05, + "loss": 0.9509, + "step": 2460 + }, + { + "epoch": 0.9264816204051013, + "grad_norm": 6.338695526123047, + "learning_rate": 1.4747932782075221e-05, + "loss": 0.8125, + "step": 2470 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 8.720800399780273, + "learning_rate": 1.472125900240064e-05, + "loss": 0.8114, + "step": 2480 + }, + { + "epoch": 0.9339834958739685, + "grad_norm": 11.407164573669434, + "learning_rate": 1.469458522272606e-05, + "loss": 1.0623, + "step": 2490 + }, + { + "epoch": 0.9377344336084021, + "grad_norm": 6.310417652130127, + "learning_rate": 1.4667911443051482e-05, + "loss": 0.9014, + "step": 2500 + }, + { + "epoch": 0.9414853713428357, + "grad_norm": 5.94149923324585, + "learning_rate": 1.4641237663376901e-05, + "loss": 0.7657, + "step": 2510 + }, + { + "epoch": 0.9452363090772693, + "grad_norm": 9.478999137878418, + "learning_rate": 1.4614563883702323e-05, + "loss": 0.8412, + "step": 2520 + }, + { + "epoch": 0.9489872468117029, + "grad_norm": 8.735868453979492, + "learning_rate": 1.4587890104027742e-05, + "loss": 0.9043, + "step": 2530 + }, + { + "epoch": 0.9527381845461366, + "grad_norm": 6.766534328460693, + "learning_rate": 1.4561216324353162e-05, + "loss": 0.9538, + "step": 2540 + }, + { + "epoch": 0.9564891222805701, + "grad_norm": 18.577468872070312, + "learning_rate": 1.4534542544678583e-05, + "loss": 0.9458, + "step": 2550 + }, + { + "epoch": 0.9602400600150037, + "grad_norm": 9.248088836669922, + "learning_rate": 1.4507868765004003e-05, + "loss": 0.913, + "step": 2560 + }, + { + "epoch": 0.9639909977494373, + "grad_norm": 7.771203994750977, + "learning_rate": 1.4481194985329422e-05, + "loss": 0.931, + "step": 2570 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 7.330334663391113, + "learning_rate": 1.4454521205654844e-05, + "loss": 0.9681, + "step": 2580 + }, + { + "epoch": 0.9714928732183046, + "grad_norm": 6.74515438079834, + "learning_rate": 1.4427847425980263e-05, + "loss": 0.9477, + "step": 2590 + }, + { + "epoch": 0.9752438109527382, + "grad_norm": 8.954100608825684, + "learning_rate": 1.4401173646305681e-05, + "loss": 0.8958, + "step": 2600 + }, + { + "epoch": 0.9789947486871718, + "grad_norm": 11.33262825012207, + "learning_rate": 1.4374499866631103e-05, + "loss": 0.7998, + "step": 2610 + }, + { + "epoch": 0.9827456864216054, + "grad_norm": 7.142065048217773, + "learning_rate": 1.4347826086956522e-05, + "loss": 0.9897, + "step": 2620 + }, + { + "epoch": 0.986496624156039, + "grad_norm": 8.922056198120117, + "learning_rate": 1.4321152307281942e-05, + "loss": 0.9172, + "step": 2630 + }, + { + "epoch": 0.9902475618904726, + "grad_norm": 5.288200378417969, + "learning_rate": 1.4294478527607363e-05, + "loss": 0.8836, + "step": 2640 + }, + { + "epoch": 0.9939984996249063, + "grad_norm": 10.067593574523926, + "learning_rate": 1.4267804747932783e-05, + "loss": 1.0019, + "step": 2650 + }, + { + "epoch": 0.9977494373593399, + "grad_norm": 5.186861515045166, + "learning_rate": 1.4241130968258204e-05, + "loss": 0.8005, + "step": 2660 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.5890295358649789, + "eval_f1_macro": 0.5855792301386851, + "eval_f1_weighted": 0.5883403945261724, + "eval_loss": 0.9054797887802124, + "eval_precision_macro": 0.5964531108356991, + "eval_precision_weighted": 0.5920764019753845, + "eval_recall_macro": 0.5799936335134275, + "eval_recall_weighted": 0.5890295358649789, + "eval_runtime": 4.8377, + "eval_samples_per_second": 489.903, + "eval_steps_per_second": 61.393, + "step": 2666 + }, + { + "epoch": 1.0015003750937734, + "grad_norm": 7.454843044281006, + "learning_rate": 1.4214457188583624e-05, + "loss": 0.9261, + "step": 2670 + }, + { + "epoch": 1.005251312828207, + "grad_norm": 7.612959384918213, + "learning_rate": 1.4187783408909043e-05, + "loss": 0.8656, + "step": 2680 + }, + { + "epoch": 1.0090022505626406, + "grad_norm": 5.689546585083008, + "learning_rate": 1.4161109629234465e-05, + "loss": 0.8539, + "step": 2690 + }, + { + "epoch": 1.0127531882970742, + "grad_norm": 9.812941551208496, + "learning_rate": 1.4134435849559884e-05, + "loss": 0.8154, + "step": 2700 + }, + { + "epoch": 1.016504126031508, + "grad_norm": 6.9208550453186035, + "learning_rate": 1.4107762069885304e-05, + "loss": 0.8441, + "step": 2710 + }, + { + "epoch": 1.0202550637659416, + "grad_norm": 5.310056686401367, + "learning_rate": 1.4081088290210725e-05, + "loss": 0.9471, + "step": 2720 + }, + { + "epoch": 1.0240060015003751, + "grad_norm": 9.985223770141602, + "learning_rate": 1.4054414510536145e-05, + "loss": 0.853, + "step": 2730 + }, + { + "epoch": 1.0277569392348087, + "grad_norm": 21.524646759033203, + "learning_rate": 1.4027740730861563e-05, + "loss": 0.9408, + "step": 2740 + }, + { + "epoch": 1.0315078769692423, + "grad_norm": 9.250083923339844, + "learning_rate": 1.4001066951186984e-05, + "loss": 0.8023, + "step": 2750 + }, + { + "epoch": 1.0352588147036759, + "grad_norm": 6.028738975524902, + "learning_rate": 1.3974393171512404e-05, + "loss": 0.7849, + "step": 2760 + }, + { + "epoch": 1.0390097524381094, + "grad_norm": 9.787884712219238, + "learning_rate": 1.3947719391837823e-05, + "loss": 0.7474, + "step": 2770 + }, + { + "epoch": 1.042760690172543, + "grad_norm": 12.639663696289062, + "learning_rate": 1.3921045612163244e-05, + "loss": 0.8167, + "step": 2780 + }, + { + "epoch": 1.0465116279069768, + "grad_norm": 15.691644668579102, + "learning_rate": 1.3894371832488664e-05, + "loss": 0.7467, + "step": 2790 + }, + { + "epoch": 1.0502625656414104, + "grad_norm": 7.864928722381592, + "learning_rate": 1.3867698052814085e-05, + "loss": 0.9476, + "step": 2800 + }, + { + "epoch": 1.054013503375844, + "grad_norm": 8.662647247314453, + "learning_rate": 1.3841024273139505e-05, + "loss": 0.8529, + "step": 2810 + }, + { + "epoch": 1.0577644411102776, + "grad_norm": 8.244277954101562, + "learning_rate": 1.3814350493464925e-05, + "loss": 0.753, + "step": 2820 + }, + { + "epoch": 1.0615153788447111, + "grad_norm": 8.806965827941895, + "learning_rate": 1.3787676713790346e-05, + "loss": 0.7577, + "step": 2830 + }, + { + "epoch": 1.0652663165791447, + "grad_norm": 11.864466667175293, + "learning_rate": 1.3761002934115766e-05, + "loss": 0.8227, + "step": 2840 + }, + { + "epoch": 1.0690172543135783, + "grad_norm": 16.477638244628906, + "learning_rate": 1.3734329154441187e-05, + "loss": 0.8603, + "step": 2850 + }, + { + "epoch": 1.072768192048012, + "grad_norm": 10.029014587402344, + "learning_rate": 1.3707655374766607e-05, + "loss": 0.7507, + "step": 2860 + }, + { + "epoch": 1.0765191297824457, + "grad_norm": 42.02882766723633, + "learning_rate": 1.3680981595092026e-05, + "loss": 0.8731, + "step": 2870 + }, + { + "epoch": 1.0802700675168793, + "grad_norm": 11.340489387512207, + "learning_rate": 1.3654307815417447e-05, + "loss": 0.8736, + "step": 2880 + }, + { + "epoch": 1.0840210052513128, + "grad_norm": 10.736079216003418, + "learning_rate": 1.3627634035742865e-05, + "loss": 0.7387, + "step": 2890 + }, + { + "epoch": 1.0877719429857464, + "grad_norm": 12.158968925476074, + "learning_rate": 1.3600960256068285e-05, + "loss": 0.8563, + "step": 2900 + }, + { + "epoch": 1.09152288072018, + "grad_norm": 4.968686103820801, + "learning_rate": 1.3574286476393706e-05, + "loss": 0.8865, + "step": 2910 + }, + { + "epoch": 1.0952738184546136, + "grad_norm": 9.05169677734375, + "learning_rate": 1.3547612696719126e-05, + "loss": 0.9706, + "step": 2920 + }, + { + "epoch": 1.0990247561890472, + "grad_norm": 8.993448257446289, + "learning_rate": 1.3520938917044546e-05, + "loss": 0.7936, + "step": 2930 + }, + { + "epoch": 1.102775693923481, + "grad_norm": 9.852548599243164, + "learning_rate": 1.3494265137369967e-05, + "loss": 0.9188, + "step": 2940 + }, + { + "epoch": 1.1065266316579145, + "grad_norm": 8.509963035583496, + "learning_rate": 1.3467591357695386e-05, + "loss": 0.9182, + "step": 2950 + }, + { + "epoch": 1.1102775693923481, + "grad_norm": 9.74703311920166, + "learning_rate": 1.3440917578020806e-05, + "loss": 0.8979, + "step": 2960 + }, + { + "epoch": 1.1140285071267817, + "grad_norm": 11.76938247680664, + "learning_rate": 1.3414243798346227e-05, + "loss": 0.8431, + "step": 2970 + }, + { + "epoch": 1.1177794448612153, + "grad_norm": 8.194916725158691, + "learning_rate": 1.3387570018671647e-05, + "loss": 0.8794, + "step": 2980 + }, + { + "epoch": 1.1215303825956489, + "grad_norm": 5.259307861328125, + "learning_rate": 1.3360896238997068e-05, + "loss": 0.8688, + "step": 2990 + }, + { + "epoch": 1.1252813203300824, + "grad_norm": 8.892224311828613, + "learning_rate": 1.3334222459322488e-05, + "loss": 0.8924, + "step": 3000 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 10.505491256713867, + "learning_rate": 1.3307548679647908e-05, + "loss": 0.8347, + "step": 3010 + }, + { + "epoch": 1.1327831957989498, + "grad_norm": 4.74807071685791, + "learning_rate": 1.3280874899973329e-05, + "loss": 0.7728, + "step": 3020 + }, + { + "epoch": 1.1365341335333834, + "grad_norm": 12.980900764465332, + "learning_rate": 1.3254201120298747e-05, + "loss": 0.7915, + "step": 3030 + }, + { + "epoch": 1.140285071267817, + "grad_norm": 12.24691104888916, + "learning_rate": 1.3227527340624166e-05, + "loss": 0.8422, + "step": 3040 + }, + { + "epoch": 1.1440360090022506, + "grad_norm": 6.215153217315674, + "learning_rate": 1.3200853560949588e-05, + "loss": 0.8067, + "step": 3050 + }, + { + "epoch": 1.1477869467366841, + "grad_norm": 15.73306941986084, + "learning_rate": 1.3174179781275007e-05, + "loss": 0.8135, + "step": 3060 + }, + { + "epoch": 1.1515378844711177, + "grad_norm": 12.068921089172363, + "learning_rate": 1.3147506001600427e-05, + "loss": 0.7305, + "step": 3070 + }, + { + "epoch": 1.1552888222055513, + "grad_norm": 6.1044464111328125, + "learning_rate": 1.3120832221925848e-05, + "loss": 0.9578, + "step": 3080 + }, + { + "epoch": 1.159039759939985, + "grad_norm": 10.416324615478516, + "learning_rate": 1.3094158442251268e-05, + "loss": 0.8362, + "step": 3090 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 13.548623085021973, + "learning_rate": 1.3067484662576687e-05, + "loss": 0.7862, + "step": 3100 + }, + { + "epoch": 1.1665416354088523, + "grad_norm": 9.015273094177246, + "learning_rate": 1.3040810882902109e-05, + "loss": 0.8432, + "step": 3110 + }, + { + "epoch": 1.1702925731432858, + "grad_norm": 4.893497467041016, + "learning_rate": 1.3014137103227528e-05, + "loss": 0.719, + "step": 3120 + }, + { + "epoch": 1.1740435108777194, + "grad_norm": 12.783862113952637, + "learning_rate": 1.298746332355295e-05, + "loss": 0.9088, + "step": 3130 + }, + { + "epoch": 1.177794448612153, + "grad_norm": 10.826465606689453, + "learning_rate": 1.296078954387837e-05, + "loss": 0.8758, + "step": 3140 + }, + { + "epoch": 1.1815453863465866, + "grad_norm": 9.32836627960205, + "learning_rate": 1.2934115764203789e-05, + "loss": 0.7643, + "step": 3150 + }, + { + "epoch": 1.1852963240810204, + "grad_norm": 9.504363059997559, + "learning_rate": 1.290744198452921e-05, + "loss": 0.8174, + "step": 3160 + }, + { + "epoch": 1.189047261815454, + "grad_norm": 12.839066505432129, + "learning_rate": 1.2880768204854628e-05, + "loss": 0.7992, + "step": 3170 + }, + { + "epoch": 1.1927981995498875, + "grad_norm": 9.912968635559082, + "learning_rate": 1.2854094425180048e-05, + "loss": 0.7793, + "step": 3180 + }, + { + "epoch": 1.196549137284321, + "grad_norm": 9.632975578308105, + "learning_rate": 1.2827420645505469e-05, + "loss": 0.8062, + "step": 3190 + }, + { + "epoch": 1.2003000750187547, + "grad_norm": 15.091144561767578, + "learning_rate": 1.2800746865830889e-05, + "loss": 0.8319, + "step": 3200 + }, + { + "epoch": 1.2040510127531883, + "grad_norm": 9.834930419921875, + "learning_rate": 1.2774073086156308e-05, + "loss": 0.7946, + "step": 3210 + }, + { + "epoch": 1.2078019504876218, + "grad_norm": 9.097467422485352, + "learning_rate": 1.274739930648173e-05, + "loss": 0.7423, + "step": 3220 + }, + { + "epoch": 1.2115528882220554, + "grad_norm": 7.097741603851318, + "learning_rate": 1.272072552680715e-05, + "loss": 0.7668, + "step": 3230 + }, + { + "epoch": 1.215303825956489, + "grad_norm": 16.66200828552246, + "learning_rate": 1.269405174713257e-05, + "loss": 0.8292, + "step": 3240 + }, + { + "epoch": 1.2190547636909228, + "grad_norm": 4.819615840911865, + "learning_rate": 1.266737796745799e-05, + "loss": 0.7641, + "step": 3250 + }, + { + "epoch": 1.2228057014253564, + "grad_norm": 12.379060745239258, + "learning_rate": 1.264070418778341e-05, + "loss": 0.7749, + "step": 3260 + }, + { + "epoch": 1.22655663915979, + "grad_norm": 10.446650505065918, + "learning_rate": 1.2614030408108831e-05, + "loss": 0.8306, + "step": 3270 + }, + { + "epoch": 1.2303075768942235, + "grad_norm": 13.330952644348145, + "learning_rate": 1.258735662843425e-05, + "loss": 0.7924, + "step": 3280 + }, + { + "epoch": 1.2340585146286571, + "grad_norm": 11.163646697998047, + "learning_rate": 1.256068284875967e-05, + "loss": 0.8505, + "step": 3290 + }, + { + "epoch": 1.2378094523630907, + "grad_norm": 10.235424995422363, + "learning_rate": 1.2534009069085092e-05, + "loss": 0.7111, + "step": 3300 + }, + { + "epoch": 1.2415603900975243, + "grad_norm": 9.529205322265625, + "learning_rate": 1.2507335289410511e-05, + "loss": 0.9559, + "step": 3310 + }, + { + "epoch": 1.245311327831958, + "grad_norm": 9.511346817016602, + "learning_rate": 1.2480661509735929e-05, + "loss": 1.0107, + "step": 3320 + }, + { + "epoch": 1.2490622655663917, + "grad_norm": 5.115582466125488, + "learning_rate": 1.245398773006135e-05, + "loss": 0.8128, + "step": 3330 + }, + { + "epoch": 1.2528132033008252, + "grad_norm": 10.270365715026855, + "learning_rate": 1.242731395038677e-05, + "loss": 0.7537, + "step": 3340 + }, + { + "epoch": 1.2565641410352588, + "grad_norm": 15.309682846069336, + "learning_rate": 1.240064017071219e-05, + "loss": 0.776, + "step": 3350 + }, + { + "epoch": 1.2603150787696924, + "grad_norm": 6.9617414474487305, + "learning_rate": 1.2373966391037611e-05, + "loss": 0.7818, + "step": 3360 + }, + { + "epoch": 1.264066016504126, + "grad_norm": 14.111533164978027, + "learning_rate": 1.234729261136303e-05, + "loss": 0.8766, + "step": 3370 + }, + { + "epoch": 1.2678169542385596, + "grad_norm": 15.513258934020996, + "learning_rate": 1.2320618831688452e-05, + "loss": 0.8124, + "step": 3380 + }, + { + "epoch": 1.2715678919729934, + "grad_norm": 10.617011070251465, + "learning_rate": 1.2293945052013872e-05, + "loss": 0.7367, + "step": 3390 + }, + { + "epoch": 1.275318829707427, + "grad_norm": 10.756956100463867, + "learning_rate": 1.2267271272339291e-05, + "loss": 0.9371, + "step": 3400 + }, + { + "epoch": 1.2790697674418605, + "grad_norm": 20.27239990234375, + "learning_rate": 1.2240597492664712e-05, + "loss": 0.7812, + "step": 3410 + }, + { + "epoch": 1.282820705176294, + "grad_norm": 13.26762580871582, + "learning_rate": 1.2213923712990132e-05, + "loss": 0.9214, + "step": 3420 + }, + { + "epoch": 1.2865716429107277, + "grad_norm": 6.740780830383301, + "learning_rate": 1.2187249933315552e-05, + "loss": 0.7254, + "step": 3430 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 8.460843086242676, + "learning_rate": 1.2160576153640973e-05, + "loss": 0.8793, + "step": 3440 + }, + { + "epoch": 1.2940735183795948, + "grad_norm": 8.37424373626709, + "learning_rate": 1.2133902373966393e-05, + "loss": 0.7178, + "step": 3450 + }, + { + "epoch": 1.2978244561140286, + "grad_norm": 9.57453441619873, + "learning_rate": 1.210722859429181e-05, + "loss": 0.9584, + "step": 3460 + }, + { + "epoch": 1.301575393848462, + "grad_norm": 15.27446460723877, + "learning_rate": 1.2080554814617232e-05, + "loss": 0.7314, + "step": 3470 + }, + { + "epoch": 1.3053263315828958, + "grad_norm": 16.266162872314453, + "learning_rate": 1.2053881034942651e-05, + "loss": 0.8651, + "step": 3480 + }, + { + "epoch": 1.3090772693173294, + "grad_norm": 9.161102294921875, + "learning_rate": 1.2027207255268071e-05, + "loss": 0.7086, + "step": 3490 + }, + { + "epoch": 1.312828207051763, + "grad_norm": 12.645145416259766, + "learning_rate": 1.2000533475593492e-05, + "loss": 0.874, + "step": 3500 + }, + { + "epoch": 1.3165791447861965, + "grad_norm": 9.018929481506348, + "learning_rate": 1.1973859695918912e-05, + "loss": 0.7457, + "step": 3510 + }, + { + "epoch": 1.32033008252063, + "grad_norm": 10.96903133392334, + "learning_rate": 1.1947185916244333e-05, + "loss": 0.865, + "step": 3520 + }, + { + "epoch": 1.3240810202550637, + "grad_norm": 15.08077621459961, + "learning_rate": 1.1920512136569753e-05, + "loss": 0.8127, + "step": 3530 + }, + { + "epoch": 1.3278319579894973, + "grad_norm": 6.171741962432861, + "learning_rate": 1.1893838356895173e-05, + "loss": 0.7038, + "step": 3540 + }, + { + "epoch": 1.331582895723931, + "grad_norm": 12.167604446411133, + "learning_rate": 1.1867164577220594e-05, + "loss": 0.7373, + "step": 3550 + }, + { + "epoch": 1.3353338334583646, + "grad_norm": 12.859063148498535, + "learning_rate": 1.1840490797546013e-05, + "loss": 0.8292, + "step": 3560 + }, + { + "epoch": 1.3390847711927982, + "grad_norm": 9.17769718170166, + "learning_rate": 1.1813817017871435e-05, + "loss": 0.8117, + "step": 3570 + }, + { + "epoch": 1.3428357089272318, + "grad_norm": 7.380620002746582, + "learning_rate": 1.1787143238196854e-05, + "loss": 0.7943, + "step": 3580 + }, + { + "epoch": 1.3465866466616654, + "grad_norm": 19.143110275268555, + "learning_rate": 1.1760469458522274e-05, + "loss": 0.7798, + "step": 3590 + }, + { + "epoch": 1.350337584396099, + "grad_norm": 14.915560722351074, + "learning_rate": 1.1733795678847695e-05, + "loss": 0.8568, + "step": 3600 + }, + { + "epoch": 1.3540885221305325, + "grad_norm": 16.487377166748047, + "learning_rate": 1.1707121899173113e-05, + "loss": 0.8586, + "step": 3610 + }, + { + "epoch": 1.3578394598649663, + "grad_norm": 9.255929946899414, + "learning_rate": 1.1680448119498533e-05, + "loss": 0.7957, + "step": 3620 + }, + { + "epoch": 1.3615903975994, + "grad_norm": 12.38227653503418, + "learning_rate": 1.1653774339823954e-05, + "loss": 0.7957, + "step": 3630 + }, + { + "epoch": 1.3653413353338335, + "grad_norm": 10.949649810791016, + "learning_rate": 1.1627100560149374e-05, + "loss": 0.6871, + "step": 3640 + }, + { + "epoch": 1.369092273068267, + "grad_norm": 7.265697956085205, + "learning_rate": 1.1600426780474793e-05, + "loss": 0.7341, + "step": 3650 + }, + { + "epoch": 1.3728432108027007, + "grad_norm": 12.582711219787598, + "learning_rate": 1.1573753000800215e-05, + "loss": 0.8242, + "step": 3660 + }, + { + "epoch": 1.3765941485371342, + "grad_norm": 12.345062255859375, + "learning_rate": 1.1547079221125634e-05, + "loss": 0.8768, + "step": 3670 + }, + { + "epoch": 1.3803450862715678, + "grad_norm": 8.697713851928711, + "learning_rate": 1.1520405441451054e-05, + "loss": 0.855, + "step": 3680 + }, + { + "epoch": 1.3840960240060016, + "grad_norm": 9.254758834838867, + "learning_rate": 1.1493731661776475e-05, + "loss": 0.909, + "step": 3690 + }, + { + "epoch": 1.387846961740435, + "grad_norm": 9.739770889282227, + "learning_rate": 1.1467057882101895e-05, + "loss": 0.8582, + "step": 3700 + }, + { + "epoch": 1.3915978994748688, + "grad_norm": 12.004996299743652, + "learning_rate": 1.1440384102427316e-05, + "loss": 0.7344, + "step": 3710 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 13.092066764831543, + "learning_rate": 1.1413710322752736e-05, + "loss": 0.8916, + "step": 3720 + }, + { + "epoch": 1.399099774943736, + "grad_norm": 12.259298324584961, + "learning_rate": 1.1387036543078155e-05, + "loss": 0.9096, + "step": 3730 + }, + { + "epoch": 1.4028507126781695, + "grad_norm": 8.312166213989258, + "learning_rate": 1.1360362763403577e-05, + "loss": 0.8647, + "step": 3740 + }, + { + "epoch": 1.406601650412603, + "grad_norm": 8.59150218963623, + "learning_rate": 1.1333688983728995e-05, + "loss": 0.9202, + "step": 3750 + }, + { + "epoch": 1.4103525881470367, + "grad_norm": 8.444820404052734, + "learning_rate": 1.1307015204054414e-05, + "loss": 0.7343, + "step": 3760 + }, + { + "epoch": 1.4141035258814703, + "grad_norm": 12.232796669006348, + "learning_rate": 1.1280341424379836e-05, + "loss": 0.7329, + "step": 3770 + }, + { + "epoch": 1.417854463615904, + "grad_norm": 9.038057327270508, + "learning_rate": 1.1253667644705255e-05, + "loss": 0.8751, + "step": 3780 + }, + { + "epoch": 1.4216054013503376, + "grad_norm": 5.729677200317383, + "learning_rate": 1.1226993865030675e-05, + "loss": 0.7319, + "step": 3790 + }, + { + "epoch": 1.4253563390847712, + "grad_norm": 7.777376651763916, + "learning_rate": 1.1200320085356096e-05, + "loss": 0.802, + "step": 3800 + }, + { + "epoch": 1.4291072768192048, + "grad_norm": 13.165481567382812, + "learning_rate": 1.1173646305681516e-05, + "loss": 0.7195, + "step": 3810 + }, + { + "epoch": 1.4328582145536384, + "grad_norm": 10.966960906982422, + "learning_rate": 1.1146972526006935e-05, + "loss": 0.8234, + "step": 3820 + }, + { + "epoch": 1.436609152288072, + "grad_norm": 8.237056732177734, + "learning_rate": 1.1120298746332357e-05, + "loss": 0.7832, + "step": 3830 + }, + { + "epoch": 1.4403600900225055, + "grad_norm": 10.419988632202148, + "learning_rate": 1.1093624966657776e-05, + "loss": 0.8292, + "step": 3840 + }, + { + "epoch": 1.4441110277569393, + "grad_norm": 14.655726432800293, + "learning_rate": 1.1066951186983198e-05, + "loss": 0.8523, + "step": 3850 + }, + { + "epoch": 1.447861965491373, + "grad_norm": 10.38304328918457, + "learning_rate": 1.1040277407308617e-05, + "loss": 0.856, + "step": 3860 + }, + { + "epoch": 1.4516129032258065, + "grad_norm": 13.249422073364258, + "learning_rate": 1.1013603627634037e-05, + "loss": 0.8403, + "step": 3870 + }, + { + "epoch": 1.45536384096024, + "grad_norm": 9.854536056518555, + "learning_rate": 1.0986929847959458e-05, + "loss": 0.794, + "step": 3880 + }, + { + "epoch": 1.4591147786946737, + "grad_norm": 11.48951530456543, + "learning_rate": 1.0960256068284876e-05, + "loss": 0.7569, + "step": 3890 + }, + { + "epoch": 1.4628657164291072, + "grad_norm": 8.955044746398926, + "learning_rate": 1.0933582288610296e-05, + "loss": 0.8064, + "step": 3900 + }, + { + "epoch": 1.4666166541635408, + "grad_norm": 16.088743209838867, + "learning_rate": 1.0906908508935717e-05, + "loss": 0.8518, + "step": 3910 + }, + { + "epoch": 1.4703675918979746, + "grad_norm": 9.207806587219238, + "learning_rate": 1.0880234729261137e-05, + "loss": 0.875, + "step": 3920 + }, + { + "epoch": 1.474118529632408, + "grad_norm": 18.738187789916992, + "learning_rate": 1.0853560949586556e-05, + "loss": 0.8164, + "step": 3930 + }, + { + "epoch": 1.4778694673668418, + "grad_norm": 10.138594627380371, + "learning_rate": 1.0826887169911977e-05, + "loss": 0.791, + "step": 3940 + }, + { + "epoch": 1.4816204051012754, + "grad_norm": 9.635621070861816, + "learning_rate": 1.0800213390237397e-05, + "loss": 0.7878, + "step": 3950 + }, + { + "epoch": 1.485371342835709, + "grad_norm": 9.569879531860352, + "learning_rate": 1.0773539610562818e-05, + "loss": 0.8404, + "step": 3960 + }, + { + "epoch": 1.4891222805701425, + "grad_norm": 9.855542182922363, + "learning_rate": 1.0746865830888238e-05, + "loss": 0.8726, + "step": 3970 + }, + { + "epoch": 1.492873218304576, + "grad_norm": 16.710786819458008, + "learning_rate": 1.0720192051213658e-05, + "loss": 0.8706, + "step": 3980 + }, + { + "epoch": 1.49662415603901, + "grad_norm": 13.603216171264648, + "learning_rate": 1.0693518271539079e-05, + "loss": 0.8437, + "step": 3990 + }, + { + "epoch": 1.5003750937734432, + "grad_norm": 11.3872652053833, + "learning_rate": 1.0666844491864499e-05, + "loss": 0.6512, + "step": 4000 + }, + { + "epoch": 1.504126031507877, + "grad_norm": 10.2975492477417, + "learning_rate": 1.0640170712189918e-05, + "loss": 0.8774, + "step": 4010 + }, + { + "epoch": 1.5078769692423106, + "grad_norm": 7.741751194000244, + "learning_rate": 1.061349693251534e-05, + "loss": 0.7528, + "step": 4020 + }, + { + "epoch": 1.5116279069767442, + "grad_norm": 9.902315139770508, + "learning_rate": 1.0586823152840759e-05, + "loss": 0.7995, + "step": 4030 + }, + { + "epoch": 1.5153788447111778, + "grad_norm": 11.541082382202148, + "learning_rate": 1.0560149373166177e-05, + "loss": 0.7694, + "step": 4040 + }, + { + "epoch": 1.5191297824456114, + "grad_norm": 8.56485366821289, + "learning_rate": 1.0533475593491598e-05, + "loss": 0.8002, + "step": 4050 + }, + { + "epoch": 1.5228807201800452, + "grad_norm": 8.866626739501953, + "learning_rate": 1.0506801813817018e-05, + "loss": 0.792, + "step": 4060 + }, + { + "epoch": 1.5266316579144785, + "grad_norm": 10.332854270935059, + "learning_rate": 1.0480128034142438e-05, + "loss": 0.7378, + "step": 4070 + }, + { + "epoch": 1.5303825956489123, + "grad_norm": 8.805913925170898, + "learning_rate": 1.0453454254467859e-05, + "loss": 0.8287, + "step": 4080 + }, + { + "epoch": 1.5341335333833457, + "grad_norm": 10.885342597961426, + "learning_rate": 1.0426780474793278e-05, + "loss": 0.8454, + "step": 4090 + }, + { + "epoch": 1.5378844711177795, + "grad_norm": 11.047041893005371, + "learning_rate": 1.04001066951187e-05, + "loss": 0.8955, + "step": 4100 + }, + { + "epoch": 1.541635408852213, + "grad_norm": 12.287060737609863, + "learning_rate": 1.037343291544412e-05, + "loss": 0.9106, + "step": 4110 + }, + { + "epoch": 1.5453863465866466, + "grad_norm": 7.6913628578186035, + "learning_rate": 1.0346759135769539e-05, + "loss": 0.8287, + "step": 4120 + }, + { + "epoch": 1.5491372843210802, + "grad_norm": 12.864625930786133, + "learning_rate": 1.032008535609496e-05, + "loss": 0.8176, + "step": 4130 + }, + { + "epoch": 1.5528882220555138, + "grad_norm": 17.12616539001465, + "learning_rate": 1.029341157642038e-05, + "loss": 0.8964, + "step": 4140 + }, + { + "epoch": 1.5566391597899476, + "grad_norm": 9.076611518859863, + "learning_rate": 1.02667377967458e-05, + "loss": 0.8222, + "step": 4150 + }, + { + "epoch": 1.560390097524381, + "grad_norm": 9.327693939208984, + "learning_rate": 1.0240064017071221e-05, + "loss": 0.9358, + "step": 4160 + }, + { + "epoch": 1.5641410352588148, + "grad_norm": 7.653916358947754, + "learning_rate": 1.021339023739664e-05, + "loss": 0.7721, + "step": 4170 + }, + { + "epoch": 1.5678919729932483, + "grad_norm": 10.110307693481445, + "learning_rate": 1.0186716457722058e-05, + "loss": 0.9346, + "step": 4180 + }, + { + "epoch": 1.571642910727682, + "grad_norm": 11.298696517944336, + "learning_rate": 1.016004267804748e-05, + "loss": 0.8456, + "step": 4190 + }, + { + "epoch": 1.5753938484621155, + "grad_norm": 13.459417343139648, + "learning_rate": 1.01333688983729e-05, + "loss": 0.7815, + "step": 4200 + }, + { + "epoch": 1.579144786196549, + "grad_norm": 16.08547592163086, + "learning_rate": 1.0106695118698319e-05, + "loss": 0.7656, + "step": 4210 + }, + { + "epoch": 1.5828957239309829, + "grad_norm": 8.995433807373047, + "learning_rate": 1.008002133902374e-05, + "loss": 0.8248, + "step": 4220 + }, + { + "epoch": 1.5866466616654162, + "grad_norm": 10.426254272460938, + "learning_rate": 1.005334755934916e-05, + "loss": 0.7957, + "step": 4230 + }, + { + "epoch": 1.59039759939985, + "grad_norm": 8.310003280639648, + "learning_rate": 1.0026673779674581e-05, + "loss": 0.8313, + "step": 4240 + }, + { + "epoch": 1.5941485371342836, + "grad_norm": 14.415204048156738, + "learning_rate": 1e-05, + "loss": 0.7711, + "step": 4250 + }, + { + "epoch": 1.5978994748687172, + "grad_norm": 8.948083877563477, + "learning_rate": 9.97332622032542e-06, + "loss": 0.7868, + "step": 4260 + }, + { + "epoch": 1.6016504126031508, + "grad_norm": 16.681766510009766, + "learning_rate": 9.946652440650842e-06, + "loss": 0.8633, + "step": 4270 + }, + { + "epoch": 1.6054013503375844, + "grad_norm": 11.883402824401855, + "learning_rate": 9.919978660976261e-06, + "loss": 0.8195, + "step": 4280 + }, + { + "epoch": 1.6091522880720182, + "grad_norm": 11.386548042297363, + "learning_rate": 9.893304881301681e-06, + "loss": 0.7621, + "step": 4290 + }, + { + "epoch": 1.6129032258064515, + "grad_norm": 13.255663871765137, + "learning_rate": 9.8666311016271e-06, + "loss": 1.0233, + "step": 4300 + }, + { + "epoch": 1.6166541635408853, + "grad_norm": 10.955714225769043, + "learning_rate": 9.839957321952522e-06, + "loss": 0.9456, + "step": 4310 + }, + { + "epoch": 1.6204051012753187, + "grad_norm": 7.624833583831787, + "learning_rate": 9.813283542277942e-06, + "loss": 0.9029, + "step": 4320 + }, + { + "epoch": 1.6241560390097525, + "grad_norm": 8.860147476196289, + "learning_rate": 9.786609762603361e-06, + "loss": 0.835, + "step": 4330 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 13.29971981048584, + "learning_rate": 9.759935982928782e-06, + "loss": 0.848, + "step": 4340 + }, + { + "epoch": 1.6316579144786196, + "grad_norm": 10.151264190673828, + "learning_rate": 9.733262203254202e-06, + "loss": 0.7443, + "step": 4350 + }, + { + "epoch": 1.6354088522130532, + "grad_norm": 14.21789264678955, + "learning_rate": 9.706588423579622e-06, + "loss": 0.908, + "step": 4360 + }, + { + "epoch": 1.6391597899474868, + "grad_norm": 7.94905424118042, + "learning_rate": 9.679914643905041e-06, + "loss": 0.6919, + "step": 4370 + }, + { + "epoch": 1.6429107276819206, + "grad_norm": 8.60908031463623, + "learning_rate": 9.653240864230463e-06, + "loss": 0.7309, + "step": 4380 + }, + { + "epoch": 1.646661665416354, + "grad_norm": 15.03842544555664, + "learning_rate": 9.626567084555882e-06, + "loss": 0.9343, + "step": 4390 + }, + { + "epoch": 1.6504126031507877, + "grad_norm": 11.684754371643066, + "learning_rate": 9.599893304881302e-06, + "loss": 0.7532, + "step": 4400 + }, + { + "epoch": 1.6541635408852213, + "grad_norm": 6.24261999130249, + "learning_rate": 9.573219525206723e-06, + "loss": 0.8449, + "step": 4410 + }, + { + "epoch": 1.657914478619655, + "grad_norm": 5.580635070800781, + "learning_rate": 9.546545745532143e-06, + "loss": 0.644, + "step": 4420 + }, + { + "epoch": 1.6616654163540885, + "grad_norm": 13.382287979125977, + "learning_rate": 9.519871965857564e-06, + "loss": 0.8713, + "step": 4430 + }, + { + "epoch": 1.665416354088522, + "grad_norm": 11.218451499938965, + "learning_rate": 9.493198186182982e-06, + "loss": 0.6552, + "step": 4440 + }, + { + "epoch": 1.6691672918229559, + "grad_norm": 16.548782348632812, + "learning_rate": 9.466524406508403e-06, + "loss": 0.8024, + "step": 4450 + }, + { + "epoch": 1.6729182295573892, + "grad_norm": 17.210647583007812, + "learning_rate": 9.439850626833823e-06, + "loss": 0.7543, + "step": 4460 + }, + { + "epoch": 1.676669167291823, + "grad_norm": 13.630977630615234, + "learning_rate": 9.413176847159243e-06, + "loss": 0.8754, + "step": 4470 + }, + { + "epoch": 1.6804201050262566, + "grad_norm": 13.967558860778809, + "learning_rate": 9.386503067484664e-06, + "loss": 0.7699, + "step": 4480 + }, + { + "epoch": 1.6841710427606902, + "grad_norm": 11.707578659057617, + "learning_rate": 9.359829287810083e-06, + "loss": 0.8324, + "step": 4490 + }, + { + "epoch": 1.6879219804951238, + "grad_norm": 9.124420166015625, + "learning_rate": 9.333155508135505e-06, + "loss": 0.794, + "step": 4500 + }, + { + "epoch": 1.6916729182295573, + "grad_norm": 10.910788536071777, + "learning_rate": 9.306481728460923e-06, + "loss": 0.8241, + "step": 4510 + }, + { + "epoch": 1.6954238559639911, + "grad_norm": 13.6180419921875, + "learning_rate": 9.279807948786344e-06, + "loss": 0.8882, + "step": 4520 + }, + { + "epoch": 1.6991747936984245, + "grad_norm": 7.055276393890381, + "learning_rate": 9.253134169111764e-06, + "loss": 0.9011, + "step": 4530 + }, + { + "epoch": 1.7029257314328583, + "grad_norm": 14.100971221923828, + "learning_rate": 9.226460389437183e-06, + "loss": 0.8026, + "step": 4540 + }, + { + "epoch": 1.7066766691672917, + "grad_norm": 6.9184441566467285, + "learning_rate": 9.199786609762605e-06, + "loss": 0.6888, + "step": 4550 + }, + { + "epoch": 1.7104276069017255, + "grad_norm": 9.915225982666016, + "learning_rate": 9.173112830088024e-06, + "loss": 0.8456, + "step": 4560 + }, + { + "epoch": 1.714178544636159, + "grad_norm": 11.1101655960083, + "learning_rate": 9.146439050413445e-06, + "loss": 0.8979, + "step": 4570 + }, + { + "epoch": 1.7179294823705926, + "grad_norm": 11.128944396972656, + "learning_rate": 9.119765270738863e-06, + "loss": 0.8386, + "step": 4580 + }, + { + "epoch": 1.7216804201050264, + "grad_norm": 8.845916748046875, + "learning_rate": 9.093091491064285e-06, + "loss": 0.8375, + "step": 4590 + }, + { + "epoch": 1.7254313578394598, + "grad_norm": 12.3989839553833, + "learning_rate": 9.066417711389704e-06, + "loss": 0.7884, + "step": 4600 + }, + { + "epoch": 1.7291822955738936, + "grad_norm": 8.899964332580566, + "learning_rate": 9.039743931715126e-06, + "loss": 0.8391, + "step": 4610 + }, + { + "epoch": 1.732933233308327, + "grad_norm": 11.830737113952637, + "learning_rate": 9.013070152040545e-06, + "loss": 0.836, + "step": 4620 + }, + { + "epoch": 1.7366841710427607, + "grad_norm": 14.875555038452148, + "learning_rate": 8.986396372365965e-06, + "loss": 0.8148, + "step": 4630 + }, + { + "epoch": 1.7404351087771943, + "grad_norm": 8.44090461730957, + "learning_rate": 8.959722592691386e-06, + "loss": 0.7033, + "step": 4640 + }, + { + "epoch": 1.744186046511628, + "grad_norm": 7.954046726226807, + "learning_rate": 8.933048813016804e-06, + "loss": 0.8364, + "step": 4650 + }, + { + "epoch": 1.7479369842460615, + "grad_norm": 14.886021614074707, + "learning_rate": 8.906375033342225e-06, + "loss": 0.7641, + "step": 4660 + }, + { + "epoch": 1.751687921980495, + "grad_norm": 15.42341136932373, + "learning_rate": 8.879701253667645e-06, + "loss": 0.7152, + "step": 4670 + }, + { + "epoch": 1.7554388597149289, + "grad_norm": 18.62801742553711, + "learning_rate": 8.853027473993066e-06, + "loss": 0.9192, + "step": 4680 + }, + { + "epoch": 1.7591897974493622, + "grad_norm": 9.787707328796387, + "learning_rate": 8.826353694318486e-06, + "loss": 1.0479, + "step": 4690 + }, + { + "epoch": 1.762940735183796, + "grad_norm": 10.803950309753418, + "learning_rate": 8.799679914643906e-06, + "loss": 0.7217, + "step": 4700 + }, + { + "epoch": 1.7666916729182296, + "grad_norm": 5.519962787628174, + "learning_rate": 8.773006134969327e-06, + "loss": 0.8044, + "step": 4710 + }, + { + "epoch": 1.7704426106526632, + "grad_norm": 10.77694320678711, + "learning_rate": 8.746332355294745e-06, + "loss": 0.7759, + "step": 4720 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 8.671502113342285, + "learning_rate": 8.719658575620166e-06, + "loss": 0.834, + "step": 4730 + }, + { + "epoch": 1.7779444861215303, + "grad_norm": 10.25809097290039, + "learning_rate": 8.692984795945586e-06, + "loss": 0.7088, + "step": 4740 + }, + { + "epoch": 1.7816954238559641, + "grad_norm": 11.049978256225586, + "learning_rate": 8.666311016271007e-06, + "loss": 0.8777, + "step": 4750 + }, + { + "epoch": 1.7854463615903975, + "grad_norm": 6.090721130371094, + "learning_rate": 8.639637236596427e-06, + "loss": 0.8311, + "step": 4760 + }, + { + "epoch": 1.7891972993248313, + "grad_norm": 7.393324375152588, + "learning_rate": 8.612963456921846e-06, + "loss": 0.7623, + "step": 4770 + }, + { + "epoch": 1.7929482370592649, + "grad_norm": 9.985932350158691, + "learning_rate": 8.586289677247268e-06, + "loss": 0.7164, + "step": 4780 + }, + { + "epoch": 1.7966991747936985, + "grad_norm": 23.15224266052246, + "learning_rate": 8.559615897572687e-06, + "loss": 0.8139, + "step": 4790 + }, + { + "epoch": 1.800450112528132, + "grad_norm": 15.539804458618164, + "learning_rate": 8.532942117898107e-06, + "loss": 0.9038, + "step": 4800 + }, + { + "epoch": 1.8042010502625656, + "grad_norm": 20.424936294555664, + "learning_rate": 8.506268338223526e-06, + "loss": 0.9228, + "step": 4810 + }, + { + "epoch": 1.8079519879969994, + "grad_norm": 12.960927963256836, + "learning_rate": 8.479594558548948e-06, + "loss": 0.8129, + "step": 4820 + }, + { + "epoch": 1.8117029257314328, + "grad_norm": 12.578907012939453, + "learning_rate": 8.452920778874367e-06, + "loss": 0.7919, + "step": 4830 + }, + { + "epoch": 1.8154538634658666, + "grad_norm": 9.88344955444336, + "learning_rate": 8.426246999199787e-06, + "loss": 0.8888, + "step": 4840 + }, + { + "epoch": 1.8192048012003, + "grad_norm": 9.531432151794434, + "learning_rate": 8.399573219525208e-06, + "loss": 0.8074, + "step": 4850 + }, + { + "epoch": 1.8229557389347337, + "grad_norm": 10.701923370361328, + "learning_rate": 8.372899439850628e-06, + "loss": 0.8598, + "step": 4860 + }, + { + "epoch": 1.8267066766691673, + "grad_norm": 10.894915580749512, + "learning_rate": 8.346225660176047e-06, + "loss": 0.6588, + "step": 4870 + }, + { + "epoch": 1.8304576144036009, + "grad_norm": 9.2036714553833, + "learning_rate": 8.319551880501467e-06, + "loss": 0.8323, + "step": 4880 + }, + { + "epoch": 1.8342085521380345, + "grad_norm": 8.6634521484375, + "learning_rate": 8.292878100826888e-06, + "loss": 0.7526, + "step": 4890 + }, + { + "epoch": 1.837959489872468, + "grad_norm": 14.781025886535645, + "learning_rate": 8.266204321152308e-06, + "loss": 0.6999, + "step": 4900 + }, + { + "epoch": 1.8417104276069018, + "grad_norm": 12.273209571838379, + "learning_rate": 8.239530541477728e-06, + "loss": 0.6734, + "step": 4910 + }, + { + "epoch": 1.8454613653413352, + "grad_norm": 11.974825859069824, + "learning_rate": 8.212856761803149e-06, + "loss": 0.7195, + "step": 4920 + }, + { + "epoch": 1.849212303075769, + "grad_norm": 12.195642471313477, + "learning_rate": 8.186182982128569e-06, + "loss": 0.8301, + "step": 4930 + }, + { + "epoch": 1.8529632408102026, + "grad_norm": 6.2414751052856445, + "learning_rate": 8.159509202453988e-06, + "loss": 0.8528, + "step": 4940 + }, + { + "epoch": 1.8567141785446362, + "grad_norm": 9.026991844177246, + "learning_rate": 8.132835422779408e-06, + "loss": 0.8165, + "step": 4950 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 13.745824813842773, + "learning_rate": 8.106161643104829e-06, + "loss": 0.9866, + "step": 4960 + }, + { + "epoch": 1.8642160540135033, + "grad_norm": 8.861783027648926, + "learning_rate": 8.079487863430249e-06, + "loss": 0.9738, + "step": 4970 + }, + { + "epoch": 1.8679669917479371, + "grad_norm": 7.437354564666748, + "learning_rate": 8.052814083755668e-06, + "loss": 0.7223, + "step": 4980 + }, + { + "epoch": 1.8717179294823705, + "grad_norm": 14.148890495300293, + "learning_rate": 8.02614030408109e-06, + "loss": 0.8356, + "step": 4990 + }, + { + "epoch": 1.8754688672168043, + "grad_norm": 13.688013076782227, + "learning_rate": 7.99946652440651e-06, + "loss": 0.8949, + "step": 5000 + }, + { + "epoch": 1.8792198049512379, + "grad_norm": 16.709125518798828, + "learning_rate": 7.972792744731929e-06, + "loss": 0.8775, + "step": 5010 + }, + { + "epoch": 1.8829707426856714, + "grad_norm": 9.73661994934082, + "learning_rate": 7.946118965057348e-06, + "loss": 0.832, + "step": 5020 + }, + { + "epoch": 1.886721680420105, + "grad_norm": 10.575983047485352, + "learning_rate": 7.91944518538277e-06, + "loss": 0.7976, + "step": 5030 + }, + { + "epoch": 1.8904726181545386, + "grad_norm": 9.284303665161133, + "learning_rate": 7.89277140570819e-06, + "loss": 0.9656, + "step": 5040 + }, + { + "epoch": 1.8942235558889724, + "grad_norm": 6.543034553527832, + "learning_rate": 7.866097626033609e-06, + "loss": 0.7157, + "step": 5050 + }, + { + "epoch": 1.8979744936234058, + "grad_norm": 5.064873218536377, + "learning_rate": 7.83942384635903e-06, + "loss": 0.7334, + "step": 5060 + }, + { + "epoch": 1.9017254313578396, + "grad_norm": 16.654563903808594, + "learning_rate": 7.81275006668445e-06, + "loss": 0.7677, + "step": 5070 + }, + { + "epoch": 1.905476369092273, + "grad_norm": 20.614212036132812, + "learning_rate": 7.78607628700987e-06, + "loss": 0.8525, + "step": 5080 + }, + { + "epoch": 1.9092273068267067, + "grad_norm": 13.709310531616211, + "learning_rate": 7.75940250733529e-06, + "loss": 0.7237, + "step": 5090 + }, + { + "epoch": 1.9129782445611403, + "grad_norm": 17.662317276000977, + "learning_rate": 7.73272872766071e-06, + "loss": 0.8529, + "step": 5100 + }, + { + "epoch": 1.9167291822955739, + "grad_norm": 9.610177040100098, + "learning_rate": 7.70605494798613e-06, + "loss": 0.9447, + "step": 5110 + }, + { + "epoch": 1.9204801200300075, + "grad_norm": 19.19601821899414, + "learning_rate": 7.67938116831155e-06, + "loss": 0.8738, + "step": 5120 + }, + { + "epoch": 1.924231057764441, + "grad_norm": 8.228813171386719, + "learning_rate": 7.652707388636971e-06, + "loss": 0.8096, + "step": 5130 + }, + { + "epoch": 1.9279819954988748, + "grad_norm": 14.475564956665039, + "learning_rate": 7.626033608962391e-06, + "loss": 0.7235, + "step": 5140 + }, + { + "epoch": 1.9317329332333082, + "grad_norm": 17.313648223876953, + "learning_rate": 7.599359829287811e-06, + "loss": 0.7778, + "step": 5150 + }, + { + "epoch": 1.935483870967742, + "grad_norm": 6.775811672210693, + "learning_rate": 7.572686049613231e-06, + "loss": 0.7627, + "step": 5160 + }, + { + "epoch": 1.9392348087021756, + "grad_norm": 11.815681457519531, + "learning_rate": 7.54601226993865e-06, + "loss": 0.8978, + "step": 5170 + }, + { + "epoch": 1.9429857464366092, + "grad_norm": 13.653975486755371, + "learning_rate": 7.519338490264071e-06, + "loss": 0.7364, + "step": 5180 + }, + { + "epoch": 1.9467366841710427, + "grad_norm": 9.049905776977539, + "learning_rate": 7.492664710589491e-06, + "loss": 0.8631, + "step": 5190 + }, + { + "epoch": 1.9504876219054763, + "grad_norm": 14.149343490600586, + "learning_rate": 7.465990930914912e-06, + "loss": 0.8279, + "step": 5200 + }, + { + "epoch": 1.9542385596399101, + "grad_norm": 15.612215995788574, + "learning_rate": 7.439317151240331e-06, + "loss": 0.9058, + "step": 5210 + }, + { + "epoch": 1.9579894973743435, + "grad_norm": 11.682372093200684, + "learning_rate": 7.412643371565752e-06, + "loss": 0.8859, + "step": 5220 + }, + { + "epoch": 1.9617404351087773, + "grad_norm": 9.87074089050293, + "learning_rate": 7.385969591891171e-06, + "loss": 0.8733, + "step": 5230 + }, + { + "epoch": 1.9654913728432108, + "grad_norm": 9.963356971740723, + "learning_rate": 7.359295812216591e-06, + "loss": 0.7134, + "step": 5240 + }, + { + "epoch": 1.9692423105776444, + "grad_norm": 4.6800537109375, + "learning_rate": 7.3326220325420115e-06, + "loss": 0.7594, + "step": 5250 + }, + { + "epoch": 1.972993248312078, + "grad_norm": 13.148963928222656, + "learning_rate": 7.305948252867432e-06, + "loss": 0.947, + "step": 5260 + }, + { + "epoch": 1.9767441860465116, + "grad_norm": 10.073929786682129, + "learning_rate": 7.279274473192852e-06, + "loss": 0.8769, + "step": 5270 + }, + { + "epoch": 1.9804951237809454, + "grad_norm": 11.67326831817627, + "learning_rate": 7.252600693518272e-06, + "loss": 0.7545, + "step": 5280 + }, + { + "epoch": 1.9842460615153787, + "grad_norm": 7.498824119567871, + "learning_rate": 7.2259269138436925e-06, + "loss": 0.7997, + "step": 5290 + }, + { + "epoch": 1.9879969992498125, + "grad_norm": 9.357927322387695, + "learning_rate": 7.199253134169112e-06, + "loss": 0.8754, + "step": 5300 + }, + { + "epoch": 1.991747936984246, + "grad_norm": 12.50817584991455, + "learning_rate": 7.172579354494532e-06, + "loss": 0.79, + "step": 5310 + }, + { + "epoch": 1.9954988747186797, + "grad_norm": 14.613991737365723, + "learning_rate": 7.145905574819952e-06, + "loss": 0.8005, + "step": 5320 + }, + { + "epoch": 1.9992498124531133, + "grad_norm": 9.007129669189453, + "learning_rate": 7.119231795145373e-06, + "loss": 0.9009, + "step": 5330 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.580168776371308, + "eval_f1_macro": 0.5788189436128865, + "eval_f1_weighted": 0.5800618837244829, + "eval_loss": 0.9064968228340149, + "eval_precision_macro": 0.5789782500874713, + "eval_precision_weighted": 0.5804785651892536, + "eval_recall_macro": 0.5792145494510413, + "eval_recall_weighted": 0.580168776371308, + "eval_runtime": 4.8637, + "eval_samples_per_second": 487.284, + "eval_steps_per_second": 61.065, + "step": 5332 + }, + { + "epoch": 2.003000750187547, + "grad_norm": 10.368429183959961, + "learning_rate": 7.092558015470793e-06, + "loss": 0.6842, + "step": 5340 + }, + { + "epoch": 2.0067516879219807, + "grad_norm": 10.329928398132324, + "learning_rate": 7.065884235796214e-06, + "loss": 0.7709, + "step": 5350 + }, + { + "epoch": 2.010502625656414, + "grad_norm": 13.128575325012207, + "learning_rate": 7.039210456121633e-06, + "loss": 0.7054, + "step": 5360 + }, + { + "epoch": 2.014253563390848, + "grad_norm": 10.884894371032715, + "learning_rate": 7.012536676447053e-06, + "loss": 0.8103, + "step": 5370 + }, + { + "epoch": 2.018004501125281, + "grad_norm": 17.327537536621094, + "learning_rate": 6.985862896772473e-06, + "loss": 0.6551, + "step": 5380 + }, + { + "epoch": 2.021755438859715, + "grad_norm": 9.725515365600586, + "learning_rate": 6.959189117097893e-06, + "loss": 0.6534, + "step": 5390 + }, + { + "epoch": 2.0255063765941483, + "grad_norm": 9.302525520324707, + "learning_rate": 6.932515337423313e-06, + "loss": 0.6741, + "step": 5400 + }, + { + "epoch": 2.029257314328582, + "grad_norm": 12.362338066101074, + "learning_rate": 6.905841557748734e-06, + "loss": 0.7026, + "step": 5410 + }, + { + "epoch": 2.033008252063016, + "grad_norm": 7.654306411743164, + "learning_rate": 6.879167778074154e-06, + "loss": 0.5962, + "step": 5420 + }, + { + "epoch": 2.0367591897974493, + "grad_norm": 14.547067642211914, + "learning_rate": 6.852493998399574e-06, + "loss": 0.5578, + "step": 5430 + }, + { + "epoch": 2.040510127531883, + "grad_norm": 12.792427062988281, + "learning_rate": 6.8258202187249935e-06, + "loss": 0.7636, + "step": 5440 + }, + { + "epoch": 2.0442610652663165, + "grad_norm": 8.322968482971191, + "learning_rate": 6.799146439050414e-06, + "loss": 0.5881, + "step": 5450 + }, + { + "epoch": 2.0480120030007503, + "grad_norm": 14.064526557922363, + "learning_rate": 6.772472659375834e-06, + "loss": 0.6907, + "step": 5460 + }, + { + "epoch": 2.0517629407351836, + "grad_norm": 11.318249702453613, + "learning_rate": 6.745798879701254e-06, + "loss": 0.6179, + "step": 5470 + }, + { + "epoch": 2.0555138784696174, + "grad_norm": 7.615289688110352, + "learning_rate": 6.7191251000266745e-06, + "loss": 0.5912, + "step": 5480 + }, + { + "epoch": 2.059264816204051, + "grad_norm": 20.249950408935547, + "learning_rate": 6.692451320352095e-06, + "loss": 0.7777, + "step": 5490 + }, + { + "epoch": 2.0630157539384846, + "grad_norm": 13.289349555969238, + "learning_rate": 6.665777540677515e-06, + "loss": 0.6271, + "step": 5500 + }, + { + "epoch": 2.0667666916729184, + "grad_norm": 14.625772476196289, + "learning_rate": 6.639103761002935e-06, + "loss": 0.7248, + "step": 5510 + }, + { + "epoch": 2.0705176294073517, + "grad_norm": 14.428004264831543, + "learning_rate": 6.612429981328355e-06, + "loss": 0.6791, + "step": 5520 + }, + { + "epoch": 2.0742685671417855, + "grad_norm": 21.052837371826172, + "learning_rate": 6.585756201653774e-06, + "loss": 0.6244, + "step": 5530 + }, + { + "epoch": 2.078019504876219, + "grad_norm": 17.523300170898438, + "learning_rate": 6.559082421979195e-06, + "loss": 0.6498, + "step": 5540 + }, + { + "epoch": 2.0817704426106527, + "grad_norm": 9.524145126342773, + "learning_rate": 6.532408642304615e-06, + "loss": 0.7792, + "step": 5550 + }, + { + "epoch": 2.085521380345086, + "grad_norm": 14.92676830291748, + "learning_rate": 6.505734862630036e-06, + "loss": 0.5748, + "step": 5560 + }, + { + "epoch": 2.08927231807952, + "grad_norm": 18.87467384338379, + "learning_rate": 6.479061082955455e-06, + "loss": 0.7199, + "step": 5570 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 10.356287002563477, + "learning_rate": 6.452387303280876e-06, + "loss": 0.7016, + "step": 5580 + }, + { + "epoch": 2.096774193548387, + "grad_norm": 11.189599990844727, + "learning_rate": 6.425713523606295e-06, + "loss": 0.6511, + "step": 5590 + }, + { + "epoch": 2.100525131282821, + "grad_norm": 12.267254829406738, + "learning_rate": 6.399039743931715e-06, + "loss": 0.6421, + "step": 5600 + }, + { + "epoch": 2.104276069017254, + "grad_norm": 19.524673461914062, + "learning_rate": 6.3723659642571354e-06, + "loss": 0.6963, + "step": 5610 + }, + { + "epoch": 2.108027006751688, + "grad_norm": 13.466742515563965, + "learning_rate": 6.345692184582556e-06, + "loss": 0.6727, + "step": 5620 + }, + { + "epoch": 2.1117779444861213, + "grad_norm": 20.707855224609375, + "learning_rate": 6.319018404907976e-06, + "loss": 0.6695, + "step": 5630 + }, + { + "epoch": 2.115528882220555, + "grad_norm": 15.425350189208984, + "learning_rate": 6.292344625233396e-06, + "loss": 0.673, + "step": 5640 + }, + { + "epoch": 2.119279819954989, + "grad_norm": 5.349853038787842, + "learning_rate": 6.2656708455588164e-06, + "loss": 0.6275, + "step": 5650 + }, + { + "epoch": 2.1230307576894223, + "grad_norm": 13.552290916442871, + "learning_rate": 6.238997065884236e-06, + "loss": 0.6945, + "step": 5660 + }, + { + "epoch": 2.126781695423856, + "grad_norm": 17.840105056762695, + "learning_rate": 6.212323286209656e-06, + "loss": 0.8054, + "step": 5670 + }, + { + "epoch": 2.1305326331582894, + "grad_norm": 21.012237548828125, + "learning_rate": 6.185649506535076e-06, + "loss": 0.7306, + "step": 5680 + }, + { + "epoch": 2.1342835708927232, + "grad_norm": 13.1303129196167, + "learning_rate": 6.158975726860497e-06, + "loss": 0.871, + "step": 5690 + }, + { + "epoch": 2.1380345086271566, + "grad_norm": 11.506791114807129, + "learning_rate": 6.132301947185917e-06, + "loss": 0.6722, + "step": 5700 + }, + { + "epoch": 2.1417854463615904, + "grad_norm": 9.709290504455566, + "learning_rate": 6.1056281675113375e-06, + "loss": 0.6695, + "step": 5710 + }, + { + "epoch": 2.145536384096024, + "grad_norm": 8.551689147949219, + "learning_rate": 6.078954387836757e-06, + "loss": 0.7001, + "step": 5720 + }, + { + "epoch": 2.1492873218304576, + "grad_norm": 12.69763469696045, + "learning_rate": 6.052280608162177e-06, + "loss": 0.6778, + "step": 5730 + }, + { + "epoch": 2.1530382595648914, + "grad_norm": 10.49093246459961, + "learning_rate": 6.025606828487597e-06, + "loss": 0.6671, + "step": 5740 + }, + { + "epoch": 2.1567891972993247, + "grad_norm": 7.214636325836182, + "learning_rate": 5.998933048813017e-06, + "loss": 0.616, + "step": 5750 + }, + { + "epoch": 2.1605401350337585, + "grad_norm": 8.58086109161377, + "learning_rate": 5.972259269138437e-06, + "loss": 0.6024, + "step": 5760 + }, + { + "epoch": 2.164291072768192, + "grad_norm": 7.856104373931885, + "learning_rate": 5.945585489463858e-06, + "loss": 0.6202, + "step": 5770 + }, + { + "epoch": 2.1680420105026257, + "grad_norm": 6.472407341003418, + "learning_rate": 5.918911709789278e-06, + "loss": 0.6141, + "step": 5780 + }, + { + "epoch": 2.1717929482370595, + "grad_norm": 6.612668991088867, + "learning_rate": 5.892237930114698e-06, + "loss": 0.7841, + "step": 5790 + }, + { + "epoch": 2.175543885971493, + "grad_norm": 9.869592666625977, + "learning_rate": 5.865564150440118e-06, + "loss": 0.5949, + "step": 5800 + }, + { + "epoch": 2.1792948237059266, + "grad_norm": 12.85415267944336, + "learning_rate": 5.838890370765538e-06, + "loss": 0.663, + "step": 5810 + }, + { + "epoch": 2.18304576144036, + "grad_norm": 22.380807876586914, + "learning_rate": 5.8122165910909575e-06, + "loss": 0.6532, + "step": 5820 + }, + { + "epoch": 2.186796699174794, + "grad_norm": 23.866607666015625, + "learning_rate": 5.785542811416378e-06, + "loss": 0.6704, + "step": 5830 + }, + { + "epoch": 2.190547636909227, + "grad_norm": 12.608299255371094, + "learning_rate": 5.7588690317417985e-06, + "loss": 0.6231, + "step": 5840 + }, + { + "epoch": 2.194298574643661, + "grad_norm": 27.60419464111328, + "learning_rate": 5.732195252067219e-06, + "loss": 0.6369, + "step": 5850 + }, + { + "epoch": 2.1980495123780943, + "grad_norm": 10.39966869354248, + "learning_rate": 5.7055214723926385e-06, + "loss": 0.5964, + "step": 5860 + }, + { + "epoch": 2.201800450112528, + "grad_norm": 23.611059188842773, + "learning_rate": 5.678847692718059e-06, + "loss": 0.7365, + "step": 5870 + }, + { + "epoch": 2.205551387846962, + "grad_norm": 10.59642505645752, + "learning_rate": 5.652173913043479e-06, + "loss": 0.6305, + "step": 5880 + }, + { + "epoch": 2.2093023255813953, + "grad_norm": 15.549806594848633, + "learning_rate": 5.625500133368898e-06, + "loss": 0.624, + "step": 5890 + }, + { + "epoch": 2.213053263315829, + "grad_norm": 17.546363830566406, + "learning_rate": 5.598826353694319e-06, + "loss": 0.7103, + "step": 5900 + }, + { + "epoch": 2.2168042010502624, + "grad_norm": 19.833606719970703, + "learning_rate": 5.572152574019739e-06, + "loss": 0.4821, + "step": 5910 + }, + { + "epoch": 2.2205551387846962, + "grad_norm": 18.05365562438965, + "learning_rate": 5.54547879434516e-06, + "loss": 0.6953, + "step": 5920 + }, + { + "epoch": 2.2243060765191296, + "grad_norm": 3.1533432006835938, + "learning_rate": 5.518805014670579e-06, + "loss": 0.6899, + "step": 5930 + }, + { + "epoch": 2.2280570142535634, + "grad_norm": 21.84452247619629, + "learning_rate": 5.492131234996e-06, + "loss": 0.8146, + "step": 5940 + }, + { + "epoch": 2.231807951987997, + "grad_norm": 20.791135787963867, + "learning_rate": 5.465457455321419e-06, + "loss": 0.4915, + "step": 5950 + }, + { + "epoch": 2.2355588897224306, + "grad_norm": 16.44775390625, + "learning_rate": 5.438783675646839e-06, + "loss": 0.5946, + "step": 5960 + }, + { + "epoch": 2.2393098274568644, + "grad_norm": 8.386981964111328, + "learning_rate": 5.412109895972259e-06, + "loss": 0.7348, + "step": 5970 + }, + { + "epoch": 2.2430607651912977, + "grad_norm": 26.47071075439453, + "learning_rate": 5.38543611629768e-06, + "loss": 0.6261, + "step": 5980 + }, + { + "epoch": 2.2468117029257315, + "grad_norm": 11.219141960144043, + "learning_rate": 5.3587623366231e-06, + "loss": 0.5324, + "step": 5990 + }, + { + "epoch": 2.250562640660165, + "grad_norm": 15.969422340393066, + "learning_rate": 5.33208855694852e-06, + "loss": 0.7459, + "step": 6000 + }, + { + "epoch": 2.2543135783945987, + "grad_norm": 20.990497589111328, + "learning_rate": 5.30541477727394e-06, + "loss": 0.5593, + "step": 6010 + }, + { + "epoch": 2.258064516129032, + "grad_norm": 10.82603645324707, + "learning_rate": 5.27874099759936e-06, + "loss": 0.6698, + "step": 6020 + }, + { + "epoch": 2.261815453863466, + "grad_norm": 19.865243911743164, + "learning_rate": 5.25206721792478e-06, + "loss": 0.732, + "step": 6030 + }, + { + "epoch": 2.2655663915978996, + "grad_norm": 25.37660026550293, + "learning_rate": 5.2253934382502e-06, + "loss": 0.5585, + "step": 6040 + }, + { + "epoch": 2.269317329332333, + "grad_norm": 19.796749114990234, + "learning_rate": 5.1987196585756205e-06, + "loss": 0.7108, + "step": 6050 + }, + { + "epoch": 2.273068267066767, + "grad_norm": 12.207030296325684, + "learning_rate": 5.172045878901041e-06, + "loss": 0.6683, + "step": 6060 + }, + { + "epoch": 2.2768192048012, + "grad_norm": 20.979265213012695, + "learning_rate": 5.1453720992264615e-06, + "loss": 0.6962, + "step": 6070 + }, + { + "epoch": 2.280570142535634, + "grad_norm": 13.058587074279785, + "learning_rate": 5.118698319551881e-06, + "loss": 0.6119, + "step": 6080 + }, + { + "epoch": 2.2843210802700673, + "grad_norm": 7.18276309967041, + "learning_rate": 5.092024539877301e-06, + "loss": 0.606, + "step": 6090 + }, + { + "epoch": 2.288072018004501, + "grad_norm": 21.568151473999023, + "learning_rate": 5.065350760202721e-06, + "loss": 0.6909, + "step": 6100 + }, + { + "epoch": 2.291822955738935, + "grad_norm": 28.49129867553711, + "learning_rate": 5.038676980528141e-06, + "loss": 0.6764, + "step": 6110 + }, + { + "epoch": 2.2955738934733683, + "grad_norm": 12.39367389678955, + "learning_rate": 5.012003200853561e-06, + "loss": 0.753, + "step": 6120 + }, + { + "epoch": 2.299324831207802, + "grad_norm": 17.55943489074707, + "learning_rate": 4.985329421178982e-06, + "loss": 0.7103, + "step": 6130 + }, + { + "epoch": 2.3030757689422354, + "grad_norm": 16.813745498657227, + "learning_rate": 4.958655641504402e-06, + "loss": 0.645, + "step": 6140 + }, + { + "epoch": 2.3068267066766692, + "grad_norm": 20.711591720581055, + "learning_rate": 4.931981861829822e-06, + "loss": 0.6337, + "step": 6150 + }, + { + "epoch": 2.3105776444111026, + "grad_norm": 5.449891567230225, + "learning_rate": 4.905308082155241e-06, + "loss": 0.6224, + "step": 6160 + }, + { + "epoch": 2.3143285821455364, + "grad_norm": 15.508672714233398, + "learning_rate": 4.878634302480662e-06, + "loss": 0.6718, + "step": 6170 + }, + { + "epoch": 2.31807951987997, + "grad_norm": 12.16860294342041, + "learning_rate": 4.8519605228060815e-06, + "loss": 0.6044, + "step": 6180 + }, + { + "epoch": 2.3218304576144035, + "grad_norm": 16.671234130859375, + "learning_rate": 4.825286743131502e-06, + "loss": 0.7397, + "step": 6190 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 27.95615577697754, + "learning_rate": 4.798612963456922e-06, + "loss": 0.6451, + "step": 6200 + }, + { + "epoch": 2.3293323330832707, + "grad_norm": 23.62805938720703, + "learning_rate": 4.771939183782343e-06, + "loss": 0.7978, + "step": 6210 + }, + { + "epoch": 2.3330832708177045, + "grad_norm": 17.226280212402344, + "learning_rate": 4.7452654041077625e-06, + "loss": 0.6442, + "step": 6220 + }, + { + "epoch": 2.336834208552138, + "grad_norm": 22.371273040771484, + "learning_rate": 4.718591624433183e-06, + "loss": 0.6885, + "step": 6230 + }, + { + "epoch": 2.3405851462865717, + "grad_norm": 12.560019493103027, + "learning_rate": 4.6919178447586026e-06, + "loss": 0.6033, + "step": 6240 + }, + { + "epoch": 2.3443360840210055, + "grad_norm": 14.103109359741211, + "learning_rate": 4.665244065084023e-06, + "loss": 0.6683, + "step": 6250 + }, + { + "epoch": 2.348087021755439, + "grad_norm": 11.051913261413574, + "learning_rate": 4.638570285409443e-06, + "loss": 0.7092, + "step": 6260 + }, + { + "epoch": 2.3518379594898726, + "grad_norm": 15.613760948181152, + "learning_rate": 4.611896505734863e-06, + "loss": 0.6974, + "step": 6270 + }, + { + "epoch": 2.355588897224306, + "grad_norm": 19.85428237915039, + "learning_rate": 4.5852227260602836e-06, + "loss": 0.6637, + "step": 6280 + }, + { + "epoch": 2.35933983495874, + "grad_norm": 15.703207015991211, + "learning_rate": 4.558548946385703e-06, + "loss": 0.6508, + "step": 6290 + }, + { + "epoch": 2.363090772693173, + "grad_norm": 11.342123985290527, + "learning_rate": 4.531875166711124e-06, + "loss": 0.7348, + "step": 6300 + }, + { + "epoch": 2.366841710427607, + "grad_norm": 11.049941062927246, + "learning_rate": 4.505201387036543e-06, + "loss": 0.6421, + "step": 6310 + }, + { + "epoch": 2.3705926481620407, + "grad_norm": 24.488731384277344, + "learning_rate": 4.478527607361964e-06, + "loss": 0.7123, + "step": 6320 + }, + { + "epoch": 2.374343585896474, + "grad_norm": 14.967778205871582, + "learning_rate": 4.451853827687383e-06, + "loss": 0.7142, + "step": 6330 + }, + { + "epoch": 2.378094523630908, + "grad_norm": 9.328021049499512, + "learning_rate": 4.425180048012804e-06, + "loss": 0.6251, + "step": 6340 + }, + { + "epoch": 2.3818454613653413, + "grad_norm": 17.42303466796875, + "learning_rate": 4.398506268338224e-06, + "loss": 0.6355, + "step": 6350 + }, + { + "epoch": 2.385596399099775, + "grad_norm": 15.201652526855469, + "learning_rate": 4.371832488663644e-06, + "loss": 0.7441, + "step": 6360 + }, + { + "epoch": 2.3893473368342084, + "grad_norm": 23.0561466217041, + "learning_rate": 4.345158708989064e-06, + "loss": 0.6641, + "step": 6370 + }, + { + "epoch": 2.393098274568642, + "grad_norm": 14.52270221710205, + "learning_rate": 4.318484929314484e-06, + "loss": 0.7073, + "step": 6380 + }, + { + "epoch": 2.396849212303076, + "grad_norm": 13.747902870178223, + "learning_rate": 4.291811149639904e-06, + "loss": 0.81, + "step": 6390 + }, + { + "epoch": 2.4006001500375094, + "grad_norm": 14.231673240661621, + "learning_rate": 4.265137369965324e-06, + "loss": 0.6939, + "step": 6400 + }, + { + "epoch": 2.404351087771943, + "grad_norm": 7.63701057434082, + "learning_rate": 4.2384635902907445e-06, + "loss": 0.6873, + "step": 6410 + }, + { + "epoch": 2.4081020255063765, + "grad_norm": 20.752126693725586, + "learning_rate": 4.211789810616165e-06, + "loss": 0.571, + "step": 6420 + }, + { + "epoch": 2.4118529632408103, + "grad_norm": 13.460418701171875, + "learning_rate": 4.185116030941585e-06, + "loss": 0.6506, + "step": 6430 + }, + { + "epoch": 2.4156039009752437, + "grad_norm": 8.838345527648926, + "learning_rate": 4.158442251267005e-06, + "loss": 0.5745, + "step": 6440 + }, + { + "epoch": 2.4193548387096775, + "grad_norm": 10.570659637451172, + "learning_rate": 4.131768471592425e-06, + "loss": 0.6607, + "step": 6450 + }, + { + "epoch": 2.423105776444111, + "grad_norm": 12.49052619934082, + "learning_rate": 4.105094691917845e-06, + "loss": 0.5026, + "step": 6460 + }, + { + "epoch": 2.4268567141785446, + "grad_norm": 9.46437931060791, + "learning_rate": 4.078420912243265e-06, + "loss": 0.6005, + "step": 6470 + }, + { + "epoch": 2.430607651912978, + "grad_norm": 29.9566593170166, + "learning_rate": 4.051747132568685e-06, + "loss": 0.6292, + "step": 6480 + }, + { + "epoch": 2.434358589647412, + "grad_norm": 12.318580627441406, + "learning_rate": 4.025073352894106e-06, + "loss": 0.7056, + "step": 6490 + }, + { + "epoch": 2.4381095273818456, + "grad_norm": 20.635848999023438, + "learning_rate": 3.998399573219526e-06, + "loss": 0.6663, + "step": 6500 + }, + { + "epoch": 2.441860465116279, + "grad_norm": 13.231310844421387, + "learning_rate": 3.971725793544946e-06, + "loss": 0.7405, + "step": 6510 + }, + { + "epoch": 2.4456114028507128, + "grad_norm": 16.560197830200195, + "learning_rate": 3.945052013870365e-06, + "loss": 0.6678, + "step": 6520 + }, + { + "epoch": 2.449362340585146, + "grad_norm": 21.45167350769043, + "learning_rate": 3.918378234195786e-06, + "loss": 0.6032, + "step": 6530 + }, + { + "epoch": 2.45311327831958, + "grad_norm": 37.360843658447266, + "learning_rate": 3.891704454521205e-06, + "loss": 0.8438, + "step": 6540 + }, + { + "epoch": 2.4568642160540133, + "grad_norm": 30.98585319519043, + "learning_rate": 3.865030674846626e-06, + "loss": 0.6035, + "step": 6550 + }, + { + "epoch": 2.460615153788447, + "grad_norm": 13.408466339111328, + "learning_rate": 3.838356895172046e-06, + "loss": 0.5181, + "step": 6560 + }, + { + "epoch": 2.464366091522881, + "grad_norm": 16.84627914428711, + "learning_rate": 3.8116831154974664e-06, + "loss": 0.6353, + "step": 6570 + }, + { + "epoch": 2.4681170292573142, + "grad_norm": 19.02153968811035, + "learning_rate": 3.785009335822886e-06, + "loss": 0.6052, + "step": 6580 + }, + { + "epoch": 2.471867966991748, + "grad_norm": 13.263850212097168, + "learning_rate": 3.7583355561483065e-06, + "loss": 0.8126, + "step": 6590 + }, + { + "epoch": 2.4756189047261814, + "grad_norm": 22.753215789794922, + "learning_rate": 3.731661776473727e-06, + "loss": 0.6449, + "step": 6600 + }, + { + "epoch": 2.479369842460615, + "grad_norm": 13.979212760925293, + "learning_rate": 3.704987996799147e-06, + "loss": 0.7421, + "step": 6610 + }, + { + "epoch": 2.4831207801950486, + "grad_norm": 23.614389419555664, + "learning_rate": 3.6783142171245666e-06, + "loss": 0.8168, + "step": 6620 + }, + { + "epoch": 2.4868717179294824, + "grad_norm": 7.810019493103027, + "learning_rate": 3.651640437449987e-06, + "loss": 0.6301, + "step": 6630 + }, + { + "epoch": 2.490622655663916, + "grad_norm": 17.90605926513672, + "learning_rate": 3.624966657775407e-06, + "loss": 0.6369, + "step": 6640 + }, + { + "epoch": 2.4943735933983495, + "grad_norm": 10.375251770019531, + "learning_rate": 3.598292878100827e-06, + "loss": 0.6254, + "step": 6650 + }, + { + "epoch": 2.4981245311327833, + "grad_norm": 15.813028335571289, + "learning_rate": 3.571619098426247e-06, + "loss": 0.7866, + "step": 6660 + }, + { + "epoch": 2.5018754688672167, + "grad_norm": 8.438957214355469, + "learning_rate": 3.5449453187516676e-06, + "loss": 0.7288, + "step": 6670 + }, + { + "epoch": 2.5056264066016505, + "grad_norm": 23.076040267944336, + "learning_rate": 3.5182715390770877e-06, + "loss": 0.6743, + "step": 6680 + }, + { + "epoch": 2.509377344336084, + "grad_norm": 14.966166496276855, + "learning_rate": 3.4915977594025073e-06, + "loss": 0.6408, + "step": 6690 + }, + { + "epoch": 2.5131282820705176, + "grad_norm": 19.553081512451172, + "learning_rate": 3.4649239797279277e-06, + "loss": 0.613, + "step": 6700 + }, + { + "epoch": 2.5168792198049514, + "grad_norm": 12.050764083862305, + "learning_rate": 3.4382502000533478e-06, + "loss": 0.6547, + "step": 6710 + }, + { + "epoch": 2.520630157539385, + "grad_norm": 14.52085018157959, + "learning_rate": 3.411576420378768e-06, + "loss": 0.7239, + "step": 6720 + }, + { + "epoch": 2.5243810952738186, + "grad_norm": 20.222137451171875, + "learning_rate": 3.384902640704188e-06, + "loss": 0.7656, + "step": 6730 + }, + { + "epoch": 2.528132033008252, + "grad_norm": 14.729280471801758, + "learning_rate": 3.3582288610296083e-06, + "loss": 0.6013, + "step": 6740 + }, + { + "epoch": 2.5318829707426858, + "grad_norm": 21.984832763671875, + "learning_rate": 3.3315550813550284e-06, + "loss": 0.6453, + "step": 6750 + }, + { + "epoch": 2.535633908477119, + "grad_norm": 19.643138885498047, + "learning_rate": 3.304881301680448e-06, + "loss": 0.8221, + "step": 6760 + }, + { + "epoch": 2.539384846211553, + "grad_norm": 17.281740188598633, + "learning_rate": 3.2782075220058684e-06, + "loss": 0.6348, + "step": 6770 + }, + { + "epoch": 2.5431357839459867, + "grad_norm": 17.821035385131836, + "learning_rate": 3.251533742331289e-06, + "loss": 0.5855, + "step": 6780 + }, + { + "epoch": 2.54688672168042, + "grad_norm": 14.015131950378418, + "learning_rate": 3.224859962656709e-06, + "loss": 0.5544, + "step": 6790 + }, + { + "epoch": 2.550637659414854, + "grad_norm": 10.391494750976562, + "learning_rate": 3.1981861829821286e-06, + "loss": 0.588, + "step": 6800 + }, + { + "epoch": 2.5543885971492872, + "grad_norm": 14.990039825439453, + "learning_rate": 3.171512403307549e-06, + "loss": 0.5782, + "step": 6810 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 13.448775291442871, + "learning_rate": 3.144838623632969e-06, + "loss": 0.8525, + "step": 6820 + }, + { + "epoch": 2.5618904726181544, + "grad_norm": 13.461121559143066, + "learning_rate": 3.118164843958389e-06, + "loss": 0.6256, + "step": 6830 + }, + { + "epoch": 2.565641410352588, + "grad_norm": 13.295988082885742, + "learning_rate": 3.091491064283809e-06, + "loss": 0.7059, + "step": 6840 + }, + { + "epoch": 2.569392348087022, + "grad_norm": 14.871612548828125, + "learning_rate": 3.0648172846092296e-06, + "loss": 0.6338, + "step": 6850 + }, + { + "epoch": 2.5731432858214554, + "grad_norm": 30.46957778930664, + "learning_rate": 3.0381435049346496e-06, + "loss": 0.7072, + "step": 6860 + }, + { + "epoch": 2.5768942235558887, + "grad_norm": 20.661733627319336, + "learning_rate": 3.0114697252600693e-06, + "loss": 0.6634, + "step": 6870 + }, + { + "epoch": 2.5806451612903225, + "grad_norm": 10.35488224029541, + "learning_rate": 2.9847959455854897e-06, + "loss": 0.5709, + "step": 6880 + }, + { + "epoch": 2.5843960990247563, + "grad_norm": 22.383169174194336, + "learning_rate": 2.9581221659109098e-06, + "loss": 0.433, + "step": 6890 + }, + { + "epoch": 2.5881470367591897, + "grad_norm": 21.173015594482422, + "learning_rate": 2.93144838623633e-06, + "loss": 0.6109, + "step": 6900 + }, + { + "epoch": 2.5918979744936235, + "grad_norm": 25.366735458374023, + "learning_rate": 2.90477460656175e-06, + "loss": 0.8177, + "step": 6910 + }, + { + "epoch": 2.5956489122280573, + "grad_norm": 18.91875457763672, + "learning_rate": 2.8781008268871703e-06, + "loss": 0.7214, + "step": 6920 + }, + { + "epoch": 2.5993998499624906, + "grad_norm": 12.457830429077148, + "learning_rate": 2.8514270472125903e-06, + "loss": 0.553, + "step": 6930 + }, + { + "epoch": 2.603150787696924, + "grad_norm": 6.222160816192627, + "learning_rate": 2.82475326753801e-06, + "loss": 0.679, + "step": 6940 + }, + { + "epoch": 2.606901725431358, + "grad_norm": 8.99958324432373, + "learning_rate": 2.7980794878634304e-06, + "loss": 0.5929, + "step": 6950 + }, + { + "epoch": 2.6106526631657916, + "grad_norm": 11.063492774963379, + "learning_rate": 2.771405708188851e-06, + "loss": 0.5185, + "step": 6960 + }, + { + "epoch": 2.614403600900225, + "grad_norm": 10.320928573608398, + "learning_rate": 2.744731928514271e-06, + "loss": 0.5286, + "step": 6970 + }, + { + "epoch": 2.6181545386346587, + "grad_norm": 13.718670845031738, + "learning_rate": 2.7180581488396905e-06, + "loss": 0.6508, + "step": 6980 + }, + { + "epoch": 2.6219054763690925, + "grad_norm": 10.613819122314453, + "learning_rate": 2.691384369165111e-06, + "loss": 0.5805, + "step": 6990 + }, + { + "epoch": 2.625656414103526, + "grad_norm": 22.765199661254883, + "learning_rate": 2.664710589490531e-06, + "loss": 0.6691, + "step": 7000 + }, + { + "epoch": 2.6294073518379593, + "grad_norm": 12.34518051147461, + "learning_rate": 2.638036809815951e-06, + "loss": 0.6577, + "step": 7010 + }, + { + "epoch": 2.633158289572393, + "grad_norm": 15.861391067504883, + "learning_rate": 2.611363030141371e-06, + "loss": 0.5159, + "step": 7020 + }, + { + "epoch": 2.636909227306827, + "grad_norm": 7.271751880645752, + "learning_rate": 2.5846892504667916e-06, + "loss": 0.6844, + "step": 7030 + }, + { + "epoch": 2.64066016504126, + "grad_norm": 20.930856704711914, + "learning_rate": 2.5580154707922116e-06, + "loss": 0.7827, + "step": 7040 + }, + { + "epoch": 2.644411102775694, + "grad_norm": 28.042675018310547, + "learning_rate": 2.5313416911176312e-06, + "loss": 0.6383, + "step": 7050 + }, + { + "epoch": 2.6481620405101274, + "grad_norm": 25.815296173095703, + "learning_rate": 2.5046679114430517e-06, + "loss": 0.6866, + "step": 7060 + }, + { + "epoch": 2.651912978244561, + "grad_norm": 16.492206573486328, + "learning_rate": 2.4779941317684717e-06, + "loss": 0.5342, + "step": 7070 + }, + { + "epoch": 2.6556639159789945, + "grad_norm": 23.266910552978516, + "learning_rate": 2.4513203520938918e-06, + "loss": 0.5564, + "step": 7080 + }, + { + "epoch": 2.6594148537134283, + "grad_norm": 11.591928482055664, + "learning_rate": 2.424646572419312e-06, + "loss": 0.573, + "step": 7090 + }, + { + "epoch": 2.663165791447862, + "grad_norm": 14.71267032623291, + "learning_rate": 2.3979727927447323e-06, + "loss": 0.6456, + "step": 7100 + }, + { + "epoch": 2.6669167291822955, + "grad_norm": 7.238256454467773, + "learning_rate": 2.3712990130701523e-06, + "loss": 0.7731, + "step": 7110 + }, + { + "epoch": 2.6706676669167293, + "grad_norm": 38.71699523925781, + "learning_rate": 2.3446252333955723e-06, + "loss": 0.7189, + "step": 7120 + }, + { + "epoch": 2.6744186046511627, + "grad_norm": 24.029537200927734, + "learning_rate": 2.3179514537209924e-06, + "loss": 0.7435, + "step": 7130 + }, + { + "epoch": 2.6781695423855965, + "grad_norm": 17.704763412475586, + "learning_rate": 2.291277674046413e-06, + "loss": 0.838, + "step": 7140 + }, + { + "epoch": 2.68192048012003, + "grad_norm": 36.12045669555664, + "learning_rate": 2.2646038943718325e-06, + "loss": 0.7103, + "step": 7150 + }, + { + "epoch": 2.6856714178544636, + "grad_norm": 20.062591552734375, + "learning_rate": 2.237930114697253e-06, + "loss": 0.5748, + "step": 7160 + }, + { + "epoch": 2.6894223555888974, + "grad_norm": 9.567973136901855, + "learning_rate": 2.211256335022673e-06, + "loss": 0.7598, + "step": 7170 + }, + { + "epoch": 2.6931732933233308, + "grad_norm": 19.337631225585938, + "learning_rate": 2.184582555348093e-06, + "loss": 0.5945, + "step": 7180 + }, + { + "epoch": 2.6969242310577646, + "grad_norm": 11.189875602722168, + "learning_rate": 2.157908775673513e-06, + "loss": 0.77, + "step": 7190 + }, + { + "epoch": 2.700675168792198, + "grad_norm": 16.071062088012695, + "learning_rate": 2.131234995998933e-06, + "loss": 0.6758, + "step": 7200 + }, + { + "epoch": 2.7044261065266317, + "grad_norm": 11.37120532989502, + "learning_rate": 2.1045612163243535e-06, + "loss": 0.5912, + "step": 7210 + }, + { + "epoch": 2.708177044261065, + "grad_norm": 25.354324340820312, + "learning_rate": 2.0778874366497736e-06, + "loss": 0.6741, + "step": 7220 + }, + { + "epoch": 2.711927981995499, + "grad_norm": 11.246193885803223, + "learning_rate": 2.0512136569751936e-06, + "loss": 0.6073, + "step": 7230 + }, + { + "epoch": 2.7156789197299327, + "grad_norm": 9.01452350616455, + "learning_rate": 2.0245398773006137e-06, + "loss": 0.7363, + "step": 7240 + }, + { + "epoch": 2.719429857464366, + "grad_norm": 22.3641414642334, + "learning_rate": 1.9978660976260337e-06, + "loss": 0.6278, + "step": 7250 + }, + { + "epoch": 2.7231807951988, + "grad_norm": 14.206088066101074, + "learning_rate": 1.9711923179514537e-06, + "loss": 0.6676, + "step": 7260 + }, + { + "epoch": 2.726931732933233, + "grad_norm": 14.623751640319824, + "learning_rate": 1.9445185382768738e-06, + "loss": 0.6629, + "step": 7270 + }, + { + "epoch": 2.730682670667667, + "grad_norm": 15.682950019836426, + "learning_rate": 1.9178447586022942e-06, + "loss": 0.8008, + "step": 7280 + }, + { + "epoch": 2.7344336084021004, + "grad_norm": 16.56915855407715, + "learning_rate": 1.891170978927714e-06, + "loss": 0.8421, + "step": 7290 + }, + { + "epoch": 2.738184546136534, + "grad_norm": 20.514009475708008, + "learning_rate": 1.8644971992531343e-06, + "loss": 0.6755, + "step": 7300 + }, + { + "epoch": 2.741935483870968, + "grad_norm": 15.838664054870605, + "learning_rate": 1.8378234195785544e-06, + "loss": 0.6463, + "step": 7310 + }, + { + "epoch": 2.7456864216054013, + "grad_norm": 30.3530330657959, + "learning_rate": 1.8111496399039746e-06, + "loss": 0.6295, + "step": 7320 + }, + { + "epoch": 2.7494373593398347, + "grad_norm": 8.959320068359375, + "learning_rate": 1.7844758602293946e-06, + "loss": 0.6443, + "step": 7330 + }, + { + "epoch": 2.7531882970742685, + "grad_norm": 11.156110763549805, + "learning_rate": 1.757802080554815e-06, + "loss": 0.5971, + "step": 7340 + }, + { + "epoch": 2.7569392348087023, + "grad_norm": 21.744304656982422, + "learning_rate": 1.731128300880235e-06, + "loss": 0.5818, + "step": 7350 + }, + { + "epoch": 2.7606901725431356, + "grad_norm": 23.995845794677734, + "learning_rate": 1.7044545212056548e-06, + "loss": 0.6885, + "step": 7360 + }, + { + "epoch": 2.7644411102775694, + "grad_norm": 13.629135131835938, + "learning_rate": 1.677780741531075e-06, + "loss": 0.771, + "step": 7370 + }, + { + "epoch": 2.7681920480120032, + "grad_norm": 6.805270671844482, + "learning_rate": 1.651106961856495e-06, + "loss": 0.687, + "step": 7380 + }, + { + "epoch": 2.7719429857464366, + "grad_norm": 21.93046760559082, + "learning_rate": 1.6244331821819153e-06, + "loss": 0.5681, + "step": 7390 + }, + { + "epoch": 2.77569392348087, + "grad_norm": 22.271133422851562, + "learning_rate": 1.5977594025073353e-06, + "loss": 0.7504, + "step": 7400 + }, + { + "epoch": 2.7794448612153038, + "grad_norm": 19.411861419677734, + "learning_rate": 1.5710856228327556e-06, + "loss": 0.7141, + "step": 7410 + }, + { + "epoch": 2.7831957989497376, + "grad_norm": 21.990013122558594, + "learning_rate": 1.5444118431581756e-06, + "loss": 0.7941, + "step": 7420 + }, + { + "epoch": 2.786946736684171, + "grad_norm": 26.875274658203125, + "learning_rate": 1.5177380634835959e-06, + "loss": 0.7077, + "step": 7430 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 17.144861221313477, + "learning_rate": 1.491064283809016e-06, + "loss": 0.6153, + "step": 7440 + }, + { + "epoch": 2.7944486121530385, + "grad_norm": 18.100868225097656, + "learning_rate": 1.4643905041344357e-06, + "loss": 0.635, + "step": 7450 + }, + { + "epoch": 2.798199549887472, + "grad_norm": 17.497039794921875, + "learning_rate": 1.437716724459856e-06, + "loss": 0.7681, + "step": 7460 + }, + { + "epoch": 2.8019504876219052, + "grad_norm": 11.748749732971191, + "learning_rate": 1.411042944785276e-06, + "loss": 0.7916, + "step": 7470 + }, + { + "epoch": 2.805701425356339, + "grad_norm": 17.71030616760254, + "learning_rate": 1.3843691651106963e-06, + "loss": 0.6826, + "step": 7480 + }, + { + "epoch": 2.809452363090773, + "grad_norm": 15.269068717956543, + "learning_rate": 1.3576953854361163e-06, + "loss": 0.6008, + "step": 7490 + }, + { + "epoch": 2.813203300825206, + "grad_norm": 16.148839950561523, + "learning_rate": 1.3310216057615366e-06, + "loss": 0.7581, + "step": 7500 + }, + { + "epoch": 2.81695423855964, + "grad_norm": 7.341813564300537, + "learning_rate": 1.3043478260869566e-06, + "loss": 0.4343, + "step": 7510 + }, + { + "epoch": 2.8207051762940734, + "grad_norm": 11.722135543823242, + "learning_rate": 1.2776740464123769e-06, + "loss": 0.5226, + "step": 7520 + }, + { + "epoch": 2.824456114028507, + "grad_norm": 17.107776641845703, + "learning_rate": 1.251000266737797e-06, + "loss": 0.6466, + "step": 7530 + }, + { + "epoch": 2.8282070517629405, + "grad_norm": 15.833941459655762, + "learning_rate": 1.224326487063217e-06, + "loss": 0.7703, + "step": 7540 + }, + { + "epoch": 2.8319579894973743, + "grad_norm": 19.610742568969727, + "learning_rate": 1.197652707388637e-06, + "loss": 0.6359, + "step": 7550 + }, + { + "epoch": 2.835708927231808, + "grad_norm": 12.620158195495605, + "learning_rate": 1.1709789277140572e-06, + "loss": 0.6561, + "step": 7560 + }, + { + "epoch": 2.8394598649662415, + "grad_norm": 20.80132293701172, + "learning_rate": 1.1443051480394773e-06, + "loss": 0.7668, + "step": 7570 + }, + { + "epoch": 2.8432108027006753, + "grad_norm": 9.778907775878906, + "learning_rate": 1.1176313683648973e-06, + "loss": 0.7067, + "step": 7580 + }, + { + "epoch": 2.8469617404351086, + "grad_norm": 11.224839210510254, + "learning_rate": 1.0909575886903174e-06, + "loss": 0.5963, + "step": 7590 + }, + { + "epoch": 2.8507126781695424, + "grad_norm": 11.957784652709961, + "learning_rate": 1.0642838090157376e-06, + "loss": 0.7127, + "step": 7600 + }, + { + "epoch": 2.854463615903976, + "grad_norm": 17.465967178344727, + "learning_rate": 1.0376100293411576e-06, + "loss": 0.5896, + "step": 7610 + }, + { + "epoch": 2.8582145536384096, + "grad_norm": 22.074583053588867, + "learning_rate": 1.010936249666578e-06, + "loss": 0.664, + "step": 7620 + }, + { + "epoch": 2.8619654913728434, + "grad_norm": 45.1811408996582, + "learning_rate": 9.84262469991998e-07, + "loss": 0.6788, + "step": 7630 + }, + { + "epoch": 2.8657164291072768, + "grad_norm": 12.519074440002441, + "learning_rate": 9.57588690317418e-07, + "loss": 0.5208, + "step": 7640 + }, + { + "epoch": 2.8694673668417106, + "grad_norm": 14.533720016479492, + "learning_rate": 9.309149106428382e-07, + "loss": 0.6403, + "step": 7650 + }, + { + "epoch": 2.873218304576144, + "grad_norm": 6.502141952514648, + "learning_rate": 9.042411309682584e-07, + "loss": 0.6661, + "step": 7660 + }, + { + "epoch": 2.8769692423105777, + "grad_norm": 16.4246826171875, + "learning_rate": 8.775673512936784e-07, + "loss": 0.6631, + "step": 7670 + }, + { + "epoch": 2.880720180045011, + "grad_norm": 20.435749053955078, + "learning_rate": 8.508935716190984e-07, + "loss": 0.769, + "step": 7680 + }, + { + "epoch": 2.884471117779445, + "grad_norm": 9.382180213928223, + "learning_rate": 8.242197919445186e-07, + "loss": 0.5407, + "step": 7690 + }, + { + "epoch": 2.8882220555138787, + "grad_norm": 12.802393913269043, + "learning_rate": 7.975460122699387e-07, + "loss": 0.636, + "step": 7700 + }, + { + "epoch": 2.891972993248312, + "grad_norm": 5.997576713562012, + "learning_rate": 7.708722325953588e-07, + "loss": 0.561, + "step": 7710 + }, + { + "epoch": 2.895723930982746, + "grad_norm": 8.369012832641602, + "learning_rate": 7.441984529207789e-07, + "loss": 0.5721, + "step": 7720 + }, + { + "epoch": 2.899474868717179, + "grad_norm": 19.990249633789062, + "learning_rate": 7.175246732461991e-07, + "loss": 0.7267, + "step": 7730 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 19.364540100097656, + "learning_rate": 6.908508935716192e-07, + "loss": 0.6867, + "step": 7740 + }, + { + "epoch": 2.9069767441860463, + "grad_norm": 10.638273239135742, + "learning_rate": 6.641771138970394e-07, + "loss": 0.5809, + "step": 7750 + }, + { + "epoch": 2.91072768192048, + "grad_norm": 24.913246154785156, + "learning_rate": 6.375033342224594e-07, + "loss": 0.5658, + "step": 7760 + }, + { + "epoch": 2.914478619654914, + "grad_norm": 6.1255412101745605, + "learning_rate": 6.108295545478795e-07, + "loss": 0.4796, + "step": 7770 + }, + { + "epoch": 2.9182295573893473, + "grad_norm": 13.97762680053711, + "learning_rate": 5.841557748732996e-07, + "loss": 0.6201, + "step": 7780 + }, + { + "epoch": 2.921980495123781, + "grad_norm": 24.56553840637207, + "learning_rate": 5.574819951987197e-07, + "loss": 0.5206, + "step": 7790 + }, + { + "epoch": 2.9257314328582145, + "grad_norm": 20.081579208374023, + "learning_rate": 5.308082155241398e-07, + "loss": 0.5697, + "step": 7800 + }, + { + "epoch": 2.9294823705926483, + "grad_norm": 11.358619689941406, + "learning_rate": 5.041344358495599e-07, + "loss": 0.6268, + "step": 7810 + }, + { + "epoch": 2.9332333083270816, + "grad_norm": 11.016149520874023, + "learning_rate": 4.7746065617498e-07, + "loss": 0.5753, + "step": 7820 + }, + { + "epoch": 2.9369842460615154, + "grad_norm": 17.64615249633789, + "learning_rate": 4.507868765004002e-07, + "loss": 0.7584, + "step": 7830 + }, + { + "epoch": 2.9407351837959492, + "grad_norm": 17.292207717895508, + "learning_rate": 4.2411309682582024e-07, + "loss": 0.6361, + "step": 7840 + }, + { + "epoch": 2.9444861215303826, + "grad_norm": 17.94815444946289, + "learning_rate": 3.974393171512404e-07, + "loss": 0.7208, + "step": 7850 + }, + { + "epoch": 2.948237059264816, + "grad_norm": 13.073601722717285, + "learning_rate": 3.7076553747666047e-07, + "loss": 0.7179, + "step": 7860 + }, + { + "epoch": 2.9519879969992497, + "grad_norm": 7.956513404846191, + "learning_rate": 3.440917578020806e-07, + "loss": 0.6109, + "step": 7870 + }, + { + "epoch": 2.9557389347336835, + "grad_norm": 18.16693687438965, + "learning_rate": 3.1741797812750066e-07, + "loss": 0.6499, + "step": 7880 + }, + { + "epoch": 2.959489872468117, + "grad_norm": 25.006132125854492, + "learning_rate": 2.907441984529208e-07, + "loss": 0.5358, + "step": 7890 + }, + { + "epoch": 2.9632408102025507, + "grad_norm": 20.937856674194336, + "learning_rate": 2.640704187783409e-07, + "loss": 0.6364, + "step": 7900 + }, + { + "epoch": 2.9669917479369845, + "grad_norm": 12.37922477722168, + "learning_rate": 2.3739663910376104e-07, + "loss": 0.4916, + "step": 7910 + }, + { + "epoch": 2.970742685671418, + "grad_norm": 8.240549087524414, + "learning_rate": 2.1072285942918113e-07, + "loss": 0.6811, + "step": 7920 + }, + { + "epoch": 2.974493623405851, + "grad_norm": 9.405010223388672, + "learning_rate": 1.8404907975460125e-07, + "loss": 0.5338, + "step": 7930 + }, + { + "epoch": 2.978244561140285, + "grad_norm": 13.773921966552734, + "learning_rate": 1.5737530008002134e-07, + "loss": 0.6314, + "step": 7940 + }, + { + "epoch": 2.981995498874719, + "grad_norm": 12.41072940826416, + "learning_rate": 1.3070152040544146e-07, + "loss": 0.5497, + "step": 7950 + }, + { + "epoch": 2.985746436609152, + "grad_norm": 17.232473373413086, + "learning_rate": 1.0402774073086158e-07, + "loss": 0.588, + "step": 7960 + }, + { + "epoch": 2.989497374343586, + "grad_norm": 27.516319274902344, + "learning_rate": 7.735396105628168e-08, + "loss": 0.7673, + "step": 7970 + }, + { + "epoch": 2.99324831207802, + "grad_norm": 16.864728927612305, + "learning_rate": 5.0680181381701795e-08, + "loss": 0.6883, + "step": 7980 + }, + { + "epoch": 2.996999249812453, + "grad_norm": 16.803760528564453, + "learning_rate": 2.40064017071219e-08, + "loss": 0.6337, + "step": 7990 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.5919831223628692, + "eval_f1_macro": 0.5904844573730711, + "eval_f1_weighted": 0.5917816930917, + "eval_loss": 1.0033386945724487, + "eval_precision_macro": 0.5945695673493336, + "eval_precision_weighted": 0.5926704635628428, + "eval_recall_macro": 0.5877159391363334, + "eval_recall_weighted": 0.5919831223628692, + "eval_runtime": 4.8573, + "eval_samples_per_second": 487.928, + "eval_steps_per_second": 61.145, + "step": 7998 + } + ], + "logging_steps": 10, + "max_steps": 7998, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.68326808991488e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}