{ "best_metric": 0.5136106610298157, "best_model_checkpoint": "./results/checkpoint-2874", "epoch": 4.0, "eval_steps": 500, "global_step": 5748, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006958942240779402, "grad_norm": 4.3180251121521, "learning_rate": 1.9972164231036883e-05, "loss": 1.3759, "step": 10 }, { "epoch": 0.013917884481558803, "grad_norm": 2.7253687381744385, "learning_rate": 1.9944328462073764e-05, "loss": 1.3343, "step": 20 }, { "epoch": 0.020876826722338204, "grad_norm": 3.703948736190796, "learning_rate": 1.9916492693110648e-05, "loss": 1.226, "step": 30 }, { "epoch": 0.027835768963117607, "grad_norm": 6.535583972930908, "learning_rate": 1.9888656924147533e-05, "loss": 1.0883, "step": 40 }, { "epoch": 0.03479471120389701, "grad_norm": 13.897150039672852, "learning_rate": 1.9860821155184414e-05, "loss": 0.9748, "step": 50 }, { "epoch": 0.04175365344467641, "grad_norm": 7.23193359375, "learning_rate": 1.9832985386221295e-05, "loss": 0.8725, "step": 60 }, { "epoch": 0.04871259568545581, "grad_norm": 13.598752975463867, "learning_rate": 1.980514961725818e-05, "loss": 0.8301, "step": 70 }, { "epoch": 0.055671537926235214, "grad_norm": 15.322084426879883, "learning_rate": 1.977731384829506e-05, "loss": 0.8432, "step": 80 }, { "epoch": 0.06263048016701461, "grad_norm": 6.434969425201416, "learning_rate": 1.974947807933194e-05, "loss": 0.7408, "step": 90 }, { "epoch": 0.06958942240779402, "grad_norm": 8.11408805847168, "learning_rate": 1.9721642310368826e-05, "loss": 0.7086, "step": 100 }, { "epoch": 0.07654836464857341, "grad_norm": 7.85330057144165, "learning_rate": 1.969380654140571e-05, "loss": 0.7318, "step": 110 }, { "epoch": 0.08350730688935282, "grad_norm": 10.534364700317383, "learning_rate": 1.966597077244259e-05, "loss": 0.6798, "step": 120 }, { "epoch": 0.09046624913013222, "grad_norm": 13.151297569274902, "learning_rate": 1.9638135003479472e-05, "loss": 0.6988, "step": 130 }, { "epoch": 0.09742519137091162, "grad_norm": 9.17402458190918, "learning_rate": 1.9610299234516353e-05, "loss": 0.6723, "step": 140 }, { "epoch": 0.10438413361169102, "grad_norm": 19.499664306640625, "learning_rate": 1.9582463465553238e-05, "loss": 0.5994, "step": 150 }, { "epoch": 0.11134307585247043, "grad_norm": 11.141926765441895, "learning_rate": 1.955462769659012e-05, "loss": 0.6565, "step": 160 }, { "epoch": 0.11830201809324982, "grad_norm": 15.322267532348633, "learning_rate": 1.9526791927627003e-05, "loss": 0.7444, "step": 170 }, { "epoch": 0.12526096033402923, "grad_norm": 9.982789993286133, "learning_rate": 1.9498956158663885e-05, "loss": 0.6882, "step": 180 }, { "epoch": 0.13221990257480862, "grad_norm": 15.503236770629883, "learning_rate": 1.9471120389700766e-05, "loss": 0.6286, "step": 190 }, { "epoch": 0.13917884481558804, "grad_norm": 9.867083549499512, "learning_rate": 1.944328462073765e-05, "loss": 0.7333, "step": 200 }, { "epoch": 0.14613778705636743, "grad_norm": 21.876277923583984, "learning_rate": 1.941544885177453e-05, "loss": 0.7438, "step": 210 }, { "epoch": 0.15309672929714682, "grad_norm": 17.48896598815918, "learning_rate": 1.9387613082811416e-05, "loss": 0.6269, "step": 220 }, { "epoch": 0.16005567153792624, "grad_norm": 6.696071147918701, "learning_rate": 1.9359777313848297e-05, "loss": 0.6739, "step": 230 }, { "epoch": 0.16701461377870563, "grad_norm": 11.048694610595703, "learning_rate": 1.933194154488518e-05, "loss": 0.7373, "step": 240 }, { "epoch": 0.17397355601948503, "grad_norm": 8.395058631896973, "learning_rate": 1.9304105775922062e-05, "loss": 0.6133, "step": 250 }, { "epoch": 0.18093249826026445, "grad_norm": 9.003988265991211, "learning_rate": 1.9276270006958943e-05, "loss": 0.5964, "step": 260 }, { "epoch": 0.18789144050104384, "grad_norm": 10.79345703125, "learning_rate": 1.9248434237995824e-05, "loss": 0.6293, "step": 270 }, { "epoch": 0.19485038274182323, "grad_norm": 19.241323471069336, "learning_rate": 1.922059846903271e-05, "loss": 0.6418, "step": 280 }, { "epoch": 0.20180932498260265, "grad_norm": 20.079133987426758, "learning_rate": 1.9192762700069593e-05, "loss": 0.749, "step": 290 }, { "epoch": 0.20876826722338204, "grad_norm": 31.14455223083496, "learning_rate": 1.9164926931106474e-05, "loss": 0.6719, "step": 300 }, { "epoch": 0.21572720946416143, "grad_norm": 18.57120132446289, "learning_rate": 1.9137091162143355e-05, "loss": 0.674, "step": 310 }, { "epoch": 0.22268615170494085, "grad_norm": 6.905323028564453, "learning_rate": 1.9109255393180236e-05, "loss": 0.5172, "step": 320 }, { "epoch": 0.22964509394572025, "grad_norm": 9.24687671661377, "learning_rate": 1.908141962421712e-05, "loss": 0.637, "step": 330 }, { "epoch": 0.23660403618649964, "grad_norm": 11.945255279541016, "learning_rate": 1.9053583855254002e-05, "loss": 0.6888, "step": 340 }, { "epoch": 0.24356297842727906, "grad_norm": 7.873608112335205, "learning_rate": 1.9025748086290886e-05, "loss": 0.625, "step": 350 }, { "epoch": 0.25052192066805845, "grad_norm": 10.357784271240234, "learning_rate": 1.8997912317327767e-05, "loss": 0.5676, "step": 360 }, { "epoch": 0.25748086290883787, "grad_norm": 12.345287322998047, "learning_rate": 1.8970076548364652e-05, "loss": 0.5983, "step": 370 }, { "epoch": 0.26443980514961724, "grad_norm": 7.470931053161621, "learning_rate": 1.8942240779401533e-05, "loss": 0.5978, "step": 380 }, { "epoch": 0.27139874739039666, "grad_norm": 10.457704544067383, "learning_rate": 1.8914405010438414e-05, "loss": 0.6074, "step": 390 }, { "epoch": 0.2783576896311761, "grad_norm": 14.570989608764648, "learning_rate": 1.8886569241475295e-05, "loss": 0.6218, "step": 400 }, { "epoch": 0.28531663187195544, "grad_norm": 4.791262626647949, "learning_rate": 1.885873347251218e-05, "loss": 0.5941, "step": 410 }, { "epoch": 0.29227557411273486, "grad_norm": 7.30219030380249, "learning_rate": 1.8830897703549064e-05, "loss": 0.5407, "step": 420 }, { "epoch": 0.2992345163535143, "grad_norm": 7.455644130706787, "learning_rate": 1.8803061934585945e-05, "loss": 0.6167, "step": 430 }, { "epoch": 0.30619345859429364, "grad_norm": 14.505638122558594, "learning_rate": 1.8775226165622826e-05, "loss": 0.6554, "step": 440 }, { "epoch": 0.31315240083507306, "grad_norm": 9.505303382873535, "learning_rate": 1.8747390396659707e-05, "loss": 0.5817, "step": 450 }, { "epoch": 0.3201113430758525, "grad_norm": 11.907074928283691, "learning_rate": 1.871955462769659e-05, "loss": 0.5423, "step": 460 }, { "epoch": 0.32707028531663185, "grad_norm": 9.421733856201172, "learning_rate": 1.8691718858733473e-05, "loss": 0.5928, "step": 470 }, { "epoch": 0.33402922755741127, "grad_norm": 8.2644624710083, "learning_rate": 1.8663883089770357e-05, "loss": 0.6234, "step": 480 }, { "epoch": 0.3409881697981907, "grad_norm": 16.61441421508789, "learning_rate": 1.8636047320807238e-05, "loss": 0.5777, "step": 490 }, { "epoch": 0.34794711203897005, "grad_norm": 8.484561920166016, "learning_rate": 1.8608211551844123e-05, "loss": 0.5505, "step": 500 }, { "epoch": 0.35490605427974947, "grad_norm": 7.690084457397461, "learning_rate": 1.8580375782881004e-05, "loss": 0.5898, "step": 510 }, { "epoch": 0.3618649965205289, "grad_norm": 7.3729963302612305, "learning_rate": 1.8552540013917885e-05, "loss": 0.6634, "step": 520 }, { "epoch": 0.36882393876130826, "grad_norm": 8.139713287353516, "learning_rate": 1.852470424495477e-05, "loss": 0.6074, "step": 530 }, { "epoch": 0.3757828810020877, "grad_norm": 10.640233039855957, "learning_rate": 1.849686847599165e-05, "loss": 0.6352, "step": 540 }, { "epoch": 0.3827418232428671, "grad_norm": 16.793916702270508, "learning_rate": 1.8469032707028535e-05, "loss": 0.6306, "step": 550 }, { "epoch": 0.38970076548364646, "grad_norm": 15.125770568847656, "learning_rate": 1.8441196938065416e-05, "loss": 0.6629, "step": 560 }, { "epoch": 0.3966597077244259, "grad_norm": 13.026156425476074, "learning_rate": 1.8413361169102297e-05, "loss": 0.5743, "step": 570 }, { "epoch": 0.4036186499652053, "grad_norm": 10.893036842346191, "learning_rate": 1.838552540013918e-05, "loss": 0.5443, "step": 580 }, { "epoch": 0.41057759220598466, "grad_norm": 21.39899253845215, "learning_rate": 1.8357689631176062e-05, "loss": 0.5717, "step": 590 }, { "epoch": 0.4175365344467641, "grad_norm": 29.05453872680664, "learning_rate": 1.8329853862212947e-05, "loss": 0.5856, "step": 600 }, { "epoch": 0.4244954766875435, "grad_norm": 9.653709411621094, "learning_rate": 1.8302018093249828e-05, "loss": 0.5105, "step": 610 }, { "epoch": 0.43145441892832287, "grad_norm": 14.518112182617188, "learning_rate": 1.827418232428671e-05, "loss": 0.6757, "step": 620 }, { "epoch": 0.4384133611691023, "grad_norm": 12.425946235656738, "learning_rate": 1.8246346555323593e-05, "loss": 0.5708, "step": 630 }, { "epoch": 0.4453723034098817, "grad_norm": 16.857666015625, "learning_rate": 1.8218510786360474e-05, "loss": 0.603, "step": 640 }, { "epoch": 0.4523312456506611, "grad_norm": 19.68130111694336, "learning_rate": 1.8190675017397356e-05, "loss": 0.611, "step": 650 }, { "epoch": 0.4592901878914405, "grad_norm": 13.848482131958008, "learning_rate": 1.816283924843424e-05, "loss": 0.625, "step": 660 }, { "epoch": 0.4662491301322199, "grad_norm": 12.750751495361328, "learning_rate": 1.813500347947112e-05, "loss": 0.616, "step": 670 }, { "epoch": 0.4732080723729993, "grad_norm": 9.34748363494873, "learning_rate": 1.8107167710508005e-05, "loss": 0.612, "step": 680 }, { "epoch": 0.4801670146137787, "grad_norm": 14.746898651123047, "learning_rate": 1.8079331941544887e-05, "loss": 0.6521, "step": 690 }, { "epoch": 0.4871259568545581, "grad_norm": 16.734874725341797, "learning_rate": 1.8051496172581768e-05, "loss": 0.606, "step": 700 }, { "epoch": 0.4940848990953375, "grad_norm": 10.393136024475098, "learning_rate": 1.8023660403618652e-05, "loss": 0.6286, "step": 710 }, { "epoch": 0.5010438413361169, "grad_norm": 8.78846263885498, "learning_rate": 1.7995824634655533e-05, "loss": 0.4593, "step": 720 }, { "epoch": 0.5080027835768963, "grad_norm": 18.29091453552246, "learning_rate": 1.7967988865692418e-05, "loss": 0.6764, "step": 730 }, { "epoch": 0.5149617258176757, "grad_norm": 13.510717391967773, "learning_rate": 1.79401530967293e-05, "loss": 0.6387, "step": 740 }, { "epoch": 0.5219206680584552, "grad_norm": 8.811914443969727, "learning_rate": 1.791231732776618e-05, "loss": 0.6072, "step": 750 }, { "epoch": 0.5288796102992345, "grad_norm": 12.649739265441895, "learning_rate": 1.7884481558803064e-05, "loss": 0.6128, "step": 760 }, { "epoch": 0.5358385525400139, "grad_norm": 9.891730308532715, "learning_rate": 1.7856645789839945e-05, "loss": 0.568, "step": 770 }, { "epoch": 0.5427974947807933, "grad_norm": 9.014187812805176, "learning_rate": 1.7828810020876826e-05, "loss": 0.5393, "step": 780 }, { "epoch": 0.5497564370215727, "grad_norm": 19.861948013305664, "learning_rate": 1.780097425191371e-05, "loss": 0.5345, "step": 790 }, { "epoch": 0.5567153792623522, "grad_norm": 8.01091480255127, "learning_rate": 1.7773138482950595e-05, "loss": 0.6457, "step": 800 }, { "epoch": 0.5636743215031316, "grad_norm": 10.99765682220459, "learning_rate": 1.7745302713987476e-05, "loss": 0.5605, "step": 810 }, { "epoch": 0.5706332637439109, "grad_norm": 7.248875617980957, "learning_rate": 1.7717466945024357e-05, "loss": 0.5975, "step": 820 }, { "epoch": 0.5775922059846903, "grad_norm": 12.024378776550293, "learning_rate": 1.768963117606124e-05, "loss": 0.6238, "step": 830 }, { "epoch": 0.5845511482254697, "grad_norm": 7.073344707489014, "learning_rate": 1.7661795407098123e-05, "loss": 0.6067, "step": 840 }, { "epoch": 0.5915100904662491, "grad_norm": 11.20361614227295, "learning_rate": 1.7633959638135004e-05, "loss": 0.5319, "step": 850 }, { "epoch": 0.5984690327070286, "grad_norm": 15.502198219299316, "learning_rate": 1.760612386917189e-05, "loss": 0.6038, "step": 860 }, { "epoch": 0.605427974947808, "grad_norm": 7.949779510498047, "learning_rate": 1.757828810020877e-05, "loss": 0.5103, "step": 870 }, { "epoch": 0.6123869171885873, "grad_norm": 21.52058219909668, "learning_rate": 1.7550452331245654e-05, "loss": 0.5607, "step": 880 }, { "epoch": 0.6193458594293667, "grad_norm": 11.751809120178223, "learning_rate": 1.7522616562282535e-05, "loss": 0.46, "step": 890 }, { "epoch": 0.6263048016701461, "grad_norm": 19.769689559936523, "learning_rate": 1.7494780793319416e-05, "loss": 0.5555, "step": 900 }, { "epoch": 0.6332637439109255, "grad_norm": 6.040853977203369, "learning_rate": 1.74669450243563e-05, "loss": 0.6577, "step": 910 }, { "epoch": 0.640222686151705, "grad_norm": 10.926910400390625, "learning_rate": 1.743910925539318e-05, "loss": 0.5674, "step": 920 }, { "epoch": 0.6471816283924844, "grad_norm": 5.619332790374756, "learning_rate": 1.7411273486430066e-05, "loss": 0.5937, "step": 930 }, { "epoch": 0.6541405706332637, "grad_norm": 9.416117668151855, "learning_rate": 1.7383437717466947e-05, "loss": 0.5718, "step": 940 }, { "epoch": 0.6610995128740431, "grad_norm": 16.25941276550293, "learning_rate": 1.7355601948503828e-05, "loss": 0.5068, "step": 950 }, { "epoch": 0.6680584551148225, "grad_norm": 10.545511245727539, "learning_rate": 1.732776617954071e-05, "loss": 0.5642, "step": 960 }, { "epoch": 0.675017397355602, "grad_norm": 7.5949578285217285, "learning_rate": 1.7299930410577594e-05, "loss": 0.5243, "step": 970 }, { "epoch": 0.6819763395963814, "grad_norm": 10.766368865966797, "learning_rate": 1.7272094641614475e-05, "loss": 0.5285, "step": 980 }, { "epoch": 0.6889352818371608, "grad_norm": 7.256805419921875, "learning_rate": 1.724425887265136e-05, "loss": 0.5286, "step": 990 }, { "epoch": 0.6958942240779401, "grad_norm": 10.257540702819824, "learning_rate": 1.721642310368824e-05, "loss": 0.6115, "step": 1000 }, { "epoch": 0.7028531663187195, "grad_norm": 14.531510353088379, "learning_rate": 1.7188587334725125e-05, "loss": 0.6112, "step": 1010 }, { "epoch": 0.7098121085594989, "grad_norm": 8.326130867004395, "learning_rate": 1.7160751565762006e-05, "loss": 0.5534, "step": 1020 }, { "epoch": 0.7167710508002784, "grad_norm": 13.698468208312988, "learning_rate": 1.7132915796798887e-05, "loss": 0.5505, "step": 1030 }, { "epoch": 0.7237299930410578, "grad_norm": 12.968040466308594, "learning_rate": 1.710508002783577e-05, "loss": 0.596, "step": 1040 }, { "epoch": 0.7306889352818372, "grad_norm": 10.557011604309082, "learning_rate": 1.7077244258872652e-05, "loss": 0.5588, "step": 1050 }, { "epoch": 0.7376478775226165, "grad_norm": 7.124124526977539, "learning_rate": 1.7049408489909537e-05, "loss": 0.5644, "step": 1060 }, { "epoch": 0.7446068197633959, "grad_norm": 7.210671901702881, "learning_rate": 1.7021572720946418e-05, "loss": 0.5828, "step": 1070 }, { "epoch": 0.7515657620041754, "grad_norm": 20.80126190185547, "learning_rate": 1.69937369519833e-05, "loss": 0.5491, "step": 1080 }, { "epoch": 0.7585247042449548, "grad_norm": 8.95080852508545, "learning_rate": 1.696590118302018e-05, "loss": 0.495, "step": 1090 }, { "epoch": 0.7654836464857342, "grad_norm": 8.503535270690918, "learning_rate": 1.6938065414057064e-05, "loss": 0.5253, "step": 1100 }, { "epoch": 0.7724425887265136, "grad_norm": 6.803649425506592, "learning_rate": 1.691022964509395e-05, "loss": 0.5859, "step": 1110 }, { "epoch": 0.7794015309672929, "grad_norm": 9.828047752380371, "learning_rate": 1.688239387613083e-05, "loss": 0.5418, "step": 1120 }, { "epoch": 0.7863604732080723, "grad_norm": 7.690149307250977, "learning_rate": 1.685455810716771e-05, "loss": 0.5738, "step": 1130 }, { "epoch": 0.7933194154488518, "grad_norm": 8.76807975769043, "learning_rate": 1.6826722338204595e-05, "loss": 0.458, "step": 1140 }, { "epoch": 0.8002783576896312, "grad_norm": 10.1242036819458, "learning_rate": 1.6798886569241476e-05, "loss": 0.5573, "step": 1150 }, { "epoch": 0.8072372999304106, "grad_norm": 8.316211700439453, "learning_rate": 1.6771050800278358e-05, "loss": 0.5527, "step": 1160 }, { "epoch": 0.81419624217119, "grad_norm": 15.57465934753418, "learning_rate": 1.6743215031315242e-05, "loss": 0.5704, "step": 1170 }, { "epoch": 0.8211551844119693, "grad_norm": 13.459155082702637, "learning_rate": 1.6715379262352126e-05, "loss": 0.5659, "step": 1180 }, { "epoch": 0.8281141266527487, "grad_norm": 7.378421783447266, "learning_rate": 1.6687543493389008e-05, "loss": 0.5482, "step": 1190 }, { "epoch": 0.8350730688935282, "grad_norm": 10.607648849487305, "learning_rate": 1.665970772442589e-05, "loss": 0.5627, "step": 1200 }, { "epoch": 0.8420320111343076, "grad_norm": 13.157455444335938, "learning_rate": 1.663187195546277e-05, "loss": 0.5479, "step": 1210 }, { "epoch": 0.848990953375087, "grad_norm": 9.343613624572754, "learning_rate": 1.6604036186499654e-05, "loss": 0.5695, "step": 1220 }, { "epoch": 0.8559498956158664, "grad_norm": 24.01211929321289, "learning_rate": 1.6576200417536535e-05, "loss": 0.5173, "step": 1230 }, { "epoch": 0.8629088378566457, "grad_norm": 14.033452987670898, "learning_rate": 1.654836464857342e-05, "loss": 0.5604, "step": 1240 }, { "epoch": 0.8698677800974252, "grad_norm": 13.518189430236816, "learning_rate": 1.65205288796103e-05, "loss": 0.5797, "step": 1250 }, { "epoch": 0.8768267223382046, "grad_norm": 9.779712677001953, "learning_rate": 1.6492693110647182e-05, "loss": 0.5761, "step": 1260 }, { "epoch": 0.883785664578984, "grad_norm": 25.537031173706055, "learning_rate": 1.6464857341684066e-05, "loss": 0.4546, "step": 1270 }, { "epoch": 0.8907446068197634, "grad_norm": 9.35092544555664, "learning_rate": 1.6437021572720947e-05, "loss": 0.5037, "step": 1280 }, { "epoch": 0.8977035490605428, "grad_norm": 11.983678817749023, "learning_rate": 1.640918580375783e-05, "loss": 0.4967, "step": 1290 }, { "epoch": 0.9046624913013221, "grad_norm": 8.681296348571777, "learning_rate": 1.6381350034794713e-05, "loss": 0.6147, "step": 1300 }, { "epoch": 0.9116214335421016, "grad_norm": 9.482718467712402, "learning_rate": 1.6353514265831597e-05, "loss": 0.5906, "step": 1310 }, { "epoch": 0.918580375782881, "grad_norm": 11.09154987335205, "learning_rate": 1.6325678496868478e-05, "loss": 0.5307, "step": 1320 }, { "epoch": 0.9255393180236604, "grad_norm": 12.523815155029297, "learning_rate": 1.629784272790536e-05, "loss": 0.4931, "step": 1330 }, { "epoch": 0.9324982602644398, "grad_norm": 13.545294761657715, "learning_rate": 1.627000695894224e-05, "loss": 0.4874, "step": 1340 }, { "epoch": 0.9394572025052192, "grad_norm": 8.994409561157227, "learning_rate": 1.6242171189979125e-05, "loss": 0.5581, "step": 1350 }, { "epoch": 0.9464161447459986, "grad_norm": 9.682478904724121, "learning_rate": 1.6214335421016006e-05, "loss": 0.5548, "step": 1360 }, { "epoch": 0.953375086986778, "grad_norm": 7.079063415527344, "learning_rate": 1.618649965205289e-05, "loss": 0.4918, "step": 1370 }, { "epoch": 0.9603340292275574, "grad_norm": 10.886133193969727, "learning_rate": 1.615866388308977e-05, "loss": 0.594, "step": 1380 }, { "epoch": 0.9672929714683368, "grad_norm": 17.207847595214844, "learning_rate": 1.6130828114126653e-05, "loss": 0.5892, "step": 1390 }, { "epoch": 0.9742519137091162, "grad_norm": 9.363895416259766, "learning_rate": 1.6102992345163537e-05, "loss": 0.5774, "step": 1400 }, { "epoch": 0.9812108559498957, "grad_norm": 16.450660705566406, "learning_rate": 1.6075156576200418e-05, "loss": 0.481, "step": 1410 }, { "epoch": 0.988169798190675, "grad_norm": 12.754993438720703, "learning_rate": 1.6047320807237302e-05, "loss": 0.5667, "step": 1420 }, { "epoch": 0.9951287404314544, "grad_norm": 9.848027229309082, "learning_rate": 1.6019485038274184e-05, "loss": 0.562, "step": 1430 }, { "epoch": 1.0, "eval_accuracy": 0.8099216710182767, "eval_f1": 0.8040056577967634, "eval_loss": 0.518168568611145, "eval_precision": 0.8071878224437447, "eval_recall": 0.8099216710182767, "eval_runtime": 30.1807, "eval_samples_per_second": 190.353, "eval_steps_per_second": 5.964, "step": 1437 }, { "epoch": 1.0020876826722338, "grad_norm": 10.002483367919922, "learning_rate": 1.5991649269311068e-05, "loss": 0.489, "step": 1440 }, { "epoch": 1.0090466249130132, "grad_norm": 8.09190559387207, "learning_rate": 1.596381350034795e-05, "loss": 0.4158, "step": 1450 }, { "epoch": 1.0160055671537926, "grad_norm": 11.66649055480957, "learning_rate": 1.593597773138483e-05, "loss": 0.4249, "step": 1460 }, { "epoch": 1.022964509394572, "grad_norm": 11.087824821472168, "learning_rate": 1.590814196242171e-05, "loss": 0.4064, "step": 1470 }, { "epoch": 1.0299234516353515, "grad_norm": 14.069585800170898, "learning_rate": 1.5880306193458596e-05, "loss": 0.4159, "step": 1480 }, { "epoch": 1.036882393876131, "grad_norm": 17.53207778930664, "learning_rate": 1.585247042449548e-05, "loss": 0.3956, "step": 1490 }, { "epoch": 1.0438413361169103, "grad_norm": 5.558675289154053, "learning_rate": 1.582463465553236e-05, "loss": 0.3837, "step": 1500 }, { "epoch": 1.0508002783576895, "grad_norm": 14.77510929107666, "learning_rate": 1.5796798886569242e-05, "loss": 0.3886, "step": 1510 }, { "epoch": 1.057759220598469, "grad_norm": 11.358789443969727, "learning_rate": 1.5768963117606123e-05, "loss": 0.3557, "step": 1520 }, { "epoch": 1.0647181628392484, "grad_norm": 15.428235054016113, "learning_rate": 1.5741127348643008e-05, "loss": 0.4789, "step": 1530 }, { "epoch": 1.0716771050800278, "grad_norm": 16.330005645751953, "learning_rate": 1.571329157967989e-05, "loss": 0.4957, "step": 1540 }, { "epoch": 1.0786360473208072, "grad_norm": 11.652454376220703, "learning_rate": 1.5685455810716773e-05, "loss": 0.4305, "step": 1550 }, { "epoch": 1.0855949895615866, "grad_norm": 11.190437316894531, "learning_rate": 1.5657620041753654e-05, "loss": 0.4325, "step": 1560 }, { "epoch": 1.092553931802366, "grad_norm": 17.103654861450195, "learning_rate": 1.562978427279054e-05, "loss": 0.4482, "step": 1570 }, { "epoch": 1.0995128740431455, "grad_norm": 13.081258773803711, "learning_rate": 1.560194850382742e-05, "loss": 0.3345, "step": 1580 }, { "epoch": 1.1064718162839249, "grad_norm": 10.141121864318848, "learning_rate": 1.55741127348643e-05, "loss": 0.453, "step": 1590 }, { "epoch": 1.1134307585247043, "grad_norm": 15.91781997680664, "learning_rate": 1.5546276965901182e-05, "loss": 0.4351, "step": 1600 }, { "epoch": 1.1203897007654837, "grad_norm": 11.977279663085938, "learning_rate": 1.5518441196938066e-05, "loss": 0.3826, "step": 1610 }, { "epoch": 1.1273486430062631, "grad_norm": 12.163490295410156, "learning_rate": 1.549060542797495e-05, "loss": 0.439, "step": 1620 }, { "epoch": 1.1343075852470426, "grad_norm": 12.221410751342773, "learning_rate": 1.5462769659011832e-05, "loss": 0.4793, "step": 1630 }, { "epoch": 1.1412665274878218, "grad_norm": 9.962662696838379, "learning_rate": 1.5434933890048713e-05, "loss": 0.4524, "step": 1640 }, { "epoch": 1.1482254697286012, "grad_norm": 10.281346321105957, "learning_rate": 1.5407098121085594e-05, "loss": 0.4537, "step": 1650 }, { "epoch": 1.1551844119693806, "grad_norm": 8.511795997619629, "learning_rate": 1.537926235212248e-05, "loss": 0.3701, "step": 1660 }, { "epoch": 1.16214335421016, "grad_norm": 20.107454299926758, "learning_rate": 1.535142658315936e-05, "loss": 0.4807, "step": 1670 }, { "epoch": 1.1691022964509394, "grad_norm": 12.120085716247559, "learning_rate": 1.5323590814196244e-05, "loss": 0.4253, "step": 1680 }, { "epoch": 1.1760612386917189, "grad_norm": 13.358500480651855, "learning_rate": 1.5295755045233125e-05, "loss": 0.3941, "step": 1690 }, { "epoch": 1.1830201809324983, "grad_norm": 10.09534740447998, "learning_rate": 1.526791927627001e-05, "loss": 0.4248, "step": 1700 }, { "epoch": 1.1899791231732777, "grad_norm": 23.822050094604492, "learning_rate": 1.524008350730689e-05, "loss": 0.4355, "step": 1710 }, { "epoch": 1.1969380654140571, "grad_norm": 10.628303527832031, "learning_rate": 1.5212247738343773e-05, "loss": 0.3619, "step": 1720 }, { "epoch": 1.2038970076548365, "grad_norm": 18.837543487548828, "learning_rate": 1.5184411969380654e-05, "loss": 0.4091, "step": 1730 }, { "epoch": 1.210855949895616, "grad_norm": 9.72425365447998, "learning_rate": 1.5156576200417539e-05, "loss": 0.4468, "step": 1740 }, { "epoch": 1.2178148921363952, "grad_norm": 21.17300796508789, "learning_rate": 1.512874043145442e-05, "loss": 0.4977, "step": 1750 }, { "epoch": 1.2247738343771746, "grad_norm": 13.43977165222168, "learning_rate": 1.5100904662491303e-05, "loss": 0.4028, "step": 1760 }, { "epoch": 1.231732776617954, "grad_norm": 11.052497863769531, "learning_rate": 1.5073068893528184e-05, "loss": 0.5183, "step": 1770 }, { "epoch": 1.2386917188587334, "grad_norm": 12.521780014038086, "learning_rate": 1.5045233124565067e-05, "loss": 0.4714, "step": 1780 }, { "epoch": 1.2456506610995128, "grad_norm": 10.825766563415527, "learning_rate": 1.5017397355601951e-05, "loss": 0.459, "step": 1790 }, { "epoch": 1.2526096033402923, "grad_norm": 9.5389404296875, "learning_rate": 1.4989561586638832e-05, "loss": 0.4247, "step": 1800 }, { "epoch": 1.2595685455810717, "grad_norm": 10.02591609954834, "learning_rate": 1.4961725817675715e-05, "loss": 0.4035, "step": 1810 }, { "epoch": 1.266527487821851, "grad_norm": 11.203591346740723, "learning_rate": 1.4933890048712596e-05, "loss": 0.3517, "step": 1820 }, { "epoch": 1.2734864300626305, "grad_norm": 10.96849250793457, "learning_rate": 1.490605427974948e-05, "loss": 0.4199, "step": 1830 }, { "epoch": 1.28044537230341, "grad_norm": 11.830713272094727, "learning_rate": 1.4878218510786361e-05, "loss": 0.4145, "step": 1840 }, { "epoch": 1.2874043145441894, "grad_norm": 11.980402946472168, "learning_rate": 1.4850382741823244e-05, "loss": 0.4801, "step": 1850 }, { "epoch": 1.2943632567849686, "grad_norm": 12.308026313781738, "learning_rate": 1.4822546972860125e-05, "loss": 0.4601, "step": 1860 }, { "epoch": 1.3013221990257482, "grad_norm": 16.0020694732666, "learning_rate": 1.479471120389701e-05, "loss": 0.4674, "step": 1870 }, { "epoch": 1.3082811412665274, "grad_norm": 6.870344161987305, "learning_rate": 1.4766875434933892e-05, "loss": 0.4389, "step": 1880 }, { "epoch": 1.3152400835073068, "grad_norm": 12.440506935119629, "learning_rate": 1.4739039665970773e-05, "loss": 0.4242, "step": 1890 }, { "epoch": 1.3221990257480862, "grad_norm": 11.824153900146484, "learning_rate": 1.4711203897007655e-05, "loss": 0.4483, "step": 1900 }, { "epoch": 1.3291579679888657, "grad_norm": 6.609494686126709, "learning_rate": 1.4683368128044539e-05, "loss": 0.4573, "step": 1910 }, { "epoch": 1.336116910229645, "grad_norm": 12.088859558105469, "learning_rate": 1.4655532359081422e-05, "loss": 0.3912, "step": 1920 }, { "epoch": 1.3430758524704245, "grad_norm": 7.114819526672363, "learning_rate": 1.4627696590118303e-05, "loss": 0.3867, "step": 1930 }, { "epoch": 1.350034794711204, "grad_norm": 11.105134010314941, "learning_rate": 1.4599860821155186e-05, "loss": 0.5326, "step": 1940 }, { "epoch": 1.3569937369519833, "grad_norm": 8.85695743560791, "learning_rate": 1.4572025052192067e-05, "loss": 0.445, "step": 1950 }, { "epoch": 1.3639526791927628, "grad_norm": 13.736560821533203, "learning_rate": 1.4544189283228951e-05, "loss": 0.4333, "step": 1960 }, { "epoch": 1.3709116214335422, "grad_norm": 9.378311157226562, "learning_rate": 1.4516353514265832e-05, "loss": 0.4275, "step": 1970 }, { "epoch": 1.3778705636743216, "grad_norm": 10.04967212677002, "learning_rate": 1.4488517745302715e-05, "loss": 0.4151, "step": 1980 }, { "epoch": 1.3848295059151008, "grad_norm": 7.61630392074585, "learning_rate": 1.4460681976339596e-05, "loss": 0.4994, "step": 1990 }, { "epoch": 1.3917884481558804, "grad_norm": 14.55225944519043, "learning_rate": 1.443284620737648e-05, "loss": 0.4048, "step": 2000 }, { "epoch": 1.3987473903966596, "grad_norm": 11.608763694763184, "learning_rate": 1.4405010438413363e-05, "loss": 0.3836, "step": 2010 }, { "epoch": 1.405706332637439, "grad_norm": 11.155454635620117, "learning_rate": 1.4377174669450244e-05, "loss": 0.3942, "step": 2020 }, { "epoch": 1.4126652748782185, "grad_norm": 11.708532333374023, "learning_rate": 1.4349338900487127e-05, "loss": 0.478, "step": 2030 }, { "epoch": 1.4196242171189979, "grad_norm": 9.829862594604492, "learning_rate": 1.432150313152401e-05, "loss": 0.3487, "step": 2040 }, { "epoch": 1.4265831593597773, "grad_norm": 14.25184440612793, "learning_rate": 1.4293667362560893e-05, "loss": 0.5225, "step": 2050 }, { "epoch": 1.4335421016005567, "grad_norm": 12.317340850830078, "learning_rate": 1.4265831593597774e-05, "loss": 0.37, "step": 2060 }, { "epoch": 1.4405010438413361, "grad_norm": 13.569458961486816, "learning_rate": 1.4237995824634656e-05, "loss": 0.3805, "step": 2070 }, { "epoch": 1.4474599860821156, "grad_norm": 16.662263870239258, "learning_rate": 1.4210160055671537e-05, "loss": 0.3996, "step": 2080 }, { "epoch": 1.454418928322895, "grad_norm": 12.971599578857422, "learning_rate": 1.4182324286708422e-05, "loss": 0.4265, "step": 2090 }, { "epoch": 1.4613778705636742, "grad_norm": 9.266508102416992, "learning_rate": 1.4154488517745305e-05, "loss": 0.4199, "step": 2100 }, { "epoch": 1.4683368128044538, "grad_norm": 15.103167533874512, "learning_rate": 1.4126652748782186e-05, "loss": 0.472, "step": 2110 }, { "epoch": 1.475295755045233, "grad_norm": 13.94981861114502, "learning_rate": 1.4098816979819068e-05, "loss": 0.4681, "step": 2120 }, { "epoch": 1.4822546972860124, "grad_norm": 19.643762588500977, "learning_rate": 1.4070981210855951e-05, "loss": 0.3848, "step": 2130 }, { "epoch": 1.4892136395267919, "grad_norm": 11.58189868927002, "learning_rate": 1.4043145441892834e-05, "loss": 0.5083, "step": 2140 }, { "epoch": 1.4961725817675713, "grad_norm": 13.264250755310059, "learning_rate": 1.4015309672929715e-05, "loss": 0.45, "step": 2150 }, { "epoch": 1.5031315240083507, "grad_norm": 10.432905197143555, "learning_rate": 1.3987473903966598e-05, "loss": 0.4583, "step": 2160 }, { "epoch": 1.5100904662491301, "grad_norm": 9.850616455078125, "learning_rate": 1.3959638135003482e-05, "loss": 0.349, "step": 2170 }, { "epoch": 1.5170494084899095, "grad_norm": 14.087292671203613, "learning_rate": 1.3931802366040363e-05, "loss": 0.448, "step": 2180 }, { "epoch": 1.524008350730689, "grad_norm": 12.514032363891602, "learning_rate": 1.3903966597077246e-05, "loss": 0.514, "step": 2190 }, { "epoch": 1.5309672929714684, "grad_norm": 25.41820526123047, "learning_rate": 1.3876130828114127e-05, "loss": 0.4356, "step": 2200 }, { "epoch": 1.5379262352122476, "grad_norm": 11.849440574645996, "learning_rate": 1.3848295059151012e-05, "loss": 0.3895, "step": 2210 }, { "epoch": 1.5448851774530272, "grad_norm": 8.636540412902832, "learning_rate": 1.3820459290187893e-05, "loss": 0.481, "step": 2220 }, { "epoch": 1.5518441196938064, "grad_norm": 11.286504745483398, "learning_rate": 1.3792623521224775e-05, "loss": 0.4019, "step": 2230 }, { "epoch": 1.558803061934586, "grad_norm": 11.524672508239746, "learning_rate": 1.3764787752261656e-05, "loss": 0.4617, "step": 2240 }, { "epoch": 1.5657620041753653, "grad_norm": 11.370726585388184, "learning_rate": 1.373695198329854e-05, "loss": 0.4186, "step": 2250 }, { "epoch": 1.572720946416145, "grad_norm": 23.02247428894043, "learning_rate": 1.3709116214335422e-05, "loss": 0.4743, "step": 2260 }, { "epoch": 1.579679888656924, "grad_norm": 11.176335334777832, "learning_rate": 1.3681280445372305e-05, "loss": 0.41, "step": 2270 }, { "epoch": 1.5866388308977035, "grad_norm": 11.33989429473877, "learning_rate": 1.3653444676409186e-05, "loss": 0.4822, "step": 2280 }, { "epoch": 1.593597773138483, "grad_norm": 18.040159225463867, "learning_rate": 1.3625608907446069e-05, "loss": 0.351, "step": 2290 }, { "epoch": 1.6005567153792624, "grad_norm": 5.855461597442627, "learning_rate": 1.3597773138482953e-05, "loss": 0.4567, "step": 2300 }, { "epoch": 1.6075156576200418, "grad_norm": 10.47138500213623, "learning_rate": 1.3569937369519834e-05, "loss": 0.4875, "step": 2310 }, { "epoch": 1.6144745998608212, "grad_norm": 11.59261417388916, "learning_rate": 1.3542101600556717e-05, "loss": 0.4871, "step": 2320 }, { "epoch": 1.6214335421016006, "grad_norm": 7.732606410980225, "learning_rate": 1.3514265831593598e-05, "loss": 0.4078, "step": 2330 }, { "epoch": 1.6283924843423798, "grad_norm": 10.10660457611084, "learning_rate": 1.3486430062630482e-05, "loss": 0.3767, "step": 2340 }, { "epoch": 1.6353514265831595, "grad_norm": 10.724883079528809, "learning_rate": 1.3458594293667363e-05, "loss": 0.4102, "step": 2350 }, { "epoch": 1.6423103688239387, "grad_norm": 11.941119194030762, "learning_rate": 1.3430758524704246e-05, "loss": 0.5101, "step": 2360 }, { "epoch": 1.6492693110647183, "grad_norm": 11.30588436126709, "learning_rate": 1.3402922755741127e-05, "loss": 0.3157, "step": 2370 }, { "epoch": 1.6562282533054975, "grad_norm": 11.969123840332031, "learning_rate": 1.337508698677801e-05, "loss": 0.4568, "step": 2380 }, { "epoch": 1.663187195546277, "grad_norm": 12.086457252502441, "learning_rate": 1.3347251217814894e-05, "loss": 0.4307, "step": 2390 }, { "epoch": 1.6701461377870563, "grad_norm": 11.068685531616211, "learning_rate": 1.3319415448851776e-05, "loss": 0.4913, "step": 2400 }, { "epoch": 1.6771050800278358, "grad_norm": 7.290180206298828, "learning_rate": 1.3291579679888658e-05, "loss": 0.4157, "step": 2410 }, { "epoch": 1.6840640222686152, "grad_norm": 12.097051620483398, "learning_rate": 1.326374391092554e-05, "loss": 0.4356, "step": 2420 }, { "epoch": 1.6910229645093946, "grad_norm": 10.983007431030273, "learning_rate": 1.3235908141962424e-05, "loss": 0.4594, "step": 2430 }, { "epoch": 1.697981906750174, "grad_norm": 8.463971138000488, "learning_rate": 1.3208072372999305e-05, "loss": 0.395, "step": 2440 }, { "epoch": 1.7049408489909532, "grad_norm": 10.346870422363281, "learning_rate": 1.3180236604036188e-05, "loss": 0.436, "step": 2450 }, { "epoch": 1.7118997912317329, "grad_norm": 13.56679916381836, "learning_rate": 1.3152400835073069e-05, "loss": 0.4011, "step": 2460 }, { "epoch": 1.718858733472512, "grad_norm": 8.750198364257812, "learning_rate": 1.3124565066109953e-05, "loss": 0.4479, "step": 2470 }, { "epoch": 1.7258176757132917, "grad_norm": 8.680354118347168, "learning_rate": 1.3096729297146836e-05, "loss": 0.357, "step": 2480 }, { "epoch": 1.732776617954071, "grad_norm": 13.647467613220215, "learning_rate": 1.3068893528183717e-05, "loss": 0.4286, "step": 2490 }, { "epoch": 1.7397355601948505, "grad_norm": 8.507746696472168, "learning_rate": 1.30410577592206e-05, "loss": 0.3357, "step": 2500 }, { "epoch": 1.7466945024356297, "grad_norm": 12.067097663879395, "learning_rate": 1.3013221990257482e-05, "loss": 0.434, "step": 2510 }, { "epoch": 1.7536534446764092, "grad_norm": 9.736947059631348, "learning_rate": 1.2985386221294365e-05, "loss": 0.4228, "step": 2520 }, { "epoch": 1.7606123869171886, "grad_norm": 14.245895385742188, "learning_rate": 1.2957550452331246e-05, "loss": 0.4379, "step": 2530 }, { "epoch": 1.767571329157968, "grad_norm": 12.476272583007812, "learning_rate": 1.2929714683368129e-05, "loss": 0.4332, "step": 2540 }, { "epoch": 1.7745302713987474, "grad_norm": 13.964608192443848, "learning_rate": 1.290187891440501e-05, "loss": 0.3422, "step": 2550 }, { "epoch": 1.7814892136395268, "grad_norm": 11.833532333374023, "learning_rate": 1.2874043145441895e-05, "loss": 0.401, "step": 2560 }, { "epoch": 1.7884481558803063, "grad_norm": 10.771284103393555, "learning_rate": 1.2846207376478776e-05, "loss": 0.3829, "step": 2570 }, { "epoch": 1.7954070981210855, "grad_norm": 13.72558307647705, "learning_rate": 1.2818371607515658e-05, "loss": 0.3885, "step": 2580 }, { "epoch": 1.802366040361865, "grad_norm": 19.78665542602539, "learning_rate": 1.279053583855254e-05, "loss": 0.5615, "step": 2590 }, { "epoch": 1.8093249826026443, "grad_norm": 10.085536003112793, "learning_rate": 1.2762700069589424e-05, "loss": 0.3689, "step": 2600 }, { "epoch": 1.816283924843424, "grad_norm": 11.349889755249023, "learning_rate": 1.2734864300626307e-05, "loss": 0.3492, "step": 2610 }, { "epoch": 1.8232428670842031, "grad_norm": 15.420230865478516, "learning_rate": 1.2707028531663188e-05, "loss": 0.4059, "step": 2620 }, { "epoch": 1.8302018093249826, "grad_norm": 43.528160095214844, "learning_rate": 1.267919276270007e-05, "loss": 0.4361, "step": 2630 }, { "epoch": 1.837160751565762, "grad_norm": 8.911616325378418, "learning_rate": 1.2651356993736953e-05, "loss": 0.413, "step": 2640 }, { "epoch": 1.8441196938065414, "grad_norm": 15.205978393554688, "learning_rate": 1.2623521224773836e-05, "loss": 0.4352, "step": 2650 }, { "epoch": 1.8510786360473208, "grad_norm": 15.270347595214844, "learning_rate": 1.2595685455810717e-05, "loss": 0.5509, "step": 2660 }, { "epoch": 1.8580375782881002, "grad_norm": 7.940185546875, "learning_rate": 1.25678496868476e-05, "loss": 0.4402, "step": 2670 }, { "epoch": 1.8649965205288797, "grad_norm": 9.823007583618164, "learning_rate": 1.2540013917884481e-05, "loss": 0.4116, "step": 2680 }, { "epoch": 1.8719554627696589, "grad_norm": 14.74289321899414, "learning_rate": 1.2512178148921365e-05, "loss": 0.4503, "step": 2690 }, { "epoch": 1.8789144050104385, "grad_norm": 13.300530433654785, "learning_rate": 1.2484342379958248e-05, "loss": 0.4735, "step": 2700 }, { "epoch": 1.8858733472512177, "grad_norm": 10.028038024902344, "learning_rate": 1.245650661099513e-05, "loss": 0.4889, "step": 2710 }, { "epoch": 1.8928322894919973, "grad_norm": 13.30984878540039, "learning_rate": 1.2428670842032012e-05, "loss": 0.4714, "step": 2720 }, { "epoch": 1.8997912317327765, "grad_norm": 8.563050270080566, "learning_rate": 1.2400835073068895e-05, "loss": 0.4601, "step": 2730 }, { "epoch": 1.9067501739735562, "grad_norm": 9.030021667480469, "learning_rate": 1.2372999304105777e-05, "loss": 0.448, "step": 2740 }, { "epoch": 1.9137091162143354, "grad_norm": 11.629081726074219, "learning_rate": 1.2345163535142659e-05, "loss": 0.4628, "step": 2750 }, { "epoch": 1.9206680584551148, "grad_norm": 13.654706001281738, "learning_rate": 1.2317327766179541e-05, "loss": 0.4105, "step": 2760 }, { "epoch": 1.9276270006958942, "grad_norm": 10.076985359191895, "learning_rate": 1.2289491997216426e-05, "loss": 0.4087, "step": 2770 }, { "epoch": 1.9345859429366736, "grad_norm": 10.824203491210938, "learning_rate": 1.2261656228253307e-05, "loss": 0.3625, "step": 2780 }, { "epoch": 1.941544885177453, "grad_norm": 19.84947395324707, "learning_rate": 1.223382045929019e-05, "loss": 0.3909, "step": 2790 }, { "epoch": 1.9485038274182325, "grad_norm": 11.292709350585938, "learning_rate": 1.220598469032707e-05, "loss": 0.5342, "step": 2800 }, { "epoch": 1.955462769659012, "grad_norm": 12.195109367370605, "learning_rate": 1.2178148921363955e-05, "loss": 0.434, "step": 2810 }, { "epoch": 1.962421711899791, "grad_norm": 5.671847820281982, "learning_rate": 1.2150313152400836e-05, "loss": 0.3821, "step": 2820 }, { "epoch": 1.9693806541405707, "grad_norm": 6.8894267082214355, "learning_rate": 1.2122477383437719e-05, "loss": 0.4741, "step": 2830 }, { "epoch": 1.97633959638135, "grad_norm": 11.644760131835938, "learning_rate": 1.20946416144746e-05, "loss": 0.4485, "step": 2840 }, { "epoch": 1.9832985386221296, "grad_norm": 13.690812110900879, "learning_rate": 1.2066805845511483e-05, "loss": 0.4329, "step": 2850 }, { "epoch": 1.9902574808629088, "grad_norm": 14.693482398986816, "learning_rate": 1.2038970076548367e-05, "loss": 0.4345, "step": 2860 }, { "epoch": 1.9972164231036882, "grad_norm": 8.777437210083008, "learning_rate": 1.2011134307585248e-05, "loss": 0.4272, "step": 2870 }, { "epoch": 2.0, "eval_accuracy": 0.8186248912097476, "eval_f1": 0.8142148630689494, "eval_loss": 0.5136106610298157, "eval_precision": 0.815336261474068, "eval_recall": 0.8186248912097476, "eval_runtime": 30.487, "eval_samples_per_second": 188.441, "eval_steps_per_second": 5.904, "step": 2874 }, { "epoch": 2.0041753653444676, "grad_norm": 6.2629876136779785, "learning_rate": 1.198329853862213e-05, "loss": 0.3394, "step": 2880 }, { "epoch": 2.0111343075852472, "grad_norm": 13.670483589172363, "learning_rate": 1.1955462769659012e-05, "loss": 0.2971, "step": 2890 }, { "epoch": 2.0180932498260264, "grad_norm": 7.209113121032715, "learning_rate": 1.1927627000695896e-05, "loss": 0.3459, "step": 2900 }, { "epoch": 2.0250521920668056, "grad_norm": 14.117879867553711, "learning_rate": 1.1899791231732778e-05, "loss": 0.2936, "step": 2910 }, { "epoch": 2.0320111343075853, "grad_norm": 8.980249404907227, "learning_rate": 1.187195546276966e-05, "loss": 0.3326, "step": 2920 }, { "epoch": 2.0389700765483645, "grad_norm": 16.819644927978516, "learning_rate": 1.1844119693806541e-05, "loss": 0.3578, "step": 2930 }, { "epoch": 2.045929018789144, "grad_norm": 14.287947654724121, "learning_rate": 1.1816283924843426e-05, "loss": 0.3025, "step": 2940 }, { "epoch": 2.0528879610299233, "grad_norm": 11.339349746704102, "learning_rate": 1.1788448155880307e-05, "loss": 0.3576, "step": 2950 }, { "epoch": 2.059846903270703, "grad_norm": 7.132763862609863, "learning_rate": 1.176061238691719e-05, "loss": 0.319, "step": 2960 }, { "epoch": 2.066805845511482, "grad_norm": 10.997299194335938, "learning_rate": 1.173277661795407e-05, "loss": 0.398, "step": 2970 }, { "epoch": 2.073764787752262, "grad_norm": 27.665699005126953, "learning_rate": 1.1704940848990953e-05, "loss": 0.2853, "step": 2980 }, { "epoch": 2.080723729993041, "grad_norm": 25.032983779907227, "learning_rate": 1.1677105080027838e-05, "loss": 0.3484, "step": 2990 }, { "epoch": 2.0876826722338206, "grad_norm": 17.8544921875, "learning_rate": 1.1649269311064719e-05, "loss": 0.3496, "step": 3000 }, { "epoch": 2.0946416144746, "grad_norm": 19.761899948120117, "learning_rate": 1.1621433542101602e-05, "loss": 0.3137, "step": 3010 }, { "epoch": 2.101600556715379, "grad_norm": 21.972309112548828, "learning_rate": 1.1593597773138483e-05, "loss": 0.3503, "step": 3020 }, { "epoch": 2.1085594989561587, "grad_norm": 14.141931533813477, "learning_rate": 1.1565762004175367e-05, "loss": 0.2801, "step": 3030 }, { "epoch": 2.115518441196938, "grad_norm": 9.858434677124023, "learning_rate": 1.1537926235212248e-05, "loss": 0.2912, "step": 3040 }, { "epoch": 2.1224773834377175, "grad_norm": 11.357017517089844, "learning_rate": 1.1510090466249131e-05, "loss": 0.3761, "step": 3050 }, { "epoch": 2.1294363256784967, "grad_norm": 14.140629768371582, "learning_rate": 1.1482254697286012e-05, "loss": 0.4435, "step": 3060 }, { "epoch": 2.1363952679192764, "grad_norm": 14.739768028259277, "learning_rate": 1.1454418928322897e-05, "loss": 0.3424, "step": 3070 }, { "epoch": 2.1433542101600556, "grad_norm": 11.241192817687988, "learning_rate": 1.142658315935978e-05, "loss": 0.3119, "step": 3080 }, { "epoch": 2.150313152400835, "grad_norm": 7.299347400665283, "learning_rate": 1.139874739039666e-05, "loss": 0.2737, "step": 3090 }, { "epoch": 2.1572720946416144, "grad_norm": 12.915804862976074, "learning_rate": 1.1370911621433543e-05, "loss": 0.3366, "step": 3100 }, { "epoch": 2.164231036882394, "grad_norm": 14.415313720703125, "learning_rate": 1.1343075852470426e-05, "loss": 0.3461, "step": 3110 }, { "epoch": 2.1711899791231732, "grad_norm": 37.51091384887695, "learning_rate": 1.1315240083507309e-05, "loss": 0.2735, "step": 3120 }, { "epoch": 2.178148921363953, "grad_norm": 14.238667488098145, "learning_rate": 1.128740431454419e-05, "loss": 0.3137, "step": 3130 }, { "epoch": 2.185107863604732, "grad_norm": 10.343038558959961, "learning_rate": 1.1259568545581073e-05, "loss": 0.3305, "step": 3140 }, { "epoch": 2.1920668058455113, "grad_norm": 11.619972229003906, "learning_rate": 1.1231732776617954e-05, "loss": 0.3192, "step": 3150 }, { "epoch": 2.199025748086291, "grad_norm": 10.04326343536377, "learning_rate": 1.1203897007654838e-05, "loss": 0.3116, "step": 3160 }, { "epoch": 2.20598469032707, "grad_norm": 10.689598083496094, "learning_rate": 1.117606123869172e-05, "loss": 0.2896, "step": 3170 }, { "epoch": 2.2129436325678498, "grad_norm": 13.70692253112793, "learning_rate": 1.1148225469728602e-05, "loss": 0.3201, "step": 3180 }, { "epoch": 2.219902574808629, "grad_norm": 11.719026565551758, "learning_rate": 1.1120389700765483e-05, "loss": 0.2838, "step": 3190 }, { "epoch": 2.2268615170494086, "grad_norm": 14.697103500366211, "learning_rate": 1.1092553931802367e-05, "loss": 0.3266, "step": 3200 }, { "epoch": 2.233820459290188, "grad_norm": 9.828338623046875, "learning_rate": 1.106471816283925e-05, "loss": 0.2887, "step": 3210 }, { "epoch": 2.2407794015309674, "grad_norm": 17.478595733642578, "learning_rate": 1.1036882393876131e-05, "loss": 0.263, "step": 3220 }, { "epoch": 2.2477383437717466, "grad_norm": 12.797255516052246, "learning_rate": 1.1009046624913014e-05, "loss": 0.2799, "step": 3230 }, { "epoch": 2.2546972860125263, "grad_norm": 7.045528888702393, "learning_rate": 1.0981210855949897e-05, "loss": 0.3794, "step": 3240 }, { "epoch": 2.2616562282533055, "grad_norm": 13.09620189666748, "learning_rate": 1.095337508698678e-05, "loss": 0.2539, "step": 3250 }, { "epoch": 2.268615170494085, "grad_norm": 7.552209377288818, "learning_rate": 1.092553931802366e-05, "loss": 0.3526, "step": 3260 }, { "epoch": 2.2755741127348643, "grad_norm": 10.503962516784668, "learning_rate": 1.0897703549060543e-05, "loss": 0.3042, "step": 3270 }, { "epoch": 2.2825330549756435, "grad_norm": 9.320645332336426, "learning_rate": 1.0869867780097424e-05, "loss": 0.2384, "step": 3280 }, { "epoch": 2.289491997216423, "grad_norm": 9.707759857177734, "learning_rate": 1.0842032011134309e-05, "loss": 0.2633, "step": 3290 }, { "epoch": 2.2964509394572024, "grad_norm": 10.683955192565918, "learning_rate": 1.0814196242171192e-05, "loss": 0.2941, "step": 3300 }, { "epoch": 2.303409881697982, "grad_norm": 11.840535163879395, "learning_rate": 1.0786360473208073e-05, "loss": 0.2836, "step": 3310 }, { "epoch": 2.310368823938761, "grad_norm": 17.78310203552246, "learning_rate": 1.0758524704244955e-05, "loss": 0.317, "step": 3320 }, { "epoch": 2.317327766179541, "grad_norm": 14.615537643432617, "learning_rate": 1.0730688935281838e-05, "loss": 0.2933, "step": 3330 }, { "epoch": 2.32428670842032, "grad_norm": 14.550018310546875, "learning_rate": 1.0702853166318721e-05, "loss": 0.3915, "step": 3340 }, { "epoch": 2.3312456506610997, "grad_norm": 11.032766342163086, "learning_rate": 1.0675017397355602e-05, "loss": 0.3216, "step": 3350 }, { "epoch": 2.338204592901879, "grad_norm": 11.570281028747559, "learning_rate": 1.0647181628392485e-05, "loss": 0.3011, "step": 3360 }, { "epoch": 2.3451635351426585, "grad_norm": 8.726863861083984, "learning_rate": 1.061934585942937e-05, "loss": 0.2779, "step": 3370 }, { "epoch": 2.3521224773834377, "grad_norm": 11.70459270477295, "learning_rate": 1.059151009046625e-05, "loss": 0.3044, "step": 3380 }, { "epoch": 2.359081419624217, "grad_norm": 11.244171142578125, "learning_rate": 1.0563674321503133e-05, "loss": 0.2742, "step": 3390 }, { "epoch": 2.3660403618649966, "grad_norm": 13.98281192779541, "learning_rate": 1.0535838552540014e-05, "loss": 0.4095, "step": 3400 }, { "epoch": 2.3729993041057758, "grad_norm": 60.7244758605957, "learning_rate": 1.0508002783576897e-05, "loss": 0.2786, "step": 3410 }, { "epoch": 2.3799582463465554, "grad_norm": 17.862695693969727, "learning_rate": 1.048016701461378e-05, "loss": 0.4003, "step": 3420 }, { "epoch": 2.3869171885873346, "grad_norm": 11.725099563598633, "learning_rate": 1.0452331245650662e-05, "loss": 0.2304, "step": 3430 }, { "epoch": 2.3938761308281142, "grad_norm": 14.38791561126709, "learning_rate": 1.0424495476687543e-05, "loss": 0.3011, "step": 3440 }, { "epoch": 2.4008350730688934, "grad_norm": 17.396326065063477, "learning_rate": 1.0396659707724426e-05, "loss": 0.2762, "step": 3450 }, { "epoch": 2.407794015309673, "grad_norm": 15.41369915008545, "learning_rate": 1.036882393876131e-05, "loss": 0.318, "step": 3460 }, { "epoch": 2.4147529575504523, "grad_norm": 10.988295555114746, "learning_rate": 1.0340988169798192e-05, "loss": 0.3218, "step": 3470 }, { "epoch": 2.421711899791232, "grad_norm": 25.048797607421875, "learning_rate": 1.0313152400835074e-05, "loss": 0.3305, "step": 3480 }, { "epoch": 2.428670842032011, "grad_norm": 14.599493026733398, "learning_rate": 1.0285316631871956e-05, "loss": 0.3633, "step": 3490 }, { "epoch": 2.4356297842727903, "grad_norm": 14.409786224365234, "learning_rate": 1.025748086290884e-05, "loss": 0.3577, "step": 3500 }, { "epoch": 2.44258872651357, "grad_norm": 11.649439811706543, "learning_rate": 1.0229645093945721e-05, "loss": 0.321, "step": 3510 }, { "epoch": 2.449547668754349, "grad_norm": 20.704423904418945, "learning_rate": 1.0201809324982604e-05, "loss": 0.3077, "step": 3520 }, { "epoch": 2.456506610995129, "grad_norm": 9.154399871826172, "learning_rate": 1.0173973556019485e-05, "loss": 0.2218, "step": 3530 }, { "epoch": 2.463465553235908, "grad_norm": 10.178906440734863, "learning_rate": 1.014613778705637e-05, "loss": 0.3181, "step": 3540 }, { "epoch": 2.4704244954766876, "grad_norm": 12.843514442443848, "learning_rate": 1.011830201809325e-05, "loss": 0.2807, "step": 3550 }, { "epoch": 2.477383437717467, "grad_norm": 15.194390296936035, "learning_rate": 1.0090466249130133e-05, "loss": 0.3834, "step": 3560 }, { "epoch": 2.4843423799582465, "grad_norm": 10.255640983581543, "learning_rate": 1.0062630480167014e-05, "loss": 0.2968, "step": 3570 }, { "epoch": 2.4913013221990257, "grad_norm": 12.686639785766602, "learning_rate": 1.0034794711203897e-05, "loss": 0.3559, "step": 3580 }, { "epoch": 2.4982602644398053, "grad_norm": 18.302518844604492, "learning_rate": 1.0006958942240781e-05, "loss": 0.2919, "step": 3590 }, { "epoch": 2.5052192066805845, "grad_norm": 13.351080894470215, "learning_rate": 9.979123173277662e-06, "loss": 0.374, "step": 3600 }, { "epoch": 2.5121781489213637, "grad_norm": 11.826626777648926, "learning_rate": 9.951287404314545e-06, "loss": 0.351, "step": 3610 }, { "epoch": 2.5191370911621433, "grad_norm": 14.403546333312988, "learning_rate": 9.923451635351428e-06, "loss": 0.3448, "step": 3620 }, { "epoch": 2.526096033402923, "grad_norm": 8.75331974029541, "learning_rate": 9.895615866388309e-06, "loss": 0.3678, "step": 3630 }, { "epoch": 2.533054975643702, "grad_norm": 7.926251411437988, "learning_rate": 9.867780097425192e-06, "loss": 0.3124, "step": 3640 }, { "epoch": 2.5400139178844814, "grad_norm": 14.520807266235352, "learning_rate": 9.839944328462075e-06, "loss": 0.3685, "step": 3650 }, { "epoch": 2.546972860125261, "grad_norm": 6.630367279052734, "learning_rate": 9.812108559498957e-06, "loss": 0.4329, "step": 3660 }, { "epoch": 2.5539318023660402, "grad_norm": 16.33591651916504, "learning_rate": 9.784272790535838e-06, "loss": 0.2904, "step": 3670 }, { "epoch": 2.56089074460682, "grad_norm": 12.767754554748535, "learning_rate": 9.756437021572723e-06, "loss": 0.3424, "step": 3680 }, { "epoch": 2.567849686847599, "grad_norm": 17.801118850708008, "learning_rate": 9.728601252609604e-06, "loss": 0.3373, "step": 3690 }, { "epoch": 2.5748086290883787, "grad_norm": 12.671394348144531, "learning_rate": 9.700765483646487e-06, "loss": 0.3632, "step": 3700 }, { "epoch": 2.581767571329158, "grad_norm": 13.465224266052246, "learning_rate": 9.67292971468337e-06, "loss": 0.3065, "step": 3710 }, { "epoch": 2.588726513569937, "grad_norm": 7.1438822746276855, "learning_rate": 9.64509394572025e-06, "loss": 0.2948, "step": 3720 }, { "epoch": 2.5956854558107167, "grad_norm": 6.285761833190918, "learning_rate": 9.617258176757133e-06, "loss": 0.2847, "step": 3730 }, { "epoch": 2.6026443980514964, "grad_norm": 12.588811874389648, "learning_rate": 9.589422407794016e-06, "loss": 0.3873, "step": 3740 }, { "epoch": 2.6096033402922756, "grad_norm": 10.82616138458252, "learning_rate": 9.561586638830899e-06, "loss": 0.276, "step": 3750 }, { "epoch": 2.616562282533055, "grad_norm": 19.147323608398438, "learning_rate": 9.53375086986778e-06, "loss": 0.3486, "step": 3760 }, { "epoch": 2.6235212247738344, "grad_norm": 5.6541266441345215, "learning_rate": 9.505915100904664e-06, "loss": 0.3499, "step": 3770 }, { "epoch": 2.6304801670146136, "grad_norm": 11.496247291564941, "learning_rate": 9.478079331941545e-06, "loss": 0.3104, "step": 3780 }, { "epoch": 2.6374391092553933, "grad_norm": 13.55700397491455, "learning_rate": 9.450243562978428e-06, "loss": 0.3168, "step": 3790 }, { "epoch": 2.6443980514961725, "grad_norm": 13.626465797424316, "learning_rate": 9.422407794015311e-06, "loss": 0.3061, "step": 3800 }, { "epoch": 2.651356993736952, "grad_norm": 15.465563774108887, "learning_rate": 9.394572025052194e-06, "loss": 0.2827, "step": 3810 }, { "epoch": 2.6583159359777313, "grad_norm": 10.07729721069336, "learning_rate": 9.366736256089075e-06, "loss": 0.2696, "step": 3820 }, { "epoch": 2.665274878218511, "grad_norm": 14.677043914794922, "learning_rate": 9.338900487125957e-06, "loss": 0.3666, "step": 3830 }, { "epoch": 2.67223382045929, "grad_norm": 9.545304298400879, "learning_rate": 9.31106471816284e-06, "loss": 0.3099, "step": 3840 }, { "epoch": 2.67919276270007, "grad_norm": 13.406818389892578, "learning_rate": 9.283228949199723e-06, "loss": 0.3011, "step": 3850 }, { "epoch": 2.686151704940849, "grad_norm": 9.16269302368164, "learning_rate": 9.255393180236604e-06, "loss": 0.3376, "step": 3860 }, { "epoch": 2.693110647181628, "grad_norm": 13.706355094909668, "learning_rate": 9.227557411273487e-06, "loss": 0.3558, "step": 3870 }, { "epoch": 2.700069589422408, "grad_norm": 13.172536849975586, "learning_rate": 9.19972164231037e-06, "loss": 0.3418, "step": 3880 }, { "epoch": 2.7070285316631875, "grad_norm": 13.34077262878418, "learning_rate": 9.171885873347252e-06, "loss": 0.4049, "step": 3890 }, { "epoch": 2.7139874739039667, "grad_norm": 22.909151077270508, "learning_rate": 9.144050104384135e-06, "loss": 0.3101, "step": 3900 }, { "epoch": 2.720946416144746, "grad_norm": 10.906767845153809, "learning_rate": 9.116214335421016e-06, "loss": 0.2435, "step": 3910 }, { "epoch": 2.7279053583855255, "grad_norm": 17.20676040649414, "learning_rate": 9.088378566457899e-06, "loss": 0.3154, "step": 3920 }, { "epoch": 2.7348643006263047, "grad_norm": 12.238724708557129, "learning_rate": 9.060542797494782e-06, "loss": 0.2535, "step": 3930 }, { "epoch": 2.7418232428670843, "grad_norm": 14.290855407714844, "learning_rate": 9.032707028531664e-06, "loss": 0.32, "step": 3940 }, { "epoch": 2.7487821851078635, "grad_norm": 7.506951332092285, "learning_rate": 9.004871259568545e-06, "loss": 0.3707, "step": 3950 }, { "epoch": 2.755741127348643, "grad_norm": 9.022459030151367, "learning_rate": 8.97703549060543e-06, "loss": 0.2429, "step": 3960 }, { "epoch": 2.7627000695894224, "grad_norm": 8.920448303222656, "learning_rate": 8.949199721642311e-06, "loss": 0.2934, "step": 3970 }, { "epoch": 2.7696590118302016, "grad_norm": 7.519834995269775, "learning_rate": 8.921363952679194e-06, "loss": 0.2833, "step": 3980 }, { "epoch": 2.776617954070981, "grad_norm": 10.720945358276367, "learning_rate": 8.893528183716076e-06, "loss": 0.2456, "step": 3990 }, { "epoch": 2.783576896311761, "grad_norm": 11.830615997314453, "learning_rate": 8.865692414752958e-06, "loss": 0.2867, "step": 4000 }, { "epoch": 2.79053583855254, "grad_norm": 9.925026893615723, "learning_rate": 8.83785664578984e-06, "loss": 0.3503, "step": 4010 }, { "epoch": 2.7974947807933193, "grad_norm": 9.181790351867676, "learning_rate": 8.810020876826723e-06, "loss": 0.3396, "step": 4020 }, { "epoch": 2.804453723034099, "grad_norm": 17.847026824951172, "learning_rate": 8.782185107863606e-06, "loss": 0.3592, "step": 4030 }, { "epoch": 2.811412665274878, "grad_norm": 14.639543533325195, "learning_rate": 8.754349338900487e-06, "loss": 0.2279, "step": 4040 }, { "epoch": 2.8183716075156577, "grad_norm": 14.787379264831543, "learning_rate": 8.72651356993737e-06, "loss": 0.298, "step": 4050 }, { "epoch": 2.825330549756437, "grad_norm": 9.879755020141602, "learning_rate": 8.698677800974252e-06, "loss": 0.3329, "step": 4060 }, { "epoch": 2.8322894919972166, "grad_norm": 8.337702751159668, "learning_rate": 8.670842032011135e-06, "loss": 0.3393, "step": 4070 }, { "epoch": 2.8392484342379958, "grad_norm": 11.59692668914795, "learning_rate": 8.643006263048018e-06, "loss": 0.2697, "step": 4080 }, { "epoch": 2.846207376478775, "grad_norm": 22.700538635253906, "learning_rate": 8.6151704940849e-06, "loss": 0.3444, "step": 4090 }, { "epoch": 2.8531663187195546, "grad_norm": 13.9461088180542, "learning_rate": 8.587334725121782e-06, "loss": 0.2616, "step": 4100 }, { "epoch": 2.8601252609603343, "grad_norm": 9.75053882598877, "learning_rate": 8.559498956158664e-06, "loss": 0.3219, "step": 4110 }, { "epoch": 2.8670842032011135, "grad_norm": 11.127705574035645, "learning_rate": 8.531663187195547e-06, "loss": 0.3238, "step": 4120 }, { "epoch": 2.8740431454418927, "grad_norm": 13.097844123840332, "learning_rate": 8.50382741823243e-06, "loss": 0.3177, "step": 4130 }, { "epoch": 2.8810020876826723, "grad_norm": 11.675921440124512, "learning_rate": 8.475991649269311e-06, "loss": 0.2872, "step": 4140 }, { "epoch": 2.8879610299234515, "grad_norm": 9.369670867919922, "learning_rate": 8.448155880306194e-06, "loss": 0.2693, "step": 4150 }, { "epoch": 2.894919972164231, "grad_norm": 8.535505294799805, "learning_rate": 8.420320111343077e-06, "loss": 0.2828, "step": 4160 }, { "epoch": 2.9018789144050103, "grad_norm": 11.415098190307617, "learning_rate": 8.392484342379958e-06, "loss": 0.341, "step": 4170 }, { "epoch": 2.90883785664579, "grad_norm": 19.970497131347656, "learning_rate": 8.364648573416842e-06, "loss": 0.3472, "step": 4180 }, { "epoch": 2.915796798886569, "grad_norm": 6.632875919342041, "learning_rate": 8.336812804453723e-06, "loss": 0.317, "step": 4190 }, { "epoch": 2.9227557411273484, "grad_norm": 18.783174514770508, "learning_rate": 8.308977035490606e-06, "loss": 0.3287, "step": 4200 }, { "epoch": 2.929714683368128, "grad_norm": 10.871125221252441, "learning_rate": 8.281141266527489e-06, "loss": 0.2833, "step": 4210 }, { "epoch": 2.9366736256089077, "grad_norm": 16.211822509765625, "learning_rate": 8.253305497564371e-06, "loss": 0.2999, "step": 4220 }, { "epoch": 2.943632567849687, "grad_norm": 15.132637023925781, "learning_rate": 8.225469728601253e-06, "loss": 0.3001, "step": 4230 }, { "epoch": 2.950591510090466, "grad_norm": 15.456144332885742, "learning_rate": 8.197633959638135e-06, "loss": 0.3072, "step": 4240 }, { "epoch": 2.9575504523312457, "grad_norm": 12.601120948791504, "learning_rate": 8.169798190675018e-06, "loss": 0.343, "step": 4250 }, { "epoch": 2.964509394572025, "grad_norm": 18.754928588867188, "learning_rate": 8.1419624217119e-06, "loss": 0.3183, "step": 4260 }, { "epoch": 2.9714683368128045, "grad_norm": 12.473878860473633, "learning_rate": 8.114126652748784e-06, "loss": 0.3068, "step": 4270 }, { "epoch": 2.9784272790535837, "grad_norm": 15.256598472595215, "learning_rate": 8.086290883785666e-06, "loss": 0.3455, "step": 4280 }, { "epoch": 2.9853862212943634, "grad_norm": 9.27231216430664, "learning_rate": 8.058455114822547e-06, "loss": 0.352, "step": 4290 }, { "epoch": 2.9923451635351426, "grad_norm": 9.258604049682617, "learning_rate": 8.03061934585943e-06, "loss": 0.2213, "step": 4300 }, { "epoch": 2.999304105775922, "grad_norm": 14.93215560913086, "learning_rate": 8.002783576896313e-06, "loss": 0.4324, "step": 4310 }, { "epoch": 3.0, "eval_accuracy": 0.816710182767624, "eval_f1": 0.8145784658491753, "eval_loss": 0.567512571811676, "eval_precision": 0.8135975269617428, "eval_recall": 0.816710182767624, "eval_runtime": 30.2203, "eval_samples_per_second": 190.104, "eval_steps_per_second": 5.956, "step": 4311 }, { "epoch": 3.0062630480167014, "grad_norm": 18.71284294128418, "learning_rate": 7.974947807933194e-06, "loss": 0.2542, "step": 4320 }, { "epoch": 3.013221990257481, "grad_norm": 7.149287223815918, "learning_rate": 7.947112038970077e-06, "loss": 0.2366, "step": 4330 }, { "epoch": 3.0201809324982603, "grad_norm": 11.145984649658203, "learning_rate": 7.91927627000696e-06, "loss": 0.2427, "step": 4340 }, { "epoch": 3.0271398747390394, "grad_norm": 14.64748764038086, "learning_rate": 7.891440501043842e-06, "loss": 0.2473, "step": 4350 }, { "epoch": 3.034098816979819, "grad_norm": 13.893207550048828, "learning_rate": 7.863604732080723e-06, "loss": 0.2094, "step": 4360 }, { "epoch": 3.0410577592205983, "grad_norm": 22.052799224853516, "learning_rate": 7.835768963117608e-06, "loss": 0.2436, "step": 4370 }, { "epoch": 3.048016701461378, "grad_norm": 16.942176818847656, "learning_rate": 7.807933194154489e-06, "loss": 0.1888, "step": 4380 }, { "epoch": 3.054975643702157, "grad_norm": 12.624246597290039, "learning_rate": 7.780097425191372e-06, "loss": 0.2503, "step": 4390 }, { "epoch": 3.0619345859429368, "grad_norm": 6.612172603607178, "learning_rate": 7.752261656228254e-06, "loss": 0.2775, "step": 4400 }, { "epoch": 3.068893528183716, "grad_norm": 7.194397449493408, "learning_rate": 7.724425887265137e-06, "loss": 0.2143, "step": 4410 }, { "epoch": 3.0758524704244956, "grad_norm": 15.386282920837402, "learning_rate": 7.696590118302018e-06, "loss": 0.2393, "step": 4420 }, { "epoch": 3.082811412665275, "grad_norm": 4.229943752288818, "learning_rate": 7.668754349338901e-06, "loss": 0.2499, "step": 4430 }, { "epoch": 3.0897703549060545, "grad_norm": 7.80819845199585, "learning_rate": 7.640918580375784e-06, "loss": 0.2578, "step": 4440 }, { "epoch": 3.0967292971468336, "grad_norm": 3.920732259750366, "learning_rate": 7.6130828114126656e-06, "loss": 0.2114, "step": 4450 }, { "epoch": 3.1036882393876133, "grad_norm": 16.482385635375977, "learning_rate": 7.585247042449548e-06, "loss": 0.1836, "step": 4460 }, { "epoch": 3.1106471816283925, "grad_norm": 10.486527442932129, "learning_rate": 7.55741127348643e-06, "loss": 0.2508, "step": 4470 }, { "epoch": 3.1176061238691717, "grad_norm": 9.817858695983887, "learning_rate": 7.529575504523313e-06, "loss": 0.2671, "step": 4480 }, { "epoch": 3.1245650661099513, "grad_norm": 7.070506572723389, "learning_rate": 7.501739735560195e-06, "loss": 0.2899, "step": 4490 }, { "epoch": 3.1315240083507305, "grad_norm": 11.537872314453125, "learning_rate": 7.473903966597078e-06, "loss": 0.2668, "step": 4500 }, { "epoch": 3.13848295059151, "grad_norm": 14.454391479492188, "learning_rate": 7.44606819763396e-06, "loss": 0.2487, "step": 4510 }, { "epoch": 3.1454418928322894, "grad_norm": 12.986367225646973, "learning_rate": 7.418232428670843e-06, "loss": 0.3061, "step": 4520 }, { "epoch": 3.152400835073069, "grad_norm": 21.419010162353516, "learning_rate": 7.390396659707725e-06, "loss": 0.2157, "step": 4530 }, { "epoch": 3.159359777313848, "grad_norm": 19.896608352661133, "learning_rate": 7.362560890744608e-06, "loss": 0.3067, "step": 4540 }, { "epoch": 3.166318719554628, "grad_norm": 12.328235626220703, "learning_rate": 7.33472512178149e-06, "loss": 0.2303, "step": 4550 }, { "epoch": 3.173277661795407, "grad_norm": 10.288804054260254, "learning_rate": 7.3068893528183725e-06, "loss": 0.3152, "step": 4560 }, { "epoch": 3.1802366040361867, "grad_norm": 7.457220077514648, "learning_rate": 7.2790535838552544e-06, "loss": 0.2319, "step": 4570 }, { "epoch": 3.187195546276966, "grad_norm": 11.831998825073242, "learning_rate": 7.251217814892137e-06, "loss": 0.2369, "step": 4580 }, { "epoch": 3.1941544885177455, "grad_norm": 8.234902381896973, "learning_rate": 7.223382045929019e-06, "loss": 0.1987, "step": 4590 }, { "epoch": 3.2011134307585247, "grad_norm": 11.515932083129883, "learning_rate": 7.195546276965901e-06, "loss": 0.1955, "step": 4600 }, { "epoch": 3.208072372999304, "grad_norm": 13.247298240661621, "learning_rate": 7.167710508002785e-06, "loss": 0.2123, "step": 4610 }, { "epoch": 3.2150313152400836, "grad_norm": 9.564682006835938, "learning_rate": 7.139874739039666e-06, "loss": 0.2544, "step": 4620 }, { "epoch": 3.2219902574808628, "grad_norm": 16.233783721923828, "learning_rate": 7.112038970076549e-06, "loss": 0.2659, "step": 4630 }, { "epoch": 3.2289491997216424, "grad_norm": 6.909665107727051, "learning_rate": 7.084203201113431e-06, "loss": 0.1884, "step": 4640 }, { "epoch": 3.2359081419624216, "grad_norm": 13.52547836303711, "learning_rate": 7.056367432150314e-06, "loss": 0.2759, "step": 4650 }, { "epoch": 3.2428670842032012, "grad_norm": 10.246102333068848, "learning_rate": 7.028531663187196e-06, "loss": 0.2318, "step": 4660 }, { "epoch": 3.2498260264439804, "grad_norm": 25.54823875427246, "learning_rate": 7.000695894224079e-06, "loss": 0.2425, "step": 4670 }, { "epoch": 3.25678496868476, "grad_norm": 10.150367736816406, "learning_rate": 6.9728601252609605e-06, "loss": 0.2687, "step": 4680 }, { "epoch": 3.2637439109255393, "grad_norm": 17.207233428955078, "learning_rate": 6.945024356297843e-06, "loss": 0.2855, "step": 4690 }, { "epoch": 3.270702853166319, "grad_norm": 8.081562042236328, "learning_rate": 6.917188587334725e-06, "loss": 0.2238, "step": 4700 }, { "epoch": 3.277661795407098, "grad_norm": 22.442302703857422, "learning_rate": 6.889352818371609e-06, "loss": 0.2874, "step": 4710 }, { "epoch": 3.2846207376478773, "grad_norm": 5.26035213470459, "learning_rate": 6.861517049408491e-06, "loss": 0.2347, "step": 4720 }, { "epoch": 3.291579679888657, "grad_norm": 11.35543155670166, "learning_rate": 6.8336812804453735e-06, "loss": 0.2517, "step": 4730 }, { "epoch": 3.298538622129436, "grad_norm": 20.361177444458008, "learning_rate": 6.805845511482255e-06, "loss": 0.2306, "step": 4740 }, { "epoch": 3.305497564370216, "grad_norm": 21.40257453918457, "learning_rate": 6.778009742519137e-06, "loss": 0.245, "step": 4750 }, { "epoch": 3.312456506610995, "grad_norm": 15.335564613342285, "learning_rate": 6.75017397355602e-06, "loss": 0.2872, "step": 4760 }, { "epoch": 3.3194154488517746, "grad_norm": 12.894388198852539, "learning_rate": 6.722338204592902e-06, "loss": 0.2023, "step": 4770 }, { "epoch": 3.326374391092554, "grad_norm": 9.890000343322754, "learning_rate": 6.694502435629785e-06, "loss": 0.2154, "step": 4780 }, { "epoch": 3.3333333333333335, "grad_norm": 9.852010726928711, "learning_rate": 6.666666666666667e-06, "loss": 0.2223, "step": 4790 }, { "epoch": 3.3402922755741127, "grad_norm": 12.012703895568848, "learning_rate": 6.638830897703549e-06, "loss": 0.1917, "step": 4800 }, { "epoch": 3.3472512178148923, "grad_norm": 6.725717544555664, "learning_rate": 6.610995128740431e-06, "loss": 0.2387, "step": 4810 }, { "epoch": 3.3542101600556715, "grad_norm": 11.746057510375977, "learning_rate": 6.583159359777315e-06, "loss": 0.2588, "step": 4820 }, { "epoch": 3.3611691022964507, "grad_norm": 15.1249418258667, "learning_rate": 6.555323590814197e-06, "loss": 0.2504, "step": 4830 }, { "epoch": 3.3681280445372304, "grad_norm": 43.986305236816406, "learning_rate": 6.52748782185108e-06, "loss": 0.2518, "step": 4840 }, { "epoch": 3.3750869867780096, "grad_norm": 6.751053810119629, "learning_rate": 6.4996520528879615e-06, "loss": 0.2555, "step": 4850 }, { "epoch": 3.382045929018789, "grad_norm": 11.51275634765625, "learning_rate": 6.471816283924844e-06, "loss": 0.2098, "step": 4860 }, { "epoch": 3.3890048712595684, "grad_norm": 28.333683013916016, "learning_rate": 6.443980514961726e-06, "loss": 0.2588, "step": 4870 }, { "epoch": 3.395963813500348, "grad_norm": 13.346843719482422, "learning_rate": 6.416144745998609e-06, "loss": 0.2254, "step": 4880 }, { "epoch": 3.4029227557411272, "grad_norm": 14.337092399597168, "learning_rate": 6.388308977035491e-06, "loss": 0.2666, "step": 4890 }, { "epoch": 3.409881697981907, "grad_norm": 19.120765686035156, "learning_rate": 6.360473208072373e-06, "loss": 0.2897, "step": 4900 }, { "epoch": 3.416840640222686, "grad_norm": 9.88152027130127, "learning_rate": 6.332637439109256e-06, "loss": 0.272, "step": 4910 }, { "epoch": 3.4237995824634657, "grad_norm": 16.881410598754883, "learning_rate": 6.304801670146138e-06, "loss": 0.1939, "step": 4920 }, { "epoch": 3.430758524704245, "grad_norm": 9.80156421661377, "learning_rate": 6.276965901183021e-06, "loss": 0.1824, "step": 4930 }, { "epoch": 3.437717466945024, "grad_norm": 13.772383689880371, "learning_rate": 6.249130132219903e-06, "loss": 0.2546, "step": 4940 }, { "epoch": 3.4446764091858038, "grad_norm": 15.60239028930664, "learning_rate": 6.221294363256786e-06, "loss": 0.287, "step": 4950 }, { "epoch": 3.4516353514265834, "grad_norm": 13.885263442993164, "learning_rate": 6.193458594293668e-06, "loss": 0.3121, "step": 4960 }, { "epoch": 3.4585942936673626, "grad_norm": 13.832782745361328, "learning_rate": 6.16562282533055e-06, "loss": 0.1614, "step": 4970 }, { "epoch": 3.465553235908142, "grad_norm": 8.264083862304688, "learning_rate": 6.137787056367432e-06, "loss": 0.2014, "step": 4980 }, { "epoch": 3.4725121781489214, "grad_norm": 10.630083084106445, "learning_rate": 6.109951287404315e-06, "loss": 0.198, "step": 4990 }, { "epoch": 3.4794711203897006, "grad_norm": 12.914116859436035, "learning_rate": 6.082115518441197e-06, "loss": 0.27, "step": 5000 }, { "epoch": 3.4864300626304803, "grad_norm": 9.667845726013184, "learning_rate": 6.0542797494780806e-06, "loss": 0.2419, "step": 5010 }, { "epoch": 3.4933890048712595, "grad_norm": 12.074315071105957, "learning_rate": 6.0264439805149625e-06, "loss": 0.2857, "step": 5020 }, { "epoch": 3.500347947112039, "grad_norm": 15.645792007446289, "learning_rate": 5.998608211551845e-06, "loss": 0.2042, "step": 5030 }, { "epoch": 3.5073068893528183, "grad_norm": 9.472585678100586, "learning_rate": 5.970772442588727e-06, "loss": 0.1702, "step": 5040 }, { "epoch": 3.5142658315935975, "grad_norm": 11.11557674407959, "learning_rate": 5.942936673625609e-06, "loss": 0.24, "step": 5050 }, { "epoch": 3.521224773834377, "grad_norm": 6.724925518035889, "learning_rate": 5.915100904662492e-06, "loss": 0.2401, "step": 5060 }, { "epoch": 3.528183716075157, "grad_norm": 32.468055725097656, "learning_rate": 5.887265135699374e-06, "loss": 0.1586, "step": 5070 }, { "epoch": 3.535142658315936, "grad_norm": 19.15355110168457, "learning_rate": 5.8594293667362565e-06, "loss": 0.2534, "step": 5080 }, { "epoch": 3.542101600556715, "grad_norm": 5.981065273284912, "learning_rate": 5.831593597773138e-06, "loss": 0.275, "step": 5090 }, { "epoch": 3.549060542797495, "grad_norm": 13.749540328979492, "learning_rate": 5.803757828810022e-06, "loss": 0.2548, "step": 5100 }, { "epoch": 3.556019485038274, "grad_norm": 11.47478199005127, "learning_rate": 5.775922059846903e-06, "loss": 0.2089, "step": 5110 }, { "epoch": 3.5629784272790537, "grad_norm": 9.613821029663086, "learning_rate": 5.748086290883787e-06, "loss": 0.2008, "step": 5120 }, { "epoch": 3.569937369519833, "grad_norm": 9.990856170654297, "learning_rate": 5.7202505219206686e-06, "loss": 0.1772, "step": 5130 }, { "epoch": 3.5768963117606125, "grad_norm": 13.554731369018555, "learning_rate": 5.692414752957551e-06, "loss": 0.2308, "step": 5140 }, { "epoch": 3.5838552540013917, "grad_norm": 13.590909004211426, "learning_rate": 5.664578983994433e-06, "loss": 0.1847, "step": 5150 }, { "epoch": 3.5908141962421714, "grad_norm": 19.21977996826172, "learning_rate": 5.636743215031316e-06, "loss": 0.2994, "step": 5160 }, { "epoch": 3.5977731384829506, "grad_norm": 12.067795753479004, "learning_rate": 5.608907446068198e-06, "loss": 0.2212, "step": 5170 }, { "epoch": 3.60473208072373, "grad_norm": 23.037675857543945, "learning_rate": 5.581071677105081e-06, "loss": 0.2694, "step": 5180 }, { "epoch": 3.6116910229645094, "grad_norm": 7.532259464263916, "learning_rate": 5.553235908141963e-06, "loss": 0.2941, "step": 5190 }, { "epoch": 3.6186499652052886, "grad_norm": 10.377799987792969, "learning_rate": 5.5254001391788445e-06, "loss": 0.2152, "step": 5200 }, { "epoch": 3.6256089074460682, "grad_norm": 7.491756916046143, "learning_rate": 5.497564370215728e-06, "loss": 0.2326, "step": 5210 }, { "epoch": 3.632567849686848, "grad_norm": 13.305363655090332, "learning_rate": 5.46972860125261e-06, "loss": 0.1967, "step": 5220 }, { "epoch": 3.639526791927627, "grad_norm": 8.822273254394531, "learning_rate": 5.441892832289493e-06, "loss": 0.2532, "step": 5230 }, { "epoch": 3.6464857341684063, "grad_norm": 22.017900466918945, "learning_rate": 5.414057063326375e-06, "loss": 0.2643, "step": 5240 }, { "epoch": 3.653444676409186, "grad_norm": 17.09214210510254, "learning_rate": 5.3862212943632574e-06, "loss": 0.2167, "step": 5250 }, { "epoch": 3.660403618649965, "grad_norm": 14.493659973144531, "learning_rate": 5.358385525400139e-06, "loss": 0.2321, "step": 5260 }, { "epoch": 3.6673625608907447, "grad_norm": 4.279122829437256, "learning_rate": 5.330549756437022e-06, "loss": 0.228, "step": 5270 }, { "epoch": 3.674321503131524, "grad_norm": 4.165134429931641, "learning_rate": 5.302713987473904e-06, "loss": 0.2386, "step": 5280 }, { "epoch": 3.6812804453723036, "grad_norm": 15.653182983398438, "learning_rate": 5.274878218510787e-06, "loss": 0.2298, "step": 5290 }, { "epoch": 3.688239387613083, "grad_norm": 6.955724239349365, "learning_rate": 5.247042449547669e-06, "loss": 0.1807, "step": 5300 }, { "epoch": 3.695198329853862, "grad_norm": 12.580881118774414, "learning_rate": 5.219206680584552e-06, "loss": 0.2604, "step": 5310 }, { "epoch": 3.7021572720946416, "grad_norm": 9.050446510314941, "learning_rate": 5.191370911621434e-06, "loss": 0.2233, "step": 5320 }, { "epoch": 3.7091162143354213, "grad_norm": 8.741286277770996, "learning_rate": 5.163535142658317e-06, "loss": 0.223, "step": 5330 }, { "epoch": 3.7160751565762005, "grad_norm": 5.017666816711426, "learning_rate": 5.135699373695199e-06, "loss": 0.1976, "step": 5340 }, { "epoch": 3.7230340988169797, "grad_norm": 15.6959228515625, "learning_rate": 5.107863604732081e-06, "loss": 0.2077, "step": 5350 }, { "epoch": 3.7299930410577593, "grad_norm": 12.862638473510742, "learning_rate": 5.0800278357689635e-06, "loss": 0.1962, "step": 5360 }, { "epoch": 3.7369519832985385, "grad_norm": 11.374602317810059, "learning_rate": 5.0521920668058454e-06, "loss": 0.2101, "step": 5370 }, { "epoch": 3.743910925539318, "grad_norm": 25.180683135986328, "learning_rate": 5.024356297842728e-06, "loss": 0.2527, "step": 5380 }, { "epoch": 3.7508698677800973, "grad_norm": 7.092601299285889, "learning_rate": 4.996520528879611e-06, "loss": 0.249, "step": 5390 }, { "epoch": 3.757828810020877, "grad_norm": 12.866328239440918, "learning_rate": 4.968684759916494e-06, "loss": 0.1986, "step": 5400 }, { "epoch": 3.764787752261656, "grad_norm": 15.909232139587402, "learning_rate": 4.940848990953376e-06, "loss": 0.2104, "step": 5410 }, { "epoch": 3.7717466945024354, "grad_norm": 18.9605770111084, "learning_rate": 4.9130132219902575e-06, "loss": 0.2601, "step": 5420 }, { "epoch": 3.778705636743215, "grad_norm": 21.599374771118164, "learning_rate": 4.88517745302714e-06, "loss": 0.222, "step": 5430 }, { "epoch": 3.7856645789839947, "grad_norm": 16.166671752929688, "learning_rate": 4.857341684064022e-06, "loss": 0.2494, "step": 5440 }, { "epoch": 3.792623521224774, "grad_norm": 5.08117151260376, "learning_rate": 4.829505915100905e-06, "loss": 0.2836, "step": 5450 }, { "epoch": 3.799582463465553, "grad_norm": 20.023841857910156, "learning_rate": 4.801670146137788e-06, "loss": 0.2123, "step": 5460 }, { "epoch": 3.8065414057063327, "grad_norm": 17.623476028442383, "learning_rate": 4.77383437717467e-06, "loss": 0.2018, "step": 5470 }, { "epoch": 3.813500347947112, "grad_norm": 17.300357818603516, "learning_rate": 4.745998608211552e-06, "loss": 0.2031, "step": 5480 }, { "epoch": 3.8204592901878915, "grad_norm": 19.605348587036133, "learning_rate": 4.718162839248434e-06, "loss": 0.1733, "step": 5490 }, { "epoch": 3.8274182324286707, "grad_norm": 13.359166145324707, "learning_rate": 4.690327070285317e-06, "loss": 0.2236, "step": 5500 }, { "epoch": 3.8343771746694504, "grad_norm": 22.73190689086914, "learning_rate": 4.6624913013222e-06, "loss": 0.2148, "step": 5510 }, { "epoch": 3.8413361169102296, "grad_norm": 14.263452529907227, "learning_rate": 4.634655532359082e-06, "loss": 0.3008, "step": 5520 }, { "epoch": 3.848295059151009, "grad_norm": 24.41339111328125, "learning_rate": 4.6068197633959645e-06, "loss": 0.2365, "step": 5530 }, { "epoch": 3.8552540013917884, "grad_norm": 10.054245948791504, "learning_rate": 4.578983994432846e-06, "loss": 0.2201, "step": 5540 }, { "epoch": 3.862212943632568, "grad_norm": 20.606000900268555, "learning_rate": 4.551148225469729e-06, "loss": 0.2901, "step": 5550 }, { "epoch": 3.8691718858733473, "grad_norm": 8.361483573913574, "learning_rate": 4.523312456506611e-06, "loss": 0.2133, "step": 5560 }, { "epoch": 3.8761308281141265, "grad_norm": 16.224584579467773, "learning_rate": 4.495476687543494e-06, "loss": 0.2648, "step": 5570 }, { "epoch": 3.883089770354906, "grad_norm": 24.251644134521484, "learning_rate": 4.467640918580376e-06, "loss": 0.2505, "step": 5580 }, { "epoch": 3.8900487125956853, "grad_norm": 10.059554100036621, "learning_rate": 4.4398051496172585e-06, "loss": 0.1715, "step": 5590 }, { "epoch": 3.897007654836465, "grad_norm": 22.54900550842285, "learning_rate": 4.41196938065414e-06, "loss": 0.1525, "step": 5600 }, { "epoch": 3.903966597077244, "grad_norm": 5.793692588806152, "learning_rate": 4.384133611691023e-06, "loss": 0.2345, "step": 5610 }, { "epoch": 3.910925539318024, "grad_norm": 19.2098445892334, "learning_rate": 4.356297842727906e-06, "loss": 0.165, "step": 5620 }, { "epoch": 3.917884481558803, "grad_norm": 11.589962005615234, "learning_rate": 4.328462073764788e-06, "loss": 0.2033, "step": 5630 }, { "epoch": 3.9248434237995826, "grad_norm": 13.061795234680176, "learning_rate": 4.300626304801671e-06, "loss": 0.2517, "step": 5640 }, { "epoch": 3.931802366040362, "grad_norm": 9.12142276763916, "learning_rate": 4.272790535838553e-06, "loss": 0.2068, "step": 5650 }, { "epoch": 3.9387613082811415, "grad_norm": 14.601790428161621, "learning_rate": 4.244954766875435e-06, "loss": 0.2864, "step": 5660 }, { "epoch": 3.9457202505219207, "grad_norm": 10.787036895751953, "learning_rate": 4.217118997912318e-06, "loss": 0.2286, "step": 5670 }, { "epoch": 3.9526791927627, "grad_norm": 11.121417045593262, "learning_rate": 4.1892832289492e-06, "loss": 0.2385, "step": 5680 }, { "epoch": 3.9596381350034795, "grad_norm": 11.553411483764648, "learning_rate": 4.161447459986083e-06, "loss": 0.2509, "step": 5690 }, { "epoch": 3.966597077244259, "grad_norm": 12.59765625, "learning_rate": 4.1336116910229655e-06, "loss": 0.2478, "step": 5700 }, { "epoch": 3.9735560194850383, "grad_norm": 10.153321266174316, "learning_rate": 4.105775922059847e-06, "loss": 0.2149, "step": 5710 }, { "epoch": 3.9805149617258175, "grad_norm": 10.688750267028809, "learning_rate": 4.077940153096729e-06, "loss": 0.2423, "step": 5720 }, { "epoch": 3.987473903966597, "grad_norm": 22.212329864501953, "learning_rate": 4.050104384133612e-06, "loss": 0.2626, "step": 5730 }, { "epoch": 3.9944328462073764, "grad_norm": 7.783158302307129, "learning_rate": 4.022268615170494e-06, "loss": 0.2033, "step": 5740 }, { "epoch": 4.0, "eval_accuracy": 0.8193211488250652, "eval_f1": 0.8180624199818877, "eval_loss": 0.6137004494667053, "eval_precision": 0.8174134517321099, "eval_recall": 0.8193211488250652, "eval_runtime": 30.262, "eval_samples_per_second": 189.842, "eval_steps_per_second": 5.948, "step": 5748 } ], "logging_steps": 10, "max_steps": 7185, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2096484874133504e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }