{ "best_global_step": 5049, "best_metric": 0.5286195286195287, "best_model_checkpoint": "./wav2vec2_accent_classification_exp2/checkpoint-5049", "epoch": 5.0, "eval_steps": 500, "global_step": 8415, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011883541295306001, "grad_norm": 13.946310997009277, "learning_rate": 1.0688836104513065e-06, "loss": 1.3423, "step": 20 }, { "epoch": 0.023767082590612002, "grad_norm": 10.987089157104492, "learning_rate": 2.2565320665083133e-06, "loss": 1.5915, "step": 40 }, { "epoch": 0.035650623885918005, "grad_norm": 9.114075660705566, "learning_rate": 3.4441805225653207e-06, "loss": 1.2294, "step": 60 }, { "epoch": 0.047534165181224004, "grad_norm": 4.089534282684326, "learning_rate": 4.631828978622328e-06, "loss": 1.6258, "step": 80 }, { "epoch": 0.059417706476530004, "grad_norm": 1.886678695678711, "learning_rate": 5.819477434679335e-06, "loss": 1.3137, "step": 100 }, { "epoch": 0.07130124777183601, "grad_norm": 4.820520401000977, "learning_rate": 6.947743467933492e-06, "loss": 1.441, "step": 120 }, { "epoch": 0.08318478906714201, "grad_norm": 8.273014068603516, "learning_rate": 8.135391923990499e-06, "loss": 1.0149, "step": 140 }, { "epoch": 0.09506833036244801, "grad_norm": 14.566327095031738, "learning_rate": 9.263657957244656e-06, "loss": 1.9487, "step": 160 }, { "epoch": 0.10695187165775401, "grad_norm": 8.97884750366211, "learning_rate": 1.0451306413301664e-05, "loss": 1.6529, "step": 180 }, { "epoch": 0.11883541295306001, "grad_norm": 9.661385536193848, "learning_rate": 1.163895486935867e-05, "loss": 1.5005, "step": 200 }, { "epoch": 0.13071895424836602, "grad_norm": 25.764766693115234, "learning_rate": 1.2826603325415679e-05, "loss": 1.1438, "step": 220 }, { "epoch": 0.14260249554367202, "grad_norm": 6.792453289031982, "learning_rate": 1.4014251781472683e-05, "loss": 1.391, "step": 240 }, { "epoch": 0.15448603683897802, "grad_norm": 20.50897979736328, "learning_rate": 1.5201900237529693e-05, "loss": 1.5084, "step": 260 }, { "epoch": 0.16636957813428402, "grad_norm": 16.218360900878906, "learning_rate": 1.63895486935867e-05, "loss": 0.9483, "step": 280 }, { "epoch": 0.17825311942959002, "grad_norm": 42.791805267333984, "learning_rate": 1.7577197149643705e-05, "loss": 1.4542, "step": 300 }, { "epoch": 0.19013666072489602, "grad_norm": 8.093267440795898, "learning_rate": 1.876484560570071e-05, "loss": 0.9734, "step": 320 }, { "epoch": 0.20202020202020202, "grad_norm": 11.891115188598633, "learning_rate": 1.995249406175772e-05, "loss": 2.0537, "step": 340 }, { "epoch": 0.21390374331550802, "grad_norm": 8.027478218078613, "learning_rate": 2.114014251781473e-05, "loss": 1.4334, "step": 360 }, { "epoch": 0.22578728461081402, "grad_norm": 0.1833440214395523, "learning_rate": 2.2327790973871736e-05, "loss": 1.261, "step": 380 }, { "epoch": 0.23767082590612001, "grad_norm": 13.981164932250977, "learning_rate": 2.3515439429928742e-05, "loss": 1.1544, "step": 400 }, { "epoch": 0.24955436720142601, "grad_norm": 10.806032180786133, "learning_rate": 2.4703087885985748e-05, "loss": 1.3147, "step": 420 }, { "epoch": 0.26143790849673204, "grad_norm": 0.24756519496440887, "learning_rate": 2.5890736342042754e-05, "loss": 1.563, "step": 440 }, { "epoch": 0.27332144979203804, "grad_norm": 5.813249588012695, "learning_rate": 2.7078384798099763e-05, "loss": 1.123, "step": 460 }, { "epoch": 0.28520499108734404, "grad_norm": 0.6889862418174744, "learning_rate": 2.826603325415677e-05, "loss": 1.4189, "step": 480 }, { "epoch": 0.29708853238265004, "grad_norm": 16.024629592895508, "learning_rate": 2.9453681710213776e-05, "loss": 1.6982, "step": 500 }, { "epoch": 0.30897207367795604, "grad_norm": 14.917092323303223, "learning_rate": 3.064133016627079e-05, "loss": 1.6073, "step": 520 }, { "epoch": 0.32085561497326204, "grad_norm": 3.446753978729248, "learning_rate": 3.1828978622327794e-05, "loss": 1.6064, "step": 540 }, { "epoch": 0.33273915626856804, "grad_norm": 8.022488594055176, "learning_rate": 3.30166270783848e-05, "loss": 1.6411, "step": 560 }, { "epoch": 0.34462269756387404, "grad_norm": 6.750226974487305, "learning_rate": 3.4204275534441806e-05, "loss": 1.6224, "step": 580 }, { "epoch": 0.35650623885918004, "grad_norm": 8.452493667602539, "learning_rate": 3.539192399049881e-05, "loss": 1.906, "step": 600 }, { "epoch": 0.36838978015448604, "grad_norm": 2.7862751483917236, "learning_rate": 3.657957244655582e-05, "loss": 1.2242, "step": 620 }, { "epoch": 0.38027332144979203, "grad_norm": 9.983319282531738, "learning_rate": 3.7767220902612825e-05, "loss": 0.9727, "step": 640 }, { "epoch": 0.39215686274509803, "grad_norm": 18.746906280517578, "learning_rate": 3.895486935866984e-05, "loss": 1.4356, "step": 660 }, { "epoch": 0.40404040404040403, "grad_norm": 6.126219749450684, "learning_rate": 4.0142517814726843e-05, "loss": 1.354, "step": 680 }, { "epoch": 0.41592394533571003, "grad_norm": 5.892016410827637, "learning_rate": 4.133016627078385e-05, "loss": 1.2593, "step": 700 }, { "epoch": 0.42780748663101603, "grad_norm": 36.41590881347656, "learning_rate": 4.2517814726840856e-05, "loss": 1.6788, "step": 720 }, { "epoch": 0.43969102792632203, "grad_norm": 7.916555881500244, "learning_rate": 4.370546318289787e-05, "loss": 1.2726, "step": 740 }, { "epoch": 0.45157456922162803, "grad_norm": 13.21894359588623, "learning_rate": 4.4893111638954874e-05, "loss": 1.2754, "step": 760 }, { "epoch": 0.46345811051693403, "grad_norm": 15.568081855773926, "learning_rate": 4.6080760095011874e-05, "loss": 1.4403, "step": 780 }, { "epoch": 0.47534165181224003, "grad_norm": 0.22334127128124237, "learning_rate": 4.7268408551068886e-05, "loss": 1.1967, "step": 800 }, { "epoch": 0.48722519310754603, "grad_norm": 3.894404888153076, "learning_rate": 4.845605700712589e-05, "loss": 1.3669, "step": 820 }, { "epoch": 0.49910873440285203, "grad_norm": 0.42559102177619934, "learning_rate": 4.96437054631829e-05, "loss": 1.4207, "step": 840 }, { "epoch": 0.5109922756981581, "grad_norm": 2.018228769302368, "learning_rate": 4.9907566354152915e-05, "loss": 1.5397, "step": 860 }, { "epoch": 0.5228758169934641, "grad_norm": 4.865698337554932, "learning_rate": 4.977551828865707e-05, "loss": 0.9683, "step": 880 }, { "epoch": 0.5347593582887701, "grad_norm": 13.285181999206543, "learning_rate": 4.964347022316123e-05, "loss": 1.0565, "step": 900 }, { "epoch": 0.5466428995840761, "grad_norm": 18.873655319213867, "learning_rate": 4.951142215766539e-05, "loss": 1.2446, "step": 920 }, { "epoch": 0.5585264408793821, "grad_norm": 7.773611068725586, "learning_rate": 4.937937409216955e-05, "loss": 1.1926, "step": 940 }, { "epoch": 0.5704099821746881, "grad_norm": 32.8792610168457, "learning_rate": 4.9247326026673714e-05, "loss": 1.4732, "step": 960 }, { "epoch": 0.5822935234699941, "grad_norm": 7.385775566101074, "learning_rate": 4.911527796117787e-05, "loss": 1.3785, "step": 980 }, { "epoch": 0.5941770647653001, "grad_norm": 18.77931022644043, "learning_rate": 4.898322989568203e-05, "loss": 1.3192, "step": 1000 }, { "epoch": 0.6060606060606061, "grad_norm": 5.315192699432373, "learning_rate": 4.8851181830186186e-05, "loss": 1.8817, "step": 1020 }, { "epoch": 0.6179441473559121, "grad_norm": 55.71400833129883, "learning_rate": 4.871913376469035e-05, "loss": 1.2092, "step": 1040 }, { "epoch": 0.6298276886512181, "grad_norm": 11.886483192443848, "learning_rate": 4.8593688102469305e-05, "loss": 1.4677, "step": 1060 }, { "epoch": 0.6417112299465241, "grad_norm": 11.212895393371582, "learning_rate": 4.846164003697346e-05, "loss": 1.3166, "step": 1080 }, { "epoch": 0.6535947712418301, "grad_norm": 0.1047590896487236, "learning_rate": 4.832959197147762e-05, "loss": 1.1183, "step": 1100 }, { "epoch": 0.6654783125371361, "grad_norm": 15.537800788879395, "learning_rate": 4.8197543905981776e-05, "loss": 1.149, "step": 1120 }, { "epoch": 0.6773618538324421, "grad_norm": 2.474100351333618, "learning_rate": 4.806549584048594e-05, "loss": 1.0457, "step": 1140 }, { "epoch": 0.6892453951277481, "grad_norm": 8.193405151367188, "learning_rate": 4.79334477749901e-05, "loss": 1.6977, "step": 1160 }, { "epoch": 0.7011289364230541, "grad_norm": 8.54953384399414, "learning_rate": 4.780139970949426e-05, "loss": 1.1706, "step": 1180 }, { "epoch": 0.7130124777183601, "grad_norm": 1.4723117351531982, "learning_rate": 4.766935164399842e-05, "loss": 1.8051, "step": 1200 }, { "epoch": 0.7248960190136661, "grad_norm": 14.194575309753418, "learning_rate": 4.7537303578502576e-05, "loss": 1.1304, "step": 1220 }, { "epoch": 0.7367795603089721, "grad_norm": 2.2631449699401855, "learning_rate": 4.740525551300673e-05, "loss": 1.0901, "step": 1240 }, { "epoch": 0.7486631016042781, "grad_norm": 15.501215934753418, "learning_rate": 4.72732074475109e-05, "loss": 1.5244, "step": 1260 }, { "epoch": 0.7605466428995841, "grad_norm": 8.5521240234375, "learning_rate": 4.7141159382015054e-05, "loss": 1.036, "step": 1280 }, { "epoch": 0.7724301841948901, "grad_norm": 2.5163466930389404, "learning_rate": 4.700911131651922e-05, "loss": 1.1431, "step": 1300 }, { "epoch": 0.7843137254901961, "grad_norm": 6.166524887084961, "learning_rate": 4.6877063251023375e-05, "loss": 1.6139, "step": 1320 }, { "epoch": 0.7961972667855021, "grad_norm": 0.9744242429733276, "learning_rate": 4.674501518552753e-05, "loss": 1.177, "step": 1340 }, { "epoch": 0.8080808080808081, "grad_norm": 24.883136749267578, "learning_rate": 4.661296712003169e-05, "loss": 1.0183, "step": 1360 }, { "epoch": 0.8199643493761141, "grad_norm": 60.34940719604492, "learning_rate": 4.648091905453585e-05, "loss": 1.5402, "step": 1380 }, { "epoch": 0.8318478906714201, "grad_norm": 1.4948738813400269, "learning_rate": 4.634887098904001e-05, "loss": 1.8425, "step": 1400 }, { "epoch": 0.8437314319667261, "grad_norm": 16.30664825439453, "learning_rate": 4.6216822923544174e-05, "loss": 1.8361, "step": 1420 }, { "epoch": 0.8556149732620321, "grad_norm": 26.030075073242188, "learning_rate": 4.608477485804833e-05, "loss": 1.6207, "step": 1440 }, { "epoch": 0.8674985145573381, "grad_norm": 9.511061668395996, "learning_rate": 4.595272679255249e-05, "loss": 1.2074, "step": 1460 }, { "epoch": 0.8793820558526441, "grad_norm": 25.75135040283203, "learning_rate": 4.5820678727056646e-05, "loss": 1.3626, "step": 1480 }, { "epoch": 0.8912655971479501, "grad_norm": 9.004514694213867, "learning_rate": 4.568863066156081e-05, "loss": 1.0094, "step": 1500 }, { "epoch": 0.9031491384432561, "grad_norm": 15.008450508117676, "learning_rate": 4.555658259606497e-05, "loss": 0.9041, "step": 1520 }, { "epoch": 0.9150326797385621, "grad_norm": 10.19078540802002, "learning_rate": 4.542453453056913e-05, "loss": 1.2298, "step": 1540 }, { "epoch": 0.9269162210338681, "grad_norm": 6.368117809295654, "learning_rate": 4.529248646507329e-05, "loss": 1.1629, "step": 1560 }, { "epoch": 0.9387997623291741, "grad_norm": 8.672981262207031, "learning_rate": 4.5160438399577445e-05, "loss": 1.4828, "step": 1580 }, { "epoch": 0.9506833036244801, "grad_norm": 8.937565803527832, "learning_rate": 4.502839033408161e-05, "loss": 1.0707, "step": 1600 }, { "epoch": 0.9625668449197861, "grad_norm": 42.59934997558594, "learning_rate": 4.4896342268585766e-05, "loss": 1.1838, "step": 1620 }, { "epoch": 0.9744503862150921, "grad_norm": 5.572483062744141, "learning_rate": 4.476429420308993e-05, "loss": 1.3519, "step": 1640 }, { "epoch": 0.9863339275103981, "grad_norm": 15.407792091369629, "learning_rate": 4.463224613759409e-05, "loss": 1.9737, "step": 1660 }, { "epoch": 0.9982174688057041, "grad_norm": 10.246801376342773, "learning_rate": 4.450019807209825e-05, "loss": 1.3942, "step": 1680 }, { "epoch": 1.0, "eval_accuracy": 0.5084175084175084, "eval_loss": 1.4993430376052856, "eval_runtime": 95.7254, "eval_samples_per_second": 3.103, "eval_steps_per_second": 3.103, "step": 1683 }, { "epoch": 1.0101010101010102, "grad_norm": 92.40440368652344, "learning_rate": 4.436815000660241e-05, "loss": 1.5887, "step": 1700 }, { "epoch": 1.0219845513963162, "grad_norm": 18.110252380371094, "learning_rate": 4.4236101941106565e-05, "loss": 1.8026, "step": 1720 }, { "epoch": 1.0338680926916222, "grad_norm": 14.221270561218262, "learning_rate": 4.410405387561072e-05, "loss": 1.3638, "step": 1740 }, { "epoch": 1.0457516339869282, "grad_norm": 4.216358661651611, "learning_rate": 4.3972005810114886e-05, "loss": 1.8443, "step": 1760 }, { "epoch": 1.0576351752822342, "grad_norm": 0.6796126365661621, "learning_rate": 4.3839957744619043e-05, "loss": 0.7499, "step": 1780 }, { "epoch": 1.0695187165775402, "grad_norm": 6.125434398651123, "learning_rate": 4.370790967912321e-05, "loss": 1.197, "step": 1800 }, { "epoch": 1.0814022578728462, "grad_norm": 1.4550594091415405, "learning_rate": 4.3575861613627364e-05, "loss": 1.7175, "step": 1820 }, { "epoch": 1.0932857991681522, "grad_norm": 9.569698333740234, "learning_rate": 4.344381354813152e-05, "loss": 1.5715, "step": 1840 }, { "epoch": 1.1051693404634582, "grad_norm": 6.054975509643555, "learning_rate": 4.331176548263568e-05, "loss": 1.5164, "step": 1860 }, { "epoch": 1.1170528817587642, "grad_norm": 66.08502197265625, "learning_rate": 4.317971741713984e-05, "loss": 1.834, "step": 1880 }, { "epoch": 1.1289364230540702, "grad_norm": 31.93483543395996, "learning_rate": 4.3047669351644e-05, "loss": 1.4792, "step": 1900 }, { "epoch": 1.1408199643493762, "grad_norm": 8.787007331848145, "learning_rate": 4.2915621286148164e-05, "loss": 1.1916, "step": 1920 }, { "epoch": 1.1527035056446822, "grad_norm": 12.01029109954834, "learning_rate": 4.278357322065232e-05, "loss": 1.4424, "step": 1940 }, { "epoch": 1.1645870469399882, "grad_norm": 10.144413948059082, "learning_rate": 4.265152515515648e-05, "loss": 1.1927, "step": 1960 }, { "epoch": 1.1764705882352942, "grad_norm": 8.053743362426758, "learning_rate": 4.2519477089660635e-05, "loss": 1.4009, "step": 1980 }, { "epoch": 1.1883541295306002, "grad_norm": 3.4341797828674316, "learning_rate": 4.23874290241648e-05, "loss": 0.8713, "step": 2000 }, { "epoch": 1.2002376708259062, "grad_norm": 8.886828422546387, "learning_rate": 4.2255380958668956e-05, "loss": 0.9202, "step": 2020 }, { "epoch": 1.2121212121212122, "grad_norm": 24.93857192993164, "learning_rate": 4.212333289317312e-05, "loss": 1.7785, "step": 2040 }, { "epoch": 1.2240047534165182, "grad_norm": 16.332698822021484, "learning_rate": 4.199128482767728e-05, "loss": 1.1573, "step": 2060 }, { "epoch": 1.2358882947118242, "grad_norm": 19.60898208618164, "learning_rate": 4.1859236762181434e-05, "loss": 1.4027, "step": 2080 }, { "epoch": 1.2477718360071302, "grad_norm": 0.7732555866241455, "learning_rate": 4.172718869668559e-05, "loss": 1.1273, "step": 2100 }, { "epoch": 1.2596553773024362, "grad_norm": 10.469820022583008, "learning_rate": 4.1595140631189755e-05, "loss": 1.1299, "step": 2120 }, { "epoch": 1.2715389185977422, "grad_norm": 33.61751174926758, "learning_rate": 4.146309256569392e-05, "loss": 1.6233, "step": 2140 }, { "epoch": 1.2834224598930482, "grad_norm": 18.64516258239746, "learning_rate": 4.1331044500198077e-05, "loss": 1.222, "step": 2160 }, { "epoch": 1.2953060011883542, "grad_norm": 31.447341918945312, "learning_rate": 4.1198996434702234e-05, "loss": 1.4424, "step": 2180 }, { "epoch": 1.3071895424836601, "grad_norm": 10.651689529418945, "learning_rate": 4.106694836920639e-05, "loss": 1.5345, "step": 2200 }, { "epoch": 1.3190730837789661, "grad_norm": 3.412811040878296, "learning_rate": 4.0934900303710555e-05, "loss": 0.9922, "step": 2220 }, { "epoch": 1.3309566250742721, "grad_norm": 14.007122039794922, "learning_rate": 4.080285223821471e-05, "loss": 0.9139, "step": 2240 }, { "epoch": 1.3428401663695781, "grad_norm": 13.190730094909668, "learning_rate": 4.0670804172718876e-05, "loss": 1.284, "step": 2260 }, { "epoch": 1.3547237076648841, "grad_norm": 0.3047619163990021, "learning_rate": 4.053875610722303e-05, "loss": 0.9321, "step": 2280 }, { "epoch": 1.3666072489601901, "grad_norm": 32.45023727416992, "learning_rate": 4.040670804172719e-05, "loss": 1.217, "step": 2300 }, { "epoch": 1.3784907902554961, "grad_norm": 15.72593879699707, "learning_rate": 4.027465997623135e-05, "loss": 1.7323, "step": 2320 }, { "epoch": 1.3903743315508021, "grad_norm": 29.2542781829834, "learning_rate": 4.014261191073551e-05, "loss": 1.2535, "step": 2340 }, { "epoch": 1.4022578728461081, "grad_norm": 23.876358032226562, "learning_rate": 4.001056384523967e-05, "loss": 1.8049, "step": 2360 }, { "epoch": 1.4141414141414141, "grad_norm": 6.0813398361206055, "learning_rate": 3.987851577974383e-05, "loss": 1.2617, "step": 2380 }, { "epoch": 1.4260249554367201, "grad_norm": 13.96455192565918, "learning_rate": 3.974646771424799e-05, "loss": 1.149, "step": 2400 }, { "epoch": 1.4379084967320261, "grad_norm": 0.3341121971607208, "learning_rate": 3.9614419648752146e-05, "loss": 0.8563, "step": 2420 }, { "epoch": 1.4497920380273321, "grad_norm": 9.068788528442383, "learning_rate": 3.9482371583256304e-05, "loss": 1.5814, "step": 2440 }, { "epoch": 1.4616755793226381, "grad_norm": 6.83055305480957, "learning_rate": 3.935032351776047e-05, "loss": 1.4922, "step": 2460 }, { "epoch": 1.4735591206179441, "grad_norm": 6.623952865600586, "learning_rate": 3.9218275452264625e-05, "loss": 1.0381, "step": 2480 }, { "epoch": 1.4854426619132501, "grad_norm": 14.515748977661133, "learning_rate": 3.908622738676879e-05, "loss": 1.5335, "step": 2500 }, { "epoch": 1.4973262032085561, "grad_norm": 30.07071876525879, "learning_rate": 3.8954179321272946e-05, "loss": 1.4818, "step": 2520 }, { "epoch": 1.5092097445038621, "grad_norm": 18.413236618041992, "learning_rate": 3.88221312557771e-05, "loss": 1.2395, "step": 2540 }, { "epoch": 1.5210932857991681, "grad_norm": 15.883363723754883, "learning_rate": 3.869008319028126e-05, "loss": 1.2161, "step": 2560 }, { "epoch": 1.5329768270944741, "grad_norm": 21.880586624145508, "learning_rate": 3.8558035124785424e-05, "loss": 0.9101, "step": 2580 }, { "epoch": 1.5448603683897801, "grad_norm": 18.103723526000977, "learning_rate": 3.842598705928958e-05, "loss": 1.4307, "step": 2600 }, { "epoch": 1.5567439096850861, "grad_norm": 36.77565002441406, "learning_rate": 3.8293938993793745e-05, "loss": 1.5081, "step": 2620 }, { "epoch": 1.5686274509803921, "grad_norm": 18.047407150268555, "learning_rate": 3.81618909282979e-05, "loss": 1.1816, "step": 2640 }, { "epoch": 1.5805109922756981, "grad_norm": 6.798829555511475, "learning_rate": 3.802984286280206e-05, "loss": 1.0369, "step": 2660 }, { "epoch": 1.5923945335710041, "grad_norm": 49.81755447387695, "learning_rate": 3.7897794797306216e-05, "loss": 1.3919, "step": 2680 }, { "epoch": 1.6042780748663101, "grad_norm": 7.157380104064941, "learning_rate": 3.776574673181038e-05, "loss": 1.1026, "step": 2700 }, { "epoch": 1.6161616161616161, "grad_norm": 10.095293998718262, "learning_rate": 3.7633698666314544e-05, "loss": 1.486, "step": 2720 }, { "epoch": 1.6280451574569221, "grad_norm": 11.767765045166016, "learning_rate": 3.75016506008187e-05, "loss": 1.5152, "step": 2740 }, { "epoch": 1.6399286987522281, "grad_norm": 7.241148471832275, "learning_rate": 3.736960253532286e-05, "loss": 1.4989, "step": 2760 }, { "epoch": 1.6518122400475341, "grad_norm": 10.613069534301758, "learning_rate": 3.7237554469827016e-05, "loss": 1.1205, "step": 2780 }, { "epoch": 1.6636957813428401, "grad_norm": 9.965812683105469, "learning_rate": 3.710550640433118e-05, "loss": 1.3987, "step": 2800 }, { "epoch": 1.6755793226381461, "grad_norm": 1.9508392810821533, "learning_rate": 3.697345833883534e-05, "loss": 0.8366, "step": 2820 }, { "epoch": 1.6874628639334521, "grad_norm": 8.022974967956543, "learning_rate": 3.68414102733395e-05, "loss": 1.481, "step": 2840 }, { "epoch": 1.6993464052287581, "grad_norm": 3.9751815795898438, "learning_rate": 3.670936220784366e-05, "loss": 1.0167, "step": 2860 }, { "epoch": 1.7112299465240641, "grad_norm": 3.7135047912597656, "learning_rate": 3.657731414234782e-05, "loss": 0.9009, "step": 2880 }, { "epoch": 1.7231134878193701, "grad_norm": 4.908830165863037, "learning_rate": 3.644526607685197e-05, "loss": 1.833, "step": 2900 }, { "epoch": 1.7349970291146761, "grad_norm": 13.40709400177002, "learning_rate": 3.6313218011356136e-05, "loss": 1.6045, "step": 2920 }, { "epoch": 1.7468805704099821, "grad_norm": 15.4568510055542, "learning_rate": 3.6187772349135085e-05, "loss": 0.8128, "step": 2940 }, { "epoch": 1.7587641117052881, "grad_norm": 23.41459083557129, "learning_rate": 3.605572428363925e-05, "loss": 1.5322, "step": 2960 }, { "epoch": 1.7706476530005941, "grad_norm": 2.598500967025757, "learning_rate": 3.5923676218143406e-05, "loss": 1.0382, "step": 2980 }, { "epoch": 1.7825311942959001, "grad_norm": 4.4699788093566895, "learning_rate": 3.579162815264756e-05, "loss": 0.8716, "step": 3000 }, { "epoch": 1.7944147355912063, "grad_norm": 12.809410095214844, "learning_rate": 3.565958008715172e-05, "loss": 1.1505, "step": 3020 }, { "epoch": 1.8062982768865123, "grad_norm": 11.521143913269043, "learning_rate": 3.5527532021655884e-05, "loss": 0.932, "step": 3040 }, { "epoch": 1.8181818181818183, "grad_norm": 0.12162753939628601, "learning_rate": 3.539548395616004e-05, "loss": 1.2314, "step": 3060 }, { "epoch": 1.8300653594771243, "grad_norm": 33.265018463134766, "learning_rate": 3.5263435890664205e-05, "loss": 1.6528, "step": 3080 }, { "epoch": 1.8419489007724303, "grad_norm": 13.268301010131836, "learning_rate": 3.513138782516836e-05, "loss": 1.7362, "step": 3100 }, { "epoch": 1.8538324420677363, "grad_norm": 2.0898756980895996, "learning_rate": 3.4999339759672526e-05, "loss": 1.0029, "step": 3120 }, { "epoch": 1.8657159833630423, "grad_norm": 23.98683738708496, "learning_rate": 3.486729169417668e-05, "loss": 1.2156, "step": 3140 }, { "epoch": 1.8775995246583483, "grad_norm": 3.1218037605285645, "learning_rate": 3.473524362868084e-05, "loss": 1.5906, "step": 3160 }, { "epoch": 1.8894830659536543, "grad_norm": 0.22381114959716797, "learning_rate": 3.4603195563185004e-05, "loss": 0.8903, "step": 3180 }, { "epoch": 1.9013666072489603, "grad_norm": 0.15086892247200012, "learning_rate": 3.447114749768916e-05, "loss": 1.9203, "step": 3200 }, { "epoch": 1.9132501485442663, "grad_norm": 4.569338798522949, "learning_rate": 3.4339099432193325e-05, "loss": 1.4692, "step": 3220 }, { "epoch": 1.9251336898395723, "grad_norm": 0.027358679100871086, "learning_rate": 3.420705136669748e-05, "loss": 0.8022, "step": 3240 }, { "epoch": 1.9370172311348783, "grad_norm": 4.4244465827941895, "learning_rate": 3.407500330120164e-05, "loss": 0.8377, "step": 3260 }, { "epoch": 1.9489007724301843, "grad_norm": 11.952341079711914, "learning_rate": 3.3942955235705797e-05, "loss": 1.3864, "step": 3280 }, { "epoch": 1.9607843137254903, "grad_norm": 26.201208114624023, "learning_rate": 3.381090717020996e-05, "loss": 1.9779, "step": 3300 }, { "epoch": 1.9726678550207963, "grad_norm": 7.364301681518555, "learning_rate": 3.367885910471412e-05, "loss": 1.2488, "step": 3320 }, { "epoch": 1.9845513963161023, "grad_norm": 0.006225454155355692, "learning_rate": 3.354681103921828e-05, "loss": 1.7714, "step": 3340 }, { "epoch": 1.9964349376114083, "grad_norm": 8.852691650390625, "learning_rate": 3.341476297372244e-05, "loss": 1.2031, "step": 3360 }, { "epoch": 2.0, "eval_accuracy": 0.4781144781144781, "eval_loss": 2.019242763519287, "eval_runtime": 95.4083, "eval_samples_per_second": 3.113, "eval_steps_per_second": 3.113, "step": 3366 }, { "epoch": 2.0083184789067143, "grad_norm": 31.287784576416016, "learning_rate": 3.3282714908226596e-05, "loss": 0.8208, "step": 3380 }, { "epoch": 2.0202020202020203, "grad_norm": 2.58017635345459, "learning_rate": 3.315066684273075e-05, "loss": 1.1056, "step": 3400 }, { "epoch": 2.0320855614973263, "grad_norm": 0.5721943974494934, "learning_rate": 3.301861877723492e-05, "loss": 1.0099, "step": 3420 }, { "epoch": 2.0439691027926323, "grad_norm": 24.783836364746094, "learning_rate": 3.2886570711739074e-05, "loss": 1.5317, "step": 3440 }, { "epoch": 2.0558526440879383, "grad_norm": 21.35218620300293, "learning_rate": 3.275452264624324e-05, "loss": 1.4263, "step": 3460 }, { "epoch": 2.0677361853832443, "grad_norm": 6.621454238891602, "learning_rate": 3.2622474580747395e-05, "loss": 1.7136, "step": 3480 }, { "epoch": 2.0796197266785503, "grad_norm": 0.5743517875671387, "learning_rate": 3.249042651525155e-05, "loss": 1.3269, "step": 3500 }, { "epoch": 2.0915032679738563, "grad_norm": 6.627630233764648, "learning_rate": 3.235837844975571e-05, "loss": 1.093, "step": 3520 }, { "epoch": 2.1033868092691623, "grad_norm": 8.044147491455078, "learning_rate": 3.222633038425987e-05, "loss": 1.6103, "step": 3540 }, { "epoch": 2.1152703505644683, "grad_norm": 5.326707363128662, "learning_rate": 3.209428231876403e-05, "loss": 0.785, "step": 3560 }, { "epoch": 2.1271538918597743, "grad_norm": 12.404735565185547, "learning_rate": 3.1962234253268194e-05, "loss": 0.6382, "step": 3580 }, { "epoch": 2.1390374331550803, "grad_norm": 20.279443740844727, "learning_rate": 3.183018618777235e-05, "loss": 1.6417, "step": 3600 }, { "epoch": 2.1509209744503863, "grad_norm": 5.098419189453125, "learning_rate": 3.169813812227651e-05, "loss": 1.1915, "step": 3620 }, { "epoch": 2.1628045157456923, "grad_norm": 14.745153427124023, "learning_rate": 3.1566090056780666e-05, "loss": 0.906, "step": 3640 }, { "epoch": 2.1746880570409983, "grad_norm": 13.50672435760498, "learning_rate": 3.143404199128483e-05, "loss": 1.318, "step": 3660 }, { "epoch": 2.1865715983363043, "grad_norm": 0.5474358797073364, "learning_rate": 3.130199392578899e-05, "loss": 1.3119, "step": 3680 }, { "epoch": 2.1984551396316103, "grad_norm": 4.46449089050293, "learning_rate": 3.116994586029315e-05, "loss": 1.263, "step": 3700 }, { "epoch": 2.2103386809269163, "grad_norm": 0.36209437251091003, "learning_rate": 3.103789779479731e-05, "loss": 1.4981, "step": 3720 }, { "epoch": 2.2222222222222223, "grad_norm": 28.142601013183594, "learning_rate": 3.0905849729301465e-05, "loss": 0.8877, "step": 3740 }, { "epoch": 2.2341057635175283, "grad_norm": 0.3299981355667114, "learning_rate": 3.077380166380563e-05, "loss": 1.261, "step": 3760 }, { "epoch": 2.2459893048128343, "grad_norm": 27.140256881713867, "learning_rate": 3.0641753598309786e-05, "loss": 1.8361, "step": 3780 }, { "epoch": 2.2578728461081403, "grad_norm": 8.372640609741211, "learning_rate": 3.0509705532813947e-05, "loss": 1.3107, "step": 3800 }, { "epoch": 2.2697563874034463, "grad_norm": 0.0998767837882042, "learning_rate": 3.0377657467318104e-05, "loss": 1.4373, "step": 3820 }, { "epoch": 2.2816399286987523, "grad_norm": 6.8200154304504395, "learning_rate": 3.0245609401822268e-05, "loss": 1.4613, "step": 3840 }, { "epoch": 2.2935234699940583, "grad_norm": 5.426070213317871, "learning_rate": 3.0113561336326425e-05, "loss": 1.4091, "step": 3860 }, { "epoch": 2.3054070112893643, "grad_norm": 8.299811363220215, "learning_rate": 2.9981513270830585e-05, "loss": 1.3953, "step": 3880 }, { "epoch": 2.3172905525846703, "grad_norm": 32.09596252441406, "learning_rate": 2.9849465205334742e-05, "loss": 0.8212, "step": 3900 }, { "epoch": 2.3291740938799763, "grad_norm": 13.691157341003418, "learning_rate": 2.9717417139838903e-05, "loss": 1.2198, "step": 3920 }, { "epoch": 2.3410576351752823, "grad_norm": 10.414209365844727, "learning_rate": 2.958536907434306e-05, "loss": 1.8699, "step": 3940 }, { "epoch": 2.3529411764705883, "grad_norm": 13.435007095336914, "learning_rate": 2.9453321008847224e-05, "loss": 1.176, "step": 3960 }, { "epoch": 2.3648247177658943, "grad_norm": 7.29554557800293, "learning_rate": 2.932127294335138e-05, "loss": 1.3829, "step": 3980 }, { "epoch": 2.3767082590612003, "grad_norm": 0.7349147200584412, "learning_rate": 2.9189224877855542e-05, "loss": 0.9339, "step": 4000 }, { "epoch": 2.3885918003565063, "grad_norm": 1.7457711696624756, "learning_rate": 2.90571768123597e-05, "loss": 1.0813, "step": 4020 }, { "epoch": 2.4004753416518123, "grad_norm": 8.853830337524414, "learning_rate": 2.892512874686386e-05, "loss": 1.2585, "step": 4040 }, { "epoch": 2.4123588829471183, "grad_norm": 8.946368217468262, "learning_rate": 2.8793080681368017e-05, "loss": 1.063, "step": 4060 }, { "epoch": 2.4242424242424243, "grad_norm": 0.7700206637382507, "learning_rate": 2.866103261587218e-05, "loss": 0.5709, "step": 4080 }, { "epoch": 2.4361259655377303, "grad_norm": 16.189891815185547, "learning_rate": 2.8528984550376338e-05, "loss": 0.898, "step": 4100 }, { "epoch": 2.4480095068330363, "grad_norm": 0.00046739584649913013, "learning_rate": 2.8396936484880498e-05, "loss": 1.4132, "step": 4120 }, { "epoch": 2.4598930481283423, "grad_norm": 30.322633743286133, "learning_rate": 2.8264888419384655e-05, "loss": 1.3733, "step": 4140 }, { "epoch": 2.4717765894236483, "grad_norm": 2.891150951385498, "learning_rate": 2.8132840353888816e-05, "loss": 0.7882, "step": 4160 }, { "epoch": 2.4836601307189543, "grad_norm": 10.033570289611816, "learning_rate": 2.8000792288392973e-05, "loss": 1.3073, "step": 4180 }, { "epoch": 2.4955436720142603, "grad_norm": 0.06040190905332565, "learning_rate": 2.7868744222897137e-05, "loss": 1.0438, "step": 4200 }, { "epoch": 2.5074272133095663, "grad_norm": 26.081911087036133, "learning_rate": 2.7736696157401294e-05, "loss": 1.3487, "step": 4220 }, { "epoch": 2.5193107546048723, "grad_norm": 6.065289497375488, "learning_rate": 2.7604648091905455e-05, "loss": 0.8482, "step": 4240 }, { "epoch": 2.5311942959001783, "grad_norm": 20.791973114013672, "learning_rate": 2.747260002640961e-05, "loss": 1.1362, "step": 4260 }, { "epoch": 2.5430778371954843, "grad_norm": 0.8581683039665222, "learning_rate": 2.7340551960913776e-05, "loss": 1.1743, "step": 4280 }, { "epoch": 2.5549613784907903, "grad_norm": 10.96738338470459, "learning_rate": 2.7208503895417936e-05, "loss": 0.6798, "step": 4300 }, { "epoch": 2.5668449197860963, "grad_norm": 7.614635467529297, "learning_rate": 2.7076455829922093e-05, "loss": 1.3763, "step": 4320 }, { "epoch": 2.5787284610814023, "grad_norm": 12.606915473937988, "learning_rate": 2.6944407764426254e-05, "loss": 0.8797, "step": 4340 }, { "epoch": 2.5906120023767083, "grad_norm": 0.8938280940055847, "learning_rate": 2.681235969893041e-05, "loss": 1.3393, "step": 4360 }, { "epoch": 2.6024955436720143, "grad_norm": 32.35802459716797, "learning_rate": 2.6680311633434575e-05, "loss": 1.1602, "step": 4380 }, { "epoch": 2.6143790849673203, "grad_norm": 8.067231178283691, "learning_rate": 2.6548263567938732e-05, "loss": 1.0959, "step": 4400 }, { "epoch": 2.6262626262626263, "grad_norm": 33.88584899902344, "learning_rate": 2.6416215502442892e-05, "loss": 2.0019, "step": 4420 }, { "epoch": 2.6381461675579323, "grad_norm": 16.35814094543457, "learning_rate": 2.628416743694705e-05, "loss": 1.257, "step": 4440 }, { "epoch": 2.6500297088532383, "grad_norm": 20.917156219482422, "learning_rate": 2.615211937145121e-05, "loss": 1.0194, "step": 4460 }, { "epoch": 2.6619132501485443, "grad_norm": 1.7137622833251953, "learning_rate": 2.6020071305955367e-05, "loss": 0.9663, "step": 4480 }, { "epoch": 2.6737967914438503, "grad_norm": 11.321816444396973, "learning_rate": 2.588802324045953e-05, "loss": 1.3196, "step": 4500 }, { "epoch": 2.6856803327391563, "grad_norm": 0.3049762547016144, "learning_rate": 2.575597517496369e-05, "loss": 1.2364, "step": 4520 }, { "epoch": 2.6975638740344623, "grad_norm": 12.133074760437012, "learning_rate": 2.562392710946785e-05, "loss": 0.6172, "step": 4540 }, { "epoch": 2.7094474153297683, "grad_norm": 17.304231643676758, "learning_rate": 2.5491879043972006e-05, "loss": 1.2715, "step": 4560 }, { "epoch": 2.7213309566250743, "grad_norm": 23.675540924072266, "learning_rate": 2.5359830978476167e-05, "loss": 1.5344, "step": 4580 }, { "epoch": 2.7332144979203803, "grad_norm": 0.7793062329292297, "learning_rate": 2.5227782912980324e-05, "loss": 1.201, "step": 4600 }, { "epoch": 2.7450980392156863, "grad_norm": 7.877832412719727, "learning_rate": 2.5095734847484488e-05, "loss": 1.1906, "step": 4620 }, { "epoch": 2.7569815805109923, "grad_norm": 16.44761085510254, "learning_rate": 2.4963686781988645e-05, "loss": 1.3473, "step": 4640 }, { "epoch": 2.7688651218062983, "grad_norm": 44.280391693115234, "learning_rate": 2.4831638716492805e-05, "loss": 1.0871, "step": 4660 }, { "epoch": 2.7807486631016043, "grad_norm": 12.280585289001465, "learning_rate": 2.4699590650996966e-05, "loss": 0.9871, "step": 4680 }, { "epoch": 2.7926322043969103, "grad_norm": 20.463937759399414, "learning_rate": 2.4567542585501123e-05, "loss": 1.2736, "step": 4700 }, { "epoch": 2.8045157456922163, "grad_norm": 16.443626403808594, "learning_rate": 2.4435494520005283e-05, "loss": 1.4343, "step": 4720 }, { "epoch": 2.8163992869875223, "grad_norm": 0.39355403184890747, "learning_rate": 2.4303446454509444e-05, "loss": 1.4741, "step": 4740 }, { "epoch": 2.8282828282828283, "grad_norm": 0.24809391796588898, "learning_rate": 2.41713983890136e-05, "loss": 1.3588, "step": 4760 }, { "epoch": 2.8401663695781343, "grad_norm": 6.478326797485352, "learning_rate": 2.403935032351776e-05, "loss": 1.2004, "step": 4780 }, { "epoch": 2.8520499108734403, "grad_norm": 7.24426794052124, "learning_rate": 2.3907302258021922e-05, "loss": 0.9857, "step": 4800 }, { "epoch": 2.8639334521687463, "grad_norm": 50.702274322509766, "learning_rate": 2.377525419252608e-05, "loss": 1.3814, "step": 4820 }, { "epoch": 2.8758169934640523, "grad_norm": 29.885473251342773, "learning_rate": 2.364320612703024e-05, "loss": 1.4847, "step": 4840 }, { "epoch": 2.8877005347593583, "grad_norm": 8.818001747131348, "learning_rate": 2.35111580615344e-05, "loss": 1.3428, "step": 4860 }, { "epoch": 2.8995840760546643, "grad_norm": 20.55769920349121, "learning_rate": 2.3379109996038558e-05, "loss": 0.917, "step": 4880 }, { "epoch": 2.9114676173499703, "grad_norm": 47.634132385253906, "learning_rate": 2.3247061930542718e-05, "loss": 1.8805, "step": 4900 }, { "epoch": 2.9233511586452763, "grad_norm": 6.875288963317871, "learning_rate": 2.311501386504688e-05, "loss": 1.2572, "step": 4920 }, { "epoch": 2.9352346999405823, "grad_norm": 14.847312927246094, "learning_rate": 2.2982965799551036e-05, "loss": 1.1894, "step": 4940 }, { "epoch": 2.9471182412358883, "grad_norm": 12.259700775146484, "learning_rate": 2.2850917734055196e-05, "loss": 1.336, "step": 4960 }, { "epoch": 2.9590017825311943, "grad_norm": 0.7911491990089417, "learning_rate": 2.2718869668559357e-05, "loss": 1.1977, "step": 4980 }, { "epoch": 2.9708853238265003, "grad_norm": 0.1997266709804535, "learning_rate": 2.2586821603063514e-05, "loss": 0.8854, "step": 5000 }, { "epoch": 2.9827688651218063, "grad_norm": 20.288911819458008, "learning_rate": 2.2454773537567674e-05, "loss": 1.7974, "step": 5020 }, { "epoch": 2.9946524064171123, "grad_norm": 9.013290405273438, "learning_rate": 2.2322725472071835e-05, "loss": 1.3532, "step": 5040 }, { "epoch": 3.0, "eval_accuracy": 0.5286195286195287, "eval_loss": 1.7492114305496216, "eval_runtime": 96.0692, "eval_samples_per_second": 3.092, "eval_steps_per_second": 3.092, "step": 5049 }, { "epoch": 3.0065359477124183, "grad_norm": 0.3140548765659332, "learning_rate": 2.2190677406575992e-05, "loss": 1.524, "step": 5060 }, { "epoch": 3.0184194890077243, "grad_norm": 13.2945556640625, "learning_rate": 2.2058629341080153e-05, "loss": 1.2853, "step": 5080 }, { "epoch": 3.0303030303030303, "grad_norm": 29.13817024230957, "learning_rate": 2.1926581275584313e-05, "loss": 0.8849, "step": 5100 }, { "epoch": 3.0421865715983363, "grad_norm": 2.024958848953247, "learning_rate": 2.179453321008847e-05, "loss": 0.812, "step": 5120 }, { "epoch": 3.0540701128936423, "grad_norm": 8.262164115905762, "learning_rate": 2.1662485144592634e-05, "loss": 1.0216, "step": 5140 }, { "epoch": 3.0659536541889483, "grad_norm": 7.723167896270752, "learning_rate": 2.1530437079096795e-05, "loss": 1.4141, "step": 5160 }, { "epoch": 3.0778371954842543, "grad_norm": 29.036544799804688, "learning_rate": 2.1398389013600952e-05, "loss": 1.3481, "step": 5180 }, { "epoch": 3.0897207367795603, "grad_norm": 21.65529441833496, "learning_rate": 2.1266340948105112e-05, "loss": 1.4193, "step": 5200 }, { "epoch": 3.1016042780748663, "grad_norm": 4.1284637451171875, "learning_rate": 2.1134292882609273e-05, "loss": 1.0835, "step": 5220 }, { "epoch": 3.1134878193701723, "grad_norm": 2.1729178428649902, "learning_rate": 2.100224481711343e-05, "loss": 0.9514, "step": 5240 }, { "epoch": 3.1253713606654783, "grad_norm": 27.18805694580078, "learning_rate": 2.087019675161759e-05, "loss": 1.1735, "step": 5260 }, { "epoch": 3.1372549019607843, "grad_norm": 28.346616744995117, "learning_rate": 2.073814868612175e-05, "loss": 1.1961, "step": 5280 }, { "epoch": 3.1491384432560903, "grad_norm": 0.26050230860710144, "learning_rate": 2.0606100620625908e-05, "loss": 1.08, "step": 5300 }, { "epoch": 3.1610219845513963, "grad_norm": 16.993898391723633, "learning_rate": 2.047405255513007e-05, "loss": 0.9361, "step": 5320 }, { "epoch": 3.1729055258467023, "grad_norm": 24.867691040039062, "learning_rate": 2.034200448963423e-05, "loss": 0.912, "step": 5340 }, { "epoch": 3.1847890671420083, "grad_norm": 21.213882446289062, "learning_rate": 2.0209956424138386e-05, "loss": 0.9956, "step": 5360 }, { "epoch": 3.1966726084373143, "grad_norm": 1.9638105630874634, "learning_rate": 2.0077908358642547e-05, "loss": 1.1497, "step": 5380 }, { "epoch": 3.2085561497326203, "grad_norm": 38.245845794677734, "learning_rate": 1.9945860293146708e-05, "loss": 1.4225, "step": 5400 }, { "epoch": 3.2204396910279263, "grad_norm": 2.6078875064849854, "learning_rate": 1.9813812227650865e-05, "loss": 1.0647, "step": 5420 }, { "epoch": 3.2323232323232323, "grad_norm": 10.162765502929688, "learning_rate": 1.9681764162155025e-05, "loss": 0.8759, "step": 5440 }, { "epoch": 3.2442067736185383, "grad_norm": 46.642784118652344, "learning_rate": 1.9549716096659186e-05, "loss": 0.8714, "step": 5460 }, { "epoch": 3.2560903149138443, "grad_norm": 13.699341773986816, "learning_rate": 1.9417668031163343e-05, "loss": 1.3097, "step": 5480 }, { "epoch": 3.2679738562091503, "grad_norm": 0.002345487242564559, "learning_rate": 1.9285619965667503e-05, "loss": 1.2136, "step": 5500 }, { "epoch": 3.2798573975044563, "grad_norm": 1.9926633834838867, "learning_rate": 1.9153571900171664e-05, "loss": 1.7481, "step": 5520 }, { "epoch": 3.2917409387997623, "grad_norm": 1.4836812019348145, "learning_rate": 1.902152383467582e-05, "loss": 1.0077, "step": 5540 }, { "epoch": 3.3036244800950683, "grad_norm": 11.53355884552002, "learning_rate": 1.888947576917998e-05, "loss": 1.0962, "step": 5560 }, { "epoch": 3.3155080213903743, "grad_norm": 0.4469129741191864, "learning_rate": 1.8757427703684142e-05, "loss": 0.5683, "step": 5580 }, { "epoch": 3.3273915626856803, "grad_norm": 13.35949993133545, "learning_rate": 1.86253796381883e-05, "loss": 1.5648, "step": 5600 }, { "epoch": 3.3392751039809863, "grad_norm": 20.45748519897461, "learning_rate": 1.849333157269246e-05, "loss": 0.9816, "step": 5620 }, { "epoch": 3.3511586452762923, "grad_norm": 19.171648025512695, "learning_rate": 1.836128350719662e-05, "loss": 1.2036, "step": 5640 }, { "epoch": 3.3630421865715983, "grad_norm": 8.639327049255371, "learning_rate": 1.8229235441700777e-05, "loss": 1.4011, "step": 5660 }, { "epoch": 3.3749257278669043, "grad_norm": 18.734729766845703, "learning_rate": 1.8097187376204938e-05, "loss": 0.9052, "step": 5680 }, { "epoch": 3.3868092691622103, "grad_norm": 8.798933982849121, "learning_rate": 1.79651393107091e-05, "loss": 1.065, "step": 5700 }, { "epoch": 3.3986928104575163, "grad_norm": 13.333514213562012, "learning_rate": 1.783309124521326e-05, "loss": 1.2257, "step": 5720 }, { "epoch": 3.4105763517528223, "grad_norm": 8.952251434326172, "learning_rate": 1.770104317971742e-05, "loss": 1.7173, "step": 5740 }, { "epoch": 3.4224598930481283, "grad_norm": 60.43721389770508, "learning_rate": 1.756899511422158e-05, "loss": 1.6395, "step": 5760 }, { "epoch": 3.4343434343434343, "grad_norm": 28.346031188964844, "learning_rate": 1.7436947048725737e-05, "loss": 0.7907, "step": 5780 }, { "epoch": 3.4462269756387403, "grad_norm": 35.44182586669922, "learning_rate": 1.7304898983229898e-05, "loss": 0.8268, "step": 5800 }, { "epoch": 3.4581105169340463, "grad_norm": 22.25836181640625, "learning_rate": 1.717285091773406e-05, "loss": 1.3629, "step": 5820 }, { "epoch": 3.4699940582293523, "grad_norm": 0.6172131299972534, "learning_rate": 1.7040802852238215e-05, "loss": 1.3379, "step": 5840 }, { "epoch": 3.4818775995246583, "grad_norm": 19.02311897277832, "learning_rate": 1.6908754786742376e-05, "loss": 1.9869, "step": 5860 }, { "epoch": 3.4937611408199643, "grad_norm": 3.069859266281128, "learning_rate": 1.6776706721246537e-05, "loss": 1.1256, "step": 5880 }, { "epoch": 3.5056446821152702, "grad_norm": 2.17348051071167, "learning_rate": 1.6644658655750694e-05, "loss": 1.6133, "step": 5900 }, { "epoch": 3.5175282234105762, "grad_norm": 0.22041022777557373, "learning_rate": 1.6512610590254854e-05, "loss": 1.0189, "step": 5920 }, { "epoch": 3.5294117647058822, "grad_norm": 20.462352752685547, "learning_rate": 1.6380562524759015e-05, "loss": 1.5619, "step": 5940 }, { "epoch": 3.5412953060011882, "grad_norm": 0.396953284740448, "learning_rate": 1.6248514459263172e-05, "loss": 0.9543, "step": 5960 }, { "epoch": 3.5531788472964942, "grad_norm": 0.23321232199668884, "learning_rate": 1.6116466393767332e-05, "loss": 1.8788, "step": 5980 }, { "epoch": 3.5650623885918002, "grad_norm": 3.4248406887054443, "learning_rate": 1.5984418328271493e-05, "loss": 1.2511, "step": 6000 }, { "epoch": 3.5769459298871062, "grad_norm": 12.430109977722168, "learning_rate": 1.585237026277565e-05, "loss": 1.1817, "step": 6020 }, { "epoch": 3.5888294711824122, "grad_norm": 4.982590675354004, "learning_rate": 1.572032219727981e-05, "loss": 0.9199, "step": 6040 }, { "epoch": 3.6007130124777182, "grad_norm": 0.2364758551120758, "learning_rate": 1.558827413178397e-05, "loss": 0.9745, "step": 6060 }, { "epoch": 3.6125965537730242, "grad_norm": 7.997034072875977, "learning_rate": 1.5456226066288128e-05, "loss": 0.6052, "step": 6080 }, { "epoch": 3.6244800950683302, "grad_norm": 0.0892493948340416, "learning_rate": 1.532417800079229e-05, "loss": 1.5061, "step": 6100 }, { "epoch": 3.6363636363636362, "grad_norm": 0.2148207426071167, "learning_rate": 1.5192129935296448e-05, "loss": 0.89, "step": 6120 }, { "epoch": 3.6482471776589422, "grad_norm": 13.237604141235352, "learning_rate": 1.5060081869800608e-05, "loss": 0.7876, "step": 6140 }, { "epoch": 3.6601307189542482, "grad_norm": 8.745626449584961, "learning_rate": 1.4928033804304767e-05, "loss": 0.8908, "step": 6160 }, { "epoch": 3.6720142602495542, "grad_norm": 13.550650596618652, "learning_rate": 1.4795985738808926e-05, "loss": 1.7108, "step": 6180 }, { "epoch": 3.6838978015448602, "grad_norm": 11.73013973236084, "learning_rate": 1.4663937673313086e-05, "loss": 1.2625, "step": 6200 }, { "epoch": 3.6957813428401662, "grad_norm": 0.0058449869975447655, "learning_rate": 1.4531889607817245e-05, "loss": 1.1066, "step": 6220 }, { "epoch": 3.7076648841354722, "grad_norm": 0.6316971778869629, "learning_rate": 1.4399841542321404e-05, "loss": 0.9687, "step": 6240 }, { "epoch": 3.7195484254307782, "grad_norm": 2.3133366107940674, "learning_rate": 1.4267793476825565e-05, "loss": 1.2709, "step": 6260 }, { "epoch": 3.7314319667260842, "grad_norm": 33.18562698364258, "learning_rate": 1.4135745411329727e-05, "loss": 1.2372, "step": 6280 }, { "epoch": 3.7433155080213902, "grad_norm": 3.1751153469085693, "learning_rate": 1.4003697345833886e-05, "loss": 1.6425, "step": 6300 }, { "epoch": 3.7551990493166962, "grad_norm": 21.756851196289062, "learning_rate": 1.3871649280338044e-05, "loss": 0.9176, "step": 6320 }, { "epoch": 3.7670825906120022, "grad_norm": 0.19835074245929718, "learning_rate": 1.3739601214842205e-05, "loss": 0.5485, "step": 6340 }, { "epoch": 3.7789661319073082, "grad_norm": 20.786287307739258, "learning_rate": 1.3607553149346364e-05, "loss": 1.8427, "step": 6360 }, { "epoch": 3.7908496732026142, "grad_norm": 3.5100600719451904, "learning_rate": 1.3475505083850523e-05, "loss": 1.1399, "step": 6380 }, { "epoch": 3.8027332144979202, "grad_norm": 30.671335220336914, "learning_rate": 1.3343457018354683e-05, "loss": 0.9599, "step": 6400 }, { "epoch": 3.8146167557932262, "grad_norm": 15.896026611328125, "learning_rate": 1.3211408952858842e-05, "loss": 0.6175, "step": 6420 }, { "epoch": 3.8265002970885322, "grad_norm": 44.605567932128906, "learning_rate": 1.3079360887363e-05, "loss": 1.1109, "step": 6440 }, { "epoch": 3.8383838383838382, "grad_norm": 3.223285675048828, "learning_rate": 1.2947312821867161e-05, "loss": 1.3965, "step": 6460 }, { "epoch": 3.8502673796791442, "grad_norm": 0.000214108542422764, "learning_rate": 1.281526475637132e-05, "loss": 1.1792, "step": 6480 }, { "epoch": 3.8621509209744502, "grad_norm": 26.38980484008789, "learning_rate": 1.2683216690875479e-05, "loss": 1.3786, "step": 6500 }, { "epoch": 3.8740344622697562, "grad_norm": 17.448007583618164, "learning_rate": 1.255116862537964e-05, "loss": 1.0316, "step": 6520 }, { "epoch": 3.8859180035650622, "grad_norm": 26.470539093017578, "learning_rate": 1.2419120559883798e-05, "loss": 1.0809, "step": 6540 }, { "epoch": 3.8978015448603682, "grad_norm": 7.585838317871094, "learning_rate": 1.2287072494387957e-05, "loss": 0.9324, "step": 6560 }, { "epoch": 3.9096850861556742, "grad_norm": 0.20393586158752441, "learning_rate": 1.2155024428892118e-05, "loss": 1.3897, "step": 6580 }, { "epoch": 3.9215686274509802, "grad_norm": 0.0758831799030304, "learning_rate": 1.2022976363396277e-05, "loss": 0.9505, "step": 6600 }, { "epoch": 3.9334521687462862, "grad_norm": 9.853219985961914, "learning_rate": 1.1890928297900435e-05, "loss": 0.7602, "step": 6620 }, { "epoch": 3.9453357100415927, "grad_norm": 25.84638214111328, "learning_rate": 1.1758880232404596e-05, "loss": 0.8738, "step": 6640 }, { "epoch": 3.9572192513368982, "grad_norm": 7.686094760894775, "learning_rate": 1.1626832166908755e-05, "loss": 0.7964, "step": 6660 }, { "epoch": 3.9691027926322047, "grad_norm": 22.53441047668457, "learning_rate": 1.1494784101412914e-05, "loss": 1.2056, "step": 6680 }, { "epoch": 3.9809863339275102, "grad_norm": 11.927616119384766, "learning_rate": 1.1362736035917074e-05, "loss": 1.2257, "step": 6700 }, { "epoch": 3.9928698752228167, "grad_norm": 26.10556983947754, "learning_rate": 1.1230687970421235e-05, "loss": 1.4232, "step": 6720 }, { "epoch": 4.0, "eval_accuracy": 0.494949494949495, "eval_loss": 1.853576421737671, "eval_runtime": 95.916, "eval_samples_per_second": 3.096, "eval_steps_per_second": 3.096, "step": 6732 }, { "epoch": 4.004753416518122, "grad_norm": 12.647480964660645, "learning_rate": 1.1098639904925393e-05, "loss": 1.668, "step": 6740 }, { "epoch": 4.016636957813429, "grad_norm": 10.921769142150879, "learning_rate": 1.0966591839429554e-05, "loss": 1.0108, "step": 6760 }, { "epoch": 4.028520499108734, "grad_norm": 19.605995178222656, "learning_rate": 1.0834543773933713e-05, "loss": 0.8394, "step": 6780 }, { "epoch": 4.040404040404041, "grad_norm": 0.858731746673584, "learning_rate": 1.0702495708437872e-05, "loss": 1.0475, "step": 6800 }, { "epoch": 4.052287581699346, "grad_norm": 17.299776077270508, "learning_rate": 1.0570447642942032e-05, "loss": 1.3388, "step": 6820 }, { "epoch": 4.064171122994653, "grad_norm": 14.995153427124023, "learning_rate": 1.0438399577446191e-05, "loss": 1.6191, "step": 6840 }, { "epoch": 4.076054664289958, "grad_norm": 7.30485200881958, "learning_rate": 1.030635151195035e-05, "loss": 0.4908, "step": 6860 }, { "epoch": 4.087938205585265, "grad_norm": 30.56113624572754, "learning_rate": 1.017430344645451e-05, "loss": 1.4799, "step": 6880 }, { "epoch": 4.09982174688057, "grad_norm": 0.30605238676071167, "learning_rate": 1.004225538095867e-05, "loss": 0.8943, "step": 6900 }, { "epoch": 4.111705288175877, "grad_norm": 0.005065273959189653, "learning_rate": 9.910207315462828e-06, "loss": 0.8102, "step": 6920 }, { "epoch": 4.123588829471182, "grad_norm": 15.29279899597168, "learning_rate": 9.778159249966989e-06, "loss": 1.6485, "step": 6940 }, { "epoch": 4.135472370766489, "grad_norm": 5.082457065582275, "learning_rate": 9.646111184471147e-06, "loss": 1.4701, "step": 6960 }, { "epoch": 4.147355912061794, "grad_norm": 24.667327880859375, "learning_rate": 9.514063118975306e-06, "loss": 0.9979, "step": 6980 }, { "epoch": 4.159239453357101, "grad_norm": 3.714982032775879, "learning_rate": 9.382015053479467e-06, "loss": 1.7571, "step": 7000 }, { "epoch": 4.171122994652406, "grad_norm": 26.196866989135742, "learning_rate": 9.249966987983627e-06, "loss": 1.0751, "step": 7020 }, { "epoch": 4.183006535947713, "grad_norm": 3.4196934700012207, "learning_rate": 9.117918922487786e-06, "loss": 1.7258, "step": 7040 }, { "epoch": 4.194890077243018, "grad_norm": 0.00030567339854314923, "learning_rate": 8.985870856991947e-06, "loss": 1.2107, "step": 7060 }, { "epoch": 4.206773618538325, "grad_norm": 32.88508224487305, "learning_rate": 8.853822791496106e-06, "loss": 1.189, "step": 7080 }, { "epoch": 4.21865715983363, "grad_norm": 2.1774134635925293, "learning_rate": 8.721774726000264e-06, "loss": 0.788, "step": 7100 }, { "epoch": 4.230540701128937, "grad_norm": 4.273499011993408, "learning_rate": 8.589726660504425e-06, "loss": 1.039, "step": 7120 }, { "epoch": 4.242424242424242, "grad_norm": 0.47934776544570923, "learning_rate": 8.457678595008584e-06, "loss": 0.9438, "step": 7140 }, { "epoch": 4.254307783719549, "grad_norm": 0.0029345829971134663, "learning_rate": 8.325630529512743e-06, "loss": 0.8263, "step": 7160 }, { "epoch": 4.266191325014854, "grad_norm": 28.229270935058594, "learning_rate": 8.193582464016903e-06, "loss": 1.4488, "step": 7180 }, { "epoch": 4.278074866310161, "grad_norm": 65.6903076171875, "learning_rate": 8.061534398521062e-06, "loss": 1.137, "step": 7200 }, { "epoch": 4.289958407605466, "grad_norm": 42.370643615722656, "learning_rate": 7.92948633302522e-06, "loss": 1.3585, "step": 7220 }, { "epoch": 4.301841948900773, "grad_norm": 3.4301371574401855, "learning_rate": 7.797438267529381e-06, "loss": 0.7765, "step": 7240 }, { "epoch": 4.313725490196078, "grad_norm": 27.960153579711914, "learning_rate": 7.66539020203354e-06, "loss": 1.119, "step": 7260 }, { "epoch": 4.325609031491385, "grad_norm": 26.195892333984375, "learning_rate": 7.533342136537701e-06, "loss": 1.0187, "step": 7280 }, { "epoch": 4.33749257278669, "grad_norm": 0.002473491011187434, "learning_rate": 7.40129407104186e-06, "loss": 1.0475, "step": 7300 }, { "epoch": 4.349376114081997, "grad_norm": 9.781648635864258, "learning_rate": 7.26924600554602e-06, "loss": 1.2822, "step": 7320 }, { "epoch": 4.361259655377302, "grad_norm": 36.30683135986328, "learning_rate": 7.137197940050179e-06, "loss": 1.569, "step": 7340 }, { "epoch": 4.373143196672609, "grad_norm": 56.16667175292969, "learning_rate": 7.0051498745543385e-06, "loss": 0.7711, "step": 7360 }, { "epoch": 4.385026737967914, "grad_norm": 29.77967071533203, "learning_rate": 6.873101809058498e-06, "loss": 0.9722, "step": 7380 }, { "epoch": 4.396910279263221, "grad_norm": 0.004187744576483965, "learning_rate": 6.741053743562657e-06, "loss": 1.1351, "step": 7400 }, { "epoch": 4.408793820558526, "grad_norm": 37.170188903808594, "learning_rate": 6.609005678066817e-06, "loss": 0.7695, "step": 7420 }, { "epoch": 4.420677361853833, "grad_norm": 0.7948021292686462, "learning_rate": 6.476957612570976e-06, "loss": 1.4337, "step": 7440 }, { "epoch": 4.432560903149138, "grad_norm": 7.071142673492432, "learning_rate": 6.344909547075135e-06, "loss": 1.1206, "step": 7460 }, { "epoch": 4.444444444444445, "grad_norm": 7.999136447906494, "learning_rate": 6.212861481579295e-06, "loss": 1.0916, "step": 7480 }, { "epoch": 4.45632798573975, "grad_norm": 0.48218631744384766, "learning_rate": 6.080813416083455e-06, "loss": 1.2054, "step": 7500 }, { "epoch": 4.468211527035057, "grad_norm": 0.760070264339447, "learning_rate": 5.948765350587614e-06, "loss": 0.5043, "step": 7520 }, { "epoch": 4.480095068330362, "grad_norm": 24.503273010253906, "learning_rate": 5.816717285091774e-06, "loss": 1.3106, "step": 7540 }, { "epoch": 4.491978609625669, "grad_norm": 1.2818807363510132, "learning_rate": 5.684669219595933e-06, "loss": 1.1969, "step": 7560 }, { "epoch": 4.503862150920974, "grad_norm": 41.00161361694336, "learning_rate": 5.5526211541000925e-06, "loss": 1.0278, "step": 7580 }, { "epoch": 4.515745692216281, "grad_norm": 16.333484649658203, "learning_rate": 5.420573088604252e-06, "loss": 0.9644, "step": 7600 }, { "epoch": 4.527629233511586, "grad_norm": 1.6461055278778076, "learning_rate": 5.288525023108412e-06, "loss": 1.2938, "step": 7620 }, { "epoch": 4.539512774806893, "grad_norm": 0.09808467328548431, "learning_rate": 5.1564769576125715e-06, "loss": 0.9411, "step": 7640 }, { "epoch": 4.551396316102198, "grad_norm": 7.122012138366699, "learning_rate": 5.024428892116731e-06, "loss": 1.114, "step": 7660 }, { "epoch": 4.563279857397505, "grad_norm": 15.417506217956543, "learning_rate": 4.89238082662089e-06, "loss": 1.4353, "step": 7680 }, { "epoch": 4.57516339869281, "grad_norm": 22.031047821044922, "learning_rate": 4.76033276112505e-06, "loss": 1.5221, "step": 7700 }, { "epoch": 4.587046939988117, "grad_norm": 6.318397521972656, "learning_rate": 4.628284695629209e-06, "loss": 0.8358, "step": 7720 }, { "epoch": 4.598930481283422, "grad_norm": 23.213071823120117, "learning_rate": 4.496236630133368e-06, "loss": 0.9493, "step": 7740 }, { "epoch": 4.610814022578729, "grad_norm": 36.51852035522461, "learning_rate": 4.364188564637528e-06, "loss": 2.1477, "step": 7760 }, { "epoch": 4.622697563874034, "grad_norm": 12.436498641967773, "learning_rate": 4.2321404991416884e-06, "loss": 0.9672, "step": 7780 }, { "epoch": 4.634581105169341, "grad_norm": 22.718385696411133, "learning_rate": 4.100092433645847e-06, "loss": 1.3351, "step": 7800 }, { "epoch": 4.646464646464646, "grad_norm": 10.148547172546387, "learning_rate": 3.968044368150007e-06, "loss": 1.2337, "step": 7820 }, { "epoch": 4.658348187759953, "grad_norm": 15.9627685546875, "learning_rate": 3.835996302654167e-06, "loss": 0.6878, "step": 7840 }, { "epoch": 4.670231729055258, "grad_norm": 13.22270393371582, "learning_rate": 3.703948237158326e-06, "loss": 1.1205, "step": 7860 }, { "epoch": 4.682115270350565, "grad_norm": 17.37915802001953, "learning_rate": 3.571900171662485e-06, "loss": 0.803, "step": 7880 }, { "epoch": 4.69399881164587, "grad_norm": 5.952159404754639, "learning_rate": 3.4398521061666444e-06, "loss": 0.6111, "step": 7900 }, { "epoch": 4.705882352941177, "grad_norm": 7.648906707763672, "learning_rate": 3.307804040670804e-06, "loss": 0.7931, "step": 7920 }, { "epoch": 4.717765894236482, "grad_norm": 5.780369281768799, "learning_rate": 3.175755975174964e-06, "loss": 0.7795, "step": 7940 }, { "epoch": 4.729649435531789, "grad_norm": 0.2420853227376938, "learning_rate": 3.0437079096791235e-06, "loss": 0.8364, "step": 7960 }, { "epoch": 4.741532976827094, "grad_norm": 0.007712378166615963, "learning_rate": 2.9116598441832827e-06, "loss": 0.7652, "step": 7980 }, { "epoch": 4.753416518122401, "grad_norm": 1.513141393661499, "learning_rate": 2.7796117786874424e-06, "loss": 0.6637, "step": 8000 }, { "epoch": 4.765300059417706, "grad_norm": 8.523009300231934, "learning_rate": 2.6475637131916017e-06, "loss": 0.7923, "step": 8020 }, { "epoch": 4.777183600713013, "grad_norm": 8.576272010803223, "learning_rate": 2.5155156476957613e-06, "loss": 0.8144, "step": 8040 }, { "epoch": 4.789067142008318, "grad_norm": 4.978365421295166, "learning_rate": 2.383467582199921e-06, "loss": 1.3135, "step": 8060 }, { "epoch": 4.800950683303625, "grad_norm": 22.845678329467773, "learning_rate": 2.2514195167040803e-06, "loss": 1.5446, "step": 8080 }, { "epoch": 4.81283422459893, "grad_norm": 4.380787372589111, "learning_rate": 2.11937145120824e-06, "loss": 0.6026, "step": 8100 }, { "epoch": 4.824717765894237, "grad_norm": 0.1694694310426712, "learning_rate": 1.9873233857123996e-06, "loss": 1.2781, "step": 8120 }, { "epoch": 4.836601307189542, "grad_norm": 105.8367691040039, "learning_rate": 1.8552753202165589e-06, "loss": 1.6788, "step": 8140 }, { "epoch": 4.848484848484849, "grad_norm": 0.12047601491212845, "learning_rate": 1.7232272547207184e-06, "loss": 1.0554, "step": 8160 }, { "epoch": 4.860368389780154, "grad_norm": 21.857023239135742, "learning_rate": 1.591179189224878e-06, "loss": 1.0381, "step": 8180 }, { "epoch": 4.872251931075461, "grad_norm": 64.79426574707031, "learning_rate": 1.4591311237290375e-06, "loss": 1.9111, "step": 8200 }, { "epoch": 4.884135472370766, "grad_norm": 11.07774829864502, "learning_rate": 1.327083058233197e-06, "loss": 1.0473, "step": 8220 }, { "epoch": 4.896019013666073, "grad_norm": 25.981216430664062, "learning_rate": 1.1950349927373565e-06, "loss": 1.2634, "step": 8240 }, { "epoch": 4.907902554961378, "grad_norm": 16.583120346069336, "learning_rate": 1.062986927241516e-06, "loss": 1.1052, "step": 8260 }, { "epoch": 4.919786096256685, "grad_norm": 48.31085968017578, "learning_rate": 9.309388617456755e-07, "loss": 1.2838, "step": 8280 }, { "epoch": 4.93166963755199, "grad_norm": 15.064263343811035, "learning_rate": 7.988907962498351e-07, "loss": 0.8866, "step": 8300 }, { "epoch": 4.943553178847297, "grad_norm": 4.631924629211426, "learning_rate": 6.668427307539944e-07, "loss": 0.9971, "step": 8320 }, { "epoch": 4.955436720142602, "grad_norm": 3.649411678314209, "learning_rate": 5.347946652581539e-07, "loss": 0.7016, "step": 8340 }, { "epoch": 4.967320261437909, "grad_norm": 49.374534606933594, "learning_rate": 4.0274659976231353e-07, "loss": 1.1992, "step": 8360 }, { "epoch": 4.979203802733214, "grad_norm": 8.74241828918457, "learning_rate": 2.70698534266473e-07, "loss": 0.5942, "step": 8380 }, { "epoch": 4.991087344028521, "grad_norm": 20.124603271484375, "learning_rate": 1.386504687706325e-07, "loss": 0.9934, "step": 8400 }, { "epoch": 5.0, "eval_accuracy": 0.4983164983164983, "eval_loss": 1.831876277923584, "eval_runtime": 97.0764, "eval_samples_per_second": 3.059, "eval_steps_per_second": 3.059, "step": 8415 } ], "logging_steps": 20, "max_steps": 8415, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1214329969311736e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }