diff --git "a/checkpoint-34364/trainer_state.json" "b/checkpoint-34364/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-34364/trainer_state.json" @@ -0,0 +1,4195 @@ +{ + "best_metric": 0.8927125930786133, + "best_model_checkpoint": "/workspace/previous_works/MedBLIP/output/MedBLIP-0005/checkpoint-34364", + "epoch": 1.80001047614059, + "eval_steps": 17182, + "global_step": 34364, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0030380807710439473, + "grad_norm": 119.46666717529297, + "learning_rate": 1.3268156424581008e-07, + "loss": 5.0724, + "step": 58 + }, + { + "epoch": 0.006076161542087895, + "grad_norm": 34.45277786254883, + "learning_rate": 3.3519553072625703e-07, + "loss": 3.6479, + "step": 116 + }, + { + "epoch": 0.009114242313131841, + "grad_norm": 14.232598304748535, + "learning_rate": 5.37709497206704e-07, + "loss": 2.2356, + "step": 174 + }, + { + "epoch": 0.01215232308417579, + "grad_norm": 10.709288597106934, + "learning_rate": 7.402234636871509e-07, + "loss": 1.9953, + "step": 232 + }, + { + "epoch": 0.015190403855219737, + "grad_norm": 7.259119987487793, + "learning_rate": 9.427374301675979e-07, + "loss": 1.8188, + "step": 290 + }, + { + "epoch": 0.018228484626263683, + "grad_norm": 6.452203273773193, + "learning_rate": 1.1452513966480447e-06, + "loss": 1.5639, + "step": 348 + }, + { + "epoch": 0.021266565397307632, + "grad_norm": 9.458000183105469, + "learning_rate": 1.3477653631284918e-06, + "loss": 1.504, + "step": 406 + }, + { + "epoch": 0.02430464616835158, + "grad_norm": 6.162489414215088, + "learning_rate": 1.5502793296089386e-06, + "loss": 1.2581, + "step": 464 + }, + { + "epoch": 0.027342726939395528, + "grad_norm": 6.684648513793945, + "learning_rate": 1.7527932960893857e-06, + "loss": 1.1713, + "step": 522 + }, + { + "epoch": 0.030380807710439474, + "grad_norm": 6.966240882873535, + "learning_rate": 1.9553072625698325e-06, + "loss": 1.0504, + "step": 580 + }, + { + "epoch": 0.03341888848148342, + "grad_norm": 6.379108905792236, + "learning_rate": 2.1578212290502796e-06, + "loss": 0.968, + "step": 638 + }, + { + "epoch": 0.036456969252527366, + "grad_norm": 5.3792619705200195, + "learning_rate": 2.3603351955307262e-06, + "loss": 0.9209, + "step": 696 + }, + { + "epoch": 0.039495050023571315, + "grad_norm": 6.413719177246094, + "learning_rate": 2.5628491620111733e-06, + "loss": 0.8723, + "step": 754 + }, + { + "epoch": 0.042533130794615265, + "grad_norm": 8.826435089111328, + "learning_rate": 2.7653631284916204e-06, + "loss": 0.8652, + "step": 812 + }, + { + "epoch": 0.045571211565659214, + "grad_norm": 6.294381618499756, + "learning_rate": 2.9678770949720674e-06, + "loss": 0.8099, + "step": 870 + }, + { + "epoch": 0.04860929233670316, + "grad_norm": 7.355430603027344, + "learning_rate": 3.170391061452514e-06, + "loss": 0.7396, + "step": 928 + }, + { + "epoch": 0.051647373107747106, + "grad_norm": 10.120753288269043, + "learning_rate": 3.372905027932961e-06, + "loss": 0.7511, + "step": 986 + }, + { + "epoch": 0.054685453878791056, + "grad_norm": 5.653336048126221, + "learning_rate": 3.575418994413408e-06, + "loss": 0.6688, + "step": 1044 + }, + { + "epoch": 0.057723534649835, + "grad_norm": 5.749114990234375, + "learning_rate": 3.7779329608938552e-06, + "loss": 0.677, + "step": 1102 + }, + { + "epoch": 0.06076161542087895, + "grad_norm": 7.65744686126709, + "learning_rate": 3.980446927374302e-06, + "loss": 0.7195, + "step": 1160 + }, + { + "epoch": 0.0637996961919229, + "grad_norm": 9.296794891357422, + "learning_rate": 4.1829608938547485e-06, + "loss": 0.7598, + "step": 1218 + }, + { + "epoch": 0.06683777696296685, + "grad_norm": 6.070080757141113, + "learning_rate": 4.385474860335196e-06, + "loss": 0.7015, + "step": 1276 + }, + { + "epoch": 0.0698758577340108, + "grad_norm": 6.638489723205566, + "learning_rate": 4.5879888268156435e-06, + "loss": 0.6778, + "step": 1334 + }, + { + "epoch": 0.07291393850505473, + "grad_norm": 9.076967239379883, + "learning_rate": 4.790502793296089e-06, + "loss": 0.5551, + "step": 1392 + }, + { + "epoch": 0.07595201927609868, + "grad_norm": 6.929805278778076, + "learning_rate": 4.993016759776537e-06, + "loss": 0.6792, + "step": 1450 + }, + { + "epoch": 0.07899010004714263, + "grad_norm": 8.316506385803223, + "learning_rate": 5.195530726256983e-06, + "loss": 0.6379, + "step": 1508 + }, + { + "epoch": 0.08202818081818658, + "grad_norm": 9.471745491027832, + "learning_rate": 5.398044692737431e-06, + "loss": 0.6242, + "step": 1566 + }, + { + "epoch": 0.08506626158923053, + "grad_norm": 6.022659778594971, + "learning_rate": 5.6005586592178775e-06, + "loss": 0.6368, + "step": 1624 + }, + { + "epoch": 0.08810434236027448, + "grad_norm": 7.15187406539917, + "learning_rate": 5.803072625698325e-06, + "loss": 0.5381, + "step": 1682 + }, + { + "epoch": 0.09114242313131843, + "grad_norm": 7.10537052154541, + "learning_rate": 6.005586592178772e-06, + "loss": 0.6885, + "step": 1740 + }, + { + "epoch": 0.09418050390236236, + "grad_norm": 5.685272216796875, + "learning_rate": 6.208100558659218e-06, + "loss": 0.6066, + "step": 1798 + }, + { + "epoch": 0.09721858467340631, + "grad_norm": 6.733754634857178, + "learning_rate": 6.410614525139666e-06, + "loss": 0.6124, + "step": 1856 + }, + { + "epoch": 0.10025666544445026, + "grad_norm": 6.112730026245117, + "learning_rate": 6.613128491620112e-06, + "loss": 0.6224, + "step": 1914 + }, + { + "epoch": 0.10329474621549421, + "grad_norm": 5.784328460693359, + "learning_rate": 6.815642458100559e-06, + "loss": 0.6289, + "step": 1972 + }, + { + "epoch": 0.10633282698653816, + "grad_norm": 9.69115924835205, + "learning_rate": 7.0181564245810065e-06, + "loss": 0.5952, + "step": 2030 + }, + { + "epoch": 0.10937090775758211, + "grad_norm": 5.509926795959473, + "learning_rate": 7.220670391061453e-06, + "loss": 0.6288, + "step": 2088 + }, + { + "epoch": 0.11240898852862606, + "grad_norm": 6.545931339263916, + "learning_rate": 7.423184357541901e-06, + "loss": 0.5243, + "step": 2146 + }, + { + "epoch": 0.11544706929967, + "grad_norm": 7.0921173095703125, + "learning_rate": 7.625698324022347e-06, + "loss": 0.5527, + "step": 2204 + }, + { + "epoch": 0.11848515007071395, + "grad_norm": 5.278844833374023, + "learning_rate": 7.828212290502794e-06, + "loss": 0.6739, + "step": 2262 + }, + { + "epoch": 0.1215232308417579, + "grad_norm": 5.799619197845459, + "learning_rate": 8.03072625698324e-06, + "loss": 0.5913, + "step": 2320 + }, + { + "epoch": 0.12456131161280184, + "grad_norm": 7.148493766784668, + "learning_rate": 8.233240223463687e-06, + "loss": 0.6117, + "step": 2378 + }, + { + "epoch": 0.1275993923838458, + "grad_norm": 5.538400650024414, + "learning_rate": 8.435754189944135e-06, + "loss": 0.5512, + "step": 2436 + }, + { + "epoch": 0.13063747315488974, + "grad_norm": 5.364485740661621, + "learning_rate": 8.638268156424582e-06, + "loss": 0.5458, + "step": 2494 + }, + { + "epoch": 0.1336755539259337, + "grad_norm": 7.3765549659729, + "learning_rate": 8.840782122905029e-06, + "loss": 0.5847, + "step": 2552 + }, + { + "epoch": 0.13671363469697764, + "grad_norm": 5.364510536193848, + "learning_rate": 9.043296089385475e-06, + "loss": 0.542, + "step": 2610 + }, + { + "epoch": 0.1397517154680216, + "grad_norm": 3.656923770904541, + "learning_rate": 9.245810055865922e-06, + "loss": 0.5528, + "step": 2668 + }, + { + "epoch": 0.14278979623906554, + "grad_norm": 3.117631435394287, + "learning_rate": 9.448324022346369e-06, + "loss": 0.5793, + "step": 2726 + }, + { + "epoch": 0.14582787701010946, + "grad_norm": 6.822358131408691, + "learning_rate": 9.650837988826817e-06, + "loss": 0.5967, + "step": 2784 + }, + { + "epoch": 0.1488659577811534, + "grad_norm": 4.629315376281738, + "learning_rate": 9.853351955307264e-06, + "loss": 0.5945, + "step": 2842 + }, + { + "epoch": 0.15190403855219736, + "grad_norm": 3.6398866176605225, + "learning_rate": 1.005586592178771e-05, + "loss": 0.6005, + "step": 2900 + }, + { + "epoch": 0.1549421193232413, + "grad_norm": 5.811204433441162, + "learning_rate": 1.0258379888268157e-05, + "loss": 0.5065, + "step": 2958 + }, + { + "epoch": 0.15798020009428526, + "grad_norm": 3.8507301807403564, + "learning_rate": 1.0460893854748604e-05, + "loss": 0.5802, + "step": 3016 + }, + { + "epoch": 0.1610182808653292, + "grad_norm": 5.666468143463135, + "learning_rate": 1.066340782122905e-05, + "loss": 0.5342, + "step": 3074 + }, + { + "epoch": 0.16405636163637316, + "grad_norm": 2.025376558303833, + "learning_rate": 1.0865921787709498e-05, + "loss": 0.5271, + "step": 3132 + }, + { + "epoch": 0.1670944424074171, + "grad_norm": 5.698912143707275, + "learning_rate": 1.1068435754189945e-05, + "loss": 0.538, + "step": 3190 + }, + { + "epoch": 0.17013252317846106, + "grad_norm": 4.067931652069092, + "learning_rate": 1.1270949720670392e-05, + "loss": 0.5601, + "step": 3248 + }, + { + "epoch": 0.173170603949505, + "grad_norm": 5.068817138671875, + "learning_rate": 1.1473463687150838e-05, + "loss": 0.5528, + "step": 3306 + }, + { + "epoch": 0.17620868472054896, + "grad_norm": 7.116920471191406, + "learning_rate": 1.1675977653631285e-05, + "loss": 0.5418, + "step": 3364 + }, + { + "epoch": 0.1792467654915929, + "grad_norm": 8.113608360290527, + "learning_rate": 1.1878491620111732e-05, + "loss": 0.5196, + "step": 3422 + }, + { + "epoch": 0.18228484626263686, + "grad_norm": 4.820245265960693, + "learning_rate": 1.208100558659218e-05, + "loss": 0.6029, + "step": 3480 + }, + { + "epoch": 0.18532292703368078, + "grad_norm": 1.1077276468276978, + "learning_rate": 1.2283519553072627e-05, + "loss": 0.5779, + "step": 3538 + }, + { + "epoch": 0.18836100780472473, + "grad_norm": 3.024624824523926, + "learning_rate": 1.2486033519553073e-05, + "loss": 0.5091, + "step": 3596 + }, + { + "epoch": 0.19139908857576868, + "grad_norm": 8.059369087219238, + "learning_rate": 1.268854748603352e-05, + "loss": 0.5012, + "step": 3654 + }, + { + "epoch": 0.19443716934681263, + "grad_norm": 3.9895098209381104, + "learning_rate": 1.2891061452513967e-05, + "loss": 0.578, + "step": 3712 + }, + { + "epoch": 0.19747525011785658, + "grad_norm": 7.111061096191406, + "learning_rate": 1.3093575418994415e-05, + "loss": 0.4958, + "step": 3770 + }, + { + "epoch": 0.20051333088890053, + "grad_norm": 5.541796684265137, + "learning_rate": 1.3296089385474861e-05, + "loss": 0.589, + "step": 3828 + }, + { + "epoch": 0.20355141165994448, + "grad_norm": 4.365527629852295, + "learning_rate": 1.3498603351955308e-05, + "loss": 0.6003, + "step": 3886 + }, + { + "epoch": 0.20658949243098843, + "grad_norm": 4.486824035644531, + "learning_rate": 1.3701117318435755e-05, + "loss": 0.6135, + "step": 3944 + }, + { + "epoch": 0.20962757320203237, + "grad_norm": 5.487951278686523, + "learning_rate": 1.3903631284916201e-05, + "loss": 0.6234, + "step": 4002 + }, + { + "epoch": 0.21266565397307632, + "grad_norm": 2.3850884437561035, + "learning_rate": 1.410614525139665e-05, + "loss": 0.5536, + "step": 4060 + }, + { + "epoch": 0.21570373474412027, + "grad_norm": 3.7957749366760254, + "learning_rate": 1.4305167597765364e-05, + "loss": 0.5211, + "step": 4118 + }, + { + "epoch": 0.21874181551516422, + "grad_norm": 5.272162437438965, + "learning_rate": 1.450768156424581e-05, + "loss": 0.4992, + "step": 4176 + }, + { + "epoch": 0.22177989628620817, + "grad_norm": 4.292142868041992, + "learning_rate": 1.4710195530726259e-05, + "loss": 0.5124, + "step": 4234 + }, + { + "epoch": 0.22481797705725212, + "grad_norm": 4.948460102081299, + "learning_rate": 1.4912709497206705e-05, + "loss": 0.5575, + "step": 4292 + }, + { + "epoch": 0.22785605782829604, + "grad_norm": 3.456590414047241, + "learning_rate": 1.5115223463687152e-05, + "loss": 0.5672, + "step": 4350 + }, + { + "epoch": 0.23089413859934, + "grad_norm": 5.341044902801514, + "learning_rate": 1.53177374301676e-05, + "loss": 0.5106, + "step": 4408 + }, + { + "epoch": 0.23393221937038394, + "grad_norm": 5.5106987953186035, + "learning_rate": 1.5520251396648043e-05, + "loss": 0.6037, + "step": 4466 + }, + { + "epoch": 0.2369703001414279, + "grad_norm": 4.941389560699463, + "learning_rate": 1.5722765363128495e-05, + "loss": 0.5995, + "step": 4524 + }, + { + "epoch": 0.24000838091247184, + "grad_norm": 3.719957113265991, + "learning_rate": 1.592527932960894e-05, + "loss": 0.5531, + "step": 4582 + }, + { + "epoch": 0.2430464616835158, + "grad_norm": 4.435623645782471, + "learning_rate": 1.612779329608939e-05, + "loss": 0.5502, + "step": 4640 + }, + { + "epoch": 0.24608454245455974, + "grad_norm": 4.688556671142578, + "learning_rate": 1.6330307262569833e-05, + "loss": 0.5056, + "step": 4698 + }, + { + "epoch": 0.2491226232256037, + "grad_norm": 5.511931896209717, + "learning_rate": 1.653282122905028e-05, + "loss": 0.5242, + "step": 4756 + }, + { + "epoch": 0.25216070399664764, + "grad_norm": 4.933206558227539, + "learning_rate": 1.673533519553073e-05, + "loss": 0.5115, + "step": 4814 + }, + { + "epoch": 0.2551987847676916, + "grad_norm": 2.942838191986084, + "learning_rate": 1.6937849162011175e-05, + "loss": 0.5941, + "step": 4872 + }, + { + "epoch": 0.25823686553873554, + "grad_norm": 4.0710625648498535, + "learning_rate": 1.7140363128491623e-05, + "loss": 0.4887, + "step": 4930 + }, + { + "epoch": 0.2612749463097795, + "grad_norm": 3.212920665740967, + "learning_rate": 1.7342877094972068e-05, + "loss": 0.5051, + "step": 4988 + }, + { + "epoch": 0.26431302708082344, + "grad_norm": 4.390661716461182, + "learning_rate": 1.7545391061452513e-05, + "loss": 0.5735, + "step": 5046 + }, + { + "epoch": 0.2673511078518674, + "grad_norm": 3.784395217895508, + "learning_rate": 1.7747905027932965e-05, + "loss": 0.4847, + "step": 5104 + }, + { + "epoch": 0.27038918862291134, + "grad_norm": 4.238777160644531, + "learning_rate": 1.795041899441341e-05, + "loss": 0.4303, + "step": 5162 + }, + { + "epoch": 0.2734272693939553, + "grad_norm": 4.616554260253906, + "learning_rate": 1.8152932960893855e-05, + "loss": 0.53, + "step": 5220 + }, + { + "epoch": 0.27646535016499924, + "grad_norm": 3.0670206546783447, + "learning_rate": 1.8355446927374303e-05, + "loss": 0.5089, + "step": 5278 + }, + { + "epoch": 0.2795034309360432, + "grad_norm": 5.144998550415039, + "learning_rate": 1.8557960893854748e-05, + "loss": 0.5352, + "step": 5336 + }, + { + "epoch": 0.28254151170708713, + "grad_norm": 3.9276976585388184, + "learning_rate": 1.8760474860335196e-05, + "loss": 0.5001, + "step": 5394 + }, + { + "epoch": 0.2855795924781311, + "grad_norm": 4.67507266998291, + "learning_rate": 1.8962988826815645e-05, + "loss": 0.5106, + "step": 5452 + }, + { + "epoch": 0.288617673249175, + "grad_norm": 11.027061462402344, + "learning_rate": 1.916550279329609e-05, + "loss": 0.5311, + "step": 5510 + }, + { + "epoch": 0.2916557540202189, + "grad_norm": 4.6279802322387695, + "learning_rate": 1.9368016759776538e-05, + "loss": 0.4722, + "step": 5568 + }, + { + "epoch": 0.2946938347912629, + "grad_norm": 3.2744059562683105, + "learning_rate": 1.9570530726256983e-05, + "loss": 0.5435, + "step": 5626 + }, + { + "epoch": 0.2977319155623068, + "grad_norm": 4.361588478088379, + "learning_rate": 1.977304469273743e-05, + "loss": 0.5182, + "step": 5684 + }, + { + "epoch": 0.3007699963333508, + "grad_norm": 4.725919246673584, + "learning_rate": 1.997555865921788e-05, + "loss": 0.5108, + "step": 5742 + }, + { + "epoch": 0.3038080771043947, + "grad_norm": 4.126678943634033, + "learning_rate": 1.999995169004151e-05, + "loss": 0.4113, + "step": 5800 + }, + { + "epoch": 0.3068461578754387, + "grad_norm": 4.924627780914307, + "learning_rate": 1.999977932757864e-05, + "loss": 0.5911, + "step": 5858 + }, + { + "epoch": 0.3098842386464826, + "grad_norm": 3.836568832397461, + "learning_rate": 1.9999482004657697e-05, + "loss": 0.5589, + "step": 5916 + }, + { + "epoch": 0.3129223194175266, + "grad_norm": 2.1909499168395996, + "learning_rate": 1.999905972499412e-05, + "loss": 0.5321, + "step": 5974 + }, + { + "epoch": 0.3159604001885705, + "grad_norm": 4.15761661529541, + "learning_rate": 1.9998512493864858e-05, + "loss": 0.4898, + "step": 6032 + }, + { + "epoch": 0.31899848095961447, + "grad_norm": 4.483209133148193, + "learning_rate": 1.9997840318108285e-05, + "loss": 0.5339, + "step": 6090 + }, + { + "epoch": 0.3220365617306584, + "grad_norm": 4.208775997161865, + "learning_rate": 1.9997058007847493e-05, + "loss": 0.4381, + "step": 6148 + }, + { + "epoch": 0.32507464250170237, + "grad_norm": 4.7916951179504395, + "learning_rate": 1.999613812340473e-05, + "loss": 0.5622, + "step": 6206 + }, + { + "epoch": 0.3281127232727463, + "grad_norm": 3.041353702545166, + "learning_rate": 1.999509332400555e-05, + "loss": 0.5797, + "step": 6264 + }, + { + "epoch": 0.33115080404379027, + "grad_norm": 2.720505714416504, + "learning_rate": 1.999392362270611e-05, + "loss": 0.5213, + "step": 6322 + }, + { + "epoch": 0.3341888848148342, + "grad_norm": 4.793097496032715, + "learning_rate": 1.999262903412336e-05, + "loss": 0.4593, + "step": 6380 + }, + { + "epoch": 0.33722696558587817, + "grad_norm": 3.890002489089966, + "learning_rate": 1.999120957443491e-05, + "loss": 0.4486, + "step": 6438 + }, + { + "epoch": 0.3402650463569221, + "grad_norm": 3.537182569503784, + "learning_rate": 1.9989665261378772e-05, + "loss": 0.4879, + "step": 6496 + }, + { + "epoch": 0.34330312712796607, + "grad_norm": 1.6273483037948608, + "learning_rate": 1.998799611425319e-05, + "loss": 0.483, + "step": 6554 + }, + { + "epoch": 0.34634120789901, + "grad_norm": 1.544161081314087, + "learning_rate": 1.9986202153916356e-05, + "loss": 0.5295, + "step": 6612 + }, + { + "epoch": 0.34937928867005397, + "grad_norm": 4.516360759735107, + "learning_rate": 1.9984283402786177e-05, + "loss": 0.5544, + "step": 6670 + }, + { + "epoch": 0.3524173694410979, + "grad_norm": 3.9603912830352783, + "learning_rate": 1.998223988483998e-05, + "loss": 0.5005, + "step": 6728 + }, + { + "epoch": 0.35545545021214187, + "grad_norm": 4.063785076141357, + "learning_rate": 1.998007162561423e-05, + "loss": 0.5339, + "step": 6786 + }, + { + "epoch": 0.3584935309831858, + "grad_norm": 4.825593948364258, + "learning_rate": 1.9977778652204192e-05, + "loss": 0.4702, + "step": 6844 + }, + { + "epoch": 0.36153161175422976, + "grad_norm": 0.5379557013511658, + "learning_rate": 1.997536099326359e-05, + "loss": 0.5397, + "step": 6902 + }, + { + "epoch": 0.3645696925252737, + "grad_norm": 3.922156810760498, + "learning_rate": 1.9972818679004273e-05, + "loss": 0.5663, + "step": 6960 + }, + { + "epoch": 0.36760777329631766, + "grad_norm": 3.376941442489624, + "learning_rate": 1.9970198778515604e-05, + "loss": 0.5321, + "step": 7018 + }, + { + "epoch": 0.37064585406736156, + "grad_norm": 4.569897174835205, + "learning_rate": 1.9967409398301135e-05, + "loss": 0.4517, + "step": 7076 + }, + { + "epoch": 0.3736839348384055, + "grad_norm": 4.267284393310547, + "learning_rate": 1.9964495462133642e-05, + "loss": 0.5225, + "step": 7134 + }, + { + "epoch": 0.37672201560944946, + "grad_norm": 4.1275506019592285, + "learning_rate": 1.9961457006426603e-05, + "loss": 0.5007, + "step": 7192 + }, + { + "epoch": 0.3797600963804934, + "grad_norm": 4.481261253356934, + "learning_rate": 1.995829406914954e-05, + "loss": 0.4754, + "step": 7250 + }, + { + "epoch": 0.38279817715153736, + "grad_norm": 2.876922845840454, + "learning_rate": 1.995500668982753e-05, + "loss": 0.4729, + "step": 7308 + }, + { + "epoch": 0.3858362579225813, + "grad_norm": 1.0541763305664062, + "learning_rate": 1.9951594909540727e-05, + "loss": 0.5697, + "step": 7366 + }, + { + "epoch": 0.38887433869362525, + "grad_norm": 3.462268114089966, + "learning_rate": 1.9948058770923837e-05, + "loss": 0.4803, + "step": 7424 + }, + { + "epoch": 0.3919124194646692, + "grad_norm": 3.209782123565674, + "learning_rate": 1.9944398318165578e-05, + "loss": 0.5239, + "step": 7482 + }, + { + "epoch": 0.39495050023571315, + "grad_norm": 4.3836445808410645, + "learning_rate": 1.994061359700815e-05, + "loss": 0.5096, + "step": 7540 + }, + { + "epoch": 0.3979885810067571, + "grad_norm": 2.8133575916290283, + "learning_rate": 1.9936704654746642e-05, + "loss": 0.4546, + "step": 7598 + }, + { + "epoch": 0.40102666177780105, + "grad_norm": 3.709463596343994, + "learning_rate": 1.9932671540228456e-05, + "loss": 0.4882, + "step": 7656 + }, + { + "epoch": 0.404064742548845, + "grad_norm": 3.9174060821533203, + "learning_rate": 1.992851430385269e-05, + "loss": 0.4311, + "step": 7714 + }, + { + "epoch": 0.40710282331988895, + "grad_norm": 2.9282238483428955, + "learning_rate": 1.99242329975695e-05, + "loss": 0.5204, + "step": 7772 + }, + { + "epoch": 0.4101409040909329, + "grad_norm": 4.139567852020264, + "learning_rate": 1.9919827674879473e-05, + "loss": 0.4739, + "step": 7830 + }, + { + "epoch": 0.41317898486197685, + "grad_norm": 3.436636447906494, + "learning_rate": 1.9915298390832935e-05, + "loss": 0.4838, + "step": 7888 + }, + { + "epoch": 0.4162170656330208, + "grad_norm": 3.512646198272705, + "learning_rate": 1.9910645202029272e-05, + "loss": 0.4594, + "step": 7946 + }, + { + "epoch": 0.41925514640406475, + "grad_norm": 3.1627018451690674, + "learning_rate": 1.9905868166616234e-05, + "loss": 0.5628, + "step": 8004 + }, + { + "epoch": 0.4222932271751087, + "grad_norm": 1.1955090761184692, + "learning_rate": 1.990096734428919e-05, + "loss": 0.4587, + "step": 8062 + }, + { + "epoch": 0.42533130794615265, + "grad_norm": 3.0422959327697754, + "learning_rate": 1.989594279629039e-05, + "loss": 0.5523, + "step": 8120 + }, + { + "epoch": 0.4283693887171966, + "grad_norm": 3.0934972763061523, + "learning_rate": 1.98907945854082e-05, + "loss": 0.4855, + "step": 8178 + }, + { + "epoch": 0.43140746948824055, + "grad_norm": 3.9694907665252686, + "learning_rate": 1.9885522775976324e-05, + "loss": 0.543, + "step": 8236 + }, + { + "epoch": 0.4344455502592845, + "grad_norm": 4.713873386383057, + "learning_rate": 1.9880127433872983e-05, + "loss": 0.4901, + "step": 8294 + }, + { + "epoch": 0.43748363103032845, + "grad_norm": 2.3840503692626953, + "learning_rate": 1.987460862652011e-05, + "loss": 0.4265, + "step": 8352 + }, + { + "epoch": 0.4405217118013724, + "grad_norm": 4.123522758483887, + "learning_rate": 1.9868966422882496e-05, + "loss": 0.4237, + "step": 8410 + }, + { + "epoch": 0.44355979257241634, + "grad_norm": 3.1017978191375732, + "learning_rate": 1.986320089346693e-05, + "loss": 0.4106, + "step": 8468 + }, + { + "epoch": 0.4465978733434603, + "grad_norm": 2.8059699535369873, + "learning_rate": 1.9857414684867994e-05, + "loss": 0.4641, + "step": 8526 + }, + { + "epoch": 0.44963595411450424, + "grad_norm": 4.327667236328125, + "learning_rate": 1.985140484474396e-05, + "loss": 0.4337, + "step": 8584 + }, + { + "epoch": 0.45267403488554814, + "grad_norm": 0.9626501798629761, + "learning_rate": 1.9845271898297104e-05, + "loss": 0.4932, + "step": 8642 + }, + { + "epoch": 0.4557121156565921, + "grad_norm": 1.3852657079696655, + "learning_rate": 1.9839015922166693e-05, + "loss": 0.4866, + "step": 8700 + }, + { + "epoch": 0.45875019642763604, + "grad_norm": 3.2711095809936523, + "learning_rate": 1.983263699452942e-05, + "loss": 0.4825, + "step": 8758 + }, + { + "epoch": 0.46178827719868, + "grad_norm": 4.84442138671875, + "learning_rate": 1.9826135195098416e-05, + "loss": 0.4559, + "step": 8816 + }, + { + "epoch": 0.46482635796972394, + "grad_norm": 1.2177191972732544, + "learning_rate": 1.9819510605122255e-05, + "loss": 0.4795, + "step": 8874 + }, + { + "epoch": 0.4678644387407679, + "grad_norm": 3.1849379539489746, + "learning_rate": 1.981276330738395e-05, + "loss": 0.474, + "step": 8932 + }, + { + "epoch": 0.47090251951181183, + "grad_norm": 4.420878887176514, + "learning_rate": 1.9805893386199892e-05, + "loss": 0.4876, + "step": 8990 + }, + { + "epoch": 0.4739406002828558, + "grad_norm": 2.714984893798828, + "learning_rate": 1.9798900927418835e-05, + "loss": 0.4491, + "step": 9048 + }, + { + "epoch": 0.47697868105389973, + "grad_norm": 2.185593843460083, + "learning_rate": 1.9791786018420792e-05, + "loss": 0.4808, + "step": 9106 + }, + { + "epoch": 0.4800167618249437, + "grad_norm": 3.3326094150543213, + "learning_rate": 1.9784548748115946e-05, + "loss": 0.4502, + "step": 9164 + }, + { + "epoch": 0.48305484259598763, + "grad_norm": 3.4437661170959473, + "learning_rate": 1.977718920694356e-05, + "loss": 0.5049, + "step": 9222 + }, + { + "epoch": 0.4860929233670316, + "grad_norm": 5.456835746765137, + "learning_rate": 1.9769707486870825e-05, + "loss": 0.4791, + "step": 9280 + }, + { + "epoch": 0.48913100413807553, + "grad_norm": 5.109498023986816, + "learning_rate": 1.9762103681391724e-05, + "loss": 0.5396, + "step": 9338 + }, + { + "epoch": 0.4921690849091195, + "grad_norm": 4.347654342651367, + "learning_rate": 1.9754377885525854e-05, + "loss": 0.4433, + "step": 9396 + }, + { + "epoch": 0.49520716568016343, + "grad_norm": 3.837158203125, + "learning_rate": 1.9746530195817243e-05, + "loss": 0.4791, + "step": 9454 + }, + { + "epoch": 0.4982452464512074, + "grad_norm": 3.8552966117858887, + "learning_rate": 1.9738699146560578e-05, + "loss": 0.4979, + "step": 9512 + }, + { + "epoch": 0.5012833272222513, + "grad_norm": 1.76126229763031, + "learning_rate": 1.973061006224811e-05, + "loss": 0.4716, + "step": 9570 + }, + { + "epoch": 0.5043214079932953, + "grad_norm": 5.198726654052734, + "learning_rate": 1.9722399381103267e-05, + "loss": 0.4801, + "step": 9628 + }, + { + "epoch": 0.5073594887643392, + "grad_norm": 3.2313361167907715, + "learning_rate": 1.9714067205729356e-05, + "loss": 0.4592, + "step": 9686 + }, + { + "epoch": 0.5103975695353832, + "grad_norm": 0.9486598968505859, + "learning_rate": 1.9705613640247928e-05, + "loss": 0.4399, + "step": 9744 + }, + { + "epoch": 0.5134356503064271, + "grad_norm": 3.271669864654541, + "learning_rate": 1.9697038790297442e-05, + "loss": 0.4722, + "step": 9802 + }, + { + "epoch": 0.5164737310774711, + "grad_norm": 5.1848039627075195, + "learning_rate": 1.9688342763031993e-05, + "loss": 0.4336, + "step": 9860 + }, + { + "epoch": 0.519511811848515, + "grad_norm": 4.134024620056152, + "learning_rate": 1.967952566711993e-05, + "loss": 0.4534, + "step": 9918 + }, + { + "epoch": 0.522549892619559, + "grad_norm": 3.0904159545898438, + "learning_rate": 1.9670587612742515e-05, + "loss": 0.4461, + "step": 9976 + }, + { + "epoch": 0.5255879733906029, + "grad_norm": 3.3785481452941895, + "learning_rate": 1.9661528711592553e-05, + "loss": 0.4906, + "step": 10034 + }, + { + "epoch": 0.5286260541616469, + "grad_norm": 4.755141258239746, + "learning_rate": 1.9652349076872986e-05, + "loss": 0.4519, + "step": 10092 + }, + { + "epoch": 0.5316641349326908, + "grad_norm": 4.502477645874023, + "learning_rate": 1.9643048823295482e-05, + "loss": 0.5454, + "step": 10150 + }, + { + "epoch": 0.5347022157037348, + "grad_norm": 2.3361642360687256, + "learning_rate": 1.9633628067078997e-05, + "loss": 0.5069, + "step": 10208 + }, + { + "epoch": 0.5377402964747787, + "grad_norm": 3.6974456310272217, + "learning_rate": 1.9624086925948333e-05, + "loss": 0.4604, + "step": 10266 + }, + { + "epoch": 0.5407783772458227, + "grad_norm": 3.7012462615966797, + "learning_rate": 1.9614425519132654e-05, + "loss": 0.5368, + "step": 10324 + }, + { + "epoch": 0.5438164580168666, + "grad_norm": 0.9825100898742676, + "learning_rate": 1.9604643967364013e-05, + "loss": 0.4917, + "step": 10382 + }, + { + "epoch": 0.5468545387879106, + "grad_norm": 2.8980348110198975, + "learning_rate": 1.959474239287582e-05, + "loss": 0.4571, + "step": 10440 + }, + { + "epoch": 0.5498926195589545, + "grad_norm": 6.615330696105957, + "learning_rate": 1.9584720919401342e-05, + "loss": 0.4949, + "step": 10498 + }, + { + "epoch": 0.5529307003299985, + "grad_norm": 4.613067626953125, + "learning_rate": 1.9574579672172126e-05, + "loss": 0.4072, + "step": 10556 + }, + { + "epoch": 0.5559687811010424, + "grad_norm": 3.3984858989715576, + "learning_rate": 1.9564318777916456e-05, + "loss": 0.412, + "step": 10614 + }, + { + "epoch": 0.5590068618720864, + "grad_norm": 5.624422550201416, + "learning_rate": 1.9553938364857775e-05, + "loss": 0.4781, + "step": 10672 + }, + { + "epoch": 0.5620449426431303, + "grad_norm": 4.486995697021484, + "learning_rate": 1.954343856271306e-05, + "loss": 0.4426, + "step": 10730 + }, + { + "epoch": 0.5650830234141743, + "grad_norm": 2.862964391708374, + "learning_rate": 1.953281950269121e-05, + "loss": 0.506, + "step": 10788 + }, + { + "epoch": 0.5681211041852182, + "grad_norm": 0.556151807308197, + "learning_rate": 1.9522267467101615e-05, + "loss": 0.4095, + "step": 10846 + }, + { + "epoch": 0.5711591849562622, + "grad_norm": 3.035536527633667, + "learning_rate": 1.9511412341335318e-05, + "loss": 0.517, + "step": 10904 + }, + { + "epoch": 0.574197265727306, + "grad_norm": 3.8603086471557617, + "learning_rate": 1.950043835790185e-05, + "loss": 0.4344, + "step": 10962 + }, + { + "epoch": 0.57723534649835, + "grad_norm": 4.4469499588012695, + "learning_rate": 1.9489345653935635e-05, + "loss": 0.4774, + "step": 11020 + }, + { + "epoch": 0.5802734272693939, + "grad_norm": 3.4457666873931885, + "learning_rate": 1.9478134368054676e-05, + "loss": 0.4274, + "step": 11078 + }, + { + "epoch": 0.5833115080404379, + "grad_norm": 3.056290864944458, + "learning_rate": 1.9466804640358798e-05, + "loss": 0.4432, + "step": 11136 + }, + { + "epoch": 0.5863495888114818, + "grad_norm": 4.071867942810059, + "learning_rate": 1.9455356612427928e-05, + "loss": 0.4344, + "step": 11194 + }, + { + "epoch": 0.5893876695825258, + "grad_norm": 2.0395846366882324, + "learning_rate": 1.9443790427320303e-05, + "loss": 0.4714, + "step": 11252 + }, + { + "epoch": 0.5924257503535697, + "grad_norm": 4.563007354736328, + "learning_rate": 1.9432106229570685e-05, + "loss": 0.5157, + "step": 11310 + }, + { + "epoch": 0.5954638311246137, + "grad_norm": 3.7986621856689453, + "learning_rate": 1.9420304165188574e-05, + "loss": 0.4977, + "step": 11368 + }, + { + "epoch": 0.5985019118956576, + "grad_norm": 5.301217555999756, + "learning_rate": 1.9408384381656358e-05, + "loss": 0.4662, + "step": 11426 + }, + { + "epoch": 0.6015399926667016, + "grad_norm": 2.3288731575012207, + "learning_rate": 1.939634702792749e-05, + "loss": 0.4493, + "step": 11484 + }, + { + "epoch": 0.6045780734377455, + "grad_norm": 3.7128169536590576, + "learning_rate": 1.9384192254424606e-05, + "loss": 0.4865, + "step": 11542 + }, + { + "epoch": 0.6076161542087894, + "grad_norm": 4.314477920532227, + "learning_rate": 1.9371920213037665e-05, + "loss": 0.4715, + "step": 11600 + }, + { + "epoch": 0.6106542349798334, + "grad_norm": 2.6989047527313232, + "learning_rate": 1.935953105712205e-05, + "loss": 0.4345, + "step": 11658 + }, + { + "epoch": 0.6136923157508773, + "grad_norm": 3.5463671684265137, + "learning_rate": 1.9347024941496628e-05, + "loss": 0.4611, + "step": 11716 + }, + { + "epoch": 0.6167303965219213, + "grad_norm": 4.914857387542725, + "learning_rate": 1.9334402022441848e-05, + "loss": 0.4952, + "step": 11774 + }, + { + "epoch": 0.6197684772929652, + "grad_norm": 1.5133031606674194, + "learning_rate": 1.932188309270537e-05, + "loss": 0.4401, + "step": 11832 + }, + { + "epoch": 0.6228065580640092, + "grad_norm": 2.30916428565979, + "learning_rate": 1.9309029048500578e-05, + "loss": 0.4177, + "step": 11890 + }, + { + "epoch": 0.6258446388350531, + "grad_norm": 2.825598955154419, + "learning_rate": 1.929605867567532e-05, + "loss": 0.4529, + "step": 11948 + }, + { + "epoch": 0.6288827196060971, + "grad_norm": 5.285458087921143, + "learning_rate": 1.9282972136311554e-05, + "loss": 0.4806, + "step": 12006 + }, + { + "epoch": 0.631920800377141, + "grad_norm": 2.597923755645752, + "learning_rate": 1.9269769593942872e-05, + "loss": 0.4566, + "step": 12064 + }, + { + "epoch": 0.634958881148185, + "grad_norm": 2.780212640762329, + "learning_rate": 1.9256451213552497e-05, + "loss": 0.4725, + "step": 12122 + }, + { + "epoch": 0.6379969619192289, + "grad_norm": 4.1638031005859375, + "learning_rate": 1.9243017161571194e-05, + "loss": 0.463, + "step": 12180 + }, + { + "epoch": 0.6410350426902729, + "grad_norm": 4.174670219421387, + "learning_rate": 1.9229467605875196e-05, + "loss": 0.5236, + "step": 12238 + }, + { + "epoch": 0.6440731234613168, + "grad_norm": 1.9128369092941284, + "learning_rate": 1.9215802715784096e-05, + "loss": 0.4621, + "step": 12296 + }, + { + "epoch": 0.6471112042323608, + "grad_norm": 4.490901947021484, + "learning_rate": 1.9202022662058773e-05, + "loss": 0.4517, + "step": 12354 + }, + { + "epoch": 0.6501492850034047, + "grad_norm": 4.426553726196289, + "learning_rate": 1.9188127616899202e-05, + "loss": 0.488, + "step": 12412 + }, + { + "epoch": 0.6531873657744487, + "grad_norm": 3.694254159927368, + "learning_rate": 1.917411775394233e-05, + "loss": 0.4705, + "step": 12470 + }, + { + "epoch": 0.6562254465454926, + "grad_norm": 2.3134400844573975, + "learning_rate": 1.9159993248259916e-05, + "loss": 0.4402, + "step": 12528 + }, + { + "epoch": 0.6592635273165366, + "grad_norm": 2.868987798690796, + "learning_rate": 1.9145754276356323e-05, + "loss": 0.4085, + "step": 12586 + }, + { + "epoch": 0.6623016080875805, + "grad_norm": 3.815828323364258, + "learning_rate": 1.9131401016166326e-05, + "loss": 0.5569, + "step": 12644 + }, + { + "epoch": 0.6653396888586245, + "grad_norm": 2.865863800048828, + "learning_rate": 1.911693364705287e-05, + "loss": 0.4515, + "step": 12702 + }, + { + "epoch": 0.6683777696296684, + "grad_norm": 3.7862603664398193, + "learning_rate": 1.9102352349804865e-05, + "loss": 0.4685, + "step": 12760 + }, + { + "epoch": 0.6714158504007124, + "grad_norm": 2.3399360179901123, + "learning_rate": 1.9087657306634884e-05, + "loss": 0.5087, + "step": 12818 + }, + { + "epoch": 0.6744539311717563, + "grad_norm": 3.208674430847168, + "learning_rate": 1.9072848701176905e-05, + "loss": 0.4322, + "step": 12876 + }, + { + "epoch": 0.6774920119428003, + "grad_norm": 1.2508207559585571, + "learning_rate": 1.9057926718484036e-05, + "loss": 0.39, + "step": 12934 + }, + { + "epoch": 0.6805300927138442, + "grad_norm": 3.029885768890381, + "learning_rate": 1.9042891545026164e-05, + "loss": 0.4881, + "step": 12992 + }, + { + "epoch": 0.6835681734848882, + "grad_norm": 1.723024606704712, + "learning_rate": 1.9028005500450692e-05, + "loss": 0.4016, + "step": 13050 + }, + { + "epoch": 0.6866062542559321, + "grad_norm": 3.0587244033813477, + "learning_rate": 1.9012746453978195e-05, + "loss": 0.451, + "step": 13108 + }, + { + "epoch": 0.6896443350269761, + "grad_norm": 3.979196548461914, + "learning_rate": 1.899737478132781e-05, + "loss": 0.4584, + "step": 13166 + }, + { + "epoch": 0.69268241579802, + "grad_norm": 3.2428181171417236, + "learning_rate": 1.8981890674588902e-05, + "loss": 0.4419, + "step": 13224 + }, + { + "epoch": 0.695720496569064, + "grad_norm": 1.9672743082046509, + "learning_rate": 1.8966294327255843e-05, + "loss": 0.4463, + "step": 13282 + }, + { + "epoch": 0.6987585773401079, + "grad_norm": 3.543287754058838, + "learning_rate": 1.895058593422561e-05, + "loss": 0.5232, + "step": 13340 + }, + { + "epoch": 0.7017966581111519, + "grad_norm": 2.751725435256958, + "learning_rate": 1.8934765691795337e-05, + "loss": 0.4627, + "step": 13398 + }, + { + "epoch": 0.7048347388821958, + "grad_norm": 3.9089314937591553, + "learning_rate": 1.8918833797659854e-05, + "loss": 0.4701, + "step": 13456 + }, + { + "epoch": 0.7078728196532398, + "grad_norm": 2.623382806777954, + "learning_rate": 1.890279045090924e-05, + "loss": 0.4627, + "step": 13514 + }, + { + "epoch": 0.7109109004242837, + "grad_norm": 3.44734263420105, + "learning_rate": 1.8886635852026307e-05, + "loss": 0.5063, + "step": 13572 + }, + { + "epoch": 0.7139489811953277, + "grad_norm": 4.096603870391846, + "learning_rate": 1.887037020288412e-05, + "loss": 0.4205, + "step": 13630 + }, + { + "epoch": 0.7169870619663716, + "grad_norm": 3.9694747924804688, + "learning_rate": 1.8853993706743465e-05, + "loss": 0.479, + "step": 13688 + }, + { + "epoch": 0.7200251427374156, + "grad_norm": 2.2461936473846436, + "learning_rate": 1.88375065682503e-05, + "loss": 0.4222, + "step": 13746 + }, + { + "epoch": 0.7230632235084595, + "grad_norm": 4.268979549407959, + "learning_rate": 1.882090899343321e-05, + "loss": 0.4013, + "step": 13804 + }, + { + "epoch": 0.7261013042795035, + "grad_norm": 2.9464776515960693, + "learning_rate": 1.8804201189700833e-05, + "loss": 0.5184, + "step": 13862 + }, + { + "epoch": 0.7291393850505474, + "grad_norm": 3.1404519081115723, + "learning_rate": 1.8787383365839248e-05, + "loss": 0.4451, + "step": 13920 + }, + { + "epoch": 0.7321774658215914, + "grad_norm": 3.048670530319214, + "learning_rate": 1.8770455732009393e-05, + "loss": 0.457, + "step": 13978 + }, + { + "epoch": 0.7352155465926353, + "grad_norm": 3.074151039123535, + "learning_rate": 1.8753418499744426e-05, + "loss": 0.4711, + "step": 14036 + }, + { + "epoch": 0.7382536273636792, + "grad_norm": 4.1698150634765625, + "learning_rate": 1.873627188194708e-05, + "loss": 0.4281, + "step": 14094 + }, + { + "epoch": 0.7412917081347231, + "grad_norm": 2.6520071029663086, + "learning_rate": 1.8719016092887e-05, + "loss": 0.497, + "step": 14152 + }, + { + "epoch": 0.7443297889057671, + "grad_norm": 1.3818339109420776, + "learning_rate": 1.870165134819808e-05, + "loss": 0.4234, + "step": 14210 + }, + { + "epoch": 0.747367869676811, + "grad_norm": 3.5460736751556396, + "learning_rate": 1.868417786487575e-05, + "loss": 0.4444, + "step": 14268 + }, + { + "epoch": 0.750405950447855, + "grad_norm": 2.8102331161499023, + "learning_rate": 1.8666595861274283e-05, + "loss": 0.4159, + "step": 14326 + }, + { + "epoch": 0.7534440312188989, + "grad_norm": 3.3770508766174316, + "learning_rate": 1.8648905557104046e-05, + "loss": 0.4357, + "step": 14384 + }, + { + "epoch": 0.7564821119899429, + "grad_norm": 2.95613169670105, + "learning_rate": 1.863110717342876e-05, + "loss": 0.4627, + "step": 14442 + }, + { + "epoch": 0.7595201927609868, + "grad_norm": 2.80786395072937, + "learning_rate": 1.8613200932662764e-05, + "loss": 0.4331, + "step": 14500 + }, + { + "epoch": 0.7625582735320308, + "grad_norm": 2.9433181285858154, + "learning_rate": 1.8595187058568197e-05, + "loss": 0.5087, + "step": 14558 + }, + { + "epoch": 0.7655963543030747, + "grad_norm": 2.6625008583068848, + "learning_rate": 1.8577065776252218e-05, + "loss": 0.5018, + "step": 14616 + }, + { + "epoch": 0.7686344350741187, + "grad_norm": 3.8713533878326416, + "learning_rate": 1.8558837312164198e-05, + "loss": 0.4454, + "step": 14674 + }, + { + "epoch": 0.7716725158451626, + "grad_norm": 3.236130475997925, + "learning_rate": 1.8540501894092894e-05, + "loss": 0.4463, + "step": 14732 + }, + { + "epoch": 0.7747105966162066, + "grad_norm": 1.9471006393432617, + "learning_rate": 1.8522059751163578e-05, + "loss": 0.4615, + "step": 14790 + }, + { + "epoch": 0.7777486773872505, + "grad_norm": 1.1234129667282104, + "learning_rate": 1.85035111138352e-05, + "loss": 0.3841, + "step": 14848 + }, + { + "epoch": 0.7807867581582945, + "grad_norm": 3.155194044113159, + "learning_rate": 1.8484856213897496e-05, + "loss": 0.4932, + "step": 14906 + }, + { + "epoch": 0.7838248389293384, + "grad_norm": 1.2532273530960083, + "learning_rate": 1.8466095284468103e-05, + "loss": 0.427, + "step": 14964 + }, + { + "epoch": 0.7868629197003824, + "grad_norm": 3.315812349319458, + "learning_rate": 1.8447228559989618e-05, + "loss": 0.4406, + "step": 15022 + }, + { + "epoch": 0.7899010004714263, + "grad_norm": 2.3999452590942383, + "learning_rate": 1.842858427754608e-05, + "loss": 0.4413, + "step": 15080 + }, + { + "epoch": 0.7929390812424703, + "grad_norm": 2.896650791168213, + "learning_rate": 1.8409508485466538e-05, + "loss": 0.4068, + "step": 15138 + }, + { + "epoch": 0.7959771620135142, + "grad_norm": 3.3152272701263428, + "learning_rate": 1.8390327605464747e-05, + "loss": 0.4708, + "step": 15196 + }, + { + "epoch": 0.7990152427845582, + "grad_norm": 2.573716163635254, + "learning_rate": 1.8371041877231145e-05, + "loss": 0.4506, + "step": 15254 + }, + { + "epoch": 0.8020533235556021, + "grad_norm": 1.0098395347595215, + "learning_rate": 1.8351651541766398e-05, + "loss": 0.4614, + "step": 15312 + }, + { + "epoch": 0.805091404326646, + "grad_norm": 2.7257494926452637, + "learning_rate": 1.8332156841378376e-05, + "loss": 0.481, + "step": 15370 + }, + { + "epoch": 0.80812948509769, + "grad_norm": 3.291948080062866, + "learning_rate": 1.8312558019679113e-05, + "loss": 0.4872, + "step": 15428 + }, + { + "epoch": 0.811167565868734, + "grad_norm": 0.6372181177139282, + "learning_rate": 1.82928553215818e-05, + "loss": 0.4664, + "step": 15486 + }, + { + "epoch": 0.8142056466397779, + "grad_norm": 2.14487361907959, + "learning_rate": 1.8273048993297682e-05, + "loss": 0.4443, + "step": 15544 + }, + { + "epoch": 0.8172437274108219, + "grad_norm": 1.6703099012374878, + "learning_rate": 1.8253139282333005e-05, + "loss": 0.4683, + "step": 15602 + }, + { + "epoch": 0.8202818081818658, + "grad_norm": 3.7610647678375244, + "learning_rate": 1.8233126437485925e-05, + "loss": 0.4299, + "step": 15660 + }, + { + "epoch": 0.8233198889529098, + "grad_norm": 3.429608106613159, + "learning_rate": 1.821301070884338e-05, + "loss": 0.3976, + "step": 15718 + }, + { + "epoch": 0.8263579697239537, + "grad_norm": 3.0198211669921875, + "learning_rate": 1.819279234777799e-05, + "loss": 0.407, + "step": 15776 + }, + { + "epoch": 0.8293960504949976, + "grad_norm": 3.1796703338623047, + "learning_rate": 1.817247160694489e-05, + "loss": 0.4235, + "step": 15834 + }, + { + "epoch": 0.8324341312660416, + "grad_norm": 2.235328197479248, + "learning_rate": 1.81520487402786e-05, + "loss": 0.403, + "step": 15892 + }, + { + "epoch": 0.8354722120370855, + "grad_norm": 3.7409324645996094, + "learning_rate": 1.8131524002989816e-05, + "loss": 0.4325, + "step": 15950 + }, + { + "epoch": 0.8385102928081295, + "grad_norm": 2.7824206352233887, + "learning_rate": 1.811089765156227e-05, + "loss": 0.432, + "step": 16008 + }, + { + "epoch": 0.8415483735791734, + "grad_norm": 1.6856441497802734, + "learning_rate": 1.8090528175270648e-05, + "loss": 0.4156, + "step": 16066 + }, + { + "epoch": 0.8445864543502174, + "grad_norm": 2.9528396129608154, + "learning_rate": 1.8069701110949214e-05, + "loss": 0.4486, + "step": 16124 + }, + { + "epoch": 0.8476245351212613, + "grad_norm": 1.5881803035736084, + "learning_rate": 1.8048773205047752e-05, + "loss": 0.4133, + "step": 16182 + }, + { + "epoch": 0.8506626158923053, + "grad_norm": 5.50068473815918, + "learning_rate": 1.8027744719088103e-05, + "loss": 0.4553, + "step": 16240 + }, + { + "epoch": 0.8537006966633492, + "grad_norm": 3.2943098545074463, + "learning_rate": 1.800661591584899e-05, + "loss": 0.4428, + "step": 16298 + }, + { + "epoch": 0.8567387774343932, + "grad_norm": 1.3706376552581787, + "learning_rate": 1.798538705936273e-05, + "loss": 0.4779, + "step": 16356 + }, + { + "epoch": 0.8597768582054371, + "grad_norm": 2.18271541595459, + "learning_rate": 1.796405841491194e-05, + "loss": 0.4687, + "step": 16414 + }, + { + "epoch": 0.8628149389764811, + "grad_norm": 2.5106441974639893, + "learning_rate": 1.794263024902622e-05, + "loss": 0.4016, + "step": 16472 + }, + { + "epoch": 0.865853019747525, + "grad_norm": 2.757732629776001, + "learning_rate": 1.7921102829478832e-05, + "loss": 0.4948, + "step": 16530 + }, + { + "epoch": 0.868891100518569, + "grad_norm": 0.37621229887008667, + "learning_rate": 1.7899476425283318e-05, + "loss": 0.4304, + "step": 16588 + }, + { + "epoch": 0.8719291812896129, + "grad_norm": 4.135168552398682, + "learning_rate": 1.787775130669019e-05, + "loss": 0.4195, + "step": 16646 + }, + { + "epoch": 0.8749672620606569, + "grad_norm": 2.2052392959594727, + "learning_rate": 1.7855927745183504e-05, + "loss": 0.4449, + "step": 16704 + }, + { + "epoch": 0.8780053428317008, + "grad_norm": 2.8733346462249756, + "learning_rate": 1.7834006013477513e-05, + "loss": 0.5016, + "step": 16762 + }, + { + "epoch": 0.8810434236027448, + "grad_norm": 1.8927271366119385, + "learning_rate": 1.7811986385513226e-05, + "loss": 0.3793, + "step": 16820 + }, + { + "epoch": 0.8840815043737887, + "grad_norm": 3.7612531185150146, + "learning_rate": 1.7789869136454988e-05, + "loss": 0.3601, + "step": 16878 + }, + { + "epoch": 0.8871195851448327, + "grad_norm": 1.6613848209381104, + "learning_rate": 1.7767654542687057e-05, + "loss": 0.4772, + "step": 16936 + }, + { + "epoch": 0.8901576659158766, + "grad_norm": 2.5755159854888916, + "learning_rate": 1.7745342881810144e-05, + "loss": 0.4475, + "step": 16994 + }, + { + "epoch": 0.8931957466869206, + "grad_norm": 2.7520928382873535, + "learning_rate": 1.7722934432637937e-05, + "loss": 0.3942, + "step": 17052 + }, + { + "epoch": 0.8962338274579645, + "grad_norm": 4.439705848693848, + "learning_rate": 1.770042947519362e-05, + "loss": 0.4361, + "step": 17110 + }, + { + "epoch": 0.8992719082290085, + "grad_norm": 2.091926097869873, + "learning_rate": 1.7677828290706382e-05, + "loss": 0.42, + "step": 17168 + }, + { + "epoch": 0.9000052380702949, + "eval_accuracy": 0.8844256401062012, + "eval_loss": 0.44025254249572754, + "eval_runtime": 5730.5358, + "eval_samples_per_second": 0.835, + "eval_steps_per_second": 0.835, + "step": 17182 + }, + { + "epoch": 0.9023099890000523, + "grad_norm": 0.736885666847229, + "learning_rate": 1.7655131161607887e-05, + "loss": 0.4806, + "step": 17226 + }, + { + "epoch": 0.9053480697710963, + "grad_norm": 2.1999900341033936, + "learning_rate": 1.7632732159520203e-05, + "loss": 0.4541, + "step": 17284 + }, + { + "epoch": 0.9083861505421402, + "grad_norm": 2.4895384311676025, + "learning_rate": 1.76098456352832e-05, + "loss": 0.4643, + "step": 17342 + }, + { + "epoch": 0.9114242313131842, + "grad_norm": 2.773494005203247, + "learning_rate": 1.7586864015968063e-05, + "loss": 0.4031, + "step": 17400 + }, + { + "epoch": 0.9144623120842281, + "grad_norm": 3.805938720703125, + "learning_rate": 1.7563787588760503e-05, + "loss": 0.4756, + "step": 17458 + }, + { + "epoch": 0.9175003928552721, + "grad_norm": 2.412860631942749, + "learning_rate": 1.7540616642030974e-05, + "loss": 0.4453, + "step": 17516 + }, + { + "epoch": 0.920538473626316, + "grad_norm": 3.0736114978790283, + "learning_rate": 1.751735146533107e-05, + "loss": 0.4374, + "step": 17574 + }, + { + "epoch": 0.92357655439736, + "grad_norm": 3.2423524856567383, + "learning_rate": 1.7493992349389927e-05, + "loss": 0.3971, + "step": 17632 + }, + { + "epoch": 0.9266146351684039, + "grad_norm": 2.4448719024658203, + "learning_rate": 1.7470539586110572e-05, + "loss": 0.4407, + "step": 17690 + }, + { + "epoch": 0.9296527159394479, + "grad_norm": 4.234783172607422, + "learning_rate": 1.7446993468566268e-05, + "loss": 0.4136, + "step": 17748 + }, + { + "epoch": 0.9326907967104918, + "grad_norm": 2.7946712970733643, + "learning_rate": 1.742335429099688e-05, + "loss": 0.4021, + "step": 17806 + }, + { + "epoch": 0.9357288774815358, + "grad_norm": 0.8968492746353149, + "learning_rate": 1.7399622348805165e-05, + "loss": 0.4591, + "step": 17864 + }, + { + "epoch": 0.9387669582525797, + "grad_norm": 2.746527671813965, + "learning_rate": 1.7375797938553108e-05, + "loss": 0.3938, + "step": 17922 + }, + { + "epoch": 0.9418050390236237, + "grad_norm": 1.0526750087738037, + "learning_rate": 1.73518813579582e-05, + "loss": 0.4577, + "step": 17980 + }, + { + "epoch": 0.9448431197946676, + "grad_norm": 2.459176778793335, + "learning_rate": 1.7327872905889727e-05, + "loss": 0.395, + "step": 18038 + }, + { + "epoch": 0.9478812005657116, + "grad_norm": 3.1182572841644287, + "learning_rate": 1.7303772882365018e-05, + "loss": 0.4536, + "step": 18096 + }, + { + "epoch": 0.9509192813367555, + "grad_norm": 2.8542633056640625, + "learning_rate": 1.7279581588545723e-05, + "loss": 0.4448, + "step": 18154 + }, + { + "epoch": 0.9539573621077995, + "grad_norm": 2.421351432800293, + "learning_rate": 1.7255299326734026e-05, + "loss": 0.4568, + "step": 18212 + }, + { + "epoch": 0.9569954428788434, + "grad_norm": 1.9418818950653076, + "learning_rate": 1.7230926400368878e-05, + "loss": 0.4509, + "step": 18270 + }, + { + "epoch": 0.9600335236498874, + "grad_norm": 2.297189950942993, + "learning_rate": 1.720688565864609e-05, + "loss": 0.4185, + "step": 18328 + }, + { + "epoch": 0.9630716044209313, + "grad_norm": 3.155817747116089, + "learning_rate": 1.7182333868082773e-05, + "loss": 0.4724, + "step": 18386 + }, + { + "epoch": 0.9661096851919753, + "grad_norm": 2.3270270824432373, + "learning_rate": 1.715769232476584e-05, + "loss": 0.4434, + "step": 18444 + }, + { + "epoch": 0.9691477659630192, + "grad_norm": 2.4003474712371826, + "learning_rate": 1.7132961336623944e-05, + "loss": 0.4325, + "step": 18502 + }, + { + "epoch": 0.9721858467340632, + "grad_norm": 4.449118614196777, + "learning_rate": 1.710814121270346e-05, + "loss": 0.4497, + "step": 18560 + }, + { + "epoch": 0.9752239275051071, + "grad_norm": 4.001181125640869, + "learning_rate": 1.7083232263164643e-05, + "loss": 0.4133, + "step": 18618 + }, + { + "epoch": 0.9782620082761511, + "grad_norm": 1.3908356428146362, + "learning_rate": 1.7058234799277733e-05, + "loss": 0.436, + "step": 18676 + }, + { + "epoch": 0.981300089047195, + "grad_norm": 2.0721793174743652, + "learning_rate": 1.703314913341908e-05, + "loss": 0.327, + "step": 18734 + }, + { + "epoch": 0.984338169818239, + "grad_norm": 1.8085274696350098, + "learning_rate": 1.700797557906723e-05, + "loss": 0.4782, + "step": 18792 + }, + { + "epoch": 0.9873762505892829, + "grad_norm": 2.877991199493408, + "learning_rate": 1.6982714450799006e-05, + "loss": 0.4804, + "step": 18850 + }, + { + "epoch": 0.9904143313603269, + "grad_norm": 2.9189906120300293, + "learning_rate": 1.6957366064285604e-05, + "loss": 0.4473, + "step": 18908 + }, + { + "epoch": 0.9934524121313708, + "grad_norm": 2.6251885890960693, + "learning_rate": 1.6931930736288605e-05, + "loss": 0.4665, + "step": 18966 + }, + { + "epoch": 0.9964904929024148, + "grad_norm": 2.457298517227173, + "learning_rate": 1.6906408784656045e-05, + "loss": 0.4931, + "step": 19024 + }, + { + "epoch": 0.9995285736734587, + "grad_norm": 4.4407057762146, + "learning_rate": 1.6880800528318443e-05, + "loss": 0.4835, + "step": 19082 + }, + { + "epoch": 1.0025666544445027, + "grad_norm": 0.4853041172027588, + "learning_rate": 1.68551062872848e-05, + "loss": 0.4143, + "step": 19140 + }, + { + "epoch": 1.0056047352155466, + "grad_norm": 3.372084379196167, + "learning_rate": 1.682932638263862e-05, + "loss": 0.4104, + "step": 19198 + }, + { + "epoch": 1.0086428159865906, + "grad_norm": 3.5660102367401123, + "learning_rate": 1.6803461136533877e-05, + "loss": 0.3998, + "step": 19256 + }, + { + "epoch": 1.0116808967576345, + "grad_norm": 3.154710531234741, + "learning_rate": 1.6777510872191012e-05, + "loss": 0.4479, + "step": 19314 + }, + { + "epoch": 1.0147189775286785, + "grad_norm": 2.6014859676361084, + "learning_rate": 1.675147591389286e-05, + "loss": 0.4407, + "step": 19372 + }, + { + "epoch": 1.0177570582997224, + "grad_norm": 2.906419515609741, + "learning_rate": 1.672535658698064e-05, + "loss": 0.3687, + "step": 19430 + }, + { + "epoch": 1.0207951390707664, + "grad_norm": 4.82797908782959, + "learning_rate": 1.669915321784986e-05, + "loss": 0.4467, + "step": 19488 + }, + { + "epoch": 1.0238332198418103, + "grad_norm": 2.7285468578338623, + "learning_rate": 1.6673320066929267e-05, + "loss": 0.4124, + "step": 19546 + }, + { + "epoch": 1.0268713006128543, + "grad_norm": 3.0213897228240967, + "learning_rate": 1.664695103165033e-05, + "loss": 0.3955, + "step": 19604 + }, + { + "epoch": 1.0299093813838982, + "grad_norm": 1.75594961643219, + "learning_rate": 1.662049893393386e-05, + "loss": 0.3951, + "step": 19662 + }, + { + "epoch": 1.0329474621549422, + "grad_norm": 4.119887828826904, + "learning_rate": 1.659396410433378e-05, + "loss": 0.405, + "step": 19720 + }, + { + "epoch": 1.035985542925986, + "grad_norm": 2.607999801635742, + "learning_rate": 1.6567346874437857e-05, + "loss": 0.4512, + "step": 19778 + }, + { + "epoch": 1.03902362369703, + "grad_norm": 2.761012077331543, + "learning_rate": 1.6540647576863546e-05, + "loss": 0.3692, + "step": 19836 + }, + { + "epoch": 1.042061704468074, + "grad_norm": 1.8808788061141968, + "learning_rate": 1.6513866545253866e-05, + "loss": 0.4663, + "step": 19894 + }, + { + "epoch": 1.045099785239118, + "grad_norm": 3.0192790031433105, + "learning_rate": 1.648700411427319e-05, + "loss": 0.4485, + "step": 19952 + }, + { + "epoch": 1.048137866010162, + "grad_norm": 3.0104329586029053, + "learning_rate": 1.64600606196031e-05, + "loss": 0.4003, + "step": 20010 + }, + { + "epoch": 1.0511759467812058, + "grad_norm": 2.610039472579956, + "learning_rate": 1.6433036397938168e-05, + "loss": 0.3967, + "step": 20068 + }, + { + "epoch": 1.0542140275522498, + "grad_norm": 2.601706027984619, + "learning_rate": 1.6405931786981753e-05, + "loss": 0.3656, + "step": 20126 + }, + { + "epoch": 1.0572521083232937, + "grad_norm": 1.4956645965576172, + "learning_rate": 1.63787471254418e-05, + "loss": 0.3829, + "step": 20184 + }, + { + "epoch": 1.0602901890943377, + "grad_norm": 2.5245773792266846, + "learning_rate": 1.635148275302657e-05, + "loss": 0.4309, + "step": 20242 + }, + { + "epoch": 1.0633282698653816, + "grad_norm": 2.844923734664917, + "learning_rate": 1.6324139010440435e-05, + "loss": 0.4478, + "step": 20300 + }, + { + "epoch": 1.0663663506364256, + "grad_norm": 2.980348587036133, + "learning_rate": 1.629671623937959e-05, + "loss": 0.3524, + "step": 20358 + }, + { + "epoch": 1.0694044314074695, + "grad_norm": 2.8273379802703857, + "learning_rate": 1.626921478252781e-05, + "loss": 0.4499, + "step": 20416 + }, + { + "epoch": 1.0724425121785135, + "grad_norm": 3.1932373046875, + "learning_rate": 1.624163498355213e-05, + "loss": 0.3989, + "step": 20474 + }, + { + "epoch": 1.0754805929495574, + "grad_norm": 3.9423575401306152, + "learning_rate": 1.62139771870986e-05, + "loss": 0.4338, + "step": 20532 + }, + { + "epoch": 1.0785186737206014, + "grad_norm": 3.1419034004211426, + "learning_rate": 1.618624173878793e-05, + "loss": 0.4207, + "step": 20590 + }, + { + "epoch": 1.0815567544916453, + "grad_norm": 0.7265225052833557, + "learning_rate": 1.61584289852112e-05, + "loss": 0.3271, + "step": 20648 + }, + { + "epoch": 1.0845948352626893, + "grad_norm": 1.7413750886917114, + "learning_rate": 1.613053927392553e-05, + "loss": 0.3501, + "step": 20706 + }, + { + "epoch": 1.0876329160337332, + "grad_norm": 4.131454944610596, + "learning_rate": 1.6102572953449715e-05, + "loss": 0.5085, + "step": 20764 + }, + { + "epoch": 1.0906709968047772, + "grad_norm": 3.0570194721221924, + "learning_rate": 1.6074530373259887e-05, + "loss": 0.4154, + "step": 20822 + }, + { + "epoch": 1.0937090775758211, + "grad_norm": 4.058136463165283, + "learning_rate": 1.6046897326469475e-05, + "loss": 0.4007, + "step": 20880 + }, + { + "epoch": 1.096747158346865, + "grad_norm": 2.809772491455078, + "learning_rate": 1.601870457882787e-05, + "loss": 0.4188, + "step": 20938 + }, + { + "epoch": 1.099785239117909, + "grad_norm": 2.9016387462615967, + "learning_rate": 1.5990436619518428e-05, + "loss": 0.3567, + "step": 20996 + }, + { + "epoch": 1.102823319888953, + "grad_norm": 3.3889174461364746, + "learning_rate": 1.5962093801786668e-05, + "loss": 0.3956, + "step": 21054 + }, + { + "epoch": 1.105861400659997, + "grad_norm": 1.6921156644821167, + "learning_rate": 1.5933676479813547e-05, + "loss": 0.352, + "step": 21112 + }, + { + "epoch": 1.108899481431041, + "grad_norm": 3.156926393508911, + "learning_rate": 1.5905185008711063e-05, + "loss": 0.4026, + "step": 21170 + }, + { + "epoch": 1.1119375622020848, + "grad_norm": 1.8040690422058105, + "learning_rate": 1.58766197445178e-05, + "loss": 0.4248, + "step": 21228 + }, + { + "epoch": 1.1149756429731288, + "grad_norm": 3.1559841632843018, + "learning_rate": 1.5848475435042218e-05, + "loss": 0.3691, + "step": 21286 + }, + { + "epoch": 1.1180137237441727, + "grad_norm": 2.4738497734069824, + "learning_rate": 1.5820260539561704e-05, + "loss": 0.367, + "step": 21344 + }, + { + "epoch": 1.1210518045152167, + "grad_norm": 3.917677402496338, + "learning_rate": 1.579147854311163e-05, + "loss": 0.426, + "step": 21402 + }, + { + "epoch": 1.1240898852862606, + "grad_norm": 2.7536489963531494, + "learning_rate": 1.576262417448334e-05, + "loss": 0.4269, + "step": 21460 + }, + { + "epoch": 1.1271279660573046, + "grad_norm": 3.678410768508911, + "learning_rate": 1.5733697794250292e-05, + "loss": 0.3977, + "step": 21518 + }, + { + "epoch": 1.1301660468283483, + "grad_norm": 3.2154643535614014, + "learning_rate": 1.5704699763885845e-05, + "loss": 0.3647, + "step": 21576 + }, + { + "epoch": 1.1332041275993925, + "grad_norm": 2.71803617477417, + "learning_rate": 1.5675630445758707e-05, + "loss": 0.3774, + "step": 21634 + }, + { + "epoch": 1.1362422083704362, + "grad_norm": 2.9909651279449463, + "learning_rate": 1.5646490203128424e-05, + "loss": 0.3707, + "step": 21692 + }, + { + "epoch": 1.1392802891414804, + "grad_norm": 3.9027791023254395, + "learning_rate": 1.561727940014084e-05, + "loss": 0.3925, + "step": 21750 + }, + { + "epoch": 1.1423183699125241, + "grad_norm": 2.427164316177368, + "learning_rate": 1.558799840182354e-05, + "loss": 0.3666, + "step": 21808 + }, + { + "epoch": 1.1453564506835683, + "grad_norm": 4.000829219818115, + "learning_rate": 1.55586475740813e-05, + "loss": 0.4233, + "step": 21866 + }, + { + "epoch": 1.148394531454612, + "grad_norm": 2.324054718017578, + "learning_rate": 1.5529227283691498e-05, + "loss": 0.4315, + "step": 21924 + }, + { + "epoch": 1.1514326122256562, + "grad_norm": 2.127596378326416, + "learning_rate": 1.549973789829954e-05, + "loss": 0.3987, + "step": 21982 + }, + { + "epoch": 1.1544706929967, + "grad_norm": 2.658895254135132, + "learning_rate": 1.5470179786414278e-05, + "loss": 0.3766, + "step": 22040 + }, + { + "epoch": 1.157508773767744, + "grad_norm": 3.48286509513855, + "learning_rate": 1.5440553317403375e-05, + "loss": 0.414, + "step": 22098 + }, + { + "epoch": 1.1605468545387878, + "grad_norm": 2.305206298828125, + "learning_rate": 1.5410858861488717e-05, + "loss": 0.3961, + "step": 22156 + }, + { + "epoch": 1.163584935309832, + "grad_norm": 2.7347195148468018, + "learning_rate": 1.5381096789741777e-05, + "loss": 0.3038, + "step": 22214 + }, + { + "epoch": 1.1666230160808757, + "grad_norm": 3.5154871940612793, + "learning_rate": 1.5351267474078967e-05, + "loss": 0.3442, + "step": 22272 + }, + { + "epoch": 1.1696610968519197, + "grad_norm": 3.8937666416168213, + "learning_rate": 1.532137128725701e-05, + "loss": 0.396, + "step": 22330 + }, + { + "epoch": 1.1726991776229636, + "grad_norm": 3.284895896911621, + "learning_rate": 1.529140860286828e-05, + "loss": 0.3605, + "step": 22388 + }, + { + "epoch": 1.1757372583940076, + "grad_norm": 2.9780056476593018, + "learning_rate": 1.5261379795336102e-05, + "loss": 0.3908, + "step": 22446 + }, + { + "epoch": 1.1787753391650515, + "grad_norm": 3.4775991439819336, + "learning_rate": 1.5231285239910119e-05, + "loss": 0.4244, + "step": 22504 + }, + { + "epoch": 1.1818134199360955, + "grad_norm": 3.3921220302581787, + "learning_rate": 1.520112531266157e-05, + "loss": 0.4234, + "step": 22562 + }, + { + "epoch": 1.1848515007071394, + "grad_norm": 4.370563507080078, + "learning_rate": 1.5170900390478605e-05, + "loss": 0.3605, + "step": 22620 + }, + { + "epoch": 1.1878895814781834, + "grad_norm": 2.411207675933838, + "learning_rate": 1.5140610851061573e-05, + "loss": 0.4109, + "step": 22678 + }, + { + "epoch": 1.1909276622492273, + "grad_norm": 1.8209956884384155, + "learning_rate": 1.5110257072918297e-05, + "loss": 0.3973, + "step": 22736 + }, + { + "epoch": 1.1939657430202713, + "grad_norm": 1.9300421476364136, + "learning_rate": 1.5079839435359347e-05, + "loss": 0.385, + "step": 22794 + }, + { + "epoch": 1.1970038237913152, + "grad_norm": 2.628262519836426, + "learning_rate": 1.504935831849331e-05, + "loss": 0.3439, + "step": 22852 + }, + { + "epoch": 1.2000419045623592, + "grad_norm": 1.3332035541534424, + "learning_rate": 1.5018814103222013e-05, + "loss": 0.3466, + "step": 22910 + }, + { + "epoch": 1.203079985333403, + "grad_norm": 0.541492223739624, + "learning_rate": 1.4988207171235807e-05, + "loss": 0.4193, + "step": 22968 + }, + { + "epoch": 1.206118066104447, + "grad_norm": 3.0223543643951416, + "learning_rate": 1.4957537905008744e-05, + "loss": 0.3866, + "step": 23026 + }, + { + "epoch": 1.209156146875491, + "grad_norm": 2.3658430576324463, + "learning_rate": 1.492680668779384e-05, + "loss": 0.4248, + "step": 23084 + }, + { + "epoch": 1.212194227646535, + "grad_norm": 2.964507579803467, + "learning_rate": 1.4896013903618272e-05, + "loss": 0.3904, + "step": 23142 + }, + { + "epoch": 1.215232308417579, + "grad_norm": 2.325568675994873, + "learning_rate": 1.4865159937278566e-05, + "loss": 0.3901, + "step": 23200 + }, + { + "epoch": 1.2182703891886228, + "grad_norm": 2.5834431648254395, + "learning_rate": 1.4834245174335812e-05, + "loss": 0.386, + "step": 23258 + }, + { + "epoch": 1.2213084699596668, + "grad_norm": 3.9499247074127197, + "learning_rate": 1.480327000111083e-05, + "loss": 0.4012, + "step": 23316 + }, + { + "epoch": 1.2243465507307107, + "grad_norm": 3.858048439025879, + "learning_rate": 1.477223480467934e-05, + "loss": 0.4277, + "step": 23374 + }, + { + "epoch": 1.2273846315017547, + "grad_norm": 2.3730807304382324, + "learning_rate": 1.4741139972867137e-05, + "loss": 0.3963, + "step": 23432 + }, + { + "epoch": 1.2304227122727986, + "grad_norm": 3.197197914123535, + "learning_rate": 1.4709985894245246e-05, + "loss": 0.4269, + "step": 23490 + }, + { + "epoch": 1.2334607930438426, + "grad_norm": 1.0639739036560059, + "learning_rate": 1.4678772958125043e-05, + "loss": 0.486, + "step": 23548 + }, + { + "epoch": 1.2364988738148865, + "grad_norm": 4.4826202392578125, + "learning_rate": 1.4647501554553417e-05, + "loss": 0.3983, + "step": 23606 + }, + { + "epoch": 1.2395369545859305, + "grad_norm": 3.2974660396575928, + "learning_rate": 1.4616172074307886e-05, + "loss": 0.3893, + "step": 23664 + }, + { + "epoch": 1.2425750353569744, + "grad_norm": 4.817580223083496, + "learning_rate": 1.4584784908891705e-05, + "loss": 0.4044, + "step": 23722 + }, + { + "epoch": 1.2456131161280184, + "grad_norm": 4.333406925201416, + "learning_rate": 1.455334045052899e-05, + "loss": 0.3787, + "step": 23780 + }, + { + "epoch": 1.2486511968990623, + "grad_norm": 4.401366233825684, + "learning_rate": 1.4521839092159802e-05, + "loss": 0.3653, + "step": 23838 + }, + { + "epoch": 1.2516892776701063, + "grad_norm": 1.6555136442184448, + "learning_rate": 1.4490281227435248e-05, + "loss": 0.3371, + "step": 23896 + }, + { + "epoch": 1.2547273584411502, + "grad_norm": 3.3965320587158203, + "learning_rate": 1.4459212792449709e-05, + "loss": 0.3923, + "step": 23954 + }, + { + "epoch": 1.2577654392121942, + "grad_norm": 1.8068640232086182, + "learning_rate": 1.4427544056073314e-05, + "loss": 0.4196, + "step": 24012 + }, + { + "epoch": 1.2608035199832381, + "grad_norm": 2.8050827980041504, + "learning_rate": 1.4395819991682645e-05, + "loss": 0.448, + "step": 24070 + }, + { + "epoch": 1.263841600754282, + "grad_norm": 3.339679002761841, + "learning_rate": 1.4364040995711812e-05, + "loss": 0.4015, + "step": 24128 + }, + { + "epoch": 1.266879681525326, + "grad_norm": 1.8822154998779297, + "learning_rate": 1.4332207465281365e-05, + "loss": 0.3494, + "step": 24186 + }, + { + "epoch": 1.26991776229637, + "grad_norm": 2.0527522563934326, + "learning_rate": 1.4300319798193339e-05, + "loss": 0.4453, + "step": 24244 + }, + { + "epoch": 1.272955843067414, + "grad_norm": 2.963618516921997, + "learning_rate": 1.4268378392926277e-05, + "loss": 0.451, + "step": 24302 + }, + { + "epoch": 1.2759939238384579, + "grad_norm": 2.991400957107544, + "learning_rate": 1.4236383648630245e-05, + "loss": 0.3719, + "step": 24360 + }, + { + "epoch": 1.2790320046095018, + "grad_norm": 1.891254186630249, + "learning_rate": 1.4204335965121862e-05, + "loss": 0.334, + "step": 24418 + }, + { + "epoch": 1.2820700853805458, + "grad_norm": 3.0473315715789795, + "learning_rate": 1.4172235742879283e-05, + "loss": 0.4188, + "step": 24476 + }, + { + "epoch": 1.2851081661515897, + "grad_norm": 1.8583441972732544, + "learning_rate": 1.414008338303721e-05, + "loss": 0.4133, + "step": 24534 + }, + { + "epoch": 1.2881462469226337, + "grad_norm": 2.5623059272766113, + "learning_rate": 1.4107879287381872e-05, + "loss": 0.4317, + "step": 24592 + }, + { + "epoch": 1.2911843276936776, + "grad_norm": 1.2033942937850952, + "learning_rate": 1.4075623858346e-05, + "loss": 0.3494, + "step": 24650 + }, + { + "epoch": 1.2942224084647216, + "grad_norm": 1.8575024604797363, + "learning_rate": 1.404331749900381e-05, + "loss": 0.4533, + "step": 24708 + }, + { + "epoch": 1.2972604892357655, + "grad_norm": 2.617656946182251, + "learning_rate": 1.4010960613065956e-05, + "loss": 0.2822, + "step": 24766 + }, + { + "epoch": 1.3002985700068095, + "grad_norm": 1.466433048248291, + "learning_rate": 1.397855360487449e-05, + "loss": 0.3965, + "step": 24824 + }, + { + "epoch": 1.3033366507778534, + "grad_norm": 2.6398956775665283, + "learning_rate": 1.3946096879397808e-05, + "loss": 0.4247, + "step": 24882 + }, + { + "epoch": 1.3063747315488974, + "grad_norm": 3.450256109237671, + "learning_rate": 1.3913590842225589e-05, + "loss": 0.3854, + "step": 24940 + }, + { + "epoch": 1.3094128123199413, + "grad_norm": 3.451451063156128, + "learning_rate": 1.388103589956372e-05, + "loss": 0.3852, + "step": 24998 + }, + { + "epoch": 1.3124508930909853, + "grad_norm": 3.278472900390625, + "learning_rate": 1.3848432458229241e-05, + "loss": 0.3999, + "step": 25056 + }, + { + "epoch": 1.3154889738620292, + "grad_norm": 0.6535636782646179, + "learning_rate": 1.381578092564524e-05, + "loss": 0.4151, + "step": 25114 + }, + { + "epoch": 1.3185270546330732, + "grad_norm": 3.161670207977295, + "learning_rate": 1.378308170983576e-05, + "loss": 0.3413, + "step": 25172 + }, + { + "epoch": 1.3215651354041171, + "grad_norm": 3.4061434268951416, + "learning_rate": 1.375033521942072e-05, + "loss": 0.3902, + "step": 25230 + }, + { + "epoch": 1.324603216175161, + "grad_norm": 3.568188428878784, + "learning_rate": 1.3717541863610799e-05, + "loss": 0.3391, + "step": 25288 + }, + { + "epoch": 1.327641296946205, + "grad_norm": 1.4888482093811035, + "learning_rate": 1.368470205220231e-05, + "loss": 0.4018, + "step": 25346 + }, + { + "epoch": 1.330679377717249, + "grad_norm": 3.4848194122314453, + "learning_rate": 1.36518161955721e-05, + "loss": 0.4045, + "step": 25404 + }, + { + "epoch": 1.333717458488293, + "grad_norm": 2.341862916946411, + "learning_rate": 1.3618884704672413e-05, + "loss": 0.3615, + "step": 25462 + }, + { + "epoch": 1.3367555392593369, + "grad_norm": 4.151137351989746, + "learning_rate": 1.3585907991025737e-05, + "loss": 0.4122, + "step": 25520 + }, + { + "epoch": 1.3397936200303808, + "grad_norm": 3.4976377487182617, + "learning_rate": 1.3552886466719696e-05, + "loss": 0.3674, + "step": 25578 + }, + { + "epoch": 1.3428317008014248, + "grad_norm": 3.3959484100341797, + "learning_rate": 1.3519820544401882e-05, + "loss": 0.3742, + "step": 25636 + }, + { + "epoch": 1.3458697815724687, + "grad_norm": 1.3133649826049805, + "learning_rate": 1.3486710637274687e-05, + "loss": 0.3388, + "step": 25694 + }, + { + "epoch": 1.3489078623435127, + "grad_norm": 3.167113780975342, + "learning_rate": 1.3453557159090159e-05, + "loss": 0.3886, + "step": 25752 + }, + { + "epoch": 1.3519459431145566, + "grad_norm": 2.9960269927978516, + "learning_rate": 1.342036052414482e-05, + "loss": 0.3896, + "step": 25810 + }, + { + "epoch": 1.3549840238856006, + "grad_norm": 3.482513427734375, + "learning_rate": 1.3387121147274498e-05, + "loss": 0.4203, + "step": 25868 + }, + { + "epoch": 1.3580221046566445, + "grad_norm": 2.4168648719787598, + "learning_rate": 1.3353839443849134e-05, + "loss": 0.343, + "step": 25926 + }, + { + "epoch": 1.3610601854276885, + "grad_norm": 0.6455244421958923, + "learning_rate": 1.33205158297676e-05, + "loss": 0.3872, + "step": 25984 + }, + { + "epoch": 1.3640982661987324, + "grad_norm": 4.288425445556641, + "learning_rate": 1.3287150721452488e-05, + "loss": 0.3655, + "step": 26042 + }, + { + "epoch": 1.3671363469697764, + "grad_norm": 2.454954147338867, + "learning_rate": 1.3254320850234712e-05, + "loss": 0.4089, + "step": 26100 + }, + { + "epoch": 1.3701744277408203, + "grad_norm": 3.7248048782348633, + "learning_rate": 1.322087470228127e-05, + "loss": 0.3784, + "step": 26158 + }, + { + "epoch": 1.3732125085118643, + "grad_norm": 1.3180829286575317, + "learning_rate": 1.3187388305241823e-05, + "loss": 0.3933, + "step": 26216 + }, + { + "epoch": 1.3762505892829082, + "grad_norm": 1.625256061553955, + "learning_rate": 1.3153862077573157e-05, + "loss": 0.4252, + "step": 26274 + }, + { + "epoch": 1.3792886700539522, + "grad_norm": 2.6077065467834473, + "learning_rate": 1.312029643822979e-05, + "loss": 0.385, + "step": 26332 + }, + { + "epoch": 1.3823267508249961, + "grad_norm": 3.2308919429779053, + "learning_rate": 1.3086691806658749e-05, + "loss": 0.4137, + "step": 26390 + }, + { + "epoch": 1.38536483159604, + "grad_norm": 3.302467107772827, + "learning_rate": 1.3053048602794315e-05, + "loss": 0.3842, + "step": 26448 + }, + { + "epoch": 1.388402912367084, + "grad_norm": 2.4501612186431885, + "learning_rate": 1.3019367247052781e-05, + "loss": 0.3889, + "step": 26506 + }, + { + "epoch": 1.391440993138128, + "grad_norm": 3.628817319869995, + "learning_rate": 1.29856481603272e-05, + "loss": 0.3531, + "step": 26564 + }, + { + "epoch": 1.394479073909172, + "grad_norm": 2.43445086479187, + "learning_rate": 1.2951891763982125e-05, + "loss": 0.402, + "step": 26622 + }, + { + "epoch": 1.3975171546802159, + "grad_norm": 0.9383435845375061, + "learning_rate": 1.2918098479848336e-05, + "loss": 0.445, + "step": 26680 + }, + { + "epoch": 1.4005552354512598, + "grad_norm": 3.4569339752197266, + "learning_rate": 1.2884268730217577e-05, + "loss": 0.3929, + "step": 26738 + }, + { + "epoch": 1.4035933162223038, + "grad_norm": 1.1176038980484009, + "learning_rate": 1.2850402937837283e-05, + "loss": 0.4028, + "step": 26796 + }, + { + "epoch": 1.4066313969933477, + "grad_norm": 3.2979259490966797, + "learning_rate": 1.2816501525905282e-05, + "loss": 0.4184, + "step": 26854 + }, + { + "epoch": 1.4096694777643917, + "grad_norm": 3.3147165775299072, + "learning_rate": 1.2782564918064522e-05, + "loss": 0.4289, + "step": 26912 + }, + { + "epoch": 1.4127075585354356, + "grad_norm": 3.0880119800567627, + "learning_rate": 1.2748593538397764e-05, + "loss": 0.4247, + "step": 26970 + }, + { + "epoch": 1.4157456393064796, + "grad_norm": 2.7575788497924805, + "learning_rate": 1.27145878114223e-05, + "loss": 0.3868, + "step": 27028 + }, + { + "epoch": 1.4187837200775235, + "grad_norm": 2.9284110069274902, + "learning_rate": 1.2680548162084614e-05, + "loss": 0.4337, + "step": 27086 + }, + { + "epoch": 1.4218218008485675, + "grad_norm": 2.6792969703674316, + "learning_rate": 1.2646475015755124e-05, + "loss": 0.4215, + "step": 27144 + }, + { + "epoch": 1.4248598816196114, + "grad_norm": 2.336975574493408, + "learning_rate": 1.261236879822282e-05, + "loss": 0.407, + "step": 27202 + }, + { + "epoch": 1.4278979623906554, + "grad_norm": 2.287140369415283, + "learning_rate": 1.2578818810950262e-05, + "loss": 0.3581, + "step": 27260 + }, + { + "epoch": 1.4309360431616993, + "grad_norm": 3.0004403591156006, + "learning_rate": 1.2544648281900015e-05, + "loss": 0.412, + "step": 27318 + }, + { + "epoch": 1.4339741239327433, + "grad_norm": 0.6953569650650024, + "learning_rate": 1.2510445954106563e-05, + "loss": 0.4086, + "step": 27376 + }, + { + "epoch": 1.437012204703787, + "grad_norm": 1.8512517213821411, + "learning_rate": 1.2476212254973198e-05, + "loss": 0.367, + "step": 27434 + }, + { + "epoch": 1.4400502854748312, + "grad_norm": 3.226573944091797, + "learning_rate": 1.2441947612295222e-05, + "loss": 0.4576, + "step": 27492 + }, + { + "epoch": 1.4430883662458749, + "grad_norm": 2.18180251121521, + "learning_rate": 1.2407652454254632e-05, + "loss": 0.406, + "step": 27550 + }, + { + "epoch": 1.446126447016919, + "grad_norm": 2.2790727615356445, + "learning_rate": 1.2373327209414759e-05, + "loss": 0.3834, + "step": 27608 + }, + { + "epoch": 1.4491645277879628, + "grad_norm": 3.7362709045410156, + "learning_rate": 1.2338972306714889e-05, + "loss": 0.3668, + "step": 27666 + }, + { + "epoch": 1.452202608559007, + "grad_norm": 3.6803033351898193, + "learning_rate": 1.2304588175464941e-05, + "loss": 0.3552, + "step": 27724 + }, + { + "epoch": 1.4552406893300507, + "grad_norm": 4.011587142944336, + "learning_rate": 1.2270175245340074e-05, + "loss": 0.3629, + "step": 27782 + }, + { + "epoch": 1.4582787701010949, + "grad_norm": 2.53393292427063, + "learning_rate": 1.223573394637533e-05, + "loss": 0.4304, + "step": 27840 + }, + { + "epoch": 1.4613168508721386, + "grad_norm": 1.7455615997314453, + "learning_rate": 1.2201264708960252e-05, + "loss": 0.3834, + "step": 27898 + }, + { + "epoch": 1.4643549316431828, + "grad_norm": 3.8155839443206787, + "learning_rate": 1.2166767963833519e-05, + "loss": 0.3604, + "step": 27956 + }, + { + "epoch": 1.4673930124142265, + "grad_norm": 2.2630105018615723, + "learning_rate": 1.213224414207755e-05, + "loss": 0.3833, + "step": 28014 + }, + { + "epoch": 1.4704310931852707, + "grad_norm": 2.6760213375091553, + "learning_rate": 1.209769367511312e-05, + "loss": 0.3574, + "step": 28072 + }, + { + "epoch": 1.4734691739563144, + "grad_norm": 3.230642318725586, + "learning_rate": 1.206311699469398e-05, + "loss": 0.4048, + "step": 28130 + }, + { + "epoch": 1.4765072547273586, + "grad_norm": 4.475136756896973, + "learning_rate": 1.2028514532901445e-05, + "loss": 0.3787, + "step": 28188 + }, + { + "epoch": 1.4795453354984023, + "grad_norm": 3.1134748458862305, + "learning_rate": 1.1993886722139004e-05, + "loss": 0.4555, + "step": 28246 + }, + { + "epoch": 1.4825834162694465, + "grad_norm": 1.694558024406433, + "learning_rate": 1.19598316646051e-05, + "loss": 0.3722, + "step": 28304 + }, + { + "epoch": 1.4856214970404902, + "grad_norm": 2.7863929271698, + "learning_rate": 1.1925154872829044e-05, + "loss": 0.3436, + "step": 28362 + }, + { + "epoch": 1.4886595778115344, + "grad_norm": 1.5842992067337036, + "learning_rate": 1.189045402369863e-05, + "loss": 0.4055, + "step": 28420 + }, + { + "epoch": 1.491697658582578, + "grad_norm": 3.305001974105835, + "learning_rate": 1.185572955084683e-05, + "loss": 0.4177, + "step": 28478 + }, + { + "epoch": 1.4947357393536223, + "grad_norm": 1.0446484088897705, + "learning_rate": 1.1820981888201819e-05, + "loss": 0.3333, + "step": 28536 + }, + { + "epoch": 1.497773820124666, + "grad_norm": 2.8945727348327637, + "learning_rate": 1.178621146998157e-05, + "loss": 0.4184, + "step": 28594 + }, + { + "epoch": 1.5008119008957101, + "grad_norm": 2.099456548690796, + "learning_rate": 1.1751418730688405e-05, + "loss": 0.3384, + "step": 28652 + }, + { + "epoch": 1.5038499816667539, + "grad_norm": 2.770231246948242, + "learning_rate": 1.1716604105103582e-05, + "loss": 0.4045, + "step": 28710 + }, + { + "epoch": 1.506888062437798, + "grad_norm": 2.9615721702575684, + "learning_rate": 1.1681768028281859e-05, + "loss": 0.358, + "step": 28768 + }, + { + "epoch": 1.5099261432088418, + "grad_norm": 2.6768476963043213, + "learning_rate": 1.1646910935546055e-05, + "loss": 0.3421, + "step": 28826 + }, + { + "epoch": 1.512964223979886, + "grad_norm": 2.7448065280914307, + "learning_rate": 1.1612033262481607e-05, + "loss": 0.3749, + "step": 28884 + }, + { + "epoch": 1.5160023047509297, + "grad_norm": 2.4983344078063965, + "learning_rate": 1.1577135444931136e-05, + "loss": 0.2883, + "step": 28942 + }, + { + "epoch": 1.5190403855219738, + "grad_norm": 3.426910400390625, + "learning_rate": 1.1542217918988993e-05, + "loss": 0.4035, + "step": 29000 + }, + { + "epoch": 1.5220784662930176, + "grad_norm": 2.1255111694335938, + "learning_rate": 1.1507281120995808e-05, + "loss": 0.3872, + "step": 29058 + }, + { + "epoch": 1.5251165470640617, + "grad_norm": 2.398613691329956, + "learning_rate": 1.147232548753304e-05, + "loss": 0.3485, + "step": 29116 + }, + { + "epoch": 1.5281546278351055, + "grad_norm": 1.043369174003601, + "learning_rate": 1.1437351455417533e-05, + "loss": 0.3921, + "step": 29174 + }, + { + "epoch": 1.5311927086061496, + "grad_norm": 0.4801720976829529, + "learning_rate": 1.1402359461696034e-05, + "loss": 0.3838, + "step": 29232 + }, + { + "epoch": 1.5342307893771934, + "grad_norm": 1.7236027717590332, + "learning_rate": 1.1367349943639748e-05, + "loss": 0.4114, + "step": 29290 + }, + { + "epoch": 1.5372688701482375, + "grad_norm": 2.06709885597229, + "learning_rate": 1.1332323338738873e-05, + "loss": 0.3896, + "step": 29348 + }, + { + "epoch": 1.5403069509192813, + "grad_norm": 2.651384115219116, + "learning_rate": 1.1297280084697126e-05, + "loss": 0.3873, + "step": 29406 + }, + { + "epoch": 1.5433450316903254, + "grad_norm": 1.6569814682006836, + "learning_rate": 1.1262825227855019e-05, + "loss": 0.3591, + "step": 29464 + }, + { + "epoch": 1.5463831124613692, + "grad_norm": 1.6780743598937988, + "learning_rate": 1.1227750257706836e-05, + "loss": 0.314, + "step": 29522 + }, + { + "epoch": 1.5494211932324133, + "grad_norm": 2.9294652938842773, + "learning_rate": 1.1192659945196629e-05, + "loss": 0.3737, + "step": 29580 + }, + { + "epoch": 1.552459274003457, + "grad_norm": 2.5051159858703613, + "learning_rate": 1.115755472882423e-05, + "loss": 0.3653, + "step": 29638 + }, + { + "epoch": 1.5554973547745012, + "grad_norm": 0.6437486410140991, + "learning_rate": 1.1122435047275705e-05, + "loss": 0.3389, + "step": 29696 + }, + { + "epoch": 1.558535435545545, + "grad_norm": 2.644324541091919, + "learning_rate": 1.1087301339417893e-05, + "loss": 0.3918, + "step": 29754 + }, + { + "epoch": 1.561573516316589, + "grad_norm": 2.6001169681549072, + "learning_rate": 1.1052154044292904e-05, + "loss": 0.3319, + "step": 29812 + }, + { + "epoch": 1.5646115970876329, + "grad_norm": 1.9335983991622925, + "learning_rate": 1.101699360111264e-05, + "loss": 0.4825, + "step": 29870 + }, + { + "epoch": 1.5676496778586768, + "grad_norm": 2.186579704284668, + "learning_rate": 1.0981820449253304e-05, + "loss": 0.3636, + "step": 29928 + }, + { + "epoch": 1.5706877586297208, + "grad_norm": 1.987005591392517, + "learning_rate": 1.0946635028249916e-05, + "loss": 0.4214, + "step": 29986 + }, + { + "epoch": 1.5737258394007647, + "grad_norm": 2.18890380859375, + "learning_rate": 1.0911437777790807e-05, + "loss": 0.3795, + "step": 30044 + }, + { + "epoch": 1.5767639201718087, + "grad_norm": 2.2321784496307373, + "learning_rate": 1.0876229137712135e-05, + "loss": 0.3954, + "step": 30102 + }, + { + "epoch": 1.5798020009428526, + "grad_norm": 3.391721725463867, + "learning_rate": 1.0841009547992398e-05, + "loss": 0.4151, + "step": 30160 + }, + { + "epoch": 1.5828400817138966, + "grad_norm": 4.004073619842529, + "learning_rate": 1.0805779448746907e-05, + "loss": 0.3318, + "step": 30218 + }, + { + "epoch": 1.5858781624849405, + "grad_norm": 2.895787477493286, + "learning_rate": 1.0770539280222312e-05, + "loss": 0.3688, + "step": 30276 + }, + { + "epoch": 1.5889162432559845, + "grad_norm": 2.4140095710754395, + "learning_rate": 1.073528948279109e-05, + "loss": 0.4059, + "step": 30334 + }, + { + "epoch": 1.5919543240270284, + "grad_norm": 3.6965787410736084, + "learning_rate": 1.070003049694605e-05, + "loss": 0.359, + "step": 30392 + }, + { + "epoch": 1.5949924047980724, + "grad_norm": 2.7172248363494873, + "learning_rate": 1.0664762763294812e-05, + "loss": 0.4282, + "step": 30450 + }, + { + "epoch": 1.5980304855691163, + "grad_norm": 3.0504825115203857, + "learning_rate": 1.0629486722554316e-05, + "loss": 0.3838, + "step": 30508 + }, + { + "epoch": 1.6010685663401603, + "grad_norm": 1.9200332164764404, + "learning_rate": 1.0594202815545319e-05, + "loss": 0.3814, + "step": 30566 + }, + { + "epoch": 1.6041066471112042, + "grad_norm": 3.9770891666412354, + "learning_rate": 1.0558911483186856e-05, + "loss": 0.3785, + "step": 30624 + }, + { + "epoch": 1.6071447278822482, + "grad_norm": 2.1418545246124268, + "learning_rate": 1.0523613166490776e-05, + "loss": 0.3282, + "step": 30682 + }, + { + "epoch": 1.610182808653292, + "grad_norm": 2.4927940368652344, + "learning_rate": 1.0488308306556192e-05, + "loss": 0.3979, + "step": 30740 + }, + { + "epoch": 1.613220889424336, + "grad_norm": 1.7132062911987305, + "learning_rate": 1.0452997344563982e-05, + "loss": 0.371, + "step": 30798 + }, + { + "epoch": 1.61625897019538, + "grad_norm": 2.9751667976379395, + "learning_rate": 1.0417680721771288e-05, + "loss": 0.3782, + "step": 30856 + }, + { + "epoch": 1.619297050966424, + "grad_norm": 3.013737678527832, + "learning_rate": 1.0382358879505982e-05, + "loss": 0.3899, + "step": 30914 + }, + { + "epoch": 1.622335131737468, + "grad_norm": 3.8316636085510254, + "learning_rate": 1.0347032259161162e-05, + "loss": 0.3942, + "step": 30972 + }, + { + "epoch": 1.6253732125085119, + "grad_norm": 2.597729444503784, + "learning_rate": 1.031170130218964e-05, + "loss": 0.3941, + "step": 31030 + }, + { + "epoch": 1.6284112932795558, + "grad_norm": 3.033374547958374, + "learning_rate": 1.0276975702213507e-05, + "loss": 0.3367, + "step": 31088 + }, + { + "epoch": 1.6314493740505998, + "grad_norm": 3.6567564010620117, + "learning_rate": 1.0241637452361323e-05, + "loss": 0.3943, + "step": 31146 + }, + { + "epoch": 1.6344874548216437, + "grad_norm": 2.558410167694092, + "learning_rate": 1.0206296182929831e-05, + "loss": 0.4043, + "step": 31204 + }, + { + "epoch": 1.6375255355926877, + "grad_norm": 1.9596946239471436, + "learning_rate": 1.01709523355549e-05, + "loss": 0.3589, + "step": 31262 + }, + { + "epoch": 1.6405636163637316, + "grad_norm": 3.5439186096191406, + "learning_rate": 1.013560635190461e-05, + "loss": 0.3769, + "step": 31320 + }, + { + "epoch": 1.6436016971347756, + "grad_norm": 2.5455784797668457, + "learning_rate": 1.010025867367374e-05, + "loss": 0.3099, + "step": 31378 + }, + { + "epoch": 1.6466397779058195, + "grad_norm": 2.672985792160034, + "learning_rate": 1.0064909742578242e-05, + "loss": 0.3384, + "step": 31436 + }, + { + "epoch": 1.6496778586768635, + "grad_norm": 3.4179306030273438, + "learning_rate": 1.002956000034973e-05, + "loss": 0.418, + "step": 31494 + }, + { + "epoch": 1.6527159394479074, + "grad_norm": 3.8588337898254395, + "learning_rate": 9.99420988872995e-06, + "loss": 0.4403, + "step": 31552 + }, + { + "epoch": 1.6557540202189514, + "grad_norm": 2.2216336727142334, + "learning_rate": 9.958859849465258e-06, + "loss": 0.3876, + "step": 31610 + }, + { + "epoch": 1.6587921009899953, + "grad_norm": 5.450603485107422, + "learning_rate": 9.923510324301119e-06, + "loss": 0.3786, + "step": 31668 + }, + { + "epoch": 1.6618301817610392, + "grad_norm": 3.075505256652832, + "learning_rate": 9.888161754976566e-06, + "loss": 0.3599, + "step": 31726 + }, + { + "epoch": 1.6648682625320832, + "grad_norm": 0.3762458562850952, + "learning_rate": 9.852814583218681e-06, + "loss": 0.3758, + "step": 31784 + }, + { + "epoch": 1.6679063433031271, + "grad_norm": 4.629846572875977, + "learning_rate": 9.817469250737098e-06, + "loss": 0.3707, + "step": 31842 + }, + { + "epoch": 1.670944424074171, + "grad_norm": 2.84089732170105, + "learning_rate": 9.782126199218453e-06, + "loss": 0.385, + "step": 31900 + }, + { + "epoch": 1.673982504845215, + "grad_norm": 3.581580400466919, + "learning_rate": 9.74678587032088e-06, + "loss": 0.346, + "step": 31958 + }, + { + "epoch": 1.677020585616259, + "grad_norm": 2.798215866088867, + "learning_rate": 9.711448705668503e-06, + "loss": 0.4142, + "step": 32016 + }, + { + "epoch": 1.680058666387303, + "grad_norm": 2.810606002807617, + "learning_rate": 9.676115146845887e-06, + "loss": 0.3711, + "step": 32074 + }, + { + "epoch": 1.683096747158347, + "grad_norm": 0.6601367592811584, + "learning_rate": 9.640785635392543e-06, + "loss": 0.4049, + "step": 32132 + }, + { + "epoch": 1.6861348279293908, + "grad_norm": 2.879519462585449, + "learning_rate": 9.60546061279741e-06, + "loss": 0.385, + "step": 32190 + }, + { + "epoch": 1.6891729087004348, + "grad_norm": 4.493093013763428, + "learning_rate": 9.570140520493336e-06, + "loss": 0.385, + "step": 32248 + }, + { + "epoch": 1.6922109894714787, + "grad_norm": 1.8380403518676758, + "learning_rate": 9.53482579985154e-06, + "loss": 0.3804, + "step": 32306 + }, + { + "epoch": 1.6952490702425227, + "grad_norm": 0.3858829736709595, + "learning_rate": 9.499516892176139e-06, + "loss": 0.3626, + "step": 32364 + }, + { + "epoch": 1.6982871510135666, + "grad_norm": 3.132283926010132, + "learning_rate": 9.464214238698589e-06, + "loss": 0.3205, + "step": 32422 + }, + { + "epoch": 1.7013252317846106, + "grad_norm": 2.6827683448791504, + "learning_rate": 9.428918280572203e-06, + "loss": 0.3605, + "step": 32480 + }, + { + "epoch": 1.7043633125556545, + "grad_norm": 3.2188720703125, + "learning_rate": 9.394237823897566e-06, + "loss": 0.4137, + "step": 32538 + }, + { + "epoch": 1.7074013933266985, + "grad_norm": 2.3144068717956543, + "learning_rate": 9.35895644521292e-06, + "loss": 0.4068, + "step": 32596 + }, + { + "epoch": 1.7104394740977424, + "grad_norm": 2.736628293991089, + "learning_rate": 9.323683077214672e-06, + "loss": 0.3743, + "step": 32654 + }, + { + "epoch": 1.7134775548687864, + "grad_norm": 3.6730895042419434, + "learning_rate": 9.28841816069017e-06, + "loss": 0.3719, + "step": 32712 + }, + { + "epoch": 1.7165156356398303, + "grad_norm": 4.749345779418945, + "learning_rate": 9.253162136321158e-06, + "loss": 0.3911, + "step": 32770 + }, + { + "epoch": 1.7195537164108743, + "grad_norm": 1.4852901697158813, + "learning_rate": 9.217915444678246e-06, + "loss": 0.4266, + "step": 32828 + }, + { + "epoch": 1.7225917971819182, + "grad_norm": 2.9961495399475098, + "learning_rate": 9.182678526215428e-06, + "loss": 0.3772, + "step": 32886 + }, + { + "epoch": 1.725629877952962, + "grad_norm": 2.647796154022217, + "learning_rate": 9.147451821264571e-06, + "loss": 0.3617, + "step": 32944 + }, + { + "epoch": 1.7286679587240061, + "grad_norm": 1.1617308855056763, + "learning_rate": 9.112235770029908e-06, + "loss": 0.2908, + "step": 33002 + }, + { + "epoch": 1.7317060394950499, + "grad_norm": 2.2398462295532227, + "learning_rate": 9.077030812582535e-06, + "loss": 0.3906, + "step": 33060 + }, + { + "epoch": 1.734744120266094, + "grad_norm": 3.6835782527923584, + "learning_rate": 9.041837388854928e-06, + "loss": 0.4513, + "step": 33118 + }, + { + "epoch": 1.7377822010371378, + "grad_norm": 2.344299077987671, + "learning_rate": 9.006655938635422e-06, + "loss": 0.4084, + "step": 33176 + }, + { + "epoch": 1.740820281808182, + "grad_norm": 3.1176178455352783, + "learning_rate": 8.971486901562728e-06, + "loss": 0.3877, + "step": 33234 + }, + { + "epoch": 1.7438583625792257, + "grad_norm": 1.6773039102554321, + "learning_rate": 8.936330717120455e-06, + "loss": 0.3969, + "step": 33292 + }, + { + "epoch": 1.7468964433502698, + "grad_norm": 3.7852532863616943, + "learning_rate": 8.901187824631575e-06, + "loss": 0.3343, + "step": 33350 + }, + { + "epoch": 1.7499345241213136, + "grad_norm": 2.9691007137298584, + "learning_rate": 8.866058663252984e-06, + "loss": 0.3555, + "step": 33408 + }, + { + "epoch": 1.7529726048923577, + "grad_norm": 2.2175984382629395, + "learning_rate": 8.830943671969973e-06, + "loss": 0.3689, + "step": 33466 + }, + { + "epoch": 1.7560106856634015, + "grad_norm": 4.200769901275635, + "learning_rate": 8.795843289590765e-06, + "loss": 0.3527, + "step": 33524 + }, + { + "epoch": 1.7590487664344456, + "grad_norm": 0.7551002502441406, + "learning_rate": 8.760757954741032e-06, + "loss": 0.3522, + "step": 33582 + }, + { + "epoch": 1.7620868472054894, + "grad_norm": 2.0418272018432617, + "learning_rate": 8.725688105858394e-06, + "loss": 0.4124, + "step": 33640 + }, + { + "epoch": 1.7651249279765335, + "grad_norm": 2.172964096069336, + "learning_rate": 8.690634181186958e-06, + "loss": 0.4303, + "step": 33698 + }, + { + "epoch": 1.7681630087475773, + "grad_norm": 1.0999021530151367, + "learning_rate": 8.655596618771844e-06, + "loss": 0.3028, + "step": 33756 + }, + { + "epoch": 1.7712010895186214, + "grad_norm": 5.170861721038818, + "learning_rate": 8.620575856453699e-06, + "loss": 0.3741, + "step": 33814 + }, + { + "epoch": 1.7742391702896652, + "grad_norm": 2.6932246685028076, + "learning_rate": 8.585572331863224e-06, + "loss": 0.4294, + "step": 33872 + }, + { + "epoch": 1.7772772510607093, + "grad_norm": 0.71751469373703, + "learning_rate": 8.551189534523404e-06, + "loss": 0.338, + "step": 33930 + }, + { + "epoch": 1.780315331831753, + "grad_norm": 2.928600788116455, + "learning_rate": 8.516221481428949e-06, + "loss": 0.4693, + "step": 33988 + }, + { + "epoch": 1.7833534126027972, + "grad_norm": 3.5148239135742188, + "learning_rate": 8.481271970107997e-06, + "loss": 0.4074, + "step": 34046 + }, + { + "epoch": 1.786391493373841, + "grad_norm": 2.133580207824707, + "learning_rate": 8.446341437300874e-06, + "loss": 0.4129, + "step": 34104 + }, + { + "epoch": 1.7894295741448851, + "grad_norm": 2.5686025619506836, + "learning_rate": 8.411430319510761e-06, + "loss": 0.3526, + "step": 34162 + }, + { + "epoch": 1.7924676549159289, + "grad_norm": 4.3522562980651855, + "learning_rate": 8.376539052998205e-06, + "loss": 0.3587, + "step": 34220 + }, + { + "epoch": 1.795505735686973, + "grad_norm": 3.373617649078369, + "learning_rate": 8.34166807377569e-06, + "loss": 0.4069, + "step": 34278 + }, + { + "epoch": 1.7985438164580168, + "grad_norm": 2.4383530616760254, + "learning_rate": 8.306817817602193e-06, + "loss": 0.4002, + "step": 34336 + }, + { + "epoch": 1.80001047614059, + "eval_accuracy": 0.8927125930786133, + "eval_loss": 0.4064957797527313, + "eval_runtime": 5473.958, + "eval_samples_per_second": 0.874, + "eval_steps_per_second": 0.874, + "step": 34364 + } + ], + "logging_steps": 58, + "max_steps": 57273, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 17182, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8389182403659694e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}