{ "best_metric": null, "best_model_checkpoint": null, "epoch": 48.16824966078698, "eval_steps": 3538, "global_step": 71000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.060379918588873815, "grad_norm": 9.192076683044434, "learning_rate": 3.353428786737001e-06, "loss": 4.0695, "step": 89 }, { "epoch": 0.12075983717774763, "grad_norm": 6.064570903778076, "learning_rate": 6.706857573474002e-06, "loss": 3.4544, "step": 178 }, { "epoch": 0.18113975576662145, "grad_norm": 7.58292293548584, "learning_rate": 1.0060286360211004e-05, "loss": 3.2275, "step": 267 }, { "epoch": 0.24151967435549526, "grad_norm": 12.527491569519043, "learning_rate": 1.3413715146948003e-05, "loss": 3.09, "step": 356 }, { "epoch": 0.3018995929443691, "grad_norm": 12.255288124084473, "learning_rate": 1.6767143933685002e-05, "loss": 3.0322, "step": 445 }, { "epoch": 0.3622795115332429, "grad_norm": 6.611302375793457, "learning_rate": 2.0120572720422008e-05, "loss": 2.9608, "step": 534 }, { "epoch": 0.4226594301221167, "grad_norm": 12.716562271118164, "learning_rate": 2.3474001507159007e-05, "loss": 2.892, "step": 623 }, { "epoch": 0.4830393487109905, "grad_norm": 6.532482624053955, "learning_rate": 2.6827430293896006e-05, "loss": 2.8621, "step": 712 }, { "epoch": 0.5434192672998643, "grad_norm": 25.10944175720215, "learning_rate": 3.0180859080633005e-05, "loss": 2.754, "step": 801 }, { "epoch": 0.6037991858887382, "grad_norm": 7.85612154006958, "learning_rate": 3.3534287867370005e-05, "loss": 2.7605, "step": 890 }, { "epoch": 0.664179104477612, "grad_norm": 9.118956565856934, "learning_rate": 3.688771665410701e-05, "loss": 2.7105, "step": 979 }, { "epoch": 0.7245590230664858, "grad_norm": 8.141679763793945, "learning_rate": 4.0241145440844016e-05, "loss": 2.6589, "step": 1068 }, { "epoch": 0.7849389416553596, "grad_norm": 18.962980270385742, "learning_rate": 4.3594574227581015e-05, "loss": 2.6437, "step": 1157 }, { "epoch": 0.8453188602442334, "grad_norm": 7.252344608306885, "learning_rate": 4.6948003014318015e-05, "loss": 2.6, "step": 1246 }, { "epoch": 0.9056987788331072, "grad_norm": 9.572624206542969, "learning_rate": 5.030143180105501e-05, "loss": 2.6181, "step": 1335 }, { "epoch": 0.966078697421981, "grad_norm": 7.212714672088623, "learning_rate": 5.365486058779201e-05, "loss": 2.5157, "step": 1424 }, { "epoch": 1.0264586160108549, "grad_norm": 8.035208702087402, "learning_rate": 5.700828937452901e-05, "loss": 2.5217, "step": 1513 }, { "epoch": 1.0868385345997287, "grad_norm": 10.060546875, "learning_rate": 6.036171816126601e-05, "loss": 2.4495, "step": 1602 }, { "epoch": 1.1472184531886025, "grad_norm": 8.2612943649292, "learning_rate": 6.371514694800301e-05, "loss": 2.4271, "step": 1691 }, { "epoch": 1.2075983717774763, "grad_norm": 7.717799186706543, "learning_rate": 6.706857573474001e-05, "loss": 2.4397, "step": 1780 }, { "epoch": 1.2679782903663501, "grad_norm": 11.520405769348145, "learning_rate": 7.042200452147701e-05, "loss": 2.4099, "step": 1869 }, { "epoch": 1.328358208955224, "grad_norm": 8.826777458190918, "learning_rate": 7.377543330821402e-05, "loss": 2.3349, "step": 1958 }, { "epoch": 1.3887381275440978, "grad_norm": 7.810181140899658, "learning_rate": 7.712886209495102e-05, "loss": 2.3491, "step": 2047 }, { "epoch": 1.4491180461329716, "grad_norm": 9.809256553649902, "learning_rate": 8.048229088168803e-05, "loss": 2.3778, "step": 2136 }, { "epoch": 1.5094979647218452, "grad_norm": 8.396034240722656, "learning_rate": 8.383571966842503e-05, "loss": 2.3717, "step": 2225 }, { "epoch": 1.5698778833107192, "grad_norm": 10.409805297851562, "learning_rate": 8.718914845516203e-05, "loss": 2.3207, "step": 2314 }, { "epoch": 1.6302578018995928, "grad_norm": 7.1885786056518555, "learning_rate": 9.054257724189903e-05, "loss": 2.3651, "step": 2403 }, { "epoch": 1.6906377204884668, "grad_norm": 7.768437385559082, "learning_rate": 9.389600602863603e-05, "loss": 2.3313, "step": 2492 }, { "epoch": 1.7510176390773404, "grad_norm": 5.661167144775391, "learning_rate": 9.724943481537303e-05, "loss": 2.2961, "step": 2581 }, { "epoch": 1.8113975576662145, "grad_norm": 8.26041030883789, "learning_rate": 9.999999141684668e-05, "loss": 2.2966, "step": 2670 }, { "epoch": 1.871777476255088, "grad_norm": 9.347060203552246, "learning_rate": 9.999963035487687e-05, "loss": 2.2934, "step": 2759 }, { "epoch": 1.932157394843962, "grad_norm": 9.274127006530762, "learning_rate": 9.999873814762094e-05, "loss": 2.2813, "step": 2848 }, { "epoch": 1.9925373134328357, "grad_norm": 8.107317924499512, "learning_rate": 9.999731480455674e-05, "loss": 2.2005, "step": 2937 }, { "epoch": 2.0529172320217097, "grad_norm": 8.258145332336426, "learning_rate": 9.999536034080447e-05, "loss": 2.1059, "step": 3026 }, { "epoch": 2.1132971506105833, "grad_norm": 6.388680458068848, "learning_rate": 9.999287477712633e-05, "loss": 2.0797, "step": 3115 }, { "epoch": 2.1736770691994574, "grad_norm": 6.251929759979248, "learning_rate": 9.998985813992645e-05, "loss": 2.0776, "step": 3204 }, { "epoch": 2.234056987788331, "grad_norm": 7.3807172775268555, "learning_rate": 9.998631046125051e-05, "loss": 2.0028, "step": 3293 }, { "epoch": 2.294436906377205, "grad_norm": 8.793547630310059, "learning_rate": 9.998223177878545e-05, "loss": 2.0789, "step": 3382 }, { "epoch": 2.3548168249660786, "grad_norm": 8.737523078918457, "learning_rate": 9.997762213585903e-05, "loss": 2.0322, "step": 3471 }, { "epoch": 2.400271370420624, "eval_accuracy": 0.193359375, "eval_loss": 3.569305419921875, "eval_runtime": 19.2577, "eval_samples_per_second": 26.587, "eval_steps_per_second": 0.208, "step": 3538 }, { "epoch": 2.4151967435549526, "grad_norm": 13.652155876159668, "learning_rate": 9.997248158143945e-05, "loss": 1.9617, "step": 3560 }, { "epoch": 2.475576662143826, "grad_norm": 6.501727104187012, "learning_rate": 9.99668101701347e-05, "loss": 2.0389, "step": 3649 }, { "epoch": 2.5359565807327002, "grad_norm": 11.862299919128418, "learning_rate": 9.99606079621921e-05, "loss": 2.031, "step": 3738 }, { "epoch": 2.596336499321574, "grad_norm": 7.8563551902771, "learning_rate": 9.995387502349764e-05, "loss": 1.9729, "step": 3827 }, { "epoch": 2.656716417910448, "grad_norm": 9.843242645263672, "learning_rate": 9.99466114255752e-05, "loss": 1.9323, "step": 3916 }, { "epoch": 2.7170963364993215, "grad_norm": 6.065842628479004, "learning_rate": 9.993881724558587e-05, "loss": 1.9465, "step": 4005 }, { "epoch": 2.7774762550881955, "grad_norm": 11.29686450958252, "learning_rate": 9.993049256632708e-05, "loss": 1.8912, "step": 4094 }, { "epoch": 2.837856173677069, "grad_norm": 11.959461212158203, "learning_rate": 9.99216374762318e-05, "loss": 1.9665, "step": 4183 }, { "epoch": 2.898236092265943, "grad_norm": 8.116125106811523, "learning_rate": 9.991225206936747e-05, "loss": 1.9158, "step": 4272 }, { "epoch": 2.9586160108548167, "grad_norm": 7.0811309814453125, "learning_rate": 9.990233644543517e-05, "loss": 1.929, "step": 4361 }, { "epoch": 3.0189959294436908, "grad_norm": 7.11262845993042, "learning_rate": 9.989189070976839e-05, "loss": 1.8259, "step": 4450 }, { "epoch": 3.0793758480325644, "grad_norm": 10.385732650756836, "learning_rate": 9.988091497333202e-05, "loss": 1.6678, "step": 4539 }, { "epoch": 3.1397557666214384, "grad_norm": 8.934700965881348, "learning_rate": 9.986940935272113e-05, "loss": 1.7278, "step": 4628 }, { "epoch": 3.200135685210312, "grad_norm": 11.203325271606445, "learning_rate": 9.985737397015975e-05, "loss": 1.6957, "step": 4717 }, { "epoch": 3.260515603799186, "grad_norm": 10.464749336242676, "learning_rate": 9.984480895349955e-05, "loss": 1.6743, "step": 4806 }, { "epoch": 3.3208955223880596, "grad_norm": 9.995988845825195, "learning_rate": 9.983171443621853e-05, "loss": 1.692, "step": 4895 }, { "epoch": 3.3812754409769337, "grad_norm": 11.267080307006836, "learning_rate": 9.981809055741953e-05, "loss": 1.6836, "step": 4984 }, { "epoch": 3.4416553595658073, "grad_norm": 9.267989158630371, "learning_rate": 9.980393746182879e-05, "loss": 1.6307, "step": 5073 }, { "epoch": 3.5020352781546813, "grad_norm": 10.1551513671875, "learning_rate": 9.978925529979441e-05, "loss": 1.6547, "step": 5162 }, { "epoch": 3.562415196743555, "grad_norm": 8.809422492980957, "learning_rate": 9.97740442272848e-05, "loss": 1.6293, "step": 5251 }, { "epoch": 3.622795115332429, "grad_norm": 9.327820777893066, "learning_rate": 9.975830440588692e-05, "loss": 1.6611, "step": 5340 }, { "epoch": 3.6831750339213025, "grad_norm": 9.966196060180664, "learning_rate": 9.974203600280465e-05, "loss": 1.6296, "step": 5429 }, { "epoch": 3.743554952510176, "grad_norm": 9.004570007324219, "learning_rate": 9.972523919085699e-05, "loss": 1.6335, "step": 5518 }, { "epoch": 3.80393487109905, "grad_norm": 8.515008926391602, "learning_rate": 9.97079141484762e-05, "loss": 1.6038, "step": 5607 }, { "epoch": 3.864314789687924, "grad_norm": 8.961952209472656, "learning_rate": 9.969006105970593e-05, "loss": 1.6298, "step": 5696 }, { "epoch": 3.924694708276798, "grad_norm": 11.294893264770508, "learning_rate": 9.967168011419927e-05, "loss": 1.588, "step": 5785 }, { "epoch": 3.9850746268656714, "grad_norm": 9.631956100463867, "learning_rate": 9.965277150721669e-05, "loss": 1.5871, "step": 5874 }, { "epoch": 4.045454545454546, "grad_norm": 7.810765266418457, "learning_rate": 9.963333543962405e-05, "loss": 1.4341, "step": 5963 }, { "epoch": 4.1058344640434195, "grad_norm": 10.34420394897461, "learning_rate": 9.961337211789039e-05, "loss": 1.4289, "step": 6052 }, { "epoch": 4.166214382632293, "grad_norm": 10.042189598083496, "learning_rate": 9.959288175408577e-05, "loss": 1.392, "step": 6141 }, { "epoch": 4.226594301221167, "grad_norm": 9.480945587158203, "learning_rate": 9.957186456587896e-05, "loss": 1.4407, "step": 6230 }, { "epoch": 4.286974219810041, "grad_norm": 10.059048652648926, "learning_rate": 9.955032077653525e-05, "loss": 1.4126, "step": 6319 }, { "epoch": 4.347354138398915, "grad_norm": 8.111236572265625, "learning_rate": 9.9528250614914e-05, "loss": 1.3697, "step": 6408 }, { "epoch": 4.407734056987788, "grad_norm": 7.836842060089111, "learning_rate": 9.950565431546612e-05, "loss": 1.4165, "step": 6497 }, { "epoch": 4.468113975576662, "grad_norm": 10.462437629699707, "learning_rate": 9.948253211823182e-05, "loss": 1.3629, "step": 6586 }, { "epoch": 4.5284938941655355, "grad_norm": 8.220168113708496, "learning_rate": 9.945888426883778e-05, "loss": 1.4402, "step": 6675 }, { "epoch": 4.58887381275441, "grad_norm": 8.81052017211914, "learning_rate": 9.943471101849477e-05, "loss": 1.4194, "step": 6764 }, { "epoch": 4.649253731343284, "grad_norm": 13.403641700744629, "learning_rate": 9.941001262399482e-05, "loss": 1.3943, "step": 6853 }, { "epoch": 4.709633649932157, "grad_norm": 8.589107513427734, "learning_rate": 9.938478934770861e-05, "loss": 1.3888, "step": 6942 }, { "epoch": 4.770013568521032, "grad_norm": 12.76234245300293, "learning_rate": 9.935904145758259e-05, "loss": 1.415, "step": 7031 }, { "epoch": 4.800542740841248, "eval_accuracy": 0.19140625, "eval_loss": 3.4613265991210938, "eval_runtime": 18.4, "eval_samples_per_second": 27.826, "eval_steps_per_second": 0.217, "step": 7076 }, { "epoch": 4.830393487109905, "grad_norm": 12.034464836120605, "learning_rate": 9.933276922713619e-05, "loss": 1.3772, "step": 7120 }, { "epoch": 4.890773405698779, "grad_norm": 10.00120735168457, "learning_rate": 9.930597293545891e-05, "loss": 1.3427, "step": 7209 }, { "epoch": 4.951153324287652, "grad_norm": 10.592118263244629, "learning_rate": 9.927865286720734e-05, "loss": 1.3681, "step": 7298 }, { "epoch": 5.011533242876526, "grad_norm": 10.557718276977539, "learning_rate": 9.925080931260211e-05, "loss": 1.3345, "step": 7387 }, { "epoch": 5.0719131614654005, "grad_norm": 8.036050796508789, "learning_rate": 9.922244256742491e-05, "loss": 1.1945, "step": 7476 }, { "epoch": 5.132293080054274, "grad_norm": 8.104681015014648, "learning_rate": 9.919355293301515e-05, "loss": 1.191, "step": 7565 }, { "epoch": 5.192672998643148, "grad_norm": 8.685461044311523, "learning_rate": 9.916414071626704e-05, "loss": 1.1867, "step": 7654 }, { "epoch": 5.253052917232022, "grad_norm": 7.221011638641357, "learning_rate": 9.913420622962606e-05, "loss": 1.1737, "step": 7743 }, { "epoch": 5.313432835820896, "grad_norm": 9.594326972961426, "learning_rate": 9.910374979108579e-05, "loss": 1.2058, "step": 7832 }, { "epoch": 5.373812754409769, "grad_norm": 8.146512031555176, "learning_rate": 9.907277172418449e-05, "loss": 1.2173, "step": 7921 }, { "epoch": 5.434192672998643, "grad_norm": 8.147337913513184, "learning_rate": 9.904127235800169e-05, "loss": 1.2047, "step": 8010 }, { "epoch": 5.4945725915875165, "grad_norm": 10.820967674255371, "learning_rate": 9.900925202715468e-05, "loss": 1.2287, "step": 8099 }, { "epoch": 5.554952510176391, "grad_norm": 10.343106269836426, "learning_rate": 9.897671107179488e-05, "loss": 1.1927, "step": 8188 }, { "epoch": 5.615332428765265, "grad_norm": 10.41408920288086, "learning_rate": 9.894364983760439e-05, "loss": 1.2321, "step": 8277 }, { "epoch": 5.675712347354138, "grad_norm": 9.299535751342773, "learning_rate": 9.891006867579217e-05, "loss": 1.2012, "step": 8366 }, { "epoch": 5.736092265943013, "grad_norm": 10.728792190551758, "learning_rate": 9.887596794309035e-05, "loss": 1.1812, "step": 8455 }, { "epoch": 5.796472184531886, "grad_norm": 7.432964324951172, "learning_rate": 9.884134800175053e-05, "loss": 1.1521, "step": 8544 }, { "epoch": 5.85685210312076, "grad_norm": 7.614875316619873, "learning_rate": 9.880620921953974e-05, "loss": 1.1487, "step": 8633 }, { "epoch": 5.9172320217096335, "grad_norm": 10.49835205078125, "learning_rate": 9.877055196973674e-05, "loss": 1.2014, "step": 8722 }, { "epoch": 5.977611940298507, "grad_norm": 7.404662609100342, "learning_rate": 9.873437663112794e-05, "loss": 1.1821, "step": 8811 }, { "epoch": 6.0379918588873815, "grad_norm": 9.661059379577637, "learning_rate": 9.869768358800339e-05, "loss": 1.0712, "step": 8900 }, { "epoch": 6.098371777476255, "grad_norm": 11.615382194519043, "learning_rate": 9.866047323015269e-05, "loss": 1.0516, "step": 8989 }, { "epoch": 6.158751696065129, "grad_norm": 10.21226978302002, "learning_rate": 9.86227459528609e-05, "loss": 1.0813, "step": 9078 }, { "epoch": 6.219131614654002, "grad_norm": 7.748682975769043, "learning_rate": 9.85845021569043e-05, "loss": 1.0604, "step": 9167 }, { "epoch": 6.279511533242877, "grad_norm": 10.797855377197266, "learning_rate": 9.854574224854611e-05, "loss": 1.0417, "step": 9256 }, { "epoch": 6.33989145183175, "grad_norm": 9.862196922302246, "learning_rate": 9.850646663953227e-05, "loss": 1.0171, "step": 9345 }, { "epoch": 6.400271370420624, "grad_norm": 10.341273307800293, "learning_rate": 9.84666757470869e-05, "loss": 1.0216, "step": 9434 }, { "epoch": 6.460651289009498, "grad_norm": 7.858868598937988, "learning_rate": 9.842636999390807e-05, "loss": 1.0705, "step": 9523 }, { "epoch": 6.521031207598372, "grad_norm": 10.367132186889648, "learning_rate": 9.838554980816312e-05, "loss": 1.0489, "step": 9612 }, { "epoch": 6.581411126187246, "grad_norm": 13.918916702270508, "learning_rate": 9.834421562348428e-05, "loss": 1.0753, "step": 9701 }, { "epoch": 6.641791044776119, "grad_norm": 9.345829010009766, "learning_rate": 9.830236787896391e-05, "loss": 1.0584, "step": 9790 }, { "epoch": 6.702170963364993, "grad_norm": 12.244129180908203, "learning_rate": 9.826000701914998e-05, "loss": 1.0402, "step": 9879 }, { "epoch": 6.762550881953867, "grad_norm": 8.918442726135254, "learning_rate": 9.821713349404119e-05, "loss": 1.0522, "step": 9968 }, { "epoch": 6.822930800542741, "grad_norm": 8.40239143371582, "learning_rate": 9.817374775908237e-05, "loss": 1.0277, "step": 10057 }, { "epoch": 6.8833107191316145, "grad_norm": 12.844498634338379, "learning_rate": 9.812985027515947e-05, "loss": 1.077, "step": 10146 }, { "epoch": 6.943690637720488, "grad_norm": 8.832013130187988, "learning_rate": 9.808544150859476e-05, "loss": 1.0239, "step": 10235 }, { "epoch": 7.004070556309363, "grad_norm": 5.591742038726807, "learning_rate": 9.804052193114189e-05, "loss": 1.0128, "step": 10324 }, { "epoch": 7.064450474898236, "grad_norm": 6.905234336853027, "learning_rate": 9.799509201998083e-05, "loss": 0.9019, "step": 10413 }, { "epoch": 7.12483039348711, "grad_norm": 9.313871383666992, "learning_rate": 9.794915225771279e-05, "loss": 0.9515, "step": 10502 }, { "epoch": 7.185210312075983, "grad_norm": 8.015510559082031, "learning_rate": 9.790270313235517e-05, "loss": 0.9301, "step": 10591 }, { "epoch": 7.200814111261873, "eval_accuracy": 0.17578125, "eval_loss": 3.7450790405273438, "eval_runtime": 18.7511, "eval_samples_per_second": 27.305, "eval_steps_per_second": 0.213, "step": 10614 }, { "epoch": 7.245590230664858, "grad_norm": 7.311409950256348, "learning_rate": 9.785574513733625e-05, "loss": 0.9172, "step": 10680 }, { "epoch": 7.3059701492537314, "grad_norm": 5.454108238220215, "learning_rate": 9.780827877149013e-05, "loss": 0.9372, "step": 10769 }, { "epoch": 7.366350067842605, "grad_norm": 5.830528736114502, "learning_rate": 9.776030453905122e-05, "loss": 0.9163, "step": 10858 }, { "epoch": 7.426729986431479, "grad_norm": 9.309490203857422, "learning_rate": 9.771182294964905e-05, "loss": 0.9528, "step": 10947 }, { "epoch": 7.487109905020353, "grad_norm": 11.420437812805176, "learning_rate": 9.76628345183028e-05, "loss": 0.9198, "step": 11036 }, { "epoch": 7.547489823609227, "grad_norm": 11.052990913391113, "learning_rate": 9.761333976541578e-05, "loss": 0.9231, "step": 11125 }, { "epoch": 7.6078697421981, "grad_norm": 7.378238201141357, "learning_rate": 9.756333921676999e-05, "loss": 0.9452, "step": 11214 }, { "epoch": 7.668249660786974, "grad_norm": 11.708273887634277, "learning_rate": 9.751283340352044e-05, "loss": 0.9163, "step": 11303 }, { "epoch": 7.728629579375848, "grad_norm": 5.919505596160889, "learning_rate": 9.746182286218964e-05, "loss": 0.9254, "step": 11392 }, { "epoch": 7.789009497964722, "grad_norm": 10.179853439331055, "learning_rate": 9.741030813466172e-05, "loss": 0.9317, "step": 11481 }, { "epoch": 7.849389416553596, "grad_norm": 8.873759269714355, "learning_rate": 9.735828976817683e-05, "loss": 0.9474, "step": 11570 }, { "epoch": 7.909769335142469, "grad_norm": 6.65983772277832, "learning_rate": 9.730576831532528e-05, "loss": 0.9013, "step": 11659 }, { "epoch": 7.970149253731344, "grad_norm": 7.311088562011719, "learning_rate": 9.725274433404164e-05, "loss": 0.9119, "step": 11748 }, { "epoch": 8.030529172320216, "grad_norm": 10.026205062866211, "learning_rate": 9.719921838759878e-05, "loss": 0.876, "step": 11837 }, { "epoch": 8.090909090909092, "grad_norm": 8.08633804321289, "learning_rate": 9.714519104460202e-05, "loss": 0.8151, "step": 11926 }, { "epoch": 8.151289009497965, "grad_norm": 6.680150508880615, "learning_rate": 9.709066287898298e-05, "loss": 0.8111, "step": 12015 }, { "epoch": 8.211668928086839, "grad_norm": 8.399514198303223, "learning_rate": 9.70356344699935e-05, "loss": 0.8207, "step": 12104 }, { "epoch": 8.272048846675712, "grad_norm": 10.127174377441406, "learning_rate": 9.698010640219951e-05, "loss": 0.84, "step": 12193 }, { "epoch": 8.332428765264586, "grad_norm": 7.315372943878174, "learning_rate": 9.692407926547478e-05, "loss": 0.8473, "step": 12282 }, { "epoch": 8.39280868385346, "grad_norm": 11.611318588256836, "learning_rate": 9.686755365499471e-05, "loss": 0.8423, "step": 12371 }, { "epoch": 8.453188602442333, "grad_norm": 7.9076008796691895, "learning_rate": 9.681053017122996e-05, "loss": 0.8445, "step": 12460 }, { "epoch": 8.513568521031207, "grad_norm": 9.092277526855469, "learning_rate": 9.675300941994012e-05, "loss": 0.8652, "step": 12549 }, { "epoch": 8.573948439620082, "grad_norm": 8.704888343811035, "learning_rate": 9.669499201216723e-05, "loss": 0.8312, "step": 12638 }, { "epoch": 8.634328358208956, "grad_norm": 13.215127944946289, "learning_rate": 9.663647856422928e-05, "loss": 0.8306, "step": 12727 }, { "epoch": 8.69470827679783, "grad_norm": 6.171853542327881, "learning_rate": 9.657746969771371e-05, "loss": 0.8504, "step": 12816 }, { "epoch": 8.755088195386703, "grad_norm": 9.066251754760742, "learning_rate": 9.651796603947076e-05, "loss": 0.8711, "step": 12905 }, { "epoch": 8.815468113975577, "grad_norm": 7.504266262054443, "learning_rate": 9.645796822160691e-05, "loss": 0.8312, "step": 12994 }, { "epoch": 8.87584803256445, "grad_norm": 11.219298362731934, "learning_rate": 9.639747688147798e-05, "loss": 0.8264, "step": 13083 }, { "epoch": 8.936227951153324, "grad_norm": 9.841562271118164, "learning_rate": 9.633649266168256e-05, "loss": 0.8097, "step": 13172 }, { "epoch": 8.996607869742197, "grad_norm": 6.924744606018066, "learning_rate": 9.627501621005505e-05, "loss": 0.8315, "step": 13261 }, { "epoch": 9.056987788331073, "grad_norm": 12.85659408569336, "learning_rate": 9.62130481796588e-05, "loss": 0.7768, "step": 13350 }, { "epoch": 9.117367706919946, "grad_norm": 7.802920341491699, "learning_rate": 9.615058922877926e-05, "loss": 0.7363, "step": 13439 }, { "epoch": 9.17774762550882, "grad_norm": 5.512497425079346, "learning_rate": 9.608764002091686e-05, "loss": 0.7568, "step": 13528 }, { "epoch": 9.238127544097694, "grad_norm": 7.84502649307251, "learning_rate": 9.602420122478004e-05, "loss": 0.7754, "step": 13617 }, { "epoch": 9.298507462686567, "grad_norm": 7.394598484039307, "learning_rate": 9.596027351427814e-05, "loss": 0.7862, "step": 13706 }, { "epoch": 9.35888738127544, "grad_norm": 8.552702903747559, "learning_rate": 9.589585756851422e-05, "loss": 0.7404, "step": 13795 }, { "epoch": 9.419267299864314, "grad_norm": 8.93039608001709, "learning_rate": 9.583095407177788e-05, "loss": 0.7368, "step": 13884 }, { "epoch": 9.479647218453188, "grad_norm": 8.229623794555664, "learning_rate": 9.576556371353791e-05, "loss": 0.7699, "step": 13973 }, { "epoch": 9.540027137042063, "grad_norm": 10.284710884094238, "learning_rate": 9.569968718843507e-05, "loss": 0.7811, "step": 14062 }, { "epoch": 9.600407055630937, "grad_norm": 5.939275741577148, "learning_rate": 9.563332519627466e-05, "loss": 0.7419, "step": 14151 }, { "epoch": 9.601085481682496, "eval_accuracy": 0.20703125, "eval_loss": 3.737224578857422, "eval_runtime": 17.1346, "eval_samples_per_second": 29.881, "eval_steps_per_second": 0.233, "step": 14152 }, { "epoch": 9.66078697421981, "grad_norm": 7.720785140991211, "learning_rate": 9.556647844201908e-05, "loss": 0.7578, "step": 14240 }, { "epoch": 9.721166892808684, "grad_norm": 7.313141345977783, "learning_rate": 9.549914763578031e-05, "loss": 0.7662, "step": 14329 }, { "epoch": 9.781546811397558, "grad_norm": 10.582131385803223, "learning_rate": 9.543133349281248e-05, "loss": 0.7503, "step": 14418 }, { "epoch": 9.841926729986431, "grad_norm": 5.272374153137207, "learning_rate": 9.536303673350415e-05, "loss": 0.7729, "step": 14507 }, { "epoch": 9.902306648575305, "grad_norm": 6.4560370445251465, "learning_rate": 9.529425808337074e-05, "loss": 0.7659, "step": 14596 }, { "epoch": 9.962686567164178, "grad_norm": 4.996959686279297, "learning_rate": 9.522499827304674e-05, "loss": 0.7348, "step": 14685 }, { "epoch": 10.023066485753052, "grad_norm": 5.831302165985107, "learning_rate": 9.515525803827803e-05, "loss": 0.7534, "step": 14774 }, { "epoch": 10.083446404341927, "grad_norm": 6.166038990020752, "learning_rate": 9.508503811991405e-05, "loss": 0.7, "step": 14863 }, { "epoch": 10.143826322930801, "grad_norm": 9.589017868041992, "learning_rate": 9.501433926389986e-05, "loss": 0.6585, "step": 14952 }, { "epoch": 10.204206241519675, "grad_norm": 8.026691436767578, "learning_rate": 9.49431622212683e-05, "loss": 0.6973, "step": 15041 }, { "epoch": 10.264586160108548, "grad_norm": 8.68213939666748, "learning_rate": 9.487150774813198e-05, "loss": 0.698, "step": 15130 }, { "epoch": 10.324966078697422, "grad_norm": 11.472238540649414, "learning_rate": 9.479937660567523e-05, "loss": 0.7192, "step": 15219 }, { "epoch": 10.385345997286295, "grad_norm": 6.372411251068115, "learning_rate": 9.472676956014605e-05, "loss": 0.6859, "step": 15308 }, { "epoch": 10.445725915875169, "grad_norm": 5.333731174468994, "learning_rate": 9.465368738284794e-05, "loss": 0.7025, "step": 15397 }, { "epoch": 10.506105834464044, "grad_norm": 7.277047157287598, "learning_rate": 9.458013085013173e-05, "loss": 0.7102, "step": 15486 }, { "epoch": 10.566485753052918, "grad_norm": 10.157328605651855, "learning_rate": 9.45061007433873e-05, "loss": 0.6814, "step": 15575 }, { "epoch": 10.626865671641792, "grad_norm": 5.025580883026123, "learning_rate": 9.443159784903528e-05, "loss": 0.7038, "step": 15664 }, { "epoch": 10.687245590230665, "grad_norm": 7.037330627441406, "learning_rate": 9.43566229585188e-05, "loss": 0.6886, "step": 15753 }, { "epoch": 10.747625508819539, "grad_norm": 8.00758171081543, "learning_rate": 9.42811768682949e-05, "loss": 0.6988, "step": 15842 }, { "epoch": 10.808005427408412, "grad_norm": 6.200064659118652, "learning_rate": 9.42052603798262e-05, "loss": 0.6872, "step": 15931 }, { "epoch": 10.868385345997286, "grad_norm": 7.785628795623779, "learning_rate": 9.412887429957241e-05, "loss": 0.7191, "step": 16020 }, { "epoch": 10.92876526458616, "grad_norm": 5.606222629547119, "learning_rate": 9.405201943898162e-05, "loss": 0.6933, "step": 16109 }, { "epoch": 10.989145183175033, "grad_norm": 6.9870147705078125, "learning_rate": 9.397469661448182e-05, "loss": 0.6873, "step": 16198 }, { "epoch": 11.049525101763908, "grad_norm": 7.700918674468994, "learning_rate": 9.389690664747214e-05, "loss": 0.6515, "step": 16287 }, { "epoch": 11.109905020352782, "grad_norm": 4.668413162231445, "learning_rate": 9.38186503643142e-05, "loss": 0.6484, "step": 16376 }, { "epoch": 11.170284938941656, "grad_norm": 9.098540306091309, "learning_rate": 9.373992859632324e-05, "loss": 0.6479, "step": 16465 }, { "epoch": 11.23066485753053, "grad_norm": 7.96748161315918, "learning_rate": 9.366074217975938e-05, "loss": 0.6351, "step": 16554 }, { "epoch": 11.291044776119403, "grad_norm": 5.657280921936035, "learning_rate": 9.358109195581866e-05, "loss": 0.6362, "step": 16643 }, { "epoch": 11.351424694708276, "grad_norm": 7.184754371643066, "learning_rate": 9.350097877062418e-05, "loss": 0.6527, "step": 16732 }, { "epoch": 11.41180461329715, "grad_norm": 6.7868523597717285, "learning_rate": 9.342040347521702e-05, "loss": 0.667, "step": 16821 }, { "epoch": 11.472184531886024, "grad_norm": 7.017992973327637, "learning_rate": 9.333936692554729e-05, "loss": 0.633, "step": 16910 }, { "epoch": 11.532564450474899, "grad_norm": 6.653933048248291, "learning_rate": 9.325786998246498e-05, "loss": 0.6404, "step": 16999 }, { "epoch": 11.592944369063773, "grad_norm": 6.6855058670043945, "learning_rate": 9.317591351171082e-05, "loss": 0.6776, "step": 17088 }, { "epoch": 11.653324287652646, "grad_norm": 8.127620697021484, "learning_rate": 9.309349838390711e-05, "loss": 0.6385, "step": 17177 }, { "epoch": 11.71370420624152, "grad_norm": 7.420390605926514, "learning_rate": 9.301062547454849e-05, "loss": 0.6395, "step": 17266 }, { "epoch": 11.774084124830393, "grad_norm": 7.517685413360596, "learning_rate": 9.292729566399252e-05, "loss": 0.6335, "step": 17355 }, { "epoch": 11.834464043419267, "grad_norm": 7.267749786376953, "learning_rate": 9.284350983745049e-05, "loss": 0.6607, "step": 17444 }, { "epoch": 11.89484396200814, "grad_norm": 7.73004150390625, "learning_rate": 9.275926888497792e-05, "loss": 0.6671, "step": 17533 }, { "epoch": 11.955223880597014, "grad_norm": 7.934135913848877, "learning_rate": 9.267457370146513e-05, "loss": 0.6207, "step": 17622 }, { "epoch": 12.00135685210312, "eval_accuracy": 0.19140625, "eval_loss": 3.8003501892089844, "eval_runtime": 19.7102, "eval_samples_per_second": 25.976, "eval_steps_per_second": 0.203, "step": 17690 }, { "epoch": 12.01560379918589, "grad_norm": 5.052128314971924, "learning_rate": 9.25894251866277e-05, "loss": 0.6211, "step": 17711 }, { "epoch": 12.075983717774763, "grad_norm": 5.490070343017578, "learning_rate": 9.250382424499698e-05, "loss": 0.6037, "step": 17800 }, { "epoch": 12.136363636363637, "grad_norm": 6.631565570831299, "learning_rate": 9.241777178591043e-05, "loss": 0.6032, "step": 17889 }, { "epoch": 12.19674355495251, "grad_norm": 6.181819438934326, "learning_rate": 9.233126872350193e-05, "loss": 0.5988, "step": 17978 }, { "epoch": 12.257123473541384, "grad_norm": 5.3416361808776855, "learning_rate": 9.224431597669219e-05, "loss": 0.612, "step": 18067 }, { "epoch": 12.317503392130257, "grad_norm": 9.972622871398926, "learning_rate": 9.215691446917885e-05, "loss": 0.5976, "step": 18156 }, { "epoch": 12.377883310719131, "grad_norm": 6.693090915679932, "learning_rate": 9.206906512942676e-05, "loss": 0.6127, "step": 18245 }, { "epoch": 12.438263229308005, "grad_norm": 5.006298065185547, "learning_rate": 9.198076889065806e-05, "loss": 0.614, "step": 18334 }, { "epoch": 12.49864314789688, "grad_norm": 4.5717668533325195, "learning_rate": 9.189202669084233e-05, "loss": 0.6026, "step": 18423 }, { "epoch": 12.559023066485754, "grad_norm": 7.7340989112854, "learning_rate": 9.180283947268653e-05, "loss": 0.589, "step": 18512 }, { "epoch": 12.619402985074627, "grad_norm": 6.45162296295166, "learning_rate": 9.17132081836251e-05, "loss": 0.5889, "step": 18601 }, { "epoch": 12.6797829036635, "grad_norm": 7.008767604827881, "learning_rate": 9.162313377580979e-05, "loss": 0.5783, "step": 18690 }, { "epoch": 12.740162822252374, "grad_norm": 7.15552282333374, "learning_rate": 9.153261720609963e-05, "loss": 0.5953, "step": 18779 }, { "epoch": 12.800542740841248, "grad_norm": 5.7486748695373535, "learning_rate": 9.144165943605072e-05, "loss": 0.5965, "step": 18868 }, { "epoch": 12.860922659430122, "grad_norm": 5.747917652130127, "learning_rate": 9.135026143190601e-05, "loss": 0.5875, "step": 18957 }, { "epoch": 12.921302578018995, "grad_norm": 7.039977550506592, "learning_rate": 9.125842416458506e-05, "loss": 0.5954, "step": 19046 }, { "epoch": 12.98168249660787, "grad_norm": 3.8854663372039795, "learning_rate": 9.116614860967372e-05, "loss": 0.5818, "step": 19135 }, { "epoch": 13.042062415196744, "grad_norm": 5.661801815032959, "learning_rate": 9.107343574741374e-05, "loss": 0.5619, "step": 19224 }, { "epoch": 13.102442333785618, "grad_norm": 6.757572174072266, "learning_rate": 9.098028656269243e-05, "loss": 0.5639, "step": 19313 }, { "epoch": 13.162822252374491, "grad_norm": 7.3293352127075195, "learning_rate": 9.088670204503208e-05, "loss": 0.5633, "step": 19402 }, { "epoch": 13.223202170963365, "grad_norm": 7.053752899169922, "learning_rate": 9.079268318857957e-05, "loss": 0.5487, "step": 19491 }, { "epoch": 13.283582089552239, "grad_norm": 5.139120101928711, "learning_rate": 9.069823099209571e-05, "loss": 0.543, "step": 19580 }, { "epoch": 13.343962008141112, "grad_norm": 7.9965314865112305, "learning_rate": 9.060334645894472e-05, "loss": 0.5521, "step": 19669 }, { "epoch": 13.404341926729986, "grad_norm": 7.904087543487549, "learning_rate": 9.050803059708348e-05, "loss": 0.5763, "step": 19758 }, { "epoch": 13.464721845318861, "grad_norm": 4.6150221824646, "learning_rate": 9.041228441905092e-05, "loss": 0.5492, "step": 19847 }, { "epoch": 13.525101763907735, "grad_norm": 4.3521857261657715, "learning_rate": 9.031610894195715e-05, "loss": 0.5544, "step": 19936 }, { "epoch": 13.585481682496608, "grad_norm": 6.906470775604248, "learning_rate": 9.021950518747276e-05, "loss": 0.5922, "step": 20025 }, { "epoch": 13.645861601085482, "grad_norm": 7.304365158081055, "learning_rate": 9.012247418181792e-05, "loss": 0.5473, "step": 20114 }, { "epoch": 13.706241519674355, "grad_norm": 5.015029430389404, "learning_rate": 9.002501695575148e-05, "loss": 0.5843, "step": 20203 }, { "epoch": 13.766621438263229, "grad_norm": 5.353032112121582, "learning_rate": 8.992713454455999e-05, "loss": 0.5423, "step": 20292 }, { "epoch": 13.827001356852103, "grad_norm": 4.505341529846191, "learning_rate": 8.98288279880468e-05, "loss": 0.5511, "step": 20381 }, { "epoch": 13.887381275440976, "grad_norm": 6.68435525894165, "learning_rate": 8.973009833052087e-05, "loss": 0.5429, "step": 20470 }, { "epoch": 13.947761194029852, "grad_norm": 4.248044490814209, "learning_rate": 8.963094662078583e-05, "loss": 0.5637, "step": 20559 }, { "epoch": 14.008141112618725, "grad_norm": 4.230225563049316, "learning_rate": 8.953137391212875e-05, "loss": 0.5551, "step": 20648 }, { "epoch": 14.068521031207599, "grad_norm": 4.81500768661499, "learning_rate": 8.94313812623089e-05, "loss": 0.5027, "step": 20737 }, { "epoch": 14.128900949796472, "grad_norm": 6.79054594039917, "learning_rate": 8.933096973354664e-05, "loss": 0.4904, "step": 20826 }, { "epoch": 14.189280868385346, "grad_norm": 4.661177635192871, "learning_rate": 8.923014039251208e-05, "loss": 0.5076, "step": 20915 }, { "epoch": 14.24966078697422, "grad_norm": 10.014252662658691, "learning_rate": 8.91288943103137e-05, "loss": 0.5068, "step": 21004 }, { "epoch": 14.310040705563093, "grad_norm": 8.030250549316406, "learning_rate": 8.902723256248704e-05, "loss": 0.521, "step": 21093 }, { "epoch": 14.370420624151967, "grad_norm": 5.514551162719727, "learning_rate": 8.892515622898326e-05, "loss": 0.5053, "step": 21182 }, { "epoch": 14.401628222523746, "eval_accuracy": 0.193359375, "eval_loss": 3.79229736328125, "eval_runtime": 41.777, "eval_samples_per_second": 12.256, "eval_steps_per_second": 0.096, "step": 21228 }, { "epoch": 14.43080054274084, "grad_norm": 5.649023056030273, "learning_rate": 8.882266639415763e-05, "loss": 0.5103, "step": 21271 }, { "epoch": 14.491180461329716, "grad_norm": 6.628403663635254, "learning_rate": 8.871976414675805e-05, "loss": 0.5238, "step": 21360 }, { "epoch": 14.55156037991859, "grad_norm": 5.387028217315674, "learning_rate": 8.86164505799135e-05, "loss": 0.5278, "step": 21449 }, { "epoch": 14.611940298507463, "grad_norm": 5.111924171447754, "learning_rate": 8.851272679112234e-05, "loss": 0.5269, "step": 21538 }, { "epoch": 14.672320217096336, "grad_norm": 5.967355728149414, "learning_rate": 8.840859388224076e-05, "loss": 0.5188, "step": 21627 }, { "epoch": 14.73270013568521, "grad_norm": 5.387267589569092, "learning_rate": 8.830405295947102e-05, "loss": 0.5161, "step": 21716 }, { "epoch": 14.793080054274084, "grad_norm": 4.254080772399902, "learning_rate": 8.81991051333497e-05, "loss": 0.5228, "step": 21805 }, { "epoch": 14.853459972862957, "grad_norm": 3.855088233947754, "learning_rate": 8.809375151873589e-05, "loss": 0.5091, "step": 21894 }, { "epoch": 14.913839891451833, "grad_norm": 5.05858039855957, "learning_rate": 8.798799323479938e-05, "loss": 0.5259, "step": 21983 }, { "epoch": 14.974219810040706, "grad_norm": 8.726083755493164, "learning_rate": 8.788183140500874e-05, "loss": 0.5171, "step": 22072 }, { "epoch": 15.03459972862958, "grad_norm": 5.312582492828369, "learning_rate": 8.777526715711946e-05, "loss": 0.4804, "step": 22161 }, { "epoch": 15.094979647218453, "grad_norm": 4.794472694396973, "learning_rate": 8.766830162316183e-05, "loss": 0.4814, "step": 22250 }, { "epoch": 15.155359565807327, "grad_norm": 6.440197944641113, "learning_rate": 8.756093593942905e-05, "loss": 0.4829, "step": 22339 }, { "epoch": 15.2157394843962, "grad_norm": 4.757099151611328, "learning_rate": 8.745317124646508e-05, "loss": 0.4572, "step": 22428 }, { "epoch": 15.276119402985074, "grad_norm": 5.3460235595703125, "learning_rate": 8.734500868905258e-05, "loss": 0.476, "step": 22517 }, { "epoch": 15.336499321573948, "grad_norm": 4.173645496368408, "learning_rate": 8.723644941620065e-05, "loss": 0.4829, "step": 22606 }, { "epoch": 15.396879240162821, "grad_norm": 8.921795845031738, "learning_rate": 8.71274945811328e-05, "loss": 0.4758, "step": 22695 }, { "epoch": 15.457259158751697, "grad_norm": 5.059213161468506, "learning_rate": 8.701814534127446e-05, "loss": 0.4516, "step": 22784 }, { "epoch": 15.51763907734057, "grad_norm": 6.460654258728027, "learning_rate": 8.690840285824094e-05, "loss": 0.4946, "step": 22873 }, { "epoch": 15.578018995929444, "grad_norm": 5.588746547698975, "learning_rate": 8.679826829782485e-05, "loss": 0.5096, "step": 22962 }, { "epoch": 15.638398914518318, "grad_norm": 4.974047660827637, "learning_rate": 8.668774282998394e-05, "loss": 0.491, "step": 23051 }, { "epoch": 15.698778833107191, "grad_norm": 4.4067463874816895, "learning_rate": 8.65768276288285e-05, "loss": 0.487, "step": 23140 }, { "epoch": 15.759158751696065, "grad_norm": 5.659997463226318, "learning_rate": 8.646552387260898e-05, "loss": 0.4895, "step": 23229 }, { "epoch": 15.819538670284938, "grad_norm": 5.777614593505859, "learning_rate": 8.635383274370341e-05, "loss": 0.4951, "step": 23318 }, { "epoch": 15.879918588873814, "grad_norm": 6.594443321228027, "learning_rate": 8.62417554286049e-05, "loss": 0.4871, "step": 23407 }, { "epoch": 15.940298507462687, "grad_norm": 4.5751237869262695, "learning_rate": 8.612929311790899e-05, "loss": 0.5005, "step": 23496 }, { "epoch": 16.00067842605156, "grad_norm": 4.56909704208374, "learning_rate": 8.601644700630107e-05, "loss": 0.4875, "step": 23585 }, { "epoch": 16.061058344640433, "grad_norm": 5.793113708496094, "learning_rate": 8.590321829254358e-05, "loss": 0.4592, "step": 23674 }, { "epoch": 16.121438263229308, "grad_norm": 3.888392686843872, "learning_rate": 8.578960817946338e-05, "loss": 0.4343, "step": 23763 }, { "epoch": 16.181818181818183, "grad_norm": 3.910721778869629, "learning_rate": 8.567561787393888e-05, "loss": 0.4499, "step": 23852 }, { "epoch": 16.242198100407055, "grad_norm": 7.085721492767334, "learning_rate": 8.556124858688734e-05, "loss": 0.4391, "step": 23941 }, { "epoch": 16.30257801899593, "grad_norm": 6.454195022583008, "learning_rate": 8.54465015332519e-05, "loss": 0.4378, "step": 24030 }, { "epoch": 16.362957937584802, "grad_norm": 3.5428030490875244, "learning_rate": 8.533137793198866e-05, "loss": 0.4511, "step": 24119 }, { "epoch": 16.423337856173678, "grad_norm": 3.401646614074707, "learning_rate": 8.521587900605385e-05, "loss": 0.4642, "step": 24208 }, { "epoch": 16.48371777476255, "grad_norm": 6.838740825653076, "learning_rate": 8.510000598239075e-05, "loss": 0.4584, "step": 24297 }, { "epoch": 16.544097693351425, "grad_norm": 5.186567306518555, "learning_rate": 8.498376009191665e-05, "loss": 0.4741, "step": 24386 }, { "epoch": 16.604477611940297, "grad_norm": 3.8350930213928223, "learning_rate": 8.486714256950983e-05, "loss": 0.4475, "step": 24475 }, { "epoch": 16.664857530529172, "grad_norm": 5.290257453918457, "learning_rate": 8.475015465399638e-05, "loss": 0.4544, "step": 24564 }, { "epoch": 16.725237449118048, "grad_norm": 5.533965587615967, "learning_rate": 8.463279758813711e-05, "loss": 0.457, "step": 24653 }, { "epoch": 16.78561736770692, "grad_norm": 5.372981071472168, "learning_rate": 8.451507261861425e-05, "loss": 0.4537, "step": 24742 }, { "epoch": 16.80189959294437, "eval_accuracy": 0.16015625, "eval_loss": 3.9037704467773438, "eval_runtime": 39.2353, "eval_samples_per_second": 13.049, "eval_steps_per_second": 0.102, "step": 24766 }, { "epoch": 16.845997286295795, "grad_norm": 5.011608600616455, "learning_rate": 8.439698099601831e-05, "loss": 0.452, "step": 24831 }, { "epoch": 16.906377204884667, "grad_norm": 5.051270484924316, "learning_rate": 8.427852397483475e-05, "loss": 0.4493, "step": 24920 }, { "epoch": 16.966757123473542, "grad_norm": 3.670827627182007, "learning_rate": 8.415970281343061e-05, "loss": 0.4476, "step": 25009 }, { "epoch": 17.027137042062414, "grad_norm": 2.6706955432891846, "learning_rate": 8.404051877404126e-05, "loss": 0.4478, "step": 25098 }, { "epoch": 17.08751696065129, "grad_norm": 7.5127787590026855, "learning_rate": 8.392097312275686e-05, "loss": 0.4244, "step": 25187 }, { "epoch": 17.147896879240164, "grad_norm": 3.7548723220825195, "learning_rate": 8.380106712950896e-05, "loss": 0.4289, "step": 25276 }, { "epoch": 17.208276797829036, "grad_norm": 3.9628028869628906, "learning_rate": 8.368080206805706e-05, "loss": 0.4337, "step": 25365 }, { "epoch": 17.26865671641791, "grad_norm": 4.179431915283203, "learning_rate": 8.3560179215975e-05, "loss": 0.4147, "step": 25454 }, { "epoch": 17.329036635006783, "grad_norm": 3.3942129611968994, "learning_rate": 8.343919985463745e-05, "loss": 0.4175, "step": 25543 }, { "epoch": 17.38941655359566, "grad_norm": 4.166045665740967, "learning_rate": 8.331786526920626e-05, "loss": 0.423, "step": 25632 }, { "epoch": 17.44979647218453, "grad_norm": 3.0724310874938965, "learning_rate": 8.319617674861682e-05, "loss": 0.41, "step": 25721 }, { "epoch": 17.510176390773406, "grad_norm": 6.462100028991699, "learning_rate": 8.307413558556437e-05, "loss": 0.4125, "step": 25810 }, { "epoch": 17.570556309362278, "grad_norm": 4.838727951049805, "learning_rate": 8.295174307649024e-05, "loss": 0.4254, "step": 25899 }, { "epoch": 17.630936227951153, "grad_norm": 3.9609103202819824, "learning_rate": 8.282900052156817e-05, "loss": 0.4141, "step": 25988 }, { "epoch": 17.69131614654003, "grad_norm": 3.537935972213745, "learning_rate": 8.270590922469037e-05, "loss": 0.4189, "step": 26077 }, { "epoch": 17.7516960651289, "grad_norm": 5.015251159667969, "learning_rate": 8.258247049345373e-05, "loss": 0.439, "step": 26166 }, { "epoch": 17.812075983717776, "grad_norm": 4.997931957244873, "learning_rate": 8.245868563914598e-05, "loss": 0.4079, "step": 26255 }, { "epoch": 17.872455902306648, "grad_norm": 5.362955093383789, "learning_rate": 8.233455597673165e-05, "loss": 0.4165, "step": 26344 }, { "epoch": 17.932835820895523, "grad_norm": 6.1235880851745605, "learning_rate": 8.22100828248382e-05, "loss": 0.4121, "step": 26433 }, { "epoch": 17.993215739484395, "grad_norm": 4.939189434051514, "learning_rate": 8.208526750574199e-05, "loss": 0.4191, "step": 26522 }, { "epoch": 18.05359565807327, "grad_norm": 4.338520050048828, "learning_rate": 8.196011134535416e-05, "loss": 0.369, "step": 26611 }, { "epoch": 18.113975576662146, "grad_norm": 4.328836441040039, "learning_rate": 8.183461567320662e-05, "loss": 0.3939, "step": 26700 }, { "epoch": 18.174355495251017, "grad_norm": 3.7861499786376953, "learning_rate": 8.170878182243792e-05, "loss": 0.3841, "step": 26789 }, { "epoch": 18.234735413839893, "grad_norm": 4.84774112701416, "learning_rate": 8.158261112977913e-05, "loss": 0.3702, "step": 26878 }, { "epoch": 18.295115332428765, "grad_norm": 7.082802772521973, "learning_rate": 8.145610493553948e-05, "loss": 0.4059, "step": 26967 }, { "epoch": 18.35549525101764, "grad_norm": 2.84909987449646, "learning_rate": 8.13292645835923e-05, "loss": 0.41, "step": 27056 }, { "epoch": 18.41587516960651, "grad_norm": 4.116001605987549, "learning_rate": 8.120209142136065e-05, "loss": 0.4014, "step": 27145 }, { "epoch": 18.476255088195387, "grad_norm": 4.0977783203125, "learning_rate": 8.107458679980302e-05, "loss": 0.4041, "step": 27234 }, { "epoch": 18.53663500678426, "grad_norm": 9.48543930053711, "learning_rate": 8.0946752073399e-05, "loss": 0.3979, "step": 27323 }, { "epoch": 18.597014925373134, "grad_norm": 3.692593574523926, "learning_rate": 8.081858860013488e-05, "loss": 0.4034, "step": 27412 }, { "epoch": 18.65739484396201, "grad_norm": 3.500662326812744, "learning_rate": 8.069009774148923e-05, "loss": 0.3884, "step": 27501 }, { "epoch": 18.71777476255088, "grad_norm": 3.7085442543029785, "learning_rate": 8.056128086241841e-05, "loss": 0.3829, "step": 27590 }, { "epoch": 18.778154681139757, "grad_norm": 4.753846168518066, "learning_rate": 8.043213933134208e-05, "loss": 0.4079, "step": 27679 }, { "epoch": 18.83853459972863, "grad_norm": 3.4297168254852295, "learning_rate": 8.030267452012872e-05, "loss": 0.3934, "step": 27768 }, { "epoch": 18.898914518317504, "grad_norm": 5.62887716293335, "learning_rate": 8.017288780408096e-05, "loss": 0.4036, "step": 27857 }, { "epoch": 18.959294436906376, "grad_norm": 3.0904860496520996, "learning_rate": 8.004278056192107e-05, "loss": 0.3933, "step": 27946 }, { "epoch": 19.01967435549525, "grad_norm": 4.35064697265625, "learning_rate": 7.991235417577621e-05, "loss": 0.3759, "step": 28035 }, { "epoch": 19.080054274084127, "grad_norm": 5.101808547973633, "learning_rate": 7.978161003116382e-05, "loss": 0.3693, "step": 28124 }, { "epoch": 19.140434192673, "grad_norm": 4.391759395599365, "learning_rate": 7.96505495169769e-05, "loss": 0.3472, "step": 28213 }, { "epoch": 19.200814111261874, "grad_norm": 4.793941974639893, "learning_rate": 7.951917402546926e-05, "loss": 0.3551, "step": 28302 }, { "epoch": 19.202170963364992, "eval_accuracy": 0.19140625, "eval_loss": 3.9524879455566406, "eval_runtime": 19.8893, "eval_samples_per_second": 25.742, "eval_steps_per_second": 0.201, "step": 28304 }, { "epoch": 19.261194029850746, "grad_norm": 3.726491689682007, "learning_rate": 7.938748495224061e-05, "loss": 0.3555, "step": 28391 }, { "epoch": 19.32157394843962, "grad_norm": 3.4001190662384033, "learning_rate": 7.925548369622199e-05, "loss": 0.361, "step": 28480 }, { "epoch": 19.381953867028493, "grad_norm": 4.480808258056641, "learning_rate": 7.912317165966059e-05, "loss": 0.3656, "step": 28569 }, { "epoch": 19.442333785617368, "grad_norm": 3.043093681335449, "learning_rate": 7.899055024810511e-05, "loss": 0.3819, "step": 28658 }, { "epoch": 19.50271370420624, "grad_norm": 3.813091516494751, "learning_rate": 7.885762087039075e-05, "loss": 0.3939, "step": 28747 }, { "epoch": 19.563093622795115, "grad_norm": 4.2613525390625, "learning_rate": 7.872438493862415e-05, "loss": 0.353, "step": 28836 }, { "epoch": 19.62347354138399, "grad_norm": 2.884284734725952, "learning_rate": 7.859084386816854e-05, "loss": 0.3696, "step": 28925 }, { "epoch": 19.683853459972863, "grad_norm": 6.607941627502441, "learning_rate": 7.845699907762862e-05, "loss": 0.3869, "step": 29014 }, { "epoch": 19.744233378561738, "grad_norm": 6.069945335388184, "learning_rate": 7.832285198883548e-05, "loss": 0.3688, "step": 29103 }, { "epoch": 19.80461329715061, "grad_norm": 2.9537928104400635, "learning_rate": 7.818840402683151e-05, "loss": 0.3624, "step": 29192 }, { "epoch": 19.864993215739485, "grad_norm": 4.354130268096924, "learning_rate": 7.805365661985535e-05, "loss": 0.3589, "step": 29281 }, { "epoch": 19.925373134328357, "grad_norm": 3.5923469066619873, "learning_rate": 7.791861119932652e-05, "loss": 0.3432, "step": 29370 }, { "epoch": 19.985753052917232, "grad_norm": 3.5997955799102783, "learning_rate": 7.778326919983046e-05, "loss": 0.3611, "step": 29459 }, { "epoch": 20.046132971506104, "grad_norm": 2.281196355819702, "learning_rate": 7.764763205910304e-05, "loss": 0.3296, "step": 29548 }, { "epoch": 20.10651289009498, "grad_norm": 7.429330348968506, "learning_rate": 7.75117012180155e-05, "loss": 0.34, "step": 29637 }, { "epoch": 20.166892808683855, "grad_norm": 7.913335800170898, "learning_rate": 7.737547812055901e-05, "loss": 0.3428, "step": 29726 }, { "epoch": 20.227272727272727, "grad_norm": 2.8572380542755127, "learning_rate": 7.723896421382942e-05, "loss": 0.3394, "step": 29815 }, { "epoch": 20.287652645861602, "grad_norm": 2.90544056892395, "learning_rate": 7.710216094801179e-05, "loss": 0.3322, "step": 29904 }, { "epoch": 20.348032564450474, "grad_norm": 7.801008224487305, "learning_rate": 7.696506977636506e-05, "loss": 0.343, "step": 29993 }, { "epoch": 20.40841248303935, "grad_norm": 4.56928014755249, "learning_rate": 7.682769215520658e-05, "loss": 0.3513, "step": 30082 }, { "epoch": 20.46879240162822, "grad_norm": 3.2972512245178223, "learning_rate": 7.669002954389668e-05, "loss": 0.3361, "step": 30171 }, { "epoch": 20.529172320217096, "grad_norm": 2.6529455184936523, "learning_rate": 7.65520834048231e-05, "loss": 0.3481, "step": 30260 }, { "epoch": 20.58955223880597, "grad_norm": 2.281811475753784, "learning_rate": 7.641385520338551e-05, "loss": 0.3439, "step": 30349 }, { "epoch": 20.649932157394844, "grad_norm": 5.415365695953369, "learning_rate": 7.627534640797991e-05, "loss": 0.3426, "step": 30438 }, { "epoch": 20.71031207598372, "grad_norm": 4.79844856262207, "learning_rate": 7.613655848998305e-05, "loss": 0.3237, "step": 30527 }, { "epoch": 20.77069199457259, "grad_norm": 4.5184855461120605, "learning_rate": 7.599749292373679e-05, "loss": 0.3433, "step": 30616 }, { "epoch": 20.831071913161466, "grad_norm": 3.099209785461426, "learning_rate": 7.585815118653248e-05, "loss": 0.329, "step": 30705 }, { "epoch": 20.891451831750338, "grad_norm": 2.415534257888794, "learning_rate": 7.571853475859519e-05, "loss": 0.3377, "step": 30794 }, { "epoch": 20.951831750339213, "grad_norm": 4.010440349578857, "learning_rate": 7.557864512306802e-05, "loss": 0.3375, "step": 30883 }, { "epoch": 21.012211668928085, "grad_norm": 3.8156368732452393, "learning_rate": 7.543848376599637e-05, "loss": 0.3216, "step": 30972 }, { "epoch": 21.07259158751696, "grad_norm": 8.568528175354004, "learning_rate": 7.529805217631214e-05, "loss": 0.3043, "step": 31061 }, { "epoch": 21.132971506105836, "grad_norm": 5.376992225646973, "learning_rate": 7.515735184581791e-05, "loss": 0.3175, "step": 31150 }, { "epoch": 21.193351424694708, "grad_norm": 2.6105728149414062, "learning_rate": 7.501638426917106e-05, "loss": 0.3105, "step": 31239 }, { "epoch": 21.253731343283583, "grad_norm": 2.6053969860076904, "learning_rate": 7.487515094386792e-05, "loss": 0.3002, "step": 31328 }, { "epoch": 21.314111261872455, "grad_norm": 2.5073657035827637, "learning_rate": 7.473365337022791e-05, "loss": 0.3172, "step": 31417 }, { "epoch": 21.37449118046133, "grad_norm": 2.63193941116333, "learning_rate": 7.459189305137751e-05, "loss": 0.3183, "step": 31506 }, { "epoch": 21.434871099050202, "grad_norm": 2.9518582820892334, "learning_rate": 7.444987149323435e-05, "loss": 0.302, "step": 31595 }, { "epoch": 21.495251017639077, "grad_norm": 3.682440757751465, "learning_rate": 7.430759020449123e-05, "loss": 0.3106, "step": 31684 }, { "epoch": 21.555630936227953, "grad_norm": 3.564025402069092, "learning_rate": 7.416505069660003e-05, "loss": 0.3114, "step": 31773 }, { "epoch": 21.602442333785618, "eval_accuracy": 0.1953125, "eval_loss": 3.8780479431152344, "eval_runtime": 40.0838, "eval_samples_per_second": 12.773, "eval_steps_per_second": 0.1, "step": 31842 }, { "epoch": 21.616010854816825, "grad_norm": 4.089629173278809, "learning_rate": 7.402225448375569e-05, "loss": 0.3152, "step": 31862 }, { "epoch": 21.6763907734057, "grad_norm": 4.699454307556152, "learning_rate": 7.387920308288014e-05, "loss": 0.3094, "step": 31951 }, { "epoch": 21.736770691994572, "grad_norm": 3.2713539600372314, "learning_rate": 7.373589801360616e-05, "loss": 0.3276, "step": 32040 }, { "epoch": 21.797150610583447, "grad_norm": 1.9568812847137451, "learning_rate": 7.359234079826123e-05, "loss": 0.3181, "step": 32129 }, { "epoch": 21.85753052917232, "grad_norm": 2.7409889698028564, "learning_rate": 7.344853296185141e-05, "loss": 0.3023, "step": 32218 }, { "epoch": 21.917910447761194, "grad_norm": 2.9756550788879395, "learning_rate": 7.330447603204507e-05, "loss": 0.3162, "step": 32307 }, { "epoch": 21.978290366350066, "grad_norm": 3.314568281173706, "learning_rate": 7.316017153915671e-05, "loss": 0.2991, "step": 32396 }, { "epoch": 22.03867028493894, "grad_norm": 4.315303802490234, "learning_rate": 7.301562101613068e-05, "loss": 0.305, "step": 32485 }, { "epoch": 22.099050203527817, "grad_norm": 4.505661487579346, "learning_rate": 7.287082599852493e-05, "loss": 0.2807, "step": 32574 }, { "epoch": 22.15943012211669, "grad_norm": 3.841827392578125, "learning_rate": 7.272578802449464e-05, "loss": 0.2742, "step": 32663 }, { "epoch": 22.219810040705564, "grad_norm": 4.61216926574707, "learning_rate": 7.25805086347759e-05, "loss": 0.2994, "step": 32752 }, { "epoch": 22.280189959294436, "grad_norm": 2.9822754859924316, "learning_rate": 7.243498937266943e-05, "loss": 0.2854, "step": 32841 }, { "epoch": 22.34056987788331, "grad_norm": 3.7797086238861084, "learning_rate": 7.228923178402403e-05, "loss": 0.2967, "step": 32930 }, { "epoch": 22.400949796472183, "grad_norm": 2.8511717319488525, "learning_rate": 7.214323741722027e-05, "loss": 0.2772, "step": 33019 }, { "epoch": 22.46132971506106, "grad_norm": 2.439438581466675, "learning_rate": 7.199700782315403e-05, "loss": 0.2957, "step": 33108 }, { "epoch": 22.521709633649934, "grad_norm": 2.507317066192627, "learning_rate": 7.185054455521994e-05, "loss": 0.2883, "step": 33197 }, { "epoch": 22.582089552238806, "grad_norm": 2.963704824447632, "learning_rate": 7.170384916929504e-05, "loss": 0.2892, "step": 33286 }, { "epoch": 22.64246947082768, "grad_norm": 3.137892007827759, "learning_rate": 7.155692322372208e-05, "loss": 0.2936, "step": 33375 }, { "epoch": 22.702849389416553, "grad_norm": 2.860560178756714, "learning_rate": 7.140976827929308e-05, "loss": 0.2719, "step": 33464 }, { "epoch": 22.763229308005428, "grad_norm": 3.778202533721924, "learning_rate": 7.126238589923269e-05, "loss": 0.2909, "step": 33553 }, { "epoch": 22.8236092265943, "grad_norm": 8.442693710327148, "learning_rate": 7.111477764918159e-05, "loss": 0.2957, "step": 33642 }, { "epoch": 22.883989145183175, "grad_norm": 2.855881452560425, "learning_rate": 7.096694509717994e-05, "loss": 0.2893, "step": 33731 }, { "epoch": 22.944369063772047, "grad_norm": 3.649304151535034, "learning_rate": 7.081888981365062e-05, "loss": 0.3019, "step": 33820 }, { "epoch": 23.004748982360923, "grad_norm": 3.577422857284546, "learning_rate": 7.067061337138249e-05, "loss": 0.2794, "step": 33909 }, { "epoch": 23.065128900949798, "grad_norm": 3.4041476249694824, "learning_rate": 7.052211734551398e-05, "loss": 0.2653, "step": 33998 }, { "epoch": 23.12550881953867, "grad_norm": 3.21398663520813, "learning_rate": 7.037340331351592e-05, "loss": 0.2635, "step": 34087 }, { "epoch": 23.185888738127545, "grad_norm": 3.606840133666992, "learning_rate": 7.022447285517522e-05, "loss": 0.2612, "step": 34176 }, { "epoch": 23.246268656716417, "grad_norm": 3.4414963722229004, "learning_rate": 7.007532755257776e-05, "loss": 0.2621, "step": 34265 }, { "epoch": 23.306648575305292, "grad_norm": 3.429677724838257, "learning_rate": 6.992596899009174e-05, "loss": 0.2627, "step": 34354 }, { "epoch": 23.367028493894164, "grad_norm": 2.394657850265503, "learning_rate": 6.977639875435082e-05, "loss": 0.2651, "step": 34443 }, { "epoch": 23.42740841248304, "grad_norm": 3.796799421310425, "learning_rate": 6.962661843423725e-05, "loss": 0.2575, "step": 34532 }, { "epoch": 23.487788331071915, "grad_norm": 1.8303537368774414, "learning_rate": 6.947662962086506e-05, "loss": 0.2656, "step": 34621 }, { "epoch": 23.548168249660787, "grad_norm": 5.206216335296631, "learning_rate": 6.932643390756298e-05, "loss": 0.2789, "step": 34710 }, { "epoch": 23.608548168249662, "grad_norm": 2.8069159984588623, "learning_rate": 6.917603288985775e-05, "loss": 0.2679, "step": 34799 }, { "epoch": 23.668928086838534, "grad_norm": 2.3087520599365234, "learning_rate": 6.902542816545701e-05, "loss": 0.2625, "step": 34888 }, { "epoch": 23.72930800542741, "grad_norm": 3.139498472213745, "learning_rate": 6.887462133423237e-05, "loss": 0.2722, "step": 34977 }, { "epoch": 23.78968792401628, "grad_norm": 2.9781806468963623, "learning_rate": 6.872361399820245e-05, "loss": 0.2633, "step": 35066 }, { "epoch": 23.850067842605156, "grad_norm": 3.456528425216675, "learning_rate": 6.857240776151576e-05, "loss": 0.2767, "step": 35155 }, { "epoch": 23.91044776119403, "grad_norm": 2.8766520023345947, "learning_rate": 6.842100423043381e-05, "loss": 0.2655, "step": 35244 }, { "epoch": 23.970827679782904, "grad_norm": 2.811938524246216, "learning_rate": 6.826940501331391e-05, "loss": 0.26, "step": 35333 }, { "epoch": 24.00271370420624, "eval_accuracy": 0.1875, "eval_loss": 3.9947586059570312, "eval_runtime": 23.8781, "eval_samples_per_second": 21.442, "eval_steps_per_second": 0.168, "step": 35380 }, { "epoch": 24.03120759837178, "grad_norm": 3.4529502391815186, "learning_rate": 6.811761172059213e-05, "loss": 0.2424, "step": 35422 }, { "epoch": 24.09158751696065, "grad_norm": 7.157485485076904, "learning_rate": 6.796562596476629e-05, "loss": 0.2328, "step": 35511 }, { "epoch": 24.151967435549526, "grad_norm": 2.098388433456421, "learning_rate": 6.781344936037864e-05, "loss": 0.2368, "step": 35600 }, { "epoch": 24.212347354138398, "grad_norm": 2.5846946239471436, "learning_rate": 6.766108352399885e-05, "loss": 0.252, "step": 35689 }, { "epoch": 24.272727272727273, "grad_norm": 3.213495969772339, "learning_rate": 6.750853007420684e-05, "loss": 0.2563, "step": 35778 }, { "epoch": 24.333107191316145, "grad_norm": 5.0729498863220215, "learning_rate": 6.735579063157545e-05, "loss": 0.2623, "step": 35867 }, { "epoch": 24.39348710990502, "grad_norm": 2.973792791366577, "learning_rate": 6.720286681865339e-05, "loss": 0.2558, "step": 35956 }, { "epoch": 24.453867028493896, "grad_norm": 1.9252829551696777, "learning_rate": 6.704976025994796e-05, "loss": 0.2486, "step": 36045 }, { "epoch": 24.514246947082768, "grad_norm": 3.5804240703582764, "learning_rate": 6.689647258190768e-05, "loss": 0.2493, "step": 36134 }, { "epoch": 24.574626865671643, "grad_norm": 3.92348575592041, "learning_rate": 6.674300541290517e-05, "loss": 0.2447, "step": 36223 }, { "epoch": 24.635006784260515, "grad_norm": 2.7622110843658447, "learning_rate": 6.658936038321971e-05, "loss": 0.2381, "step": 36312 }, { "epoch": 24.69538670284939, "grad_norm": 2.5953946113586426, "learning_rate": 6.643553912502007e-05, "loss": 0.2467, "step": 36401 }, { "epoch": 24.755766621438262, "grad_norm": 2.8284683227539062, "learning_rate": 6.628154327234704e-05, "loss": 0.2435, "step": 36490 }, { "epoch": 24.816146540027137, "grad_norm": 2.8667030334472656, "learning_rate": 6.612737446109614e-05, "loss": 0.2476, "step": 36579 }, { "epoch": 24.87652645861601, "grad_norm": 2.5920257568359375, "learning_rate": 6.597303432900021e-05, "loss": 0.248, "step": 36668 }, { "epoch": 24.936906377204885, "grad_norm": 3.2936460971832275, "learning_rate": 6.581852451561207e-05, "loss": 0.2545, "step": 36757 }, { "epoch": 24.99728629579376, "grad_norm": 2.2897655963897705, "learning_rate": 6.5663846662287e-05, "loss": 0.2405, "step": 36846 }, { "epoch": 25.057666214382632, "grad_norm": 2.2279489040374756, "learning_rate": 6.550900241216545e-05, "loss": 0.2235, "step": 36935 }, { "epoch": 25.118046132971507, "grad_norm": 1.6091116666793823, "learning_rate": 6.535399341015543e-05, "loss": 0.2345, "step": 37024 }, { "epoch": 25.17842605156038, "grad_norm": 2.490220308303833, "learning_rate": 6.51988213029151e-05, "loss": 0.2264, "step": 37113 }, { "epoch": 25.238805970149254, "grad_norm": 2.3575713634490967, "learning_rate": 6.504348773883534e-05, "loss": 0.2384, "step": 37202 }, { "epoch": 25.299185888738126, "grad_norm": 2.0898985862731934, "learning_rate": 6.488799436802216e-05, "loss": 0.2332, "step": 37291 }, { "epoch": 25.359565807327, "grad_norm": 4.023237705230713, "learning_rate": 6.473234284227919e-05, "loss": 0.2186, "step": 37380 }, { "epoch": 25.419945725915873, "grad_norm": 1.7770565748214722, "learning_rate": 6.45765348150901e-05, "loss": 0.2318, "step": 37469 }, { "epoch": 25.48032564450475, "grad_norm": 3.1752917766571045, "learning_rate": 6.442057194160116e-05, "loss": 0.2234, "step": 37558 }, { "epoch": 25.540705563093624, "grad_norm": 3.1734275817871094, "learning_rate": 6.42644558786035e-05, "loss": 0.2388, "step": 37647 }, { "epoch": 25.601085481682496, "grad_norm": 3.916975259780884, "learning_rate": 6.410818828451557e-05, "loss": 0.227, "step": 37736 }, { "epoch": 25.66146540027137, "grad_norm": 2.7766647338867188, "learning_rate": 6.395177081936562e-05, "loss": 0.23, "step": 37825 }, { "epoch": 25.721845318860243, "grad_norm": 3.657627820968628, "learning_rate": 6.379520514477388e-05, "loss": 0.2329, "step": 37914 }, { "epoch": 25.78222523744912, "grad_norm": 4.11094331741333, "learning_rate": 6.363849292393507e-05, "loss": 0.2241, "step": 38003 }, { "epoch": 25.84260515603799, "grad_norm": 2.6179704666137695, "learning_rate": 6.348163582160062e-05, "loss": 0.2268, "step": 38092 }, { "epoch": 25.902985074626866, "grad_norm": 3.4568240642547607, "learning_rate": 6.332463550406107e-05, "loss": 0.2197, "step": 38181 }, { "epoch": 25.96336499321574, "grad_norm": 1.789491057395935, "learning_rate": 6.316749363912833e-05, "loss": 0.2087, "step": 38270 }, { "epoch": 26.023744911804613, "grad_norm": 2.606367588043213, "learning_rate": 6.301021189611793e-05, "loss": 0.2153, "step": 38359 }, { "epoch": 26.08412483039349, "grad_norm": 2.6728904247283936, "learning_rate": 6.28527919458314e-05, "loss": 0.2043, "step": 38448 }, { "epoch": 26.14450474898236, "grad_norm": 2.2943668365478516, "learning_rate": 6.269523546053832e-05, "loss": 0.2123, "step": 38537 }, { "epoch": 26.204884667571235, "grad_norm": 3.1198699474334717, "learning_rate": 6.253754411395882e-05, "loss": 0.2128, "step": 38626 }, { "epoch": 26.265264586160107, "grad_norm": 2.269235372543335, "learning_rate": 6.237971958124559e-05, "loss": 0.2213, "step": 38715 }, { "epoch": 26.325644504748983, "grad_norm": 3.090557098388672, "learning_rate": 6.22217635389661e-05, "loss": 0.2253, "step": 38804 }, { "epoch": 26.386024423337854, "grad_norm": 4.030007839202881, "learning_rate": 6.206367766508497e-05, "loss": 0.2104, "step": 38893 }, { "epoch": 26.402985074626866, "eval_accuracy": 0.181640625, "eval_loss": 4.0000152587890625, "eval_runtime": 20.3217, "eval_samples_per_second": 25.195, "eval_steps_per_second": 0.197, "step": 38918 }, { "epoch": 26.44640434192673, "grad_norm": 3.711073637008667, "learning_rate": 6.190546363894589e-05, "loss": 0.2019, "step": 38982 }, { "epoch": 26.506784260515605, "grad_norm": 4.125629901885986, "learning_rate": 6.1747123141254e-05, "loss": 0.218, "step": 39071 }, { "epoch": 26.567164179104477, "grad_norm": 2.719214916229248, "learning_rate": 6.158865785405792e-05, "loss": 0.2138, "step": 39160 }, { "epoch": 26.627544097693352, "grad_norm": 5.083952903747559, "learning_rate": 6.143006946073187e-05, "loss": 0.2098, "step": 39249 }, { "epoch": 26.687924016282224, "grad_norm": 2.9340269565582275, "learning_rate": 6.127135964595789e-05, "loss": 0.2004, "step": 39338 }, { "epoch": 26.7483039348711, "grad_norm": 1.930010437965393, "learning_rate": 6.111253009570781e-05, "loss": 0.2212, "step": 39427 }, { "epoch": 26.80868385345997, "grad_norm": 3.872161388397217, "learning_rate": 6.095358249722548e-05, "loss": 0.2116, "step": 39516 }, { "epoch": 26.869063772048847, "grad_norm": 2.4142067432403564, "learning_rate": 6.0794518539008716e-05, "loss": 0.2223, "step": 39605 }, { "epoch": 26.929443690637722, "grad_norm": 2.2030022144317627, "learning_rate": 6.063533991079143e-05, "loss": 0.2155, "step": 39694 }, { "epoch": 26.989823609226594, "grad_norm": 3.7845208644866943, "learning_rate": 6.0476048303525725e-05, "loss": 0.2177, "step": 39783 }, { "epoch": 27.05020352781547, "grad_norm": 2.8146162033081055, "learning_rate": 6.0316645409363794e-05, "loss": 0.1945, "step": 39872 }, { "epoch": 27.11058344640434, "grad_norm": 2.4782633781433105, "learning_rate": 6.015713292164008e-05, "loss": 0.1978, "step": 39961 }, { "epoch": 27.170963364993217, "grad_norm": 1.6334956884384155, "learning_rate": 5.999751253485325e-05, "loss": 0.1949, "step": 40050 }, { "epoch": 27.23134328358209, "grad_norm": 3.610597610473633, "learning_rate": 5.983778594464814e-05, "loss": 0.1997, "step": 40139 }, { "epoch": 27.291723202170964, "grad_norm": 3.140693426132202, "learning_rate": 5.967795484779781e-05, "loss": 0.1905, "step": 40228 }, { "epoch": 27.352103120759836, "grad_norm": 2.56771183013916, "learning_rate": 5.9518020942185494e-05, "loss": 0.1893, "step": 40317 }, { "epoch": 27.41248303934871, "grad_norm": 1.921730875968933, "learning_rate": 5.935798592678653e-05, "loss": 0.1972, "step": 40406 }, { "epoch": 27.472862957937586, "grad_norm": 2.7568604946136475, "learning_rate": 5.91978515016504e-05, "loss": 0.2038, "step": 40495 }, { "epoch": 27.533242876526458, "grad_norm": 3.526125192642212, "learning_rate": 5.903761936788255e-05, "loss": 0.1881, "step": 40584 }, { "epoch": 27.593622795115333, "grad_norm": 2.400557279586792, "learning_rate": 5.887729122762644e-05, "loss": 0.1908, "step": 40673 }, { "epoch": 27.654002713704205, "grad_norm": 2.814988374710083, "learning_rate": 5.8716868784045374e-05, "loss": 0.1946, "step": 40762 }, { "epoch": 27.71438263229308, "grad_norm": 3.351440906524658, "learning_rate": 5.855635374130442e-05, "loss": 0.199, "step": 40851 }, { "epoch": 27.774762550881952, "grad_norm": 3.108304023742676, "learning_rate": 5.839574780455239e-05, "loss": 0.2009, "step": 40940 }, { "epoch": 27.835142469470828, "grad_norm": 3.37080979347229, "learning_rate": 5.823505267990359e-05, "loss": 0.1929, "step": 41029 }, { "epoch": 27.895522388059703, "grad_norm": 2.852602005004883, "learning_rate": 5.807427007441981e-05, "loss": 0.1946, "step": 41118 }, { "epoch": 27.955902306648575, "grad_norm": 2.15985369682312, "learning_rate": 5.791340169609214e-05, "loss": 0.1997, "step": 41207 }, { "epoch": 28.01628222523745, "grad_norm": 0.9773418307304382, "learning_rate": 5.7752449253822815e-05, "loss": 0.1789, "step": 41296 }, { "epoch": 28.076662143826322, "grad_norm": 2.572413444519043, "learning_rate": 5.759141445740713e-05, "loss": 0.1816, "step": 41385 }, { "epoch": 28.137042062415198, "grad_norm": 1.8453723192214966, "learning_rate": 5.7430299017515166e-05, "loss": 0.1795, "step": 41474 }, { "epoch": 28.19742198100407, "grad_norm": 2.09143328666687, "learning_rate": 5.726910464567371e-05, "loss": 0.1748, "step": 41563 }, { "epoch": 28.257801899592945, "grad_norm": 4.368978977203369, "learning_rate": 5.710783305424804e-05, "loss": 0.1865, "step": 41652 }, { "epoch": 28.318181818181817, "grad_norm": 1.7974387407302856, "learning_rate": 5.694648595642372e-05, "loss": 0.1878, "step": 41741 }, { "epoch": 28.378561736770692, "grad_norm": 3.7262039184570312, "learning_rate": 5.6785065066188446e-05, "loss": 0.1873, "step": 41830 }, { "epoch": 28.438941655359567, "grad_norm": 5.437527656555176, "learning_rate": 5.662357209831378e-05, "loss": 0.1958, "step": 41919 }, { "epoch": 28.49932157394844, "grad_norm": 2.2726356983184814, "learning_rate": 5.646200876833699e-05, "loss": 0.1818, "step": 42008 }, { "epoch": 28.559701492537314, "grad_norm": 3.234407663345337, "learning_rate": 5.630037679254278e-05, "loss": 0.1893, "step": 42097 }, { "epoch": 28.620081411126186, "grad_norm": 2.5418026447296143, "learning_rate": 5.613867788794508e-05, "loss": 0.185, "step": 42186 }, { "epoch": 28.68046132971506, "grad_norm": 2.394573926925659, "learning_rate": 5.5976913772268823e-05, "loss": 0.1803, "step": 42275 }, { "epoch": 28.740841248303933, "grad_norm": 2.4603261947631836, "learning_rate": 5.581508616393165e-05, "loss": 0.1793, "step": 42364 }, { "epoch": 28.80122116689281, "grad_norm": 3.139146566390991, "learning_rate": 5.5653196782025696e-05, "loss": 0.1797, "step": 42453 }, { "epoch": 28.80325644504749, "eval_accuracy": 0.18359375, "eval_loss": 4.114618301391602, "eval_runtime": 29.6243, "eval_samples_per_second": 17.283, "eval_steps_per_second": 0.135, "step": 42456 }, { "epoch": 28.86160108548168, "grad_norm": 1.7056379318237305, "learning_rate": 5.5491247346299334e-05, "loss": 0.1811, "step": 42542 }, { "epoch": 28.921981004070556, "grad_norm": 1.6604520082473755, "learning_rate": 5.532923957713885e-05, "loss": 0.1751, "step": 42631 }, { "epoch": 28.98236092265943, "grad_norm": 2.6219496726989746, "learning_rate": 5.5167175195550235e-05, "loss": 0.1814, "step": 42720 }, { "epoch": 29.042740841248303, "grad_norm": 1.9368810653686523, "learning_rate": 5.500505592314086e-05, "loss": 0.1732, "step": 42809 }, { "epoch": 29.10312075983718, "grad_norm": 2.2955291271209717, "learning_rate": 5.484288348210121e-05, "loss": 0.1611, "step": 42898 }, { "epoch": 29.16350067842605, "grad_norm": 1.331339716911316, "learning_rate": 5.468065959518656e-05, "loss": 0.1682, "step": 42987 }, { "epoch": 29.223880597014926, "grad_norm": 1.5474261045455933, "learning_rate": 5.4518385985698714e-05, "loss": 0.1652, "step": 43076 }, { "epoch": 29.284260515603798, "grad_norm": 1.5148978233337402, "learning_rate": 5.4356064377467684e-05, "loss": 0.18, "step": 43165 }, { "epoch": 29.344640434192673, "grad_norm": 5.3867878913879395, "learning_rate": 5.4193696494833346e-05, "loss": 0.1693, "step": 43254 }, { "epoch": 29.40502035278155, "grad_norm": 2.20180082321167, "learning_rate": 5.4031284062627165e-05, "loss": 0.1599, "step": 43343 }, { "epoch": 29.46540027137042, "grad_norm": 2.1975841522216797, "learning_rate": 5.386882880615383e-05, "loss": 0.171, "step": 43432 }, { "epoch": 29.525780189959296, "grad_norm": 2.5658628940582275, "learning_rate": 5.3706332451173006e-05, "loss": 0.1714, "step": 43521 }, { "epoch": 29.586160108548167, "grad_norm": 2.7179007530212402, "learning_rate": 5.354379672388089e-05, "loss": 0.1713, "step": 43610 }, { "epoch": 29.646540027137043, "grad_norm": 1.867160677909851, "learning_rate": 5.338122335089196e-05, "loss": 0.1684, "step": 43699 }, { "epoch": 29.706919945725915, "grad_norm": 1.5570918321609497, "learning_rate": 5.321861405922063e-05, "loss": 0.1713, "step": 43788 }, { "epoch": 29.76729986431479, "grad_norm": 3.943268060684204, "learning_rate": 5.305597057626279e-05, "loss": 0.1714, "step": 43877 }, { "epoch": 29.827679782903665, "grad_norm": 1.6523535251617432, "learning_rate": 5.2893294629777644e-05, "loss": 0.1754, "step": 43966 }, { "epoch": 29.888059701492537, "grad_norm": 2.623303174972534, "learning_rate": 5.273058794786918e-05, "loss": 0.1724, "step": 44055 }, { "epoch": 29.948439620081412, "grad_norm": 1.8316419124603271, "learning_rate": 5.256785225896794e-05, "loss": 0.17, "step": 44144 }, { "epoch": 30.008819538670284, "grad_norm": 2.2553136348724365, "learning_rate": 5.240508929181258e-05, "loss": 0.1766, "step": 44233 }, { "epoch": 30.06919945725916, "grad_norm": 1.168664574623108, "learning_rate": 5.224230077543153e-05, "loss": 0.1523, "step": 44322 }, { "epoch": 30.12957937584803, "grad_norm": 1.4800312519073486, "learning_rate": 5.2079488439124644e-05, "loss": 0.1553, "step": 44411 }, { "epoch": 30.189959294436907, "grad_norm": 1.983797550201416, "learning_rate": 5.1916654012444796e-05, "loss": 0.1605, "step": 44500 }, { "epoch": 30.25033921302578, "grad_norm": 1.0819350481033325, "learning_rate": 5.1753799225179545e-05, "loss": 0.1592, "step": 44589 }, { "epoch": 30.310719131614654, "grad_norm": 2.143650531768799, "learning_rate": 5.159092580733276e-05, "loss": 0.1608, "step": 44678 }, { "epoch": 30.37109905020353, "grad_norm": 1.6740977764129639, "learning_rate": 5.142803548910614e-05, "loss": 0.1591, "step": 44767 }, { "epoch": 30.4314789687924, "grad_norm": 2.7269814014434814, "learning_rate": 5.126513000088101e-05, "loss": 0.1624, "step": 44856 }, { "epoch": 30.491858887381277, "grad_norm": 2.9973506927490234, "learning_rate": 5.1102211073199805e-05, "loss": 0.1597, "step": 44945 }, { "epoch": 30.55223880597015, "grad_norm": 1.9262616634368896, "learning_rate": 5.093928043674772e-05, "loss": 0.1517, "step": 45034 }, { "epoch": 30.612618724559024, "grad_norm": 2.577742099761963, "learning_rate": 5.077633982233433e-05, "loss": 0.1668, "step": 45123 }, { "epoch": 30.672998643147896, "grad_norm": 1.0925939083099365, "learning_rate": 5.061339096087523e-05, "loss": 0.1611, "step": 45212 }, { "epoch": 30.73337856173677, "grad_norm": 1.5580718517303467, "learning_rate": 5.0450435583373624e-05, "loss": 0.1691, "step": 45301 }, { "epoch": 30.793758480325643, "grad_norm": 1.7016775608062744, "learning_rate": 5.028747542090189e-05, "loss": 0.1565, "step": 45390 }, { "epoch": 30.854138398914518, "grad_norm": 2.930467128753662, "learning_rate": 5.012451220458328e-05, "loss": 0.1685, "step": 45479 }, { "epoch": 30.914518317503394, "grad_norm": 2.0711212158203125, "learning_rate": 4.996154766557351e-05, "loss": 0.1606, "step": 45568 }, { "epoch": 30.974898236092265, "grad_norm": 1.6559313535690308, "learning_rate": 4.9798583535042254e-05, "loss": 0.1695, "step": 45657 }, { "epoch": 31.03527815468114, "grad_norm": 2.794700860977173, "learning_rate": 4.9635621544154945e-05, "loss": 0.1506, "step": 45746 }, { "epoch": 31.095658073270013, "grad_norm": 2.3707473278045654, "learning_rate": 4.947266342405424e-05, "loss": 0.1474, "step": 45835 }, { "epoch": 31.156037991858888, "grad_norm": 1.6921839714050293, "learning_rate": 4.930971090584168e-05, "loss": 0.1468, "step": 45924 }, { "epoch": 31.203527815468114, "eval_accuracy": 0.181640625, "eval_loss": 4.099109649658203, "eval_runtime": 19.3439, "eval_samples_per_second": 26.468, "eval_steps_per_second": 0.207, "step": 45994 }, { "epoch": 31.21641791044776, "grad_norm": 1.611038327217102, "learning_rate": 4.91467657205593e-05, "loss": 0.1511, "step": 46013 }, { "epoch": 31.276797829036635, "grad_norm": 1.53565514087677, "learning_rate": 4.8983829599171235e-05, "loss": 0.1545, "step": 46102 }, { "epoch": 31.33717774762551, "grad_norm": 1.6248897314071655, "learning_rate": 4.8820904272545336e-05, "loss": 0.1456, "step": 46191 }, { "epoch": 31.397557666214382, "grad_norm": 1.318975806236267, "learning_rate": 4.865799147143479e-05, "loss": 0.1483, "step": 46280 }, { "epoch": 31.457937584803258, "grad_norm": 1.2955539226531982, "learning_rate": 4.8495092926459736e-05, "loss": 0.1515, "step": 46369 }, { "epoch": 31.51831750339213, "grad_norm": 3.0391619205474854, "learning_rate": 4.833221036808882e-05, "loss": 0.1479, "step": 46458 }, { "epoch": 31.578697421981005, "grad_norm": 1.7275387048721313, "learning_rate": 4.81693455266209e-05, "loss": 0.1517, "step": 46547 }, { "epoch": 31.639077340569877, "grad_norm": 2.1065945625305176, "learning_rate": 4.8006500132166625e-05, "loss": 0.1501, "step": 46636 }, { "epoch": 31.699457259158752, "grad_norm": 0.9785634875297546, "learning_rate": 4.784367591463008e-05, "loss": 0.1501, "step": 46725 }, { "epoch": 31.759837177747624, "grad_norm": 1.2350496053695679, "learning_rate": 4.768087460369036e-05, "loss": 0.146, "step": 46814 }, { "epoch": 31.8202170963365, "grad_norm": 1.5443971157073975, "learning_rate": 4.75180979287832e-05, "loss": 0.1514, "step": 46903 }, { "epoch": 31.880597014925375, "grad_norm": 1.1282203197479248, "learning_rate": 4.735534761908267e-05, "loss": 0.1478, "step": 46992 }, { "epoch": 31.940976933514246, "grad_norm": 1.1595454216003418, "learning_rate": 4.719262540348275e-05, "loss": 0.15, "step": 47081 }, { "epoch": 32.00135685210312, "grad_norm": 1.392354130744934, "learning_rate": 4.702993301057897e-05, "loss": 0.1402, "step": 47170 }, { "epoch": 32.061736770691994, "grad_norm": 1.6813993453979492, "learning_rate": 4.686727216865008e-05, "loss": 0.1458, "step": 47259 }, { "epoch": 32.122116689280865, "grad_norm": 2.200620174407959, "learning_rate": 4.6704644605639617e-05, "loss": 0.1426, "step": 47348 }, { "epoch": 32.182496607869744, "grad_norm": 1.1454344987869263, "learning_rate": 4.654205204913762e-05, "loss": 0.1417, "step": 47437 }, { "epoch": 32.242876526458616, "grad_norm": 1.6104034185409546, "learning_rate": 4.6379496226362285e-05, "loss": 0.1364, "step": 47526 }, { "epoch": 32.30325644504749, "grad_norm": 2.7888503074645996, "learning_rate": 4.621697886414152e-05, "loss": 0.1415, "step": 47615 }, { "epoch": 32.36363636363637, "grad_norm": 1.4862406253814697, "learning_rate": 4.605450168889475e-05, "loss": 0.1449, "step": 47704 }, { "epoch": 32.42401628222524, "grad_norm": 1.396264672279358, "learning_rate": 4.5892066426614426e-05, "loss": 0.1351, "step": 47793 }, { "epoch": 32.48439620081411, "grad_norm": 0.8358775973320007, "learning_rate": 4.572967480284777e-05, "loss": 0.1478, "step": 47882 }, { "epoch": 32.54477611940298, "grad_norm": 1.150931477546692, "learning_rate": 4.556732854267846e-05, "loss": 0.1388, "step": 47971 }, { "epoch": 32.60515603799186, "grad_norm": 1.9280314445495605, "learning_rate": 4.540502937070826e-05, "loss": 0.1336, "step": 48060 }, { "epoch": 32.66553595658073, "grad_norm": 1.9232927560806274, "learning_rate": 4.5242779011038746e-05, "loss": 0.1357, "step": 48149 }, { "epoch": 32.725915875169605, "grad_norm": 1.9297000169754028, "learning_rate": 4.5080579187252875e-05, "loss": 0.1434, "step": 48238 }, { "epoch": 32.786295793758484, "grad_norm": 1.3162543773651123, "learning_rate": 4.491843162239686e-05, "loss": 0.1357, "step": 48327 }, { "epoch": 32.846675712347356, "grad_norm": 2.112964391708374, "learning_rate": 4.4756338038961734e-05, "loss": 0.1347, "step": 48416 }, { "epoch": 32.90705563093623, "grad_norm": 2.025836944580078, "learning_rate": 4.459430015886507e-05, "loss": 0.1361, "step": 48505 }, { "epoch": 32.9674355495251, "grad_norm": 2.231003999710083, "learning_rate": 4.443231970343273e-05, "loss": 0.1493, "step": 48594 }, { "epoch": 33.02781546811398, "grad_norm": 0.858778715133667, "learning_rate": 4.427039839338051e-05, "loss": 0.1335, "step": 48683 }, { "epoch": 33.08819538670285, "grad_norm": 1.7645868062973022, "learning_rate": 4.410853794879596e-05, "loss": 0.1318, "step": 48772 }, { "epoch": 33.14857530529172, "grad_norm": 3.9215147495269775, "learning_rate": 4.3946740089120036e-05, "loss": 0.1289, "step": 48861 }, { "epoch": 33.208955223880594, "grad_norm": 1.0674065351486206, "learning_rate": 4.378500653312886e-05, "loss": 0.1314, "step": 48950 }, { "epoch": 33.26933514246947, "grad_norm": 1.1910934448242188, "learning_rate": 4.362333899891545e-05, "loss": 0.1285, "step": 49039 }, { "epoch": 33.329715061058344, "grad_norm": 3.3156814575195312, "learning_rate": 4.346173920387146e-05, "loss": 0.1314, "step": 49128 }, { "epoch": 33.390094979647216, "grad_norm": 1.518210530281067, "learning_rate": 4.330020886466898e-05, "loss": 0.1327, "step": 49217 }, { "epoch": 33.450474898236095, "grad_norm": 2.037992238998413, "learning_rate": 4.313874969724227e-05, "loss": 0.1294, "step": 49306 }, { "epoch": 33.51085481682497, "grad_norm": 1.0530787706375122, "learning_rate": 4.2977363416769495e-05, "loss": 0.1338, "step": 49395 }, { "epoch": 33.57123473541384, "grad_norm": 2.6281962394714355, "learning_rate": 4.281605173765462e-05, "loss": 0.1385, "step": 49484 }, { "epoch": 33.60379918588874, "eval_accuracy": 0.177734375, "eval_loss": 4.106353759765625, "eval_runtime": 43.8825, "eval_samples_per_second": 11.668, "eval_steps_per_second": 0.091, "step": 49532 }, { "epoch": 33.63161465400271, "grad_norm": 1.659097671508789, "learning_rate": 4.265481637350902e-05, "loss": 0.1334, "step": 49573 }, { "epoch": 33.69199457259159, "grad_norm": 1.2055881023406982, "learning_rate": 4.249365903713345e-05, "loss": 0.1277, "step": 49662 }, { "epoch": 33.75237449118046, "grad_norm": 1.3534148931503296, "learning_rate": 4.2332581440499765e-05, "loss": 0.1241, "step": 49751 }, { "epoch": 33.81275440976933, "grad_norm": 1.6355328559875488, "learning_rate": 4.217158529473275e-05, "loss": 0.1309, "step": 49840 }, { "epoch": 33.87313432835821, "grad_norm": 1.2613086700439453, "learning_rate": 4.2010672310091895e-05, "loss": 0.1306, "step": 49929 }, { "epoch": 33.933514246947084, "grad_norm": 2.427302837371826, "learning_rate": 4.1849844195953314e-05, "loss": 0.1335, "step": 50018 }, { "epoch": 33.993894165535956, "grad_norm": 1.0683902502059937, "learning_rate": 4.1689102660791536e-05, "loss": 0.137, "step": 50107 }, { "epoch": 34.05427408412483, "grad_norm": 1.184240460395813, "learning_rate": 4.1528449412161375e-05, "loss": 0.1206, "step": 50196 }, { "epoch": 34.114654002713706, "grad_norm": 2.108067512512207, "learning_rate": 4.136788615667974e-05, "loss": 0.125, "step": 50285 }, { "epoch": 34.17503392130258, "grad_norm": 1.4755454063415527, "learning_rate": 4.120741460000758e-05, "loss": 0.1283, "step": 50374 }, { "epoch": 34.23541383989145, "grad_norm": 1.8144526481628418, "learning_rate": 4.1047036446831686e-05, "loss": 0.1279, "step": 50463 }, { "epoch": 34.29579375848033, "grad_norm": 1.2851365804672241, "learning_rate": 4.088675340084668e-05, "loss": 0.1207, "step": 50552 }, { "epoch": 34.3561736770692, "grad_norm": 1.1482937335968018, "learning_rate": 4.072656716473684e-05, "loss": 0.1251, "step": 50641 }, { "epoch": 34.41655359565807, "grad_norm": 1.2348805665969849, "learning_rate": 4.0566479440158036e-05, "loss": 0.1235, "step": 50730 }, { "epoch": 34.476933514246944, "grad_norm": 1.1819324493408203, "learning_rate": 4.040649192771962e-05, "loss": 0.132, "step": 50819 }, { "epoch": 34.53731343283582, "grad_norm": 1.1830766201019287, "learning_rate": 4.0246606326966425e-05, "loss": 0.1176, "step": 50908 }, { "epoch": 34.597693351424695, "grad_norm": 1.9532086849212646, "learning_rate": 4.0086824336360676e-05, "loss": 0.1231, "step": 50997 }, { "epoch": 34.65807327001357, "grad_norm": 1.529571294784546, "learning_rate": 3.992714765326396e-05, "loss": 0.1242, "step": 51086 }, { "epoch": 34.71845318860244, "grad_norm": 1.2561233043670654, "learning_rate": 3.9767577973919146e-05, "loss": 0.1255, "step": 51175 }, { "epoch": 34.77883310719132, "grad_norm": 1.7090590000152588, "learning_rate": 3.960811699343243e-05, "loss": 0.1215, "step": 51264 }, { "epoch": 34.83921302578019, "grad_norm": 2.106395959854126, "learning_rate": 3.94487664057553e-05, "loss": 0.1285, "step": 51353 }, { "epoch": 34.89959294436906, "grad_norm": 1.165230393409729, "learning_rate": 3.928952790366654e-05, "loss": 0.1216, "step": 51442 }, { "epoch": 34.95997286295794, "grad_norm": 1.2061336040496826, "learning_rate": 3.913040317875424e-05, "loss": 0.1164, "step": 51531 }, { "epoch": 35.02035278154681, "grad_norm": 1.160407304763794, "learning_rate": 3.897139392139788e-05, "loss": 0.1258, "step": 51620 }, { "epoch": 35.080732700135684, "grad_norm": 0.8674483299255371, "learning_rate": 3.881250182075026e-05, "loss": 0.1129, "step": 51709 }, { "epoch": 35.141112618724556, "grad_norm": 1.4497802257537842, "learning_rate": 3.8653728564719674e-05, "loss": 0.1244, "step": 51798 }, { "epoch": 35.201492537313435, "grad_norm": 1.649856448173523, "learning_rate": 3.8495075839951937e-05, "loss": 0.1157, "step": 51887 }, { "epoch": 35.26187245590231, "grad_norm": 1.97478187084198, "learning_rate": 3.833654533181244e-05, "loss": 0.1182, "step": 51976 }, { "epoch": 35.32225237449118, "grad_norm": 1.4241811037063599, "learning_rate": 3.8178138724368275e-05, "loss": 0.1195, "step": 52065 }, { "epoch": 35.38263229308006, "grad_norm": 1.9427152872085571, "learning_rate": 3.8019857700370345e-05, "loss": 0.1214, "step": 52154 }, { "epoch": 35.44301221166893, "grad_norm": 1.2185932397842407, "learning_rate": 3.7861703941235444e-05, "loss": 0.1149, "step": 52243 }, { "epoch": 35.5033921302578, "grad_norm": 1.1983317136764526, "learning_rate": 3.770367912702849e-05, "loss": 0.1182, "step": 52332 }, { "epoch": 35.56377204884667, "grad_norm": 0.9646018147468567, "learning_rate": 3.7545784936444605e-05, "loss": 0.1272, "step": 52421 }, { "epoch": 35.62415196743555, "grad_norm": 1.189382791519165, "learning_rate": 3.73880230467913e-05, "loss": 0.1139, "step": 52510 }, { "epoch": 35.68453188602442, "grad_norm": 1.0490000247955322, "learning_rate": 3.7230395133970595e-05, "loss": 0.1179, "step": 52599 }, { "epoch": 35.744911804613295, "grad_norm": 1.055656909942627, "learning_rate": 3.7072902872461365e-05, "loss": 0.1184, "step": 52688 }, { "epoch": 35.805291723202174, "grad_norm": 1.564658522605896, "learning_rate": 3.691554793530143e-05, "loss": 0.12, "step": 52777 }, { "epoch": 35.865671641791046, "grad_norm": 1.054408311843872, "learning_rate": 3.6758331994069784e-05, "loss": 0.1145, "step": 52866 }, { "epoch": 35.92605156037992, "grad_norm": 1.5454896688461304, "learning_rate": 3.660125671886892e-05, "loss": 0.1104, "step": 52955 }, { "epoch": 35.98643147896879, "grad_norm": 0.9646552801132202, "learning_rate": 3.6444323778307e-05, "loss": 0.1192, "step": 53044 }, { "epoch": 36.004070556309365, "eval_accuracy": 0.181640625, "eval_loss": 4.21491813659668, "eval_runtime": 20.6978, "eval_samples_per_second": 24.737, "eval_steps_per_second": 0.193, "step": 53070 }, { "epoch": 36.04681139755767, "grad_norm": 2.2136874198913574, "learning_rate": 3.628753483948017e-05, "loss": 0.115, "step": 53133 }, { "epoch": 36.10719131614654, "grad_norm": 0.8969342708587646, "learning_rate": 3.613089156795489e-05, "loss": 0.111, "step": 53222 }, { "epoch": 36.16757123473541, "grad_norm": 1.5373083353042603, "learning_rate": 3.5974395627750136e-05, "loss": 0.1181, "step": 53311 }, { "epoch": 36.22795115332429, "grad_norm": 1.0511338710784912, "learning_rate": 3.581804868131986e-05, "loss": 0.1089, "step": 53400 }, { "epoch": 36.28833107191316, "grad_norm": 0.6941206455230713, "learning_rate": 3.566185238953516e-05, "loss": 0.1133, "step": 53489 }, { "epoch": 36.348710990502035, "grad_norm": 1.0698457956314087, "learning_rate": 3.5505808411666805e-05, "loss": 0.1046, "step": 53578 }, { "epoch": 36.40909090909091, "grad_norm": 1.1524955034255981, "learning_rate": 3.5349918405367533e-05, "loss": 0.1111, "step": 53667 }, { "epoch": 36.469470827679785, "grad_norm": 0.7653555274009705, "learning_rate": 3.519418402665441e-05, "loss": 0.1102, "step": 53756 }, { "epoch": 36.52985074626866, "grad_norm": 0.7626907229423523, "learning_rate": 3.503860692989129e-05, "loss": 0.1109, "step": 53845 }, { "epoch": 36.59023066485753, "grad_norm": 1.2246617078781128, "learning_rate": 3.4883188767771235e-05, "loss": 0.1087, "step": 53934 }, { "epoch": 36.6506105834464, "grad_norm": 0.8445035815238953, "learning_rate": 3.472793119129891e-05, "loss": 0.1104, "step": 54023 }, { "epoch": 36.71099050203528, "grad_norm": 0.4783117175102234, "learning_rate": 3.4572835849773124e-05, "loss": 0.1101, "step": 54112 }, { "epoch": 36.77137042062415, "grad_norm": 0.6431951522827148, "learning_rate": 3.441790439076924e-05, "loss": 0.1128, "step": 54201 }, { "epoch": 36.83175033921302, "grad_norm": 0.8060305118560791, "learning_rate": 3.426313846012174e-05, "loss": 0.1077, "step": 54290 }, { "epoch": 36.8921302578019, "grad_norm": 1.309480905532837, "learning_rate": 3.410853970190662e-05, "loss": 0.1094, "step": 54379 }, { "epoch": 36.952510176390774, "grad_norm": 0.7138769030570984, "learning_rate": 3.395410975842408e-05, "loss": 0.1119, "step": 54468 }, { "epoch": 37.012890094979646, "grad_norm": 1.4216080904006958, "learning_rate": 3.379985027018098e-05, "loss": 0.1117, "step": 54557 }, { "epoch": 37.07327001356852, "grad_norm": 1.4457802772521973, "learning_rate": 3.3645762875873415e-05, "loss": 0.1024, "step": 54646 }, { "epoch": 37.1336499321574, "grad_norm": 0.7809485793113708, "learning_rate": 3.349184921236939e-05, "loss": 0.1054, "step": 54735 }, { "epoch": 37.19402985074627, "grad_norm": 1.7159395217895508, "learning_rate": 3.333811091469129e-05, "loss": 0.1028, "step": 54824 }, { "epoch": 37.25440976933514, "grad_norm": 0.48482632637023926, "learning_rate": 3.318454961599864e-05, "loss": 0.105, "step": 54913 }, { "epoch": 37.31478968792402, "grad_norm": 1.0585776567459106, "learning_rate": 3.30311669475707e-05, "loss": 0.0995, "step": 55002 }, { "epoch": 37.37516960651289, "grad_norm": 0.7327682971954346, "learning_rate": 3.2877964538789154e-05, "loss": 0.1072, "step": 55091 }, { "epoch": 37.43554952510176, "grad_norm": 1.583203911781311, "learning_rate": 3.272494401712078e-05, "loss": 0.104, "step": 55180 }, { "epoch": 37.495929443690635, "grad_norm": 1.2305749654769897, "learning_rate": 3.257210700810015e-05, "loss": 0.1038, "step": 55269 }, { "epoch": 37.556309362279514, "grad_norm": 8.245156288146973, "learning_rate": 3.241945513531241e-05, "loss": 0.1087, "step": 55358 }, { "epoch": 37.616689280868385, "grad_norm": 0.8825012445449829, "learning_rate": 3.226699002037602e-05, "loss": 0.109, "step": 55447 }, { "epoch": 37.67706919945726, "grad_norm": 1.1218957901000977, "learning_rate": 3.2114713282925466e-05, "loss": 0.1038, "step": 55536 }, { "epoch": 37.737449118046136, "grad_norm": 1.4190541505813599, "learning_rate": 3.196262654059419e-05, "loss": 0.108, "step": 55625 }, { "epoch": 37.79782903663501, "grad_norm": 0.5339131951332092, "learning_rate": 3.1810731408997185e-05, "loss": 0.1103, "step": 55714 }, { "epoch": 37.85820895522388, "grad_norm": 1.3955272436141968, "learning_rate": 3.1659029501714077e-05, "loss": 0.0993, "step": 55803 }, { "epoch": 37.91858887381275, "grad_norm": 1.7271915674209595, "learning_rate": 3.150752243027185e-05, "loss": 0.1081, "step": 55892 }, { "epoch": 37.97896879240163, "grad_norm": 1.272377848625183, "learning_rate": 3.1356211804127726e-05, "loss": 0.0988, "step": 55981 }, { "epoch": 38.0393487109905, "grad_norm": 1.4303677082061768, "learning_rate": 3.1205099230652134e-05, "loss": 0.0947, "step": 56070 }, { "epoch": 38.099728629579374, "grad_norm": 0.8582963347434998, "learning_rate": 3.105418631511151e-05, "loss": 0.1023, "step": 56159 }, { "epoch": 38.16010854816825, "grad_norm": 0.9735032916069031, "learning_rate": 3.090347466065141e-05, "loss": 0.0994, "step": 56248 }, { "epoch": 38.220488466757125, "grad_norm": 0.8023036122322083, "learning_rate": 3.075296586827938e-05, "loss": 0.0968, "step": 56337 }, { "epoch": 38.280868385346, "grad_norm": 0.8283627033233643, "learning_rate": 3.060266153684792e-05, "loss": 0.0988, "step": 56426 }, { "epoch": 38.34124830393487, "grad_norm": 1.1911463737487793, "learning_rate": 3.045256326303762e-05, "loss": 0.1015, "step": 56515 }, { "epoch": 38.40162822252375, "grad_norm": 1.3183075189590454, "learning_rate": 3.030267264134003e-05, "loss": 0.1014, "step": 56604 }, { "epoch": 38.404341926729984, "eval_accuracy": 0.169921875, "eval_loss": 4.234889984130859, "eval_runtime": 20.5487, "eval_samples_per_second": 24.916, "eval_steps_per_second": 0.195, "step": 56608 }, { "epoch": 38.46200814111262, "grad_norm": 0.9125858545303345, "learning_rate": 3.0152991264040888e-05, "loss": 0.101, "step": 56693 }, { "epoch": 38.52238805970149, "grad_norm": 0.5899451971054077, "learning_rate": 3.0003520721203106e-05, "loss": 0.0969, "step": 56782 }, { "epoch": 38.58276797829036, "grad_norm": 1.2424014806747437, "learning_rate": 2.9854262600649907e-05, "loss": 0.1017, "step": 56871 }, { "epoch": 38.64314789687924, "grad_norm": 1.250267744064331, "learning_rate": 2.9705218487947984e-05, "loss": 0.0982, "step": 56960 }, { "epoch": 38.703527815468114, "grad_norm": 1.153306245803833, "learning_rate": 2.9556389966390552e-05, "loss": 0.1006, "step": 57049 }, { "epoch": 38.763907734056986, "grad_norm": 1.2941042184829712, "learning_rate": 2.940777861698068e-05, "loss": 0.0975, "step": 57138 }, { "epoch": 38.824287652645864, "grad_norm": 1.0015143156051636, "learning_rate": 2.9259386018414396e-05, "loss": 0.1054, "step": 57227 }, { "epoch": 38.884667571234736, "grad_norm": 0.8103719353675842, "learning_rate": 2.9111213747063915e-05, "loss": 0.1004, "step": 57316 }, { "epoch": 38.94504748982361, "grad_norm": 2.045173406600952, "learning_rate": 2.896326337696098e-05, "loss": 0.0993, "step": 57405 }, { "epoch": 39.00542740841248, "grad_norm": 0.9834128022193909, "learning_rate": 2.8815536479780014e-05, "loss": 0.0971, "step": 57494 }, { "epoch": 39.06580732700136, "grad_norm": 0.6491034030914307, "learning_rate": 2.8668034624821514e-05, "loss": 0.0957, "step": 57583 }, { "epoch": 39.12618724559023, "grad_norm": 1.0920275449752808, "learning_rate": 2.852075937899541e-05, "loss": 0.0938, "step": 57672 }, { "epoch": 39.1865671641791, "grad_norm": 0.9111031293869019, "learning_rate": 2.8373712306804267e-05, "loss": 0.0954, "step": 57761 }, { "epoch": 39.24694708276798, "grad_norm": 0.7507003545761108, "learning_rate": 2.8226894970326856e-05, "loss": 0.0926, "step": 57850 }, { "epoch": 39.30732700135685, "grad_norm": 1.0884746313095093, "learning_rate": 2.8080308929201392e-05, "loss": 0.0946, "step": 57939 }, { "epoch": 39.367706919945725, "grad_norm": 0.7752851843833923, "learning_rate": 2.793395574060911e-05, "loss": 0.0925, "step": 58028 }, { "epoch": 39.4280868385346, "grad_norm": 0.8282026052474976, "learning_rate": 2.7787836959257617e-05, "loss": 0.0954, "step": 58117 }, { "epoch": 39.488466757123476, "grad_norm": 0.7554723620414734, "learning_rate": 2.764195413736444e-05, "loss": 0.0965, "step": 58206 }, { "epoch": 39.54884667571235, "grad_norm": 1.461937427520752, "learning_rate": 2.7496308824640505e-05, "loss": 0.0963, "step": 58295 }, { "epoch": 39.60922659430122, "grad_norm": 1.260448694229126, "learning_rate": 2.735090256827365e-05, "loss": 0.0901, "step": 58384 }, { "epoch": 39.6696065128901, "grad_norm": 0.5917372703552246, "learning_rate": 2.720573691291226e-05, "loss": 0.0912, "step": 58473 }, { "epoch": 39.72998643147897, "grad_norm": 1.0899447202682495, "learning_rate": 2.70608134006488e-05, "loss": 0.0971, "step": 58562 }, { "epoch": 39.79036635006784, "grad_norm": 0.700945258140564, "learning_rate": 2.691613357100348e-05, "loss": 0.0959, "step": 58651 }, { "epoch": 39.850746268656714, "grad_norm": 0.563937783241272, "learning_rate": 2.6771698960907844e-05, "loss": 0.0924, "step": 58740 }, { "epoch": 39.91112618724559, "grad_norm": 1.2287607192993164, "learning_rate": 2.6627511104688463e-05, "loss": 0.0915, "step": 58829 }, { "epoch": 39.971506105834465, "grad_norm": 1.0432151556015015, "learning_rate": 2.6483571534050684e-05, "loss": 0.094, "step": 58918 }, { "epoch": 40.031886024423336, "grad_norm": 0.9087603092193604, "learning_rate": 2.6339881778062286e-05, "loss": 0.0914, "step": 59007 }, { "epoch": 40.09226594301221, "grad_norm": 1.0434340238571167, "learning_rate": 2.6196443363137295e-05, "loss": 0.0932, "step": 59096 }, { "epoch": 40.15264586160109, "grad_norm": 1.4416966438293457, "learning_rate": 2.6053257813019756e-05, "loss": 0.0951, "step": 59185 }, { "epoch": 40.21302578018996, "grad_norm": 0.5194874405860901, "learning_rate": 2.5910326648767464e-05, "loss": 0.0909, "step": 59274 }, { "epoch": 40.27340569877883, "grad_norm": 0.4782836139202118, "learning_rate": 2.5767651388735976e-05, "loss": 0.0917, "step": 59363 }, { "epoch": 40.33378561736771, "grad_norm": 0.7723681926727295, "learning_rate": 2.5625233548562288e-05, "loss": 0.0928, "step": 59452 }, { "epoch": 40.39416553595658, "grad_norm": 0.5637179017066956, "learning_rate": 2.5483074641148896e-05, "loss": 0.095, "step": 59541 }, { "epoch": 40.45454545454545, "grad_norm": 0.9517094492912292, "learning_rate": 2.534117617664766e-05, "loss": 0.0857, "step": 59630 }, { "epoch": 40.514925373134325, "grad_norm": 1.0360537767410278, "learning_rate": 2.5199539662443683e-05, "loss": 0.0923, "step": 59719 }, { "epoch": 40.575305291723204, "grad_norm": 0.993859589099884, "learning_rate": 2.5058166603139453e-05, "loss": 0.0918, "step": 59808 }, { "epoch": 40.635685210312076, "grad_norm": 0.5905105471611023, "learning_rate": 2.491705850053876e-05, "loss": 0.0914, "step": 59897 }, { "epoch": 40.69606512890095, "grad_norm": 1.8507524728775024, "learning_rate": 2.4776216853630747e-05, "loss": 0.0948, "step": 59986 }, { "epoch": 40.75644504748983, "grad_norm": 0.8569918274879456, "learning_rate": 2.4635643158574034e-05, "loss": 0.0933, "step": 60075 }, { "epoch": 40.80461329715061, "eval_accuracy": 0.17578125, "eval_loss": 4.287986755371094, "eval_runtime": 29.4248, "eval_samples_per_second": 17.4, "eval_steps_per_second": 0.136, "step": 60146 }, { "epoch": 40.8168249660787, "grad_norm": 1.2466926574707031, "learning_rate": 2.4495338908680733e-05, "loss": 0.0884, "step": 60164 }, { "epoch": 40.87720488466757, "grad_norm": 1.0967109203338623, "learning_rate": 2.4355305594400703e-05, "loss": 0.0885, "step": 60253 }, { "epoch": 40.93758480325644, "grad_norm": 3.7335941791534424, "learning_rate": 2.4215544703305624e-05, "loss": 0.0863, "step": 60342 }, { "epoch": 40.99796472184532, "grad_norm": 0.7128244638442993, "learning_rate": 2.4076057720073263e-05, "loss": 0.0916, "step": 60431 }, { "epoch": 41.05834464043419, "grad_norm": 0.6948025226593018, "learning_rate": 2.393684612647165e-05, "loss": 0.0907, "step": 60520 }, { "epoch": 41.118724559023065, "grad_norm": 0.9347543716430664, "learning_rate": 2.3797911401343324e-05, "loss": 0.0863, "step": 60609 }, { "epoch": 41.17910447761194, "grad_norm": 0.6577604413032532, "learning_rate": 2.3659255020589693e-05, "loss": 0.0893, "step": 60698 }, { "epoch": 41.239484396200815, "grad_norm": 1.0613411664962769, "learning_rate": 2.3520878457155317e-05, "loss": 0.0907, "step": 60787 }, { "epoch": 41.29986431478969, "grad_norm": 0.7223649024963379, "learning_rate": 2.338278318101224e-05, "loss": 0.0858, "step": 60876 }, { "epoch": 41.36024423337856, "grad_norm": 0.6473923325538635, "learning_rate": 2.3244970659144434e-05, "loss": 0.0881, "step": 60965 }, { "epoch": 41.42062415196744, "grad_norm": 0.6310983300209045, "learning_rate": 2.3107442355532105e-05, "loss": 0.0866, "step": 61054 }, { "epoch": 41.48100407055631, "grad_norm": 1.2830203771591187, "learning_rate": 2.2970199731136305e-05, "loss": 0.0882, "step": 61143 }, { "epoch": 41.54138398914518, "grad_norm": 0.6028885245323181, "learning_rate": 2.2833244243883222e-05, "loss": 0.0861, "step": 61232 }, { "epoch": 41.60176390773406, "grad_norm": 1.1787885427474976, "learning_rate": 2.2696577348648867e-05, "loss": 0.0897, "step": 61321 }, { "epoch": 41.66214382632293, "grad_norm": 0.5341454148292542, "learning_rate": 2.2560200497243537e-05, "loss": 0.0871, "step": 61410 }, { "epoch": 41.722523744911804, "grad_norm": 1.4164313077926636, "learning_rate": 2.2424115138396336e-05, "loss": 0.0924, "step": 61499 }, { "epoch": 41.782903663500676, "grad_norm": 0.7035442590713501, "learning_rate": 2.2288322717739912e-05, "loss": 0.088, "step": 61588 }, { "epoch": 41.843283582089555, "grad_norm": 0.6574503779411316, "learning_rate": 2.2152824677795003e-05, "loss": 0.0868, "step": 61677 }, { "epoch": 41.90366350067843, "grad_norm": 0.4766522943973541, "learning_rate": 2.201762245795516e-05, "loss": 0.0887, "step": 61766 }, { "epoch": 41.9640434192673, "grad_norm": 2.5811030864715576, "learning_rate": 2.188271749447146e-05, "loss": 0.0872, "step": 61855 }, { "epoch": 42.02442333785617, "grad_norm": 0.7208371758460999, "learning_rate": 2.1748111220437163e-05, "loss": 0.0825, "step": 61944 }, { "epoch": 42.08480325644505, "grad_norm": 0.7155792713165283, "learning_rate": 2.161380506577262e-05, "loss": 0.0913, "step": 62033 }, { "epoch": 42.14518317503392, "grad_norm": 0.7777039408683777, "learning_rate": 2.147980045720999e-05, "loss": 0.0837, "step": 62122 }, { "epoch": 42.20556309362279, "grad_norm": 0.5456185340881348, "learning_rate": 2.134609881827813e-05, "loss": 0.0825, "step": 62211 }, { "epoch": 42.26594301221167, "grad_norm": 0.614791750907898, "learning_rate": 2.1212701569287463e-05, "loss": 0.078, "step": 62300 }, { "epoch": 42.32632293080054, "grad_norm": 0.9303745627403259, "learning_rate": 2.1079610127314827e-05, "loss": 0.0815, "step": 62389 }, { "epoch": 42.386702849389415, "grad_norm": 0.6811819672584534, "learning_rate": 2.094682590618852e-05, "loss": 0.0842, "step": 62478 }, { "epoch": 42.44708276797829, "grad_norm": 0.7549321055412292, "learning_rate": 2.081435031647326e-05, "loss": 0.0834, "step": 62567 }, { "epoch": 42.507462686567166, "grad_norm": 0.6525147557258606, "learning_rate": 2.0682184765455143e-05, "loss": 0.0853, "step": 62656 }, { "epoch": 42.56784260515604, "grad_norm": 0.7095387578010559, "learning_rate": 2.0550330657126715e-05, "loss": 0.0873, "step": 62745 }, { "epoch": 42.62822252374491, "grad_norm": 0.8400213122367859, "learning_rate": 2.041878939217211e-05, "loss": 0.0875, "step": 62834 }, { "epoch": 42.68860244233379, "grad_norm": 0.9360200762748718, "learning_rate": 2.028756236795213e-05, "loss": 0.0843, "step": 62923 }, { "epoch": 42.74898236092266, "grad_norm": 0.5572984218597412, "learning_rate": 2.015665097848935e-05, "loss": 0.0813, "step": 63012 }, { "epoch": 42.80936227951153, "grad_norm": 0.9234522581100464, "learning_rate": 2.002605661445342e-05, "loss": 0.083, "step": 63101 }, { "epoch": 42.869742198100404, "grad_norm": 0.5887913107872009, "learning_rate": 1.989578066314623e-05, "loss": 0.0839, "step": 63190 }, { "epoch": 42.93012211668928, "grad_norm": 0.8760083913803101, "learning_rate": 1.9765824508487125e-05, "loss": 0.085, "step": 63279 }, { "epoch": 42.990502035278155, "grad_norm": 0.7094123959541321, "learning_rate": 1.9636189530998307e-05, "loss": 0.0798, "step": 63368 }, { "epoch": 43.05088195386703, "grad_norm": 0.5656801462173462, "learning_rate": 1.95068771077901e-05, "loss": 0.0785, "step": 63457 }, { "epoch": 43.111261872455906, "grad_norm": 0.8483113646507263, "learning_rate": 1.937788861254634e-05, "loss": 0.081, "step": 63546 }, { "epoch": 43.17164179104478, "grad_norm": 0.5962135791778564, "learning_rate": 1.9249225415509807e-05, "loss": 0.0832, "step": 63635 }, { "epoch": 43.204884667571235, "eval_accuracy": 0.177734375, "eval_loss": 4.267856597900391, "eval_runtime": 19.6828, "eval_samples_per_second": 26.013, "eval_steps_per_second": 0.203, "step": 63684 }, { "epoch": 43.23202170963365, "grad_norm": 1.1795192956924438, "learning_rate": 1.9120888883467574e-05, "loss": 0.0881, "step": 63724 }, { "epoch": 43.29240162822252, "grad_norm": 1.2301242351531982, "learning_rate": 1.899288037973662e-05, "loss": 0.0779, "step": 63813 }, { "epoch": 43.3527815468114, "grad_norm": 0.5368560552597046, "learning_rate": 1.8865201264149267e-05, "loss": 0.0793, "step": 63902 }, { "epoch": 43.41316146540027, "grad_norm": 1.1031700372695923, "learning_rate": 1.873785289303875e-05, "loss": 0.0824, "step": 63991 }, { "epoch": 43.473541383989144, "grad_norm": 0.9713082313537598, "learning_rate": 1.861083661922482e-05, "loss": 0.0766, "step": 64080 }, { "epoch": 43.53392130257802, "grad_norm": 0.681328296661377, "learning_rate": 1.8484153791999326e-05, "loss": 0.0799, "step": 64169 }, { "epoch": 43.594301221166894, "grad_norm": 0.8199315071105957, "learning_rate": 1.8357805757111966e-05, "loss": 0.0811, "step": 64258 }, { "epoch": 43.654681139755766, "grad_norm": 1.327650785446167, "learning_rate": 1.823179385675593e-05, "loss": 0.08, "step": 64347 }, { "epoch": 43.71506105834464, "grad_norm": 0.9341023564338684, "learning_rate": 1.810611942955365e-05, "loss": 0.0787, "step": 64436 }, { "epoch": 43.77544097693352, "grad_norm": 0.5767560601234436, "learning_rate": 1.7980783810542577e-05, "loss": 0.0812, "step": 64525 }, { "epoch": 43.83582089552239, "grad_norm": 0.5114635229110718, "learning_rate": 1.785578833116104e-05, "loss": 0.0823, "step": 64614 }, { "epoch": 43.89620081411126, "grad_norm": 0.5436065196990967, "learning_rate": 1.7731134319234016e-05, "loss": 0.0819, "step": 64703 }, { "epoch": 43.95658073270013, "grad_norm": 0.4684976041316986, "learning_rate": 1.760682309895913e-05, "loss": 0.0842, "step": 64792 }, { "epoch": 44.01696065128901, "grad_norm": 1.0648964643478394, "learning_rate": 1.7482855990892517e-05, "loss": 0.0822, "step": 64881 }, { "epoch": 44.07734056987788, "grad_norm": 0.6211819648742676, "learning_rate": 1.735923431193483e-05, "loss": 0.0797, "step": 64970 }, { "epoch": 44.137720488466755, "grad_norm": 0.4334025979042053, "learning_rate": 1.7235959375317185e-05, "loss": 0.0759, "step": 65059 }, { "epoch": 44.198100407055634, "grad_norm": 1.0753127336502075, "learning_rate": 1.711303249058731e-05, "loss": 0.0756, "step": 65148 }, { "epoch": 44.258480325644506, "grad_norm": 0.6846993565559387, "learning_rate": 1.6990454963595577e-05, "loss": 0.0795, "step": 65237 }, { "epoch": 44.31886024423338, "grad_norm": 0.44795066118240356, "learning_rate": 1.6868228096481104e-05, "loss": 0.0815, "step": 65326 }, { "epoch": 44.37924016282225, "grad_norm": 1.4556400775909424, "learning_rate": 1.674635318765801e-05, "loss": 0.0789, "step": 65415 }, { "epoch": 44.43962008141113, "grad_norm": 0.6817762851715088, "learning_rate": 1.66248315318015e-05, "loss": 0.0734, "step": 65504 }, { "epoch": 44.5, "grad_norm": 0.8499571681022644, "learning_rate": 1.6503664419834215e-05, "loss": 0.0798, "step": 65593 }, { "epoch": 44.56037991858887, "grad_norm": 0.5608311891555786, "learning_rate": 1.6382853138912485e-05, "loss": 0.0759, "step": 65682 }, { "epoch": 44.62075983717775, "grad_norm": 1.1510560512542725, "learning_rate": 1.6262398972412644e-05, "loss": 0.0774, "step": 65771 }, { "epoch": 44.68113975576662, "grad_norm": 0.591827392578125, "learning_rate": 1.614230319991743e-05, "loss": 0.0827, "step": 65860 }, { "epoch": 44.741519674355494, "grad_norm": 0.7560341358184814, "learning_rate": 1.60225670972023e-05, "loss": 0.0752, "step": 65949 }, { "epoch": 44.801899592944366, "grad_norm": 1.0043483972549438, "learning_rate": 1.5903191936222016e-05, "loss": 0.0794, "step": 66038 }, { "epoch": 44.862279511533245, "grad_norm": 1.1438446044921875, "learning_rate": 1.5784178985097024e-05, "loss": 0.08, "step": 66127 }, { "epoch": 44.92265943012212, "grad_norm": 0.39144688844680786, "learning_rate": 1.5665529508100052e-05, "loss": 0.0729, "step": 66216 }, { "epoch": 44.98303934871099, "grad_norm": 0.7558673620223999, "learning_rate": 1.5547244765642588e-05, "loss": 0.0759, "step": 66305 }, { "epoch": 45.04341926729987, "grad_norm": 0.6959690451622009, "learning_rate": 1.5429326014261632e-05, "loss": 0.0817, "step": 66394 }, { "epoch": 45.10379918588874, "grad_norm": 0.41576260328292847, "learning_rate": 1.531177450660618e-05, "loss": 0.0738, "step": 66483 }, { "epoch": 45.16417910447761, "grad_norm": 0.8202412724494934, "learning_rate": 1.5194591491424064e-05, "loss": 0.0763, "step": 66572 }, { "epoch": 45.22455902306648, "grad_norm": 1.1920087337493896, "learning_rate": 1.5077778213548622e-05, "loss": 0.0756, "step": 66661 }, { "epoch": 45.28493894165536, "grad_norm": 0.6442920565605164, "learning_rate": 1.496133591388547e-05, "loss": 0.0795, "step": 66750 }, { "epoch": 45.345318860244234, "grad_norm": 0.7332776784896851, "learning_rate": 1.4845265829399296e-05, "loss": 0.0766, "step": 66839 }, { "epoch": 45.405698778833106, "grad_norm": 0.887069821357727, "learning_rate": 1.4729569193100795e-05, "loss": 0.0756, "step": 66928 }, { "epoch": 45.46607869742198, "grad_norm": 0.6151465177536011, "learning_rate": 1.4614247234033518e-05, "loss": 0.0793, "step": 67017 }, { "epoch": 45.526458616010856, "grad_norm": 0.7770605087280273, "learning_rate": 1.449930117726081e-05, "loss": 0.0793, "step": 67106 }, { "epoch": 45.58683853459973, "grad_norm": 0.5736819505691528, "learning_rate": 1.438473224385285e-05, "loss": 0.0728, "step": 67195 }, { "epoch": 45.60515603799186, "eval_accuracy": 0.185546875, "eval_loss": 4.269733428955078, "eval_runtime": 40.0503, "eval_samples_per_second": 12.784, "eval_steps_per_second": 0.1, "step": 67222 }, { "epoch": 45.6472184531886, "grad_norm": 0.4615430533885956, "learning_rate": 1.4270541650873582e-05, "loss": 0.0706, "step": 67284 }, { "epoch": 45.70759837177748, "grad_norm": 0.7554183006286621, "learning_rate": 1.415673061136788e-05, "loss": 0.0788, "step": 67373 }, { "epoch": 45.76797829036635, "grad_norm": 0.6309983134269714, "learning_rate": 1.4043300334348641e-05, "loss": 0.0779, "step": 67462 }, { "epoch": 45.82835820895522, "grad_norm": 0.4782220125198364, "learning_rate": 1.3930252024783903e-05, "loss": 0.0769, "step": 67551 }, { "epoch": 45.888738127544094, "grad_norm": 0.5289342403411865, "learning_rate": 1.3817586883584094e-05, "loss": 0.0768, "step": 67640 }, { "epoch": 45.94911804613297, "grad_norm": 0.5275683403015137, "learning_rate": 1.370530610758921e-05, "loss": 0.0743, "step": 67729 }, { "epoch": 46.009497964721845, "grad_norm": 0.3685113787651062, "learning_rate": 1.359341088955618e-05, "loss": 0.0734, "step": 67818 }, { "epoch": 46.06987788331072, "grad_norm": 0.6584441661834717, "learning_rate": 1.3481902418146154e-05, "loss": 0.0742, "step": 67907 }, { "epoch": 46.130257801899596, "grad_norm": 0.7138823866844177, "learning_rate": 1.3370781877911842e-05, "loss": 0.0695, "step": 67996 }, { "epoch": 46.19063772048847, "grad_norm": 0.39327022433280945, "learning_rate": 1.326005044928501e-05, "loss": 0.0717, "step": 68085 }, { "epoch": 46.25101763907734, "grad_norm": 0.4522133469581604, "learning_rate": 1.3149709308563901e-05, "loss": 0.0749, "step": 68174 }, { "epoch": 46.31139755766621, "grad_norm": 0.6930340528488159, "learning_rate": 1.3039759627900672e-05, "loss": 0.074, "step": 68263 }, { "epoch": 46.37177747625509, "grad_norm": 2.3860812187194824, "learning_rate": 1.293020257528908e-05, "loss": 0.0756, "step": 68352 }, { "epoch": 46.43215739484396, "grad_norm": 0.8091538548469543, "learning_rate": 1.2821039314551958e-05, "loss": 0.0765, "step": 68441 }, { "epoch": 46.492537313432834, "grad_norm": 0.6948747038841248, "learning_rate": 1.2712271005328924e-05, "loss": 0.0746, "step": 68530 }, { "epoch": 46.55291723202171, "grad_norm": 1.2013221979141235, "learning_rate": 1.260389880306399e-05, "loss": 0.0709, "step": 68619 }, { "epoch": 46.613297150610585, "grad_norm": 1.0223325490951538, "learning_rate": 1.2495923858993364e-05, "loss": 0.076, "step": 68708 }, { "epoch": 46.67367706919946, "grad_norm": 0.7184458374977112, "learning_rate": 1.2388347320133182e-05, "loss": 0.0684, "step": 68797 }, { "epoch": 46.73405698778833, "grad_norm": 0.4814877510070801, "learning_rate": 1.2281170329267322e-05, "loss": 0.0724, "step": 68886 }, { "epoch": 46.79443690637721, "grad_norm": 0.5036719441413879, "learning_rate": 1.2174394024935281e-05, "loss": 0.0704, "step": 68975 }, { "epoch": 46.85481682496608, "grad_norm": 0.5806756019592285, "learning_rate": 1.2068019541420033e-05, "loss": 0.0723, "step": 69064 }, { "epoch": 46.91519674355495, "grad_norm": 1.2670601606369019, "learning_rate": 1.1962048008736053e-05, "loss": 0.0706, "step": 69153 }, { "epoch": 46.97557666214383, "grad_norm": 0.5702329277992249, "learning_rate": 1.1856480552617272e-05, "loss": 0.0702, "step": 69242 }, { "epoch": 47.0359565807327, "grad_norm": 0.49773919582366943, "learning_rate": 1.1751318294505104e-05, "loss": 0.0738, "step": 69331 }, { "epoch": 47.09633649932157, "grad_norm": 0.5580993294715881, "learning_rate": 1.1646562351536589e-05, "loss": 0.0714, "step": 69420 }, { "epoch": 47.156716417910445, "grad_norm": 0.47159460186958313, "learning_rate": 1.1542213836532417e-05, "loss": 0.0736, "step": 69509 }, { "epoch": 47.217096336499324, "grad_norm": 0.6028949618339539, "learning_rate": 1.1438273857985244e-05, "loss": 0.0748, "step": 69598 }, { "epoch": 47.277476255088196, "grad_norm": 0.7113878130912781, "learning_rate": 1.1334743520047836e-05, "loss": 0.0753, "step": 69687 }, { "epoch": 47.33785617367707, "grad_norm": 0.3303639888763428, "learning_rate": 1.1231623922521317e-05, "loss": 0.0716, "step": 69776 }, { "epoch": 47.39823609226594, "grad_norm": 1.0966421365737915, "learning_rate": 1.1128916160843578e-05, "loss": 0.0733, "step": 69865 }, { "epoch": 47.45861601085482, "grad_norm": 0.575943648815155, "learning_rate": 1.1026621326077525e-05, "loss": 0.0725, "step": 69954 }, { "epoch": 47.51899592944369, "grad_norm": 0.8768503665924072, "learning_rate": 1.0924740504899584e-05, "loss": 0.0704, "step": 70043 }, { "epoch": 47.57937584803256, "grad_norm": 0.6844857931137085, "learning_rate": 1.0823274779588122e-05, "loss": 0.0746, "step": 70132 }, { "epoch": 47.63975576662144, "grad_norm": 0.5367492437362671, "learning_rate": 1.0722225228011946e-05, "loss": 0.0714, "step": 70221 }, { "epoch": 47.70013568521031, "grad_norm": 0.5591740012168884, "learning_rate": 1.0621592923618856e-05, "loss": 0.0662, "step": 70310 }, { "epoch": 47.760515603799185, "grad_norm": 0.4710708558559418, "learning_rate": 1.0521378935424214e-05, "loss": 0.0743, "step": 70399 }, { "epoch": 47.82089552238806, "grad_norm": 0.7445366382598877, "learning_rate": 1.0421584327999651e-05, "loss": 0.0689, "step": 70488 }, { "epoch": 47.881275440976935, "grad_norm": 0.8262448906898499, "learning_rate": 1.0322210161461715e-05, "loss": 0.0763, "step": 70577 }, { "epoch": 47.94165535956581, "grad_norm": 0.5951725840568542, "learning_rate": 1.0223257491460608e-05, "loss": 0.0706, "step": 70666 }, { "epoch": 48.00203527815468, "grad_norm": 0.6799793243408203, "learning_rate": 1.0124727369169002e-05, "loss": 0.074, "step": 70755 }, { "epoch": 48.00542740841248, "eval_accuracy": 0.18359375, "eval_loss": 4.300548553466797, "eval_runtime": 21.4171, "eval_samples_per_second": 23.906, "eval_steps_per_second": 0.187, "step": 70760 }, { "epoch": 48.06241519674356, "grad_norm": 0.7727463841438293, "learning_rate": 1.0026620841270807e-05, "loss": 0.0711, "step": 70844 }, { "epoch": 48.12279511533243, "grad_norm": 0.641099214553833, "learning_rate": 9.928938949950133e-06, "loss": 0.0716, "step": 70933 } ], "logging_steps": 89, "max_steps": 88440, "num_input_tokens_seen": 0, "num_train_epochs": 60, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2159619256203346e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }