{ "best_global_step": 2800, "best_metric": 0.9999996161228406, "best_model_checkpoint": "/projects/bffw/darora1/llm_ipc/final_models/fj_n3/checkpoint-2800", "epoch": 0.9073417721518987, "eval_steps": 40, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006481012658227848, "grad_norm": 52.902732849121094, "learning_rate": 2.0000000000000002e-07, "loss": 2.8041, "step": 2 }, { "epoch": 0.0012962025316455696, "grad_norm": 52.87858200073242, "learning_rate": 6.000000000000001e-07, "loss": 3.0228, "step": 4 }, { "epoch": 0.0019443037974683545, "grad_norm": 59.53159713745117, "learning_rate": 1.0000000000000002e-06, "loss": 3.0448, "step": 6 }, { "epoch": 0.002592405063291139, "grad_norm": 41.063140869140625, "learning_rate": 1.4000000000000001e-06, "loss": 2.5576, "step": 8 }, { "epoch": 0.0032405063291139242, "grad_norm": 46.71913528442383, "learning_rate": 1.8000000000000001e-06, "loss": 2.5334, "step": 10 }, { "epoch": 0.003888607594936709, "grad_norm": 38.23870086669922, "learning_rate": 2.2e-06, "loss": 2.0494, "step": 12 }, { "epoch": 0.004536708860759494, "grad_norm": 26.98814582824707, "learning_rate": 2.6e-06, "loss": 1.6988, "step": 14 }, { "epoch": 0.005184810126582278, "grad_norm": 24.897260665893555, "learning_rate": 3e-06, "loss": 1.4495, "step": 16 }, { "epoch": 0.005832911392405063, "grad_norm": 24.14356231689453, "learning_rate": 3.4000000000000005e-06, "loss": 1.2273, "step": 18 }, { "epoch": 0.0064810126582278485, "grad_norm": 20.519506454467773, "learning_rate": 3.8000000000000005e-06, "loss": 0.5886, "step": 20 }, { "epoch": 0.007129113924050633, "grad_norm": 13.519320487976074, "learning_rate": 4.2000000000000004e-06, "loss": 0.3763, "step": 22 }, { "epoch": 0.007777215189873418, "grad_norm": 5.407296657562256, "learning_rate": 4.600000000000001e-06, "loss": 0.2153, "step": 24 }, { "epoch": 0.008425316455696203, "grad_norm": 12.552337646484375, "learning_rate": 5e-06, "loss": 0.166, "step": 26 }, { "epoch": 0.009073417721518987, "grad_norm": 10.085341453552246, "learning_rate": 5.400000000000001e-06, "loss": 0.1352, "step": 28 }, { "epoch": 0.009721518987341773, "grad_norm": 9.081425666809082, "learning_rate": 5.8e-06, "loss": 0.1306, "step": 30 }, { "epoch": 0.010369620253164557, "grad_norm": 3.1638600826263428, "learning_rate": 6.200000000000001e-06, "loss": 0.1188, "step": 32 }, { "epoch": 0.011017721518987342, "grad_norm": 2.7224133014678955, "learning_rate": 6.600000000000001e-06, "loss": 0.1128, "step": 34 }, { "epoch": 0.011665822784810126, "grad_norm": 2.1373937129974365, "learning_rate": 7e-06, "loss": 0.0943, "step": 36 }, { "epoch": 0.012313924050632911, "grad_norm": 2.3295581340789795, "learning_rate": 7.4e-06, "loss": 0.0962, "step": 38 }, { "epoch": 0.012962025316455697, "grad_norm": 3.900003671646118, "learning_rate": 7.800000000000002e-06, "loss": 0.0952, "step": 40 }, { "epoch": 0.012962025316455697, "eval_accuracy": 0.9631154472461956, "eval_loss": 0.09466289728879929, "eval_runtime": 99.4408, "eval_samples_per_second": 50.281, "eval_steps_per_second": 12.57, "step": 40 }, { "epoch": 0.01361012658227848, "grad_norm": 1.0516443252563477, "learning_rate": 8.2e-06, "loss": 0.0836, "step": 42 }, { "epoch": 0.014258227848101266, "grad_norm": 3.0402073860168457, "learning_rate": 8.6e-06, "loss": 0.0763, "step": 44 }, { "epoch": 0.01490632911392405, "grad_norm": 3.903961181640625, "learning_rate": 9e-06, "loss": 0.0685, "step": 46 }, { "epoch": 0.015554430379746836, "grad_norm": 4.473770618438721, "learning_rate": 9.4e-06, "loss": 0.0843, "step": 48 }, { "epoch": 0.01620253164556962, "grad_norm": 1.465899109840393, "learning_rate": 9.800000000000001e-06, "loss": 0.0734, "step": 50 }, { "epoch": 0.016850632911392407, "grad_norm": 1.6887052059173584, "learning_rate": 9.999998993000299e-06, "loss": 0.07, "step": 52 }, { "epoch": 0.01749873417721519, "grad_norm": 2.927128553390503, "learning_rate": 9.999990937005126e-06, "loss": 0.0653, "step": 54 }, { "epoch": 0.018146835443037974, "grad_norm": 0.7761722207069397, "learning_rate": 9.999974825027756e-06, "loss": 0.0631, "step": 56 }, { "epoch": 0.01879493670886076, "grad_norm": 1.8903884887695312, "learning_rate": 9.999950657094151e-06, "loss": 0.0647, "step": 58 }, { "epoch": 0.019443037974683545, "grad_norm": 3.654014825820923, "learning_rate": 9.999918433243253e-06, "loss": 0.0654, "step": 60 }, { "epoch": 0.020091139240506328, "grad_norm": 1.6327952146530151, "learning_rate": 9.999878153526974e-06, "loss": 0.0755, "step": 62 }, { "epoch": 0.020739240506329113, "grad_norm": 1.149726390838623, "learning_rate": 9.99982981801022e-06, "loss": 0.0588, "step": 64 }, { "epoch": 0.0213873417721519, "grad_norm": 1.9310941696166992, "learning_rate": 9.999773426770864e-06, "loss": 0.0658, "step": 66 }, { "epoch": 0.022035443037974684, "grad_norm": 0.8754224181175232, "learning_rate": 9.999708979899769e-06, "loss": 0.049, "step": 68 }, { "epoch": 0.02268354430379747, "grad_norm": 0.8598536252975464, "learning_rate": 9.999636477500765e-06, "loss": 0.055, "step": 70 }, { "epoch": 0.023331645569620252, "grad_norm": 1.237684726715088, "learning_rate": 9.999555919690673e-06, "loss": 0.06, "step": 72 }, { "epoch": 0.023979746835443037, "grad_norm": 0.7132460474967957, "learning_rate": 9.999467306599285e-06, "loss": 0.0571, "step": 74 }, { "epoch": 0.024627848101265823, "grad_norm": 1.8192301988601685, "learning_rate": 9.999370638369377e-06, "loss": 0.0578, "step": 76 }, { "epoch": 0.02527594936708861, "grad_norm": 1.6354628801345825, "learning_rate": 9.999265915156697e-06, "loss": 0.0661, "step": 78 }, { "epoch": 0.025924050632911394, "grad_norm": 0.6722614169120789, "learning_rate": 9.999153137129978e-06, "loss": 0.0609, "step": 80 }, { "epoch": 0.025924050632911394, "eval_accuracy": 0.9742360751297139, "eval_loss": 0.059016820043325424, "eval_runtime": 100.1177, "eval_samples_per_second": 49.941, "eval_steps_per_second": 12.485, "step": 80 }, { "epoch": 0.026572151898734176, "grad_norm": 0.7650524377822876, "learning_rate": 9.999032304470926e-06, "loss": 0.0502, "step": 82 }, { "epoch": 0.02722025316455696, "grad_norm": 1.3773459196090698, "learning_rate": 9.998903417374228e-06, "loss": 0.0534, "step": 84 }, { "epoch": 0.027868354430379747, "grad_norm": 0.6049540638923645, "learning_rate": 9.998766476047546e-06, "loss": 0.042, "step": 86 }, { "epoch": 0.028516455696202533, "grad_norm": 0.8149325847625732, "learning_rate": 9.998621480711522e-06, "loss": 0.0475, "step": 88 }, { "epoch": 0.029164556962025315, "grad_norm": 0.7447686791419983, "learning_rate": 9.998468431599768e-06, "loss": 0.045, "step": 90 }, { "epoch": 0.0298126582278481, "grad_norm": 1.151421308517456, "learning_rate": 9.99830732895888e-06, "loss": 0.0508, "step": 92 }, { "epoch": 0.030460759493670886, "grad_norm": 0.9233959913253784, "learning_rate": 9.998138173048424e-06, "loss": 0.0445, "step": 94 }, { "epoch": 0.03110886075949367, "grad_norm": 1.1906189918518066, "learning_rate": 9.997960964140946e-06, "loss": 0.0494, "step": 96 }, { "epoch": 0.03175696202531646, "grad_norm": 1.164791226387024, "learning_rate": 9.997775702521965e-06, "loss": 0.0469, "step": 98 }, { "epoch": 0.03240506329113924, "grad_norm": 0.9656791090965271, "learning_rate": 9.997582388489975e-06, "loss": 0.0473, "step": 100 }, { "epoch": 0.03305316455696203, "grad_norm": 0.9113268256187439, "learning_rate": 9.99738102235644e-06, "loss": 0.0414, "step": 102 }, { "epoch": 0.033701265822784814, "grad_norm": 0.8496657609939575, "learning_rate": 9.997171604445803e-06, "loss": 0.0465, "step": 104 }, { "epoch": 0.03434936708860759, "grad_norm": 0.7116028666496277, "learning_rate": 9.99695413509548e-06, "loss": 0.042, "step": 106 }, { "epoch": 0.03499746835443038, "grad_norm": 0.8646735548973083, "learning_rate": 9.996728614655854e-06, "loss": 0.0437, "step": 108 }, { "epoch": 0.03564556962025316, "grad_norm": 0.4896007180213928, "learning_rate": 9.996495043490285e-06, "loss": 0.0367, "step": 110 }, { "epoch": 0.03629367088607595, "grad_norm": 0.7575715184211731, "learning_rate": 9.996253421975103e-06, "loss": 0.0414, "step": 112 }, { "epoch": 0.036941772151898734, "grad_norm": 0.6563818454742432, "learning_rate": 9.996003750499608e-06, "loss": 0.0406, "step": 114 }, { "epoch": 0.03758987341772152, "grad_norm": 0.7096058130264282, "learning_rate": 9.995746029466071e-06, "loss": 0.042, "step": 116 }, { "epoch": 0.038237974683544305, "grad_norm": 0.8628009557723999, "learning_rate": 9.995480259289731e-06, "loss": 0.043, "step": 118 }, { "epoch": 0.03888607594936709, "grad_norm": 0.9108039140701294, "learning_rate": 9.995206440398798e-06, "loss": 0.0401, "step": 120 }, { "epoch": 0.03888607594936709, "eval_accuracy": 0.9794015821356299, "eval_loss": 0.04449129104614258, "eval_runtime": 97.7024, "eval_samples_per_second": 51.176, "eval_steps_per_second": 12.794, "step": 120 }, { "epoch": 0.03953417721518988, "grad_norm": 1.005293846130371, "learning_rate": 9.994924573234448e-06, "loss": 0.0407, "step": 122 }, { "epoch": 0.040182278481012655, "grad_norm": 1.1099708080291748, "learning_rate": 9.994634658250825e-06, "loss": 0.0433, "step": 124 }, { "epoch": 0.04083037974683544, "grad_norm": 0.9993194341659546, "learning_rate": 9.994336695915041e-06, "loss": 0.0389, "step": 126 }, { "epoch": 0.041478481012658226, "grad_norm": 0.6605297923088074, "learning_rate": 9.994030686707171e-06, "loss": 0.0378, "step": 128 }, { "epoch": 0.04212658227848101, "grad_norm": 0.5856267809867859, "learning_rate": 9.993716631120259e-06, "loss": 0.0432, "step": 130 }, { "epoch": 0.0427746835443038, "grad_norm": 0.6284976601600647, "learning_rate": 9.993394529660307e-06, "loss": 0.044, "step": 132 }, { "epoch": 0.04342278481012658, "grad_norm": 0.7984368801116943, "learning_rate": 9.99306438284629e-06, "loss": 0.0418, "step": 134 }, { "epoch": 0.04407088607594937, "grad_norm": 0.9223721027374268, "learning_rate": 9.992726191210139e-06, "loss": 0.038, "step": 136 }, { "epoch": 0.044718987341772154, "grad_norm": 0.5600959062576294, "learning_rate": 9.992379955296745e-06, "loss": 0.039, "step": 138 }, { "epoch": 0.04536708860759494, "grad_norm": 0.6998872756958008, "learning_rate": 9.992025675663966e-06, "loss": 0.036, "step": 140 }, { "epoch": 0.04601518987341772, "grad_norm": 0.5963979363441467, "learning_rate": 9.991663352882615e-06, "loss": 0.0379, "step": 142 }, { "epoch": 0.046663291139240504, "grad_norm": 0.7889736890792847, "learning_rate": 9.991292987536469e-06, "loss": 0.0379, "step": 144 }, { "epoch": 0.04731139240506329, "grad_norm": 0.5946393013000488, "learning_rate": 9.990914580222258e-06, "loss": 0.0348, "step": 146 }, { "epoch": 0.047959493670886075, "grad_norm": 0.45124557614326477, "learning_rate": 9.990528131549674e-06, "loss": 0.0385, "step": 148 }, { "epoch": 0.04860759493670886, "grad_norm": 0.6986305117607117, "learning_rate": 9.990133642141359e-06, "loss": 0.0371, "step": 150 }, { "epoch": 0.049255696202531646, "grad_norm": 0.9078485369682312, "learning_rate": 9.989731112632917e-06, "loss": 0.0327, "step": 152 }, { "epoch": 0.04990379746835443, "grad_norm": 0.8524628281593323, "learning_rate": 9.989320543672904e-06, "loss": 0.0384, "step": 154 }, { "epoch": 0.05055189873417722, "grad_norm": 0.9344510436058044, "learning_rate": 9.988901935922826e-06, "loss": 0.0328, "step": 156 }, { "epoch": 0.0512, "grad_norm": 0.7619643211364746, "learning_rate": 9.988475290057145e-06, "loss": 0.033, "step": 158 }, { "epoch": 0.05184810126582279, "grad_norm": 0.4197481572628021, "learning_rate": 9.988040606763272e-06, "loss": 0.0336, "step": 160 }, { "epoch": 0.05184810126582279, "eval_accuracy": 0.9819056194501201, "eval_loss": 0.03950502350926399, "eval_runtime": 98.6835, "eval_samples_per_second": 50.667, "eval_steps_per_second": 12.667, "step": 160 }, { "epoch": 0.05249620253164557, "grad_norm": 0.5393734574317932, "learning_rate": 9.98759788674157e-06, "loss": 0.0352, "step": 162 }, { "epoch": 0.05314430379746835, "grad_norm": 0.7958114743232727, "learning_rate": 9.987147130705347e-06, "loss": 0.0352, "step": 164 }, { "epoch": 0.05379240506329114, "grad_norm": 0.48281773924827576, "learning_rate": 9.986688339380863e-06, "loss": 0.0355, "step": 166 }, { "epoch": 0.05444050632911392, "grad_norm": 0.6643773317337036, "learning_rate": 9.98622151350732e-06, "loss": 0.0322, "step": 168 }, { "epoch": 0.05508860759493671, "grad_norm": 0.4064873158931732, "learning_rate": 9.985746653836867e-06, "loss": 0.0296, "step": 170 }, { "epoch": 0.055736708860759494, "grad_norm": 0.4580860137939453, "learning_rate": 9.985263761134602e-06, "loss": 0.037, "step": 172 }, { "epoch": 0.05638481012658228, "grad_norm": 0.9311563372612, "learning_rate": 9.984772836178559e-06, "loss": 0.0308, "step": 174 }, { "epoch": 0.057032911392405065, "grad_norm": 0.7613935470581055, "learning_rate": 9.984273879759713e-06, "loss": 0.03, "step": 176 }, { "epoch": 0.05768101265822785, "grad_norm": 0.5291257500648499, "learning_rate": 9.983766892681985e-06, "loss": 0.0345, "step": 178 }, { "epoch": 0.05832911392405063, "grad_norm": 0.6109746098518372, "learning_rate": 9.983251875762234e-06, "loss": 0.0368, "step": 180 }, { "epoch": 0.058977215189873415, "grad_norm": 0.6333370804786682, "learning_rate": 9.982728829830252e-06, "loss": 0.0321, "step": 182 }, { "epoch": 0.0596253164556962, "grad_norm": 0.6072647571563721, "learning_rate": 9.982197755728771e-06, "loss": 0.0268, "step": 184 }, { "epoch": 0.060273417721518986, "grad_norm": 0.5882136821746826, "learning_rate": 9.981658654313458e-06, "loss": 0.0354, "step": 186 }, { "epoch": 0.06092151898734177, "grad_norm": 0.742362380027771, "learning_rate": 9.981111526452912e-06, "loss": 0.0333, "step": 188 }, { "epoch": 0.06156962025316456, "grad_norm": 0.6676820516586304, "learning_rate": 9.980556373028665e-06, "loss": 0.0319, "step": 190 }, { "epoch": 0.06221772151898734, "grad_norm": 0.6671105623245239, "learning_rate": 9.979993194935182e-06, "loss": 0.0307, "step": 192 }, { "epoch": 0.06286582278481012, "grad_norm": 0.7644891142845154, "learning_rate": 9.979421993079853e-06, "loss": 0.0346, "step": 194 }, { "epoch": 0.06351392405063291, "grad_norm": 0.4839600920677185, "learning_rate": 9.978842768382999e-06, "loss": 0.0312, "step": 196 }, { "epoch": 0.06416202531645569, "grad_norm": 0.859524130821228, "learning_rate": 9.978255521777865e-06, "loss": 0.0317, "step": 198 }, { "epoch": 0.06481012658227848, "grad_norm": 0.6183629631996155, "learning_rate": 9.977660254210623e-06, "loss": 0.0279, "step": 200 }, { "epoch": 0.06481012658227848, "eval_accuracy": 0.9843501386117544, "eval_loss": 0.03353777900338173, "eval_runtime": 99.3735, "eval_samples_per_second": 50.315, "eval_steps_per_second": 12.579, "step": 200 }, { "epoch": 0.06545822784810126, "grad_norm": 0.5192128419876099, "learning_rate": 9.977056966640368e-06, "loss": 0.0317, "step": 202 }, { "epoch": 0.06610632911392406, "grad_norm": 0.5304135680198669, "learning_rate": 9.976445660039118e-06, "loss": 0.026, "step": 204 }, { "epoch": 0.06675443037974683, "grad_norm": 0.7126085162162781, "learning_rate": 9.975826335391808e-06, "loss": 0.028, "step": 206 }, { "epoch": 0.06740253164556963, "grad_norm": 0.4410451352596283, "learning_rate": 9.975198993696294e-06, "loss": 0.0287, "step": 208 }, { "epoch": 0.0680506329113924, "grad_norm": 0.4624532461166382, "learning_rate": 9.974563635963348e-06, "loss": 0.0273, "step": 210 }, { "epoch": 0.06869873417721518, "grad_norm": 0.5675721764564514, "learning_rate": 9.973920263216658e-06, "loss": 0.0257, "step": 212 }, { "epoch": 0.06934683544303798, "grad_norm": 0.5974188446998596, "learning_rate": 9.973268876492827e-06, "loss": 0.0301, "step": 214 }, { "epoch": 0.06999493670886076, "grad_norm": 0.55629962682724, "learning_rate": 9.972609476841368e-06, "loss": 0.0256, "step": 216 }, { "epoch": 0.07064303797468355, "grad_norm": 0.7521873116493225, "learning_rate": 9.971942065324704e-06, "loss": 0.0309, "step": 218 }, { "epoch": 0.07129113924050633, "grad_norm": 0.9954202175140381, "learning_rate": 9.971266643018171e-06, "loss": 0.0291, "step": 220 }, { "epoch": 0.07193924050632912, "grad_norm": 0.7211986780166626, "learning_rate": 9.970583211010008e-06, "loss": 0.0331, "step": 222 }, { "epoch": 0.0725873417721519, "grad_norm": 0.8910839557647705, "learning_rate": 9.969891770401358e-06, "loss": 0.0289, "step": 224 }, { "epoch": 0.07323544303797469, "grad_norm": 0.522318422794342, "learning_rate": 9.969192322306271e-06, "loss": 0.0278, "step": 226 }, { "epoch": 0.07388354430379747, "grad_norm": 0.6214476823806763, "learning_rate": 9.968484867851698e-06, "loss": 0.0248, "step": 228 }, { "epoch": 0.07453164556962025, "grad_norm": 0.5874454379081726, "learning_rate": 9.96776940817749e-06, "loss": 0.0245, "step": 230 }, { "epoch": 0.07517974683544304, "grad_norm": 1.1179145574569702, "learning_rate": 9.967045944436392e-06, "loss": 0.027, "step": 232 }, { "epoch": 0.07582784810126582, "grad_norm": 0.6947236657142639, "learning_rate": 9.966314477794052e-06, "loss": 0.0281, "step": 234 }, { "epoch": 0.07647594936708861, "grad_norm": 0.5388737320899963, "learning_rate": 9.965575009429006e-06, "loss": 0.0237, "step": 236 }, { "epoch": 0.07712405063291139, "grad_norm": 0.5712288022041321, "learning_rate": 9.964827540532685e-06, "loss": 0.0221, "step": 238 }, { "epoch": 0.07777215189873418, "grad_norm": 0.7290524244308472, "learning_rate": 9.964072072309412e-06, "loss": 0.024, "step": 240 }, { "epoch": 0.07777215189873418, "eval_accuracy": 0.9883965760621634, "eval_loss": 0.02650127001106739, "eval_runtime": 102.4631, "eval_samples_per_second": 48.798, "eval_steps_per_second": 12.2, "step": 240 }, { "epoch": 0.07842025316455696, "grad_norm": 0.6895317435264587, "learning_rate": 9.963308605976397e-06, "loss": 0.0249, "step": 242 }, { "epoch": 0.07906835443037975, "grad_norm": 0.5299035310745239, "learning_rate": 9.962537142763733e-06, "loss": 0.0257, "step": 244 }, { "epoch": 0.07971645569620253, "grad_norm": 0.41885891556739807, "learning_rate": 9.961757683914406e-06, "loss": 0.0235, "step": 246 }, { "epoch": 0.08036455696202531, "grad_norm": 0.7982097864151001, "learning_rate": 9.960970230684276e-06, "loss": 0.0293, "step": 248 }, { "epoch": 0.0810126582278481, "grad_norm": 0.7573944330215454, "learning_rate": 9.96017478434209e-06, "loss": 0.0255, "step": 250 }, { "epoch": 0.08166075949367088, "grad_norm": 0.575404703617096, "learning_rate": 9.959371346169466e-06, "loss": 0.023, "step": 252 }, { "epoch": 0.08230886075949367, "grad_norm": 0.4789620339870453, "learning_rate": 9.958559917460909e-06, "loss": 0.0238, "step": 254 }, { "epoch": 0.08295696202531645, "grad_norm": 0.6227384209632874, "learning_rate": 9.957740499523787e-06, "loss": 0.0194, "step": 256 }, { "epoch": 0.08360506329113924, "grad_norm": 0.6454212665557861, "learning_rate": 9.95691309367835e-06, "loss": 0.0188, "step": 258 }, { "epoch": 0.08425316455696202, "grad_norm": 1.03157377243042, "learning_rate": 9.95607770125771e-06, "loss": 0.0194, "step": 260 }, { "epoch": 0.08490126582278482, "grad_norm": 0.659908652305603, "learning_rate": 9.955234323607854e-06, "loss": 0.0182, "step": 262 }, { "epoch": 0.0855493670886076, "grad_norm": 0.5437667369842529, "learning_rate": 9.954382962087628e-06, "loss": 0.0181, "step": 264 }, { "epoch": 0.08619746835443037, "grad_norm": 0.6034504771232605, "learning_rate": 9.95352361806875e-06, "loss": 0.0179, "step": 266 }, { "epoch": 0.08684556962025317, "grad_norm": 0.6281765103340149, "learning_rate": 9.95265629293579e-06, "loss": 0.019, "step": 268 }, { "epoch": 0.08749367088607594, "grad_norm": 0.6361079216003418, "learning_rate": 9.951780988086183e-06, "loss": 0.0154, "step": 270 }, { "epoch": 0.08814177215189874, "grad_norm": 0.5678423643112183, "learning_rate": 9.950897704930223e-06, "loss": 0.0161, "step": 272 }, { "epoch": 0.08878987341772152, "grad_norm": 1.9274733066558838, "learning_rate": 9.95000644489105e-06, "loss": 0.0159, "step": 274 }, { "epoch": 0.08943797468354431, "grad_norm": 0.7999576926231384, "learning_rate": 9.949107209404664e-06, "loss": 0.0179, "step": 276 }, { "epoch": 0.09008607594936709, "grad_norm": 0.8246878981590271, "learning_rate": 9.948199999919914e-06, "loss": 0.0158, "step": 278 }, { "epoch": 0.09073417721518988, "grad_norm": 1.3101437091827393, "learning_rate": 9.947284817898493e-06, "loss": 0.0164, "step": 280 }, { "epoch": 0.09073417721518988, "eval_accuracy": 0.9941715658085519, "eval_loss": 0.014373338781297207, "eval_runtime": 102.9242, "eval_samples_per_second": 48.579, "eval_steps_per_second": 12.145, "step": 280 }, { "epoch": 0.09138227848101266, "grad_norm": 1.0926411151885986, "learning_rate": 9.946361664814942e-06, "loss": 0.0139, "step": 282 }, { "epoch": 0.09203037974683544, "grad_norm": 1.1184602975845337, "learning_rate": 9.945430542156647e-06, "loss": 0.0148, "step": 284 }, { "epoch": 0.09267848101265823, "grad_norm": 0.7032978534698486, "learning_rate": 9.944491451423829e-06, "loss": 0.0124, "step": 286 }, { "epoch": 0.09332658227848101, "grad_norm": 0.7714938521385193, "learning_rate": 9.943544394129552e-06, "loss": 0.0109, "step": 288 }, { "epoch": 0.0939746835443038, "grad_norm": 0.6936782598495483, "learning_rate": 9.942589371799715e-06, "loss": 0.009, "step": 290 }, { "epoch": 0.09462278481012658, "grad_norm": 0.743622899055481, "learning_rate": 9.941626385973047e-06, "loss": 0.0093, "step": 292 }, { "epoch": 0.09527088607594937, "grad_norm": 0.815427303314209, "learning_rate": 9.940655438201113e-06, "loss": 0.0089, "step": 294 }, { "epoch": 0.09591898734177215, "grad_norm": 1.016943335533142, "learning_rate": 9.9396765300483e-06, "loss": 0.0105, "step": 296 }, { "epoch": 0.09656708860759494, "grad_norm": 0.7963289618492126, "learning_rate": 9.938689663091828e-06, "loss": 0.0082, "step": 298 }, { "epoch": 0.09721518987341772, "grad_norm": 0.80525141954422, "learning_rate": 9.937694838921734e-06, "loss": 0.0116, "step": 300 }, { "epoch": 0.09786329113924051, "grad_norm": 1.1018229722976685, "learning_rate": 9.93669205914088e-06, "loss": 0.0083, "step": 302 }, { "epoch": 0.09851139240506329, "grad_norm": 0.6011309623718262, "learning_rate": 9.93568132536494e-06, "loss": 0.0055, "step": 304 }, { "epoch": 0.09915949367088607, "grad_norm": 0.6187568306922913, "learning_rate": 9.934662639222412e-06, "loss": 0.0079, "step": 306 }, { "epoch": 0.09980759493670886, "grad_norm": 0.7109318971633911, "learning_rate": 9.9336360023546e-06, "loss": 0.0056, "step": 308 }, { "epoch": 0.10045569620253164, "grad_norm": 0.9586265087127686, "learning_rate": 9.932601416415622e-06, "loss": 0.0055, "step": 310 }, { "epoch": 0.10110379746835443, "grad_norm": 0.6494437456130981, "learning_rate": 9.931558883072403e-06, "loss": 0.0048, "step": 312 }, { "epoch": 0.10175189873417721, "grad_norm": 0.7673395872116089, "learning_rate": 9.930508404004668e-06, "loss": 0.0059, "step": 314 }, { "epoch": 0.1024, "grad_norm": 0.5997980237007141, "learning_rate": 9.929449980904952e-06, "loss": 0.0045, "step": 316 }, { "epoch": 0.10304810126582278, "grad_norm": 0.6281548142433167, "learning_rate": 9.928383615478586e-06, "loss": 0.0046, "step": 318 }, { "epoch": 0.10369620253164558, "grad_norm": 0.7596728801727295, "learning_rate": 9.927309309443696e-06, "loss": 0.0052, "step": 320 }, { "epoch": 0.10369620253164558, "eval_accuracy": 0.9985858696992592, "eval_loss": 0.005098371766507626, "eval_runtime": 102.4403, "eval_samples_per_second": 48.809, "eval_steps_per_second": 12.202, "step": 320 }, { "epoch": 0.10434430379746835, "grad_norm": 0.5863160490989685, "learning_rate": 9.9262270645312e-06, "loss": 0.0072, "step": 322 }, { "epoch": 0.10499240506329113, "grad_norm": 0.4260634183883667, "learning_rate": 9.925136882484816e-06, "loss": 0.0037, "step": 324 }, { "epoch": 0.10564050632911393, "grad_norm": 0.6246248483657837, "learning_rate": 9.924038765061042e-06, "loss": 0.0048, "step": 326 }, { "epoch": 0.1062886075949367, "grad_norm": 0.4939432144165039, "learning_rate": 9.922932714029163e-06, "loss": 0.0037, "step": 328 }, { "epoch": 0.1069367088607595, "grad_norm": 1.4279497861862183, "learning_rate": 9.921818731171249e-06, "loss": 0.0032, "step": 330 }, { "epoch": 0.10758481012658228, "grad_norm": 0.4592280983924866, "learning_rate": 9.920696818282147e-06, "loss": 0.0013, "step": 332 }, { "epoch": 0.10823291139240507, "grad_norm": 0.6403955817222595, "learning_rate": 9.919566977169486e-06, "loss": 0.003, "step": 334 }, { "epoch": 0.10888101265822785, "grad_norm": 0.6681948304176331, "learning_rate": 9.918429209653662e-06, "loss": 0.0026, "step": 336 }, { "epoch": 0.10952911392405064, "grad_norm": 0.40499842166900635, "learning_rate": 9.917283517567845e-06, "loss": 0.0079, "step": 338 }, { "epoch": 0.11017721518987342, "grad_norm": 0.4384974539279938, "learning_rate": 9.916129902757977e-06, "loss": 0.0022, "step": 340 }, { "epoch": 0.1108253164556962, "grad_norm": 0.917611837387085, "learning_rate": 9.914968367082756e-06, "loss": 0.0019, "step": 342 }, { "epoch": 0.11147341772151899, "grad_norm": 0.5483088493347168, "learning_rate": 9.913798912413653e-06, "loss": 0.0041, "step": 344 }, { "epoch": 0.11212151898734177, "grad_norm": 0.6087149381637573, "learning_rate": 9.912621540634889e-06, "loss": 0.003, "step": 346 }, { "epoch": 0.11276962025316456, "grad_norm": 0.9792701005935669, "learning_rate": 9.911436253643445e-06, "loss": 0.0051, "step": 348 }, { "epoch": 0.11341772151898734, "grad_norm": 0.46137937903404236, "learning_rate": 9.910243053349055e-06, "loss": 0.0058, "step": 350 }, { "epoch": 0.11406582278481013, "grad_norm": 0.322523832321167, "learning_rate": 9.909041941674205e-06, "loss": 0.0036, "step": 352 }, { "epoch": 0.11471392405063291, "grad_norm": 0.6017354726791382, "learning_rate": 9.90783292055412e-06, "loss": 0.0024, "step": 354 }, { "epoch": 0.1153620253164557, "grad_norm": 0.3198406994342804, "learning_rate": 9.906615991936781e-06, "loss": 0.0022, "step": 356 }, { "epoch": 0.11601012658227848, "grad_norm": 1.0170084238052368, "learning_rate": 9.905391157782897e-06, "loss": 0.0021, "step": 358 }, { "epoch": 0.11665822784810126, "grad_norm": 0.4881608188152313, "learning_rate": 9.904158420065923e-06, "loss": 0.0014, "step": 360 }, { "epoch": 0.11665822784810126, "eval_accuracy": 0.9992789522224573, "eval_loss": 0.002443755976855755, "eval_runtime": 102.565, "eval_samples_per_second": 48.75, "eval_steps_per_second": 12.187, "step": 360 }, { "epoch": 0.11730632911392405, "grad_norm": 0.4268784523010254, "learning_rate": 9.902917780772043e-06, "loss": 0.0016, "step": 362 }, { "epoch": 0.11795443037974683, "grad_norm": 0.3397619128227234, "learning_rate": 9.901669241900178e-06, "loss": 0.0027, "step": 364 }, { "epoch": 0.11860253164556962, "grad_norm": 0.36573904752731323, "learning_rate": 9.900412805461968e-06, "loss": 0.0016, "step": 366 }, { "epoch": 0.1192506329113924, "grad_norm": 0.3953874111175537, "learning_rate": 9.899148473481786e-06, "loss": 0.0055, "step": 368 }, { "epoch": 0.1198987341772152, "grad_norm": 0.7349480390548706, "learning_rate": 9.89787624799672e-06, "loss": 0.0021, "step": 370 }, { "epoch": 0.12054683544303797, "grad_norm": 0.24667468667030334, "learning_rate": 9.896596131056583e-06, "loss": 0.0008, "step": 372 }, { "epoch": 0.12119493670886076, "grad_norm": 0.18324322998523712, "learning_rate": 9.895308124723897e-06, "loss": 0.001, "step": 374 }, { "epoch": 0.12184303797468354, "grad_norm": 0.3875872492790222, "learning_rate": 9.894012231073895e-06, "loss": 0.0023, "step": 376 }, { "epoch": 0.12249113924050634, "grad_norm": 0.5702228546142578, "learning_rate": 9.892708452194522e-06, "loss": 0.0012, "step": 378 }, { "epoch": 0.12313924050632911, "grad_norm": 0.4540491998195648, "learning_rate": 9.891396790186424e-06, "loss": 0.0013, "step": 380 }, { "epoch": 0.1237873417721519, "grad_norm": 0.17990654706954956, "learning_rate": 9.890077247162951e-06, "loss": 0.0034, "step": 382 }, { "epoch": 0.12443544303797469, "grad_norm": 0.1970151662826538, "learning_rate": 9.888749825250151e-06, "loss": 0.0008, "step": 384 }, { "epoch": 0.12508354430379748, "grad_norm": 0.37782326340675354, "learning_rate": 9.887414526586764e-06, "loss": 0.0013, "step": 386 }, { "epoch": 0.12573164556962024, "grad_norm": 0.7092784643173218, "learning_rate": 9.886071353324223e-06, "loss": 0.0027, "step": 388 }, { "epoch": 0.12637974683544304, "grad_norm": 0.907179594039917, "learning_rate": 9.884720307626647e-06, "loss": 0.0027, "step": 390 }, { "epoch": 0.12702784810126583, "grad_norm": 0.22963130474090576, "learning_rate": 9.883361391670841e-06, "loss": 0.0011, "step": 392 }, { "epoch": 0.12767594936708862, "grad_norm": 0.592887282371521, "learning_rate": 9.881994607646288e-06, "loss": 0.0041, "step": 394 }, { "epoch": 0.12832405063291139, "grad_norm": 0.639837920665741, "learning_rate": 9.880619957755151e-06, "loss": 0.0012, "step": 396 }, { "epoch": 0.12897215189873418, "grad_norm": 0.5888617038726807, "learning_rate": 9.879237444212265e-06, "loss": 0.0024, "step": 398 }, { "epoch": 0.12962025316455697, "grad_norm": 0.6880702972412109, "learning_rate": 9.877847069245134e-06, "loss": 0.0025, "step": 400 }, { "epoch": 0.12962025316455697, "eval_accuracy": 0.999652682586963, "eval_loss": 0.001261554891243577, "eval_runtime": 101.9742, "eval_samples_per_second": 49.032, "eval_steps_per_second": 12.258, "step": 400 }, { "epoch": 0.13026835443037973, "grad_norm": 0.5125930905342102, "learning_rate": 9.87644883509393e-06, "loss": 0.0008, "step": 402 }, { "epoch": 0.13091645569620253, "grad_norm": 0.6037792563438416, "learning_rate": 9.875042744011487e-06, "loss": 0.0012, "step": 404 }, { "epoch": 0.13156455696202532, "grad_norm": 0.14983819425106049, "learning_rate": 9.873628798263297e-06, "loss": 0.0013, "step": 406 }, { "epoch": 0.1322126582278481, "grad_norm": 0.14885587990283966, "learning_rate": 9.87220700012751e-06, "loss": 0.0003, "step": 408 }, { "epoch": 0.13286075949367088, "grad_norm": 0.28928548097610474, "learning_rate": 9.870777351894926e-06, "loss": 0.0006, "step": 410 }, { "epoch": 0.13350886075949367, "grad_norm": 0.23244060575962067, "learning_rate": 9.869339855868992e-06, "loss": 0.0019, "step": 412 }, { "epoch": 0.13415696202531646, "grad_norm": 0.4798615574836731, "learning_rate": 9.867894514365802e-06, "loss": 0.0029, "step": 414 }, { "epoch": 0.13480506329113925, "grad_norm": 0.2007167637348175, "learning_rate": 9.86644132971409e-06, "loss": 0.0003, "step": 416 }, { "epoch": 0.13545316455696202, "grad_norm": 0.42807483673095703, "learning_rate": 9.864980304255222e-06, "loss": 0.0009, "step": 418 }, { "epoch": 0.1361012658227848, "grad_norm": 0.09191492944955826, "learning_rate": 9.863511440343206e-06, "loss": 0.0003, "step": 420 }, { "epoch": 0.1367493670886076, "grad_norm": 0.4787420928478241, "learning_rate": 9.862034740344673e-06, "loss": 0.0007, "step": 422 }, { "epoch": 0.13739746835443037, "grad_norm": 0.34788092970848083, "learning_rate": 9.860550206638881e-06, "loss": 0.0006, "step": 424 }, { "epoch": 0.13804556962025316, "grad_norm": 0.2141241580247879, "learning_rate": 9.859057841617709e-06, "loss": 0.0008, "step": 426 }, { "epoch": 0.13869367088607595, "grad_norm": 0.18016201257705688, "learning_rate": 9.857557647685657e-06, "loss": 0.0003, "step": 428 }, { "epoch": 0.13934177215189875, "grad_norm": 0.13558854162693024, "learning_rate": 9.856049627259833e-06, "loss": 0.0022, "step": 430 }, { "epoch": 0.1399898734177215, "grad_norm": 0.17645730078220367, "learning_rate": 9.85453378276996e-06, "loss": 0.0005, "step": 432 }, { "epoch": 0.1406379746835443, "grad_norm": 0.3373895585536957, "learning_rate": 9.853010116658368e-06, "loss": 0.002, "step": 434 }, { "epoch": 0.1412860759493671, "grad_norm": 0.8115471005439758, "learning_rate": 9.851478631379982e-06, "loss": 0.0025, "step": 436 }, { "epoch": 0.14193417721518986, "grad_norm": 0.5957593321800232, "learning_rate": 9.849939329402337e-06, "loss": 0.0009, "step": 438 }, { "epoch": 0.14258227848101265, "grad_norm": 0.23780769109725952, "learning_rate": 9.848392213205549e-06, "loss": 0.0002, "step": 440 }, { "epoch": 0.14258227848101265, "eval_accuracy": 0.999539490974426, "eval_loss": 0.0016721197171136737, "eval_runtime": 103.3024, "eval_samples_per_second": 48.402, "eval_steps_per_second": 12.1, "step": 440 }, { "epoch": 0.14323037974683545, "grad_norm": 0.8508957624435425, "learning_rate": 9.846837285282331e-06, "loss": 0.0026, "step": 442 }, { "epoch": 0.14387848101265824, "grad_norm": 0.272156298160553, "learning_rate": 9.845274548137986e-06, "loss": 0.0008, "step": 444 }, { "epoch": 0.144526582278481, "grad_norm": 0.3533732295036316, "learning_rate": 9.843704004290393e-06, "loss": 0.0006, "step": 446 }, { "epoch": 0.1451746835443038, "grad_norm": 0.4029495418071747, "learning_rate": 9.842125656270011e-06, "loss": 0.001, "step": 448 }, { "epoch": 0.1458227848101266, "grad_norm": 0.8233634233474731, "learning_rate": 9.840539506619874e-06, "loss": 0.0008, "step": 450 }, { "epoch": 0.14647088607594938, "grad_norm": 0.5356901288032532, "learning_rate": 9.838945557895586e-06, "loss": 0.0007, "step": 452 }, { "epoch": 0.14711898734177214, "grad_norm": 0.2129882127046585, "learning_rate": 9.837343812665311e-06, "loss": 0.0021, "step": 454 }, { "epoch": 0.14776708860759494, "grad_norm": 0.5427262783050537, "learning_rate": 9.835734273509787e-06, "loss": 0.0011, "step": 456 }, { "epoch": 0.14841518987341773, "grad_norm": 0.19061443209648132, "learning_rate": 9.834116943022299e-06, "loss": 0.0006, "step": 458 }, { "epoch": 0.1490632911392405, "grad_norm": 0.1520262062549591, "learning_rate": 9.832491823808688e-06, "loss": 0.0004, "step": 460 }, { "epoch": 0.1497113924050633, "grad_norm": 0.35970884561538696, "learning_rate": 9.830858918487347e-06, "loss": 0.0043, "step": 462 }, { "epoch": 0.15035949367088608, "grad_norm": 0.3962363004684448, "learning_rate": 9.829218229689211e-06, "loss": 0.0013, "step": 464 }, { "epoch": 0.15100759493670887, "grad_norm": 0.37777262926101685, "learning_rate": 9.827569760057755e-06, "loss": 0.0031, "step": 466 }, { "epoch": 0.15165569620253164, "grad_norm": 0.445781946182251, "learning_rate": 9.825913512248996e-06, "loss": 0.0014, "step": 468 }, { "epoch": 0.15230379746835443, "grad_norm": 0.4247877895832062, "learning_rate": 9.824249488931477e-06, "loss": 0.0011, "step": 470 }, { "epoch": 0.15295189873417722, "grad_norm": 0.20235390961170197, "learning_rate": 9.822577692786272e-06, "loss": 0.0005, "step": 472 }, { "epoch": 0.1536, "grad_norm": 0.5411356091499329, "learning_rate": 9.820898126506978e-06, "loss": 0.0026, "step": 474 }, { "epoch": 0.15424810126582278, "grad_norm": 0.07654337584972382, "learning_rate": 9.819210792799711e-06, "loss": 0.0003, "step": 476 }, { "epoch": 0.15489620253164557, "grad_norm": 0.08077862858772278, "learning_rate": 9.817515694383102e-06, "loss": 0.0003, "step": 478 }, { "epoch": 0.15554430379746836, "grad_norm": 0.17977231740951538, "learning_rate": 9.815812833988292e-06, "loss": 0.0038, "step": 480 }, { "epoch": 0.15554430379746836, "eval_accuracy": 0.9997459873278298, "eval_loss": 0.0012969060335308313, "eval_runtime": 99.7243, "eval_samples_per_second": 50.138, "eval_steps_per_second": 12.535, "step": 480 }, { "epoch": 0.15619240506329113, "grad_norm": 0.22738438844680786, "learning_rate": 9.814102214358928e-06, "loss": 0.0002, "step": 482 }, { "epoch": 0.15684050632911392, "grad_norm": 0.20708250999450684, "learning_rate": 9.81238383825116e-06, "loss": 0.0043, "step": 484 }, { "epoch": 0.1574886075949367, "grad_norm": 0.29505881667137146, "learning_rate": 9.810657708433637e-06, "loss": 0.0004, "step": 486 }, { "epoch": 0.1581367088607595, "grad_norm": 0.7022032737731934, "learning_rate": 9.808923827687494e-06, "loss": 0.0017, "step": 488 }, { "epoch": 0.15878481012658227, "grad_norm": 0.34339165687561035, "learning_rate": 9.807182198806362e-06, "loss": 0.0034, "step": 490 }, { "epoch": 0.15943291139240506, "grad_norm": 0.19041544198989868, "learning_rate": 9.805432824596347e-06, "loss": 0.0014, "step": 492 }, { "epoch": 0.16008101265822786, "grad_norm": 0.24253234267234802, "learning_rate": 9.803675707876048e-06, "loss": 0.0038, "step": 494 }, { "epoch": 0.16072911392405062, "grad_norm": 0.41031545400619507, "learning_rate": 9.801910851476524e-06, "loss": 0.0006, "step": 496 }, { "epoch": 0.1613772151898734, "grad_norm": 0.0826662927865982, "learning_rate": 9.800138258241311e-06, "loss": 0.002, "step": 498 }, { "epoch": 0.1620253164556962, "grad_norm": 0.16029556095600128, "learning_rate": 9.798357931026411e-06, "loss": 0.0016, "step": 500 }, { "epoch": 0.162673417721519, "grad_norm": 0.40231359004974365, "learning_rate": 9.796569872700287e-06, "loss": 0.0012, "step": 502 }, { "epoch": 0.16332151898734176, "grad_norm": 0.13478736579418182, "learning_rate": 9.79477408614386e-06, "loss": 0.0005, "step": 504 }, { "epoch": 0.16396962025316456, "grad_norm": 0.39503753185272217, "learning_rate": 9.792970574250493e-06, "loss": 0.0035, "step": 506 }, { "epoch": 0.16461772151898735, "grad_norm": 0.7558596730232239, "learning_rate": 9.791159339926009e-06, "loss": 0.0064, "step": 508 }, { "epoch": 0.16526582278481014, "grad_norm": 0.6682188510894775, "learning_rate": 9.789340386088663e-06, "loss": 0.0035, "step": 510 }, { "epoch": 0.1659139240506329, "grad_norm": 0.4081304669380188, "learning_rate": 9.787513715669158e-06, "loss": 0.0015, "step": 512 }, { "epoch": 0.1665620253164557, "grad_norm": 0.22257903218269348, "learning_rate": 9.78567933161062e-06, "loss": 0.0008, "step": 514 }, { "epoch": 0.1672101265822785, "grad_norm": 0.201273575425148, "learning_rate": 9.78383723686861e-06, "loss": 0.0007, "step": 516 }, { "epoch": 0.16785822784810125, "grad_norm": 0.08263271301984787, "learning_rate": 9.781987434411106e-06, "loss": 0.0005, "step": 518 }, { "epoch": 0.16850632911392405, "grad_norm": 0.052383746951818466, "learning_rate": 9.780129927218513e-06, "loss": 0.0017, "step": 520 }, { "epoch": 0.16850632911392405, "eval_accuracy": 0.9997890522710435, "eval_loss": 0.0008646832429803908, "eval_runtime": 102.7258, "eval_samples_per_second": 48.673, "eval_steps_per_second": 12.168, "step": 520 }, { "epoch": 0.16915443037974684, "grad_norm": 0.16042329370975494, "learning_rate": 9.778264718283644e-06, "loss": 0.0031, "step": 522 }, { "epoch": 0.16980253164556963, "grad_norm": 0.13061323761940002, "learning_rate": 9.776391810611719e-06, "loss": 0.0002, "step": 524 }, { "epoch": 0.1704506329113924, "grad_norm": 0.15899308025836945, "learning_rate": 9.774511207220369e-06, "loss": 0.0008, "step": 526 }, { "epoch": 0.1710987341772152, "grad_norm": 0.12816283106803894, "learning_rate": 9.772622911139622e-06, "loss": 0.0005, "step": 528 }, { "epoch": 0.17174683544303798, "grad_norm": 0.16552942991256714, "learning_rate": 9.770726925411898e-06, "loss": 0.0002, "step": 530 }, { "epoch": 0.17239493670886075, "grad_norm": 0.07594162225723267, "learning_rate": 9.768823253092008e-06, "loss": 0.0003, "step": 532 }, { "epoch": 0.17304303797468354, "grad_norm": 0.03090669773519039, "learning_rate": 9.766911897247147e-06, "loss": 0.0002, "step": 534 }, { "epoch": 0.17369113924050633, "grad_norm": 0.19276919960975647, "learning_rate": 9.76499286095689e-06, "loss": 0.0005, "step": 536 }, { "epoch": 0.17433924050632912, "grad_norm": 0.17801803350448608, "learning_rate": 9.763066147313189e-06, "loss": 0.0036, "step": 538 }, { "epoch": 0.1749873417721519, "grad_norm": 0.0805857926607132, "learning_rate": 9.76113175942036e-06, "loss": 0.002, "step": 540 }, { "epoch": 0.17563544303797468, "grad_norm": 0.8715735077857971, "learning_rate": 9.759189700395096e-06, "loss": 0.0013, "step": 542 }, { "epoch": 0.17628354430379747, "grad_norm": 0.1977226585149765, "learning_rate": 9.75723997336643e-06, "loss": 0.0003, "step": 544 }, { "epoch": 0.17693164556962027, "grad_norm": 0.5493549704551697, "learning_rate": 9.755282581475769e-06, "loss": 0.0003, "step": 546 }, { "epoch": 0.17757974683544303, "grad_norm": 1.054047703742981, "learning_rate": 9.753317527876857e-06, "loss": 0.0009, "step": 548 }, { "epoch": 0.17822784810126582, "grad_norm": 0.6819576621055603, "learning_rate": 9.751344815735791e-06, "loss": 0.0013, "step": 550 }, { "epoch": 0.17887594936708862, "grad_norm": 0.12686581909656525, "learning_rate": 9.749364448231001e-06, "loss": 0.0005, "step": 552 }, { "epoch": 0.17952405063291138, "grad_norm": 1.1412973403930664, "learning_rate": 9.747376428553255e-06, "loss": 0.0025, "step": 554 }, { "epoch": 0.18017215189873417, "grad_norm": 0.3469236493110657, "learning_rate": 9.745380759905648e-06, "loss": 0.0021, "step": 556 }, { "epoch": 0.18082025316455697, "grad_norm": 0.31161972880363464, "learning_rate": 9.743377445503598e-06, "loss": 0.0019, "step": 558 }, { "epoch": 0.18146835443037976, "grad_norm": 0.11834688484668732, "learning_rate": 9.74136648857485e-06, "loss": 0.0018, "step": 560 }, { "epoch": 0.18146835443037976, "eval_accuracy": 0.9997172934457434, "eval_loss": 0.0010605291463434696, "eval_runtime": 102.2732, "eval_samples_per_second": 48.889, "eval_steps_per_second": 12.222, "step": 560 }, { "epoch": 0.18211645569620252, "grad_norm": 0.30010661482810974, "learning_rate": 9.739347892359453e-06, "loss": 0.0009, "step": 562 }, { "epoch": 0.18276455696202532, "grad_norm": 0.10496673732995987, "learning_rate": 9.737321660109767e-06, "loss": 0.0006, "step": 564 }, { "epoch": 0.1834126582278481, "grad_norm": 0.36886048316955566, "learning_rate": 9.735287795090455e-06, "loss": 0.0015, "step": 566 }, { "epoch": 0.18406075949367087, "grad_norm": 0.4008212387561798, "learning_rate": 9.733246300578482e-06, "loss": 0.0008, "step": 568 }, { "epoch": 0.18470886075949366, "grad_norm": 0.6184799671173096, "learning_rate": 9.731197179863104e-06, "loss": 0.0009, "step": 570 }, { "epoch": 0.18535696202531646, "grad_norm": 0.12914961576461792, "learning_rate": 9.729140436245857e-06, "loss": 0.0003, "step": 572 }, { "epoch": 0.18600506329113925, "grad_norm": 0.2764883041381836, "learning_rate": 9.72707607304057e-06, "loss": 0.0026, "step": 574 }, { "epoch": 0.18665316455696201, "grad_norm": 0.3653884530067444, "learning_rate": 9.725004093573343e-06, "loss": 0.0003, "step": 576 }, { "epoch": 0.1873012658227848, "grad_norm": 1.2126648426055908, "learning_rate": 9.722924501182546e-06, "loss": 0.0005, "step": 578 }, { "epoch": 0.1879493670886076, "grad_norm": 0.2613915205001831, "learning_rate": 9.72083729921882e-06, "loss": 0.002, "step": 580 }, { "epoch": 0.1885974683544304, "grad_norm": 0.0996515303850174, "learning_rate": 9.718742491045061e-06, "loss": 0.0006, "step": 582 }, { "epoch": 0.18924556962025316, "grad_norm": 0.5431665778160095, "learning_rate": 9.716640080036423e-06, "loss": 0.0023, "step": 584 }, { "epoch": 0.18989367088607595, "grad_norm": 0.7025941014289856, "learning_rate": 9.71453006958031e-06, "loss": 0.0022, "step": 586 }, { "epoch": 0.19054177215189874, "grad_norm": 0.8454611897468567, "learning_rate": 9.712412463076368e-06, "loss": 0.0035, "step": 588 }, { "epoch": 0.1911898734177215, "grad_norm": 0.19344277679920197, "learning_rate": 9.710287263936485e-06, "loss": 0.0019, "step": 590 }, { "epoch": 0.1918379746835443, "grad_norm": 0.2613272964954376, "learning_rate": 9.708154475584779e-06, "loss": 0.0009, "step": 592 }, { "epoch": 0.1924860759493671, "grad_norm": 0.6521615982055664, "learning_rate": 9.7060141014576e-06, "loss": 0.0028, "step": 594 }, { "epoch": 0.19313417721518988, "grad_norm": 0.20852723717689514, "learning_rate": 9.703866145003512e-06, "loss": 0.002, "step": 596 }, { "epoch": 0.19378227848101265, "grad_norm": 0.28225457668304443, "learning_rate": 9.701710609683305e-06, "loss": 0.0008, "step": 598 }, { "epoch": 0.19443037974683544, "grad_norm": 0.3033112585544586, "learning_rate": 9.699547498969978e-06, "loss": 0.0009, "step": 600 }, { "epoch": 0.19443037974683544, "eval_accuracy": 0.9997885038585653, "eval_loss": 0.0007942758384160697, "eval_runtime": 105.5854, "eval_samples_per_second": 47.355, "eval_steps_per_second": 11.839, "step": 600 }, { "epoch": 0.19507848101265823, "grad_norm": 0.13795693218708038, "learning_rate": 9.697376816348732e-06, "loss": 0.0002, "step": 602 }, { "epoch": 0.19572658227848103, "grad_norm": 0.07170484960079193, "learning_rate": 9.695198565316966e-06, "loss": 0.0002, "step": 604 }, { "epoch": 0.1963746835443038, "grad_norm": 0.24718345701694489, "learning_rate": 9.69301274938428e-06, "loss": 0.002, "step": 606 }, { "epoch": 0.19702278481012658, "grad_norm": 0.33944374322891235, "learning_rate": 9.690819372072457e-06, "loss": 0.0016, "step": 608 }, { "epoch": 0.19767088607594938, "grad_norm": 0.11184486001729965, "learning_rate": 9.68861843691547e-06, "loss": 0.0002, "step": 610 }, { "epoch": 0.19831898734177214, "grad_norm": 0.0845549926161766, "learning_rate": 9.68640994745946e-06, "loss": 0.0003, "step": 612 }, { "epoch": 0.19896708860759493, "grad_norm": 0.28036901354789734, "learning_rate": 9.684193907262742e-06, "loss": 0.0019, "step": 614 }, { "epoch": 0.19961518987341773, "grad_norm": 0.019151655957102776, "learning_rate": 9.681970319895804e-06, "loss": 0.0004, "step": 616 }, { "epoch": 0.20026329113924052, "grad_norm": 0.0641448125243187, "learning_rate": 9.679739188941283e-06, "loss": 0.0001, "step": 618 }, { "epoch": 0.20091139240506328, "grad_norm": 0.012318900786340237, "learning_rate": 9.677500517993983e-06, "loss": 0.0018, "step": 620 }, { "epoch": 0.20155949367088608, "grad_norm": 0.012480619363486767, "learning_rate": 9.675254310660842e-06, "loss": 0.0, "step": 622 }, { "epoch": 0.20220759493670887, "grad_norm": 0.12060311436653137, "learning_rate": 9.673000570560952e-06, "loss": 0.0002, "step": 624 }, { "epoch": 0.20285569620253163, "grad_norm": 0.13326050341129303, "learning_rate": 9.670739301325534e-06, "loss": 0.0014, "step": 626 }, { "epoch": 0.20350379746835442, "grad_norm": 0.10389236360788345, "learning_rate": 9.668470506597946e-06, "loss": 0.0013, "step": 628 }, { "epoch": 0.20415189873417722, "grad_norm": 0.06838095188140869, "learning_rate": 9.66619419003367e-06, "loss": 0.0012, "step": 630 }, { "epoch": 0.2048, "grad_norm": 0.022482849657535553, "learning_rate": 9.663910355300306e-06, "loss": 0.0001, "step": 632 }, { "epoch": 0.20544810126582277, "grad_norm": 0.17222675681114197, "learning_rate": 9.661619006077562e-06, "loss": 0.0001, "step": 634 }, { "epoch": 0.20609620253164557, "grad_norm": 0.04022524133324623, "learning_rate": 9.659320146057263e-06, "loss": 0.0001, "step": 636 }, { "epoch": 0.20674430379746836, "grad_norm": 0.0643358826637268, "learning_rate": 9.657013778943328e-06, "loss": 0.0, "step": 638 }, { "epoch": 0.20739240506329115, "grad_norm": 0.16372860968112946, "learning_rate": 9.654699908451777e-06, "loss": 0.0012, "step": 640 }, { "epoch": 0.20739240506329115, "eval_accuracy": 0.999866675350015, "eval_loss": 0.0004598509694915265, "eval_runtime": 105.4679, "eval_samples_per_second": 47.408, "eval_steps_per_second": 11.852, "step": 640 }, { "epoch": 0.20804050632911392, "grad_norm": 0.009580990299582481, "learning_rate": 9.652378538310715e-06, "loss": 0.0, "step": 642 }, { "epoch": 0.2086886075949367, "grad_norm": 0.014977443031966686, "learning_rate": 9.650049672260333e-06, "loss": 0.0004, "step": 644 }, { "epoch": 0.2093367088607595, "grad_norm": 0.045772042125463486, "learning_rate": 9.647713314052896e-06, "loss": 0.0013, "step": 646 }, { "epoch": 0.20998481012658227, "grad_norm": 0.009746350347995758, "learning_rate": 9.645369467452746e-06, "loss": 0.0, "step": 648 }, { "epoch": 0.21063291139240506, "grad_norm": 0.02030940167605877, "learning_rate": 9.643018136236286e-06, "loss": 0.0, "step": 650 }, { "epoch": 0.21128101265822785, "grad_norm": 0.25576162338256836, "learning_rate": 9.64065932419198e-06, "loss": 0.0004, "step": 652 }, { "epoch": 0.21192911392405064, "grad_norm": 0.06584621220827103, "learning_rate": 9.638293035120342e-06, "loss": 0.0001, "step": 654 }, { "epoch": 0.2125772151898734, "grad_norm": 0.10090325772762299, "learning_rate": 9.635919272833938e-06, "loss": 0.0004, "step": 656 }, { "epoch": 0.2132253164556962, "grad_norm": 0.2609342336654663, "learning_rate": 9.63353804115737e-06, "loss": 0.0017, "step": 658 }, { "epoch": 0.213873417721519, "grad_norm": 0.8355330228805542, "learning_rate": 9.63114934392728e-06, "loss": 0.0017, "step": 660 }, { "epoch": 0.21452151898734176, "grad_norm": 0.05891817435622215, "learning_rate": 9.628753184992334e-06, "loss": 0.0002, "step": 662 }, { "epoch": 0.21516962025316455, "grad_norm": 0.5131203532218933, "learning_rate": 9.62634956821322e-06, "loss": 0.002, "step": 664 }, { "epoch": 0.21581772151898734, "grad_norm": 0.17226867377758026, "learning_rate": 9.623938497462647e-06, "loss": 0.0045, "step": 666 }, { "epoch": 0.21646582278481014, "grad_norm": 0.007893595844507217, "learning_rate": 9.621519976625327e-06, "loss": 0.0, "step": 668 }, { "epoch": 0.2171139240506329, "grad_norm": 0.07217761129140854, "learning_rate": 9.619094009597982e-06, "loss": 0.0005, "step": 670 }, { "epoch": 0.2177620253164557, "grad_norm": 0.01869891956448555, "learning_rate": 9.616660600289329e-06, "loss": 0.0017, "step": 672 }, { "epoch": 0.21841012658227849, "grad_norm": 0.21836678683757782, "learning_rate": 9.614219752620074e-06, "loss": 0.0012, "step": 674 }, { "epoch": 0.21905822784810128, "grad_norm": 0.06981157511472702, "learning_rate": 9.611771470522908e-06, "loss": 0.0013, "step": 676 }, { "epoch": 0.21970632911392404, "grad_norm": 0.34331014752388, "learning_rate": 9.609315757942504e-06, "loss": 0.001, "step": 678 }, { "epoch": 0.22035443037974684, "grad_norm": 0.10346855223178864, "learning_rate": 9.606852618835503e-06, "loss": 0.0005, "step": 680 }, { "epoch": 0.22035443037974684, "eval_accuracy": 0.999794656342699, "eval_loss": 0.0007786342175677419, "eval_runtime": 104.3292, "eval_samples_per_second": 47.925, "eval_steps_per_second": 11.981, "step": 680 }, { "epoch": 0.22100253164556963, "grad_norm": 0.20089657604694366, "learning_rate": 9.604382057170514e-06, "loss": 0.0004, "step": 682 }, { "epoch": 0.2216506329113924, "grad_norm": 0.1667291522026062, "learning_rate": 9.601904076928103e-06, "loss": 0.0005, "step": 684 }, { "epoch": 0.22229873417721518, "grad_norm": 0.3179922103881836, "learning_rate": 9.599418682100793e-06, "loss": 0.0017, "step": 686 }, { "epoch": 0.22294683544303798, "grad_norm": 1.0040173530578613, "learning_rate": 9.596925876693047e-06, "loss": 0.0033, "step": 688 }, { "epoch": 0.22359493670886077, "grad_norm": 0.1465279906988144, "learning_rate": 9.594425664721275e-06, "loss": 0.0003, "step": 690 }, { "epoch": 0.22424303797468353, "grad_norm": 0.4364776015281677, "learning_rate": 9.591918050213814e-06, "loss": 0.002, "step": 692 }, { "epoch": 0.22489113924050633, "grad_norm": 0.06411771476268768, "learning_rate": 9.589403037210933e-06, "loss": 0.0007, "step": 694 }, { "epoch": 0.22553924050632912, "grad_norm": 0.33405566215515137, "learning_rate": 9.586880629764817e-06, "loss": 0.0017, "step": 696 }, { "epoch": 0.2261873417721519, "grad_norm": 0.14692090451717377, "learning_rate": 9.584350831939571e-06, "loss": 0.0016, "step": 698 }, { "epoch": 0.22683544303797468, "grad_norm": 0.21872135996818542, "learning_rate": 9.581813647811199e-06, "loss": 0.0006, "step": 700 }, { "epoch": 0.22748354430379747, "grad_norm": 0.3012380599975586, "learning_rate": 9.579269081467614e-06, "loss": 0.0004, "step": 702 }, { "epoch": 0.22813164556962026, "grad_norm": 0.22102029621601105, "learning_rate": 9.576717137008617e-06, "loss": 0.0003, "step": 704 }, { "epoch": 0.22877974683544303, "grad_norm": 0.05813106894493103, "learning_rate": 9.574157818545902e-06, "loss": 0.0005, "step": 706 }, { "epoch": 0.22942784810126582, "grad_norm": 0.0369562990963459, "learning_rate": 9.57159113020304e-06, "loss": 0.0011, "step": 708 }, { "epoch": 0.2300759493670886, "grad_norm": 0.21441668272018433, "learning_rate": 9.569017076115476e-06, "loss": 0.0006, "step": 710 }, { "epoch": 0.2307240506329114, "grad_norm": 0.5280594229698181, "learning_rate": 9.566435660430528e-06, "loss": 0.0013, "step": 712 }, { "epoch": 0.23137215189873417, "grad_norm": 0.3137824833393097, "learning_rate": 9.563846887307369e-06, "loss": 0.0014, "step": 714 }, { "epoch": 0.23202025316455696, "grad_norm": 0.11566320061683655, "learning_rate": 9.561250760917026e-06, "loss": 0.0002, "step": 716 }, { "epoch": 0.23266835443037975, "grad_norm": 0.07832954823970795, "learning_rate": 9.558647285442382e-06, "loss": 0.0003, "step": 718 }, { "epoch": 0.23331645569620252, "grad_norm": 0.15707625448703766, "learning_rate": 9.55603646507815e-06, "loss": 0.0003, "step": 720 }, { "epoch": 0.23331645569620252, "eval_accuracy": 0.9998935399990166, "eval_loss": 0.0003976961597800255, "eval_runtime": 104.7543, "eval_samples_per_second": 47.731, "eval_steps_per_second": 11.933, "step": 720 }, { "epoch": 0.2339645569620253, "grad_norm": 0.1931619644165039, "learning_rate": 9.553418304030886e-06, "loss": 0.0008, "step": 722 }, { "epoch": 0.2346126582278481, "grad_norm": 0.15913955867290497, "learning_rate": 9.550792806518967e-06, "loss": 0.0001, "step": 724 }, { "epoch": 0.2352607594936709, "grad_norm": 0.08975693583488464, "learning_rate": 9.548159976772593e-06, "loss": 0.0003, "step": 726 }, { "epoch": 0.23590886075949366, "grad_norm": 0.14222075045108795, "learning_rate": 9.545519819033777e-06, "loss": 0.0011, "step": 728 }, { "epoch": 0.23655696202531645, "grad_norm": 0.034183427691459656, "learning_rate": 9.542872337556341e-06, "loss": 0.0, "step": 730 }, { "epoch": 0.23720506329113925, "grad_norm": 0.006229052785784006, "learning_rate": 9.540217536605906e-06, "loss": 0.0009, "step": 732 }, { "epoch": 0.23785316455696204, "grad_norm": 0.08647812902927399, "learning_rate": 9.537555420459883e-06, "loss": 0.0, "step": 734 }, { "epoch": 0.2385012658227848, "grad_norm": 0.022472726181149483, "learning_rate": 9.534885993407474e-06, "loss": 0.0002, "step": 736 }, { "epoch": 0.2391493670886076, "grad_norm": 0.012393929995596409, "learning_rate": 9.532209259749658e-06, "loss": 0.0005, "step": 738 }, { "epoch": 0.2397974683544304, "grad_norm": 0.08754698187112808, "learning_rate": 9.529525223799185e-06, "loss": 0.0009, "step": 740 }, { "epoch": 0.24044556962025315, "grad_norm": 0.056353695690631866, "learning_rate": 9.526833889880573e-06, "loss": 0.0001, "step": 742 }, { "epoch": 0.24109367088607594, "grad_norm": 0.39609044790267944, "learning_rate": 9.524135262330098e-06, "loss": 0.0014, "step": 744 }, { "epoch": 0.24174177215189874, "grad_norm": 0.3467492163181305, "learning_rate": 9.521429345495787e-06, "loss": 0.0002, "step": 746 }, { "epoch": 0.24238987341772153, "grad_norm": 0.4665900766849518, "learning_rate": 9.51871614373741e-06, "loss": 0.0012, "step": 748 }, { "epoch": 0.2430379746835443, "grad_norm": 0.24675174057483673, "learning_rate": 9.515995661426478e-06, "loss": 0.0015, "step": 750 }, { "epoch": 0.2436860759493671, "grad_norm": 0.03283111751079559, "learning_rate": 9.513267902946228e-06, "loss": 0.0, "step": 752 }, { "epoch": 0.24433417721518988, "grad_norm": 0.10860046744346619, "learning_rate": 9.510532872691624e-06, "loss": 0.0014, "step": 754 }, { "epoch": 0.24498227848101267, "grad_norm": 0.1112566888332367, "learning_rate": 9.507790575069347e-06, "loss": 0.0001, "step": 756 }, { "epoch": 0.24563037974683544, "grad_norm": 0.11744935810565948, "learning_rate": 9.50504101449778e-06, "loss": 0.0012, "step": 758 }, { "epoch": 0.24627848101265823, "grad_norm": 0.13507533073425293, "learning_rate": 9.50228419540702e-06, "loss": 0.0009, "step": 760 }, { "epoch": 0.24627848101265823, "eval_accuracy": 0.9998955433397497, "eval_loss": 0.0003588534309528768, "eval_runtime": 105.2425, "eval_samples_per_second": 47.509, "eval_steps_per_second": 11.877, "step": 760 }, { "epoch": 0.24692658227848102, "grad_norm": 0.17681878805160522, "learning_rate": 9.499520122238846e-06, "loss": 0.0012, "step": 762 }, { "epoch": 0.2475746835443038, "grad_norm": 0.061774663627147675, "learning_rate": 9.496748799446733e-06, "loss": 0.0001, "step": 764 }, { "epoch": 0.24822278481012658, "grad_norm": 0.010383802466094494, "learning_rate": 9.493970231495836e-06, "loss": 0.0008, "step": 766 }, { "epoch": 0.24887088607594937, "grad_norm": 0.09358537942171097, "learning_rate": 9.49118442286298e-06, "loss": 0.0002, "step": 768 }, { "epoch": 0.24951898734177216, "grad_norm": 0.007097979541867971, "learning_rate": 9.488391378036662e-06, "loss": 0.0, "step": 770 }, { "epoch": 0.25016708860759496, "grad_norm": 0.24831758439540863, "learning_rate": 9.485591101517027e-06, "loss": 0.0019, "step": 772 }, { "epoch": 0.2508151898734177, "grad_norm": 0.043643102049827576, "learning_rate": 9.482783597815883e-06, "loss": 0.0008, "step": 774 }, { "epoch": 0.2514632911392405, "grad_norm": 0.05945923179388046, "learning_rate": 9.47996887145668e-06, "loss": 0.0009, "step": 776 }, { "epoch": 0.2521113924050633, "grad_norm": 0.011810753494501114, "learning_rate": 9.477146926974501e-06, "loss": 0.0001, "step": 778 }, { "epoch": 0.25275949367088607, "grad_norm": 0.0021227251272648573, "learning_rate": 9.47431776891606e-06, "loss": 0.0, "step": 780 }, { "epoch": 0.25340759493670884, "grad_norm": 0.002886489499360323, "learning_rate": 9.471481401839696e-06, "loss": 0.0001, "step": 782 }, { "epoch": 0.25405569620253166, "grad_norm": 0.1468556523323059, "learning_rate": 9.468637830315364e-06, "loss": 0.0006, "step": 784 }, { "epoch": 0.2547037974683544, "grad_norm": 0.015433026477694511, "learning_rate": 9.46578705892462e-06, "loss": 0.0, "step": 786 }, { "epoch": 0.25535189873417724, "grad_norm": 0.10068199783563614, "learning_rate": 9.46292909226063e-06, "loss": 0.0013, "step": 788 }, { "epoch": 0.256, "grad_norm": 0.12383728474378586, "learning_rate": 9.460063934928142e-06, "loss": 0.0006, "step": 790 }, { "epoch": 0.25664810126582277, "grad_norm": 0.17582979798316956, "learning_rate": 9.4571915915435e-06, "loss": 0.0, "step": 792 }, { "epoch": 0.2572962025316456, "grad_norm": 0.005818579345941544, "learning_rate": 9.454312066734624e-06, "loss": 0.0, "step": 794 }, { "epoch": 0.25794430379746836, "grad_norm": 0.01889118365943432, "learning_rate": 9.451425365140997e-06, "loss": 0.0002, "step": 796 }, { "epoch": 0.2585924050632911, "grad_norm": 0.00686600711196661, "learning_rate": 9.448531491413673e-06, "loss": 0.0, "step": 798 }, { "epoch": 0.25924050632911394, "grad_norm": 0.0015769926831126213, "learning_rate": 9.445630450215259e-06, "loss": 0.0, "step": 800 }, { "epoch": 0.25924050632911394, "eval_accuracy": 0.9999590503640193, "eval_loss": 0.00015591137344017625, "eval_runtime": 105.7845, "eval_samples_per_second": 47.266, "eval_steps_per_second": 11.816, "step": 800 }, { "epoch": 0.2598886075949367, "grad_norm": 0.06982430070638657, "learning_rate": 9.442722246219915e-06, "loss": 0.0, "step": 802 }, { "epoch": 0.26053670886075947, "grad_norm": 0.0993858277797699, "learning_rate": 9.439806884113331e-06, "loss": 0.0005, "step": 804 }, { "epoch": 0.2611848101265823, "grad_norm": 0.0011815716279670596, "learning_rate": 9.43688436859274e-06, "loss": 0.0007, "step": 806 }, { "epoch": 0.26183291139240505, "grad_norm": 0.0009712717146612704, "learning_rate": 9.433954704366897e-06, "loss": 0.0002, "step": 808 }, { "epoch": 0.2624810126582279, "grad_norm": 0.0008742675418034196, "learning_rate": 9.431017896156074e-06, "loss": 0.0001, "step": 810 }, { "epoch": 0.26312911392405064, "grad_norm": 0.1526733636856079, "learning_rate": 9.428073948692056e-06, "loss": 0.0001, "step": 812 }, { "epoch": 0.2637772151898734, "grad_norm": 0.008984135463833809, "learning_rate": 9.425122866718128e-06, "loss": 0.0, "step": 814 }, { "epoch": 0.2644253164556962, "grad_norm": 0.0007589040324091911, "learning_rate": 9.422164654989073e-06, "loss": 0.0003, "step": 816 }, { "epoch": 0.265073417721519, "grad_norm": 0.11218152195215225, "learning_rate": 9.419199318271158e-06, "loss": 0.0004, "step": 818 }, { "epoch": 0.26572151898734175, "grad_norm": 0.16130465269088745, "learning_rate": 9.416226861342132e-06, "loss": 0.0004, "step": 820 }, { "epoch": 0.2663696202531646, "grad_norm": 0.028433728963136673, "learning_rate": 9.413247288991216e-06, "loss": 0.0, "step": 822 }, { "epoch": 0.26701772151898734, "grad_norm": 0.3576768934726715, "learning_rate": 9.410260606019095e-06, "loss": 0.0009, "step": 824 }, { "epoch": 0.2676658227848101, "grad_norm": 0.03127586469054222, "learning_rate": 9.40726681723791e-06, "loss": 0.0, "step": 826 }, { "epoch": 0.2683139240506329, "grad_norm": 0.012010865844786167, "learning_rate": 9.404265927471255e-06, "loss": 0.0004, "step": 828 }, { "epoch": 0.2689620253164557, "grad_norm": 0.01447245478630066, "learning_rate": 9.401257941554157e-06, "loss": 0.0, "step": 830 }, { "epoch": 0.2696101265822785, "grad_norm": 0.07084700465202332, "learning_rate": 9.398242864333084e-06, "loss": 0.0008, "step": 832 }, { "epoch": 0.2702582278481013, "grad_norm": 0.23775817453861237, "learning_rate": 9.395220700665924e-06, "loss": 0.0002, "step": 834 }, { "epoch": 0.27090632911392404, "grad_norm": 0.27267056703567505, "learning_rate": 9.392191455421989e-06, "loss": 0.001, "step": 836 }, { "epoch": 0.27155443037974686, "grad_norm": 0.0014412677846848965, "learning_rate": 9.389155133481993e-06, "loss": 0.0, "step": 838 }, { "epoch": 0.2722025316455696, "grad_norm": 0.22994522750377655, "learning_rate": 9.386111739738057e-06, "loss": 0.0002, "step": 840 }, { "epoch": 0.2722025316455696, "eval_accuracy": 0.9999324854944309, "eval_loss": 0.00023814353335183114, "eval_runtime": 105.6702, "eval_samples_per_second": 47.317, "eval_steps_per_second": 11.829, "step": 840 }, { "epoch": 0.2728506329113924, "grad_norm": 0.16264504194259644, "learning_rate": 9.383061279093697e-06, "loss": 0.0008, "step": 842 }, { "epoch": 0.2734987341772152, "grad_norm": 0.21666684746742249, "learning_rate": 9.380003756463812e-06, "loss": 0.0003, "step": 844 }, { "epoch": 0.274146835443038, "grad_norm": 0.020299425348639488, "learning_rate": 9.376939176774678e-06, "loss": 0.0001, "step": 846 }, { "epoch": 0.27479493670886074, "grad_norm": 0.04464450851082802, "learning_rate": 9.373867544963949e-06, "loss": 0.0002, "step": 848 }, { "epoch": 0.27544303797468356, "grad_norm": 0.13788706064224243, "learning_rate": 9.370788865980633e-06, "loss": 0.0011, "step": 850 }, { "epoch": 0.2760911392405063, "grad_norm": 0.3510144352912903, "learning_rate": 9.367703144785097e-06, "loss": 0.001, "step": 852 }, { "epoch": 0.2767392405063291, "grad_norm": 0.048665471374988556, "learning_rate": 9.364610386349048e-06, "loss": 0.0001, "step": 854 }, { "epoch": 0.2773873417721519, "grad_norm": 0.02506359852850437, "learning_rate": 9.361510595655545e-06, "loss": 0.0006, "step": 856 }, { "epoch": 0.27803544303797467, "grad_norm": 0.1422048807144165, "learning_rate": 9.358403777698962e-06, "loss": 0.001, "step": 858 }, { "epoch": 0.2786835443037975, "grad_norm": 0.007875354960560799, "learning_rate": 9.355289937485005e-06, "loss": 0.0, "step": 860 }, { "epoch": 0.27933164556962026, "grad_norm": 0.009137582033872604, "learning_rate": 9.35216908003069e-06, "loss": 0.0, "step": 862 }, { "epoch": 0.279979746835443, "grad_norm": 0.020093752071261406, "learning_rate": 9.349041210364343e-06, "loss": 0.0003, "step": 864 }, { "epoch": 0.28062784810126584, "grad_norm": 0.0040946886874735355, "learning_rate": 9.345906333525582e-06, "loss": 0.0001, "step": 866 }, { "epoch": 0.2812759493670886, "grad_norm": 0.005941552110016346, "learning_rate": 9.342764454565321e-06, "loss": 0.0006, "step": 868 }, { "epoch": 0.28192405063291137, "grad_norm": 0.0024547127541154623, "learning_rate": 9.339615578545753e-06, "loss": 0.0, "step": 870 }, { "epoch": 0.2825721518987342, "grad_norm": 0.002859026426449418, "learning_rate": 9.336459710540344e-06, "loss": 0.0014, "step": 872 }, { "epoch": 0.28322025316455696, "grad_norm": 0.0011856557102873921, "learning_rate": 9.333296855633828e-06, "loss": 0.0, "step": 874 }, { "epoch": 0.2838683544303797, "grad_norm": 0.41726920008659363, "learning_rate": 9.330127018922195e-06, "loss": 0.0011, "step": 876 }, { "epoch": 0.28451645569620254, "grad_norm": 0.45102396607398987, "learning_rate": 9.326950205512682e-06, "loss": 0.0013, "step": 878 }, { "epoch": 0.2851645569620253, "grad_norm": 0.16208650171756744, "learning_rate": 9.323766420523768e-06, "loss": 0.0007, "step": 880 }, { "epoch": 0.2851645569620253, "eval_accuracy": 0.999945549853298, "eval_loss": 0.00017561792628839612, "eval_runtime": 105.2387, "eval_samples_per_second": 47.511, "eval_steps_per_second": 11.878, "step": 880 }, { "epoch": 0.2858126582278481, "grad_norm": 0.4453209638595581, "learning_rate": 9.32057566908517e-06, "loss": 0.0005, "step": 882 }, { "epoch": 0.2864607594936709, "grad_norm": 0.0023153494112193584, "learning_rate": 9.31737795633782e-06, "loss": 0.0, "step": 884 }, { "epoch": 0.28710886075949366, "grad_norm": 0.050638675689697266, "learning_rate": 9.314173287433874e-06, "loss": 0.0001, "step": 886 }, { "epoch": 0.2877569620253165, "grad_norm": 0.08961086720228195, "learning_rate": 9.310961667536689e-06, "loss": 0.0002, "step": 888 }, { "epoch": 0.28840506329113924, "grad_norm": 0.10929686576128006, "learning_rate": 9.307743101820828e-06, "loss": 0.0002, "step": 890 }, { "epoch": 0.289053164556962, "grad_norm": 0.010287183336913586, "learning_rate": 9.30451759547204e-06, "loss": 0.0, "step": 892 }, { "epoch": 0.2897012658227848, "grad_norm": 0.04592491313815117, "learning_rate": 9.301285153687261e-06, "loss": 0.0002, "step": 894 }, { "epoch": 0.2903493670886076, "grad_norm": 0.13677294552326202, "learning_rate": 9.298045781674595e-06, "loss": 0.0018, "step": 896 }, { "epoch": 0.29099746835443036, "grad_norm": 0.05407572165131569, "learning_rate": 9.294799484653323e-06, "loss": 0.0002, "step": 898 }, { "epoch": 0.2916455696202532, "grad_norm": 0.20654959976673126, "learning_rate": 9.291546267853871e-06, "loss": 0.0016, "step": 900 }, { "epoch": 0.29229367088607594, "grad_norm": 0.13771550357341766, "learning_rate": 9.28828613651782e-06, "loss": 0.0015, "step": 902 }, { "epoch": 0.29294177215189876, "grad_norm": 0.24270237982273102, "learning_rate": 9.285019095897894e-06, "loss": 0.0003, "step": 904 }, { "epoch": 0.2935898734177215, "grad_norm": 0.06425748020410538, "learning_rate": 9.281745151257946e-06, "loss": 0.0003, "step": 906 }, { "epoch": 0.2942379746835443, "grad_norm": 0.3214038014411926, "learning_rate": 9.278464307872952e-06, "loss": 0.0014, "step": 908 }, { "epoch": 0.2948860759493671, "grad_norm": 0.3001636266708374, "learning_rate": 9.275176571029008e-06, "loss": 0.0007, "step": 910 }, { "epoch": 0.2955341772151899, "grad_norm": 0.5608838796615601, "learning_rate": 9.271881946023309e-06, "loss": 0.0007, "step": 912 }, { "epoch": 0.29618227848101264, "grad_norm": 1.1040050983428955, "learning_rate": 9.268580438164157e-06, "loss": 0.0017, "step": 914 }, { "epoch": 0.29683037974683546, "grad_norm": 0.013682582415640354, "learning_rate": 9.265272052770936e-06, "loss": 0.0004, "step": 916 }, { "epoch": 0.2974784810126582, "grad_norm": 0.05224961042404175, "learning_rate": 9.261956795174116e-06, "loss": 0.0003, "step": 918 }, { "epoch": 0.298126582278481, "grad_norm": 0.2133917510509491, "learning_rate": 9.25863467071524e-06, "loss": 0.0004, "step": 920 }, { "epoch": 0.298126582278481, "eval_accuracy": 0.9998670163935581, "eval_loss": 0.00046491899411194026, "eval_runtime": 106.1612, "eval_samples_per_second": 47.098, "eval_steps_per_second": 11.775, "step": 920 }, { "epoch": 0.2987746835443038, "grad_norm": 0.28349632024765015, "learning_rate": 9.255305684746908e-06, "loss": 0.0003, "step": 922 }, { "epoch": 0.2994227848101266, "grad_norm": 0.21036334335803986, "learning_rate": 9.251969842632785e-06, "loss": 0.001, "step": 924 }, { "epoch": 0.3000708860759494, "grad_norm": 0.07668338716030121, "learning_rate": 9.248627149747573e-06, "loss": 0.0007, "step": 926 }, { "epoch": 0.30071898734177216, "grad_norm": 0.03533896058797836, "learning_rate": 9.24527761147702e-06, "loss": 0.0004, "step": 928 }, { "epoch": 0.3013670886075949, "grad_norm": 0.44851046800613403, "learning_rate": 9.241921233217899e-06, "loss": 0.0035, "step": 930 }, { "epoch": 0.30201518987341774, "grad_norm": 1.7721484899520874, "learning_rate": 9.238558020378003e-06, "loss": 0.0047, "step": 932 }, { "epoch": 0.3026632911392405, "grad_norm": 0.3624114692211151, "learning_rate": 9.235187978376141e-06, "loss": 0.0022, "step": 934 }, { "epoch": 0.3033113924050633, "grad_norm": 0.0940459743142128, "learning_rate": 9.231811112642121e-06, "loss": 0.0013, "step": 936 }, { "epoch": 0.3039594936708861, "grad_norm": 0.12335839867591858, "learning_rate": 9.228427428616749e-06, "loss": 0.0013, "step": 938 }, { "epoch": 0.30460759493670886, "grad_norm": 0.15589086711406708, "learning_rate": 9.225036931751811e-06, "loss": 0.0008, "step": 940 }, { "epoch": 0.3052556962025316, "grad_norm": 1.7561614513397217, "learning_rate": 9.221639627510076e-06, "loss": 0.0009, "step": 942 }, { "epoch": 0.30590379746835444, "grad_norm": 0.5845587253570557, "learning_rate": 9.218235521365278e-06, "loss": 0.0009, "step": 944 }, { "epoch": 0.3065518987341772, "grad_norm": 0.1916065365076065, "learning_rate": 9.214824618802108e-06, "loss": 0.0005, "step": 946 }, { "epoch": 0.3072, "grad_norm": 0.4482108950614929, "learning_rate": 9.211406925316214e-06, "loss": 0.0018, "step": 948 }, { "epoch": 0.3078481012658228, "grad_norm": 0.20742888748645782, "learning_rate": 9.20798244641418e-06, "loss": 0.0005, "step": 950 }, { "epoch": 0.30849620253164556, "grad_norm": 0.12374088913202286, "learning_rate": 9.204551187613521e-06, "loss": 0.0013, "step": 952 }, { "epoch": 0.3091443037974684, "grad_norm": 0.1217096671462059, "learning_rate": 9.201113154442685e-06, "loss": 0.0004, "step": 954 }, { "epoch": 0.30979240506329114, "grad_norm": 0.463105708360672, "learning_rate": 9.197668352441025e-06, "loss": 0.0006, "step": 956 }, { "epoch": 0.3104405063291139, "grad_norm": 0.21807028353214264, "learning_rate": 9.194216787158805e-06, "loss": 0.0006, "step": 958 }, { "epoch": 0.31108860759493673, "grad_norm": 0.29980653524398804, "learning_rate": 9.190758464157184e-06, "loss": 0.0004, "step": 960 }, { "epoch": 0.31108860759493673, "eval_accuracy": 0.9997704680437066, "eval_loss": 0.0012385541340336204, "eval_runtime": 105.1546, "eval_samples_per_second": 47.549, "eval_steps_per_second": 11.887, "step": 960 }, { "epoch": 0.3117367088607595, "grad_norm": 1.859007477760315, "learning_rate": 9.18729338900821e-06, "loss": 0.0002, "step": 962 }, { "epoch": 0.31238481012658226, "grad_norm": 0.0305470023304224, "learning_rate": 9.18382156729481e-06, "loss": 0.0005, "step": 964 }, { "epoch": 0.3130329113924051, "grad_norm": 0.1439788043498993, "learning_rate": 9.18034300461078e-06, "loss": 0.0002, "step": 966 }, { "epoch": 0.31368101265822784, "grad_norm": 0.1942347288131714, "learning_rate": 9.17685770656078e-06, "loss": 0.0006, "step": 968 }, { "epoch": 0.3143291139240506, "grad_norm": 15.445067405700684, "learning_rate": 9.173365678760318e-06, "loss": 0.0054, "step": 970 }, { "epoch": 0.3149772151898734, "grad_norm": 0.431581050157547, "learning_rate": 9.169866926835749e-06, "loss": 0.0027, "step": 972 }, { "epoch": 0.3156253164556962, "grad_norm": 0.2632336914539337, "learning_rate": 9.166361456424257e-06, "loss": 0.0007, "step": 974 }, { "epoch": 0.316273417721519, "grad_norm": 1.0372949838638306, "learning_rate": 9.162849273173857e-06, "loss": 0.0022, "step": 976 }, { "epoch": 0.3169215189873418, "grad_norm": 0.19878140091896057, "learning_rate": 9.159330382743375e-06, "loss": 0.0019, "step": 978 }, { "epoch": 0.31756962025316454, "grad_norm": 0.7643746733665466, "learning_rate": 9.155804790802444e-06, "loss": 0.0015, "step": 980 }, { "epoch": 0.31821772151898736, "grad_norm": 0.17058204114437103, "learning_rate": 9.152272503031496e-06, "loss": 0.0009, "step": 982 }, { "epoch": 0.3188658227848101, "grad_norm": 0.32723182439804077, "learning_rate": 9.148733525121751e-06, "loss": 0.0015, "step": 984 }, { "epoch": 0.3195139240506329, "grad_norm": 0.25325924158096313, "learning_rate": 9.145187862775208e-06, "loss": 0.0019, "step": 986 }, { "epoch": 0.3201620253164557, "grad_norm": 0.5356621146202087, "learning_rate": 9.141635521704638e-06, "loss": 0.0008, "step": 988 }, { "epoch": 0.3208101265822785, "grad_norm": 0.04259486123919487, "learning_rate": 9.138076507633566e-06, "loss": 0.0003, "step": 990 }, { "epoch": 0.32145822784810124, "grad_norm": 0.5347451567649841, "learning_rate": 9.134510826296277e-06, "loss": 0.0005, "step": 992 }, { "epoch": 0.32210632911392406, "grad_norm": 0.09377581626176834, "learning_rate": 9.130938483437792e-06, "loss": 0.0006, "step": 994 }, { "epoch": 0.3227544303797468, "grad_norm": 0.15030620992183685, "learning_rate": 9.12735948481387e-06, "loss": 0.0004, "step": 996 }, { "epoch": 0.32340253164556965, "grad_norm": 0.20812305808067322, "learning_rate": 9.12377383619099e-06, "loss": 0.0006, "step": 998 }, { "epoch": 0.3240506329113924, "grad_norm": 1.2129936218261719, "learning_rate": 9.120181543346348e-06, "loss": 0.0008, "step": 1000 }, { "epoch": 0.3240506329113924, "eval_accuracy": 0.9998334119906976, "eval_loss": 0.0007947856211103499, "eval_runtime": 105.0477, "eval_samples_per_second": 47.597, "eval_steps_per_second": 11.899, "step": 1000 }, { "epoch": 0.3246987341772152, "grad_norm": 0.22441734373569489, "learning_rate": 9.11658261206784e-06, "loss": 0.0002, "step": 1002 }, { "epoch": 0.325346835443038, "grad_norm": 0.46537208557128906, "learning_rate": 9.112977048154066e-06, "loss": 0.0005, "step": 1004 }, { "epoch": 0.32599493670886076, "grad_norm": 0.56826251745224, "learning_rate": 9.109364857414306e-06, "loss": 0.0035, "step": 1006 }, { "epoch": 0.3266430379746835, "grad_norm": 0.10446134209632874, "learning_rate": 9.10574604566852e-06, "loss": 0.0007, "step": 1008 }, { "epoch": 0.32729113924050635, "grad_norm": 0.1642230600118637, "learning_rate": 9.102120618747336e-06, "loss": 0.001, "step": 1010 }, { "epoch": 0.3279392405063291, "grad_norm": 0.06929557770490646, "learning_rate": 9.09848858249204e-06, "loss": 0.0002, "step": 1012 }, { "epoch": 0.3285873417721519, "grad_norm": 0.2563181221485138, "learning_rate": 9.094849942754564e-06, "loss": 0.0005, "step": 1014 }, { "epoch": 0.3292354430379747, "grad_norm": 0.41675665974617004, "learning_rate": 9.091204705397485e-06, "loss": 0.001, "step": 1016 }, { "epoch": 0.32988354430379746, "grad_norm": 0.02498539723455906, "learning_rate": 9.087552876294003e-06, "loss": 0.0001, "step": 1018 }, { "epoch": 0.3305316455696203, "grad_norm": 0.0656026154756546, "learning_rate": 9.083894461327946e-06, "loss": 0.0003, "step": 1020 }, { "epoch": 0.33117974683544305, "grad_norm": 0.11689463257789612, "learning_rate": 9.08022946639375e-06, "loss": 0.0003, "step": 1022 }, { "epoch": 0.3318278481012658, "grad_norm": 0.41019096970558167, "learning_rate": 9.076557897396452e-06, "loss": 0.0003, "step": 1024 }, { "epoch": 0.33247594936708863, "grad_norm": 0.5071588754653931, "learning_rate": 9.07287976025168e-06, "loss": 0.0009, "step": 1026 }, { "epoch": 0.3331240506329114, "grad_norm": 0.7110933661460876, "learning_rate": 9.069195060885647e-06, "loss": 0.0007, "step": 1028 }, { "epoch": 0.33377215189873416, "grad_norm": 0.4573919177055359, "learning_rate": 9.065503805235139e-06, "loss": 0.0008, "step": 1030 }, { "epoch": 0.334420253164557, "grad_norm": 0.5230830311775208, "learning_rate": 9.061805999247504e-06, "loss": 0.0009, "step": 1032 }, { "epoch": 0.33506835443037974, "grad_norm": 0.4800056219100952, "learning_rate": 9.058101648880646e-06, "loss": 0.0009, "step": 1034 }, { "epoch": 0.3357164556962025, "grad_norm": 0.30046916007995605, "learning_rate": 9.05439076010301e-06, "loss": 0.001, "step": 1036 }, { "epoch": 0.33636455696202533, "grad_norm": 0.1976243406534195, "learning_rate": 9.050673338893578e-06, "loss": 0.0004, "step": 1038 }, { "epoch": 0.3370126582278481, "grad_norm": 0.04080143943428993, "learning_rate": 9.046949391241859e-06, "loss": 0.0001, "step": 1040 }, { "epoch": 0.3370126582278481, "eval_accuracy": 0.9999459609045181, "eval_loss": 0.0002253134734928608, "eval_runtime": 105.8583, "eval_samples_per_second": 47.233, "eval_steps_per_second": 11.808, "step": 1040 }, { "epoch": 0.33766075949367086, "grad_norm": 0.04524859040975571, "learning_rate": 9.043218923147874e-06, "loss": 0.0001, "step": 1042 }, { "epoch": 0.3383088607594937, "grad_norm": 0.16028116643428802, "learning_rate": 9.039481940622148e-06, "loss": 0.0001, "step": 1044 }, { "epoch": 0.33895696202531644, "grad_norm": 0.12261617928743362, "learning_rate": 9.035738449685707e-06, "loss": 0.0002, "step": 1046 }, { "epoch": 0.33960506329113926, "grad_norm": 0.06535837799310684, "learning_rate": 9.031988456370062e-06, "loss": 0.0002, "step": 1048 }, { "epoch": 0.34025316455696203, "grad_norm": 0.258878231048584, "learning_rate": 9.0282319667172e-06, "loss": 0.0002, "step": 1050 }, { "epoch": 0.3409012658227848, "grad_norm": 0.04995712265372276, "learning_rate": 9.02446898677957e-06, "loss": 0.0002, "step": 1052 }, { "epoch": 0.3415493670886076, "grad_norm": 0.03698616847395897, "learning_rate": 9.020699522620091e-06, "loss": 0.0004, "step": 1054 }, { "epoch": 0.3421974683544304, "grad_norm": 0.0686570331454277, "learning_rate": 9.016923580312114e-06, "loss": 0.0003, "step": 1056 }, { "epoch": 0.34284556962025314, "grad_norm": 0.011084304191172123, "learning_rate": 9.013141165939439e-06, "loss": 0.0006, "step": 1058 }, { "epoch": 0.34349367088607596, "grad_norm": 0.07190306484699249, "learning_rate": 9.009352285596287e-06, "loss": 0.0, "step": 1060 }, { "epoch": 0.34414177215189873, "grad_norm": 0.12991821765899658, "learning_rate": 9.005556945387301e-06, "loss": 0.0003, "step": 1062 }, { "epoch": 0.3447898734177215, "grad_norm": 0.11152586340904236, "learning_rate": 9.001755151427532e-06, "loss": 0.0007, "step": 1064 }, { "epoch": 0.3454379746835443, "grad_norm": 0.08615229278802872, "learning_rate": 8.997946909842426e-06, "loss": 0.0003, "step": 1066 }, { "epoch": 0.3460860759493671, "grad_norm": 0.11003229767084122, "learning_rate": 8.99413222676782e-06, "loss": 0.0003, "step": 1068 }, { "epoch": 0.3467341772151899, "grad_norm": 0.4303703010082245, "learning_rate": 8.990311108349926e-06, "loss": 0.0003, "step": 1070 }, { "epoch": 0.34738227848101266, "grad_norm": 0.45959335565567017, "learning_rate": 8.986483560745335e-06, "loss": 0.0008, "step": 1072 }, { "epoch": 0.3480303797468354, "grad_norm": 0.38193613290786743, "learning_rate": 8.982649590120982e-06, "loss": 0.0001, "step": 1074 }, { "epoch": 0.34867848101265825, "grad_norm": 1.4580763578414917, "learning_rate": 8.978809202654161e-06, "loss": 0.0006, "step": 1076 }, { "epoch": 0.349326582278481, "grad_norm": 0.23947280645370483, "learning_rate": 8.974962404532503e-06, "loss": 0.0009, "step": 1078 }, { "epoch": 0.3499746835443038, "grad_norm": 0.40106865763664246, "learning_rate": 8.971109201953962e-06, "loss": 0.0008, "step": 1080 }, { "epoch": 0.3499746835443038, "eval_accuracy": 0.9997245806688072, "eval_loss": 0.00159825652372092, "eval_runtime": 105.5986, "eval_samples_per_second": 47.349, "eval_steps_per_second": 11.837, "step": 1080 }, { "epoch": 0.3506227848101266, "grad_norm": 0.16527506709098816, "learning_rate": 8.967249601126821e-06, "loss": 0.0006, "step": 1082 }, { "epoch": 0.35127088607594936, "grad_norm": 0.4305400252342224, "learning_rate": 8.963383608269665e-06, "loss": 0.0009, "step": 1084 }, { "epoch": 0.3519189873417721, "grad_norm": 0.07541073113679886, "learning_rate": 8.959511229611377e-06, "loss": 0.0002, "step": 1086 }, { "epoch": 0.35256708860759495, "grad_norm": 0.04376499727368355, "learning_rate": 8.955632471391132e-06, "loss": 0.0002, "step": 1088 }, { "epoch": 0.3532151898734177, "grad_norm": 0.3669925332069397, "learning_rate": 8.951747339858383e-06, "loss": 0.0004, "step": 1090 }, { "epoch": 0.35386329113924053, "grad_norm": 0.018304547294974327, "learning_rate": 8.947855841272852e-06, "loss": 0.0001, "step": 1092 }, { "epoch": 0.3545113924050633, "grad_norm": 0.15526734292507172, "learning_rate": 8.943957981904518e-06, "loss": 0.0006, "step": 1094 }, { "epoch": 0.35515949367088606, "grad_norm": 0.04184693470597267, "learning_rate": 8.94005376803361e-06, "loss": 0.0002, "step": 1096 }, { "epoch": 0.3558075949367089, "grad_norm": 0.19317471981048584, "learning_rate": 8.936143205950596e-06, "loss": 0.0008, "step": 1098 }, { "epoch": 0.35645569620253165, "grad_norm": 0.14545564353466034, "learning_rate": 8.93222630195617e-06, "loss": 0.0005, "step": 1100 }, { "epoch": 0.3571037974683544, "grad_norm": 0.26518160104751587, "learning_rate": 8.928303062361244e-06, "loss": 0.0005, "step": 1102 }, { "epoch": 0.35775189873417723, "grad_norm": 0.37797990441322327, "learning_rate": 8.924373493486941e-06, "loss": 0.0002, "step": 1104 }, { "epoch": 0.3584, "grad_norm": 0.17237479984760284, "learning_rate": 8.92043760166458e-06, "loss": 0.0001, "step": 1106 }, { "epoch": 0.35904810126582276, "grad_norm": 0.04698828235268593, "learning_rate": 8.916495393235666e-06, "loss": 0.0008, "step": 1108 }, { "epoch": 0.3596962025316456, "grad_norm": 0.04225539043545723, "learning_rate": 8.912546874551883e-06, "loss": 0.0004, "step": 1110 }, { "epoch": 0.36034430379746835, "grad_norm": 0.041860613971948624, "learning_rate": 8.908592051975083e-06, "loss": 0.0003, "step": 1112 }, { "epoch": 0.36099240506329117, "grad_norm": 0.49554023146629333, "learning_rate": 8.904630931877271e-06, "loss": 0.0011, "step": 1114 }, { "epoch": 0.36164050632911393, "grad_norm": 0.019335318356752396, "learning_rate": 8.900663520640605e-06, "loss": 0.0011, "step": 1116 }, { "epoch": 0.3622886075949367, "grad_norm": 1.3709319829940796, "learning_rate": 8.896689824657371e-06, "loss": 0.0003, "step": 1118 }, { "epoch": 0.3629367088607595, "grad_norm": 0.5006961226463318, "learning_rate": 8.892709850329991e-06, "loss": 0.0016, "step": 1120 }, { "epoch": 0.3629367088607595, "eval_accuracy": 0.9998284606779089, "eval_loss": 0.0006213401211425662, "eval_runtime": 104.31, "eval_samples_per_second": 47.934, "eval_steps_per_second": 11.984, "step": 1120 }, { "epoch": 0.3635848101265823, "grad_norm": 0.29077577590942383, "learning_rate": 8.88872360407099e-06, "loss": 0.0008, "step": 1122 }, { "epoch": 0.36423291139240505, "grad_norm": 0.18928442895412445, "learning_rate": 8.884731092303011e-06, "loss": 0.0006, "step": 1124 }, { "epoch": 0.36488101265822787, "grad_norm": 0.011645007878541946, "learning_rate": 8.880732321458785e-06, "loss": 0.0001, "step": 1126 }, { "epoch": 0.36552911392405063, "grad_norm": 1.1214470863342285, "learning_rate": 8.876727297981129e-06, "loss": 0.0007, "step": 1128 }, { "epoch": 0.3661772151898734, "grad_norm": 0.548606812953949, "learning_rate": 8.872716028322931e-06, "loss": 0.0007, "step": 1130 }, { "epoch": 0.3668253164556962, "grad_norm": 0.0687018632888794, "learning_rate": 8.868698518947152e-06, "loss": 0.0001, "step": 1132 }, { "epoch": 0.367473417721519, "grad_norm": 0.5724080204963684, "learning_rate": 8.864674776326798e-06, "loss": 0.0006, "step": 1134 }, { "epoch": 0.36812151898734174, "grad_norm": 0.08760637789964676, "learning_rate": 8.860644806944917e-06, "loss": 0.0006, "step": 1136 }, { "epoch": 0.36876962025316457, "grad_norm": 0.08993974328041077, "learning_rate": 8.8566086172946e-06, "loss": 0.0003, "step": 1138 }, { "epoch": 0.36941772151898733, "grad_norm": 0.108463354408741, "learning_rate": 8.852566213878947e-06, "loss": 0.0007, "step": 1140 }, { "epoch": 0.37006582278481015, "grad_norm": 0.054592058062553406, "learning_rate": 8.84851760321108e-06, "loss": 0.0001, "step": 1142 }, { "epoch": 0.3707139240506329, "grad_norm": 0.16010713577270508, "learning_rate": 8.844462791814113e-06, "loss": 0.0003, "step": 1144 }, { "epoch": 0.3713620253164557, "grad_norm": 0.04880848154425621, "learning_rate": 8.84040178622116e-06, "loss": 0.0002, "step": 1146 }, { "epoch": 0.3720101265822785, "grad_norm": 0.16024944186210632, "learning_rate": 8.83633459297531e-06, "loss": 0.0004, "step": 1148 }, { "epoch": 0.37265822784810126, "grad_norm": 0.04184041544795036, "learning_rate": 8.83226121862962e-06, "loss": 0.0002, "step": 1150 }, { "epoch": 0.37330632911392403, "grad_norm": 0.006558520253747702, "learning_rate": 8.828181669747111e-06, "loss": 0.0002, "step": 1152 }, { "epoch": 0.37395443037974685, "grad_norm": 0.2030220627784729, "learning_rate": 8.824095952900746e-06, "loss": 0.0006, "step": 1154 }, { "epoch": 0.3746025316455696, "grad_norm": 0.015400746837258339, "learning_rate": 8.820004074673433e-06, "loss": 0.0001, "step": 1156 }, { "epoch": 0.3752506329113924, "grad_norm": 0.10592592507600784, "learning_rate": 8.815906041658001e-06, "loss": 0.0005, "step": 1158 }, { "epoch": 0.3758987341772152, "grad_norm": 0.17723920941352844, "learning_rate": 8.8118018604572e-06, "loss": 0.0003, "step": 1160 }, { "epoch": 0.3758987341772152, "eval_accuracy": 0.9999317506510353, "eval_loss": 0.0002906288718804717, "eval_runtime": 105.6508, "eval_samples_per_second": 47.326, "eval_steps_per_second": 11.831, "step": 1160 }, { "epoch": 0.37654683544303796, "grad_norm": 0.05428408086299896, "learning_rate": 8.807691537683685e-06, "loss": 0.0007, "step": 1162 }, { "epoch": 0.3771949367088608, "grad_norm": 0.12672647833824158, "learning_rate": 8.80357507996e-06, "loss": 0.0009, "step": 1164 }, { "epoch": 0.37784303797468355, "grad_norm": 0.035302940756082535, "learning_rate": 8.799452493918586e-06, "loss": 0.0, "step": 1166 }, { "epoch": 0.3784911392405063, "grad_norm": 0.01754443719983101, "learning_rate": 8.795323786201746e-06, "loss": 0.0, "step": 1168 }, { "epoch": 0.37913924050632913, "grad_norm": 0.21614503860473633, "learning_rate": 8.791188963461653e-06, "loss": 0.001, "step": 1170 }, { "epoch": 0.3797873417721519, "grad_norm": 0.0031652958132326603, "learning_rate": 8.787048032360332e-06, "loss": 0.001, "step": 1172 }, { "epoch": 0.38043544303797466, "grad_norm": 0.05934705585241318, "learning_rate": 8.782900999569646e-06, "loss": 0.0001, "step": 1174 }, { "epoch": 0.3810835443037975, "grad_norm": 0.01244109682738781, "learning_rate": 8.778747871771293e-06, "loss": 0.0, "step": 1176 }, { "epoch": 0.38173164556962025, "grad_norm": 0.017785053700208664, "learning_rate": 8.774588655656787e-06, "loss": 0.0, "step": 1178 }, { "epoch": 0.382379746835443, "grad_norm": 0.17515426874160767, "learning_rate": 8.770423357927463e-06, "loss": 0.0022, "step": 1180 }, { "epoch": 0.38302784810126583, "grad_norm": 0.2343391627073288, "learning_rate": 8.766251985294435e-06, "loss": 0.0001, "step": 1182 }, { "epoch": 0.3836759493670886, "grad_norm": 0.17780566215515137, "learning_rate": 8.762074544478622e-06, "loss": 0.0001, "step": 1184 }, { "epoch": 0.3843240506329114, "grad_norm": 0.005761538632214069, "learning_rate": 8.757891042210713e-06, "loss": 0.0, "step": 1186 }, { "epoch": 0.3849721518987342, "grad_norm": 0.13648106157779694, "learning_rate": 8.753701485231165e-06, "loss": 0.0005, "step": 1188 }, { "epoch": 0.38562025316455695, "grad_norm": 0.025816872715950012, "learning_rate": 8.749505880290188e-06, "loss": 0.0001, "step": 1190 }, { "epoch": 0.38626835443037977, "grad_norm": 0.04343462362885475, "learning_rate": 8.74530423414774e-06, "loss": 0.0002, "step": 1192 }, { "epoch": 0.38691645569620253, "grad_norm": 0.08946521580219269, "learning_rate": 8.741096553573506e-06, "loss": 0.0001, "step": 1194 }, { "epoch": 0.3875645569620253, "grad_norm": 0.06926504522562027, "learning_rate": 8.736882845346906e-06, "loss": 0.0001, "step": 1196 }, { "epoch": 0.3882126582278481, "grad_norm": 0.13481925427913666, "learning_rate": 8.732663116257057e-06, "loss": 0.0005, "step": 1198 }, { "epoch": 0.3888607594936709, "grad_norm": 0.10871945321559906, "learning_rate": 8.728437373102784e-06, "loss": 0.0002, "step": 1200 }, { "epoch": 0.3888607594936709, "eval_accuracy": 0.9999794378783441, "eval_loss": 7.780938176438212e-05, "eval_runtime": 105.7236, "eval_samples_per_second": 47.293, "eval_steps_per_second": 11.823, "step": 1200 }, { "epoch": 0.38950886075949365, "grad_norm": 0.011972387321293354, "learning_rate": 8.724205622692608e-06, "loss": 0.0001, "step": 1202 }, { "epoch": 0.39015696202531647, "grad_norm": 0.14289769530296326, "learning_rate": 8.719967871844715e-06, "loss": 0.0, "step": 1204 }, { "epoch": 0.39080506329113923, "grad_norm": 0.06000698730349541, "learning_rate": 8.715724127386971e-06, "loss": 0.0001, "step": 1206 }, { "epoch": 0.39145316455696205, "grad_norm": 0.055760614573955536, "learning_rate": 8.711474396156894e-06, "loss": 0.0001, "step": 1208 }, { "epoch": 0.3921012658227848, "grad_norm": 0.04712485894560814, "learning_rate": 8.707218685001648e-06, "loss": 0.0001, "step": 1210 }, { "epoch": 0.3927493670886076, "grad_norm": 0.022366875782608986, "learning_rate": 8.702957000778029e-06, "loss": 0.0002, "step": 1212 }, { "epoch": 0.3933974683544304, "grad_norm": 0.47528231143951416, "learning_rate": 8.698689350352465e-06, "loss": 0.0002, "step": 1214 }, { "epoch": 0.39404556962025317, "grad_norm": 0.10005582869052887, "learning_rate": 8.69441574060099e-06, "loss": 0.0001, "step": 1216 }, { "epoch": 0.39469367088607593, "grad_norm": 0.4457380473613739, "learning_rate": 8.690136178409237e-06, "loss": 0.0002, "step": 1218 }, { "epoch": 0.39534177215189875, "grad_norm": 0.013173989951610565, "learning_rate": 8.685850670672438e-06, "loss": 0.0, "step": 1220 }, { "epoch": 0.3959898734177215, "grad_norm": 0.003624015487730503, "learning_rate": 8.681559224295401e-06, "loss": 0.0, "step": 1222 }, { "epoch": 0.3966379746835443, "grad_norm": 0.23442572355270386, "learning_rate": 8.6772618461925e-06, "loss": 0.0002, "step": 1224 }, { "epoch": 0.3972860759493671, "grad_norm": 0.3089965283870697, "learning_rate": 8.672958543287666e-06, "loss": 0.0002, "step": 1226 }, { "epoch": 0.39793417721518987, "grad_norm": 0.18517152965068817, "learning_rate": 8.668649322514382e-06, "loss": 0.0001, "step": 1228 }, { "epoch": 0.39858227848101263, "grad_norm": 0.6624205708503723, "learning_rate": 8.66433419081566e-06, "loss": 0.0006, "step": 1230 }, { "epoch": 0.39923037974683545, "grad_norm": 0.02062745951116085, "learning_rate": 8.660013155144036e-06, "loss": 0.0001, "step": 1232 }, { "epoch": 0.3998784810126582, "grad_norm": 0.035230252891778946, "learning_rate": 8.655686222461561e-06, "loss": 0.0002, "step": 1234 }, { "epoch": 0.40052658227848104, "grad_norm": 0.17802561819553375, "learning_rate": 8.651353399739787e-06, "loss": 0.0001, "step": 1236 }, { "epoch": 0.4011746835443038, "grad_norm": 0.07029800117015839, "learning_rate": 8.647014693959754e-06, "loss": 0.0003, "step": 1238 }, { "epoch": 0.40182278481012657, "grad_norm": 0.04419781640172005, "learning_rate": 8.642670112111982e-06, "loss": 0.0001, "step": 1240 }, { "epoch": 0.40182278481012657, "eval_accuracy": 0.9998813481127686, "eval_loss": 0.00046045694034546614, "eval_runtime": 107.728, "eval_samples_per_second": 46.413, "eval_steps_per_second": 11.603, "step": 1240 }, { "epoch": 0.4024708860759494, "grad_norm": 0.37598854303359985, "learning_rate": 8.63831966119646e-06, "loss": 0.0008, "step": 1242 }, { "epoch": 0.40311898734177215, "grad_norm": 0.02678529918193817, "learning_rate": 8.633963348222628e-06, "loss": 0.0005, "step": 1244 }, { "epoch": 0.4037670886075949, "grad_norm": 0.040035102516412735, "learning_rate": 8.629601180209382e-06, "loss": 0.0001, "step": 1246 }, { "epoch": 0.40441518987341774, "grad_norm": 0.044874873012304306, "learning_rate": 8.625233164185035e-06, "loss": 0.0, "step": 1248 }, { "epoch": 0.4050632911392405, "grad_norm": 0.11809933930635452, "learning_rate": 8.620859307187339e-06, "loss": 0.0002, "step": 1250 }, { "epoch": 0.40571139240506326, "grad_norm": 0.07839110493659973, "learning_rate": 8.616479616263444e-06, "loss": 0.0, "step": 1252 }, { "epoch": 0.4063594936708861, "grad_norm": 0.20940014719963074, "learning_rate": 8.61209409846991e-06, "loss": 0.0005, "step": 1254 }, { "epoch": 0.40700759493670885, "grad_norm": 0.00456192996352911, "learning_rate": 8.607702760872679e-06, "loss": 0.0004, "step": 1256 }, { "epoch": 0.40765569620253167, "grad_norm": 0.01556658186018467, "learning_rate": 8.60330561054707e-06, "loss": 0.0, "step": 1258 }, { "epoch": 0.40830379746835443, "grad_norm": 0.0071419524028897285, "learning_rate": 8.598902654577768e-06, "loss": 0.0001, "step": 1260 }, { "epoch": 0.4089518987341772, "grad_norm": 0.10579927265644073, "learning_rate": 8.594493900058817e-06, "loss": 0.0001, "step": 1262 }, { "epoch": 0.4096, "grad_norm": 0.08323294669389725, "learning_rate": 8.590079354093594e-06, "loss": 0.0002, "step": 1264 }, { "epoch": 0.4102481012658228, "grad_norm": 0.06471659243106842, "learning_rate": 8.585659023794818e-06, "loss": 0.0001, "step": 1266 }, { "epoch": 0.41089620253164555, "grad_norm": 0.27636727690696716, "learning_rate": 8.581232916284519e-06, "loss": 0.0002, "step": 1268 }, { "epoch": 0.41154430379746837, "grad_norm": 0.009334628470242023, "learning_rate": 8.57680103869404e-06, "loss": 0.0, "step": 1270 }, { "epoch": 0.41219240506329113, "grad_norm": 0.00627860426902771, "learning_rate": 8.572363398164017e-06, "loss": 0.0, "step": 1272 }, { "epoch": 0.4128405063291139, "grad_norm": 0.06497501581907272, "learning_rate": 8.567920001844376e-06, "loss": 0.0003, "step": 1274 }, { "epoch": 0.4134886075949367, "grad_norm": 0.0032212065998464823, "learning_rate": 8.563470856894316e-06, "loss": 0.0002, "step": 1276 }, { "epoch": 0.4141367088607595, "grad_norm": 0.15634731948375702, "learning_rate": 8.559015970482292e-06, "loss": 0.0012, "step": 1278 }, { "epoch": 0.4147848101265823, "grad_norm": 0.2659966051578522, "learning_rate": 8.554555349786016e-06, "loss": 0.0005, "step": 1280 }, { "epoch": 0.4147848101265823, "eval_accuracy": 0.9999672238180145, "eval_loss": 0.00013511251017916948, "eval_runtime": 107.0127, "eval_samples_per_second": 46.723, "eval_steps_per_second": 11.681, "step": 1280 }, { "epoch": 0.41543291139240507, "grad_norm": 0.14331784844398499, "learning_rate": 8.550089001992438e-06, "loss": 0.0005, "step": 1282 }, { "epoch": 0.41608101265822783, "grad_norm": 0.3357856571674347, "learning_rate": 8.545616934297733e-06, "loss": 0.0006, "step": 1284 }, { "epoch": 0.41672911392405065, "grad_norm": 0.08820825070142746, "learning_rate": 8.541139153907296e-06, "loss": 0.0001, "step": 1286 }, { "epoch": 0.4173772151898734, "grad_norm": 0.11874289065599442, "learning_rate": 8.536655668035723e-06, "loss": 0.0003, "step": 1288 }, { "epoch": 0.4180253164556962, "grad_norm": 0.07864240556955338, "learning_rate": 8.532166483906804e-06, "loss": 0.0003, "step": 1290 }, { "epoch": 0.418673417721519, "grad_norm": 0.008136331103742123, "learning_rate": 8.527671608753508e-06, "loss": 0.0001, "step": 1292 }, { "epoch": 0.41932151898734177, "grad_norm": 0.22352826595306396, "learning_rate": 8.523171049817974e-06, "loss": 0.0002, "step": 1294 }, { "epoch": 0.41996962025316453, "grad_norm": 0.11811576783657074, "learning_rate": 8.518664814351502e-06, "loss": 0.0001, "step": 1296 }, { "epoch": 0.42061772151898735, "grad_norm": 0.07777664810419083, "learning_rate": 8.514152909614538e-06, "loss": 0.0001, "step": 1298 }, { "epoch": 0.4212658227848101, "grad_norm": 0.008249818347394466, "learning_rate": 8.509635342876655e-06, "loss": 0.0, "step": 1300 }, { "epoch": 0.42191392405063294, "grad_norm": 0.14114025235176086, "learning_rate": 8.505112121416554e-06, "loss": 0.0002, "step": 1302 }, { "epoch": 0.4225620253164557, "grad_norm": 0.17114797234535217, "learning_rate": 8.500583252522053e-06, "loss": 0.0003, "step": 1304 }, { "epoch": 0.42321012658227847, "grad_norm": 0.03196744620800018, "learning_rate": 8.496048743490053e-06, "loss": 0.0001, "step": 1306 }, { "epoch": 0.4238582278481013, "grad_norm": 0.022405337542295456, "learning_rate": 8.49150860162656e-06, "loss": 0.0001, "step": 1308 }, { "epoch": 0.42450632911392405, "grad_norm": 0.009215210564434528, "learning_rate": 8.486962834246646e-06, "loss": 0.0001, "step": 1310 }, { "epoch": 0.4251544303797468, "grad_norm": 0.08726584166288376, "learning_rate": 8.482411448674445e-06, "loss": 0.0001, "step": 1312 }, { "epoch": 0.42580253164556964, "grad_norm": 0.006170928943902254, "learning_rate": 8.477854452243149e-06, "loss": 0.0, "step": 1314 }, { "epoch": 0.4264506329113924, "grad_norm": 0.27486130595207214, "learning_rate": 8.473291852294986e-06, "loss": 0.0005, "step": 1316 }, { "epoch": 0.42709873417721517, "grad_norm": 0.5437067151069641, "learning_rate": 8.468723656181219e-06, "loss": 0.001, "step": 1318 }, { "epoch": 0.427746835443038, "grad_norm": 0.009669135324656963, "learning_rate": 8.464149871262118e-06, "loss": 0.0, "step": 1320 }, { "epoch": 0.427746835443038, "eval_accuracy": 0.9999560106092086, "eval_loss": 0.00023698546283412725, "eval_runtime": 105.6712, "eval_samples_per_second": 47.317, "eval_steps_per_second": 11.829, "step": 1320 }, { "epoch": 0.42839493670886075, "grad_norm": 0.02640676125884056, "learning_rate": 8.459570504906962e-06, "loss": 0.0006, "step": 1322 }, { "epoch": 0.4290430379746835, "grad_norm": 0.20899447798728943, "learning_rate": 8.454985564494025e-06, "loss": 0.0001, "step": 1324 }, { "epoch": 0.42969113924050634, "grad_norm": 0.5605078339576721, "learning_rate": 8.450395057410561e-06, "loss": 0.0009, "step": 1326 }, { "epoch": 0.4303392405063291, "grad_norm": 0.18222258985042572, "learning_rate": 8.445798991052791e-06, "loss": 0.0004, "step": 1328 }, { "epoch": 0.4309873417721519, "grad_norm": 0.16136883199214935, "learning_rate": 8.441197372825892e-06, "loss": 0.0001, "step": 1330 }, { "epoch": 0.4316354430379747, "grad_norm": 0.1953054517507553, "learning_rate": 8.436590210143991e-06, "loss": 0.0006, "step": 1332 }, { "epoch": 0.43228354430379745, "grad_norm": 0.2527662515640259, "learning_rate": 8.431977510430145e-06, "loss": 0.0004, "step": 1334 }, { "epoch": 0.43293164556962027, "grad_norm": 0.06676214188337326, "learning_rate": 8.427359281116335e-06, "loss": 0.0002, "step": 1336 }, { "epoch": 0.43357974683544304, "grad_norm": 0.2816427946090698, "learning_rate": 8.422735529643445e-06, "loss": 0.0003, "step": 1338 }, { "epoch": 0.4342278481012658, "grad_norm": 0.019613001495599747, "learning_rate": 8.418106263461261e-06, "loss": 0.0003, "step": 1340 }, { "epoch": 0.4348759493670886, "grad_norm": 0.02441325969994068, "learning_rate": 8.413471490028456e-06, "loss": 0.0002, "step": 1342 }, { "epoch": 0.4355240506329114, "grad_norm": 0.0942113846540451, "learning_rate": 8.408831216812574e-06, "loss": 0.0001, "step": 1344 }, { "epoch": 0.43617215189873415, "grad_norm": 0.2570306360721588, "learning_rate": 8.404185451290017e-06, "loss": 0.0005, "step": 1346 }, { "epoch": 0.43682025316455697, "grad_norm": 0.01734016090631485, "learning_rate": 8.399534200946044e-06, "loss": 0.0006, "step": 1348 }, { "epoch": 0.43746835443037974, "grad_norm": 0.06334984302520752, "learning_rate": 8.394877473274743e-06, "loss": 0.0002, "step": 1350 }, { "epoch": 0.43811645569620256, "grad_norm": 0.09384873509407043, "learning_rate": 8.39021527577903e-06, "loss": 0.0002, "step": 1352 }, { "epoch": 0.4387645569620253, "grad_norm": 0.02200414426624775, "learning_rate": 8.38554761597064e-06, "loss": 0.0003, "step": 1354 }, { "epoch": 0.4394126582278481, "grad_norm": 0.007787387352436781, "learning_rate": 8.380874501370098e-06, "loss": 0.0, "step": 1356 }, { "epoch": 0.4400607594936709, "grad_norm": 0.02325863018631935, "learning_rate": 8.376195939506727e-06, "loss": 0.0001, "step": 1358 }, { "epoch": 0.44070886075949367, "grad_norm": 0.02358267642557621, "learning_rate": 8.371511937918616e-06, "loss": 0.0001, "step": 1360 }, { "epoch": 0.44070886075949367, "eval_accuracy": 0.9999532019066854, "eval_loss": 0.00014842751261312515, "eval_runtime": 100.4462, "eval_samples_per_second": 49.778, "eval_steps_per_second": 12.444, "step": 1360 }, { "epoch": 0.44135696202531643, "grad_norm": 0.009702431038022041, "learning_rate": 8.366822504152636e-06, "loss": 0.0, "step": 1362 }, { "epoch": 0.44200506329113926, "grad_norm": 0.003049626015126705, "learning_rate": 8.362127645764392e-06, "loss": 0.0005, "step": 1364 }, { "epoch": 0.442653164556962, "grad_norm": 0.3415703773498535, "learning_rate": 8.357427370318239e-06, "loss": 0.0004, "step": 1366 }, { "epoch": 0.4433012658227848, "grad_norm": 0.04481835663318634, "learning_rate": 8.352721685387258e-06, "loss": 0.0003, "step": 1368 }, { "epoch": 0.4439493670886076, "grad_norm": 0.22728939354419708, "learning_rate": 8.348010598553245e-06, "loss": 0.0002, "step": 1370 }, { "epoch": 0.44459746835443037, "grad_norm": 0.08108057081699371, "learning_rate": 8.3432941174067e-06, "loss": 0.001, "step": 1372 }, { "epoch": 0.4452455696202532, "grad_norm": 1.374200701713562, "learning_rate": 8.338572249546813e-06, "loss": 0.0022, "step": 1374 }, { "epoch": 0.44589367088607595, "grad_norm": 0.008610721677541733, "learning_rate": 8.33384500258146e-06, "loss": 0.0004, "step": 1376 }, { "epoch": 0.4465417721518987, "grad_norm": 0.0495627261698246, "learning_rate": 8.329112384127172e-06, "loss": 0.0006, "step": 1378 }, { "epoch": 0.44718987341772154, "grad_norm": 0.02110774628818035, "learning_rate": 8.324374401809144e-06, "loss": 0.0002, "step": 1380 }, { "epoch": 0.4478379746835443, "grad_norm": 0.23234134912490845, "learning_rate": 8.319631063261209e-06, "loss": 0.0004, "step": 1382 }, { "epoch": 0.44848607594936707, "grad_norm": 0.12905079126358032, "learning_rate": 8.314882376125832e-06, "loss": 0.0007, "step": 1384 }, { "epoch": 0.4491341772151899, "grad_norm": 0.11328396201133728, "learning_rate": 8.310128348054093e-06, "loss": 0.0005, "step": 1386 }, { "epoch": 0.44978227848101265, "grad_norm": 0.016495974734425545, "learning_rate": 8.305368986705683e-06, "loss": 0.0001, "step": 1388 }, { "epoch": 0.4504303797468354, "grad_norm": 0.06582915037870407, "learning_rate": 8.300604299748876e-06, "loss": 0.0001, "step": 1390 }, { "epoch": 0.45107848101265824, "grad_norm": 0.0918450728058815, "learning_rate": 8.295834294860535e-06, "loss": 0.0001, "step": 1392 }, { "epoch": 0.451726582278481, "grad_norm": 0.36612367630004883, "learning_rate": 8.291058979726092e-06, "loss": 0.0003, "step": 1394 }, { "epoch": 0.4523746835443038, "grad_norm": 0.014073346741497517, "learning_rate": 8.286278362039527e-06, "loss": 0.0001, "step": 1396 }, { "epoch": 0.4530227848101266, "grad_norm": 0.14093072712421417, "learning_rate": 8.281492449503372e-06, "loss": 0.0003, "step": 1398 }, { "epoch": 0.45367088607594935, "grad_norm": 0.2960674464702606, "learning_rate": 8.276701249828684e-06, "loss": 0.0002, "step": 1400 }, { "epoch": 0.45367088607594935, "eval_accuracy": 0.9999100233106311, "eval_loss": 0.0003401956637389958, "eval_runtime": 102.0123, "eval_samples_per_second": 49.014, "eval_steps_per_second": 12.253, "step": 1400 }, { "epoch": 0.4543189873417722, "grad_norm": 0.005298405420035124, "learning_rate": 8.271904770735042e-06, "loss": 0.0004, "step": 1402 }, { "epoch": 0.45496708860759494, "grad_norm": 0.08128544688224792, "learning_rate": 8.267103019950529e-06, "loss": 0.0002, "step": 1404 }, { "epoch": 0.4556151898734177, "grad_norm": 0.03566949442028999, "learning_rate": 8.262296005211722e-06, "loss": 0.0001, "step": 1406 }, { "epoch": 0.4562632911392405, "grad_norm": 0.049706194549798965, "learning_rate": 8.257483734263682e-06, "loss": 0.0002, "step": 1408 }, { "epoch": 0.4569113924050633, "grad_norm": 0.43301084637641907, "learning_rate": 8.252666214859936e-06, "loss": 0.0019, "step": 1410 }, { "epoch": 0.45755949367088605, "grad_norm": 0.01807190477848053, "learning_rate": 8.247843454762467e-06, "loss": 0.0003, "step": 1412 }, { "epoch": 0.4582075949367089, "grad_norm": 0.040173470973968506, "learning_rate": 8.243015461741707e-06, "loss": 0.0001, "step": 1414 }, { "epoch": 0.45885569620253164, "grad_norm": 0.3392210304737091, "learning_rate": 8.238182243576512e-06, "loss": 0.0002, "step": 1416 }, { "epoch": 0.45950379746835446, "grad_norm": 0.1378878653049469, "learning_rate": 8.233343808054159e-06, "loss": 0.0006, "step": 1418 }, { "epoch": 0.4601518987341772, "grad_norm": 0.29194381833076477, "learning_rate": 8.228500162970333e-06, "loss": 0.0003, "step": 1420 }, { "epoch": 0.4608, "grad_norm": 0.03227922320365906, "learning_rate": 8.223651316129115e-06, "loss": 0.0001, "step": 1422 }, { "epoch": 0.4614481012658228, "grad_norm": 0.2710287868976593, "learning_rate": 8.21879727534296e-06, "loss": 0.0007, "step": 1424 }, { "epoch": 0.4620962025316456, "grad_norm": 0.01669517531991005, "learning_rate": 8.213938048432697e-06, "loss": 0.0, "step": 1426 }, { "epoch": 0.46274430379746834, "grad_norm": 0.6960672736167908, "learning_rate": 8.20907364322751e-06, "loss": 0.0002, "step": 1428 }, { "epoch": 0.46339240506329116, "grad_norm": 0.30904194712638855, "learning_rate": 8.204204067564924e-06, "loss": 0.0004, "step": 1430 }, { "epoch": 0.4640405063291139, "grad_norm": 0.2030443251132965, "learning_rate": 8.199329329290798e-06, "loss": 0.0003, "step": 1432 }, { "epoch": 0.4646886075949367, "grad_norm": 0.06898464262485504, "learning_rate": 8.194449436259305e-06, "loss": 0.0001, "step": 1434 }, { "epoch": 0.4653367088607595, "grad_norm": 0.39258521795272827, "learning_rate": 8.189564396332927e-06, "loss": 0.0008, "step": 1436 }, { "epoch": 0.46598481012658227, "grad_norm": 0.12645263969898224, "learning_rate": 8.184674217382438e-06, "loss": 0.0008, "step": 1438 }, { "epoch": 0.46663291139240504, "grad_norm": 0.1911512017250061, "learning_rate": 8.179778907286889e-06, "loss": 0.0008, "step": 1440 }, { "epoch": 0.46663291139240504, "eval_accuracy": 0.99995684467669, "eval_loss": 0.00017216747801285237, "eval_runtime": 101.363, "eval_samples_per_second": 49.328, "eval_steps_per_second": 12.332, "step": 1440 }, { "epoch": 0.46728101265822786, "grad_norm": 0.1570432484149933, "learning_rate": 8.174878473933601e-06, "loss": 0.0002, "step": 1442 }, { "epoch": 0.4679291139240506, "grad_norm": 0.1353127658367157, "learning_rate": 8.16997292521815e-06, "loss": 0.0005, "step": 1444 }, { "epoch": 0.46857721518987344, "grad_norm": 0.09459888190031052, "learning_rate": 8.165062269044353e-06, "loss": 0.0002, "step": 1446 }, { "epoch": 0.4692253164556962, "grad_norm": 0.03600757569074631, "learning_rate": 8.160146513324256e-06, "loss": 0.0001, "step": 1448 }, { "epoch": 0.46987341772151897, "grad_norm": 0.2478301078081131, "learning_rate": 8.15522566597812e-06, "loss": 0.0005, "step": 1450 }, { "epoch": 0.4705215189873418, "grad_norm": 0.16877983510494232, "learning_rate": 8.150299734934413e-06, "loss": 0.0004, "step": 1452 }, { "epoch": 0.47116962025316456, "grad_norm": 0.0916639119386673, "learning_rate": 8.14536872812979e-06, "loss": 0.0007, "step": 1454 }, { "epoch": 0.4718177215189873, "grad_norm": 0.04261047765612602, "learning_rate": 8.140432653509089e-06, "loss": 0.0001, "step": 1456 }, { "epoch": 0.47246582278481014, "grad_norm": 0.14687563478946686, "learning_rate": 8.135491519025307e-06, "loss": 0.0002, "step": 1458 }, { "epoch": 0.4731139240506329, "grad_norm": 0.15630154311656952, "learning_rate": 8.130545332639599e-06, "loss": 0.0001, "step": 1460 }, { "epoch": 0.47376202531645567, "grad_norm": 0.017821237444877625, "learning_rate": 8.125594102321256e-06, "loss": 0.0001, "step": 1462 }, { "epoch": 0.4744101265822785, "grad_norm": 0.025156579911708832, "learning_rate": 8.120637836047698e-06, "loss": 0.0, "step": 1464 }, { "epoch": 0.47505822784810126, "grad_norm": 0.24539688229560852, "learning_rate": 8.115676541804456e-06, "loss": 0.0003, "step": 1466 }, { "epoch": 0.4757063291139241, "grad_norm": 0.007439673412591219, "learning_rate": 8.110710227585169e-06, "loss": 0.0, "step": 1468 }, { "epoch": 0.47635443037974684, "grad_norm": 0.224391371011734, "learning_rate": 8.105738901391553e-06, "loss": 0.0001, "step": 1470 }, { "epoch": 0.4770025316455696, "grad_norm": 0.12915731966495514, "learning_rate": 8.100762571233409e-06, "loss": 0.0003, "step": 1472 }, { "epoch": 0.4776506329113924, "grad_norm": 0.02007501944899559, "learning_rate": 8.095781245128598e-06, "loss": 0.0, "step": 1474 }, { "epoch": 0.4782987341772152, "grad_norm": 0.009752247482538223, "learning_rate": 8.090794931103026e-06, "loss": 0.0, "step": 1476 }, { "epoch": 0.47894683544303795, "grad_norm": 0.10017750412225723, "learning_rate": 8.085803637190643e-06, "loss": 0.0003, "step": 1478 }, { "epoch": 0.4795949367088608, "grad_norm": 0.005627952981740236, "learning_rate": 8.080807371433415e-06, "loss": 0.0, "step": 1480 }, { "epoch": 0.4795949367088608, "eval_accuracy": 0.9999832096964334, "eval_loss": 7.632683264091611e-05, "eval_runtime": 100.8851, "eval_samples_per_second": 49.561, "eval_steps_per_second": 12.39, "step": 1480 }, { "epoch": 0.48024303797468354, "grad_norm": 0.14481300115585327, "learning_rate": 8.075806141881327e-06, "loss": 0.0001, "step": 1482 }, { "epoch": 0.4808911392405063, "grad_norm": 0.07196534425020218, "learning_rate": 8.07079995659235e-06, "loss": 0.0001, "step": 1484 }, { "epoch": 0.4815392405063291, "grad_norm": 0.1497977375984192, "learning_rate": 8.065788823632451e-06, "loss": 0.0005, "step": 1486 }, { "epoch": 0.4821873417721519, "grad_norm": 0.01656440645456314, "learning_rate": 8.060772751075564e-06, "loss": 0.0001, "step": 1488 }, { "epoch": 0.4828354430379747, "grad_norm": 0.001540559809654951, "learning_rate": 8.05575174700358e-06, "loss": 0.0, "step": 1490 }, { "epoch": 0.4834835443037975, "grad_norm": 0.0042270757257938385, "learning_rate": 8.05072581950634e-06, "loss": 0.0001, "step": 1492 }, { "epoch": 0.48413164556962024, "grad_norm": 0.10281050950288773, "learning_rate": 8.045694976681613e-06, "loss": 0.0001, "step": 1494 }, { "epoch": 0.48477974683544306, "grad_norm": 0.07573399692773819, "learning_rate": 8.04065922663509e-06, "loss": 0.0, "step": 1496 }, { "epoch": 0.4854278481012658, "grad_norm": 0.0019800164736807346, "learning_rate": 8.035618577480369e-06, "loss": 0.0, "step": 1498 }, { "epoch": 0.4860759493670886, "grad_norm": 0.4453049302101135, "learning_rate": 8.030573037338942e-06, "loss": 0.0003, "step": 1500 }, { "epoch": 0.4867240506329114, "grad_norm": 0.0011820768704637885, "learning_rate": 8.025522614340177e-06, "loss": 0.0, "step": 1502 }, { "epoch": 0.4873721518987342, "grad_norm": 0.0030645334627479315, "learning_rate": 8.020467316621316e-06, "loss": 0.0, "step": 1504 }, { "epoch": 0.48802025316455694, "grad_norm": 0.007107930723577738, "learning_rate": 8.015407152327448e-06, "loss": 0.0, "step": 1506 }, { "epoch": 0.48866835443037976, "grad_norm": 0.0038615772500634193, "learning_rate": 8.010342129611508e-06, "loss": 0.0, "step": 1508 }, { "epoch": 0.4893164556962025, "grad_norm": 0.04887912794947624, "learning_rate": 8.005272256634257e-06, "loss": 0.0001, "step": 1510 }, { "epoch": 0.48996455696202534, "grad_norm": 0.0012970338575541973, "learning_rate": 8.000197541564273e-06, "loss": 0.0, "step": 1512 }, { "epoch": 0.4906126582278481, "grad_norm": 0.0027239499613642693, "learning_rate": 7.99511799257793e-06, "loss": 0.0002, "step": 1514 }, { "epoch": 0.4912607594936709, "grad_norm": 0.08472245186567307, "learning_rate": 7.990033617859396e-06, "loss": 0.0, "step": 1516 }, { "epoch": 0.4919088607594937, "grad_norm": 0.0267314575612545, "learning_rate": 7.984944425600614e-06, "loss": 0.0001, "step": 1518 }, { "epoch": 0.49255696202531646, "grad_norm": 0.031123576685786247, "learning_rate": 7.979850424001283e-06, "loss": 0.0001, "step": 1520 }, { "epoch": 0.49255696202531646, "eval_accuracy": 0.9999826136620499, "eval_loss": 6.179293995955959e-05, "eval_runtime": 101.4096, "eval_samples_per_second": 49.305, "eval_steps_per_second": 12.326, "step": 1520 }, { "epoch": 0.4932050632911392, "grad_norm": 0.011671812273561954, "learning_rate": 7.97475162126886e-06, "loss": 0.0, "step": 1522 }, { "epoch": 0.49385316455696204, "grad_norm": 0.007582854479551315, "learning_rate": 7.96964802561853e-06, "loss": 0.0005, "step": 1524 }, { "epoch": 0.4945012658227848, "grad_norm": 0.06047774478793144, "learning_rate": 7.964539645273204e-06, "loss": 0.0001, "step": 1526 }, { "epoch": 0.4951493670886076, "grad_norm": 0.00793007854372263, "learning_rate": 7.9594264884635e-06, "loss": 0.0001, "step": 1528 }, { "epoch": 0.4957974683544304, "grad_norm": 0.008908102288842201, "learning_rate": 7.954308563427732e-06, "loss": 0.0, "step": 1530 }, { "epoch": 0.49644556962025316, "grad_norm": 0.04215071350336075, "learning_rate": 7.9491858784119e-06, "loss": 0.0005, "step": 1532 }, { "epoch": 0.4970936708860759, "grad_norm": 0.0012058537686243653, "learning_rate": 7.944058441669671e-06, "loss": 0.0002, "step": 1534 }, { "epoch": 0.49774177215189874, "grad_norm": 0.015041892416775227, "learning_rate": 7.938926261462366e-06, "loss": 0.0, "step": 1536 }, { "epoch": 0.4983898734177215, "grad_norm": 0.0026610875502228737, "learning_rate": 7.933789346058951e-06, "loss": 0.0, "step": 1538 }, { "epoch": 0.4990379746835443, "grad_norm": 0.002617933787405491, "learning_rate": 7.928647703736024e-06, "loss": 0.0, "step": 1540 }, { "epoch": 0.4996860759493671, "grad_norm": 0.044066380709409714, "learning_rate": 7.923501342777788e-06, "loss": 0.0, "step": 1542 }, { "epoch": 0.5003341772151899, "grad_norm": 0.0029755565337836742, "learning_rate": 7.918350271476064e-06, "loss": 0.0, "step": 1544 }, { "epoch": 0.5009822784810126, "grad_norm": 0.02002215012907982, "learning_rate": 7.913194498130252e-06, "loss": 0.0, "step": 1546 }, { "epoch": 0.5016303797468354, "grad_norm": 0.14786788821220398, "learning_rate": 7.90803403104733e-06, "loss": 0.0004, "step": 1548 }, { "epoch": 0.5022784810126583, "grad_norm": 0.003735872684046626, "learning_rate": 7.90286887854184e-06, "loss": 0.0, "step": 1550 }, { "epoch": 0.502926582278481, "grad_norm": 0.003643750213086605, "learning_rate": 7.897699048935875e-06, "loss": 0.0, "step": 1552 }, { "epoch": 0.5035746835443038, "grad_norm": 0.0010872690472751856, "learning_rate": 7.892524550559056e-06, "loss": 0.0, "step": 1554 }, { "epoch": 0.5042227848101266, "grad_norm": 0.004607509821653366, "learning_rate": 7.887345391748533e-06, "loss": 0.0003, "step": 1556 }, { "epoch": 0.5048708860759493, "grad_norm": 0.018745901063084602, "learning_rate": 7.882161580848966e-06, "loss": 0.0001, "step": 1558 }, { "epoch": 0.5055189873417721, "grad_norm": 0.0027148567605763674, "learning_rate": 7.876973126212507e-06, "loss": 0.0, "step": 1560 }, { "epoch": 0.5055189873417721, "eval_accuracy": 0.9999803987308754, "eval_loss": 7.348901272052899e-05, "eval_runtime": 102.1608, "eval_samples_per_second": 48.942, "eval_steps_per_second": 12.236, "step": 1560 }, { "epoch": 0.506167088607595, "grad_norm": 0.06728499382734299, "learning_rate": 7.87178003619879e-06, "loss": 0.0001, "step": 1562 }, { "epoch": 0.5068151898734177, "grad_norm": 0.03146948292851448, "learning_rate": 7.866582319174918e-06, "loss": 0.0, "step": 1564 }, { "epoch": 0.5074632911392405, "grad_norm": 0.07248188555240631, "learning_rate": 7.861379983515449e-06, "loss": 0.0001, "step": 1566 }, { "epoch": 0.5081113924050633, "grad_norm": 0.0029003811068832874, "learning_rate": 7.856173037602383e-06, "loss": 0.0001, "step": 1568 }, { "epoch": 0.508759493670886, "grad_norm": 0.052399907261133194, "learning_rate": 7.85096148982515e-06, "loss": 0.0003, "step": 1570 }, { "epoch": 0.5094075949367088, "grad_norm": 0.0011442819377407432, "learning_rate": 7.845745348580592e-06, "loss": 0.0, "step": 1572 }, { "epoch": 0.5100556962025317, "grad_norm": 0.0014424960827454925, "learning_rate": 7.840524622272949e-06, "loss": 0.0001, "step": 1574 }, { "epoch": 0.5107037974683545, "grad_norm": 0.05243639647960663, "learning_rate": 7.835299319313854e-06, "loss": 0.0001, "step": 1576 }, { "epoch": 0.5113518987341772, "grad_norm": 0.021167520433664322, "learning_rate": 7.830069448122313e-06, "loss": 0.0, "step": 1578 }, { "epoch": 0.512, "grad_norm": 0.006568542681634426, "learning_rate": 7.82483501712469e-06, "loss": 0.0001, "step": 1580 }, { "epoch": 0.5126481012658228, "grad_norm": 0.0223622377961874, "learning_rate": 7.819596034754696e-06, "loss": 0.0, "step": 1582 }, { "epoch": 0.5132962025316455, "grad_norm": 0.02252521552145481, "learning_rate": 7.81435250945338e-06, "loss": 0.0001, "step": 1584 }, { "epoch": 0.5139443037974684, "grad_norm": 0.0036857861559838057, "learning_rate": 7.8091044496691e-06, "loss": 0.0, "step": 1586 }, { "epoch": 0.5145924050632912, "grad_norm": 0.0021720710210502148, "learning_rate": 7.803851863857533e-06, "loss": 0.0001, "step": 1588 }, { "epoch": 0.5152405063291139, "grad_norm": 0.004408895969390869, "learning_rate": 7.798594760481639e-06, "loss": 0.0, "step": 1590 }, { "epoch": 0.5158886075949367, "grad_norm": 0.004357462283223867, "learning_rate": 7.793333148011658e-06, "loss": 0.0, "step": 1592 }, { "epoch": 0.5165367088607595, "grad_norm": 0.02102765254676342, "learning_rate": 7.7880670349251e-06, "loss": 0.0001, "step": 1594 }, { "epoch": 0.5171848101265822, "grad_norm": 0.3509201109409332, "learning_rate": 7.782796429706721e-06, "loss": 0.0003, "step": 1596 }, { "epoch": 0.5178329113924051, "grad_norm": 0.0007810965762473643, "learning_rate": 7.777521340848515e-06, "loss": 0.0, "step": 1598 }, { "epoch": 0.5184810126582279, "grad_norm": 0.11072275787591934, "learning_rate": 7.772241776849705e-06, "loss": 0.0001, "step": 1600 }, { "epoch": 0.5184810126582279, "eval_accuracy": 0.9999901331856902, "eval_loss": 3.327821468701586e-05, "eval_runtime": 102.4497, "eval_samples_per_second": 48.804, "eval_steps_per_second": 12.201, "step": 1600 }, { "epoch": 0.5191291139240506, "grad_norm": 0.4169808626174927, "learning_rate": 7.76695774621672e-06, "loss": 0.0002, "step": 1602 }, { "epoch": 0.5197772151898734, "grad_norm": 0.01579488068819046, "learning_rate": 7.761669257463188e-06, "loss": 0.0, "step": 1604 }, { "epoch": 0.5204253164556962, "grad_norm": 0.05845312774181366, "learning_rate": 7.756376319109917e-06, "loss": 0.0001, "step": 1606 }, { "epoch": 0.5210734177215189, "grad_norm": 0.011482629925012589, "learning_rate": 7.751078939684886e-06, "loss": 0.0004, "step": 1608 }, { "epoch": 0.5217215189873418, "grad_norm": 0.01010491605848074, "learning_rate": 7.74577712772323e-06, "loss": 0.0, "step": 1610 }, { "epoch": 0.5223696202531646, "grad_norm": 0.19445572793483734, "learning_rate": 7.740470891767225e-06, "loss": 0.0002, "step": 1612 }, { "epoch": 0.5230177215189873, "grad_norm": 0.03457183763384819, "learning_rate": 7.735160240366276e-06, "loss": 0.0001, "step": 1614 }, { "epoch": 0.5236658227848101, "grad_norm": 0.00079920026473701, "learning_rate": 7.729845182076896e-06, "loss": 0.0, "step": 1616 }, { "epoch": 0.5243139240506329, "grad_norm": 0.06374679505825043, "learning_rate": 7.72452572546271e-06, "loss": 0.0001, "step": 1618 }, { "epoch": 0.5249620253164557, "grad_norm": 0.015360563062131405, "learning_rate": 7.71920187909442e-06, "loss": 0.0, "step": 1620 }, { "epoch": 0.5256101265822785, "grad_norm": 0.07180308550596237, "learning_rate": 7.713873651549805e-06, "loss": 0.0003, "step": 1622 }, { "epoch": 0.5262582278481013, "grad_norm": 0.13326337933540344, "learning_rate": 7.7085410514137e-06, "loss": 0.0001, "step": 1624 }, { "epoch": 0.5269063291139241, "grad_norm": 0.005201876629143953, "learning_rate": 7.703204087277989e-06, "loss": 0.0, "step": 1626 }, { "epoch": 0.5275544303797468, "grad_norm": 0.004504207521677017, "learning_rate": 7.697862767741584e-06, "loss": 0.0001, "step": 1628 }, { "epoch": 0.5282025316455696, "grad_norm": 0.2609665095806122, "learning_rate": 7.692517101410414e-06, "loss": 0.0001, "step": 1630 }, { "epoch": 0.5288506329113924, "grad_norm": 0.00579993799328804, "learning_rate": 7.68716709689742e-06, "loss": 0.0, "step": 1632 }, { "epoch": 0.5294987341772152, "grad_norm": 0.325564444065094, "learning_rate": 7.681812762822517e-06, "loss": 0.0005, "step": 1634 }, { "epoch": 0.530146835443038, "grad_norm": 0.025982841849327087, "learning_rate": 7.676454107812608e-06, "loss": 0.0001, "step": 1636 }, { "epoch": 0.5307949367088608, "grad_norm": 0.005460761021822691, "learning_rate": 7.671091140501557e-06, "loss": 0.0, "step": 1638 }, { "epoch": 0.5314430379746835, "grad_norm": 0.008271248079836369, "learning_rate": 7.66572386953017e-06, "loss": 0.0, "step": 1640 }, { "epoch": 0.5314430379746835, "eval_accuracy": 0.9999786783875229, "eval_loss": 8.938193059293553e-05, "eval_runtime": 101.5581, "eval_samples_per_second": 49.233, "eval_steps_per_second": 12.308, "step": 1640 }, { "epoch": 0.5320911392405063, "grad_norm": 0.015748729929327965, "learning_rate": 7.660352303546192e-06, "loss": 0.0001, "step": 1642 }, { "epoch": 0.5327392405063291, "grad_norm": 0.1777859777212143, "learning_rate": 7.654976451204288e-06, "loss": 0.0002, "step": 1644 }, { "epoch": 0.5333873417721519, "grad_norm": 0.029370561242103577, "learning_rate": 7.649596321166024e-06, "loss": 0.0002, "step": 1646 }, { "epoch": 0.5340354430379747, "grad_norm": 0.018781421706080437, "learning_rate": 7.644211922099867e-06, "loss": 0.0001, "step": 1648 }, { "epoch": 0.5346835443037975, "grad_norm": 0.014029051177203655, "learning_rate": 7.638823262681155e-06, "loss": 0.0, "step": 1650 }, { "epoch": 0.5353316455696202, "grad_norm": 0.08904850482940674, "learning_rate": 7.633430351592093e-06, "loss": 0.0001, "step": 1652 }, { "epoch": 0.535979746835443, "grad_norm": 0.49554842710494995, "learning_rate": 7.6280331975217356e-06, "loss": 0.0009, "step": 1654 }, { "epoch": 0.5366278481012658, "grad_norm": 0.0705832988023758, "learning_rate": 7.622631809165972e-06, "loss": 0.0002, "step": 1656 }, { "epoch": 0.5372759493670886, "grad_norm": 0.07367483526468277, "learning_rate": 7.617226195227518e-06, "loss": 0.0001, "step": 1658 }, { "epoch": 0.5379240506329114, "grad_norm": 0.14305748045444489, "learning_rate": 7.611816364415896e-06, "loss": 0.0001, "step": 1660 }, { "epoch": 0.5385721518987342, "grad_norm": 0.11214498430490494, "learning_rate": 7.606402325447421e-06, "loss": 0.0001, "step": 1662 }, { "epoch": 0.539220253164557, "grad_norm": 0.003934099338948727, "learning_rate": 7.600984087045187e-06, "loss": 0.0004, "step": 1664 }, { "epoch": 0.5398683544303797, "grad_norm": 0.0044866688549518585, "learning_rate": 7.595561657939061e-06, "loss": 0.0001, "step": 1666 }, { "epoch": 0.5405164556962025, "grad_norm": 0.025382693856954575, "learning_rate": 7.590135046865652e-06, "loss": 0.0002, "step": 1668 }, { "epoch": 0.5411645569620254, "grad_norm": 0.08539170771837234, "learning_rate": 7.584704262568315e-06, "loss": 0.0001, "step": 1670 }, { "epoch": 0.5418126582278481, "grad_norm": 0.0013932752190157771, "learning_rate": 7.579269313797126e-06, "loss": 0.0001, "step": 1672 }, { "epoch": 0.5424607594936709, "grad_norm": 0.1252427101135254, "learning_rate": 7.573830209308872e-06, "loss": 0.0001, "step": 1674 }, { "epoch": 0.5431088607594937, "grad_norm": 0.03497989848256111, "learning_rate": 7.568386957867033e-06, "loss": 0.0001, "step": 1676 }, { "epoch": 0.5437569620253164, "grad_norm": 0.03806183114647865, "learning_rate": 7.562939568241772e-06, "loss": 0.0, "step": 1678 }, { "epoch": 0.5444050632911392, "grad_norm": 0.0008746628882363439, "learning_rate": 7.557488049209921e-06, "loss": 0.0, "step": 1680 }, { "epoch": 0.5444050632911392, "eval_accuracy": 0.9999900735137393, "eval_loss": 5.2154053264530376e-05, "eval_runtime": 101.5637, "eval_samples_per_second": 49.23, "eval_steps_per_second": 12.308, "step": 1680 }, { "epoch": 0.5450531645569621, "grad_norm": 0.18731200695037842, "learning_rate": 7.552032409554963e-06, "loss": 0.0003, "step": 1682 }, { "epoch": 0.5457012658227848, "grad_norm": 0.0005909568862989545, "learning_rate": 7.546572658067022e-06, "loss": 0.0003, "step": 1684 }, { "epoch": 0.5463493670886076, "grad_norm": 0.001809874433092773, "learning_rate": 7.541108803542846e-06, "loss": 0.0, "step": 1686 }, { "epoch": 0.5469974683544304, "grad_norm": 0.002301193308085203, "learning_rate": 7.535640854785793e-06, "loss": 0.0001, "step": 1688 }, { "epoch": 0.5476455696202531, "grad_norm": 0.012999876402318478, "learning_rate": 7.530168820605819e-06, "loss": 0.0, "step": 1690 }, { "epoch": 0.548293670886076, "grad_norm": 0.011647144332528114, "learning_rate": 7.5246927098194636e-06, "loss": 0.0, "step": 1692 }, { "epoch": 0.5489417721518988, "grad_norm": 0.001015910878777504, "learning_rate": 7.51921253124983e-06, "loss": 0.0002, "step": 1694 }, { "epoch": 0.5495898734177215, "grad_norm": 0.1255691796541214, "learning_rate": 7.5137282937265796e-06, "loss": 0.0001, "step": 1696 }, { "epoch": 0.5502379746835443, "grad_norm": 0.06820951402187347, "learning_rate": 7.508240006085914e-06, "loss": 0.0, "step": 1698 }, { "epoch": 0.5508860759493671, "grad_norm": 0.09389737248420715, "learning_rate": 7.502747677170556e-06, "loss": 0.0007, "step": 1700 }, { "epoch": 0.5515341772151898, "grad_norm": 0.1673169583082199, "learning_rate": 7.497251315829744e-06, "loss": 0.0001, "step": 1702 }, { "epoch": 0.5521822784810126, "grad_norm": 0.0026740762405097485, "learning_rate": 7.4917509309192125e-06, "loss": 0.0, "step": 1704 }, { "epoch": 0.5528303797468355, "grad_norm": 0.005095075815916061, "learning_rate": 7.486246531301178e-06, "loss": 0.0, "step": 1706 }, { "epoch": 0.5534784810126582, "grad_norm": 0.06378457695245743, "learning_rate": 7.480738125844322e-06, "loss": 0.0001, "step": 1708 }, { "epoch": 0.554126582278481, "grad_norm": 0.0014856786001473665, "learning_rate": 7.475225723423789e-06, "loss": 0.0001, "step": 1710 }, { "epoch": 0.5547746835443038, "grad_norm": 0.0007130180601961911, "learning_rate": 7.469709332921155e-06, "loss": 0.0, "step": 1712 }, { "epoch": 0.5554227848101266, "grad_norm": 0.04547552391886711, "learning_rate": 7.464188963224428e-06, "loss": 0.0001, "step": 1714 }, { "epoch": 0.5560708860759493, "grad_norm": 0.10711254179477692, "learning_rate": 7.45866462322802e-06, "loss": 0.0001, "step": 1716 }, { "epoch": 0.5567189873417722, "grad_norm": 0.00046518215094693005, "learning_rate": 7.453136321832746e-06, "loss": 0.0, "step": 1718 }, { "epoch": 0.557367088607595, "grad_norm": 0.0007845875225029886, "learning_rate": 7.447604067945803e-06, "loss": 0.0, "step": 1720 }, { "epoch": 0.557367088607595, "eval_accuracy": 0.999991254640404, "eval_loss": 4.263612208887935e-05, "eval_runtime": 100.7388, "eval_samples_per_second": 49.633, "eval_steps_per_second": 12.408, "step": 1720 }, { "epoch": 0.5580151898734177, "grad_norm": 0.20458361506462097, "learning_rate": 7.442067870480752e-06, "loss": 0.0004, "step": 1722 }, { "epoch": 0.5586632911392405, "grad_norm": 0.00046549775288440287, "learning_rate": 7.436527738357514e-06, "loss": 0.0003, "step": 1724 }, { "epoch": 0.5593113924050633, "grad_norm": 0.001776853110641241, "learning_rate": 7.430983680502344e-06, "loss": 0.0, "step": 1726 }, { "epoch": 0.559959493670886, "grad_norm": 0.00032903251121751964, "learning_rate": 7.425435705847825e-06, "loss": 0.0, "step": 1728 }, { "epoch": 0.5606075949367089, "grad_norm": 0.012358637526631355, "learning_rate": 7.419883823332851e-06, "loss": 0.0, "step": 1730 }, { "epoch": 0.5612556962025317, "grad_norm": 0.12481050938367844, "learning_rate": 7.414328041902611e-06, "loss": 0.0003, "step": 1732 }, { "epoch": 0.5619037974683544, "grad_norm": 0.04864579811692238, "learning_rate": 7.408768370508577e-06, "loss": 0.0001, "step": 1734 }, { "epoch": 0.5625518987341772, "grad_norm": 0.002378104953095317, "learning_rate": 7.403204818108487e-06, "loss": 0.0, "step": 1736 }, { "epoch": 0.5632, "grad_norm": 0.17984673380851746, "learning_rate": 7.397637393666333e-06, "loss": 0.0001, "step": 1738 }, { "epoch": 0.5638481012658227, "grad_norm": 0.3844810128211975, "learning_rate": 7.392066106152347e-06, "loss": 0.0004, "step": 1740 }, { "epoch": 0.5644962025316456, "grad_norm": 0.010118959471583366, "learning_rate": 7.386490964542983e-06, "loss": 0.0, "step": 1742 }, { "epoch": 0.5651443037974684, "grad_norm": 0.003273730631917715, "learning_rate": 7.380911977820907e-06, "loss": 0.0, "step": 1744 }, { "epoch": 0.5657924050632911, "grad_norm": 0.0061654625460505486, "learning_rate": 7.3753291549749764e-06, "loss": 0.0, "step": 1746 }, { "epoch": 0.5664405063291139, "grad_norm": 0.009560778737068176, "learning_rate": 7.369742505000232e-06, "loss": 0.0, "step": 1748 }, { "epoch": 0.5670886075949367, "grad_norm": 0.012677104212343693, "learning_rate": 7.364152036897883e-06, "loss": 0.0, "step": 1750 }, { "epoch": 0.5677367088607594, "grad_norm": 0.0025769511703401804, "learning_rate": 7.358557759675284e-06, "loss": 0.0, "step": 1752 }, { "epoch": 0.5683848101265823, "grad_norm": 0.603306233882904, "learning_rate": 7.352959682345936e-06, "loss": 0.0021, "step": 1754 }, { "epoch": 0.5690329113924051, "grad_norm": 0.18951795995235443, "learning_rate": 7.347357813929455e-06, "loss": 0.0007, "step": 1756 }, { "epoch": 0.5696810126582279, "grad_norm": 0.0019815692212432623, "learning_rate": 7.341752163451568e-06, "loss": 0.0001, "step": 1758 }, { "epoch": 0.5703291139240506, "grad_norm": 0.003150013741105795, "learning_rate": 7.3361427399440945e-06, "loss": 0.0, "step": 1760 }, { "epoch": 0.5703291139240506, "eval_accuracy": 0.9999741535564335, "eval_loss": 0.0001262763689737767, "eval_runtime": 100.825, "eval_samples_per_second": 49.591, "eval_steps_per_second": 12.398, "step": 1760 }, { "epoch": 0.5709772151898734, "grad_norm": 0.009336372837424278, "learning_rate": 7.330529552444934e-06, "loss": 0.0, "step": 1762 }, { "epoch": 0.5716253164556963, "grad_norm": 0.05104972794651985, "learning_rate": 7.324912609998054e-06, "loss": 0.0001, "step": 1764 }, { "epoch": 0.572273417721519, "grad_norm": 0.13936738669872284, "learning_rate": 7.319291921653464e-06, "loss": 0.0012, "step": 1766 }, { "epoch": 0.5729215189873418, "grad_norm": 0.12685565650463104, "learning_rate": 7.313667496467216e-06, "loss": 0.0005, "step": 1768 }, { "epoch": 0.5735696202531646, "grad_norm": 0.004788834135979414, "learning_rate": 7.308039343501381e-06, "loss": 0.0, "step": 1770 }, { "epoch": 0.5742177215189873, "grad_norm": 0.12622016668319702, "learning_rate": 7.302407471824034e-06, "loss": 0.0005, "step": 1772 }, { "epoch": 0.5748658227848101, "grad_norm": 0.004526766948401928, "learning_rate": 7.296771890509242e-06, "loss": 0.0, "step": 1774 }, { "epoch": 0.575513924050633, "grad_norm": 0.0036531174555420876, "learning_rate": 7.291132608637053e-06, "loss": 0.0, "step": 1776 }, { "epoch": 0.5761620253164557, "grad_norm": 0.043615106493234634, "learning_rate": 7.285489635293472e-06, "loss": 0.0004, "step": 1778 }, { "epoch": 0.5768101265822785, "grad_norm": 0.052893441170454025, "learning_rate": 7.279842979570454e-06, "loss": 0.0002, "step": 1780 }, { "epoch": 0.5774582278481013, "grad_norm": 0.02373715490102768, "learning_rate": 7.27419265056589e-06, "loss": 0.0002, "step": 1782 }, { "epoch": 0.578106329113924, "grad_norm": 0.002014788566157222, "learning_rate": 7.268538657383581e-06, "loss": 0.0, "step": 1784 }, { "epoch": 0.5787544303797468, "grad_norm": 0.06467452645301819, "learning_rate": 7.262881009133242e-06, "loss": 0.0002, "step": 1786 }, { "epoch": 0.5794025316455697, "grad_norm": 0.005883384495973587, "learning_rate": 7.2572197149304715e-06, "loss": 0.0002, "step": 1788 }, { "epoch": 0.5800506329113924, "grad_norm": 0.031703103333711624, "learning_rate": 7.251554783896741e-06, "loss": 0.0001, "step": 1790 }, { "epoch": 0.5806987341772152, "grad_norm": 0.0037112203426659107, "learning_rate": 7.245886225159386e-06, "loss": 0.0, "step": 1792 }, { "epoch": 0.581346835443038, "grad_norm": 0.06743831187486649, "learning_rate": 7.240214047851583e-06, "loss": 0.0001, "step": 1794 }, { "epoch": 0.5819949367088607, "grad_norm": 0.00042398099321871996, "learning_rate": 7.234538261112342e-06, "loss": 0.0, "step": 1796 }, { "epoch": 0.5826430379746835, "grad_norm": 0.0017583374865353107, "learning_rate": 7.2288588740864855e-06, "loss": 0.0, "step": 1798 }, { "epoch": 0.5832911392405064, "grad_norm": 0.06879211962223053, "learning_rate": 7.223175895924638e-06, "loss": 0.0, "step": 1800 }, { "epoch": 0.5832911392405064, "eval_accuracy": 0.9999984173205008, "eval_loss": 1.2915682418679353e-05, "eval_runtime": 100.8375, "eval_samples_per_second": 49.585, "eval_steps_per_second": 12.396, "step": 1800 }, { "epoch": 0.5839392405063291, "grad_norm": 0.0013730882201343775, "learning_rate": 7.217489335783212e-06, "loss": 0.0, "step": 1802 }, { "epoch": 0.5845873417721519, "grad_norm": 0.00027484947349876165, "learning_rate": 7.211799202824389e-06, "loss": 0.0, "step": 1804 }, { "epoch": 0.5852354430379747, "grad_norm": 0.11204996705055237, "learning_rate": 7.206105506216107e-06, "loss": 0.0005, "step": 1806 }, { "epoch": 0.5858835443037975, "grad_norm": 0.010664002038538456, "learning_rate": 7.200408255132046e-06, "loss": 0.0, "step": 1808 }, { "epoch": 0.5865316455696202, "grad_norm": 0.02932300604879856, "learning_rate": 7.194707458751615e-06, "loss": 0.0, "step": 1810 }, { "epoch": 0.587179746835443, "grad_norm": 0.1193203330039978, "learning_rate": 7.189003126259932e-06, "loss": 0.0002, "step": 1812 }, { "epoch": 0.5878278481012659, "grad_norm": 0.020759958773851395, "learning_rate": 7.1832952668478155e-06, "loss": 0.0, "step": 1814 }, { "epoch": 0.5884759493670886, "grad_norm": 0.07171918451786041, "learning_rate": 7.177583889711763e-06, "loss": 0.0001, "step": 1816 }, { "epoch": 0.5891240506329114, "grad_norm": 0.0004759156727232039, "learning_rate": 7.1718690040539404e-06, "loss": 0.0001, "step": 1818 }, { "epoch": 0.5897721518987342, "grad_norm": 0.0007953441818244755, "learning_rate": 7.166150619082171e-06, "loss": 0.0, "step": 1820 }, { "epoch": 0.5904202531645569, "grad_norm": 0.008654981851577759, "learning_rate": 7.160428744009913e-06, "loss": 0.0001, "step": 1822 }, { "epoch": 0.5910683544303797, "grad_norm": 0.10353202372789383, "learning_rate": 7.154703388056246e-06, "loss": 0.0001, "step": 1824 }, { "epoch": 0.5917164556962026, "grad_norm": 0.0015258926432579756, "learning_rate": 7.148974560445859e-06, "loss": 0.0001, "step": 1826 }, { "epoch": 0.5923645569620253, "grad_norm": 0.00039332304731942713, "learning_rate": 7.143242270409039e-06, "loss": 0.0, "step": 1828 }, { "epoch": 0.5930126582278481, "grad_norm": 0.03245238959789276, "learning_rate": 7.137506527181643e-06, "loss": 0.0, "step": 1830 }, { "epoch": 0.5936607594936709, "grad_norm": 0.004285217262804508, "learning_rate": 7.131767340005102e-06, "loss": 0.0, "step": 1832 }, { "epoch": 0.5943088607594936, "grad_norm": 0.001428083865903318, "learning_rate": 7.126024718126388e-06, "loss": 0.0, "step": 1834 }, { "epoch": 0.5949569620253164, "grad_norm": 0.005078149959445, "learning_rate": 7.12027867079801e-06, "loss": 0.0, "step": 1836 }, { "epoch": 0.5956050632911393, "grad_norm": 0.0003838093834929168, "learning_rate": 7.114529207277996e-06, "loss": 0.0001, "step": 1838 }, { "epoch": 0.596253164556962, "grad_norm": 0.0006412325892597437, "learning_rate": 7.1087763368298764e-06, "loss": 0.0, "step": 1840 }, { "epoch": 0.596253164556962, "eval_accuracy": 0.9999988402108444, "eval_loss": 1.187924590340117e-05, "eval_runtime": 100.0044, "eval_samples_per_second": 49.998, "eval_steps_per_second": 12.499, "step": 1840 }, { "epoch": 0.5969012658227848, "grad_norm": 0.002054619137197733, "learning_rate": 7.103020068722675e-06, "loss": 0.0, "step": 1842 }, { "epoch": 0.5975493670886076, "grad_norm": 0.0006610119016841054, "learning_rate": 7.0972604122308865e-06, "loss": 0.0, "step": 1844 }, { "epoch": 0.5981974683544303, "grad_norm": 0.0009412910440005362, "learning_rate": 7.0914973766344645e-06, "loss": 0.0, "step": 1846 }, { "epoch": 0.5988455696202531, "grad_norm": 0.0044436645694077015, "learning_rate": 7.085730971218809e-06, "loss": 0.0, "step": 1848 }, { "epoch": 0.599493670886076, "grad_norm": 0.0017420395743101835, "learning_rate": 7.079961205274749e-06, "loss": 0.0, "step": 1850 }, { "epoch": 0.6001417721518988, "grad_norm": 0.0020820838399231434, "learning_rate": 7.074188088098528e-06, "loss": 0.0, "step": 1852 }, { "epoch": 0.6007898734177215, "grad_norm": 0.00021655585442204028, "learning_rate": 7.0684116289917885e-06, "loss": 0.0, "step": 1854 }, { "epoch": 0.6014379746835443, "grad_norm": 0.002999892458319664, "learning_rate": 7.062631837261556e-06, "loss": 0.0, "step": 1856 }, { "epoch": 0.6020860759493671, "grad_norm": 0.00042861109250225127, "learning_rate": 7.05684872222023e-06, "loss": 0.0001, "step": 1858 }, { "epoch": 0.6027341772151898, "grad_norm": 0.0014900608221068978, "learning_rate": 7.05106229318556e-06, "loss": 0.0, "step": 1860 }, { "epoch": 0.6033822784810127, "grad_norm": 0.03731833025813103, "learning_rate": 7.045272559480636e-06, "loss": 0.0, "step": 1862 }, { "epoch": 0.6040303797468355, "grad_norm": 0.004172660410404205, "learning_rate": 7.039479530433875e-06, "loss": 0.0, "step": 1864 }, { "epoch": 0.6046784810126582, "grad_norm": 0.00023338549362961203, "learning_rate": 7.033683215379002e-06, "loss": 0.0, "step": 1866 }, { "epoch": 0.605326582278481, "grad_norm": 0.06045857444405556, "learning_rate": 7.027883623655035e-06, "loss": 0.0001, "step": 1868 }, { "epoch": 0.6059746835443038, "grad_norm": 0.047318872064352036, "learning_rate": 7.022080764606272e-06, "loss": 0.0001, "step": 1870 }, { "epoch": 0.6066227848101265, "grad_norm": 0.030598580837249756, "learning_rate": 7.016274647582276e-06, "loss": 0.0, "step": 1872 }, { "epoch": 0.6072708860759494, "grad_norm": 0.0005781942745670676, "learning_rate": 7.010465281937859e-06, "loss": 0.0, "step": 1874 }, { "epoch": 0.6079189873417722, "grad_norm": 0.10540540516376495, "learning_rate": 7.004652677033069e-06, "loss": 0.0001, "step": 1876 }, { "epoch": 0.6085670886075949, "grad_norm": 0.00097875006031245, "learning_rate": 6.99883684223317e-06, "loss": 0.0, "step": 1878 }, { "epoch": 0.6092151898734177, "grad_norm": 0.09228157252073288, "learning_rate": 6.993017786908631e-06, "loss": 0.0, "step": 1880 }, { "epoch": 0.6092151898734177, "eval_accuracy": 0.9999984465216039, "eval_loss": 9.824881090025883e-06, "eval_runtime": 101.3407, "eval_samples_per_second": 49.339, "eval_steps_per_second": 12.335, "step": 1880 }, { "epoch": 0.6098632911392405, "grad_norm": 0.0008249924867413938, "learning_rate": 6.9871955204351094e-06, "loss": 0.0, "step": 1882 }, { "epoch": 0.6105113924050632, "grad_norm": 0.02752879448235035, "learning_rate": 6.9813700521934394e-06, "loss": 0.0, "step": 1884 }, { "epoch": 0.6111594936708861, "grad_norm": 0.002536400454118848, "learning_rate": 6.9755413915696105e-06, "loss": 0.0, "step": 1886 }, { "epoch": 0.6118075949367089, "grad_norm": 0.0435812771320343, "learning_rate": 6.9697095479547564e-06, "loss": 0.0, "step": 1888 }, { "epoch": 0.6124556962025316, "grad_norm": 0.02748386561870575, "learning_rate": 6.963874530745141e-06, "loss": 0.0, "step": 1890 }, { "epoch": 0.6131037974683544, "grad_norm": 0.09339939802885056, "learning_rate": 6.95803634934214e-06, "loss": 0.0001, "step": 1892 }, { "epoch": 0.6137518987341772, "grad_norm": 0.016583425924181938, "learning_rate": 6.952195013152227e-06, "loss": 0.0, "step": 1894 }, { "epoch": 0.6144, "grad_norm": 0.00017733302956912667, "learning_rate": 6.946350531586959e-06, "loss": 0.0, "step": 1896 }, { "epoch": 0.6150481012658228, "grad_norm": 0.00218210369348526, "learning_rate": 6.940502914062961e-06, "loss": 0.0001, "step": 1898 }, { "epoch": 0.6156962025316456, "grad_norm": 0.0015262187225744128, "learning_rate": 6.934652170001911e-06, "loss": 0.0002, "step": 1900 }, { "epoch": 0.6163443037974684, "grad_norm": 0.00028145458782091737, "learning_rate": 6.928798308830524e-06, "loss": 0.0, "step": 1902 }, { "epoch": 0.6169924050632911, "grad_norm": 0.09535115212202072, "learning_rate": 6.922941339980538e-06, "loss": 0.0, "step": 1904 }, { "epoch": 0.6176405063291139, "grad_norm": 0.000556255632545799, "learning_rate": 6.917081272888697e-06, "loss": 0.0002, "step": 1906 }, { "epoch": 0.6182886075949368, "grad_norm": 0.0006218308117240667, "learning_rate": 6.911218116996738e-06, "loss": 0.0, "step": 1908 }, { "epoch": 0.6189367088607595, "grad_norm": 0.0027631232514977455, "learning_rate": 6.905351881751372e-06, "loss": 0.0, "step": 1910 }, { "epoch": 0.6195848101265823, "grad_norm": 0.03704715520143509, "learning_rate": 6.899482576604275e-06, "loss": 0.0, "step": 1912 }, { "epoch": 0.6202329113924051, "grad_norm": 0.060138948261737823, "learning_rate": 6.893610211012067e-06, "loss": 0.0002, "step": 1914 }, { "epoch": 0.6208810126582278, "grad_norm": 0.09946001321077347, "learning_rate": 6.887734794436301e-06, "loss": 0.0001, "step": 1916 }, { "epoch": 0.6215291139240506, "grad_norm": 0.011256850324571133, "learning_rate": 6.881856336343442e-06, "loss": 0.0, "step": 1918 }, { "epoch": 0.6221772151898735, "grad_norm": 0.03586261346936226, "learning_rate": 6.8759748462048595e-06, "loss": 0.0001, "step": 1920 }, { "epoch": 0.6221772151898735, "eval_accuracy": 0.9999977681314496, "eval_loss": 1.365809021081077e-05, "eval_runtime": 101.3963, "eval_samples_per_second": 49.311, "eval_steps_per_second": 12.328, "step": 1920 }, { "epoch": 0.6228253164556962, "grad_norm": 0.0019096017349511385, "learning_rate": 6.870090333496807e-06, "loss": 0.0, "step": 1922 }, { "epoch": 0.623473417721519, "grad_norm": 0.0029806571546941996, "learning_rate": 6.864202807700407e-06, "loss": 0.0, "step": 1924 }, { "epoch": 0.6241215189873418, "grad_norm": 0.022626426070928574, "learning_rate": 6.858312278301638e-06, "loss": 0.0, "step": 1926 }, { "epoch": 0.6247696202531645, "grad_norm": 0.004040713422000408, "learning_rate": 6.852418754791317e-06, "loss": 0.0, "step": 1928 }, { "epoch": 0.6254177215189873, "grad_norm": 0.0006340648396871984, "learning_rate": 6.8465222466650835e-06, "loss": 0.0, "step": 1930 }, { "epoch": 0.6260658227848102, "grad_norm": 0.007957760244607925, "learning_rate": 6.840622763423391e-06, "loss": 0.0, "step": 1932 }, { "epoch": 0.6267139240506329, "grad_norm": 0.0036092489026486874, "learning_rate": 6.83472031457148e-06, "loss": 0.0, "step": 1934 }, { "epoch": 0.6273620253164557, "grad_norm": 0.0009143880452029407, "learning_rate": 6.828814909619374e-06, "loss": 0.0, "step": 1936 }, { "epoch": 0.6280101265822785, "grad_norm": 0.0002480478142388165, "learning_rate": 6.822906558081856e-06, "loss": 0.0, "step": 1938 }, { "epoch": 0.6286582278481012, "grad_norm": 0.0009710952290333807, "learning_rate": 6.81699526947846e-06, "loss": 0.0, "step": 1940 }, { "epoch": 0.629306329113924, "grad_norm": 0.07902362197637558, "learning_rate": 6.81108105333345e-06, "loss": 0.0, "step": 1942 }, { "epoch": 0.6299544303797469, "grad_norm": 0.0639423131942749, "learning_rate": 6.8051639191758065e-06, "loss": 0.0, "step": 1944 }, { "epoch": 0.6306025316455697, "grad_norm": 0.00019927980611100793, "learning_rate": 6.799243876539213e-06, "loss": 0.0, "step": 1946 }, { "epoch": 0.6312506329113924, "grad_norm": 0.0010397526202723384, "learning_rate": 6.793320934962039e-06, "loss": 0.0, "step": 1948 }, { "epoch": 0.6318987341772152, "grad_norm": 0.01125252153724432, "learning_rate": 6.787395103987323e-06, "loss": 0.0001, "step": 1950 }, { "epoch": 0.632546835443038, "grad_norm": 0.0007871537236496806, "learning_rate": 6.781466393162761e-06, "loss": 0.0, "step": 1952 }, { "epoch": 0.6331949367088607, "grad_norm": 0.002556710736826062, "learning_rate": 6.775534812040686e-06, "loss": 0.0, "step": 1954 }, { "epoch": 0.6338430379746836, "grad_norm": 0.000299537496175617, "learning_rate": 6.76960037017806e-06, "loss": 0.0, "step": 1956 }, { "epoch": 0.6344911392405064, "grad_norm": 0.0010419267928227782, "learning_rate": 6.763663077136451e-06, "loss": 0.0, "step": 1958 }, { "epoch": 0.6351392405063291, "grad_norm": 0.09013451635837555, "learning_rate": 6.757722942482022e-06, "loss": 0.0001, "step": 1960 }, { "epoch": 0.6351392405063291, "eval_accuracy": 0.9999996055226824, "eval_loss": 5.647259058605414e-06, "eval_runtime": 101.9211, "eval_samples_per_second": 49.058, "eval_steps_per_second": 12.264, "step": 1960 }, { "epoch": 0.6357873417721519, "grad_norm": 0.00781452190130949, "learning_rate": 6.751779975785515e-06, "loss": 0.0, "step": 1962 }, { "epoch": 0.6364354430379747, "grad_norm": 0.040681142359972, "learning_rate": 6.745834186622232e-06, "loss": 0.0, "step": 1964 }, { "epoch": 0.6370835443037974, "grad_norm": 0.0002076182863675058, "learning_rate": 6.739885584572026e-06, "loss": 0.0, "step": 1966 }, { "epoch": 0.6377316455696203, "grad_norm": 0.00034984247758984566, "learning_rate": 6.733934179219281e-06, "loss": 0.0, "step": 1968 }, { "epoch": 0.6383797468354431, "grad_norm": 0.0007839368190616369, "learning_rate": 6.727979980152899e-06, "loss": 0.0, "step": 1970 }, { "epoch": 0.6390278481012658, "grad_norm": 0.0017967225285246968, "learning_rate": 6.7220229969662776e-06, "loss": 0.0001, "step": 1972 }, { "epoch": 0.6396759493670886, "grad_norm": 0.0005148016498424113, "learning_rate": 6.716063239257307e-06, "loss": 0.0, "step": 1974 }, { "epoch": 0.6403240506329114, "grad_norm": 0.0007883037906140089, "learning_rate": 6.710100716628345e-06, "loss": 0.0, "step": 1976 }, { "epoch": 0.6409721518987341, "grad_norm": 0.0005258942837826908, "learning_rate": 6.704135438686203e-06, "loss": 0.0, "step": 1978 }, { "epoch": 0.641620253164557, "grad_norm": 0.00028024116181768477, "learning_rate": 6.698167415042135e-06, "loss": 0.0, "step": 1980 }, { "epoch": 0.6422683544303798, "grad_norm": 0.0011886393185704947, "learning_rate": 6.692196655311814e-06, "loss": 0.0, "step": 1982 }, { "epoch": 0.6429164556962025, "grad_norm": 0.04780174791812897, "learning_rate": 6.686223169115328e-06, "loss": 0.0, "step": 1984 }, { "epoch": 0.6435645569620253, "grad_norm": 0.060146622359752655, "learning_rate": 6.680246966077151e-06, "loss": 0.0, "step": 1986 }, { "epoch": 0.6442126582278481, "grad_norm": 0.00024338187358807772, "learning_rate": 6.674268055826139e-06, "loss": 0.0, "step": 1988 }, { "epoch": 0.6448607594936708, "grad_norm": 0.014128066599369049, "learning_rate": 6.6682864479955075e-06, "loss": 0.0, "step": 1990 }, { "epoch": 0.6455088607594937, "grad_norm": 0.014804531820118427, "learning_rate": 6.66230215222282e-06, "loss": 0.0, "step": 1992 }, { "epoch": 0.6461569620253165, "grad_norm": 0.0006120464531704783, "learning_rate": 6.656315178149971e-06, "loss": 0.0, "step": 1994 }, { "epoch": 0.6468050632911393, "grad_norm": 0.000343716616043821, "learning_rate": 6.650325535423166e-06, "loss": 0.0, "step": 1996 }, { "epoch": 0.647453164556962, "grad_norm": 0.0001859806216089055, "learning_rate": 6.644333233692917e-06, "loss": 0.0, "step": 1998 }, { "epoch": 0.6481012658227848, "grad_norm": 0.030580680817365646, "learning_rate": 6.638338282614014e-06, "loss": 0.0, "step": 2000 }, { "epoch": 0.6481012658227848, "eval_accuracy": 0.999998421308061, "eval_loss": 1.1490382348711137e-05, "eval_runtime": 101.8472, "eval_samples_per_second": 49.093, "eval_steps_per_second": 12.273, "step": 2000 }, { "epoch": 0.6487493670886076, "grad_norm": 0.0011633732356131077, "learning_rate": 6.6323406918455205e-06, "loss": 0.0, "step": 2002 }, { "epoch": 0.6493974683544304, "grad_norm": 0.0025772363878786564, "learning_rate": 6.6263404710507495e-06, "loss": 0.0, "step": 2004 }, { "epoch": 0.6500455696202532, "grad_norm": 0.002858225954696536, "learning_rate": 6.6203376298972535e-06, "loss": 0.0, "step": 2006 }, { "epoch": 0.650693670886076, "grad_norm": 0.0053984252735972404, "learning_rate": 6.614332178056806e-06, "loss": 0.0, "step": 2008 }, { "epoch": 0.6513417721518987, "grad_norm": 0.000622738036327064, "learning_rate": 6.608324125205389e-06, "loss": 0.0, "step": 2010 }, { "epoch": 0.6519898734177215, "grad_norm": 0.004378859885036945, "learning_rate": 6.60231348102317e-06, "loss": 0.0, "step": 2012 }, { "epoch": 0.6526379746835443, "grad_norm": 0.0009536651195958257, "learning_rate": 6.596300255194496e-06, "loss": 0.0, "step": 2014 }, { "epoch": 0.653286075949367, "grad_norm": 0.001671510748565197, "learning_rate": 6.590284457407876e-06, "loss": 0.0, "step": 2016 }, { "epoch": 0.6539341772151899, "grad_norm": 0.0006021432927809656, "learning_rate": 6.5842660973559545e-06, "loss": 0.0, "step": 2018 }, { "epoch": 0.6545822784810127, "grad_norm": 0.00020325230434536934, "learning_rate": 6.578245184735513e-06, "loss": 0.0, "step": 2020 }, { "epoch": 0.6552303797468354, "grad_norm": 0.00012371053162496537, "learning_rate": 6.572221729247441e-06, "loss": 0.0, "step": 2022 }, { "epoch": 0.6558784810126582, "grad_norm": 0.014866070821881294, "learning_rate": 6.5661957405967255e-06, "loss": 0.0, "step": 2024 }, { "epoch": 0.656526582278481, "grad_norm": 0.009082837030291557, "learning_rate": 6.560167228492436e-06, "loss": 0.0, "step": 2026 }, { "epoch": 0.6571746835443038, "grad_norm": 0.004811509046703577, "learning_rate": 6.554136202647707e-06, "loss": 0.0, "step": 2028 }, { "epoch": 0.6578227848101266, "grad_norm": 0.00021759387163911015, "learning_rate": 6.548102672779725e-06, "loss": 0.0, "step": 2030 }, { "epoch": 0.6584708860759494, "grad_norm": 0.001678065280430019, "learning_rate": 6.5420666486097084e-06, "loss": 0.0, "step": 2032 }, { "epoch": 0.6591189873417721, "grad_norm": 0.00032246127375401556, "learning_rate": 6.536028139862895e-06, "loss": 0.0, "step": 2034 }, { "epoch": 0.6597670886075949, "grad_norm": 0.05951784551143646, "learning_rate": 6.529987156268527e-06, "loss": 0.0, "step": 2036 }, { "epoch": 0.6604151898734177, "grad_norm": 0.022436069324612617, "learning_rate": 6.523943707559832e-06, "loss": 0.0, "step": 2038 }, { "epoch": 0.6610632911392406, "grad_norm": 0.0005385543336160481, "learning_rate": 6.517897803474011e-06, "loss": 0.0, "step": 2040 }, { "epoch": 0.6610632911392406, "eval_accuracy": 0.9999992102657654, "eval_loss": 6.353677235892974e-06, "eval_runtime": 105.7714, "eval_samples_per_second": 47.272, "eval_steps_per_second": 11.818, "step": 2040 }, { "epoch": 0.6617113924050633, "grad_norm": 0.002471289364621043, "learning_rate": 6.5118494537522235e-06, "loss": 0.0, "step": 2042 }, { "epoch": 0.6623594936708861, "grad_norm": 0.15845777094364166, "learning_rate": 6.505798668139563e-06, "loss": 0.0, "step": 2044 }, { "epoch": 0.6630075949367089, "grad_norm": 0.004233754705637693, "learning_rate": 6.499745456385054e-06, "loss": 0.0, "step": 2046 }, { "epoch": 0.6636556962025316, "grad_norm": 0.06344658881425858, "learning_rate": 6.493689828241625e-06, "loss": 0.0001, "step": 2048 }, { "epoch": 0.6643037974683544, "grad_norm": 0.05400916561484337, "learning_rate": 6.4876317934661036e-06, "loss": 0.0, "step": 2050 }, { "epoch": 0.6649518987341773, "grad_norm": 0.025929005816578865, "learning_rate": 6.481571361819189e-06, "loss": 0.0, "step": 2052 }, { "epoch": 0.6656, "grad_norm": 0.0014809060376137495, "learning_rate": 6.475508543065445e-06, "loss": 0.0, "step": 2054 }, { "epoch": 0.6662481012658228, "grad_norm": 0.016997920349240303, "learning_rate": 6.469443346973281e-06, "loss": 0.0, "step": 2056 }, { "epoch": 0.6668962025316456, "grad_norm": 0.00024033906811382622, "learning_rate": 6.463375783314938e-06, "loss": 0.0, "step": 2058 }, { "epoch": 0.6675443037974683, "grad_norm": 0.001820797217078507, "learning_rate": 6.457305861866471e-06, "loss": 0.0, "step": 2060 }, { "epoch": 0.6681924050632911, "grad_norm": 0.034896451979875565, "learning_rate": 6.451233592407732e-06, "loss": 0.0, "step": 2062 }, { "epoch": 0.668840506329114, "grad_norm": 0.00012074632104486227, "learning_rate": 6.445158984722358e-06, "loss": 0.0, "step": 2064 }, { "epoch": 0.6694886075949367, "grad_norm": 0.0386367104947567, "learning_rate": 6.439082048597755e-06, "loss": 0.0, "step": 2066 }, { "epoch": 0.6701367088607595, "grad_norm": 0.0002681062906049192, "learning_rate": 6.433002793825076e-06, "loss": 0.0, "step": 2068 }, { "epoch": 0.6707848101265823, "grad_norm": 0.0019434065325185657, "learning_rate": 6.426921230199215e-06, "loss": 0.0, "step": 2070 }, { "epoch": 0.671432911392405, "grad_norm": 0.006111921276897192, "learning_rate": 6.420837367518781e-06, "loss": 0.0, "step": 2072 }, { "epoch": 0.6720810126582278, "grad_norm": 0.00020428838615771383, "learning_rate": 6.414751215586091e-06, "loss": 0.0, "step": 2074 }, { "epoch": 0.6727291139240507, "grad_norm": 0.0007174932979978621, "learning_rate": 6.408662784207149e-06, "loss": 0.0, "step": 2076 }, { "epoch": 0.6733772151898734, "grad_norm": 0.00031421848689205945, "learning_rate": 6.402572083191632e-06, "loss": 0.0, "step": 2078 }, { "epoch": 0.6740253164556962, "grad_norm": 0.0009968363447114825, "learning_rate": 6.396479122352872e-06, "loss": 0.0, "step": 2080 }, { "epoch": 0.6740253164556962, "eval_accuracy": 0.9999969388560157, "eval_loss": 1.1616197298280895e-05, "eval_runtime": 105.7511, "eval_samples_per_second": 47.281, "eval_steps_per_second": 11.82, "step": 2080 }, { "epoch": 0.674673417721519, "grad_norm": 0.012512893415987492, "learning_rate": 6.390383911507845e-06, "loss": 0.0, "step": 2082 }, { "epoch": 0.6753215189873417, "grad_norm": 0.02030804194509983, "learning_rate": 6.384286460477149e-06, "loss": 0.0, "step": 2084 }, { "epoch": 0.6759696202531645, "grad_norm": 0.00017589876370038837, "learning_rate": 6.378186779084996e-06, "loss": 0.0, "step": 2086 }, { "epoch": 0.6766177215189874, "grad_norm": 0.001576538779772818, "learning_rate": 6.3720848771591884e-06, "loss": 0.0, "step": 2088 }, { "epoch": 0.6772658227848102, "grad_norm": 9.243958629667759e-05, "learning_rate": 6.3659807645311056e-06, "loss": 0.0, "step": 2090 }, { "epoch": 0.6779139240506329, "grad_norm": 0.00014198373537510633, "learning_rate": 6.359874451035688e-06, "loss": 0.0, "step": 2092 }, { "epoch": 0.6785620253164557, "grad_norm": 0.0007102972595021129, "learning_rate": 6.3537659465114275e-06, "loss": 0.0, "step": 2094 }, { "epoch": 0.6792101265822785, "grad_norm": 0.00011982554860878736, "learning_rate": 6.34765526080034e-06, "loss": 0.0001, "step": 2096 }, { "epoch": 0.6798582278481012, "grad_norm": 0.00011331916903145611, "learning_rate": 6.34154240374796e-06, "loss": 0.0, "step": 2098 }, { "epoch": 0.6805063291139241, "grad_norm": 0.00014351024583447725, "learning_rate": 6.33542738520332e-06, "loss": 0.0, "step": 2100 }, { "epoch": 0.6811544303797469, "grad_norm": 0.0023922757245600224, "learning_rate": 6.329310215018931e-06, "loss": 0.0, "step": 2102 }, { "epoch": 0.6818025316455696, "grad_norm": 0.00023756278096698225, "learning_rate": 6.323190903050776e-06, "loss": 0.0, "step": 2104 }, { "epoch": 0.6824506329113924, "grad_norm": 0.00014805899991188198, "learning_rate": 6.317069459158284e-06, "loss": 0.0, "step": 2106 }, { "epoch": 0.6830987341772152, "grad_norm": 0.002821107627823949, "learning_rate": 6.310945893204324e-06, "loss": 0.0, "step": 2108 }, { "epoch": 0.6837468354430379, "grad_norm": 0.0003693237667903304, "learning_rate": 6.30482021505518e-06, "loss": 0.0, "step": 2110 }, { "epoch": 0.6843949367088608, "grad_norm": 0.0005288827233016491, "learning_rate": 6.298692434580543e-06, "loss": 0.0, "step": 2112 }, { "epoch": 0.6850430379746836, "grad_norm": 0.00016826622595544904, "learning_rate": 6.292562561653486e-06, "loss": 0.0, "step": 2114 }, { "epoch": 0.6856911392405063, "grad_norm": 0.0013678655959665775, "learning_rate": 6.286430606150458e-06, "loss": 0.0, "step": 2116 }, { "epoch": 0.6863392405063291, "grad_norm": 0.00178819231223315, "learning_rate": 6.280296577951262e-06, "loss": 0.0, "step": 2118 }, { "epoch": 0.6869873417721519, "grad_norm": 0.004927210509777069, "learning_rate": 6.27416048693904e-06, "loss": 0.0, "step": 2120 }, { "epoch": 0.6869873417721519, "eval_accuracy": 0.9999992141332499, "eval_loss": 4.2000106077466626e-06, "eval_runtime": 106.5125, "eval_samples_per_second": 46.943, "eval_steps_per_second": 11.736, "step": 2120 }, { "epoch": 0.6876354430379746, "grad_norm": 0.0004625746514648199, "learning_rate": 6.268022343000258e-06, "loss": 0.0, "step": 2122 }, { "epoch": 0.6882835443037975, "grad_norm": 0.0015599478501826525, "learning_rate": 6.261882156024688e-06, "loss": 0.0, "step": 2124 }, { "epoch": 0.6889316455696203, "grad_norm": 0.00570697570219636, "learning_rate": 6.255739935905396e-06, "loss": 0.0, "step": 2126 }, { "epoch": 0.689579746835443, "grad_norm": 0.03438084200024605, "learning_rate": 6.249595692538726e-06, "loss": 0.0, "step": 2128 }, { "epoch": 0.6902278481012658, "grad_norm": 0.07320534437894821, "learning_rate": 6.243449435824276e-06, "loss": 0.0, "step": 2130 }, { "epoch": 0.6908759493670886, "grad_norm": 7.549316069344059e-05, "learning_rate": 6.2373011756648905e-06, "loss": 0.0, "step": 2132 }, { "epoch": 0.6915240506329114, "grad_norm": 0.0001575332717038691, "learning_rate": 6.231150921966643e-06, "loss": 0.0, "step": 2134 }, { "epoch": 0.6921721518987342, "grad_norm": 0.0009007062180899084, "learning_rate": 6.22499868463882e-06, "loss": 0.0, "step": 2136 }, { "epoch": 0.692820253164557, "grad_norm": 0.00013916321040596813, "learning_rate": 6.2188444735939e-06, "loss": 0.0, "step": 2138 }, { "epoch": 0.6934683544303798, "grad_norm": 0.0005380238289944828, "learning_rate": 6.212688298747546e-06, "loss": 0.0, "step": 2140 }, { "epoch": 0.6941164556962025, "grad_norm": 0.0021836934611201286, "learning_rate": 6.206530170018581e-06, "loss": 0.0, "step": 2142 }, { "epoch": 0.6947645569620253, "grad_norm": 0.0002022333355853334, "learning_rate": 6.2003700973289785e-06, "loss": 0.0, "step": 2144 }, { "epoch": 0.6954126582278481, "grad_norm": 0.0020313316490501165, "learning_rate": 6.194208090603845e-06, "loss": 0.0, "step": 2146 }, { "epoch": 0.6960607594936709, "grad_norm": 0.00018010415078606457, "learning_rate": 6.1880441597714e-06, "loss": 0.0002, "step": 2148 }, { "epoch": 0.6967088607594937, "grad_norm": 0.007566555868834257, "learning_rate": 6.181878314762968e-06, "loss": 0.0, "step": 2150 }, { "epoch": 0.6973569620253165, "grad_norm": 8.020079258130863e-05, "learning_rate": 6.17571056551295e-06, "loss": 0.0, "step": 2152 }, { "epoch": 0.6980050632911392, "grad_norm": 0.000367714004823938, "learning_rate": 6.169540921958823e-06, "loss": 0.0, "step": 2154 }, { "epoch": 0.698653164556962, "grad_norm": 0.03462158143520355, "learning_rate": 6.163369394041112e-06, "loss": 0.0001, "step": 2156 }, { "epoch": 0.6993012658227848, "grad_norm": 0.0005256628501228988, "learning_rate": 6.157195991703378e-06, "loss": 0.0, "step": 2158 }, { "epoch": 0.6999493670886076, "grad_norm": 0.0006937627913430333, "learning_rate": 6.151020724892205e-06, "loss": 0.0, "step": 2160 }, { "epoch": 0.6999493670886076, "eval_accuracy": 0.9999996086105675, "eval_loss": 3.4480619888199726e-06, "eval_runtime": 106.2725, "eval_samples_per_second": 47.049, "eval_steps_per_second": 11.762, "step": 2160 }, { "epoch": 0.7005974683544304, "grad_norm": 0.00030152147519402206, "learning_rate": 6.144843603557176e-06, "loss": 0.0, "step": 2162 }, { "epoch": 0.7012455696202532, "grad_norm": 0.0002933765936177224, "learning_rate": 6.138664637650867e-06, "loss": 0.0, "step": 2164 }, { "epoch": 0.7018936708860759, "grad_norm": 0.0011545540764927864, "learning_rate": 6.132483837128823e-06, "loss": 0.0, "step": 2166 }, { "epoch": 0.7025417721518987, "grad_norm": 0.00038900863728486, "learning_rate": 6.1263012119495455e-06, "loss": 0.0, "step": 2168 }, { "epoch": 0.7031898734177215, "grad_norm": 0.0003068090882152319, "learning_rate": 6.120116772074478e-06, "loss": 0.0, "step": 2170 }, { "epoch": 0.7038379746835443, "grad_norm": 0.0013358498690649867, "learning_rate": 6.1139305274679835e-06, "loss": 0.0, "step": 2172 }, { "epoch": 0.7044860759493671, "grad_norm": 0.0202906783670187, "learning_rate": 6.107742488097338e-06, "loss": 0.0, "step": 2174 }, { "epoch": 0.7051341772151899, "grad_norm": 0.0001334721309831366, "learning_rate": 6.101552663932704e-06, "loss": 0.0, "step": 2176 }, { "epoch": 0.7057822784810126, "grad_norm": 0.008306547999382019, "learning_rate": 6.095361064947124e-06, "loss": 0.0, "step": 2178 }, { "epoch": 0.7064303797468354, "grad_norm": 0.0003196684701833874, "learning_rate": 6.089167701116498e-06, "loss": 0.0, "step": 2180 }, { "epoch": 0.7070784810126582, "grad_norm": 0.0022242022678256035, "learning_rate": 6.082972582419569e-06, "loss": 0.0, "step": 2182 }, { "epoch": 0.7077265822784811, "grad_norm": 0.0001364542986266315, "learning_rate": 6.076775718837911e-06, "loss": 0.0, "step": 2184 }, { "epoch": 0.7083746835443038, "grad_norm": 0.0006018828134983778, "learning_rate": 6.070577120355903e-06, "loss": 0.0, "step": 2186 }, { "epoch": 0.7090227848101266, "grad_norm": 0.000917417521122843, "learning_rate": 6.064376796960723e-06, "loss": 0.0, "step": 2188 }, { "epoch": 0.7096708860759494, "grad_norm": 0.00041519958176650107, "learning_rate": 6.058174758642332e-06, "loss": 0.0002, "step": 2190 }, { "epoch": 0.7103189873417721, "grad_norm": 7.386893412331119e-05, "learning_rate": 6.051971015393447e-06, "loss": 0.0, "step": 2192 }, { "epoch": 0.7109670886075949, "grad_norm": 0.0014044854324311018, "learning_rate": 6.045765577209536e-06, "loss": 0.0, "step": 2194 }, { "epoch": 0.7116151898734178, "grad_norm": 0.00031422023312188685, "learning_rate": 6.039558454088796e-06, "loss": 0.0, "step": 2196 }, { "epoch": 0.7122632911392405, "grad_norm": 0.00011502230336191133, "learning_rate": 6.033349656032143e-06, "loss": 0.0, "step": 2198 }, { "epoch": 0.7129113924050633, "grad_norm": 8.400805381825194e-05, "learning_rate": 6.027139193043185e-06, "loss": 0.0, "step": 2200 }, { "epoch": 0.7129113924050633, "eval_accuracy": 0.9999996055226824, "eval_loss": 3.1884246709523723e-06, "eval_runtime": 105.3881, "eval_samples_per_second": 47.444, "eval_steps_per_second": 11.861, "step": 2200 }, { "epoch": 0.7135594936708861, "grad_norm": 9.059866715688258e-05, "learning_rate": 6.0209270751282165e-06, "loss": 0.0, "step": 2202 }, { "epoch": 0.7142075949367088, "grad_norm": 0.009854246862232685, "learning_rate": 6.014713312296198e-06, "loss": 0.0, "step": 2204 }, { "epoch": 0.7148556962025316, "grad_norm": 0.00022580361110158265, "learning_rate": 6.0084979145587444e-06, "loss": 0.0, "step": 2206 }, { "epoch": 0.7155037974683545, "grad_norm": 0.0002859641390386969, "learning_rate": 6.002280891930093e-06, "loss": 0.0, "step": 2208 }, { "epoch": 0.7161518987341772, "grad_norm": 8.833897300064564e-05, "learning_rate": 5.996062254427112e-06, "loss": 0.0, "step": 2210 }, { "epoch": 0.7168, "grad_norm": 0.008814448490738869, "learning_rate": 5.989842012069265e-06, "loss": 0.0001, "step": 2212 }, { "epoch": 0.7174481012658228, "grad_norm": 0.05799108371138573, "learning_rate": 5.983620174878601e-06, "loss": 0.0002, "step": 2214 }, { "epoch": 0.7180962025316455, "grad_norm": 0.0002262613852508366, "learning_rate": 5.977396752879742e-06, "loss": 0.0, "step": 2216 }, { "epoch": 0.7187443037974683, "grad_norm": 0.00012421239807736129, "learning_rate": 5.97117175609986e-06, "loss": 0.0, "step": 2218 }, { "epoch": 0.7193924050632912, "grad_norm": 0.06700967252254486, "learning_rate": 5.964945194568669e-06, "loss": 0.0, "step": 2220 }, { "epoch": 0.7200405063291139, "grad_norm": 0.0005706732044927776, "learning_rate": 5.958717078318397e-06, "loss": 0.0002, "step": 2222 }, { "epoch": 0.7206886075949367, "grad_norm": 0.000181859519216232, "learning_rate": 5.952487417383782e-06, "loss": 0.0, "step": 2224 }, { "epoch": 0.7213367088607595, "grad_norm": 0.00011903359700227156, "learning_rate": 5.946256221802052e-06, "loss": 0.0, "step": 2226 }, { "epoch": 0.7219848101265823, "grad_norm": 9.368642349727452e-05, "learning_rate": 5.940023501612902e-06, "loss": 0.0, "step": 2228 }, { "epoch": 0.722632911392405, "grad_norm": 9.8723205155693e-05, "learning_rate": 5.9337892668584896e-06, "loss": 0.0, "step": 2230 }, { "epoch": 0.7232810126582279, "grad_norm": 9.426200267625973e-05, "learning_rate": 5.927553527583407e-06, "loss": 0.0, "step": 2232 }, { "epoch": 0.7239291139240507, "grad_norm": 0.00010700670827645808, "learning_rate": 5.9213162938346765e-06, "loss": 0.0, "step": 2234 }, { "epoch": 0.7245772151898734, "grad_norm": 0.0726417824625969, "learning_rate": 5.915077575661723e-06, "loss": 0.0002, "step": 2236 }, { "epoch": 0.7252253164556962, "grad_norm": 0.00032428756821900606, "learning_rate": 5.908837383116367e-06, "loss": 0.0, "step": 2238 }, { "epoch": 0.725873417721519, "grad_norm": 0.0002373167808400467, "learning_rate": 5.902595726252801e-06, "loss": 0.0, "step": 2240 }, { "epoch": 0.725873417721519, "eval_accuracy": 0.9999988227438175, "eval_loss": 4.894884114037268e-06, "eval_runtime": 105.8929, "eval_samples_per_second": 47.218, "eval_steps_per_second": 11.804, "step": 2240 }, { "epoch": 0.7265215189873417, "grad_norm": 0.00012383979628793895, "learning_rate": 5.896352615127578e-06, "loss": 0.0, "step": 2242 }, { "epoch": 0.7271696202531646, "grad_norm": 0.0012550248065963387, "learning_rate": 5.890108059799596e-06, "loss": 0.0, "step": 2244 }, { "epoch": 0.7278177215189874, "grad_norm": 0.09790094941854477, "learning_rate": 5.883862070330079e-06, "loss": 0.0001, "step": 2246 }, { "epoch": 0.7284658227848101, "grad_norm": 0.00010724524327088147, "learning_rate": 5.877614656782559e-06, "loss": 0.0, "step": 2248 }, { "epoch": 0.7291139240506329, "grad_norm": 0.0463559553027153, "learning_rate": 5.8713658292228695e-06, "loss": 0.0, "step": 2250 }, { "epoch": 0.7297620253164557, "grad_norm": 0.0001051365616149269, "learning_rate": 5.865115597719111e-06, "loss": 0.0, "step": 2252 }, { "epoch": 0.7304101265822784, "grad_norm": 0.0003419144486542791, "learning_rate": 5.858863972341656e-06, "loss": 0.0, "step": 2254 }, { "epoch": 0.7310582278481013, "grad_norm": 0.001729366835206747, "learning_rate": 5.85261096316312e-06, "loss": 0.0, "step": 2256 }, { "epoch": 0.7317063291139241, "grad_norm": 0.00013600761303678155, "learning_rate": 5.846356580258345e-06, "loss": 0.0, "step": 2258 }, { "epoch": 0.7323544303797468, "grad_norm": 0.0004988170694559813, "learning_rate": 5.840100833704392e-06, "loss": 0.0, "step": 2260 }, { "epoch": 0.7330025316455696, "grad_norm": 0.0006871929508633912, "learning_rate": 5.8338437335805124e-06, "loss": 0.0, "step": 2262 }, { "epoch": 0.7336506329113924, "grad_norm": 0.00016507611144334078, "learning_rate": 5.827585289968143e-06, "loss": 0.0, "step": 2264 }, { "epoch": 0.7342987341772151, "grad_norm": 8.90670926310122e-05, "learning_rate": 5.821325512950886e-06, "loss": 0.0, "step": 2266 }, { "epoch": 0.734946835443038, "grad_norm": 0.004662638995796442, "learning_rate": 5.815064412614487e-06, "loss": 0.0, "step": 2268 }, { "epoch": 0.7355949367088608, "grad_norm": 0.0012238593772053719, "learning_rate": 5.80880199904683e-06, "loss": 0.0, "step": 2270 }, { "epoch": 0.7362430379746835, "grad_norm": 0.000732690910808742, "learning_rate": 5.80253828233791e-06, "loss": 0.0, "step": 2272 }, { "epoch": 0.7368911392405063, "grad_norm": 0.05466936528682709, "learning_rate": 5.796273272579823e-06, "loss": 0.0001, "step": 2274 }, { "epoch": 0.7375392405063291, "grad_norm": 0.034032344818115234, "learning_rate": 5.79000697986675e-06, "loss": 0.0, "step": 2276 }, { "epoch": 0.738187341772152, "grad_norm": 0.0003485069319140166, "learning_rate": 5.783739414294938e-06, "loss": 0.0, "step": 2278 }, { "epoch": 0.7388354430379747, "grad_norm": 0.00011134272790513933, "learning_rate": 5.777470585962682e-06, "loss": 0.0, "step": 2280 }, { "epoch": 0.7388354430379747, "eval_accuracy": 0.9999988602777062, "eval_loss": 5.289937234920217e-06, "eval_runtime": 105.9113, "eval_samples_per_second": 47.209, "eval_steps_per_second": 11.802, "step": 2280 }, { "epoch": 0.7394835443037975, "grad_norm": 0.00734311668202281, "learning_rate": 5.771200504970316e-06, "loss": 0.0, "step": 2282 }, { "epoch": 0.7401316455696203, "grad_norm": 0.01817069947719574, "learning_rate": 5.764929181420191e-06, "loss": 0.0, "step": 2284 }, { "epoch": 0.740779746835443, "grad_norm": 0.001817168784327805, "learning_rate": 5.758656625416659e-06, "loss": 0.0, "step": 2286 }, { "epoch": 0.7414278481012658, "grad_norm": 0.017299983650445938, "learning_rate": 5.752382847066058e-06, "loss": 0.0, "step": 2288 }, { "epoch": 0.7420759493670886, "grad_norm": 0.0002641058526933193, "learning_rate": 5.7461078564766945e-06, "loss": 0.0, "step": 2290 }, { "epoch": 0.7427240506329114, "grad_norm": 0.01204910408705473, "learning_rate": 5.739831663758834e-06, "loss": 0.0, "step": 2292 }, { "epoch": 0.7433721518987342, "grad_norm": 0.00013036782911513, "learning_rate": 5.733554279024668e-06, "loss": 0.0, "step": 2294 }, { "epoch": 0.744020253164557, "grad_norm": 7.867599924793467e-05, "learning_rate": 5.727275712388318e-06, "loss": 0.0, "step": 2296 }, { "epoch": 0.7446683544303797, "grad_norm": 0.00017100705008488148, "learning_rate": 5.720995973965806e-06, "loss": 0.0, "step": 2298 }, { "epoch": 0.7453164556962025, "grad_norm": 0.07865839451551437, "learning_rate": 5.714715073875043e-06, "loss": 0.0, "step": 2300 }, { "epoch": 0.7459645569620253, "grad_norm": 0.000345155771356076, "learning_rate": 5.7084330222358106e-06, "loss": 0.0001, "step": 2302 }, { "epoch": 0.7466126582278481, "grad_norm": 0.0002492070198059082, "learning_rate": 5.7021498291697465e-06, "loss": 0.0, "step": 2304 }, { "epoch": 0.7472607594936709, "grad_norm": 5.335691821528599e-05, "learning_rate": 5.695865504800328e-06, "loss": 0.0, "step": 2306 }, { "epoch": 0.7479088607594937, "grad_norm": 9.89673935691826e-05, "learning_rate": 5.689580059252852e-06, "loss": 0.0001, "step": 2308 }, { "epoch": 0.7485569620253164, "grad_norm": 0.00012796987721230835, "learning_rate": 5.683293502654429e-06, "loss": 0.0, "step": 2310 }, { "epoch": 0.7492050632911392, "grad_norm": 0.00014111780910752714, "learning_rate": 5.6770058451339514e-06, "loss": 0.0, "step": 2312 }, { "epoch": 0.749853164556962, "grad_norm": 0.016285588964819908, "learning_rate": 5.6707170968220895e-06, "loss": 0.0, "step": 2314 }, { "epoch": 0.7505012658227848, "grad_norm": 0.0005048218881711364, "learning_rate": 5.664427267851271e-06, "loss": 0.0, "step": 2316 }, { "epoch": 0.7511493670886076, "grad_norm": 0.011887839995324612, "learning_rate": 5.658136368355665e-06, "loss": 0.0, "step": 2318 }, { "epoch": 0.7517974683544304, "grad_norm": 0.009634653106331825, "learning_rate": 5.651844408471162e-06, "loss": 0.0, "step": 2320 }, { "epoch": 0.7517974683544304, "eval_accuracy": 0.9999980149850695, "eval_loss": 7.766235285089351e-06, "eval_runtime": 107.1514, "eval_samples_per_second": 46.663, "eval_steps_per_second": 11.666, "step": 2320 }, { "epoch": 0.7524455696202532, "grad_norm": 0.006753602996468544, "learning_rate": 5.645551398335367e-06, "loss": 0.0, "step": 2322 }, { "epoch": 0.7530936708860759, "grad_norm": 8.452868496533483e-05, "learning_rate": 5.6392573480875724e-06, "loss": 0.0, "step": 2324 }, { "epoch": 0.7537417721518987, "grad_norm": 0.0007264796877279878, "learning_rate": 5.632962267868747e-06, "loss": 0.0, "step": 2326 }, { "epoch": 0.7543898734177216, "grad_norm": 0.00017660489538684487, "learning_rate": 5.626666167821522e-06, "loss": 0.0, "step": 2328 }, { "epoch": 0.7550379746835443, "grad_norm": 0.0025927009992301464, "learning_rate": 5.620369058090168e-06, "loss": 0.0, "step": 2330 }, { "epoch": 0.7556860759493671, "grad_norm": 0.00015297834761440754, "learning_rate": 5.6140709488205854e-06, "loss": 0.0, "step": 2332 }, { "epoch": 0.7563341772151899, "grad_norm": 0.010506555438041687, "learning_rate": 5.607771850160285e-06, "loss": 0.0, "step": 2334 }, { "epoch": 0.7569822784810126, "grad_norm": 0.00015023894957266748, "learning_rate": 5.601471772258368e-06, "loss": 0.0, "step": 2336 }, { "epoch": 0.7576303797468354, "grad_norm": 0.00028166870470158756, "learning_rate": 5.595170725265517e-06, "loss": 0.0, "step": 2338 }, { "epoch": 0.7582784810126583, "grad_norm": 0.007426573429256678, "learning_rate": 5.588868719333974e-06, "loss": 0.0002, "step": 2340 }, { "epoch": 0.758926582278481, "grad_norm": 0.0006746530998498201, "learning_rate": 5.582565764617528e-06, "loss": 0.0, "step": 2342 }, { "epoch": 0.7595746835443038, "grad_norm": 0.0007765135960653424, "learning_rate": 5.576261871271494e-06, "loss": 0.0, "step": 2344 }, { "epoch": 0.7602227848101266, "grad_norm": 0.10703450441360474, "learning_rate": 5.569957049452703e-06, "loss": 0.0, "step": 2346 }, { "epoch": 0.7608708860759493, "grad_norm": 0.0001581802061991766, "learning_rate": 5.56365130931948e-06, "loss": 0.0, "step": 2348 }, { "epoch": 0.7615189873417721, "grad_norm": 0.00017560912237968296, "learning_rate": 5.557344661031628e-06, "loss": 0.0, "step": 2350 }, { "epoch": 0.762167088607595, "grad_norm": 0.0002994161914102733, "learning_rate": 5.551037114750415e-06, "loss": 0.0, "step": 2352 }, { "epoch": 0.7628151898734177, "grad_norm": 0.0007976372726261616, "learning_rate": 5.544728680638557e-06, "loss": 0.0002, "step": 2354 }, { "epoch": 0.7634632911392405, "grad_norm": 0.003032875480130315, "learning_rate": 5.538419368860196e-06, "loss": 0.0, "step": 2356 }, { "epoch": 0.7641113924050633, "grad_norm": 0.05126134678721428, "learning_rate": 5.532109189580893e-06, "loss": 0.0, "step": 2358 }, { "epoch": 0.764759493670886, "grad_norm": 0.05150891840457916, "learning_rate": 5.525798152967605e-06, "loss": 0.0, "step": 2360 }, { "epoch": 0.764759493670886, "eval_accuracy": 0.9999976613323593, "eval_loss": 1.1560122402443085e-05, "eval_runtime": 106.4124, "eval_samples_per_second": 46.987, "eval_steps_per_second": 11.747, "step": 2360 }, { "epoch": 0.7654075949367088, "grad_norm": 0.00036254760925658047, "learning_rate": 5.519486269188669e-06, "loss": 0.0, "step": 2362 }, { "epoch": 0.7660556962025317, "grad_norm": 0.00011886703578056768, "learning_rate": 5.513173548413789e-06, "loss": 0.0, "step": 2364 }, { "epoch": 0.7667037974683544, "grad_norm": 7.184002606663853e-05, "learning_rate": 5.506860000814017e-06, "loss": 0.0, "step": 2366 }, { "epoch": 0.7673518987341772, "grad_norm": 0.00030205457005649805, "learning_rate": 5.500545636561737e-06, "loss": 0.0, "step": 2368 }, { "epoch": 0.768, "grad_norm": 0.002157788723707199, "learning_rate": 5.494230465830648e-06, "loss": 0.0, "step": 2370 }, { "epoch": 0.7686481012658228, "grad_norm": 0.006820139940828085, "learning_rate": 5.487914498795748e-06, "loss": 0.0, "step": 2372 }, { "epoch": 0.7692962025316455, "grad_norm": 0.0009759216918610036, "learning_rate": 5.4815977456333205e-06, "loss": 0.0, "step": 2374 }, { "epoch": 0.7699443037974684, "grad_norm": 0.00026600834098644555, "learning_rate": 5.475280216520913e-06, "loss": 0.0, "step": 2376 }, { "epoch": 0.7705924050632912, "grad_norm": 0.00012866378528997302, "learning_rate": 5.468961921637327e-06, "loss": 0.0, "step": 2378 }, { "epoch": 0.7712405063291139, "grad_norm": 0.00221284874714911, "learning_rate": 5.462642871162592e-06, "loss": 0.0001, "step": 2380 }, { "epoch": 0.7718886075949367, "grad_norm": 0.0002176949055865407, "learning_rate": 5.4563230752779595e-06, "loss": 0.0, "step": 2382 }, { "epoch": 0.7725367088607595, "grad_norm": 7.449064287357032e-05, "learning_rate": 5.450002544165881e-06, "loss": 0.0, "step": 2384 }, { "epoch": 0.7731848101265822, "grad_norm": 0.006494743749499321, "learning_rate": 5.443681288009991e-06, "loss": 0.0, "step": 2386 }, { "epoch": 0.7738329113924051, "grad_norm": 0.0006466159829869866, "learning_rate": 5.437359316995094e-06, "loss": 0.0, "step": 2388 }, { "epoch": 0.7744810126582279, "grad_norm": 0.00047616238589398563, "learning_rate": 5.431036641307146e-06, "loss": 0.0, "step": 2390 }, { "epoch": 0.7751291139240506, "grad_norm": 0.0010342790046706796, "learning_rate": 5.424713271133237e-06, "loss": 0.0, "step": 2392 }, { "epoch": 0.7757772151898734, "grad_norm": 0.0027879138942807913, "learning_rate": 5.41838921666158e-06, "loss": 0.0003, "step": 2394 }, { "epoch": 0.7764253164556962, "grad_norm": 0.06997310370206833, "learning_rate": 5.412064488081482e-06, "loss": 0.0, "step": 2396 }, { "epoch": 0.7770734177215189, "grad_norm": 0.0062122284434735775, "learning_rate": 5.4057390955833455e-06, "loss": 0.0, "step": 2398 }, { "epoch": 0.7777215189873418, "grad_norm": 0.32322806119918823, "learning_rate": 5.3994130493586385e-06, "loss": 0.0012, "step": 2400 }, { "epoch": 0.7777215189873418, "eval_accuracy": 0.9999614380181616, "eval_loss": 0.0002183700999012217, "eval_runtime": 105.1957, "eval_samples_per_second": 47.53, "eval_steps_per_second": 11.883, "step": 2400 }, { "epoch": 0.7783696202531646, "grad_norm": 0.24002808332443237, "learning_rate": 5.393086359599882e-06, "loss": 0.0003, "step": 2402 }, { "epoch": 0.7790177215189873, "grad_norm": 0.00038687174674123526, "learning_rate": 5.386759036500635e-06, "loss": 0.0005, "step": 2404 }, { "epoch": 0.7796658227848101, "grad_norm": 0.0652102380990982, "learning_rate": 5.380431090255475e-06, "loss": 0.0, "step": 2406 }, { "epoch": 0.7803139240506329, "grad_norm": 0.003415727987885475, "learning_rate": 5.3741025310599885e-06, "loss": 0.0, "step": 2408 }, { "epoch": 0.7809620253164556, "grad_norm": 0.028407571837306023, "learning_rate": 5.367773369110741e-06, "loss": 0.0, "step": 2410 }, { "epoch": 0.7816101265822785, "grad_norm": 0.00183353410102427, "learning_rate": 5.361443614605279e-06, "loss": 0.0, "step": 2412 }, { "epoch": 0.7822582278481013, "grad_norm": 0.02177795208990574, "learning_rate": 5.355113277742095e-06, "loss": 0.0, "step": 2414 }, { "epoch": 0.7829063291139241, "grad_norm": 0.005271558184176683, "learning_rate": 5.348782368720627e-06, "loss": 0.0, "step": 2416 }, { "epoch": 0.7835544303797468, "grad_norm": 0.0031913723796606064, "learning_rate": 5.3424508977412285e-06, "loss": 0.0001, "step": 2418 }, { "epoch": 0.7842025316455696, "grad_norm": 0.004424366634339094, "learning_rate": 5.336118875005165e-06, "loss": 0.0001, "step": 2420 }, { "epoch": 0.7848506329113925, "grad_norm": 0.09619469195604324, "learning_rate": 5.329786310714583e-06, "loss": 0.0, "step": 2422 }, { "epoch": 0.7854987341772152, "grad_norm": 0.15482549369335175, "learning_rate": 5.3234532150725096e-06, "loss": 0.0003, "step": 2424 }, { "epoch": 0.786146835443038, "grad_norm": 0.21717675030231476, "learning_rate": 5.317119598282823e-06, "loss": 0.0003, "step": 2426 }, { "epoch": 0.7867949367088608, "grad_norm": 0.027250362560153008, "learning_rate": 5.310785470550243e-06, "loss": 0.0001, "step": 2428 }, { "epoch": 0.7874430379746835, "grad_norm": 0.00278554018586874, "learning_rate": 5.304450842080312e-06, "loss": 0.0, "step": 2430 }, { "epoch": 0.7880911392405063, "grad_norm": 0.09197168052196503, "learning_rate": 5.29811572307938e-06, "loss": 0.0001, "step": 2432 }, { "epoch": 0.7887392405063292, "grad_norm": 0.0017266845097765326, "learning_rate": 5.291780123754585e-06, "loss": 0.0, "step": 2434 }, { "epoch": 0.7893873417721519, "grad_norm": 0.001447897288016975, "learning_rate": 5.285444054313841e-06, "loss": 0.0, "step": 2436 }, { "epoch": 0.7900354430379747, "grad_norm": 0.0027126609347760677, "learning_rate": 5.27910752496582e-06, "loss": 0.0002, "step": 2438 }, { "epoch": 0.7906835443037975, "grad_norm": 0.05010005086660385, "learning_rate": 5.2727705459199345e-06, "loss": 0.0, "step": 2440 }, { "epoch": 0.7906835443037975, "eval_accuracy": 0.9999877555061656, "eval_loss": 3.793625000980683e-05, "eval_runtime": 106.2945, "eval_samples_per_second": 47.039, "eval_steps_per_second": 11.76, "step": 2440 }, { "epoch": 0.7913316455696202, "grad_norm": 0.03448754549026489, "learning_rate": 5.266433127386319e-06, "loss": 0.0002, "step": 2442 }, { "epoch": 0.791979746835443, "grad_norm": 0.005970288068056107, "learning_rate": 5.260095279575818e-06, "loss": 0.0, "step": 2444 }, { "epoch": 0.7926278481012659, "grad_norm": 0.11865130066871643, "learning_rate": 5.253757012699972e-06, "loss": 0.0001, "step": 2446 }, { "epoch": 0.7932759493670886, "grad_norm": 0.10202252864837646, "learning_rate": 5.247418336970989e-06, "loss": 0.0002, "step": 2448 }, { "epoch": 0.7939240506329114, "grad_norm": 0.007061738055199385, "learning_rate": 5.241079262601738e-06, "loss": 0.0, "step": 2450 }, { "epoch": 0.7945721518987342, "grad_norm": 0.006444692611694336, "learning_rate": 5.234739799805735e-06, "loss": 0.0, "step": 2452 }, { "epoch": 0.7952202531645569, "grad_norm": 0.01060519739985466, "learning_rate": 5.228399958797117e-06, "loss": 0.0, "step": 2454 }, { "epoch": 0.7958683544303797, "grad_norm": 0.002685698214918375, "learning_rate": 5.2220597497906315e-06, "loss": 0.0, "step": 2456 }, { "epoch": 0.7965164556962026, "grad_norm": 0.0019370040390640497, "learning_rate": 5.215719183001619e-06, "loss": 0.0, "step": 2458 }, { "epoch": 0.7971645569620253, "grad_norm": 0.00024689623387530446, "learning_rate": 5.209378268645998e-06, "loss": 0.0002, "step": 2460 }, { "epoch": 0.7978126582278481, "grad_norm": 0.06425344198942184, "learning_rate": 5.203037016940245e-06, "loss": 0.0001, "step": 2462 }, { "epoch": 0.7984607594936709, "grad_norm": 0.00064616504823789, "learning_rate": 5.19669543810138e-06, "loss": 0.0, "step": 2464 }, { "epoch": 0.7991088607594937, "grad_norm": 0.020757561549544334, "learning_rate": 5.190353542346951e-06, "loss": 0.0, "step": 2466 }, { "epoch": 0.7997569620253164, "grad_norm": 0.0072888876311481, "learning_rate": 5.184011339895015e-06, "loss": 0.0, "step": 2468 }, { "epoch": 0.8004050632911393, "grad_norm": 0.037369176745414734, "learning_rate": 5.177668840964128e-06, "loss": 0.0001, "step": 2470 }, { "epoch": 0.8010531645569621, "grad_norm": 0.00951011385768652, "learning_rate": 5.171326055773318e-06, "loss": 0.0, "step": 2472 }, { "epoch": 0.8017012658227848, "grad_norm": 0.004345647059381008, "learning_rate": 5.164982994542076e-06, "loss": 0.0, "step": 2474 }, { "epoch": 0.8023493670886076, "grad_norm": 0.018001670017838478, "learning_rate": 5.15863966749034e-06, "loss": 0.0, "step": 2476 }, { "epoch": 0.8029974683544304, "grad_norm": 0.0005489176837727427, "learning_rate": 5.1522960848384715e-06, "loss": 0.0, "step": 2478 }, { "epoch": 0.8036455696202531, "grad_norm": 0.002194503555074334, "learning_rate": 5.1459522568072495e-06, "loss": 0.0, "step": 2480 }, { "epoch": 0.8036455696202531, "eval_accuracy": 0.9999988498303606, "eval_loss": 1.258536121895304e-05, "eval_runtime": 105.8044, "eval_samples_per_second": 47.257, "eval_steps_per_second": 11.814, "step": 2480 }, { "epoch": 0.804293670886076, "grad_norm": 0.0060137673281133175, "learning_rate": 5.139608193617846e-06, "loss": 0.0, "step": 2482 }, { "epoch": 0.8049417721518988, "grad_norm": 0.0109030706807971, "learning_rate": 5.133263905491809e-06, "loss": 0.0, "step": 2484 }, { "epoch": 0.8055898734177215, "grad_norm": 0.005852850619703531, "learning_rate": 5.126919402651053e-06, "loss": 0.0, "step": 2486 }, { "epoch": 0.8062379746835443, "grad_norm": 0.0023848526179790497, "learning_rate": 5.120574695317837e-06, "loss": 0.0, "step": 2488 }, { "epoch": 0.8068860759493671, "grad_norm": 0.0017674455884844065, "learning_rate": 5.114229793714749e-06, "loss": 0.0, "step": 2490 }, { "epoch": 0.8075341772151898, "grad_norm": 0.0013527893461287022, "learning_rate": 5.1078847080646894e-06, "loss": 0.0, "step": 2492 }, { "epoch": 0.8081822784810127, "grad_norm": 0.00026338460156694055, "learning_rate": 5.101539448590859e-06, "loss": 0.0, "step": 2494 }, { "epoch": 0.8088303797468355, "grad_norm": 0.031443241983652115, "learning_rate": 5.095194025516733e-06, "loss": 0.0001, "step": 2496 }, { "epoch": 0.8094784810126582, "grad_norm": 0.00025254758656956255, "learning_rate": 5.088848449066055e-06, "loss": 0.0, "step": 2498 }, { "epoch": 0.810126582278481, "grad_norm": 0.0013786570634692907, "learning_rate": 5.082502729462813e-06, "loss": 0.0, "step": 2500 }, { "epoch": 0.8107746835443038, "grad_norm": 0.0034877141006290913, "learning_rate": 5.076156876931225e-06, "loss": 0.0, "step": 2502 }, { "epoch": 0.8114227848101265, "grad_norm": 0.0005392807070165873, "learning_rate": 5.069810901695727e-06, "loss": 0.0, "step": 2504 }, { "epoch": 0.8120708860759493, "grad_norm": 0.008301528170704842, "learning_rate": 5.063464813980948e-06, "loss": 0.0, "step": 2506 }, { "epoch": 0.8127189873417722, "grad_norm": 0.0008546709432266653, "learning_rate": 5.057118624011702e-06, "loss": 0.0, "step": 2508 }, { "epoch": 0.813367088607595, "grad_norm": 0.0025733408983796835, "learning_rate": 5.050772342012966e-06, "loss": 0.0001, "step": 2510 }, { "epoch": 0.8140151898734177, "grad_norm": 0.11622051149606705, "learning_rate": 5.044425978209864e-06, "loss": 0.0, "step": 2512 }, { "epoch": 0.8146632911392405, "grad_norm": 0.009716551750898361, "learning_rate": 5.038079542827654e-06, "loss": 0.0, "step": 2514 }, { "epoch": 0.8153113924050633, "grad_norm": 0.00016989122377708554, "learning_rate": 5.03173304609171e-06, "loss": 0.0, "step": 2516 }, { "epoch": 0.815959493670886, "grad_norm": 0.005587367806583643, "learning_rate": 5.025386498227501e-06, "loss": 0.0, "step": 2518 }, { "epoch": 0.8166075949367089, "grad_norm": 0.00024819717509672046, "learning_rate": 5.019039909460584e-06, "loss": 0.0, "step": 2520 }, { "epoch": 0.8166075949367089, "eval_accuracy": 0.999998455353043, "eval_loss": 7.359149094554596e-06, "eval_runtime": 105.9092, "eval_samples_per_second": 47.21, "eval_steps_per_second": 11.803, "step": 2520 }, { "epoch": 0.8172556962025317, "grad_norm": 0.02661811001598835, "learning_rate": 5.012693290016576e-06, "loss": 0.0, "step": 2522 }, { "epoch": 0.8179037974683544, "grad_norm": 0.005714859813451767, "learning_rate": 5.006346650121148e-06, "loss": 0.0, "step": 2524 }, { "epoch": 0.8185518987341772, "grad_norm": 0.32837799191474915, "learning_rate": 5e-06, "loss": 0.0001, "step": 2526 }, { "epoch": 0.8192, "grad_norm": 0.19652798771858215, "learning_rate": 4.993653349878854e-06, "loss": 0.0001, "step": 2528 }, { "epoch": 0.8198481012658227, "grad_norm": 0.00043144775554537773, "learning_rate": 4.987306709983426e-06, "loss": 0.0, "step": 2530 }, { "epoch": 0.8204962025316456, "grad_norm": 0.0030640813056379557, "learning_rate": 4.980960090539417e-06, "loss": 0.0, "step": 2532 }, { "epoch": 0.8211443037974684, "grad_norm": 0.10829441994428635, "learning_rate": 4.9746135017725e-06, "loss": 0.0001, "step": 2534 }, { "epoch": 0.8217924050632911, "grad_norm": 0.039655912667512894, "learning_rate": 4.9682669539082914e-06, "loss": 0.0, "step": 2536 }, { "epoch": 0.8224405063291139, "grad_norm": 0.027690451592206955, "learning_rate": 4.961920457172347e-06, "loss": 0.0001, "step": 2538 }, { "epoch": 0.8230886075949367, "grad_norm": 0.03515208885073662, "learning_rate": 4.955574021790138e-06, "loss": 0.0, "step": 2540 }, { "epoch": 0.8237367088607594, "grad_norm": 0.10855144262313843, "learning_rate": 4.9492276579870355e-06, "loss": 0.0001, "step": 2542 }, { "epoch": 0.8243848101265823, "grad_norm": 0.004899628460407257, "learning_rate": 4.9428813759883e-06, "loss": 0.0001, "step": 2544 }, { "epoch": 0.8250329113924051, "grad_norm": 0.0006479221628978848, "learning_rate": 4.936535186019053e-06, "loss": 0.0, "step": 2546 }, { "epoch": 0.8256810126582278, "grad_norm": 0.0010982794919982553, "learning_rate": 4.9301890983042744e-06, "loss": 0.0, "step": 2548 }, { "epoch": 0.8263291139240506, "grad_norm": 0.0006811083876527846, "learning_rate": 4.923843123068776e-06, "loss": 0.0, "step": 2550 }, { "epoch": 0.8269772151898734, "grad_norm": 0.00013663475692737848, "learning_rate": 4.917497270537188e-06, "loss": 0.0001, "step": 2552 }, { "epoch": 0.8276253164556961, "grad_norm": 0.0004709422937594354, "learning_rate": 4.911151550933946e-06, "loss": 0.0, "step": 2554 }, { "epoch": 0.828273417721519, "grad_norm": 0.07884550839662552, "learning_rate": 4.904805974483267e-06, "loss": 0.0, "step": 2556 }, { "epoch": 0.8289215189873418, "grad_norm": 0.1472872942686081, "learning_rate": 4.898460551409141e-06, "loss": 0.0001, "step": 2558 }, { "epoch": 0.8295696202531646, "grad_norm": 0.007455206476151943, "learning_rate": 4.8921152919353105e-06, "loss": 0.0, "step": 2560 }, { "epoch": 0.8295696202531646, "eval_accuracy": 0.9999908549339463, "eval_loss": 3.50044465449173e-05, "eval_runtime": 104.6129, "eval_samples_per_second": 47.795, "eval_steps_per_second": 11.949, "step": 2560 }, { "epoch": 0.8302177215189873, "grad_norm": 0.008790325373411179, "learning_rate": 4.8857702062852515e-06, "loss": 0.0, "step": 2562 }, { "epoch": 0.8308658227848101, "grad_norm": 0.004961876664310694, "learning_rate": 4.879425304682164e-06, "loss": 0.0, "step": 2564 }, { "epoch": 0.831513924050633, "grad_norm": 0.02836470678448677, "learning_rate": 4.873080597348948e-06, "loss": 0.0, "step": 2566 }, { "epoch": 0.8321620253164557, "grad_norm": 0.009730171412229538, "learning_rate": 4.866736094508191e-06, "loss": 0.0004, "step": 2568 }, { "epoch": 0.8328101265822785, "grad_norm": 0.03526896610856056, "learning_rate": 4.860391806382157e-06, "loss": 0.0, "step": 2570 }, { "epoch": 0.8334582278481013, "grad_norm": 0.07535210996866226, "learning_rate": 4.854047743192752e-06, "loss": 0.0, "step": 2572 }, { "epoch": 0.834106329113924, "grad_norm": 0.0026582893915474415, "learning_rate": 4.847703915161531e-06, "loss": 0.0, "step": 2574 }, { "epoch": 0.8347544303797468, "grad_norm": 0.02271987311542034, "learning_rate": 4.841360332509663e-06, "loss": 0.0, "step": 2576 }, { "epoch": 0.8354025316455697, "grad_norm": 0.31956908106803894, "learning_rate": 4.835017005457926e-06, "loss": 0.0001, "step": 2578 }, { "epoch": 0.8360506329113924, "grad_norm": 0.05217691510915756, "learning_rate": 4.828673944226684e-06, "loss": 0.0002, "step": 2580 }, { "epoch": 0.8366987341772152, "grad_norm": 0.07773158699274063, "learning_rate": 4.822331159035873e-06, "loss": 0.0, "step": 2582 }, { "epoch": 0.837346835443038, "grad_norm": 0.0017431579763069749, "learning_rate": 4.815988660104986e-06, "loss": 0.0001, "step": 2584 }, { "epoch": 0.8379949367088607, "grad_norm": 0.0009354711510241032, "learning_rate": 4.809646457653051e-06, "loss": 0.0001, "step": 2586 }, { "epoch": 0.8386430379746835, "grad_norm": 0.00018994830315932631, "learning_rate": 4.803304561898622e-06, "loss": 0.0, "step": 2588 }, { "epoch": 0.8392911392405064, "grad_norm": 0.0062407576479017735, "learning_rate": 4.796962983059757e-06, "loss": 0.0, "step": 2590 }, { "epoch": 0.8399392405063291, "grad_norm": 0.0004235352680552751, "learning_rate": 4.7906217313540035e-06, "loss": 0.0, "step": 2592 }, { "epoch": 0.8405873417721519, "grad_norm": 0.2927955687046051, "learning_rate": 4.784280816998382e-06, "loss": 0.0, "step": 2594 }, { "epoch": 0.8412354430379747, "grad_norm": 0.00975685752928257, "learning_rate": 4.777940250209369e-06, "loss": 0.0, "step": 2596 }, { "epoch": 0.8418835443037974, "grad_norm": 0.06928249448537827, "learning_rate": 4.771600041202884e-06, "loss": 0.0, "step": 2598 }, { "epoch": 0.8425316455696202, "grad_norm": 0.006853208411484957, "learning_rate": 4.765260200194266e-06, "loss": 0.0, "step": 2600 }, { "epoch": 0.8425316455696202, "eval_accuracy": 0.9999832612866548, "eval_loss": 9.795609366847202e-05, "eval_runtime": 105.1244, "eval_samples_per_second": 47.563, "eval_steps_per_second": 11.891, "step": 2600 }, { "epoch": 0.8431797468354431, "grad_norm": 0.023050669580698013, "learning_rate": 4.7589207373982635e-06, "loss": 0.0003, "step": 2602 }, { "epoch": 0.8438278481012659, "grad_norm": 0.0012926086783409119, "learning_rate": 4.7525816630290135e-06, "loss": 0.0, "step": 2604 }, { "epoch": 0.8444759493670886, "grad_norm": 0.0985993891954422, "learning_rate": 4.74624298730003e-06, "loss": 0.0001, "step": 2606 }, { "epoch": 0.8451240506329114, "grad_norm": 0.047884199768304825, "learning_rate": 4.7399047204241826e-06, "loss": 0.0, "step": 2608 }, { "epoch": 0.8457721518987342, "grad_norm": 0.044315554201602936, "learning_rate": 4.733566872613683e-06, "loss": 0.0, "step": 2610 }, { "epoch": 0.8464202531645569, "grad_norm": 0.28911229968070984, "learning_rate": 4.727229454080068e-06, "loss": 0.0001, "step": 2612 }, { "epoch": 0.8470683544303798, "grad_norm": 0.001446192734874785, "learning_rate": 4.720892475034181e-06, "loss": 0.0, "step": 2614 }, { "epoch": 0.8477164556962026, "grad_norm": 0.0012995201395824552, "learning_rate": 4.71455594568616e-06, "loss": 0.0, "step": 2616 }, { "epoch": 0.8483645569620253, "grad_norm": 0.03472326695919037, "learning_rate": 4.7082198762454165e-06, "loss": 0.0, "step": 2618 }, { "epoch": 0.8490126582278481, "grad_norm": 0.03457796573638916, "learning_rate": 4.701884276920622e-06, "loss": 0.0, "step": 2620 }, { "epoch": 0.8496607594936709, "grad_norm": 0.0026486103888601065, "learning_rate": 4.69554915791969e-06, "loss": 0.0002, "step": 2622 }, { "epoch": 0.8503088607594936, "grad_norm": 0.002102876780554652, "learning_rate": 4.689214529449758e-06, "loss": 0.0, "step": 2624 }, { "epoch": 0.8509569620253165, "grad_norm": 0.0034834700636565685, "learning_rate": 4.682880401717178e-06, "loss": 0.0, "step": 2626 }, { "epoch": 0.8516050632911393, "grad_norm": 0.06586204469203949, "learning_rate": 4.676546784927491e-06, "loss": 0.0002, "step": 2628 }, { "epoch": 0.852253164556962, "grad_norm": 0.0015004536835476756, "learning_rate": 4.670213689285418e-06, "loss": 0.0, "step": 2630 }, { "epoch": 0.8529012658227848, "grad_norm": 0.14395491778850555, "learning_rate": 4.663881124994837e-06, "loss": 0.0003, "step": 2632 }, { "epoch": 0.8535493670886076, "grad_norm": 0.024621322751045227, "learning_rate": 4.6575491022587714e-06, "loss": 0.0, "step": 2634 }, { "epoch": 0.8541974683544303, "grad_norm": 0.002342685591429472, "learning_rate": 4.651217631279374e-06, "loss": 0.0, "step": 2636 }, { "epoch": 0.8548455696202532, "grad_norm": 0.0156905185431242, "learning_rate": 4.644886722257905e-06, "loss": 0.0, "step": 2638 }, { "epoch": 0.855493670886076, "grad_norm": 0.0143441678956151, "learning_rate": 4.638556385394721e-06, "loss": 0.0, "step": 2640 }, { "epoch": 0.855493670886076, "eval_accuracy": 0.9999904080133325, "eval_loss": 4.1821800550678745e-05, "eval_runtime": 106.2091, "eval_samples_per_second": 47.077, "eval_steps_per_second": 11.769, "step": 2640 }, { "epoch": 0.8561417721518987, "grad_norm": 0.25288715958595276, "learning_rate": 4.632226630889258e-06, "loss": 0.0004, "step": 2642 }, { "epoch": 0.8567898734177215, "grad_norm": 0.010879195295274258, "learning_rate": 4.625897468940012e-06, "loss": 0.0, "step": 2644 }, { "epoch": 0.8574379746835443, "grad_norm": 0.008445970714092255, "learning_rate": 4.619568909744524e-06, "loss": 0.0, "step": 2646 }, { "epoch": 0.858086075949367, "grad_norm": 0.001091435202397406, "learning_rate": 4.6132409634993655e-06, "loss": 0.0, "step": 2648 }, { "epoch": 0.8587341772151899, "grad_norm": 0.0010871505364775658, "learning_rate": 4.606913640400118e-06, "loss": 0.0, "step": 2650 }, { "epoch": 0.8593822784810127, "grad_norm": 0.0015741437673568726, "learning_rate": 4.600586950641362e-06, "loss": 0.0, "step": 2652 }, { "epoch": 0.8600303797468355, "grad_norm": 0.0032445641700178385, "learning_rate": 4.594260904416656e-06, "loss": 0.0, "step": 2654 }, { "epoch": 0.8606784810126582, "grad_norm": 0.03352479264140129, "learning_rate": 4.587935511918521e-06, "loss": 0.0, "step": 2656 }, { "epoch": 0.861326582278481, "grad_norm": 0.0031426376663148403, "learning_rate": 4.581610783338424e-06, "loss": 0.0, "step": 2658 }, { "epoch": 0.8619746835443038, "grad_norm": 0.0009672287269495428, "learning_rate": 4.575286728866765e-06, "loss": 0.0, "step": 2660 }, { "epoch": 0.8626227848101266, "grad_norm": 0.022923622280359268, "learning_rate": 4.568963358692856e-06, "loss": 0.0, "step": 2662 }, { "epoch": 0.8632708860759494, "grad_norm": 0.04482516646385193, "learning_rate": 4.562640683004907e-06, "loss": 0.0, "step": 2664 }, { "epoch": 0.8639189873417722, "grad_norm": 0.0032297135330736637, "learning_rate": 4.55631871199001e-06, "loss": 0.0, "step": 2666 }, { "epoch": 0.8645670886075949, "grad_norm": 0.0007287651533260942, "learning_rate": 4.549997455834121e-06, "loss": 0.0, "step": 2668 }, { "epoch": 0.8652151898734177, "grad_norm": 0.00027854496147483587, "learning_rate": 4.543676924722042e-06, "loss": 0.0001, "step": 2670 }, { "epoch": 0.8658632911392405, "grad_norm": 0.0007697210530750453, "learning_rate": 4.53735712883741e-06, "loss": 0.0, "step": 2672 }, { "epoch": 0.8665113924050633, "grad_norm": 0.12043469399213791, "learning_rate": 4.531038078362675e-06, "loss": 0.0001, "step": 2674 }, { "epoch": 0.8671594936708861, "grad_norm": 0.0005098318797536194, "learning_rate": 4.524719783479088e-06, "loss": 0.0, "step": 2676 }, { "epoch": 0.8678075949367089, "grad_norm": 0.0008214846020564437, "learning_rate": 4.518402254366681e-06, "loss": 0.0, "step": 2678 }, { "epoch": 0.8684556962025316, "grad_norm": 0.0033130354713648558, "learning_rate": 4.512085501204254e-06, "loss": 0.0002, "step": 2680 }, { "epoch": 0.8684556962025316, "eval_accuracy": 0.9999965564468379, "eval_loss": 1.2069258445990272e-05, "eval_runtime": 106.3063, "eval_samples_per_second": 47.034, "eval_steps_per_second": 11.758, "step": 2680 }, { "epoch": 0.8691037974683544, "grad_norm": 0.0812884122133255, "learning_rate": 4.505769534169354e-06, "loss": 0.0005, "step": 2682 }, { "epoch": 0.8697518987341772, "grad_norm": 0.001252077054232359, "learning_rate": 4.499454363438265e-06, "loss": 0.0, "step": 2684 }, { "epoch": 0.8704, "grad_norm": 0.0012147113448008895, "learning_rate": 4.493139999185984e-06, "loss": 0.0, "step": 2686 }, { "epoch": 0.8710481012658228, "grad_norm": 0.00036030332557857037, "learning_rate": 4.4868264515862115e-06, "loss": 0.0, "step": 2688 }, { "epoch": 0.8716962025316456, "grad_norm": 0.0011000720551237464, "learning_rate": 4.480513730811332e-06, "loss": 0.0, "step": 2690 }, { "epoch": 0.8723443037974683, "grad_norm": 0.002217898378148675, "learning_rate": 4.474201847032396e-06, "loss": 0.0, "step": 2692 }, { "epoch": 0.8729924050632911, "grad_norm": 0.05367765948176384, "learning_rate": 4.467890810419108e-06, "loss": 0.0, "step": 2694 }, { "epoch": 0.8736405063291139, "grad_norm": 0.013308125548064709, "learning_rate": 4.461580631139806e-06, "loss": 0.0, "step": 2696 }, { "epoch": 0.8742886075949368, "grad_norm": 0.08926782011985779, "learning_rate": 4.455271319361445e-06, "loss": 0.0001, "step": 2698 }, { "epoch": 0.8749367088607595, "grad_norm": 0.0002369306021137163, "learning_rate": 4.448962885249587e-06, "loss": 0.0001, "step": 2700 }, { "epoch": 0.8755848101265823, "grad_norm": 0.009098300710320473, "learning_rate": 4.442655338968373e-06, "loss": 0.0, "step": 2702 }, { "epoch": 0.8762329113924051, "grad_norm": 0.13955150544643402, "learning_rate": 4.436348690680521e-06, "loss": 0.0003, "step": 2704 }, { "epoch": 0.8768810126582278, "grad_norm": 0.005642596632242203, "learning_rate": 4.430042950547298e-06, "loss": 0.0, "step": 2706 }, { "epoch": 0.8775291139240506, "grad_norm": 0.020765792578458786, "learning_rate": 4.423738128728507e-06, "loss": 0.0, "step": 2708 }, { "epoch": 0.8781772151898735, "grad_norm": 0.0027800104580819607, "learning_rate": 4.417434235382474e-06, "loss": 0.0, "step": 2710 }, { "epoch": 0.8788253164556962, "grad_norm": 0.007515923120081425, "learning_rate": 4.411131280666028e-06, "loss": 0.0, "step": 2712 }, { "epoch": 0.879473417721519, "grad_norm": 0.011895989999175072, "learning_rate": 4.404829274734485e-06, "loss": 0.0, "step": 2714 }, { "epoch": 0.8801215189873418, "grad_norm": 0.002676513744518161, "learning_rate": 4.398528227741634e-06, "loss": 0.0, "step": 2716 }, { "epoch": 0.8807696202531645, "grad_norm": 0.024730712175369263, "learning_rate": 4.392228149839716e-06, "loss": 0.0, "step": 2718 }, { "epoch": 0.8814177215189873, "grad_norm": 0.006555337458848953, "learning_rate": 4.3859290511794145e-06, "loss": 0.0, "step": 2720 }, { "epoch": 0.8814177215189873, "eval_accuracy": 0.9999992141332499, "eval_loss": 6.6279590100748464e-06, "eval_runtime": 105.6669, "eval_samples_per_second": 47.318, "eval_steps_per_second": 11.83, "step": 2720 }, { "epoch": 0.8820658227848102, "grad_norm": 0.0008663420449011028, "learning_rate": 4.379630941909832e-06, "loss": 0.0, "step": 2722 }, { "epoch": 0.8827139240506329, "grad_norm": 0.00045760456123389304, "learning_rate": 4.373333832178478e-06, "loss": 0.0, "step": 2724 }, { "epoch": 0.8833620253164557, "grad_norm": 0.0008591710939072073, "learning_rate": 4.367037732131254e-06, "loss": 0.0, "step": 2726 }, { "epoch": 0.8840101265822785, "grad_norm": 0.005888012237846851, "learning_rate": 4.360742651912428e-06, "loss": 0.0, "step": 2728 }, { "epoch": 0.8846582278481012, "grad_norm": 0.00036615473800338805, "learning_rate": 4.3544486016646335e-06, "loss": 0.0, "step": 2730 }, { "epoch": 0.885306329113924, "grad_norm": 0.00028013926930725574, "learning_rate": 4.34815559152884e-06, "loss": 0.0, "step": 2732 }, { "epoch": 0.8859544303797469, "grad_norm": 0.004798305220901966, "learning_rate": 4.341863631644337e-06, "loss": 0.0, "step": 2734 }, { "epoch": 0.8866025316455696, "grad_norm": 0.20881663262844086, "learning_rate": 4.33557273214873e-06, "loss": 0.0, "step": 2736 }, { "epoch": 0.8872506329113924, "grad_norm": 0.000936617492698133, "learning_rate": 4.329282903177911e-06, "loss": 0.0, "step": 2738 }, { "epoch": 0.8878987341772152, "grad_norm": 0.00045031809713691473, "learning_rate": 4.32299415486605e-06, "loss": 0.0, "step": 2740 }, { "epoch": 0.888546835443038, "grad_norm": 0.0002908929600380361, "learning_rate": 4.316706497345572e-06, "loss": 0.0002, "step": 2742 }, { "epoch": 0.8891949367088607, "grad_norm": 0.0002155246038455516, "learning_rate": 4.3104199407471485e-06, "loss": 0.0, "step": 2744 }, { "epoch": 0.8898430379746836, "grad_norm": 0.0013598958030343056, "learning_rate": 4.304134495199675e-06, "loss": 0.0, "step": 2746 }, { "epoch": 0.8904911392405064, "grad_norm": 0.012999402359127998, "learning_rate": 4.297850170830255e-06, "loss": 0.0, "step": 2748 }, { "epoch": 0.8911392405063291, "grad_norm": 0.0005251656984910369, "learning_rate": 4.291566977764192e-06, "loss": 0.0, "step": 2750 }, { "epoch": 0.8917873417721519, "grad_norm": 0.006405015476047993, "learning_rate": 4.285284926124959e-06, "loss": 0.0, "step": 2752 }, { "epoch": 0.8924354430379747, "grad_norm": 0.046361591666936874, "learning_rate": 4.279004026034196e-06, "loss": 0.0001, "step": 2754 }, { "epoch": 0.8930835443037974, "grad_norm": 0.004697976168245077, "learning_rate": 4.272724287611684e-06, "loss": 0.0001, "step": 2756 }, { "epoch": 0.8937316455696203, "grad_norm": 0.0007214845390990376, "learning_rate": 4.266445720975334e-06, "loss": 0.0, "step": 2758 }, { "epoch": 0.8943797468354431, "grad_norm": 0.0014260972384363413, "learning_rate": 4.260168336241169e-06, "loss": 0.0, "step": 2760 }, { "epoch": 0.8943797468354431, "eval_accuracy": 0.9999988370301384, "eval_loss": 7.5536236181505956e-06, "eval_runtime": 106.0073, "eval_samples_per_second": 47.167, "eval_steps_per_second": 11.792, "step": 2760 }, { "epoch": 0.8950278481012658, "grad_norm": 0.0002829917357303202, "learning_rate": 4.253892143523306e-06, "loss": 0.0, "step": 2762 }, { "epoch": 0.8956759493670886, "grad_norm": 0.0004675632808357477, "learning_rate": 4.247617152933944e-06, "loss": 0.0, "step": 2764 }, { "epoch": 0.8963240506329114, "grad_norm": 0.004662278573960066, "learning_rate": 4.241343374583343e-06, "loss": 0.0, "step": 2766 }, { "epoch": 0.8969721518987341, "grad_norm": 0.003993092570453882, "learning_rate": 4.23507081857981e-06, "loss": 0.0, "step": 2768 }, { "epoch": 0.897620253164557, "grad_norm": 0.0005434814374893904, "learning_rate": 4.228799495029685e-06, "loss": 0.0, "step": 2770 }, { "epoch": 0.8982683544303798, "grad_norm": 0.002016532002016902, "learning_rate": 4.222529414037319e-06, "loss": 0.0, "step": 2772 }, { "epoch": 0.8989164556962025, "grad_norm": 0.0012540030293166637, "learning_rate": 4.2162605857050645e-06, "loss": 0.0, "step": 2774 }, { "epoch": 0.8995645569620253, "grad_norm": 0.0007384781492874026, "learning_rate": 4.209993020133251e-06, "loss": 0.0, "step": 2776 }, { "epoch": 0.9002126582278481, "grad_norm": 0.000652509625069797, "learning_rate": 4.2037267274201785e-06, "loss": 0.0, "step": 2778 }, { "epoch": 0.9008607594936708, "grad_norm": 0.0025625203270465136, "learning_rate": 4.197461717662092e-06, "loss": 0.0, "step": 2780 }, { "epoch": 0.9015088607594937, "grad_norm": 0.11876733601093292, "learning_rate": 4.191198000953171e-06, "loss": 0.0, "step": 2782 }, { "epoch": 0.9021569620253165, "grad_norm": 0.0005562319420278072, "learning_rate": 4.1849355873855135e-06, "loss": 0.0, "step": 2784 }, { "epoch": 0.9028050632911392, "grad_norm": 0.00024305633269250393, "learning_rate": 4.178674487049116e-06, "loss": 0.0, "step": 2786 }, { "epoch": 0.903453164556962, "grad_norm": 0.0007249072659760714, "learning_rate": 4.172414710031858e-06, "loss": 0.0, "step": 2788 }, { "epoch": 0.9041012658227848, "grad_norm": 0.0016171886818483472, "learning_rate": 4.166156266419489e-06, "loss": 0.0, "step": 2790 }, { "epoch": 0.9047493670886076, "grad_norm": 0.0005725118098780513, "learning_rate": 4.15989916629561e-06, "loss": 0.0, "step": 2792 }, { "epoch": 0.9053974683544304, "grad_norm": 0.0003646530385594815, "learning_rate": 4.153643419741656e-06, "loss": 0.0, "step": 2794 }, { "epoch": 0.9060455696202532, "grad_norm": 0.06287111341953278, "learning_rate": 4.147389036836881e-06, "loss": 0.0, "step": 2796 }, { "epoch": 0.906693670886076, "grad_norm": 0.0005466180737130344, "learning_rate": 4.141136027658345e-06, "loss": 0.0, "step": 2798 }, { "epoch": 0.9073417721518987, "grad_norm": 0.008458413183689117, "learning_rate": 4.13488440228089e-06, "loss": 0.0, "step": 2800 }, { "epoch": 0.9073417721518987, "eval_accuracy": 0.9999996161228406, "eval_loss": 4.550272933556698e-06, "eval_runtime": 106.6358, "eval_samples_per_second": 46.889, "eval_steps_per_second": 11.722, "step": 2800 } ], "logging_steps": 2, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.93561237640446e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }