{ "best_global_step": 1280, "best_metric": 1.0, "best_model_checkpoint": "/projects/bffw/darora1/llm_ipc/final_models/mpi_async_n4/checkpoint-1280", "epoch": 0.6287425149700598, "eval_steps": 40, "global_step": 1680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007485029940119761, "grad_norm": 7.328390598297119, "learning_rate": 2.0000000000000002e-07, "loss": 0.48, "step": 2 }, { "epoch": 0.0014970059880239522, "grad_norm": 7.235108852386475, "learning_rate": 6.000000000000001e-07, "loss": 0.4252, "step": 4 }, { "epoch": 0.002245508982035928, "grad_norm": 8.011260986328125, "learning_rate": 1.0000000000000002e-06, "loss": 0.4299, "step": 6 }, { "epoch": 0.0029940119760479044, "grad_norm": 6.425393104553223, "learning_rate": 1.4000000000000001e-06, "loss": 0.4424, "step": 8 }, { "epoch": 0.0037425149700598802, "grad_norm": 6.826442241668701, "learning_rate": 1.8000000000000001e-06, "loss": 0.4549, "step": 10 }, { "epoch": 0.004491017964071856, "grad_norm": 4.996034622192383, "learning_rate": 2.2e-06, "loss": 0.3498, "step": 12 }, { "epoch": 0.005239520958083832, "grad_norm": 4.402273654937744, "learning_rate": 2.6e-06, "loss": 0.3288, "step": 14 }, { "epoch": 0.005988023952095809, "grad_norm": 4.156887054443359, "learning_rate": 3e-06, "loss": 0.2507, "step": 16 }, { "epoch": 0.006736526946107785, "grad_norm": 2.647883176803589, "learning_rate": 3.4000000000000005e-06, "loss": 0.197, "step": 18 }, { "epoch": 0.0074850299401197605, "grad_norm": 2.444559097290039, "learning_rate": 3.8000000000000005e-06, "loss": 0.1474, "step": 20 }, { "epoch": 0.008233532934131737, "grad_norm": 1.8110377788543701, "learning_rate": 4.2000000000000004e-06, "loss": 0.1494, "step": 22 }, { "epoch": 0.008982035928143712, "grad_norm": 1.4763550758361816, "learning_rate": 4.600000000000001e-06, "loss": 0.107, "step": 24 }, { "epoch": 0.009730538922155689, "grad_norm": 1.2829464673995972, "learning_rate": 5e-06, "loss": 0.0854, "step": 26 }, { "epoch": 0.010479041916167664, "grad_norm": 1.1430706977844238, "learning_rate": 5.400000000000001e-06, "loss": 0.0597, "step": 28 }, { "epoch": 0.01122754491017964, "grad_norm": 1.3779264688491821, "learning_rate": 5.8e-06, "loss": 0.0642, "step": 30 }, { "epoch": 0.011976047904191617, "grad_norm": 0.9946982860565186, "learning_rate": 6.200000000000001e-06, "loss": 0.0398, "step": 32 }, { "epoch": 0.012724550898203593, "grad_norm": 1.1442718505859375, "learning_rate": 6.600000000000001e-06, "loss": 0.04, "step": 34 }, { "epoch": 0.01347305389221557, "grad_norm": 0.6475897431373596, "learning_rate": 7e-06, "loss": 0.0283, "step": 36 }, { "epoch": 0.014221556886227544, "grad_norm": 0.8502711057662964, "learning_rate": 7.4e-06, "loss": 0.0345, "step": 38 }, { "epoch": 0.014970059880239521, "grad_norm": 0.5282578468322754, "learning_rate": 7.800000000000002e-06, "loss": 0.0173, "step": 40 }, { "epoch": 0.014970059880239521, "eval_accuracy": 0.9910189955595328, "eval_loss": 0.02636127918958664, "eval_runtime": 156.2607, "eval_samples_per_second": 31.998, "eval_steps_per_second": 7.999, "step": 40 }, { "epoch": 0.015718562874251496, "grad_norm": 0.755415141582489, "learning_rate": 8.2e-06, "loss": 0.023, "step": 42 }, { "epoch": 0.016467065868263474, "grad_norm": 0.6637702584266663, "learning_rate": 8.6e-06, "loss": 0.0165, "step": 44 }, { "epoch": 0.01721556886227545, "grad_norm": 0.42257505655288696, "learning_rate": 9e-06, "loss": 0.0149, "step": 46 }, { "epoch": 0.017964071856287425, "grad_norm": 0.6686341166496277, "learning_rate": 9.4e-06, "loss": 0.019, "step": 48 }, { "epoch": 0.0187125748502994, "grad_norm": 0.5314021110534668, "learning_rate": 9.800000000000001e-06, "loss": 0.0169, "step": 50 }, { "epoch": 0.019461077844311378, "grad_norm": 0.39661431312561035, "learning_rate": 9.999998993000299e-06, "loss": 0.0152, "step": 52 }, { "epoch": 0.020209580838323353, "grad_norm": 0.571976900100708, "learning_rate": 9.999990937005126e-06, "loss": 0.0139, "step": 54 }, { "epoch": 0.020958083832335328, "grad_norm": 0.5158469676971436, "learning_rate": 9.999974825027756e-06, "loss": 0.0092, "step": 56 }, { "epoch": 0.021706586826347306, "grad_norm": 0.7198213338851929, "learning_rate": 9.999950657094151e-06, "loss": 0.0113, "step": 58 }, { "epoch": 0.02245508982035928, "grad_norm": 0.48938679695129395, "learning_rate": 9.999918433243253e-06, "loss": 0.0085, "step": 60 }, { "epoch": 0.023203592814371257, "grad_norm": 0.5157604813575745, "learning_rate": 9.999878153526974e-06, "loss": 0.0114, "step": 62 }, { "epoch": 0.023952095808383235, "grad_norm": 0.510836124420166, "learning_rate": 9.99982981801022e-06, "loss": 0.009, "step": 64 }, { "epoch": 0.02470059880239521, "grad_norm": 0.34386318922042847, "learning_rate": 9.999773426770864e-06, "loss": 0.0102, "step": 66 }, { "epoch": 0.025449101796407185, "grad_norm": 0.31605905294418335, "learning_rate": 9.999708979899769e-06, "loss": 0.0095, "step": 68 }, { "epoch": 0.02619760479041916, "grad_norm": 0.6626418828964233, "learning_rate": 9.999636477500765e-06, "loss": 0.0079, "step": 70 }, { "epoch": 0.02694610778443114, "grad_norm": 0.49883756041526794, "learning_rate": 9.999555919690673e-06, "loss": 0.0065, "step": 72 }, { "epoch": 0.027694610778443114, "grad_norm": 0.3710748255252838, "learning_rate": 9.999467306599285e-06, "loss": 0.0055, "step": 74 }, { "epoch": 0.02844311377245509, "grad_norm": 0.33792468905448914, "learning_rate": 9.999370638369377e-06, "loss": 0.0065, "step": 76 }, { "epoch": 0.029191616766467067, "grad_norm": 0.33830082416534424, "learning_rate": 9.999265915156697e-06, "loss": 0.0067, "step": 78 }, { "epoch": 0.029940119760479042, "grad_norm": 0.3766763210296631, "learning_rate": 9.999153137129978e-06, "loss": 0.0054, "step": 80 }, { "epoch": 0.029940119760479042, "eval_accuracy": 0.9975511997129987, "eval_loss": 0.006336509715765715, "eval_runtime": 152.2618, "eval_samples_per_second": 32.838, "eval_steps_per_second": 8.21, "step": 80 }, { "epoch": 0.030688622754491017, "grad_norm": 0.3682909607887268, "learning_rate": 9.999032304470926e-06, "loss": 0.0052, "step": 82 }, { "epoch": 0.03143712574850299, "grad_norm": 0.48871251940727234, "learning_rate": 9.998903417374228e-06, "loss": 0.0065, "step": 84 }, { "epoch": 0.03218562874251497, "grad_norm": 0.4313011169433594, "learning_rate": 9.998766476047546e-06, "loss": 0.0056, "step": 86 }, { "epoch": 0.03293413173652695, "grad_norm": 0.3613654673099518, "learning_rate": 9.998621480711522e-06, "loss": 0.0034, "step": 88 }, { "epoch": 0.033682634730538924, "grad_norm": 0.39512524008750916, "learning_rate": 9.998468431599768e-06, "loss": 0.0039, "step": 90 }, { "epoch": 0.0344311377245509, "grad_norm": 0.26590684056282043, "learning_rate": 9.99830732895888e-06, "loss": 0.0041, "step": 92 }, { "epoch": 0.035179640718562874, "grad_norm": 0.29519563913345337, "learning_rate": 9.998138173048424e-06, "loss": 0.0048, "step": 94 }, { "epoch": 0.03592814371257485, "grad_norm": 0.8653535842895508, "learning_rate": 9.997960964140946e-06, "loss": 0.0037, "step": 96 }, { "epoch": 0.036676646706586824, "grad_norm": 0.5562458038330078, "learning_rate": 9.997775702521965e-06, "loss": 0.004, "step": 98 }, { "epoch": 0.0374251497005988, "grad_norm": 0.31169670820236206, "learning_rate": 9.997582388489975e-06, "loss": 0.004, "step": 100 }, { "epoch": 0.03817365269461078, "grad_norm": 0.3139854371547699, "learning_rate": 9.99738102235644e-06, "loss": 0.0032, "step": 102 }, { "epoch": 0.038922155688622756, "grad_norm": 0.4420141875743866, "learning_rate": 9.997171604445803e-06, "loss": 0.0037, "step": 104 }, { "epoch": 0.03967065868263473, "grad_norm": 0.46555566787719727, "learning_rate": 9.99695413509548e-06, "loss": 0.0034, "step": 106 }, { "epoch": 0.040419161676646706, "grad_norm": 0.2851720154285431, "learning_rate": 9.996728614655854e-06, "loss": 0.0029, "step": 108 }, { "epoch": 0.04116766467065868, "grad_norm": 0.3109307885169983, "learning_rate": 9.996495043490285e-06, "loss": 0.0029, "step": 110 }, { "epoch": 0.041916167664670656, "grad_norm": 0.37666594982147217, "learning_rate": 9.996253421975103e-06, "loss": 0.0038, "step": 112 }, { "epoch": 0.04266467065868264, "grad_norm": 0.5034800171852112, "learning_rate": 9.996003750499608e-06, "loss": 0.0032, "step": 114 }, { "epoch": 0.04341317365269461, "grad_norm": 0.3710559606552124, "learning_rate": 9.995746029466071e-06, "loss": 0.0022, "step": 116 }, { "epoch": 0.04416167664670659, "grad_norm": 0.4710935056209564, "learning_rate": 9.995480259289731e-06, "loss": 0.0025, "step": 118 }, { "epoch": 0.04491017964071856, "grad_norm": 0.31052565574645996, "learning_rate": 9.995206440398798e-06, "loss": 0.0024, "step": 120 }, { "epoch": 0.04491017964071856, "eval_accuracy": 0.9988225992721916, "eval_loss": 0.002896190620958805, "eval_runtime": 154.576, "eval_samples_per_second": 32.347, "eval_steps_per_second": 8.087, "step": 120 }, { "epoch": 0.04565868263473054, "grad_norm": 0.34983423352241516, "learning_rate": 9.994924573234448e-06, "loss": 0.0022, "step": 122 }, { "epoch": 0.04640718562874251, "grad_norm": 0.2754887640476227, "learning_rate": 9.994634658250825e-06, "loss": 0.0021, "step": 124 }, { "epoch": 0.04715568862275449, "grad_norm": 0.49522289633750916, "learning_rate": 9.994336695915041e-06, "loss": 0.0021, "step": 126 }, { "epoch": 0.04790419161676647, "grad_norm": 0.37913596630096436, "learning_rate": 9.994030686707171e-06, "loss": 0.002, "step": 128 }, { "epoch": 0.048652694610778445, "grad_norm": 0.3330959379673004, "learning_rate": 9.993716631120259e-06, "loss": 0.0017, "step": 130 }, { "epoch": 0.04940119760479042, "grad_norm": 0.2224518060684204, "learning_rate": 9.993394529660307e-06, "loss": 0.0018, "step": 132 }, { "epoch": 0.050149700598802395, "grad_norm": 0.2787413001060486, "learning_rate": 9.99306438284629e-06, "loss": 0.0015, "step": 134 }, { "epoch": 0.05089820359281437, "grad_norm": 0.43909233808517456, "learning_rate": 9.992726191210139e-06, "loss": 0.0023, "step": 136 }, { "epoch": 0.051646706586826345, "grad_norm": 0.1608552634716034, "learning_rate": 9.992379955296745e-06, "loss": 0.0012, "step": 138 }, { "epoch": 0.05239520958083832, "grad_norm": 0.34503915905952454, "learning_rate": 9.992025675663966e-06, "loss": 0.0018, "step": 140 }, { "epoch": 0.0531437125748503, "grad_norm": 0.17146268486976624, "learning_rate": 9.991663352882615e-06, "loss": 0.0013, "step": 142 }, { "epoch": 0.05389221556886228, "grad_norm": 0.47353699803352356, "learning_rate": 9.991292987536469e-06, "loss": 0.002, "step": 144 }, { "epoch": 0.05464071856287425, "grad_norm": 0.10907532274723053, "learning_rate": 9.990914580222258e-06, "loss": 0.001, "step": 146 }, { "epoch": 0.05538922155688623, "grad_norm": 0.195388525724411, "learning_rate": 9.990528131549674e-06, "loss": 0.0013, "step": 148 }, { "epoch": 0.0561377245508982, "grad_norm": 0.124148428440094, "learning_rate": 9.990133642141359e-06, "loss": 0.0007, "step": 150 }, { "epoch": 0.05688622754491018, "grad_norm": 0.3281680643558502, "learning_rate": 9.989731112632917e-06, "loss": 0.0018, "step": 152 }, { "epoch": 0.05763473053892216, "grad_norm": 0.3646385669708252, "learning_rate": 9.989320543672904e-06, "loss": 0.0014, "step": 154 }, { "epoch": 0.058383233532934134, "grad_norm": 0.20738907158374786, "learning_rate": 9.988901935922826e-06, "loss": 0.0012, "step": 156 }, { "epoch": 0.05913173652694611, "grad_norm": 0.19206871092319489, "learning_rate": 9.988475290057145e-06, "loss": 0.0008, "step": 158 }, { "epoch": 0.059880239520958084, "grad_norm": 0.4680192470550537, "learning_rate": 9.988040606763272e-06, "loss": 0.0011, "step": 160 }, { "epoch": 0.059880239520958084, "eval_accuracy": 0.9995344699843772, "eval_loss": 0.0012882280861958861, "eval_runtime": 155.4759, "eval_samples_per_second": 32.159, "eval_steps_per_second": 8.04, "step": 160 }, { "epoch": 0.06062874251497006, "grad_norm": 0.10511677712202072, "learning_rate": 9.98759788674157e-06, "loss": 0.0006, "step": 162 }, { "epoch": 0.061377245508982034, "grad_norm": 0.264397531747818, "learning_rate": 9.987147130705347e-06, "loss": 0.0008, "step": 164 }, { "epoch": 0.06212574850299401, "grad_norm": 0.15092360973358154, "learning_rate": 9.986688339380863e-06, "loss": 0.001, "step": 166 }, { "epoch": 0.06287425149700598, "grad_norm": 0.23679876327514648, "learning_rate": 9.98622151350732e-06, "loss": 0.0009, "step": 168 }, { "epoch": 0.06362275449101797, "grad_norm": 0.3080887198448181, "learning_rate": 9.985746653836867e-06, "loss": 0.0015, "step": 170 }, { "epoch": 0.06437125748502993, "grad_norm": 0.13096538186073303, "learning_rate": 9.985263761134602e-06, "loss": 0.001, "step": 172 }, { "epoch": 0.06511976047904192, "grad_norm": 0.27316954731941223, "learning_rate": 9.984772836178559e-06, "loss": 0.0008, "step": 174 }, { "epoch": 0.0658682634730539, "grad_norm": 0.314272940158844, "learning_rate": 9.984273879759713e-06, "loss": 0.0017, "step": 176 }, { "epoch": 0.06661676646706587, "grad_norm": 0.20915231108665466, "learning_rate": 9.983766892681985e-06, "loss": 0.0012, "step": 178 }, { "epoch": 0.06736526946107785, "grad_norm": 0.18497829139232635, "learning_rate": 9.983251875762234e-06, "loss": 0.0008, "step": 180 }, { "epoch": 0.06811377245508982, "grad_norm": 0.20126977562904358, "learning_rate": 9.982728829830252e-06, "loss": 0.0008, "step": 182 }, { "epoch": 0.0688622754491018, "grad_norm": 0.15316377580165863, "learning_rate": 9.982197755728771e-06, "loss": 0.001, "step": 184 }, { "epoch": 0.06961077844311377, "grad_norm": 0.14749199151992798, "learning_rate": 9.981658654313458e-06, "loss": 0.0005, "step": 186 }, { "epoch": 0.07035928143712575, "grad_norm": 0.25107651948928833, "learning_rate": 9.981111526452912e-06, "loss": 0.0011, "step": 188 }, { "epoch": 0.07110778443113773, "grad_norm": 0.07325785607099533, "learning_rate": 9.980556373028665e-06, "loss": 0.0004, "step": 190 }, { "epoch": 0.0718562874251497, "grad_norm": 0.11805955320596695, "learning_rate": 9.979993194935182e-06, "loss": 0.0005, "step": 192 }, { "epoch": 0.07260479041916168, "grad_norm": 0.19970782101154327, "learning_rate": 9.979421993079853e-06, "loss": 0.0008, "step": 194 }, { "epoch": 0.07335329341317365, "grad_norm": 0.24476714432239532, "learning_rate": 9.978842768382999e-06, "loss": 0.0005, "step": 196 }, { "epoch": 0.07410179640718563, "grad_norm": 0.12824182212352753, "learning_rate": 9.978255521777865e-06, "loss": 0.0004, "step": 198 }, { "epoch": 0.0748502994011976, "grad_norm": 0.08068165183067322, "learning_rate": 9.977660254210623e-06, "loss": 0.0004, "step": 200 }, { "epoch": 0.0748502994011976, "eval_accuracy": 0.9997569708964628, "eval_loss": 0.000611252966336906, "eval_runtime": 156.7213, "eval_samples_per_second": 31.904, "eval_steps_per_second": 7.976, "step": 200 }, { "epoch": 0.07559880239520958, "grad_norm": 0.08569593727588654, "learning_rate": 9.977056966640368e-06, "loss": 0.0005, "step": 202 }, { "epoch": 0.07634730538922156, "grad_norm": 0.10873577743768692, "learning_rate": 9.976445660039118e-06, "loss": 0.0003, "step": 204 }, { "epoch": 0.07709580838323353, "grad_norm": 0.06685052067041397, "learning_rate": 9.975826335391808e-06, "loss": 0.0004, "step": 206 }, { "epoch": 0.07784431137724551, "grad_norm": 0.171136736869812, "learning_rate": 9.975198993696294e-06, "loss": 0.0005, "step": 208 }, { "epoch": 0.07859281437125748, "grad_norm": 0.2799069881439209, "learning_rate": 9.974563635963348e-06, "loss": 0.0009, "step": 210 }, { "epoch": 0.07934131736526946, "grad_norm": 0.09249293059110641, "learning_rate": 9.973920263216658e-06, "loss": 0.0005, "step": 212 }, { "epoch": 0.08008982035928144, "grad_norm": 0.19255271553993225, "learning_rate": 9.973268876492827e-06, "loss": 0.0004, "step": 214 }, { "epoch": 0.08083832335329341, "grad_norm": 0.1604669839143753, "learning_rate": 9.972609476841368e-06, "loss": 0.0004, "step": 216 }, { "epoch": 0.0815868263473054, "grad_norm": 0.08825163543224335, "learning_rate": 9.971942065324704e-06, "loss": 0.0007, "step": 218 }, { "epoch": 0.08233532934131736, "grad_norm": 0.2524869441986084, "learning_rate": 9.971266643018171e-06, "loss": 0.0006, "step": 220 }, { "epoch": 0.08308383233532934, "grad_norm": 0.10447513312101364, "learning_rate": 9.970583211010008e-06, "loss": 0.0006, "step": 222 }, { "epoch": 0.08383233532934131, "grad_norm": 0.17385387420654297, "learning_rate": 9.969891770401358e-06, "loss": 0.0003, "step": 224 }, { "epoch": 0.0845808383233533, "grad_norm": 0.0575445182621479, "learning_rate": 9.969192322306271e-06, "loss": 0.0002, "step": 226 }, { "epoch": 0.08532934131736528, "grad_norm": 0.20742414891719818, "learning_rate": 9.968484867851698e-06, "loss": 0.0004, "step": 228 }, { "epoch": 0.08607784431137724, "grad_norm": 0.22014112770557404, "learning_rate": 9.96776940817749e-06, "loss": 0.0005, "step": 230 }, { "epoch": 0.08682634730538923, "grad_norm": 0.1331041306257248, "learning_rate": 9.967045944436392e-06, "loss": 0.0004, "step": 232 }, { "epoch": 0.0875748502994012, "grad_norm": 0.14387176930904388, "learning_rate": 9.966314477794052e-06, "loss": 0.0006, "step": 234 }, { "epoch": 0.08832335329341318, "grad_norm": 0.1632365584373474, "learning_rate": 9.965575009429006e-06, "loss": 0.0003, "step": 236 }, { "epoch": 0.08907185628742514, "grad_norm": 0.1252838671207428, "learning_rate": 9.964827540532685e-06, "loss": 0.0005, "step": 238 }, { "epoch": 0.08982035928143713, "grad_norm": 0.08947388827800751, "learning_rate": 9.964072072309412e-06, "loss": 0.0004, "step": 240 }, { "epoch": 0.08982035928143713, "eval_accuracy": 0.99981676586026, "eval_loss": 0.000518214248586446, "eval_runtime": 154.0762, "eval_samples_per_second": 32.451, "eval_steps_per_second": 8.113, "step": 240 }, { "epoch": 0.09056886227544911, "grad_norm": 0.1822632998228073, "learning_rate": 9.963308605976397e-06, "loss": 0.0003, "step": 242 }, { "epoch": 0.09131736526946108, "grad_norm": 0.1965271681547165, "learning_rate": 9.962537142763733e-06, "loss": 0.0003, "step": 244 }, { "epoch": 0.09206586826347306, "grad_norm": 0.12774410843849182, "learning_rate": 9.961757683914406e-06, "loss": 0.0004, "step": 246 }, { "epoch": 0.09281437125748503, "grad_norm": 0.06404659152030945, "learning_rate": 9.960970230684276e-06, "loss": 0.0003, "step": 248 }, { "epoch": 0.09356287425149701, "grad_norm": 0.07961199432611465, "learning_rate": 9.96017478434209e-06, "loss": 0.0002, "step": 250 }, { "epoch": 0.09431137724550898, "grad_norm": 0.07755598425865173, "learning_rate": 9.959371346169466e-06, "loss": 0.0001, "step": 252 }, { "epoch": 0.09505988023952096, "grad_norm": 0.10230294615030289, "learning_rate": 9.958559917460909e-06, "loss": 0.0004, "step": 254 }, { "epoch": 0.09580838323353294, "grad_norm": 0.4232734441757202, "learning_rate": 9.957740499523787e-06, "loss": 0.0002, "step": 256 }, { "epoch": 0.09655688622754491, "grad_norm": 0.45036637783050537, "learning_rate": 9.95691309367835e-06, "loss": 0.0006, "step": 258 }, { "epoch": 0.09730538922155689, "grad_norm": 0.2974064350128174, "learning_rate": 9.95607770125771e-06, "loss": 0.0006, "step": 260 }, { "epoch": 0.09805389221556886, "grad_norm": 0.12492769956588745, "learning_rate": 9.955234323607854e-06, "loss": 0.0005, "step": 262 }, { "epoch": 0.09880239520958084, "grad_norm": 0.08176768571138382, "learning_rate": 9.954382962087628e-06, "loss": 0.0003, "step": 264 }, { "epoch": 0.09955089820359281, "grad_norm": 0.11267261207103729, "learning_rate": 9.95352361806875e-06, "loss": 0.0004, "step": 266 }, { "epoch": 0.10029940119760479, "grad_norm": 0.07069454342126846, "learning_rate": 9.95265629293579e-06, "loss": 0.0002, "step": 268 }, { "epoch": 0.10104790419161677, "grad_norm": 0.13988761603832245, "learning_rate": 9.951780988086183e-06, "loss": 0.0004, "step": 270 }, { "epoch": 0.10179640718562874, "grad_norm": 0.07328484207391739, "learning_rate": 9.950897704930223e-06, "loss": 0.0002, "step": 272 }, { "epoch": 0.10254491017964072, "grad_norm": 0.1726737767457962, "learning_rate": 9.95000644489105e-06, "loss": 0.0003, "step": 274 }, { "epoch": 0.10329341317365269, "grad_norm": 0.189790740609169, "learning_rate": 9.949107209404664e-06, "loss": 0.0005, "step": 276 }, { "epoch": 0.10404191616766467, "grad_norm": 0.08902551233768463, "learning_rate": 9.948199999919914e-06, "loss": 0.0001, "step": 278 }, { "epoch": 0.10479041916167664, "grad_norm": 0.10343684256076813, "learning_rate": 9.947284817898493e-06, "loss": 0.0002, "step": 280 }, { "epoch": 0.10479041916167664, "eval_accuracy": 0.9998052416354287, "eval_loss": 0.0006224001408554614, "eval_runtime": 156.154, "eval_samples_per_second": 32.02, "eval_steps_per_second": 8.005, "step": 280 }, { "epoch": 0.10553892215568862, "grad_norm": 0.20946663618087769, "learning_rate": 9.946361664814942e-06, "loss": 0.0007, "step": 282 }, { "epoch": 0.1062874251497006, "grad_norm": 0.024475887417793274, "learning_rate": 9.945430542156647e-06, "loss": 0.0001, "step": 284 }, { "epoch": 0.10703592814371257, "grad_norm": 0.12402810901403427, "learning_rate": 9.944491451423829e-06, "loss": 0.0003, "step": 286 }, { "epoch": 0.10778443113772455, "grad_norm": 0.3434118330478668, "learning_rate": 9.943544394129552e-06, "loss": 0.0004, "step": 288 }, { "epoch": 0.10853293413173652, "grad_norm": 0.21301892399787903, "learning_rate": 9.942589371799715e-06, "loss": 0.0003, "step": 290 }, { "epoch": 0.1092814371257485, "grad_norm": 0.2948126196861267, "learning_rate": 9.941626385973047e-06, "loss": 0.0006, "step": 292 }, { "epoch": 0.11002994011976049, "grad_norm": 0.1591068059206009, "learning_rate": 9.940655438201113e-06, "loss": 0.0003, "step": 294 }, { "epoch": 0.11077844311377245, "grad_norm": 0.04139701649546623, "learning_rate": 9.9396765300483e-06, "loss": 0.0002, "step": 296 }, { "epoch": 0.11152694610778444, "grad_norm": 0.11029073596000671, "learning_rate": 9.938689663091828e-06, "loss": 0.0003, "step": 298 }, { "epoch": 0.1122754491017964, "grad_norm": 0.0646573156118393, "learning_rate": 9.937694838921734e-06, "loss": 0.0002, "step": 300 }, { "epoch": 0.11302395209580839, "grad_norm": 0.14302918314933777, "learning_rate": 9.93669205914088e-06, "loss": 0.0003, "step": 302 }, { "epoch": 0.11377245508982035, "grad_norm": 0.17884957790374756, "learning_rate": 9.93568132536494e-06, "loss": 0.0004, "step": 304 }, { "epoch": 0.11452095808383234, "grad_norm": 0.09195531904697418, "learning_rate": 9.934662639222412e-06, "loss": 0.0002, "step": 306 }, { "epoch": 0.11526946107784432, "grad_norm": 0.2769736647605896, "learning_rate": 9.9336360023546e-06, "loss": 0.0003, "step": 308 }, { "epoch": 0.11601796407185629, "grad_norm": 0.029257414862513542, "learning_rate": 9.932601416415622e-06, "loss": 0.0003, "step": 310 }, { "epoch": 0.11676646706586827, "grad_norm": 0.08587785065174103, "learning_rate": 9.931558883072403e-06, "loss": 0.0004, "step": 312 }, { "epoch": 0.11751497005988024, "grad_norm": 0.20471642911434174, "learning_rate": 9.930508404004668e-06, "loss": 0.0004, "step": 314 }, { "epoch": 0.11826347305389222, "grad_norm": 0.22900666296482086, "learning_rate": 9.929449980904952e-06, "loss": 0.0006, "step": 316 }, { "epoch": 0.11901197604790419, "grad_norm": 0.16436566412448883, "learning_rate": 9.928383615478586e-06, "loss": 0.0003, "step": 318 }, { "epoch": 0.11976047904191617, "grad_norm": 0.05877704173326492, "learning_rate": 9.927309309443696e-06, "loss": 0.0001, "step": 320 }, { "epoch": 0.11976047904191617, "eval_accuracy": 0.9999357040300619, "eval_loss": 0.00022764925961382687, "eval_runtime": 158.1146, "eval_samples_per_second": 31.623, "eval_steps_per_second": 7.906, "step": 320 }, { "epoch": 0.12050898203592815, "grad_norm": 0.261000394821167, "learning_rate": 9.9262270645312e-06, "loss": 0.0003, "step": 322 }, { "epoch": 0.12125748502994012, "grad_norm": 0.17999576032161713, "learning_rate": 9.925136882484816e-06, "loss": 0.0003, "step": 324 }, { "epoch": 0.1220059880239521, "grad_norm": 0.15744219720363617, "learning_rate": 9.924038765061042e-06, "loss": 0.0006, "step": 326 }, { "epoch": 0.12275449101796407, "grad_norm": 0.031700655817985535, "learning_rate": 9.922932714029163e-06, "loss": 0.0004, "step": 328 }, { "epoch": 0.12350299401197605, "grad_norm": 0.2377641499042511, "learning_rate": 9.921818731171249e-06, "loss": 0.0003, "step": 330 }, { "epoch": 0.12425149700598802, "grad_norm": 0.08403676003217697, "learning_rate": 9.920696818282147e-06, "loss": 0.0002, "step": 332 }, { "epoch": 0.125, "grad_norm": 0.1424562782049179, "learning_rate": 9.919566977169486e-06, "loss": 0.0004, "step": 334 }, { "epoch": 0.12574850299401197, "grad_norm": 0.0928482636809349, "learning_rate": 9.918429209653662e-06, "loss": 0.0002, "step": 336 }, { "epoch": 0.12649700598802396, "grad_norm": 0.08917529135942459, "learning_rate": 9.917283517567845e-06, "loss": 0.0004, "step": 338 }, { "epoch": 0.12724550898203593, "grad_norm": 0.09952011704444885, "learning_rate": 9.916129902757977e-06, "loss": 0.0003, "step": 340 }, { "epoch": 0.1279940119760479, "grad_norm": 0.05392898619174957, "learning_rate": 9.914968367082756e-06, "loss": 0.0001, "step": 342 }, { "epoch": 0.12874251497005987, "grad_norm": 0.12771159410476685, "learning_rate": 9.913798912413653e-06, "loss": 0.0002, "step": 344 }, { "epoch": 0.12949101796407186, "grad_norm": 0.9677438735961914, "learning_rate": 9.912621540634889e-06, "loss": 0.0003, "step": 346 }, { "epoch": 0.13023952095808383, "grad_norm": 0.03891558572649956, "learning_rate": 9.911436253643445e-06, "loss": 0.0001, "step": 348 }, { "epoch": 0.1309880239520958, "grad_norm": 0.03757692128419876, "learning_rate": 9.910243053349055e-06, "loss": 0.0, "step": 350 }, { "epoch": 0.1317365269461078, "grad_norm": 0.20588494837284088, "learning_rate": 9.909041941674205e-06, "loss": 0.0004, "step": 352 }, { "epoch": 0.13248502994011976, "grad_norm": 0.29803666472435, "learning_rate": 9.90783292055412e-06, "loss": 0.0004, "step": 354 }, { "epoch": 0.13323353293413173, "grad_norm": 0.14101789891719818, "learning_rate": 9.906615991936781e-06, "loss": 0.0002, "step": 356 }, { "epoch": 0.1339820359281437, "grad_norm": 0.24130620062351227, "learning_rate": 9.905391157782897e-06, "loss": 0.0002, "step": 358 }, { "epoch": 0.1347305389221557, "grad_norm": 0.2917313575744629, "learning_rate": 9.904158420065923e-06, "loss": 0.0005, "step": 360 }, { "epoch": 0.1347305389221557, "eval_accuracy": 0.9999076577000782, "eval_loss": 0.0005576053517870605, "eval_runtime": 155.2214, "eval_samples_per_second": 32.212, "eval_steps_per_second": 8.053, "step": 360 }, { "epoch": 0.13547904191616766, "grad_norm": 0.12759952247142792, "learning_rate": 9.902917780772043e-06, "loss": 0.0003, "step": 362 }, { "epoch": 0.13622754491017963, "grad_norm": 0.1657952070236206, "learning_rate": 9.901669241900178e-06, "loss": 0.0007, "step": 364 }, { "epoch": 0.13697604790419163, "grad_norm": 0.10384248197078705, "learning_rate": 9.900412805461968e-06, "loss": 0.0005, "step": 366 }, { "epoch": 0.1377245508982036, "grad_norm": 0.20811188220977783, "learning_rate": 9.899148473481786e-06, "loss": 0.0006, "step": 368 }, { "epoch": 0.13847305389221556, "grad_norm": 0.051202207803726196, "learning_rate": 9.89787624799672e-06, "loss": 0.0003, "step": 370 }, { "epoch": 0.13922155688622753, "grad_norm": 0.13106031715869904, "learning_rate": 9.896596131056583e-06, "loss": 0.0002, "step": 372 }, { "epoch": 0.13997005988023953, "grad_norm": 0.1166054904460907, "learning_rate": 9.895308124723897e-06, "loss": 0.0003, "step": 374 }, { "epoch": 0.1407185628742515, "grad_norm": 0.10474357008934021, "learning_rate": 9.894012231073895e-06, "loss": 0.0003, "step": 376 }, { "epoch": 0.14146706586826346, "grad_norm": 0.08845887333154678, "learning_rate": 9.892708452194522e-06, "loss": 0.0004, "step": 378 }, { "epoch": 0.14221556886227546, "grad_norm": 0.1545616239309311, "learning_rate": 9.891396790186424e-06, "loss": 0.0004, "step": 380 }, { "epoch": 0.14296407185628743, "grad_norm": 0.04785681515932083, "learning_rate": 9.890077247162951e-06, "loss": 0.0001, "step": 382 }, { "epoch": 0.1437125748502994, "grad_norm": 0.11323319375514984, "learning_rate": 9.888749825250151e-06, "loss": 0.0001, "step": 384 }, { "epoch": 0.14446107784431136, "grad_norm": 0.1407540738582611, "learning_rate": 9.887414526586764e-06, "loss": 0.0002, "step": 386 }, { "epoch": 0.14520958083832336, "grad_norm": 0.09322088956832886, "learning_rate": 9.886071353324223e-06, "loss": 0.0001, "step": 388 }, { "epoch": 0.14595808383233533, "grad_norm": 0.07416640967130661, "learning_rate": 9.884720307626647e-06, "loss": 0.0001, "step": 390 }, { "epoch": 0.1467065868263473, "grad_norm": 0.031197911128401756, "learning_rate": 9.883361391670841e-06, "loss": 0.0, "step": 392 }, { "epoch": 0.1474550898203593, "grad_norm": 0.1820898950099945, "learning_rate": 9.881994607646288e-06, "loss": 0.0003, "step": 394 }, { "epoch": 0.14820359281437126, "grad_norm": 0.1383231282234192, "learning_rate": 9.880619957755151e-06, "loss": 0.0002, "step": 396 }, { "epoch": 0.14895209580838323, "grad_norm": 0.019146692007780075, "learning_rate": 9.879237444212265e-06, "loss": 0.0, "step": 398 }, { "epoch": 0.1497005988023952, "grad_norm": 0.04791894555091858, "learning_rate": 9.877847069245134e-06, "loss": 0.0001, "step": 400 }, { "epoch": 0.1497005988023952, "eval_accuracy": 0.9999685074988971, "eval_loss": 0.00012279710790608078, "eval_runtime": 156.0918, "eval_samples_per_second": 32.032, "eval_steps_per_second": 8.008, "step": 400 }, { "epoch": 0.1504491017964072, "grad_norm": 0.06451380997896194, "learning_rate": 9.87644883509393e-06, "loss": 0.0001, "step": 402 }, { "epoch": 0.15119760479041916, "grad_norm": 0.10077822208404541, "learning_rate": 9.875042744011487e-06, "loss": 0.0001, "step": 404 }, { "epoch": 0.15194610778443113, "grad_norm": 0.07988882809877396, "learning_rate": 9.873628798263297e-06, "loss": 0.0001, "step": 406 }, { "epoch": 0.15269461077844312, "grad_norm": 0.08547152578830719, "learning_rate": 9.87220700012751e-06, "loss": 0.0003, "step": 408 }, { "epoch": 0.1534431137724551, "grad_norm": 0.06369513273239136, "learning_rate": 9.870777351894926e-06, "loss": 0.0001, "step": 410 }, { "epoch": 0.15419161676646706, "grad_norm": 0.1190333142876625, "learning_rate": 9.869339855868992e-06, "loss": 0.0002, "step": 412 }, { "epoch": 0.15494011976047903, "grad_norm": 0.4799070954322815, "learning_rate": 9.867894514365802e-06, "loss": 0.0001, "step": 414 }, { "epoch": 0.15568862275449102, "grad_norm": 0.05317097157239914, "learning_rate": 9.86644132971409e-06, "loss": 0.0001, "step": 416 }, { "epoch": 0.156437125748503, "grad_norm": 0.08004628121852875, "learning_rate": 9.864980304255222e-06, "loss": 0.0003, "step": 418 }, { "epoch": 0.15718562874251496, "grad_norm": 0.06639832258224487, "learning_rate": 9.863511440343206e-06, "loss": 0.0001, "step": 420 }, { "epoch": 0.15793413173652696, "grad_norm": 0.20095159113407135, "learning_rate": 9.862034740344673e-06, "loss": 0.0002, "step": 422 }, { "epoch": 0.15868263473053892, "grad_norm": 0.14772972464561462, "learning_rate": 9.860550206638881e-06, "loss": 0.0002, "step": 424 }, { "epoch": 0.1594311377245509, "grad_norm": 0.15753412246704102, "learning_rate": 9.859057841617709e-06, "loss": 0.0002, "step": 426 }, { "epoch": 0.1601796407185629, "grad_norm": 0.08705739676952362, "learning_rate": 9.857557647685657e-06, "loss": 0.0002, "step": 428 }, { "epoch": 0.16092814371257486, "grad_norm": 0.32878294587135315, "learning_rate": 9.856049627259833e-06, "loss": 0.0006, "step": 430 }, { "epoch": 0.16167664670658682, "grad_norm": 0.19281232357025146, "learning_rate": 9.85453378276996e-06, "loss": 0.0001, "step": 432 }, { "epoch": 0.1624251497005988, "grad_norm": 0.4002825617790222, "learning_rate": 9.853010116658368e-06, "loss": 0.0009, "step": 434 }, { "epoch": 0.1631736526946108, "grad_norm": 0.15032881498336792, "learning_rate": 9.851478631379982e-06, "loss": 0.0002, "step": 436 }, { "epoch": 0.16392215568862276, "grad_norm": 0.46663233637809753, "learning_rate": 9.849939329402337e-06, "loss": 0.0009, "step": 438 }, { "epoch": 0.16467065868263472, "grad_norm": 0.032840508967638016, "learning_rate": 9.848392213205549e-06, "loss": 0.0003, "step": 440 }, { "epoch": 0.16467065868263472, "eval_accuracy": 0.9997846752245753, "eval_loss": 0.0007238321122713387, "eval_runtime": 154.7968, "eval_samples_per_second": 32.3, "eval_steps_per_second": 8.075, "step": 440 }, { "epoch": 0.16541916167664672, "grad_norm": 0.17962802946567535, "learning_rate": 9.846837285282331e-06, "loss": 0.0006, "step": 442 }, { "epoch": 0.1661676646706587, "grad_norm": 0.03923157975077629, "learning_rate": 9.845274548137986e-06, "loss": 0.0002, "step": 444 }, { "epoch": 0.16691616766467066, "grad_norm": 0.07774964720010757, "learning_rate": 9.843704004290393e-06, "loss": 0.0002, "step": 446 }, { "epoch": 0.16766467065868262, "grad_norm": 0.2827122211456299, "learning_rate": 9.842125656270011e-06, "loss": 0.0006, "step": 448 }, { "epoch": 0.16841317365269462, "grad_norm": 0.30080848932266235, "learning_rate": 9.840539506619874e-06, "loss": 0.0003, "step": 450 }, { "epoch": 0.1691616766467066, "grad_norm": 0.19179034233093262, "learning_rate": 9.838945557895586e-06, "loss": 0.0002, "step": 452 }, { "epoch": 0.16991017964071856, "grad_norm": 0.044639382511377335, "learning_rate": 9.837343812665311e-06, "loss": 0.0002, "step": 454 }, { "epoch": 0.17065868263473055, "grad_norm": 0.14254966378211975, "learning_rate": 9.835734273509787e-06, "loss": 0.0007, "step": 456 }, { "epoch": 0.17140718562874252, "grad_norm": 0.10285581648349762, "learning_rate": 9.834116943022299e-06, "loss": 0.0003, "step": 458 }, { "epoch": 0.1721556886227545, "grad_norm": 0.12203399091959, "learning_rate": 9.832491823808688e-06, "loss": 0.0003, "step": 460 }, { "epoch": 0.17290419161676646, "grad_norm": 0.10512761771678925, "learning_rate": 9.830858918487347e-06, "loss": 0.0001, "step": 462 }, { "epoch": 0.17365269461077845, "grad_norm": 0.14217980206012726, "learning_rate": 9.829218229689211e-06, "loss": 0.0004, "step": 464 }, { "epoch": 0.17440119760479042, "grad_norm": 0.05573190748691559, "learning_rate": 9.827569760057755e-06, "loss": 0.0002, "step": 466 }, { "epoch": 0.1751497005988024, "grad_norm": 0.1435333788394928, "learning_rate": 9.825913512248996e-06, "loss": 0.0002, "step": 468 }, { "epoch": 0.17589820359281438, "grad_norm": 0.14290957152843475, "learning_rate": 9.824249488931477e-06, "loss": 0.0005, "step": 470 }, { "epoch": 0.17664670658682635, "grad_norm": 0.0923268049955368, "learning_rate": 9.822577692786272e-06, "loss": 0.0003, "step": 472 }, { "epoch": 0.17739520958083832, "grad_norm": 0.0938134640455246, "learning_rate": 9.820898126506978e-06, "loss": 0.0002, "step": 474 }, { "epoch": 0.1781437125748503, "grad_norm": 0.09895174205303192, "learning_rate": 9.819210792799711e-06, "loss": 0.0003, "step": 476 }, { "epoch": 0.17889221556886228, "grad_norm": 0.010202400386333466, "learning_rate": 9.817515694383102e-06, "loss": 0.0001, "step": 478 }, { "epoch": 0.17964071856287425, "grad_norm": 0.045472726225852966, "learning_rate": 9.815812833988292e-06, "loss": 0.0001, "step": 480 }, { "epoch": 0.17964071856287425, "eval_accuracy": 0.9998838001657226, "eval_loss": 0.000438039394794032, "eval_runtime": 154.4563, "eval_samples_per_second": 32.372, "eval_steps_per_second": 8.093, "step": 480 }, { "epoch": 0.18038922155688622, "grad_norm": 0.1489792764186859, "learning_rate": 9.814102214358928e-06, "loss": 0.0002, "step": 482 }, { "epoch": 0.18113772455089822, "grad_norm": 0.15599974989891052, "learning_rate": 9.81238383825116e-06, "loss": 0.0005, "step": 484 }, { "epoch": 0.18188622754491018, "grad_norm": 0.03606925159692764, "learning_rate": 9.810657708433637e-06, "loss": 0.0004, "step": 486 }, { "epoch": 0.18263473053892215, "grad_norm": 0.04655231162905693, "learning_rate": 9.808923827687494e-06, "loss": 0.0001, "step": 488 }, { "epoch": 0.18338323353293412, "grad_norm": 0.2198714017868042, "learning_rate": 9.807182198806362e-06, "loss": 0.0002, "step": 490 }, { "epoch": 0.18413173652694612, "grad_norm": 0.05768256261944771, "learning_rate": 9.805432824596347e-06, "loss": 0.0003, "step": 492 }, { "epoch": 0.18488023952095808, "grad_norm": 0.17893020808696747, "learning_rate": 9.803675707876048e-06, "loss": 0.0005, "step": 494 }, { "epoch": 0.18562874251497005, "grad_norm": 0.12833981215953827, "learning_rate": 9.801910851476524e-06, "loss": 0.0002, "step": 496 }, { "epoch": 0.18637724550898205, "grad_norm": 0.03174396604299545, "learning_rate": 9.800138258241311e-06, "loss": 0.0001, "step": 498 }, { "epoch": 0.18712574850299402, "grad_norm": 0.11265647411346436, "learning_rate": 9.798357931026411e-06, "loss": 0.0002, "step": 500 }, { "epoch": 0.18787425149700598, "grad_norm": 0.10834460705518723, "learning_rate": 9.796569872700287e-06, "loss": 0.0004, "step": 502 }, { "epoch": 0.18862275449101795, "grad_norm": 0.061082735657691956, "learning_rate": 9.79477408614386e-06, "loss": 0.0001, "step": 504 }, { "epoch": 0.18937125748502995, "grad_norm": 0.16802391409873962, "learning_rate": 9.792970574250493e-06, "loss": 0.0002, "step": 506 }, { "epoch": 0.19011976047904192, "grad_norm": 0.11000331491231918, "learning_rate": 9.791159339926009e-06, "loss": 0.0001, "step": 508 }, { "epoch": 0.19086826347305388, "grad_norm": 0.06801439821720123, "learning_rate": 9.789340386088663e-06, "loss": 0.0002, "step": 510 }, { "epoch": 0.19161676646706588, "grad_norm": 0.012815337628126144, "learning_rate": 9.787513715669158e-06, "loss": 0.0, "step": 512 }, { "epoch": 0.19236526946107785, "grad_norm": 0.011311142705380917, "learning_rate": 9.78567933161062e-06, "loss": 0.0, "step": 514 }, { "epoch": 0.19311377245508982, "grad_norm": 0.06330162286758423, "learning_rate": 9.78383723686861e-06, "loss": 0.0, "step": 516 }, { "epoch": 0.19386227544910178, "grad_norm": 0.071534164249897, "learning_rate": 9.781987434411106e-06, "loss": 0.0001, "step": 518 }, { "epoch": 0.19461077844311378, "grad_norm": 0.11816436052322388, "learning_rate": 9.780129927218513e-06, "loss": 0.0001, "step": 520 }, { "epoch": 0.19461077844311378, "eval_accuracy": 0.9999860097407319, "eval_loss": 5.2422070439206436e-05, "eval_runtime": 155.6149, "eval_samples_per_second": 32.131, "eval_steps_per_second": 8.033, "step": 520 }, { "epoch": 0.19535928143712575, "grad_norm": 0.06640541553497314, "learning_rate": 9.778264718283644e-06, "loss": 0.0, "step": 522 }, { "epoch": 0.19610778443113772, "grad_norm": 0.026967424899339676, "learning_rate": 9.776391810611719e-06, "loss": 0.0, "step": 524 }, { "epoch": 0.1968562874251497, "grad_norm": 0.11123115569353104, "learning_rate": 9.774511207220369e-06, "loss": 0.0001, "step": 526 }, { "epoch": 0.19760479041916168, "grad_norm": 0.13741283118724823, "learning_rate": 9.772622911139622e-06, "loss": 0.0001, "step": 528 }, { "epoch": 0.19835329341317365, "grad_norm": 0.009464044123888016, "learning_rate": 9.770726925411898e-06, "loss": 0.0, "step": 530 }, { "epoch": 0.19910179640718562, "grad_norm": 0.0769435316324234, "learning_rate": 9.768823253092008e-06, "loss": 0.0001, "step": 532 }, { "epoch": 0.1998502994011976, "grad_norm": 0.046003557741642, "learning_rate": 9.766911897247147e-06, "loss": 0.0001, "step": 534 }, { "epoch": 0.20059880239520958, "grad_norm": 0.10196753591299057, "learning_rate": 9.76499286095689e-06, "loss": 0.0002, "step": 536 }, { "epoch": 0.20134730538922155, "grad_norm": 0.020359348505735397, "learning_rate": 9.763066147313189e-06, "loss": 0.0, "step": 538 }, { "epoch": 0.20209580838323354, "grad_norm": 0.20479270815849304, "learning_rate": 9.76113175942036e-06, "loss": 0.0001, "step": 540 }, { "epoch": 0.2028443113772455, "grad_norm": 0.11673811078071594, "learning_rate": 9.759189700395096e-06, "loss": 0.0001, "step": 542 }, { "epoch": 0.20359281437125748, "grad_norm": 0.04004862159490585, "learning_rate": 9.75723997336643e-06, "loss": 0.0001, "step": 544 }, { "epoch": 0.20434131736526945, "grad_norm": 0.13865888118743896, "learning_rate": 9.755282581475769e-06, "loss": 0.0004, "step": 546 }, { "epoch": 0.20508982035928144, "grad_norm": 0.08988627046346664, "learning_rate": 9.753317527876857e-06, "loss": 0.0002, "step": 548 }, { "epoch": 0.2058383233532934, "grad_norm": 0.09014202654361725, "learning_rate": 9.751344815735791e-06, "loss": 0.0003, "step": 550 }, { "epoch": 0.20658682634730538, "grad_norm": 0.17278143763542175, "learning_rate": 9.749364448231001e-06, "loss": 0.0003, "step": 552 }, { "epoch": 0.20733532934131738, "grad_norm": 0.07624712586402893, "learning_rate": 9.747376428553255e-06, "loss": 0.0002, "step": 554 }, { "epoch": 0.20808383233532934, "grad_norm": 0.02646615356206894, "learning_rate": 9.745380759905648e-06, "loss": 0.0005, "step": 556 }, { "epoch": 0.2088323353293413, "grad_norm": 0.1350707858800888, "learning_rate": 9.743377445503598e-06, "loss": 0.0005, "step": 558 }, { "epoch": 0.20958083832335328, "grad_norm": 0.045723576098680496, "learning_rate": 9.74136648857485e-06, "loss": 0.0004, "step": 560 }, { "epoch": 0.20958083832335328, "eval_accuracy": 0.9998195029826557, "eval_loss": 0.0005425158306024969, "eval_runtime": 155.8793, "eval_samples_per_second": 32.076, "eval_steps_per_second": 8.019, "step": 560 }, { "epoch": 0.21032934131736528, "grad_norm": 0.11474994570016861, "learning_rate": 9.739347892359453e-06, "loss": 0.0003, "step": 562 }, { "epoch": 0.21107784431137724, "grad_norm": 0.0819924846291542, "learning_rate": 9.737321660109767e-06, "loss": 0.0002, "step": 564 }, { "epoch": 0.2118263473053892, "grad_norm": 0.098919577896595, "learning_rate": 9.735287795090455e-06, "loss": 0.0004, "step": 566 }, { "epoch": 0.2125748502994012, "grad_norm": 0.034899163991212845, "learning_rate": 9.733246300578482e-06, "loss": 0.0004, "step": 568 }, { "epoch": 0.21332335329341318, "grad_norm": 0.10499320924282074, "learning_rate": 9.731197179863104e-06, "loss": 0.0003, "step": 570 }, { "epoch": 0.21407185628742514, "grad_norm": 0.078518345952034, "learning_rate": 9.729140436245857e-06, "loss": 0.0001, "step": 572 }, { "epoch": 0.2148203592814371, "grad_norm": 0.04776620492339134, "learning_rate": 9.72707607304057e-06, "loss": 0.0002, "step": 574 }, { "epoch": 0.2155688622754491, "grad_norm": 0.043205343186855316, "learning_rate": 9.725004093573343e-06, "loss": 0.0001, "step": 576 }, { "epoch": 0.21631736526946108, "grad_norm": 0.0973254144191742, "learning_rate": 9.722924501182546e-06, "loss": 0.0002, "step": 578 }, { "epoch": 0.21706586826347304, "grad_norm": 0.07782719284296036, "learning_rate": 9.72083729921882e-06, "loss": 0.0001, "step": 580 }, { "epoch": 0.21781437125748504, "grad_norm": 0.04242849349975586, "learning_rate": 9.718742491045061e-06, "loss": 0.0001, "step": 582 }, { "epoch": 0.218562874251497, "grad_norm": 0.04837155342102051, "learning_rate": 9.716640080036423e-06, "loss": 0.0001, "step": 584 }, { "epoch": 0.21931137724550898, "grad_norm": 0.0814133882522583, "learning_rate": 9.71453006958031e-06, "loss": 0.0002, "step": 586 }, { "epoch": 0.22005988023952097, "grad_norm": 0.047387998551130295, "learning_rate": 9.712412463076368e-06, "loss": 0.0, "step": 588 }, { "epoch": 0.22080838323353294, "grad_norm": 0.017673810943961143, "learning_rate": 9.710287263936485e-06, "loss": 0.0, "step": 590 }, { "epoch": 0.2215568862275449, "grad_norm": 0.021801825612783432, "learning_rate": 9.708154475584779e-06, "loss": 0.0001, "step": 592 }, { "epoch": 0.22230538922155688, "grad_norm": 0.03839518874883652, "learning_rate": 9.7060141014576e-06, "loss": 0.0002, "step": 594 }, { "epoch": 0.22305389221556887, "grad_norm": 0.007782716304063797, "learning_rate": 9.703866145003512e-06, "loss": 0.0001, "step": 596 }, { "epoch": 0.22380239520958084, "grad_norm": 0.02108747325837612, "learning_rate": 9.701710609683305e-06, "loss": 0.0001, "step": 598 }, { "epoch": 0.2245508982035928, "grad_norm": 0.0026378484908491373, "learning_rate": 9.699547498969978e-06, "loss": 0.0, "step": 600 }, { "epoch": 0.2245508982035928, "eval_accuracy": 0.9999875145529564, "eval_loss": 4.2638039303710684e-05, "eval_runtime": 160.7051, "eval_samples_per_second": 31.113, "eval_steps_per_second": 7.778, "step": 600 }, { "epoch": 0.2252994011976048, "grad_norm": 0.02909325808286667, "learning_rate": 9.697376816348732e-06, "loss": 0.0001, "step": 602 }, { "epoch": 0.22604790419161677, "grad_norm": 0.0025581123773008585, "learning_rate": 9.695198565316966e-06, "loss": 0.0001, "step": 604 }, { "epoch": 0.22679640718562874, "grad_norm": 0.02005714178085327, "learning_rate": 9.69301274938428e-06, "loss": 0.0, "step": 606 }, { "epoch": 0.2275449101796407, "grad_norm": 0.0037004246842116117, "learning_rate": 9.690819372072457e-06, "loss": 0.0, "step": 608 }, { "epoch": 0.2282934131736527, "grad_norm": 0.032148100435733795, "learning_rate": 9.68861843691547e-06, "loss": 0.0001, "step": 610 }, { "epoch": 0.22904191616766467, "grad_norm": 0.014080125838518143, "learning_rate": 9.68640994745946e-06, "loss": 0.0002, "step": 612 }, { "epoch": 0.22979041916167664, "grad_norm": 0.010853869840502739, "learning_rate": 9.684193907262742e-06, "loss": 0.0, "step": 614 }, { "epoch": 0.23053892215568864, "grad_norm": 0.032357871532440186, "learning_rate": 9.681970319895804e-06, "loss": 0.0, "step": 616 }, { "epoch": 0.2312874251497006, "grad_norm": 0.008318758569657803, "learning_rate": 9.679739188941283e-06, "loss": 0.0, "step": 618 }, { "epoch": 0.23203592814371257, "grad_norm": 0.037990834563970566, "learning_rate": 9.677500517993983e-06, "loss": 0.0, "step": 620 }, { "epoch": 0.23278443113772454, "grad_norm": 0.00843075942248106, "learning_rate": 9.675254310660842e-06, "loss": 0.0001, "step": 622 }, { "epoch": 0.23353293413173654, "grad_norm": 0.05007459223270416, "learning_rate": 9.673000570560952e-06, "loss": 0.0, "step": 624 }, { "epoch": 0.2342814371257485, "grad_norm": 0.0009229404386132956, "learning_rate": 9.670739301325534e-06, "loss": 0.0, "step": 626 }, { "epoch": 0.23502994011976047, "grad_norm": 0.02507946826517582, "learning_rate": 9.668470506597946e-06, "loss": 0.0, "step": 628 }, { "epoch": 0.23577844311377247, "grad_norm": 0.09565775096416473, "learning_rate": 9.66619419003367e-06, "loss": 0.0002, "step": 630 }, { "epoch": 0.23652694610778444, "grad_norm": 0.0022729237098246813, "learning_rate": 9.663910355300306e-06, "loss": 0.0, "step": 632 }, { "epoch": 0.2372754491017964, "grad_norm": 0.0015811071498319507, "learning_rate": 9.661619006077562e-06, "loss": 0.0, "step": 634 }, { "epoch": 0.23802395209580837, "grad_norm": 0.10619401931762695, "learning_rate": 9.659320146057263e-06, "loss": 0.0001, "step": 636 }, { "epoch": 0.23877245508982037, "grad_norm": 0.0017936922376975417, "learning_rate": 9.657013778943328e-06, "loss": 0.0, "step": 638 }, { "epoch": 0.23952095808383234, "grad_norm": 0.00227470719255507, "learning_rate": 9.654699908451777e-06, "loss": 0.0, "step": 640 }, { "epoch": 0.23952095808383234, "eval_accuracy": 0.9999895288063639, "eval_loss": 2.794685133267194e-05, "eval_runtime": 156.1163, "eval_samples_per_second": 32.027, "eval_steps_per_second": 8.007, "step": 640 }, { "epoch": 0.2402694610778443, "grad_norm": 0.007332763634622097, "learning_rate": 9.652378538310715e-06, "loss": 0.0, "step": 642 }, { "epoch": 0.2410179640718563, "grad_norm": 0.08600316196680069, "learning_rate": 9.650049672260333e-06, "loss": 0.0, "step": 644 }, { "epoch": 0.24176646706586827, "grad_norm": 0.005560212302953005, "learning_rate": 9.647713314052896e-06, "loss": 0.0, "step": 646 }, { "epoch": 0.24251497005988024, "grad_norm": 0.00411292864009738, "learning_rate": 9.645369467452746e-06, "loss": 0.0, "step": 648 }, { "epoch": 0.2432634730538922, "grad_norm": 0.0018659079214558005, "learning_rate": 9.643018136236286e-06, "loss": 0.0, "step": 650 }, { "epoch": 0.2440119760479042, "grad_norm": 0.004269044380635023, "learning_rate": 9.64065932419198e-06, "loss": 0.0, "step": 652 }, { "epoch": 0.24476047904191617, "grad_norm": 0.00309938658028841, "learning_rate": 9.638293035120342e-06, "loss": 0.0, "step": 654 }, { "epoch": 0.24550898203592814, "grad_norm": 0.0024809043388813734, "learning_rate": 9.635919272833938e-06, "loss": 0.0, "step": 656 }, { "epoch": 0.24625748502994013, "grad_norm": 0.003469419199973345, "learning_rate": 9.63353804115737e-06, "loss": 0.0, "step": 658 }, { "epoch": 0.2470059880239521, "grad_norm": 0.0016053810250014067, "learning_rate": 9.63114934392728e-06, "loss": 0.0, "step": 660 }, { "epoch": 0.24775449101796407, "grad_norm": 0.02661885879933834, "learning_rate": 9.628753184992334e-06, "loss": 0.0, "step": 662 }, { "epoch": 0.24850299401197604, "grad_norm": 0.0016741787549108267, "learning_rate": 9.62634956821322e-06, "loss": 0.0001, "step": 664 }, { "epoch": 0.24925149700598803, "grad_norm": 0.0019377709832042456, "learning_rate": 9.623938497462647e-06, "loss": 0.0, "step": 666 }, { "epoch": 0.25, "grad_norm": 0.0012623120564967394, "learning_rate": 9.621519976625327e-06, "loss": 0.0, "step": 668 }, { "epoch": 0.25074850299401197, "grad_norm": 0.0024038818664848804, "learning_rate": 9.619094009597982e-06, "loss": 0.0, "step": 670 }, { "epoch": 0.25149700598802394, "grad_norm": 0.006172757130116224, "learning_rate": 9.616660600289329e-06, "loss": 0.0, "step": 672 }, { "epoch": 0.2522455089820359, "grad_norm": 0.0028510144911706448, "learning_rate": 9.614219752620074e-06, "loss": 0.0, "step": 674 }, { "epoch": 0.25299401197604793, "grad_norm": 0.02679716795682907, "learning_rate": 9.611771470522908e-06, "loss": 0.0, "step": 676 }, { "epoch": 0.2537425149700599, "grad_norm": 0.02851109206676483, "learning_rate": 9.609315757942504e-06, "loss": 0.0, "step": 678 }, { "epoch": 0.25449101796407186, "grad_norm": 0.0017305930377915502, "learning_rate": 9.606852618835503e-06, "loss": 0.0001, "step": 680 }, { "epoch": 0.25449101796407186, "eval_accuracy": 0.9999997747747748, "eval_loss": 8.644859917694703e-06, "eval_runtime": 159.0892, "eval_samples_per_second": 31.429, "eval_steps_per_second": 7.857, "step": 680 }, { "epoch": 0.25523952095808383, "grad_norm": 0.00403413875028491, "learning_rate": 9.604382057170514e-06, "loss": 0.0, "step": 682 }, { "epoch": 0.2559880239520958, "grad_norm": 0.0027754653710871935, "learning_rate": 9.601904076928103e-06, "loss": 0.0, "step": 684 }, { "epoch": 0.25673652694610777, "grad_norm": 0.0013081474462524056, "learning_rate": 9.599418682100793e-06, "loss": 0.0, "step": 686 }, { "epoch": 0.25748502994011974, "grad_norm": 0.05064619705080986, "learning_rate": 9.596925876693047e-06, "loss": 0.0, "step": 688 }, { "epoch": 0.25823353293413176, "grad_norm": 0.002823168644681573, "learning_rate": 9.594425664721275e-06, "loss": 0.0, "step": 690 }, { "epoch": 0.25898203592814373, "grad_norm": 0.030349284410476685, "learning_rate": 9.591918050213814e-06, "loss": 0.0, "step": 692 }, { "epoch": 0.2597305389221557, "grad_norm": 0.001790383132174611, "learning_rate": 9.589403037210933e-06, "loss": 0.0001, "step": 694 }, { "epoch": 0.26047904191616766, "grad_norm": 0.010972312651574612, "learning_rate": 9.586880629764817e-06, "loss": 0.0, "step": 696 }, { "epoch": 0.26122754491017963, "grad_norm": 0.06688281893730164, "learning_rate": 9.584350831939571e-06, "loss": 0.0001, "step": 698 }, { "epoch": 0.2619760479041916, "grad_norm": 0.149211123585701, "learning_rate": 9.581813647811199e-06, "loss": 0.0001, "step": 700 }, { "epoch": 0.26272455089820357, "grad_norm": 0.00245782732963562, "learning_rate": 9.579269081467614e-06, "loss": 0.0, "step": 702 }, { "epoch": 0.2634730538922156, "grad_norm": 0.01430213451385498, "learning_rate": 9.576717137008617e-06, "loss": 0.0001, "step": 704 }, { "epoch": 0.26422155688622756, "grad_norm": 0.013654684647917747, "learning_rate": 9.574157818545902e-06, "loss": 0.0, "step": 706 }, { "epoch": 0.26497005988023953, "grad_norm": 0.015040101483464241, "learning_rate": 9.57159113020304e-06, "loss": 0.0, "step": 708 }, { "epoch": 0.2657185628742515, "grad_norm": 0.01307929027825594, "learning_rate": 9.569017076115476e-06, "loss": 0.0001, "step": 710 }, { "epoch": 0.26646706586826346, "grad_norm": 0.02330423705279827, "learning_rate": 9.566435660430528e-06, "loss": 0.0, "step": 712 }, { "epoch": 0.26721556886227543, "grad_norm": 0.002268057782202959, "learning_rate": 9.563846887307369e-06, "loss": 0.0, "step": 714 }, { "epoch": 0.2679640718562874, "grad_norm": 0.011261685751378536, "learning_rate": 9.561250760917026e-06, "loss": 0.0001, "step": 716 }, { "epoch": 0.2687125748502994, "grad_norm": 0.03315627574920654, "learning_rate": 9.558647285442382e-06, "loss": 0.0, "step": 718 }, { "epoch": 0.2694610778443114, "grad_norm": 0.002093307441100478, "learning_rate": 9.55603646507815e-06, "loss": 0.0, "step": 720 }, { "epoch": 0.2694610778443114, "eval_accuracy": 0.9999976841259713, "eval_loss": 9.612030225980561e-06, "eval_runtime": 155.2847, "eval_samples_per_second": 32.199, "eval_steps_per_second": 8.05, "step": 720 }, { "epoch": 0.27020958083832336, "grad_norm": 0.001716041355393827, "learning_rate": 9.553418304030886e-06, "loss": 0.0, "step": 722 }, { "epoch": 0.27095808383233533, "grad_norm": 0.0027342389803379774, "learning_rate": 9.550792806518967e-06, "loss": 0.0, "step": 724 }, { "epoch": 0.2717065868263473, "grad_norm": 0.1821688860654831, "learning_rate": 9.548159976772593e-06, "loss": 0.0001, "step": 726 }, { "epoch": 0.27245508982035926, "grad_norm": 0.0016638662200421095, "learning_rate": 9.545519819033777e-06, "loss": 0.0001, "step": 728 }, { "epoch": 0.27320359281437123, "grad_norm": 0.021991174668073654, "learning_rate": 9.542872337556341e-06, "loss": 0.0, "step": 730 }, { "epoch": 0.27395209580838326, "grad_norm": 0.0012851693900302052, "learning_rate": 9.540217536605906e-06, "loss": 0.0, "step": 732 }, { "epoch": 0.2747005988023952, "grad_norm": 0.0014544121222570539, "learning_rate": 9.537555420459883e-06, "loss": 0.0, "step": 734 }, { "epoch": 0.2754491017964072, "grad_norm": 0.009950781241059303, "learning_rate": 9.534885993407474e-06, "loss": 0.0, "step": 736 }, { "epoch": 0.27619760479041916, "grad_norm": 0.00411807419732213, "learning_rate": 9.532209259749658e-06, "loss": 0.0, "step": 738 }, { "epoch": 0.27694610778443113, "grad_norm": 0.006487260106950998, "learning_rate": 9.529525223799185e-06, "loss": 0.0, "step": 740 }, { "epoch": 0.2776946107784431, "grad_norm": 0.007635013200342655, "learning_rate": 9.526833889880573e-06, "loss": 0.0, "step": 742 }, { "epoch": 0.27844311377245506, "grad_norm": 0.000996310613118112, "learning_rate": 9.524135262330098e-06, "loss": 0.0, "step": 744 }, { "epoch": 0.2791916167664671, "grad_norm": 0.0031566142570227385, "learning_rate": 9.521429345495787e-06, "loss": 0.0001, "step": 746 }, { "epoch": 0.27994011976047906, "grad_norm": 0.002025540452450514, "learning_rate": 9.51871614373741e-06, "loss": 0.0, "step": 748 }, { "epoch": 0.280688622754491, "grad_norm": 0.1011413112282753, "learning_rate": 9.515995661426478e-06, "loss": 0.0001, "step": 750 }, { "epoch": 0.281437125748503, "grad_norm": 0.021610310301184654, "learning_rate": 9.513267902946228e-06, "loss": 0.0, "step": 752 }, { "epoch": 0.28218562874251496, "grad_norm": 0.0016732689691707492, "learning_rate": 9.510532872691624e-06, "loss": 0.0, "step": 754 }, { "epoch": 0.28293413173652693, "grad_norm": 0.11272062361240387, "learning_rate": 9.507790575069347e-06, "loss": 0.0001, "step": 756 }, { "epoch": 0.2836826347305389, "grad_norm": 0.0009099426679313183, "learning_rate": 9.50504101449778e-06, "loss": 0.0, "step": 758 }, { "epoch": 0.2844311377245509, "grad_norm": 0.0009794794023036957, "learning_rate": 9.50228419540702e-06, "loss": 0.0, "step": 760 }, { "epoch": 0.2844311377245509, "eval_accuracy": 0.9999983934801854, "eval_loss": 9.44385647017043e-06, "eval_runtime": 156.9097, "eval_samples_per_second": 31.865, "eval_steps_per_second": 7.966, "step": 760 }, { "epoch": 0.2851796407185629, "grad_norm": 0.03243451938033104, "learning_rate": 9.499520122238846e-06, "loss": 0.0, "step": 762 }, { "epoch": 0.28592814371257486, "grad_norm": 0.02839779108762741, "learning_rate": 9.496748799446733e-06, "loss": 0.0001, "step": 764 }, { "epoch": 0.2866766467065868, "grad_norm": 0.0816827118396759, "learning_rate": 9.493970231495836e-06, "loss": 0.0, "step": 766 }, { "epoch": 0.2874251497005988, "grad_norm": 0.0025276602245867252, "learning_rate": 9.49118442286298e-06, "loss": 0.0, "step": 768 }, { "epoch": 0.28817365269461076, "grad_norm": 0.0015131831169128418, "learning_rate": 9.488391378036662e-06, "loss": 0.0, "step": 770 }, { "epoch": 0.28892215568862273, "grad_norm": 0.001832049572840333, "learning_rate": 9.485591101517027e-06, "loss": 0.0, "step": 772 }, { "epoch": 0.28967065868263475, "grad_norm": 0.047806382179260254, "learning_rate": 9.482783597815883e-06, "loss": 0.0, "step": 774 }, { "epoch": 0.2904191616766467, "grad_norm": 0.03347828611731529, "learning_rate": 9.47996887145668e-06, "loss": 0.0, "step": 776 }, { "epoch": 0.2911676646706587, "grad_norm": 0.0017931102775037289, "learning_rate": 9.477146926974501e-06, "loss": 0.0, "step": 778 }, { "epoch": 0.29191616766467066, "grad_norm": 0.009210226126015186, "learning_rate": 9.47431776891606e-06, "loss": 0.0, "step": 780 }, { "epoch": 0.2926646706586826, "grad_norm": 0.0013418138260021806, "learning_rate": 9.471481401839696e-06, "loss": 0.0, "step": 782 }, { "epoch": 0.2934131736526946, "grad_norm": 0.0009674608591012657, "learning_rate": 9.468637830315364e-06, "loss": 0.0, "step": 784 }, { "epoch": 0.29416167664670656, "grad_norm": 0.0006195507594384253, "learning_rate": 9.46578705892462e-06, "loss": 0.0, "step": 786 }, { "epoch": 0.2949101796407186, "grad_norm": 0.0013804810587316751, "learning_rate": 9.46292909226063e-06, "loss": 0.0, "step": 788 }, { "epoch": 0.29565868263473055, "grad_norm": 0.0004127651918679476, "learning_rate": 9.460063934928142e-06, "loss": 0.0, "step": 790 }, { "epoch": 0.2964071856287425, "grad_norm": 0.0004895281745120883, "learning_rate": 9.4571915915435e-06, "loss": 0.0, "step": 792 }, { "epoch": 0.2971556886227545, "grad_norm": 0.00033658542088232934, "learning_rate": 9.454312066734624e-06, "loss": 0.0, "step": 794 }, { "epoch": 0.29790419161676646, "grad_norm": 0.07587553560733795, "learning_rate": 9.451425365140997e-06, "loss": 0.0, "step": 796 }, { "epoch": 0.2986526946107784, "grad_norm": 0.00075916713103652, "learning_rate": 9.448531491413673e-06, "loss": 0.0, "step": 798 }, { "epoch": 0.2994011976047904, "grad_norm": 0.0008038659580051899, "learning_rate": 9.445630450215259e-06, "loss": 0.0, "step": 800 }, { "epoch": 0.2994011976047904, "eval_accuracy": 0.9999963675587793, "eval_loss": 1.0820390343724284e-05, "eval_runtime": 155.3675, "eval_samples_per_second": 32.182, "eval_steps_per_second": 8.045, "step": 800 }, { "epoch": 0.3001497005988024, "grad_norm": 0.0022112810984253883, "learning_rate": 9.442722246219915e-06, "loss": 0.0, "step": 802 }, { "epoch": 0.3008982035928144, "grad_norm": 0.0013486716197803617, "learning_rate": 9.439806884113331e-06, "loss": 0.0, "step": 804 }, { "epoch": 0.30164670658682635, "grad_norm": 0.005311549641191959, "learning_rate": 9.43688436859274e-06, "loss": 0.0002, "step": 806 }, { "epoch": 0.3023952095808383, "grad_norm": 0.000981526798568666, "learning_rate": 9.433954704366897e-06, "loss": 0.0, "step": 808 }, { "epoch": 0.3031437125748503, "grad_norm": 0.09638898819684982, "learning_rate": 9.431017896156074e-06, "loss": 0.0, "step": 810 }, { "epoch": 0.30389221556886226, "grad_norm": 0.04560961201786995, "learning_rate": 9.428073948692056e-06, "loss": 0.0001, "step": 812 }, { "epoch": 0.3046407185628742, "grad_norm": 0.040918540209531784, "learning_rate": 9.425122866718128e-06, "loss": 0.0003, "step": 814 }, { "epoch": 0.30538922155688625, "grad_norm": 0.03442908823490143, "learning_rate": 9.422164654989073e-06, "loss": 0.0, "step": 816 }, { "epoch": 0.3061377245508982, "grad_norm": 0.13045716285705566, "learning_rate": 9.419199318271158e-06, "loss": 0.0001, "step": 818 }, { "epoch": 0.3068862275449102, "grad_norm": 0.027492402121424675, "learning_rate": 9.416226861342132e-06, "loss": 0.0001, "step": 820 }, { "epoch": 0.30763473053892215, "grad_norm": 0.003682814771309495, "learning_rate": 9.413247288991216e-06, "loss": 0.0, "step": 822 }, { "epoch": 0.3083832335329341, "grad_norm": 0.10141133517026901, "learning_rate": 9.410260606019095e-06, "loss": 0.0002, "step": 824 }, { "epoch": 0.3091317365269461, "grad_norm": 0.0007527911802753806, "learning_rate": 9.40726681723791e-06, "loss": 0.0001, "step": 826 }, { "epoch": 0.30988023952095806, "grad_norm": 0.005670532584190369, "learning_rate": 9.404265927471255e-06, "loss": 0.0, "step": 828 }, { "epoch": 0.3106287425149701, "grad_norm": 0.03817495331168175, "learning_rate": 9.401257941554157e-06, "loss": 0.0, "step": 830 }, { "epoch": 0.31137724550898205, "grad_norm": 0.009812482632696629, "learning_rate": 9.398242864333084e-06, "loss": 0.0, "step": 832 }, { "epoch": 0.312125748502994, "grad_norm": 0.007045481353998184, "learning_rate": 9.395220700665924e-06, "loss": 0.0, "step": 834 }, { "epoch": 0.312874251497006, "grad_norm": 0.003998721018433571, "learning_rate": 9.392191455421989e-06, "loss": 0.0, "step": 836 }, { "epoch": 0.31362275449101795, "grad_norm": 0.031697846949100494, "learning_rate": 9.389155133481993e-06, "loss": 0.0001, "step": 838 }, { "epoch": 0.3143712574850299, "grad_norm": 0.0006167310057207942, "learning_rate": 9.386111739738057e-06, "loss": 0.0, "step": 840 }, { "epoch": 0.3143712574850299, "eval_accuracy": 0.9999984969179706, "eval_loss": 6.494924491562415e-06, "eval_runtime": 155.2786, "eval_samples_per_second": 32.2, "eval_steps_per_second": 8.05, "step": 840 }, { "epoch": 0.31511976047904194, "grad_norm": 0.47339311242103577, "learning_rate": 9.383061279093697e-06, "loss": 0.0002, "step": 842 }, { "epoch": 0.3158682634730539, "grad_norm": 0.0039043284486979246, "learning_rate": 9.380003756463812e-06, "loss": 0.0, "step": 844 }, { "epoch": 0.3166167664670659, "grad_norm": 0.18402549624443054, "learning_rate": 9.376939176774678e-06, "loss": 0.0001, "step": 846 }, { "epoch": 0.31736526946107785, "grad_norm": 0.03785166144371033, "learning_rate": 9.373867544963949e-06, "loss": 0.0004, "step": 848 }, { "epoch": 0.3181137724550898, "grad_norm": 0.07002092897891998, "learning_rate": 9.370788865980633e-06, "loss": 0.0001, "step": 850 }, { "epoch": 0.3188622754491018, "grad_norm": 0.009300635196268559, "learning_rate": 9.367703144785097e-06, "loss": 0.0, "step": 852 }, { "epoch": 0.31961077844311375, "grad_norm": 0.2740118205547333, "learning_rate": 9.364610386349048e-06, "loss": 0.0003, "step": 854 }, { "epoch": 0.3203592814371258, "grad_norm": 0.023412982001900673, "learning_rate": 9.361510595655545e-06, "loss": 0.0001, "step": 856 }, { "epoch": 0.32110778443113774, "grad_norm": 0.10502910614013672, "learning_rate": 9.358403777698962e-06, "loss": 0.0001, "step": 858 }, { "epoch": 0.3218562874251497, "grad_norm": 0.17004919052124023, "learning_rate": 9.355289937485005e-06, "loss": 0.0001, "step": 860 }, { "epoch": 0.3226047904191617, "grad_norm": 0.020658617839217186, "learning_rate": 9.35216908003069e-06, "loss": 0.0, "step": 862 }, { "epoch": 0.32335329341317365, "grad_norm": 0.2423926740884781, "learning_rate": 9.349041210364343e-06, "loss": 0.0003, "step": 864 }, { "epoch": 0.3241017964071856, "grad_norm": 0.02749599702656269, "learning_rate": 9.345906333525582e-06, "loss": 0.0001, "step": 866 }, { "epoch": 0.3248502994011976, "grad_norm": 0.10116691887378693, "learning_rate": 9.342764454565321e-06, "loss": 0.0001, "step": 868 }, { "epoch": 0.3255988023952096, "grad_norm": 0.09531649202108383, "learning_rate": 9.339615578545753e-06, "loss": 0.0001, "step": 870 }, { "epoch": 0.3263473053892216, "grad_norm": 0.023796789348125458, "learning_rate": 9.336459710540344e-06, "loss": 0.0, "step": 872 }, { "epoch": 0.32709580838323354, "grad_norm": 0.08885123580694199, "learning_rate": 9.333296855633828e-06, "loss": 0.0001, "step": 874 }, { "epoch": 0.3278443113772455, "grad_norm": 0.13661184906959534, "learning_rate": 9.330127018922195e-06, "loss": 0.0001, "step": 876 }, { "epoch": 0.3285928143712575, "grad_norm": 0.009723243303596973, "learning_rate": 9.326950205512682e-06, "loss": 0.0001, "step": 878 }, { "epoch": 0.32934131736526945, "grad_norm": 0.017450012266635895, "learning_rate": 9.323766420523768e-06, "loss": 0.0001, "step": 880 }, { "epoch": 0.32934131736526945, "eval_accuracy": 0.9999853499863853, "eval_loss": 5.076894740341231e-05, "eval_runtime": 154.2114, "eval_samples_per_second": 32.423, "eval_steps_per_second": 8.106, "step": 880 }, { "epoch": 0.3300898203592814, "grad_norm": 0.09020084142684937, "learning_rate": 9.32057566908517e-06, "loss": 0.0001, "step": 882 }, { "epoch": 0.33083832335329344, "grad_norm": 0.014794589951634407, "learning_rate": 9.31737795633782e-06, "loss": 0.0, "step": 884 }, { "epoch": 0.3315868263473054, "grad_norm": 0.1351051777601242, "learning_rate": 9.314173287433874e-06, "loss": 0.0001, "step": 886 }, { "epoch": 0.3323353293413174, "grad_norm": 0.02759048528969288, "learning_rate": 9.310961667536689e-06, "loss": 0.0, "step": 888 }, { "epoch": 0.33308383233532934, "grad_norm": 0.006297203712165356, "learning_rate": 9.307743101820828e-06, "loss": 0.0, "step": 890 }, { "epoch": 0.3338323353293413, "grad_norm": 0.1679803431034088, "learning_rate": 9.30451759547204e-06, "loss": 0.0004, "step": 892 }, { "epoch": 0.3345808383233533, "grad_norm": 0.018898937851190567, "learning_rate": 9.301285153687261e-06, "loss": 0.0001, "step": 894 }, { "epoch": 0.33532934131736525, "grad_norm": 0.010490099899470806, "learning_rate": 9.298045781674595e-06, "loss": 0.0001, "step": 896 }, { "epoch": 0.33607784431137727, "grad_norm": 0.08461616188287735, "learning_rate": 9.294799484653323e-06, "loss": 0.0002, "step": 898 }, { "epoch": 0.33682634730538924, "grad_norm": 0.009152884595096111, "learning_rate": 9.291546267853871e-06, "loss": 0.0001, "step": 900 }, { "epoch": 0.3375748502994012, "grad_norm": 0.04316161200404167, "learning_rate": 9.28828613651782e-06, "loss": 0.0001, "step": 902 }, { "epoch": 0.3383233532934132, "grad_norm": 0.04677840694785118, "learning_rate": 9.285019095897894e-06, "loss": 0.0, "step": 904 }, { "epoch": 0.33907185628742514, "grad_norm": 0.006453138776123524, "learning_rate": 9.281745151257946e-06, "loss": 0.0002, "step": 906 }, { "epoch": 0.3398203592814371, "grad_norm": 0.00727870361879468, "learning_rate": 9.278464307872952e-06, "loss": 0.0, "step": 908 }, { "epoch": 0.3405688622754491, "grad_norm": 0.15015535056591034, "learning_rate": 9.275176571029008e-06, "loss": 0.0002, "step": 910 }, { "epoch": 0.3413173652694611, "grad_norm": 0.01364520750939846, "learning_rate": 9.271881946023309e-06, "loss": 0.0, "step": 912 }, { "epoch": 0.34206586826347307, "grad_norm": 0.13824740052223206, "learning_rate": 9.268580438164157e-06, "loss": 0.0001, "step": 914 }, { "epoch": 0.34281437125748504, "grad_norm": 0.02371104806661606, "learning_rate": 9.265272052770936e-06, "loss": 0.0, "step": 916 }, { "epoch": 0.343562874251497, "grad_norm": 0.07769843935966492, "learning_rate": 9.261956795174116e-06, "loss": 0.0002, "step": 918 }, { "epoch": 0.344311377245509, "grad_norm": 0.0038983135018497705, "learning_rate": 9.25863467071524e-06, "loss": 0.0, "step": 920 }, { "epoch": 0.344311377245509, "eval_accuracy": 0.9999698145371103, "eval_loss": 0.0001175394281744957, "eval_runtime": 154.4022, "eval_samples_per_second": 32.383, "eval_steps_per_second": 8.096, "step": 920 }, { "epoch": 0.34505988023952094, "grad_norm": 0.04528482258319855, "learning_rate": 9.255305684746908e-06, "loss": 0.0001, "step": 922 }, { "epoch": 0.3458083832335329, "grad_norm": 0.04112999513745308, "learning_rate": 9.251969842632785e-06, "loss": 0.0, "step": 924 }, { "epoch": 0.34655688622754494, "grad_norm": 0.01982693374156952, "learning_rate": 9.248627149747573e-06, "loss": 0.0, "step": 926 }, { "epoch": 0.3473053892215569, "grad_norm": 0.002507114317268133, "learning_rate": 9.24527761147702e-06, "loss": 0.0, "step": 928 }, { "epoch": 0.34805389221556887, "grad_norm": 0.018373820930719376, "learning_rate": 9.241921233217899e-06, "loss": 0.0, "step": 930 }, { "epoch": 0.34880239520958084, "grad_norm": 0.015127432532608509, "learning_rate": 9.238558020378003e-06, "loss": 0.0, "step": 932 }, { "epoch": 0.3495508982035928, "grad_norm": 0.006092644762247801, "learning_rate": 9.235187978376141e-06, "loss": 0.0001, "step": 934 }, { "epoch": 0.3502994011976048, "grad_norm": 0.14546248316764832, "learning_rate": 9.231811112642121e-06, "loss": 0.0002, "step": 936 }, { "epoch": 0.35104790419161674, "grad_norm": 0.003949570469558239, "learning_rate": 9.228427428616749e-06, "loss": 0.0001, "step": 938 }, { "epoch": 0.35179640718562877, "grad_norm": 0.008468257263302803, "learning_rate": 9.225036931751811e-06, "loss": 0.0002, "step": 940 }, { "epoch": 0.35254491017964074, "grad_norm": 0.10494138300418854, "learning_rate": 9.221639627510076e-06, "loss": 0.0002, "step": 942 }, { "epoch": 0.3532934131736527, "grad_norm": 0.06659938395023346, "learning_rate": 9.218235521365278e-06, "loss": 0.0004, "step": 944 }, { "epoch": 0.35404191616766467, "grad_norm": 0.09659219533205032, "learning_rate": 9.214824618802108e-06, "loss": 0.0001, "step": 946 }, { "epoch": 0.35479041916167664, "grad_norm": 0.022609582170844078, "learning_rate": 9.211406925316214e-06, "loss": 0.0001, "step": 948 }, { "epoch": 0.3555389221556886, "grad_norm": 0.017719948664307594, "learning_rate": 9.20798244641418e-06, "loss": 0.0001, "step": 950 }, { "epoch": 0.3562874251497006, "grad_norm": 0.06319057196378708, "learning_rate": 9.204551187613521e-06, "loss": 0.0002, "step": 952 }, { "epoch": 0.3570359281437126, "grad_norm": 0.03745066374540329, "learning_rate": 9.201113154442685e-06, "loss": 0.0001, "step": 954 }, { "epoch": 0.35778443113772457, "grad_norm": 0.021028850227594376, "learning_rate": 9.197668352441025e-06, "loss": 0.0, "step": 956 }, { "epoch": 0.35853293413173654, "grad_norm": 0.02389431931078434, "learning_rate": 9.194216787158805e-06, "loss": 0.0001, "step": 958 }, { "epoch": 0.3592814371257485, "grad_norm": 0.03340911120176315, "learning_rate": 9.190758464157184e-06, "loss": 0.0002, "step": 960 }, { "epoch": 0.3592814371257485, "eval_accuracy": 0.9999895989441676, "eval_loss": 5.814629912492819e-05, "eval_runtime": 155.1915, "eval_samples_per_second": 32.218, "eval_steps_per_second": 8.055, "step": 960 }, { "epoch": 0.36002994011976047, "grad_norm": 0.016582539305090904, "learning_rate": 9.18729338900821e-06, "loss": 0.0, "step": 962 }, { "epoch": 0.36077844311377244, "grad_norm": 0.009625283069908619, "learning_rate": 9.18382156729481e-06, "loss": 0.0, "step": 964 }, { "epoch": 0.3615269461077844, "grad_norm": 0.0010095112957060337, "learning_rate": 9.18034300461078e-06, "loss": 0.0, "step": 966 }, { "epoch": 0.36227544910179643, "grad_norm": 0.0017203768948093057, "learning_rate": 9.17685770656078e-06, "loss": 0.0, "step": 968 }, { "epoch": 0.3630239520958084, "grad_norm": 0.03041454404592514, "learning_rate": 9.173365678760318e-06, "loss": 0.0, "step": 970 }, { "epoch": 0.36377245508982037, "grad_norm": 0.3855910897254944, "learning_rate": 9.169866926835749e-06, "loss": 0.0002, "step": 972 }, { "epoch": 0.36452095808383234, "grad_norm": 0.04365074634552002, "learning_rate": 9.166361456424257e-06, "loss": 0.0001, "step": 974 }, { "epoch": 0.3652694610778443, "grad_norm": 0.007284363266080618, "learning_rate": 9.162849273173857e-06, "loss": 0.0, "step": 976 }, { "epoch": 0.36601796407185627, "grad_norm": 0.043903883546590805, "learning_rate": 9.159330382743375e-06, "loss": 0.0, "step": 978 }, { "epoch": 0.36676646706586824, "grad_norm": 0.016840385273098946, "learning_rate": 9.155804790802444e-06, "loss": 0.0, "step": 980 }, { "epoch": 0.36751497005988026, "grad_norm": 0.11350879073143005, "learning_rate": 9.152272503031496e-06, "loss": 0.0, "step": 982 }, { "epoch": 0.36826347305389223, "grad_norm": 0.06382304430007935, "learning_rate": 9.148733525121751e-06, "loss": 0.0002, "step": 984 }, { "epoch": 0.3690119760479042, "grad_norm": 0.09389964491128922, "learning_rate": 9.145187862775208e-06, "loss": 0.0001, "step": 986 }, { "epoch": 0.36976047904191617, "grad_norm": 0.002736086491495371, "learning_rate": 9.141635521704638e-06, "loss": 0.0001, "step": 988 }, { "epoch": 0.37050898203592814, "grad_norm": 0.07442247867584229, "learning_rate": 9.138076507633566e-06, "loss": 0.0001, "step": 990 }, { "epoch": 0.3712574850299401, "grad_norm": 0.026373956352472305, "learning_rate": 9.134510826296277e-06, "loss": 0.0, "step": 992 }, { "epoch": 0.37200598802395207, "grad_norm": 0.0026233713142573833, "learning_rate": 9.130938483437792e-06, "loss": 0.0001, "step": 994 }, { "epoch": 0.3727544910179641, "grad_norm": 0.1102319285273552, "learning_rate": 9.12735948481387e-06, "loss": 0.0001, "step": 996 }, { "epoch": 0.37350299401197606, "grad_norm": 0.08953434228897095, "learning_rate": 9.12377383619099e-06, "loss": 0.0, "step": 998 }, { "epoch": 0.37425149700598803, "grad_norm": 0.026983065530657768, "learning_rate": 9.120181543346348e-06, "loss": 0.0, "step": 1000 }, { "epoch": 0.37425149700598803, "eval_accuracy": 0.9999914303936028, "eval_loss": 4.0267019357997924e-05, "eval_runtime": 154.1351, "eval_samples_per_second": 32.439, "eval_steps_per_second": 8.11, "step": 1000 }, { "epoch": 0.375, "grad_norm": 0.03703652322292328, "learning_rate": 9.11658261206784e-06, "loss": 0.0, "step": 1002 }, { "epoch": 0.37574850299401197, "grad_norm": 0.06015906482934952, "learning_rate": 9.112977048154066e-06, "loss": 0.0, "step": 1004 }, { "epoch": 0.37649700598802394, "grad_norm": 0.171669602394104, "learning_rate": 9.109364857414306e-06, "loss": 0.0001, "step": 1006 }, { "epoch": 0.3772455089820359, "grad_norm": 0.027005095034837723, "learning_rate": 9.10574604566852e-06, "loss": 0.0, "step": 1008 }, { "epoch": 0.37799401197604793, "grad_norm": 0.06816914677619934, "learning_rate": 9.102120618747336e-06, "loss": 0.0, "step": 1010 }, { "epoch": 0.3787425149700599, "grad_norm": 0.029688792303204536, "learning_rate": 9.09848858249204e-06, "loss": 0.0, "step": 1012 }, { "epoch": 0.37949101796407186, "grad_norm": 0.0352199524641037, "learning_rate": 9.094849942754564e-06, "loss": 0.0, "step": 1014 }, { "epoch": 0.38023952095808383, "grad_norm": 0.42947277426719666, "learning_rate": 9.091204705397485e-06, "loss": 0.0002, "step": 1016 }, { "epoch": 0.3809880239520958, "grad_norm": 0.038584258407354355, "learning_rate": 9.087552876294003e-06, "loss": 0.0, "step": 1018 }, { "epoch": 0.38173652694610777, "grad_norm": 0.2603873312473297, "learning_rate": 9.083894461327946e-06, "loss": 0.0015, "step": 1020 }, { "epoch": 0.38248502994011974, "grad_norm": 0.13592231273651123, "learning_rate": 9.08022946639375e-06, "loss": 0.0002, "step": 1022 }, { "epoch": 0.38323353293413176, "grad_norm": 0.013513598591089249, "learning_rate": 9.076557897396452e-06, "loss": 0.0001, "step": 1024 }, { "epoch": 0.38398203592814373, "grad_norm": 0.06492534279823303, "learning_rate": 9.07287976025168e-06, "loss": 0.0001, "step": 1026 }, { "epoch": 0.3847305389221557, "grad_norm": 0.04138237237930298, "learning_rate": 9.069195060885647e-06, "loss": 0.0002, "step": 1028 }, { "epoch": 0.38547904191616766, "grad_norm": 0.013964397832751274, "learning_rate": 9.065503805235139e-06, "loss": 0.0001, "step": 1030 }, { "epoch": 0.38622754491017963, "grad_norm": 0.1758122593164444, "learning_rate": 9.061805999247504e-06, "loss": 0.0001, "step": 1032 }, { "epoch": 0.3869760479041916, "grad_norm": 0.185356006026268, "learning_rate": 9.058101648880646e-06, "loss": 0.0003, "step": 1034 }, { "epoch": 0.38772455089820357, "grad_norm": 0.020207742229104042, "learning_rate": 9.05439076010301e-06, "loss": 0.0003, "step": 1036 }, { "epoch": 0.3884730538922156, "grad_norm": 0.07574658840894699, "learning_rate": 9.050673338893578e-06, "loss": 0.0002, "step": 1038 }, { "epoch": 0.38922155688622756, "grad_norm": 0.15675880014896393, "learning_rate": 9.046949391241859e-06, "loss": 0.0003, "step": 1040 }, { "epoch": 0.38922155688622756, "eval_accuracy": 0.9999522992784509, "eval_loss": 0.00014203271712176502, "eval_runtime": 154.4084, "eval_samples_per_second": 32.382, "eval_steps_per_second": 8.095, "step": 1040 }, { "epoch": 0.38997005988023953, "grad_norm": 0.10081563144922256, "learning_rate": 9.043218923147874e-06, "loss": 0.0001, "step": 1042 }, { "epoch": 0.3907185628742515, "grad_norm": 0.028760971501469612, "learning_rate": 9.039481940622148e-06, "loss": 0.0003, "step": 1044 }, { "epoch": 0.39146706586826346, "grad_norm": 0.37775400280952454, "learning_rate": 9.035738449685707e-06, "loss": 0.0007, "step": 1046 }, { "epoch": 0.39221556886227543, "grad_norm": 0.14730341732501984, "learning_rate": 9.031988456370062e-06, "loss": 0.0003, "step": 1048 }, { "epoch": 0.3929640718562874, "grad_norm": 0.16259920597076416, "learning_rate": 9.0282319667172e-06, "loss": 0.0006, "step": 1050 }, { "epoch": 0.3937125748502994, "grad_norm": 0.11165869981050491, "learning_rate": 9.02446898677957e-06, "loss": 0.0002, "step": 1052 }, { "epoch": 0.3944610778443114, "grad_norm": 0.236286461353302, "learning_rate": 9.020699522620091e-06, "loss": 0.0006, "step": 1054 }, { "epoch": 0.39520958083832336, "grad_norm": 0.17146489024162292, "learning_rate": 9.016923580312114e-06, "loss": 0.0006, "step": 1056 }, { "epoch": 0.39595808383233533, "grad_norm": 0.13749942183494568, "learning_rate": 9.013141165939439e-06, "loss": 0.0005, "step": 1058 }, { "epoch": 0.3967065868263473, "grad_norm": 0.0854322612285614, "learning_rate": 9.009352285596287e-06, "loss": 0.0004, "step": 1060 }, { "epoch": 0.39745508982035926, "grad_norm": 0.3005140423774719, "learning_rate": 9.005556945387301e-06, "loss": 0.0009, "step": 1062 }, { "epoch": 0.39820359281437123, "grad_norm": 0.061198897659778595, "learning_rate": 9.001755151427532e-06, "loss": 0.0002, "step": 1064 }, { "epoch": 0.39895209580838326, "grad_norm": 0.13300061225891113, "learning_rate": 8.997946909842426e-06, "loss": 0.0003, "step": 1066 }, { "epoch": 0.3997005988023952, "grad_norm": 0.05639196187257767, "learning_rate": 8.99413222676782e-06, "loss": 0.0002, "step": 1068 }, { "epoch": 0.4004491017964072, "grad_norm": 0.0920565128326416, "learning_rate": 8.990311108349926e-06, "loss": 0.0002, "step": 1070 }, { "epoch": 0.40119760479041916, "grad_norm": 0.2794632613658905, "learning_rate": 8.986483560745335e-06, "loss": 0.0003, "step": 1072 }, { "epoch": 0.40194610778443113, "grad_norm": 0.05511578544974327, "learning_rate": 8.982649590120982e-06, "loss": 0.0001, "step": 1074 }, { "epoch": 0.4026946107784431, "grad_norm": 0.11161552369594574, "learning_rate": 8.978809202654161e-06, "loss": 0.0003, "step": 1076 }, { "epoch": 0.40344311377245506, "grad_norm": 0.04912755638360977, "learning_rate": 8.974962404532503e-06, "loss": 0.0002, "step": 1078 }, { "epoch": 0.4041916167664671, "grad_norm": 0.130497545003891, "learning_rate": 8.971109201953962e-06, "loss": 0.0002, "step": 1080 }, { "epoch": 0.4041916167664671, "eval_accuracy": 0.9999309907446557, "eval_loss": 0.00030308307032100856, "eval_runtime": 157.0526, "eval_samples_per_second": 31.836, "eval_steps_per_second": 7.959, "step": 1080 }, { "epoch": 0.40494011976047906, "grad_norm": 0.06516057252883911, "learning_rate": 8.967249601126821e-06, "loss": 0.0001, "step": 1082 }, { "epoch": 0.405688622754491, "grad_norm": 0.0653974711894989, "learning_rate": 8.963383608269665e-06, "loss": 0.0001, "step": 1084 }, { "epoch": 0.406437125748503, "grad_norm": 0.1652081459760666, "learning_rate": 8.959511229611377e-06, "loss": 0.0005, "step": 1086 }, { "epoch": 0.40718562874251496, "grad_norm": 0.2547818720340729, "learning_rate": 8.955632471391132e-06, "loss": 0.0004, "step": 1088 }, { "epoch": 0.40793413173652693, "grad_norm": 0.11153703182935715, "learning_rate": 8.951747339858383e-06, "loss": 0.0001, "step": 1090 }, { "epoch": 0.4086826347305389, "grad_norm": 0.20618999004364014, "learning_rate": 8.947855841272852e-06, "loss": 0.0004, "step": 1092 }, { "epoch": 0.4094311377245509, "grad_norm": 0.06252986937761307, "learning_rate": 8.943957981904518e-06, "loss": 0.0003, "step": 1094 }, { "epoch": 0.4101796407185629, "grad_norm": 0.12335634976625443, "learning_rate": 8.94005376803361e-06, "loss": 0.0002, "step": 1096 }, { "epoch": 0.41092814371257486, "grad_norm": 0.15102048218250275, "learning_rate": 8.936143205950596e-06, "loss": 0.0003, "step": 1098 }, { "epoch": 0.4116766467065868, "grad_norm": 0.2645941376686096, "learning_rate": 8.93222630195617e-06, "loss": 0.0002, "step": 1100 }, { "epoch": 0.4124251497005988, "grad_norm": 0.16175216436386108, "learning_rate": 8.928303062361244e-06, "loss": 0.0002, "step": 1102 }, { "epoch": 0.41317365269461076, "grad_norm": 0.390656977891922, "learning_rate": 8.924373493486941e-06, "loss": 0.0008, "step": 1104 }, { "epoch": 0.41392215568862273, "grad_norm": 0.19943471252918243, "learning_rate": 8.92043760166458e-06, "loss": 0.0006, "step": 1106 }, { "epoch": 0.41467065868263475, "grad_norm": 0.08877554535865784, "learning_rate": 8.916495393235666e-06, "loss": 0.0003, "step": 1108 }, { "epoch": 0.4154191616766467, "grad_norm": 0.02073746733367443, "learning_rate": 8.912546874551883e-06, "loss": 0.0003, "step": 1110 }, { "epoch": 0.4161676646706587, "grad_norm": 0.229649618268013, "learning_rate": 8.908592051975083e-06, "loss": 0.0003, "step": 1112 }, { "epoch": 0.41691616766467066, "grad_norm": 0.2585594952106476, "learning_rate": 8.904630931877271e-06, "loss": 0.0005, "step": 1114 }, { "epoch": 0.4176646706586826, "grad_norm": 0.09236887842416763, "learning_rate": 8.900663520640605e-06, "loss": 0.0003, "step": 1116 }, { "epoch": 0.4184131736526946, "grad_norm": 0.1604318916797638, "learning_rate": 8.896689824657371e-06, "loss": 0.0008, "step": 1118 }, { "epoch": 0.41916167664670656, "grad_norm": 0.1640581637620926, "learning_rate": 8.892709850329991e-06, "loss": 0.0009, "step": 1120 }, { "epoch": 0.41916167664670656, "eval_accuracy": 0.9998298540459971, "eval_loss": 0.0006637079059146345, "eval_runtime": 158.0379, "eval_samples_per_second": 31.638, "eval_steps_per_second": 7.909, "step": 1120 }, { "epoch": 0.4199101796407186, "grad_norm": 0.1830235719680786, "learning_rate": 8.88872360407099e-06, "loss": 0.0007, "step": 1122 }, { "epoch": 0.42065868263473055, "grad_norm": 0.15978126227855682, "learning_rate": 8.884731092303011e-06, "loss": 0.0008, "step": 1124 }, { "epoch": 0.4214071856287425, "grad_norm": 0.07531040906906128, "learning_rate": 8.880732321458785e-06, "loss": 0.0004, "step": 1126 }, { "epoch": 0.4221556886227545, "grad_norm": 0.07047852128744125, "learning_rate": 8.876727297981129e-06, "loss": 0.0004, "step": 1128 }, { "epoch": 0.42290419161676646, "grad_norm": 0.11832007020711899, "learning_rate": 8.872716028322931e-06, "loss": 0.0006, "step": 1130 }, { "epoch": 0.4236526946107784, "grad_norm": 0.11789973080158234, "learning_rate": 8.868698518947152e-06, "loss": 0.0003, "step": 1132 }, { "epoch": 0.4244011976047904, "grad_norm": 0.06593231111764908, "learning_rate": 8.864674776326798e-06, "loss": 0.0003, "step": 1134 }, { "epoch": 0.4251497005988024, "grad_norm": 0.12147919833660126, "learning_rate": 8.860644806944917e-06, "loss": 0.0003, "step": 1136 }, { "epoch": 0.4258982035928144, "grad_norm": 0.014330295845866203, "learning_rate": 8.8566086172946e-06, "loss": 0.0001, "step": 1138 }, { "epoch": 0.42664670658682635, "grad_norm": 0.13002386689186096, "learning_rate": 8.852566213878947e-06, "loss": 0.0002, "step": 1140 }, { "epoch": 0.4273952095808383, "grad_norm": 0.028262050822377205, "learning_rate": 8.84851760321108e-06, "loss": 0.0001, "step": 1142 }, { "epoch": 0.4281437125748503, "grad_norm": 0.058746110647916794, "learning_rate": 8.844462791814113e-06, "loss": 0.0002, "step": 1144 }, { "epoch": 0.42889221556886226, "grad_norm": 0.006739677395671606, "learning_rate": 8.84040178622116e-06, "loss": 0.0, "step": 1146 }, { "epoch": 0.4296407185628742, "grad_norm": 0.0508301667869091, "learning_rate": 8.83633459297531e-06, "loss": 0.0, "step": 1148 }, { "epoch": 0.43038922155688625, "grad_norm": 0.06738423556089401, "learning_rate": 8.83226121862962e-06, "loss": 0.0, "step": 1150 }, { "epoch": 0.4311377245508982, "grad_norm": 0.093570277094841, "learning_rate": 8.828181669747111e-06, "loss": 0.0002, "step": 1152 }, { "epoch": 0.4318862275449102, "grad_norm": 0.22780318558216095, "learning_rate": 8.824095952900746e-06, "loss": 0.0003, "step": 1154 }, { "epoch": 0.43263473053892215, "grad_norm": 0.006822109688073397, "learning_rate": 8.820004074673433e-06, "loss": 0.0, "step": 1156 }, { "epoch": 0.4333832335329341, "grad_norm": 0.03218008950352669, "learning_rate": 8.815906041658001e-06, "loss": 0.0, "step": 1158 }, { "epoch": 0.4341317365269461, "grad_norm": 0.021438656374812126, "learning_rate": 8.8118018604572e-06, "loss": 0.0, "step": 1160 }, { "epoch": 0.4341317365269461, "eval_accuracy": 0.9999750413955637, "eval_loss": 0.000117507777758874, "eval_runtime": 153.4917, "eval_samples_per_second": 32.575, "eval_steps_per_second": 8.144, "step": 1160 }, { "epoch": 0.43488023952095806, "grad_norm": 0.003635610453784466, "learning_rate": 8.807691537683685e-06, "loss": 0.0001, "step": 1162 }, { "epoch": 0.4356287425149701, "grad_norm": 0.03301112353801727, "learning_rate": 8.80357507996e-06, "loss": 0.0, "step": 1164 }, { "epoch": 0.43637724550898205, "grad_norm": 0.09721909463405609, "learning_rate": 8.799452493918586e-06, "loss": 0.0002, "step": 1166 }, { "epoch": 0.437125748502994, "grad_norm": 0.042455609887838364, "learning_rate": 8.795323786201746e-06, "loss": 0.0, "step": 1168 }, { "epoch": 0.437874251497006, "grad_norm": 0.022428715601563454, "learning_rate": 8.791188963461653e-06, "loss": 0.0001, "step": 1170 }, { "epoch": 0.43862275449101795, "grad_norm": 0.0063305930234491825, "learning_rate": 8.787048032360332e-06, "loss": 0.0, "step": 1172 }, { "epoch": 0.4393712574850299, "grad_norm": 0.027178645133972168, "learning_rate": 8.782900999569646e-06, "loss": 0.0001, "step": 1174 }, { "epoch": 0.44011976047904194, "grad_norm": 0.016478972509503365, "learning_rate": 8.778747871771293e-06, "loss": 0.0001, "step": 1176 }, { "epoch": 0.4408682634730539, "grad_norm": 0.0024430316407233477, "learning_rate": 8.774588655656787e-06, "loss": 0.0, "step": 1178 }, { "epoch": 0.4416167664670659, "grad_norm": 0.0025271910708397627, "learning_rate": 8.770423357927463e-06, "loss": 0.0, "step": 1180 }, { "epoch": 0.44236526946107785, "grad_norm": 0.003522343933582306, "learning_rate": 8.766251985294435e-06, "loss": 0.0001, "step": 1182 }, { "epoch": 0.4431137724550898, "grad_norm": 0.0037217664066702127, "learning_rate": 8.762074544478622e-06, "loss": 0.0002, "step": 1184 }, { "epoch": 0.4438622754491018, "grad_norm": 0.003973813261836767, "learning_rate": 8.757891042210713e-06, "loss": 0.0, "step": 1186 }, { "epoch": 0.44461077844311375, "grad_norm": 0.005763462278991938, "learning_rate": 8.753701485231165e-06, "loss": 0.0, "step": 1188 }, { "epoch": 0.4453592814371258, "grad_norm": 0.005196116399019957, "learning_rate": 8.749505880290188e-06, "loss": 0.0, "step": 1190 }, { "epoch": 0.44610778443113774, "grad_norm": 0.0032948690932244062, "learning_rate": 8.74530423414774e-06, "loss": 0.0, "step": 1192 }, { "epoch": 0.4468562874251497, "grad_norm": 0.009100310504436493, "learning_rate": 8.741096553573506e-06, "loss": 0.0, "step": 1194 }, { "epoch": 0.4476047904191617, "grad_norm": 0.0061983345076441765, "learning_rate": 8.736882845346906e-06, "loss": 0.0, "step": 1196 }, { "epoch": 0.44835329341317365, "grad_norm": 0.0011341345962136984, "learning_rate": 8.732663116257057e-06, "loss": 0.0, "step": 1198 }, { "epoch": 0.4491017964071856, "grad_norm": 0.0010620451066643, "learning_rate": 8.728437373102784e-06, "loss": 0.0, "step": 1200 }, { "epoch": 0.4491017964071856, "eval_accuracy": 0.9999988700564972, "eval_loss": 9.44121893553529e-06, "eval_runtime": 153.9703, "eval_samples_per_second": 32.474, "eval_steps_per_second": 8.118, "step": 1200 }, { "epoch": 0.4498502994011976, "grad_norm": 0.0015724517870694399, "learning_rate": 8.724205622692608e-06, "loss": 0.0001, "step": 1202 }, { "epoch": 0.4505988023952096, "grad_norm": 0.0011631123488768935, "learning_rate": 8.719967871844715e-06, "loss": 0.0, "step": 1204 }, { "epoch": 0.4513473053892216, "grad_norm": 0.003892699722200632, "learning_rate": 8.715724127386971e-06, "loss": 0.0, "step": 1206 }, { "epoch": 0.45209580838323354, "grad_norm": 0.001457210979424417, "learning_rate": 8.711474396156894e-06, "loss": 0.0, "step": 1208 }, { "epoch": 0.4528443113772455, "grad_norm": 0.04398849606513977, "learning_rate": 8.707218685001648e-06, "loss": 0.0, "step": 1210 }, { "epoch": 0.4535928143712575, "grad_norm": 0.0016321828588843346, "learning_rate": 8.702957000778029e-06, "loss": 0.0, "step": 1212 }, { "epoch": 0.45434131736526945, "grad_norm": 0.006306509952992201, "learning_rate": 8.698689350352465e-06, "loss": 0.0, "step": 1214 }, { "epoch": 0.4550898203592814, "grad_norm": 0.003088061697781086, "learning_rate": 8.69441574060099e-06, "loss": 0.0, "step": 1216 }, { "epoch": 0.45583832335329344, "grad_norm": 0.0059152874164283276, "learning_rate": 8.690136178409237e-06, "loss": 0.0, "step": 1218 }, { "epoch": 0.4565868263473054, "grad_norm": 0.0006292685866355896, "learning_rate": 8.685850670672438e-06, "loss": 0.0, "step": 1220 }, { "epoch": 0.4573353293413174, "grad_norm": 0.001894032466225326, "learning_rate": 8.681559224295401e-06, "loss": 0.0, "step": 1222 }, { "epoch": 0.45808383233532934, "grad_norm": 0.000890376337338239, "learning_rate": 8.6772618461925e-06, "loss": 0.0, "step": 1224 }, { "epoch": 0.4588323353293413, "grad_norm": 0.047846511006355286, "learning_rate": 8.672958543287666e-06, "loss": 0.0, "step": 1226 }, { "epoch": 0.4595808383233533, "grad_norm": 0.0039296639151871204, "learning_rate": 8.668649322514382e-06, "loss": 0.0, "step": 1228 }, { "epoch": 0.46032934131736525, "grad_norm": 0.0010013898136094213, "learning_rate": 8.66433419081566e-06, "loss": 0.0, "step": 1230 }, { "epoch": 0.46107784431137727, "grad_norm": 0.0013401862233877182, "learning_rate": 8.660013155144036e-06, "loss": 0.0, "step": 1232 }, { "epoch": 0.46182634730538924, "grad_norm": 0.01721956580877304, "learning_rate": 8.655686222461561e-06, "loss": 0.0, "step": 1234 }, { "epoch": 0.4625748502994012, "grad_norm": 0.10839847475290298, "learning_rate": 8.651353399739787e-06, "loss": 0.0, "step": 1236 }, { "epoch": 0.4633233532934132, "grad_norm": 0.0019374943803995848, "learning_rate": 8.647014693959754e-06, "loss": 0.0, "step": 1238 }, { "epoch": 0.46407185628742514, "grad_norm": 0.0010845274664461613, "learning_rate": 8.642670112111982e-06, "loss": 0.0, "step": 1240 }, { "epoch": 0.46407185628742514, "eval_accuracy": 0.999998779749899, "eval_loss": 7.805577297403943e-06, "eval_runtime": 153.998, "eval_samples_per_second": 32.468, "eval_steps_per_second": 8.117, "step": 1240 }, { "epoch": 0.4648203592814371, "grad_norm": 0.14908014237880707, "learning_rate": 8.63831966119646e-06, "loss": 0.0002, "step": 1242 }, { "epoch": 0.4655688622754491, "grad_norm": 0.0005565496394410729, "learning_rate": 8.633963348222628e-06, "loss": 0.0, "step": 1244 }, { "epoch": 0.4663173652694611, "grad_norm": 0.016671478748321533, "learning_rate": 8.629601180209382e-06, "loss": 0.0, "step": 1246 }, { "epoch": 0.46706586826347307, "grad_norm": 0.0019113154849037528, "learning_rate": 8.625233164185035e-06, "loss": 0.0, "step": 1248 }, { "epoch": 0.46781437125748504, "grad_norm": 0.24114775657653809, "learning_rate": 8.620859307187339e-06, "loss": 0.0002, "step": 1250 }, { "epoch": 0.468562874251497, "grad_norm": 0.12908697128295898, "learning_rate": 8.616479616263444e-06, "loss": 0.0001, "step": 1252 }, { "epoch": 0.469311377245509, "grad_norm": 0.04974567890167236, "learning_rate": 8.61209409846991e-06, "loss": 0.0, "step": 1254 }, { "epoch": 0.47005988023952094, "grad_norm": 0.14523474872112274, "learning_rate": 8.607702760872679e-06, "loss": 0.0005, "step": 1256 }, { "epoch": 0.4708083832335329, "grad_norm": 0.0028332837391644716, "learning_rate": 8.60330561054707e-06, "loss": 0.0003, "step": 1258 }, { "epoch": 0.47155688622754494, "grad_norm": 0.09443452209234238, "learning_rate": 8.598902654577768e-06, "loss": 0.0002, "step": 1260 }, { "epoch": 0.4723053892215569, "grad_norm": 0.1735847145318985, "learning_rate": 8.594493900058817e-06, "loss": 0.0005, "step": 1262 }, { "epoch": 0.47305389221556887, "grad_norm": 0.09729497134685516, "learning_rate": 8.590079354093594e-06, "loss": 0.0001, "step": 1264 }, { "epoch": 0.47380239520958084, "grad_norm": 0.014571278356015682, "learning_rate": 8.585659023794818e-06, "loss": 0.0001, "step": 1266 }, { "epoch": 0.4745508982035928, "grad_norm": 0.08880387991666794, "learning_rate": 8.581232916284519e-06, "loss": 0.0002, "step": 1268 }, { "epoch": 0.4752994011976048, "grad_norm": 0.056134164333343506, "learning_rate": 8.57680103869404e-06, "loss": 0.0001, "step": 1270 }, { "epoch": 0.47604790419161674, "grad_norm": 0.06601478904485703, "learning_rate": 8.572363398164017e-06, "loss": 0.0001, "step": 1272 }, { "epoch": 0.47679640718562877, "grad_norm": 0.04634417966008186, "learning_rate": 8.567920001844376e-06, "loss": 0.0001, "step": 1274 }, { "epoch": 0.47754491017964074, "grad_norm": 0.06786518543958664, "learning_rate": 8.563470856894316e-06, "loss": 0.0002, "step": 1276 }, { "epoch": 0.4782934131736527, "grad_norm": 0.0038419270422309637, "learning_rate": 8.559015970482292e-06, "loss": 0.0, "step": 1278 }, { "epoch": 0.47904191616766467, "grad_norm": 0.005290990229696035, "learning_rate": 8.554555349786016e-06, "loss": 0.0, "step": 1280 }, { "epoch": 0.47904191616766467, "eval_accuracy": 1.0, "eval_loss": 1.6949796190601774e-05, "eval_runtime": 162.9441, "eval_samples_per_second": 30.685, "eval_steps_per_second": 7.671, "step": 1280 }, { "epoch": 0.47979041916167664, "grad_norm": 0.015714962035417557, "learning_rate": 8.550089001992438e-06, "loss": 0.0, "step": 1282 }, { "epoch": 0.4805389221556886, "grad_norm": 0.016865752637386322, "learning_rate": 8.545616934297733e-06, "loss": 0.0, "step": 1284 }, { "epoch": 0.4812874251497006, "grad_norm": 0.0030540430452674627, "learning_rate": 8.541139153907296e-06, "loss": 0.0, "step": 1286 }, { "epoch": 0.4820359281437126, "grad_norm": 0.004076346755027771, "learning_rate": 8.536655668035723e-06, "loss": 0.0, "step": 1288 }, { "epoch": 0.48278443113772457, "grad_norm": 0.0015489828074350953, "learning_rate": 8.532166483906804e-06, "loss": 0.0, "step": 1290 }, { "epoch": 0.48353293413173654, "grad_norm": 0.0032020832877606153, "learning_rate": 8.527671608753508e-06, "loss": 0.0, "step": 1292 }, { "epoch": 0.4842814371257485, "grad_norm": 0.0029930637683719397, "learning_rate": 8.523171049817974e-06, "loss": 0.0, "step": 1294 }, { "epoch": 0.48502994011976047, "grad_norm": 0.00045903853606432676, "learning_rate": 8.518664814351502e-06, "loss": 0.0, "step": 1296 }, { "epoch": 0.48577844311377244, "grad_norm": 0.002175545785576105, "learning_rate": 8.514152909614538e-06, "loss": 0.0, "step": 1298 }, { "epoch": 0.4865269461077844, "grad_norm": 0.00034521182533353567, "learning_rate": 8.509635342876655e-06, "loss": 0.0, "step": 1300 }, { "epoch": 0.48727544910179643, "grad_norm": 0.0007213663193397224, "learning_rate": 8.505112121416554e-06, "loss": 0.0, "step": 1302 }, { "epoch": 0.4880239520958084, "grad_norm": 0.000988309970125556, "learning_rate": 8.500583252522053e-06, "loss": 0.0, "step": 1304 }, { "epoch": 0.48877245508982037, "grad_norm": 0.0006475381087511778, "learning_rate": 8.496048743490053e-06, "loss": 0.0, "step": 1306 }, { "epoch": 0.48952095808383234, "grad_norm": 0.004469580017030239, "learning_rate": 8.49150860162656e-06, "loss": 0.0, "step": 1308 }, { "epoch": 0.4902694610778443, "grad_norm": 0.0006323789712041616, "learning_rate": 8.486962834246646e-06, "loss": 0.0, "step": 1310 }, { "epoch": 0.49101796407185627, "grad_norm": 0.0003597615868784487, "learning_rate": 8.482411448674445e-06, "loss": 0.0, "step": 1312 }, { "epoch": 0.49176646706586824, "grad_norm": 0.0009737180080264807, "learning_rate": 8.477854452243149e-06, "loss": 0.0, "step": 1314 }, { "epoch": 0.49251497005988026, "grad_norm": 0.00047102788812480867, "learning_rate": 8.473291852294986e-06, "loss": 0.0, "step": 1316 }, { "epoch": 0.49326347305389223, "grad_norm": 0.0006197803886607289, "learning_rate": 8.468723656181219e-06, "loss": 0.0, "step": 1318 }, { "epoch": 0.4940119760479042, "grad_norm": 0.0006234439788386226, "learning_rate": 8.464149871262118e-06, "loss": 0.0, "step": 1320 }, { "epoch": 0.4940119760479042, "eval_accuracy": 1.0, "eval_loss": 1.9553074253053637e-06, "eval_runtime": 156.0817, "eval_samples_per_second": 32.035, "eval_steps_per_second": 8.009, "step": 1320 }, { "epoch": 0.49476047904191617, "grad_norm": 0.0012577202869579196, "learning_rate": 8.459570504906962e-06, "loss": 0.0, "step": 1322 }, { "epoch": 0.49550898203592814, "grad_norm": 0.0013276775134727359, "learning_rate": 8.454985564494025e-06, "loss": 0.0, "step": 1324 }, { "epoch": 0.4962574850299401, "grad_norm": 0.0007339988951571286, "learning_rate": 8.450395057410561e-06, "loss": 0.0, "step": 1326 }, { "epoch": 0.49700598802395207, "grad_norm": 0.00023488645092584193, "learning_rate": 8.445798991052791e-06, "loss": 0.0, "step": 1328 }, { "epoch": 0.4977544910179641, "grad_norm": 0.008007602766156197, "learning_rate": 8.441197372825892e-06, "loss": 0.0, "step": 1330 }, { "epoch": 0.49850299401197606, "grad_norm": 0.00031445815693587065, "learning_rate": 8.436590210143991e-06, "loss": 0.0, "step": 1332 }, { "epoch": 0.49925149700598803, "grad_norm": 0.000408270803745836, "learning_rate": 8.431977510430145e-06, "loss": 0.0, "step": 1334 }, { "epoch": 0.5, "grad_norm": 0.00037585021345876157, "learning_rate": 8.427359281116335e-06, "loss": 0.0, "step": 1336 }, { "epoch": 0.500748502994012, "grad_norm": 0.00020924248383380473, "learning_rate": 8.422735529643445e-06, "loss": 0.0, "step": 1338 }, { "epoch": 0.5014970059880239, "grad_norm": 0.00034080087789334357, "learning_rate": 8.418106263461261e-06, "loss": 0.0, "step": 1340 }, { "epoch": 0.5022455089820359, "grad_norm": 0.001094786450266838, "learning_rate": 8.413471490028456e-06, "loss": 0.0, "step": 1342 }, { "epoch": 0.5029940119760479, "grad_norm": 0.00020074410713277757, "learning_rate": 8.408831216812574e-06, "loss": 0.0, "step": 1344 }, { "epoch": 0.5037425149700598, "grad_norm": 0.00031989437411539257, "learning_rate": 8.404185451290017e-06, "loss": 0.0, "step": 1346 }, { "epoch": 0.5044910179640718, "grad_norm": 0.0004511797451414168, "learning_rate": 8.399534200946044e-06, "loss": 0.0, "step": 1348 }, { "epoch": 0.5052395209580839, "grad_norm": 0.0033039976842701435, "learning_rate": 8.394877473274743e-06, "loss": 0.0, "step": 1350 }, { "epoch": 0.5059880239520959, "grad_norm": 0.00033330474980175495, "learning_rate": 8.39021527577903e-06, "loss": 0.0, "step": 1352 }, { "epoch": 0.5067365269461078, "grad_norm": 0.00710050156340003, "learning_rate": 8.38554761597064e-06, "loss": 0.0, "step": 1354 }, { "epoch": 0.5074850299401198, "grad_norm": 0.00019886674999725074, "learning_rate": 8.380874501370098e-06, "loss": 0.0, "step": 1356 }, { "epoch": 0.5082335329341318, "grad_norm": 0.0007205790607258677, "learning_rate": 8.376195939506727e-06, "loss": 0.0, "step": 1358 }, { "epoch": 0.5089820359281437, "grad_norm": 0.00022511072165798396, "learning_rate": 8.371511937918616e-06, "loss": 0.0, "step": 1360 }, { "epoch": 0.5089820359281437, "eval_accuracy": 1.0, "eval_loss": 1.2852336794821895e-06, "eval_runtime": 155.6867, "eval_samples_per_second": 32.116, "eval_steps_per_second": 8.029, "step": 1360 }, { "epoch": 0.5097305389221557, "grad_norm": 0.000407641549827531, "learning_rate": 8.366822504152636e-06, "loss": 0.0, "step": 1362 }, { "epoch": 0.5104790419161677, "grad_norm": 0.001749478979036212, "learning_rate": 8.362127645764392e-06, "loss": 0.0, "step": 1364 }, { "epoch": 0.5112275449101796, "grad_norm": 0.00031519352342002094, "learning_rate": 8.357427370318239e-06, "loss": 0.0, "step": 1366 }, { "epoch": 0.5119760479041916, "grad_norm": 0.0004909691051580012, "learning_rate": 8.352721685387258e-06, "loss": 0.0, "step": 1368 }, { "epoch": 0.5127245508982036, "grad_norm": 0.0003617761831264943, "learning_rate": 8.348010598553245e-06, "loss": 0.0, "step": 1370 }, { "epoch": 0.5134730538922155, "grad_norm": 0.0003041178279090673, "learning_rate": 8.3432941174067e-06, "loss": 0.0, "step": 1372 }, { "epoch": 0.5142215568862275, "grad_norm": 0.0009227189584635198, "learning_rate": 8.338572249546813e-06, "loss": 0.0, "step": 1374 }, { "epoch": 0.5149700598802395, "grad_norm": 0.00037840052391402423, "learning_rate": 8.33384500258146e-06, "loss": 0.0, "step": 1376 }, { "epoch": 0.5157185628742516, "grad_norm": 0.0003028454084414989, "learning_rate": 8.329112384127172e-06, "loss": 0.0, "step": 1378 }, { "epoch": 0.5164670658682635, "grad_norm": 0.00024328533618245274, "learning_rate": 8.324374401809144e-06, "loss": 0.0, "step": 1380 }, { "epoch": 0.5172155688622755, "grad_norm": 0.00041966489516198635, "learning_rate": 8.319631063261209e-06, "loss": 0.0, "step": 1382 }, { "epoch": 0.5179640718562875, "grad_norm": 0.0001933051535161212, "learning_rate": 8.314882376125832e-06, "loss": 0.0, "step": 1384 }, { "epoch": 0.5187125748502994, "grad_norm": 0.00017571724310982972, "learning_rate": 8.310128348054093e-06, "loss": 0.0, "step": 1386 }, { "epoch": 0.5194610778443114, "grad_norm": 0.00010965206456603482, "learning_rate": 8.305368986705683e-06, "loss": 0.0, "step": 1388 }, { "epoch": 0.5202095808383234, "grad_norm": 0.00016223240527324378, "learning_rate": 8.300604299748876e-06, "loss": 0.0, "step": 1390 }, { "epoch": 0.5209580838323353, "grad_norm": 0.00020105067233089358, "learning_rate": 8.295834294860535e-06, "loss": 0.0, "step": 1392 }, { "epoch": 0.5217065868263473, "grad_norm": 0.00012361881090328097, "learning_rate": 8.291058979726092e-06, "loss": 0.0, "step": 1394 }, { "epoch": 0.5224550898203593, "grad_norm": 0.00031712997588329017, "learning_rate": 8.286278362039527e-06, "loss": 0.0, "step": 1396 }, { "epoch": 0.5232035928143712, "grad_norm": 0.0007049996056593955, "learning_rate": 8.281492449503372e-06, "loss": 0.0, "step": 1398 }, { "epoch": 0.5239520958083832, "grad_norm": 0.00017834010941442102, "learning_rate": 8.276701249828684e-06, "loss": 0.0, "step": 1400 }, { "epoch": 0.5239520958083832, "eval_accuracy": 1.0, "eval_loss": 1.0108310561918188e-06, "eval_runtime": 164.1638, "eval_samples_per_second": 30.457, "eval_steps_per_second": 7.614, "step": 1400 }, { "epoch": 0.5247005988023952, "grad_norm": 0.00016547701670788229, "learning_rate": 8.271904770735042e-06, "loss": 0.0, "step": 1402 }, { "epoch": 0.5254491017964071, "grad_norm": 0.00018670795543584973, "learning_rate": 8.267103019950529e-06, "loss": 0.0, "step": 1404 }, { "epoch": 0.5261976047904192, "grad_norm": 0.0005327853723429143, "learning_rate": 8.262296005211722e-06, "loss": 0.0, "step": 1406 }, { "epoch": 0.5269461077844312, "grad_norm": 0.00011932725465158, "learning_rate": 8.257483734263682e-06, "loss": 0.0, "step": 1408 }, { "epoch": 0.5276946107784432, "grad_norm": 0.0010636606020852923, "learning_rate": 8.252666214859936e-06, "loss": 0.0, "step": 1410 }, { "epoch": 0.5284431137724551, "grad_norm": 0.00017524044960737228, "learning_rate": 8.247843454762467e-06, "loss": 0.0, "step": 1412 }, { "epoch": 0.5291916167664671, "grad_norm": 0.00048075238009914756, "learning_rate": 8.243015461741707e-06, "loss": 0.0, "step": 1414 }, { "epoch": 0.5299401197604791, "grad_norm": 0.000147451224620454, "learning_rate": 8.238182243576512e-06, "loss": 0.0, "step": 1416 }, { "epoch": 0.530688622754491, "grad_norm": 0.00012748232984449714, "learning_rate": 8.233343808054159e-06, "loss": 0.0, "step": 1418 }, { "epoch": 0.531437125748503, "grad_norm": 0.00022851829999126494, "learning_rate": 8.228500162970333e-06, "loss": 0.0, "step": 1420 }, { "epoch": 0.532185628742515, "grad_norm": 0.00014870429004076868, "learning_rate": 8.223651316129115e-06, "loss": 0.0, "step": 1422 }, { "epoch": 0.5329341317365269, "grad_norm": 0.00023895545746199787, "learning_rate": 8.21879727534296e-06, "loss": 0.0, "step": 1424 }, { "epoch": 0.5336826347305389, "grad_norm": 0.0001619049144210294, "learning_rate": 8.213938048432697e-06, "loss": 0.0, "step": 1426 }, { "epoch": 0.5344311377245509, "grad_norm": 0.0002205699129262939, "learning_rate": 8.20907364322751e-06, "loss": 0.0, "step": 1428 }, { "epoch": 0.5351796407185628, "grad_norm": 0.0001235086820088327, "learning_rate": 8.204204067564924e-06, "loss": 0.0, "step": 1430 }, { "epoch": 0.5359281437125748, "grad_norm": 0.00023735742433927953, "learning_rate": 8.199329329290798e-06, "loss": 0.0, "step": 1432 }, { "epoch": 0.5366766467065869, "grad_norm": 0.00017734240100253373, "learning_rate": 8.194449436259305e-06, "loss": 0.0, "step": 1434 }, { "epoch": 0.5374251497005988, "grad_norm": 0.00017838150961324573, "learning_rate": 8.189564396332927e-06, "loss": 0.0, "step": 1436 }, { "epoch": 0.5381736526946108, "grad_norm": 0.0001554026093799621, "learning_rate": 8.184674217382438e-06, "loss": 0.0, "step": 1438 }, { "epoch": 0.5389221556886228, "grad_norm": 0.00021117751020938158, "learning_rate": 8.179778907286889e-06, "loss": 0.0, "step": 1440 }, { "epoch": 0.5389221556886228, "eval_accuracy": 1.0, "eval_loss": 8.898123837752792e-07, "eval_runtime": 163.5535, "eval_samples_per_second": 30.571, "eval_steps_per_second": 7.643, "step": 1440 }, { "epoch": 0.5396706586826348, "grad_norm": 0.0002229697274742648, "learning_rate": 8.174878473933601e-06, "loss": 0.0, "step": 1442 }, { "epoch": 0.5404191616766467, "grad_norm": 9.283604595111683e-05, "learning_rate": 8.16997292521815e-06, "loss": 0.0, "step": 1444 }, { "epoch": 0.5411676646706587, "grad_norm": 0.0001886676182039082, "learning_rate": 8.165062269044353e-06, "loss": 0.0, "step": 1446 }, { "epoch": 0.5419161676646707, "grad_norm": 0.0001873478468041867, "learning_rate": 8.160146513324256e-06, "loss": 0.0, "step": 1448 }, { "epoch": 0.5426646706586826, "grad_norm": 0.00011518682003952563, "learning_rate": 8.15522566597812e-06, "loss": 0.0, "step": 1450 }, { "epoch": 0.5434131736526946, "grad_norm": 0.0001653393410379067, "learning_rate": 8.150299734934413e-06, "loss": 0.0, "step": 1452 }, { "epoch": 0.5441616766467066, "grad_norm": 0.0001261346333194524, "learning_rate": 8.14536872812979e-06, "loss": 0.0, "step": 1454 }, { "epoch": 0.5449101796407185, "grad_norm": 0.00019827600044663996, "learning_rate": 8.140432653509089e-06, "loss": 0.0, "step": 1456 }, { "epoch": 0.5456586826347305, "grad_norm": 0.00042794988257810473, "learning_rate": 8.135491519025307e-06, "loss": 0.0, "step": 1458 }, { "epoch": 0.5464071856287425, "grad_norm": 0.00011408683349145576, "learning_rate": 8.130545332639599e-06, "loss": 0.0, "step": 1460 }, { "epoch": 0.5471556886227545, "grad_norm": 0.0001599421666469425, "learning_rate": 8.125594102321256e-06, "loss": 0.0, "step": 1462 }, { "epoch": 0.5479041916167665, "grad_norm": 0.00013200732064433396, "learning_rate": 8.120637836047698e-06, "loss": 0.0, "step": 1464 }, { "epoch": 0.5486526946107785, "grad_norm": 0.00012747867731377482, "learning_rate": 8.115676541804456e-06, "loss": 0.0, "step": 1466 }, { "epoch": 0.5494011976047904, "grad_norm": 0.00013039227633271366, "learning_rate": 8.110710227585169e-06, "loss": 0.0, "step": 1468 }, { "epoch": 0.5501497005988024, "grad_norm": 0.0001378858432872221, "learning_rate": 8.105738901391553e-06, "loss": 0.0, "step": 1470 }, { "epoch": 0.5508982035928144, "grad_norm": 0.00019915043958462775, "learning_rate": 8.100762571233409e-06, "loss": 0.0, "step": 1472 }, { "epoch": 0.5516467065868264, "grad_norm": 0.00013786421914119273, "learning_rate": 8.095781245128598e-06, "loss": 0.0, "step": 1474 }, { "epoch": 0.5523952095808383, "grad_norm": 0.00011769870616262779, "learning_rate": 8.090794931103026e-06, "loss": 0.0, "step": 1476 }, { "epoch": 0.5531437125748503, "grad_norm": 0.0001314223773078993, "learning_rate": 8.085803637190643e-06, "loss": 0.0, "step": 1478 }, { "epoch": 0.5538922155688623, "grad_norm": 0.0017582617001608014, "learning_rate": 8.080807371433415e-06, "loss": 0.0, "step": 1480 }, { "epoch": 0.5538922155688623, "eval_accuracy": 1.0, "eval_loss": 7.754564421702526e-07, "eval_runtime": 165.5806, "eval_samples_per_second": 30.197, "eval_steps_per_second": 7.549, "step": 1480 }, { "epoch": 0.5546407185628742, "grad_norm": 0.0001497814228059724, "learning_rate": 8.075806141881327e-06, "loss": 0.0001, "step": 1482 }, { "epoch": 0.5553892215568862, "grad_norm": 0.022318042814731598, "learning_rate": 8.07079995659235e-06, "loss": 0.0, "step": 1484 }, { "epoch": 0.5561377245508982, "grad_norm": 0.00037483617779798806, "learning_rate": 8.065788823632451e-06, "loss": 0.0, "step": 1486 }, { "epoch": 0.5568862275449101, "grad_norm": 0.017027398571372032, "learning_rate": 8.060772751075564e-06, "loss": 0.0, "step": 1488 }, { "epoch": 0.5576347305389222, "grad_norm": 0.0005349895800463855, "learning_rate": 8.05575174700358e-06, "loss": 0.0, "step": 1490 }, { "epoch": 0.5583832335329342, "grad_norm": 9.526366193313152e-05, "learning_rate": 8.05072581950634e-06, "loss": 0.0, "step": 1492 }, { "epoch": 0.5591317365269461, "grad_norm": 0.0018775092903524637, "learning_rate": 8.045694976681613e-06, "loss": 0.0, "step": 1494 }, { "epoch": 0.5598802395209581, "grad_norm": 0.006572918966412544, "learning_rate": 8.04065922663509e-06, "loss": 0.0, "step": 1496 }, { "epoch": 0.5606287425149701, "grad_norm": 0.00012014804087812081, "learning_rate": 8.035618577480369e-06, "loss": 0.0, "step": 1498 }, { "epoch": 0.561377245508982, "grad_norm": 0.00019066958338953555, "learning_rate": 8.030573037338942e-06, "loss": 0.0, "step": 1500 }, { "epoch": 0.562125748502994, "grad_norm": 0.000158317168825306, "learning_rate": 8.025522614340177e-06, "loss": 0.0, "step": 1502 }, { "epoch": 0.562874251497006, "grad_norm": 0.0006968253292143345, "learning_rate": 8.020467316621316e-06, "loss": 0.0, "step": 1504 }, { "epoch": 0.563622754491018, "grad_norm": 0.00015155051369220018, "learning_rate": 8.015407152327448e-06, "loss": 0.0, "step": 1506 }, { "epoch": 0.5643712574850299, "grad_norm": 0.00017856295744422823, "learning_rate": 8.010342129611508e-06, "loss": 0.0, "step": 1508 }, { "epoch": 0.5651197604790419, "grad_norm": 0.0007032952271401882, "learning_rate": 8.005272256634257e-06, "loss": 0.0, "step": 1510 }, { "epoch": 0.5658682634730539, "grad_norm": 0.0001873714500106871, "learning_rate": 8.000197541564273e-06, "loss": 0.0, "step": 1512 }, { "epoch": 0.5666167664670658, "grad_norm": 0.00024626540835015476, "learning_rate": 7.99511799257793e-06, "loss": 0.0, "step": 1514 }, { "epoch": 0.5673652694610778, "grad_norm": 0.00011799616186181083, "learning_rate": 7.990033617859396e-06, "loss": 0.0, "step": 1516 }, { "epoch": 0.5681137724550899, "grad_norm": 0.00017817386833485216, "learning_rate": 7.984944425600614e-06, "loss": 0.0, "step": 1518 }, { "epoch": 0.5688622754491018, "grad_norm": 0.00012566296209115535, "learning_rate": 7.979850424001283e-06, "loss": 0.0, "step": 1520 }, { "epoch": 0.5688622754491018, "eval_accuracy": 0.9999997747747748, "eval_loss": 2.201959887315752e-06, "eval_runtime": 164.4185, "eval_samples_per_second": 30.41, "eval_steps_per_second": 7.603, "step": 1520 }, { "epoch": 0.5696107784431138, "grad_norm": 0.00012715034245047718, "learning_rate": 7.97475162126886e-06, "loss": 0.0, "step": 1522 }, { "epoch": 0.5703592814371258, "grad_norm": 0.0005135888350196183, "learning_rate": 7.96964802561853e-06, "loss": 0.0, "step": 1524 }, { "epoch": 0.5711077844311377, "grad_norm": 0.0008899507811293006, "learning_rate": 7.964539645273204e-06, "loss": 0.0, "step": 1526 }, { "epoch": 0.5718562874251497, "grad_norm": 0.00014234622358344495, "learning_rate": 7.9594264884635e-06, "loss": 0.0, "step": 1528 }, { "epoch": 0.5726047904191617, "grad_norm": 0.00044490184518508613, "learning_rate": 7.954308563427732e-06, "loss": 0.0, "step": 1530 }, { "epoch": 0.5733532934131736, "grad_norm": 0.0004495025204960257, "learning_rate": 7.9491858784119e-06, "loss": 0.0, "step": 1532 }, { "epoch": 0.5741017964071856, "grad_norm": 0.0003148759133182466, "learning_rate": 7.944058441669671e-06, "loss": 0.0, "step": 1534 }, { "epoch": 0.5748502994011976, "grad_norm": 0.01583685912191868, "learning_rate": 7.938926261462366e-06, "loss": 0.0, "step": 1536 }, { "epoch": 0.5755988023952096, "grad_norm": 0.0001242299476871267, "learning_rate": 7.933789346058951e-06, "loss": 0.0, "step": 1538 }, { "epoch": 0.5763473053892215, "grad_norm": 0.0006354754441417754, "learning_rate": 7.928647703736024e-06, "loss": 0.0, "step": 1540 }, { "epoch": 0.5770958083832335, "grad_norm": 0.00038198617403395474, "learning_rate": 7.923501342777788e-06, "loss": 0.0, "step": 1542 }, { "epoch": 0.5778443113772455, "grad_norm": 0.0022715749219059944, "learning_rate": 7.918350271476064e-06, "loss": 0.0, "step": 1544 }, { "epoch": 0.5785928143712575, "grad_norm": 7.122563692973927e-05, "learning_rate": 7.913194498130252e-06, "loss": 0.0, "step": 1546 }, { "epoch": 0.5793413173652695, "grad_norm": 0.00045244794455356896, "learning_rate": 7.90803403104733e-06, "loss": 0.0, "step": 1548 }, { "epoch": 0.5800898203592815, "grad_norm": 0.00015517730207648128, "learning_rate": 7.90286887854184e-06, "loss": 0.0, "step": 1550 }, { "epoch": 0.5808383233532934, "grad_norm": 0.0001861519122030586, "learning_rate": 7.897699048935875e-06, "loss": 0.0, "step": 1552 }, { "epoch": 0.5815868263473054, "grad_norm": 0.00018396999803371727, "learning_rate": 7.892524550559056e-06, "loss": 0.0, "step": 1554 }, { "epoch": 0.5823353293413174, "grad_norm": 0.000191383485798724, "learning_rate": 7.887345391748533e-06, "loss": 0.0, "step": 1556 }, { "epoch": 0.5830838323353293, "grad_norm": 0.0018197696190327406, "learning_rate": 7.882161580848966e-06, "loss": 0.0, "step": 1558 }, { "epoch": 0.5838323353293413, "grad_norm": 0.0004387347144074738, "learning_rate": 7.876973126212507e-06, "loss": 0.0, "step": 1560 }, { "epoch": 0.5838323353293413, "eval_accuracy": 1.0, "eval_loss": 1.0737475122368778e-06, "eval_runtime": 162.5348, "eval_samples_per_second": 30.763, "eval_steps_per_second": 7.691, "step": 1560 }, { "epoch": 0.5845808383233533, "grad_norm": 0.0022077385801821947, "learning_rate": 7.87178003619879e-06, "loss": 0.0, "step": 1562 }, { "epoch": 0.5853293413173652, "grad_norm": 0.0002099570701830089, "learning_rate": 7.866582319174918e-06, "loss": 0.0, "step": 1564 }, { "epoch": 0.5860778443113772, "grad_norm": 0.00012316112406551838, "learning_rate": 7.861379983515449e-06, "loss": 0.0, "step": 1566 }, { "epoch": 0.5868263473053892, "grad_norm": 0.00024173619749490172, "learning_rate": 7.856173037602383e-06, "loss": 0.0, "step": 1568 }, { "epoch": 0.5875748502994012, "grad_norm": 8.62416927702725e-05, "learning_rate": 7.85096148982515e-06, "loss": 0.0, "step": 1570 }, { "epoch": 0.5883233532934131, "grad_norm": 0.0014817145420238376, "learning_rate": 7.845745348580592e-06, "loss": 0.0, "step": 1572 }, { "epoch": 0.5890718562874252, "grad_norm": 0.0001866584934759885, "learning_rate": 7.840524622272949e-06, "loss": 0.0, "step": 1574 }, { "epoch": 0.5898203592814372, "grad_norm": 0.00014589863712899387, "learning_rate": 7.835299319313854e-06, "loss": 0.0, "step": 1576 }, { "epoch": 0.5905688622754491, "grad_norm": 0.0001201587583636865, "learning_rate": 7.830069448122313e-06, "loss": 0.0, "step": 1578 }, { "epoch": 0.5913173652694611, "grad_norm": 0.0004075410251971334, "learning_rate": 7.82483501712469e-06, "loss": 0.0, "step": 1580 }, { "epoch": 0.5920658682634731, "grad_norm": 0.0007497837650589645, "learning_rate": 7.819596034754696e-06, "loss": 0.0, "step": 1582 }, { "epoch": 0.592814371257485, "grad_norm": 0.0001937482156790793, "learning_rate": 7.81435250945338e-06, "loss": 0.0, "step": 1584 }, { "epoch": 0.593562874251497, "grad_norm": 0.00015605123189743608, "learning_rate": 7.8091044496691e-06, "loss": 0.0, "step": 1586 }, { "epoch": 0.594311377245509, "grad_norm": 0.0008134747622534633, "learning_rate": 7.803851863857533e-06, "loss": 0.0, "step": 1588 }, { "epoch": 0.5950598802395209, "grad_norm": 0.016990555450320244, "learning_rate": 7.798594760481639e-06, "loss": 0.0, "step": 1590 }, { "epoch": 0.5958083832335329, "grad_norm": 8.968533074948937e-05, "learning_rate": 7.793333148011658e-06, "loss": 0.0, "step": 1592 }, { "epoch": 0.5965568862275449, "grad_norm": 0.00021849350014235824, "learning_rate": 7.7880670349251e-06, "loss": 0.0, "step": 1594 }, { "epoch": 0.5973053892215568, "grad_norm": 0.00010406466026324779, "learning_rate": 7.782796429706721e-06, "loss": 0.0, "step": 1596 }, { "epoch": 0.5980538922155688, "grad_norm": 0.0001633252395549789, "learning_rate": 7.777521340848515e-06, "loss": 0.0, "step": 1598 }, { "epoch": 0.5988023952095808, "grad_norm": 0.00010362563625676557, "learning_rate": 7.772241776849705e-06, "loss": 0.0, "step": 1600 }, { "epoch": 0.5988023952095808, "eval_accuracy": 1.0, "eval_loss": 8.31741829188104e-07, "eval_runtime": 160.7648, "eval_samples_per_second": 31.101, "eval_steps_per_second": 7.775, "step": 1600 }, { "epoch": 0.5995508982035929, "grad_norm": 0.0001440924679627642, "learning_rate": 7.76695774621672e-06, "loss": 0.0, "step": 1602 }, { "epoch": 0.6002994011976048, "grad_norm": 0.00024095825210679322, "learning_rate": 7.761669257463188e-06, "loss": 0.0, "step": 1604 }, { "epoch": 0.6010479041916168, "grad_norm": 0.0010255592642351985, "learning_rate": 7.756376319109917e-06, "loss": 0.0, "step": 1606 }, { "epoch": 0.6017964071856288, "grad_norm": 0.0003512499970383942, "learning_rate": 7.751078939684886e-06, "loss": 0.0, "step": 1608 }, { "epoch": 0.6025449101796407, "grad_norm": 0.0001344636984867975, "learning_rate": 7.74577712772323e-06, "loss": 0.0, "step": 1610 }, { "epoch": 0.6032934131736527, "grad_norm": 0.00015981386241037399, "learning_rate": 7.740470891767225e-06, "loss": 0.0, "step": 1612 }, { "epoch": 0.6040419161676647, "grad_norm": 0.00013109891733620316, "learning_rate": 7.735160240366276e-06, "loss": 0.0, "step": 1614 }, { "epoch": 0.6047904191616766, "grad_norm": 0.00010152284085052088, "learning_rate": 7.729845182076896e-06, "loss": 0.0, "step": 1616 }, { "epoch": 0.6055389221556886, "grad_norm": 0.0010211137123405933, "learning_rate": 7.72452572546271e-06, "loss": 0.0, "step": 1618 }, { "epoch": 0.6062874251497006, "grad_norm": 0.00015486738993786275, "learning_rate": 7.71920187909442e-06, "loss": 0.0, "step": 1620 }, { "epoch": 0.6070359281437125, "grad_norm": 0.00011970350897172466, "learning_rate": 7.713873651549805e-06, "loss": 0.0, "step": 1622 }, { "epoch": 0.6077844311377245, "grad_norm": 7.165825081756338e-05, "learning_rate": 7.7085410514137e-06, "loss": 0.0, "step": 1624 }, { "epoch": 0.6085329341317365, "grad_norm": 0.00015439889102708548, "learning_rate": 7.703204087277989e-06, "loss": 0.0, "step": 1626 }, { "epoch": 0.6092814371257484, "grad_norm": 9.300145757151768e-05, "learning_rate": 7.697862767741584e-06, "loss": 0.0, "step": 1628 }, { "epoch": 0.6100299401197605, "grad_norm": 0.00014098809333518147, "learning_rate": 7.692517101410414e-06, "loss": 0.0, "step": 1630 }, { "epoch": 0.6107784431137725, "grad_norm": 0.00011235267447773367, "learning_rate": 7.68716709689742e-06, "loss": 0.0, "step": 1632 }, { "epoch": 0.6115269461077845, "grad_norm": 0.00014485006977338344, "learning_rate": 7.681812762822517e-06, "loss": 0.0, "step": 1634 }, { "epoch": 0.6122754491017964, "grad_norm": 8.417559729423374e-05, "learning_rate": 7.676454107812608e-06, "loss": 0.0, "step": 1636 }, { "epoch": 0.6130239520958084, "grad_norm": 0.00010664058936526999, "learning_rate": 7.671091140501557e-06, "loss": 0.0, "step": 1638 }, { "epoch": 0.6137724550898204, "grad_norm": 0.00019991857698187232, "learning_rate": 7.66572386953017e-06, "loss": 0.0, "step": 1640 }, { "epoch": 0.6137724550898204, "eval_accuracy": 1.0, "eval_loss": 6.937613648005936e-07, "eval_runtime": 164.9592, "eval_samples_per_second": 30.311, "eval_steps_per_second": 7.578, "step": 1640 }, { "epoch": 0.6145209580838323, "grad_norm": 0.00013954796304460615, "learning_rate": 7.660352303546192e-06, "loss": 0.0, "step": 1642 }, { "epoch": 0.6152694610778443, "grad_norm": 0.00022430805256590247, "learning_rate": 7.654976451204288e-06, "loss": 0.0, "step": 1644 }, { "epoch": 0.6160179640718563, "grad_norm": 8.612870442448184e-05, "learning_rate": 7.649596321166024e-06, "loss": 0.0, "step": 1646 }, { "epoch": 0.6167664670658682, "grad_norm": 0.00018191659182775766, "learning_rate": 7.644211922099867e-06, "loss": 0.0, "step": 1648 }, { "epoch": 0.6175149700598802, "grad_norm": 0.00012960025924257934, "learning_rate": 7.638823262681155e-06, "loss": 0.0, "step": 1650 }, { "epoch": 0.6182634730538922, "grad_norm": 0.00048115866957232356, "learning_rate": 7.633430351592093e-06, "loss": 0.0, "step": 1652 }, { "epoch": 0.6190119760479041, "grad_norm": 0.004453903064131737, "learning_rate": 7.6280331975217356e-06, "loss": 0.0, "step": 1654 }, { "epoch": 0.6197604790419161, "grad_norm": 0.00019366122432984412, "learning_rate": 7.622631809165972e-06, "loss": 0.0, "step": 1656 }, { "epoch": 0.6205089820359282, "grad_norm": 0.0001328593207290396, "learning_rate": 7.617226195227518e-06, "loss": 0.0, "step": 1658 }, { "epoch": 0.6212574850299402, "grad_norm": 0.00019288925977889448, "learning_rate": 7.611816364415896e-06, "loss": 0.0, "step": 1660 }, { "epoch": 0.6220059880239521, "grad_norm": 0.00020607073383871466, "learning_rate": 7.606402325447421e-06, "loss": 0.0, "step": 1662 }, { "epoch": 0.6227544910179641, "grad_norm": 0.0010563160758465528, "learning_rate": 7.600984087045187e-06, "loss": 0.0, "step": 1664 }, { "epoch": 0.6235029940119761, "grad_norm": 7.731228834018111e-05, "learning_rate": 7.595561657939061e-06, "loss": 0.0, "step": 1666 }, { "epoch": 0.624251497005988, "grad_norm": 0.00039579844451509416, "learning_rate": 7.590135046865652e-06, "loss": 0.0, "step": 1668 }, { "epoch": 0.625, "grad_norm": 0.00012335921928752214, "learning_rate": 7.584704262568315e-06, "loss": 0.0, "step": 1670 }, { "epoch": 0.625748502994012, "grad_norm": 8.330697164637968e-05, "learning_rate": 7.579269313797126e-06, "loss": 0.0, "step": 1672 }, { "epoch": 0.6264970059880239, "grad_norm": 0.00023956908262334764, "learning_rate": 7.573830209308872e-06, "loss": 0.0, "step": 1674 }, { "epoch": 0.6272455089820359, "grad_norm": 0.00013747882621828467, "learning_rate": 7.568386957867033e-06, "loss": 0.0, "step": 1676 }, { "epoch": 0.6279940119760479, "grad_norm": 0.00012644138769246638, "learning_rate": 7.562939568241772e-06, "loss": 0.0, "step": 1678 }, { "epoch": 0.6287425149700598, "grad_norm": 0.00021938206919003278, "learning_rate": 7.557488049209921e-06, "loss": 0.0, "step": 1680 }, { "epoch": 0.6287425149700598, "eval_accuracy": 1.0, "eval_loss": 6.026603500686178e-07, "eval_runtime": 164.079, "eval_samples_per_second": 30.473, "eval_steps_per_second": 7.618, "step": 1680 } ], "logging_steps": 2, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5997706414881505e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }