mpi_async_n4 / trainer_state.json
daman1209arora's picture
Upload folder using huggingface_hub
583d2f2 verified
{
"best_global_step": 1280,
"best_metric": 1.0,
"best_model_checkpoint": "/projects/bffw/darora1/llm_ipc/final_models/mpi_async_n4/checkpoint-1280",
"epoch": 0.6287425149700598,
"eval_steps": 40,
"global_step": 1680,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007485029940119761,
"grad_norm": 7.328390598297119,
"learning_rate": 2.0000000000000002e-07,
"loss": 0.48,
"step": 2
},
{
"epoch": 0.0014970059880239522,
"grad_norm": 7.235108852386475,
"learning_rate": 6.000000000000001e-07,
"loss": 0.4252,
"step": 4
},
{
"epoch": 0.002245508982035928,
"grad_norm": 8.011260986328125,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4299,
"step": 6
},
{
"epoch": 0.0029940119760479044,
"grad_norm": 6.425393104553223,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.4424,
"step": 8
},
{
"epoch": 0.0037425149700598802,
"grad_norm": 6.826442241668701,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.4549,
"step": 10
},
{
"epoch": 0.004491017964071856,
"grad_norm": 4.996034622192383,
"learning_rate": 2.2e-06,
"loss": 0.3498,
"step": 12
},
{
"epoch": 0.005239520958083832,
"grad_norm": 4.402273654937744,
"learning_rate": 2.6e-06,
"loss": 0.3288,
"step": 14
},
{
"epoch": 0.005988023952095809,
"grad_norm": 4.156887054443359,
"learning_rate": 3e-06,
"loss": 0.2507,
"step": 16
},
{
"epoch": 0.006736526946107785,
"grad_norm": 2.647883176803589,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.197,
"step": 18
},
{
"epoch": 0.0074850299401197605,
"grad_norm": 2.444559097290039,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.1474,
"step": 20
},
{
"epoch": 0.008233532934131737,
"grad_norm": 1.8110377788543701,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.1494,
"step": 22
},
{
"epoch": 0.008982035928143712,
"grad_norm": 1.4763550758361816,
"learning_rate": 4.600000000000001e-06,
"loss": 0.107,
"step": 24
},
{
"epoch": 0.009730538922155689,
"grad_norm": 1.2829464673995972,
"learning_rate": 5e-06,
"loss": 0.0854,
"step": 26
},
{
"epoch": 0.010479041916167664,
"grad_norm": 1.1430706977844238,
"learning_rate": 5.400000000000001e-06,
"loss": 0.0597,
"step": 28
},
{
"epoch": 0.01122754491017964,
"grad_norm": 1.3779264688491821,
"learning_rate": 5.8e-06,
"loss": 0.0642,
"step": 30
},
{
"epoch": 0.011976047904191617,
"grad_norm": 0.9946982860565186,
"learning_rate": 6.200000000000001e-06,
"loss": 0.0398,
"step": 32
},
{
"epoch": 0.012724550898203593,
"grad_norm": 1.1442718505859375,
"learning_rate": 6.600000000000001e-06,
"loss": 0.04,
"step": 34
},
{
"epoch": 0.01347305389221557,
"grad_norm": 0.6475897431373596,
"learning_rate": 7e-06,
"loss": 0.0283,
"step": 36
},
{
"epoch": 0.014221556886227544,
"grad_norm": 0.8502711057662964,
"learning_rate": 7.4e-06,
"loss": 0.0345,
"step": 38
},
{
"epoch": 0.014970059880239521,
"grad_norm": 0.5282578468322754,
"learning_rate": 7.800000000000002e-06,
"loss": 0.0173,
"step": 40
},
{
"epoch": 0.014970059880239521,
"eval_accuracy": 0.9910189955595328,
"eval_loss": 0.02636127918958664,
"eval_runtime": 156.2607,
"eval_samples_per_second": 31.998,
"eval_steps_per_second": 7.999,
"step": 40
},
{
"epoch": 0.015718562874251496,
"grad_norm": 0.755415141582489,
"learning_rate": 8.2e-06,
"loss": 0.023,
"step": 42
},
{
"epoch": 0.016467065868263474,
"grad_norm": 0.6637702584266663,
"learning_rate": 8.6e-06,
"loss": 0.0165,
"step": 44
},
{
"epoch": 0.01721556886227545,
"grad_norm": 0.42257505655288696,
"learning_rate": 9e-06,
"loss": 0.0149,
"step": 46
},
{
"epoch": 0.017964071856287425,
"grad_norm": 0.6686341166496277,
"learning_rate": 9.4e-06,
"loss": 0.019,
"step": 48
},
{
"epoch": 0.0187125748502994,
"grad_norm": 0.5314021110534668,
"learning_rate": 9.800000000000001e-06,
"loss": 0.0169,
"step": 50
},
{
"epoch": 0.019461077844311378,
"grad_norm": 0.39661431312561035,
"learning_rate": 9.999998993000299e-06,
"loss": 0.0152,
"step": 52
},
{
"epoch": 0.020209580838323353,
"grad_norm": 0.571976900100708,
"learning_rate": 9.999990937005126e-06,
"loss": 0.0139,
"step": 54
},
{
"epoch": 0.020958083832335328,
"grad_norm": 0.5158469676971436,
"learning_rate": 9.999974825027756e-06,
"loss": 0.0092,
"step": 56
},
{
"epoch": 0.021706586826347306,
"grad_norm": 0.7198213338851929,
"learning_rate": 9.999950657094151e-06,
"loss": 0.0113,
"step": 58
},
{
"epoch": 0.02245508982035928,
"grad_norm": 0.48938679695129395,
"learning_rate": 9.999918433243253e-06,
"loss": 0.0085,
"step": 60
},
{
"epoch": 0.023203592814371257,
"grad_norm": 0.5157604813575745,
"learning_rate": 9.999878153526974e-06,
"loss": 0.0114,
"step": 62
},
{
"epoch": 0.023952095808383235,
"grad_norm": 0.510836124420166,
"learning_rate": 9.99982981801022e-06,
"loss": 0.009,
"step": 64
},
{
"epoch": 0.02470059880239521,
"grad_norm": 0.34386318922042847,
"learning_rate": 9.999773426770864e-06,
"loss": 0.0102,
"step": 66
},
{
"epoch": 0.025449101796407185,
"grad_norm": 0.31605905294418335,
"learning_rate": 9.999708979899769e-06,
"loss": 0.0095,
"step": 68
},
{
"epoch": 0.02619760479041916,
"grad_norm": 0.6626418828964233,
"learning_rate": 9.999636477500765e-06,
"loss": 0.0079,
"step": 70
},
{
"epoch": 0.02694610778443114,
"grad_norm": 0.49883756041526794,
"learning_rate": 9.999555919690673e-06,
"loss": 0.0065,
"step": 72
},
{
"epoch": 0.027694610778443114,
"grad_norm": 0.3710748255252838,
"learning_rate": 9.999467306599285e-06,
"loss": 0.0055,
"step": 74
},
{
"epoch": 0.02844311377245509,
"grad_norm": 0.33792468905448914,
"learning_rate": 9.999370638369377e-06,
"loss": 0.0065,
"step": 76
},
{
"epoch": 0.029191616766467067,
"grad_norm": 0.33830082416534424,
"learning_rate": 9.999265915156697e-06,
"loss": 0.0067,
"step": 78
},
{
"epoch": 0.029940119760479042,
"grad_norm": 0.3766763210296631,
"learning_rate": 9.999153137129978e-06,
"loss": 0.0054,
"step": 80
},
{
"epoch": 0.029940119760479042,
"eval_accuracy": 0.9975511997129987,
"eval_loss": 0.006336509715765715,
"eval_runtime": 152.2618,
"eval_samples_per_second": 32.838,
"eval_steps_per_second": 8.21,
"step": 80
},
{
"epoch": 0.030688622754491017,
"grad_norm": 0.3682909607887268,
"learning_rate": 9.999032304470926e-06,
"loss": 0.0052,
"step": 82
},
{
"epoch": 0.03143712574850299,
"grad_norm": 0.48871251940727234,
"learning_rate": 9.998903417374228e-06,
"loss": 0.0065,
"step": 84
},
{
"epoch": 0.03218562874251497,
"grad_norm": 0.4313011169433594,
"learning_rate": 9.998766476047546e-06,
"loss": 0.0056,
"step": 86
},
{
"epoch": 0.03293413173652695,
"grad_norm": 0.3613654673099518,
"learning_rate": 9.998621480711522e-06,
"loss": 0.0034,
"step": 88
},
{
"epoch": 0.033682634730538924,
"grad_norm": 0.39512524008750916,
"learning_rate": 9.998468431599768e-06,
"loss": 0.0039,
"step": 90
},
{
"epoch": 0.0344311377245509,
"grad_norm": 0.26590684056282043,
"learning_rate": 9.99830732895888e-06,
"loss": 0.0041,
"step": 92
},
{
"epoch": 0.035179640718562874,
"grad_norm": 0.29519563913345337,
"learning_rate": 9.998138173048424e-06,
"loss": 0.0048,
"step": 94
},
{
"epoch": 0.03592814371257485,
"grad_norm": 0.8653535842895508,
"learning_rate": 9.997960964140946e-06,
"loss": 0.0037,
"step": 96
},
{
"epoch": 0.036676646706586824,
"grad_norm": 0.5562458038330078,
"learning_rate": 9.997775702521965e-06,
"loss": 0.004,
"step": 98
},
{
"epoch": 0.0374251497005988,
"grad_norm": 0.31169670820236206,
"learning_rate": 9.997582388489975e-06,
"loss": 0.004,
"step": 100
},
{
"epoch": 0.03817365269461078,
"grad_norm": 0.3139854371547699,
"learning_rate": 9.99738102235644e-06,
"loss": 0.0032,
"step": 102
},
{
"epoch": 0.038922155688622756,
"grad_norm": 0.4420141875743866,
"learning_rate": 9.997171604445803e-06,
"loss": 0.0037,
"step": 104
},
{
"epoch": 0.03967065868263473,
"grad_norm": 0.46555566787719727,
"learning_rate": 9.99695413509548e-06,
"loss": 0.0034,
"step": 106
},
{
"epoch": 0.040419161676646706,
"grad_norm": 0.2851720154285431,
"learning_rate": 9.996728614655854e-06,
"loss": 0.0029,
"step": 108
},
{
"epoch": 0.04116766467065868,
"grad_norm": 0.3109307885169983,
"learning_rate": 9.996495043490285e-06,
"loss": 0.0029,
"step": 110
},
{
"epoch": 0.041916167664670656,
"grad_norm": 0.37666594982147217,
"learning_rate": 9.996253421975103e-06,
"loss": 0.0038,
"step": 112
},
{
"epoch": 0.04266467065868264,
"grad_norm": 0.5034800171852112,
"learning_rate": 9.996003750499608e-06,
"loss": 0.0032,
"step": 114
},
{
"epoch": 0.04341317365269461,
"grad_norm": 0.3710559606552124,
"learning_rate": 9.995746029466071e-06,
"loss": 0.0022,
"step": 116
},
{
"epoch": 0.04416167664670659,
"grad_norm": 0.4710935056209564,
"learning_rate": 9.995480259289731e-06,
"loss": 0.0025,
"step": 118
},
{
"epoch": 0.04491017964071856,
"grad_norm": 0.31052565574645996,
"learning_rate": 9.995206440398798e-06,
"loss": 0.0024,
"step": 120
},
{
"epoch": 0.04491017964071856,
"eval_accuracy": 0.9988225992721916,
"eval_loss": 0.002896190620958805,
"eval_runtime": 154.576,
"eval_samples_per_second": 32.347,
"eval_steps_per_second": 8.087,
"step": 120
},
{
"epoch": 0.04565868263473054,
"grad_norm": 0.34983423352241516,
"learning_rate": 9.994924573234448e-06,
"loss": 0.0022,
"step": 122
},
{
"epoch": 0.04640718562874251,
"grad_norm": 0.2754887640476227,
"learning_rate": 9.994634658250825e-06,
"loss": 0.0021,
"step": 124
},
{
"epoch": 0.04715568862275449,
"grad_norm": 0.49522289633750916,
"learning_rate": 9.994336695915041e-06,
"loss": 0.0021,
"step": 126
},
{
"epoch": 0.04790419161676647,
"grad_norm": 0.37913596630096436,
"learning_rate": 9.994030686707171e-06,
"loss": 0.002,
"step": 128
},
{
"epoch": 0.048652694610778445,
"grad_norm": 0.3330959379673004,
"learning_rate": 9.993716631120259e-06,
"loss": 0.0017,
"step": 130
},
{
"epoch": 0.04940119760479042,
"grad_norm": 0.2224518060684204,
"learning_rate": 9.993394529660307e-06,
"loss": 0.0018,
"step": 132
},
{
"epoch": 0.050149700598802395,
"grad_norm": 0.2787413001060486,
"learning_rate": 9.99306438284629e-06,
"loss": 0.0015,
"step": 134
},
{
"epoch": 0.05089820359281437,
"grad_norm": 0.43909233808517456,
"learning_rate": 9.992726191210139e-06,
"loss": 0.0023,
"step": 136
},
{
"epoch": 0.051646706586826345,
"grad_norm": 0.1608552634716034,
"learning_rate": 9.992379955296745e-06,
"loss": 0.0012,
"step": 138
},
{
"epoch": 0.05239520958083832,
"grad_norm": 0.34503915905952454,
"learning_rate": 9.992025675663966e-06,
"loss": 0.0018,
"step": 140
},
{
"epoch": 0.0531437125748503,
"grad_norm": 0.17146268486976624,
"learning_rate": 9.991663352882615e-06,
"loss": 0.0013,
"step": 142
},
{
"epoch": 0.05389221556886228,
"grad_norm": 0.47353699803352356,
"learning_rate": 9.991292987536469e-06,
"loss": 0.002,
"step": 144
},
{
"epoch": 0.05464071856287425,
"grad_norm": 0.10907532274723053,
"learning_rate": 9.990914580222258e-06,
"loss": 0.001,
"step": 146
},
{
"epoch": 0.05538922155688623,
"grad_norm": 0.195388525724411,
"learning_rate": 9.990528131549674e-06,
"loss": 0.0013,
"step": 148
},
{
"epoch": 0.0561377245508982,
"grad_norm": 0.124148428440094,
"learning_rate": 9.990133642141359e-06,
"loss": 0.0007,
"step": 150
},
{
"epoch": 0.05688622754491018,
"grad_norm": 0.3281680643558502,
"learning_rate": 9.989731112632917e-06,
"loss": 0.0018,
"step": 152
},
{
"epoch": 0.05763473053892216,
"grad_norm": 0.3646385669708252,
"learning_rate": 9.989320543672904e-06,
"loss": 0.0014,
"step": 154
},
{
"epoch": 0.058383233532934134,
"grad_norm": 0.20738907158374786,
"learning_rate": 9.988901935922826e-06,
"loss": 0.0012,
"step": 156
},
{
"epoch": 0.05913173652694611,
"grad_norm": 0.19206871092319489,
"learning_rate": 9.988475290057145e-06,
"loss": 0.0008,
"step": 158
},
{
"epoch": 0.059880239520958084,
"grad_norm": 0.4680192470550537,
"learning_rate": 9.988040606763272e-06,
"loss": 0.0011,
"step": 160
},
{
"epoch": 0.059880239520958084,
"eval_accuracy": 0.9995344699843772,
"eval_loss": 0.0012882280861958861,
"eval_runtime": 155.4759,
"eval_samples_per_second": 32.159,
"eval_steps_per_second": 8.04,
"step": 160
},
{
"epoch": 0.06062874251497006,
"grad_norm": 0.10511677712202072,
"learning_rate": 9.98759788674157e-06,
"loss": 0.0006,
"step": 162
},
{
"epoch": 0.061377245508982034,
"grad_norm": 0.264397531747818,
"learning_rate": 9.987147130705347e-06,
"loss": 0.0008,
"step": 164
},
{
"epoch": 0.06212574850299401,
"grad_norm": 0.15092360973358154,
"learning_rate": 9.986688339380863e-06,
"loss": 0.001,
"step": 166
},
{
"epoch": 0.06287425149700598,
"grad_norm": 0.23679876327514648,
"learning_rate": 9.98622151350732e-06,
"loss": 0.0009,
"step": 168
},
{
"epoch": 0.06362275449101797,
"grad_norm": 0.3080887198448181,
"learning_rate": 9.985746653836867e-06,
"loss": 0.0015,
"step": 170
},
{
"epoch": 0.06437125748502993,
"grad_norm": 0.13096538186073303,
"learning_rate": 9.985263761134602e-06,
"loss": 0.001,
"step": 172
},
{
"epoch": 0.06511976047904192,
"grad_norm": 0.27316954731941223,
"learning_rate": 9.984772836178559e-06,
"loss": 0.0008,
"step": 174
},
{
"epoch": 0.0658682634730539,
"grad_norm": 0.314272940158844,
"learning_rate": 9.984273879759713e-06,
"loss": 0.0017,
"step": 176
},
{
"epoch": 0.06661676646706587,
"grad_norm": 0.20915231108665466,
"learning_rate": 9.983766892681985e-06,
"loss": 0.0012,
"step": 178
},
{
"epoch": 0.06736526946107785,
"grad_norm": 0.18497829139232635,
"learning_rate": 9.983251875762234e-06,
"loss": 0.0008,
"step": 180
},
{
"epoch": 0.06811377245508982,
"grad_norm": 0.20126977562904358,
"learning_rate": 9.982728829830252e-06,
"loss": 0.0008,
"step": 182
},
{
"epoch": 0.0688622754491018,
"grad_norm": 0.15316377580165863,
"learning_rate": 9.982197755728771e-06,
"loss": 0.001,
"step": 184
},
{
"epoch": 0.06961077844311377,
"grad_norm": 0.14749199151992798,
"learning_rate": 9.981658654313458e-06,
"loss": 0.0005,
"step": 186
},
{
"epoch": 0.07035928143712575,
"grad_norm": 0.25107651948928833,
"learning_rate": 9.981111526452912e-06,
"loss": 0.0011,
"step": 188
},
{
"epoch": 0.07110778443113773,
"grad_norm": 0.07325785607099533,
"learning_rate": 9.980556373028665e-06,
"loss": 0.0004,
"step": 190
},
{
"epoch": 0.0718562874251497,
"grad_norm": 0.11805955320596695,
"learning_rate": 9.979993194935182e-06,
"loss": 0.0005,
"step": 192
},
{
"epoch": 0.07260479041916168,
"grad_norm": 0.19970782101154327,
"learning_rate": 9.979421993079853e-06,
"loss": 0.0008,
"step": 194
},
{
"epoch": 0.07335329341317365,
"grad_norm": 0.24476714432239532,
"learning_rate": 9.978842768382999e-06,
"loss": 0.0005,
"step": 196
},
{
"epoch": 0.07410179640718563,
"grad_norm": 0.12824182212352753,
"learning_rate": 9.978255521777865e-06,
"loss": 0.0004,
"step": 198
},
{
"epoch": 0.0748502994011976,
"grad_norm": 0.08068165183067322,
"learning_rate": 9.977660254210623e-06,
"loss": 0.0004,
"step": 200
},
{
"epoch": 0.0748502994011976,
"eval_accuracy": 0.9997569708964628,
"eval_loss": 0.000611252966336906,
"eval_runtime": 156.7213,
"eval_samples_per_second": 31.904,
"eval_steps_per_second": 7.976,
"step": 200
},
{
"epoch": 0.07559880239520958,
"grad_norm": 0.08569593727588654,
"learning_rate": 9.977056966640368e-06,
"loss": 0.0005,
"step": 202
},
{
"epoch": 0.07634730538922156,
"grad_norm": 0.10873577743768692,
"learning_rate": 9.976445660039118e-06,
"loss": 0.0003,
"step": 204
},
{
"epoch": 0.07709580838323353,
"grad_norm": 0.06685052067041397,
"learning_rate": 9.975826335391808e-06,
"loss": 0.0004,
"step": 206
},
{
"epoch": 0.07784431137724551,
"grad_norm": 0.171136736869812,
"learning_rate": 9.975198993696294e-06,
"loss": 0.0005,
"step": 208
},
{
"epoch": 0.07859281437125748,
"grad_norm": 0.2799069881439209,
"learning_rate": 9.974563635963348e-06,
"loss": 0.0009,
"step": 210
},
{
"epoch": 0.07934131736526946,
"grad_norm": 0.09249293059110641,
"learning_rate": 9.973920263216658e-06,
"loss": 0.0005,
"step": 212
},
{
"epoch": 0.08008982035928144,
"grad_norm": 0.19255271553993225,
"learning_rate": 9.973268876492827e-06,
"loss": 0.0004,
"step": 214
},
{
"epoch": 0.08083832335329341,
"grad_norm": 0.1604669839143753,
"learning_rate": 9.972609476841368e-06,
"loss": 0.0004,
"step": 216
},
{
"epoch": 0.0815868263473054,
"grad_norm": 0.08825163543224335,
"learning_rate": 9.971942065324704e-06,
"loss": 0.0007,
"step": 218
},
{
"epoch": 0.08233532934131736,
"grad_norm": 0.2524869441986084,
"learning_rate": 9.971266643018171e-06,
"loss": 0.0006,
"step": 220
},
{
"epoch": 0.08308383233532934,
"grad_norm": 0.10447513312101364,
"learning_rate": 9.970583211010008e-06,
"loss": 0.0006,
"step": 222
},
{
"epoch": 0.08383233532934131,
"grad_norm": 0.17385387420654297,
"learning_rate": 9.969891770401358e-06,
"loss": 0.0003,
"step": 224
},
{
"epoch": 0.0845808383233533,
"grad_norm": 0.0575445182621479,
"learning_rate": 9.969192322306271e-06,
"loss": 0.0002,
"step": 226
},
{
"epoch": 0.08532934131736528,
"grad_norm": 0.20742414891719818,
"learning_rate": 9.968484867851698e-06,
"loss": 0.0004,
"step": 228
},
{
"epoch": 0.08607784431137724,
"grad_norm": 0.22014112770557404,
"learning_rate": 9.96776940817749e-06,
"loss": 0.0005,
"step": 230
},
{
"epoch": 0.08682634730538923,
"grad_norm": 0.1331041306257248,
"learning_rate": 9.967045944436392e-06,
"loss": 0.0004,
"step": 232
},
{
"epoch": 0.0875748502994012,
"grad_norm": 0.14387176930904388,
"learning_rate": 9.966314477794052e-06,
"loss": 0.0006,
"step": 234
},
{
"epoch": 0.08832335329341318,
"grad_norm": 0.1632365584373474,
"learning_rate": 9.965575009429006e-06,
"loss": 0.0003,
"step": 236
},
{
"epoch": 0.08907185628742514,
"grad_norm": 0.1252838671207428,
"learning_rate": 9.964827540532685e-06,
"loss": 0.0005,
"step": 238
},
{
"epoch": 0.08982035928143713,
"grad_norm": 0.08947388827800751,
"learning_rate": 9.964072072309412e-06,
"loss": 0.0004,
"step": 240
},
{
"epoch": 0.08982035928143713,
"eval_accuracy": 0.99981676586026,
"eval_loss": 0.000518214248586446,
"eval_runtime": 154.0762,
"eval_samples_per_second": 32.451,
"eval_steps_per_second": 8.113,
"step": 240
},
{
"epoch": 0.09056886227544911,
"grad_norm": 0.1822632998228073,
"learning_rate": 9.963308605976397e-06,
"loss": 0.0003,
"step": 242
},
{
"epoch": 0.09131736526946108,
"grad_norm": 0.1965271681547165,
"learning_rate": 9.962537142763733e-06,
"loss": 0.0003,
"step": 244
},
{
"epoch": 0.09206586826347306,
"grad_norm": 0.12774410843849182,
"learning_rate": 9.961757683914406e-06,
"loss": 0.0004,
"step": 246
},
{
"epoch": 0.09281437125748503,
"grad_norm": 0.06404659152030945,
"learning_rate": 9.960970230684276e-06,
"loss": 0.0003,
"step": 248
},
{
"epoch": 0.09356287425149701,
"grad_norm": 0.07961199432611465,
"learning_rate": 9.96017478434209e-06,
"loss": 0.0002,
"step": 250
},
{
"epoch": 0.09431137724550898,
"grad_norm": 0.07755598425865173,
"learning_rate": 9.959371346169466e-06,
"loss": 0.0001,
"step": 252
},
{
"epoch": 0.09505988023952096,
"grad_norm": 0.10230294615030289,
"learning_rate": 9.958559917460909e-06,
"loss": 0.0004,
"step": 254
},
{
"epoch": 0.09580838323353294,
"grad_norm": 0.4232734441757202,
"learning_rate": 9.957740499523787e-06,
"loss": 0.0002,
"step": 256
},
{
"epoch": 0.09655688622754491,
"grad_norm": 0.45036637783050537,
"learning_rate": 9.95691309367835e-06,
"loss": 0.0006,
"step": 258
},
{
"epoch": 0.09730538922155689,
"grad_norm": 0.2974064350128174,
"learning_rate": 9.95607770125771e-06,
"loss": 0.0006,
"step": 260
},
{
"epoch": 0.09805389221556886,
"grad_norm": 0.12492769956588745,
"learning_rate": 9.955234323607854e-06,
"loss": 0.0005,
"step": 262
},
{
"epoch": 0.09880239520958084,
"grad_norm": 0.08176768571138382,
"learning_rate": 9.954382962087628e-06,
"loss": 0.0003,
"step": 264
},
{
"epoch": 0.09955089820359281,
"grad_norm": 0.11267261207103729,
"learning_rate": 9.95352361806875e-06,
"loss": 0.0004,
"step": 266
},
{
"epoch": 0.10029940119760479,
"grad_norm": 0.07069454342126846,
"learning_rate": 9.95265629293579e-06,
"loss": 0.0002,
"step": 268
},
{
"epoch": 0.10104790419161677,
"grad_norm": 0.13988761603832245,
"learning_rate": 9.951780988086183e-06,
"loss": 0.0004,
"step": 270
},
{
"epoch": 0.10179640718562874,
"grad_norm": 0.07328484207391739,
"learning_rate": 9.950897704930223e-06,
"loss": 0.0002,
"step": 272
},
{
"epoch": 0.10254491017964072,
"grad_norm": 0.1726737767457962,
"learning_rate": 9.95000644489105e-06,
"loss": 0.0003,
"step": 274
},
{
"epoch": 0.10329341317365269,
"grad_norm": 0.189790740609169,
"learning_rate": 9.949107209404664e-06,
"loss": 0.0005,
"step": 276
},
{
"epoch": 0.10404191616766467,
"grad_norm": 0.08902551233768463,
"learning_rate": 9.948199999919914e-06,
"loss": 0.0001,
"step": 278
},
{
"epoch": 0.10479041916167664,
"grad_norm": 0.10343684256076813,
"learning_rate": 9.947284817898493e-06,
"loss": 0.0002,
"step": 280
},
{
"epoch": 0.10479041916167664,
"eval_accuracy": 0.9998052416354287,
"eval_loss": 0.0006224001408554614,
"eval_runtime": 156.154,
"eval_samples_per_second": 32.02,
"eval_steps_per_second": 8.005,
"step": 280
},
{
"epoch": 0.10553892215568862,
"grad_norm": 0.20946663618087769,
"learning_rate": 9.946361664814942e-06,
"loss": 0.0007,
"step": 282
},
{
"epoch": 0.1062874251497006,
"grad_norm": 0.024475887417793274,
"learning_rate": 9.945430542156647e-06,
"loss": 0.0001,
"step": 284
},
{
"epoch": 0.10703592814371257,
"grad_norm": 0.12402810901403427,
"learning_rate": 9.944491451423829e-06,
"loss": 0.0003,
"step": 286
},
{
"epoch": 0.10778443113772455,
"grad_norm": 0.3434118330478668,
"learning_rate": 9.943544394129552e-06,
"loss": 0.0004,
"step": 288
},
{
"epoch": 0.10853293413173652,
"grad_norm": 0.21301892399787903,
"learning_rate": 9.942589371799715e-06,
"loss": 0.0003,
"step": 290
},
{
"epoch": 0.1092814371257485,
"grad_norm": 0.2948126196861267,
"learning_rate": 9.941626385973047e-06,
"loss": 0.0006,
"step": 292
},
{
"epoch": 0.11002994011976049,
"grad_norm": 0.1591068059206009,
"learning_rate": 9.940655438201113e-06,
"loss": 0.0003,
"step": 294
},
{
"epoch": 0.11077844311377245,
"grad_norm": 0.04139701649546623,
"learning_rate": 9.9396765300483e-06,
"loss": 0.0002,
"step": 296
},
{
"epoch": 0.11152694610778444,
"grad_norm": 0.11029073596000671,
"learning_rate": 9.938689663091828e-06,
"loss": 0.0003,
"step": 298
},
{
"epoch": 0.1122754491017964,
"grad_norm": 0.0646573156118393,
"learning_rate": 9.937694838921734e-06,
"loss": 0.0002,
"step": 300
},
{
"epoch": 0.11302395209580839,
"grad_norm": 0.14302918314933777,
"learning_rate": 9.93669205914088e-06,
"loss": 0.0003,
"step": 302
},
{
"epoch": 0.11377245508982035,
"grad_norm": 0.17884957790374756,
"learning_rate": 9.93568132536494e-06,
"loss": 0.0004,
"step": 304
},
{
"epoch": 0.11452095808383234,
"grad_norm": 0.09195531904697418,
"learning_rate": 9.934662639222412e-06,
"loss": 0.0002,
"step": 306
},
{
"epoch": 0.11526946107784432,
"grad_norm": 0.2769736647605896,
"learning_rate": 9.9336360023546e-06,
"loss": 0.0003,
"step": 308
},
{
"epoch": 0.11601796407185629,
"grad_norm": 0.029257414862513542,
"learning_rate": 9.932601416415622e-06,
"loss": 0.0003,
"step": 310
},
{
"epoch": 0.11676646706586827,
"grad_norm": 0.08587785065174103,
"learning_rate": 9.931558883072403e-06,
"loss": 0.0004,
"step": 312
},
{
"epoch": 0.11751497005988024,
"grad_norm": 0.20471642911434174,
"learning_rate": 9.930508404004668e-06,
"loss": 0.0004,
"step": 314
},
{
"epoch": 0.11826347305389222,
"grad_norm": 0.22900666296482086,
"learning_rate": 9.929449980904952e-06,
"loss": 0.0006,
"step": 316
},
{
"epoch": 0.11901197604790419,
"grad_norm": 0.16436566412448883,
"learning_rate": 9.928383615478586e-06,
"loss": 0.0003,
"step": 318
},
{
"epoch": 0.11976047904191617,
"grad_norm": 0.05877704173326492,
"learning_rate": 9.927309309443696e-06,
"loss": 0.0001,
"step": 320
},
{
"epoch": 0.11976047904191617,
"eval_accuracy": 0.9999357040300619,
"eval_loss": 0.00022764925961382687,
"eval_runtime": 158.1146,
"eval_samples_per_second": 31.623,
"eval_steps_per_second": 7.906,
"step": 320
},
{
"epoch": 0.12050898203592815,
"grad_norm": 0.261000394821167,
"learning_rate": 9.9262270645312e-06,
"loss": 0.0003,
"step": 322
},
{
"epoch": 0.12125748502994012,
"grad_norm": 0.17999576032161713,
"learning_rate": 9.925136882484816e-06,
"loss": 0.0003,
"step": 324
},
{
"epoch": 0.1220059880239521,
"grad_norm": 0.15744219720363617,
"learning_rate": 9.924038765061042e-06,
"loss": 0.0006,
"step": 326
},
{
"epoch": 0.12275449101796407,
"grad_norm": 0.031700655817985535,
"learning_rate": 9.922932714029163e-06,
"loss": 0.0004,
"step": 328
},
{
"epoch": 0.12350299401197605,
"grad_norm": 0.2377641499042511,
"learning_rate": 9.921818731171249e-06,
"loss": 0.0003,
"step": 330
},
{
"epoch": 0.12425149700598802,
"grad_norm": 0.08403676003217697,
"learning_rate": 9.920696818282147e-06,
"loss": 0.0002,
"step": 332
},
{
"epoch": 0.125,
"grad_norm": 0.1424562782049179,
"learning_rate": 9.919566977169486e-06,
"loss": 0.0004,
"step": 334
},
{
"epoch": 0.12574850299401197,
"grad_norm": 0.0928482636809349,
"learning_rate": 9.918429209653662e-06,
"loss": 0.0002,
"step": 336
},
{
"epoch": 0.12649700598802396,
"grad_norm": 0.08917529135942459,
"learning_rate": 9.917283517567845e-06,
"loss": 0.0004,
"step": 338
},
{
"epoch": 0.12724550898203593,
"grad_norm": 0.09952011704444885,
"learning_rate": 9.916129902757977e-06,
"loss": 0.0003,
"step": 340
},
{
"epoch": 0.1279940119760479,
"grad_norm": 0.05392898619174957,
"learning_rate": 9.914968367082756e-06,
"loss": 0.0001,
"step": 342
},
{
"epoch": 0.12874251497005987,
"grad_norm": 0.12771159410476685,
"learning_rate": 9.913798912413653e-06,
"loss": 0.0002,
"step": 344
},
{
"epoch": 0.12949101796407186,
"grad_norm": 0.9677438735961914,
"learning_rate": 9.912621540634889e-06,
"loss": 0.0003,
"step": 346
},
{
"epoch": 0.13023952095808383,
"grad_norm": 0.03891558572649956,
"learning_rate": 9.911436253643445e-06,
"loss": 0.0001,
"step": 348
},
{
"epoch": 0.1309880239520958,
"grad_norm": 0.03757692128419876,
"learning_rate": 9.910243053349055e-06,
"loss": 0.0,
"step": 350
},
{
"epoch": 0.1317365269461078,
"grad_norm": 0.20588494837284088,
"learning_rate": 9.909041941674205e-06,
"loss": 0.0004,
"step": 352
},
{
"epoch": 0.13248502994011976,
"grad_norm": 0.29803666472435,
"learning_rate": 9.90783292055412e-06,
"loss": 0.0004,
"step": 354
},
{
"epoch": 0.13323353293413173,
"grad_norm": 0.14101789891719818,
"learning_rate": 9.906615991936781e-06,
"loss": 0.0002,
"step": 356
},
{
"epoch": 0.1339820359281437,
"grad_norm": 0.24130620062351227,
"learning_rate": 9.905391157782897e-06,
"loss": 0.0002,
"step": 358
},
{
"epoch": 0.1347305389221557,
"grad_norm": 0.2917313575744629,
"learning_rate": 9.904158420065923e-06,
"loss": 0.0005,
"step": 360
},
{
"epoch": 0.1347305389221557,
"eval_accuracy": 0.9999076577000782,
"eval_loss": 0.0005576053517870605,
"eval_runtime": 155.2214,
"eval_samples_per_second": 32.212,
"eval_steps_per_second": 8.053,
"step": 360
},
{
"epoch": 0.13547904191616766,
"grad_norm": 0.12759952247142792,
"learning_rate": 9.902917780772043e-06,
"loss": 0.0003,
"step": 362
},
{
"epoch": 0.13622754491017963,
"grad_norm": 0.1657952070236206,
"learning_rate": 9.901669241900178e-06,
"loss": 0.0007,
"step": 364
},
{
"epoch": 0.13697604790419163,
"grad_norm": 0.10384248197078705,
"learning_rate": 9.900412805461968e-06,
"loss": 0.0005,
"step": 366
},
{
"epoch": 0.1377245508982036,
"grad_norm": 0.20811188220977783,
"learning_rate": 9.899148473481786e-06,
"loss": 0.0006,
"step": 368
},
{
"epoch": 0.13847305389221556,
"grad_norm": 0.051202207803726196,
"learning_rate": 9.89787624799672e-06,
"loss": 0.0003,
"step": 370
},
{
"epoch": 0.13922155688622753,
"grad_norm": 0.13106031715869904,
"learning_rate": 9.896596131056583e-06,
"loss": 0.0002,
"step": 372
},
{
"epoch": 0.13997005988023953,
"grad_norm": 0.1166054904460907,
"learning_rate": 9.895308124723897e-06,
"loss": 0.0003,
"step": 374
},
{
"epoch": 0.1407185628742515,
"grad_norm": 0.10474357008934021,
"learning_rate": 9.894012231073895e-06,
"loss": 0.0003,
"step": 376
},
{
"epoch": 0.14146706586826346,
"grad_norm": 0.08845887333154678,
"learning_rate": 9.892708452194522e-06,
"loss": 0.0004,
"step": 378
},
{
"epoch": 0.14221556886227546,
"grad_norm": 0.1545616239309311,
"learning_rate": 9.891396790186424e-06,
"loss": 0.0004,
"step": 380
},
{
"epoch": 0.14296407185628743,
"grad_norm": 0.04785681515932083,
"learning_rate": 9.890077247162951e-06,
"loss": 0.0001,
"step": 382
},
{
"epoch": 0.1437125748502994,
"grad_norm": 0.11323319375514984,
"learning_rate": 9.888749825250151e-06,
"loss": 0.0001,
"step": 384
},
{
"epoch": 0.14446107784431136,
"grad_norm": 0.1407540738582611,
"learning_rate": 9.887414526586764e-06,
"loss": 0.0002,
"step": 386
},
{
"epoch": 0.14520958083832336,
"grad_norm": 0.09322088956832886,
"learning_rate": 9.886071353324223e-06,
"loss": 0.0001,
"step": 388
},
{
"epoch": 0.14595808383233533,
"grad_norm": 0.07416640967130661,
"learning_rate": 9.884720307626647e-06,
"loss": 0.0001,
"step": 390
},
{
"epoch": 0.1467065868263473,
"grad_norm": 0.031197911128401756,
"learning_rate": 9.883361391670841e-06,
"loss": 0.0,
"step": 392
},
{
"epoch": 0.1474550898203593,
"grad_norm": 0.1820898950099945,
"learning_rate": 9.881994607646288e-06,
"loss": 0.0003,
"step": 394
},
{
"epoch": 0.14820359281437126,
"grad_norm": 0.1383231282234192,
"learning_rate": 9.880619957755151e-06,
"loss": 0.0002,
"step": 396
},
{
"epoch": 0.14895209580838323,
"grad_norm": 0.019146692007780075,
"learning_rate": 9.879237444212265e-06,
"loss": 0.0,
"step": 398
},
{
"epoch": 0.1497005988023952,
"grad_norm": 0.04791894555091858,
"learning_rate": 9.877847069245134e-06,
"loss": 0.0001,
"step": 400
},
{
"epoch": 0.1497005988023952,
"eval_accuracy": 0.9999685074988971,
"eval_loss": 0.00012279710790608078,
"eval_runtime": 156.0918,
"eval_samples_per_second": 32.032,
"eval_steps_per_second": 8.008,
"step": 400
},
{
"epoch": 0.1504491017964072,
"grad_norm": 0.06451380997896194,
"learning_rate": 9.87644883509393e-06,
"loss": 0.0001,
"step": 402
},
{
"epoch": 0.15119760479041916,
"grad_norm": 0.10077822208404541,
"learning_rate": 9.875042744011487e-06,
"loss": 0.0001,
"step": 404
},
{
"epoch": 0.15194610778443113,
"grad_norm": 0.07988882809877396,
"learning_rate": 9.873628798263297e-06,
"loss": 0.0001,
"step": 406
},
{
"epoch": 0.15269461077844312,
"grad_norm": 0.08547152578830719,
"learning_rate": 9.87220700012751e-06,
"loss": 0.0003,
"step": 408
},
{
"epoch": 0.1534431137724551,
"grad_norm": 0.06369513273239136,
"learning_rate": 9.870777351894926e-06,
"loss": 0.0001,
"step": 410
},
{
"epoch": 0.15419161676646706,
"grad_norm": 0.1190333142876625,
"learning_rate": 9.869339855868992e-06,
"loss": 0.0002,
"step": 412
},
{
"epoch": 0.15494011976047903,
"grad_norm": 0.4799070954322815,
"learning_rate": 9.867894514365802e-06,
"loss": 0.0001,
"step": 414
},
{
"epoch": 0.15568862275449102,
"grad_norm": 0.05317097157239914,
"learning_rate": 9.86644132971409e-06,
"loss": 0.0001,
"step": 416
},
{
"epoch": 0.156437125748503,
"grad_norm": 0.08004628121852875,
"learning_rate": 9.864980304255222e-06,
"loss": 0.0003,
"step": 418
},
{
"epoch": 0.15718562874251496,
"grad_norm": 0.06639832258224487,
"learning_rate": 9.863511440343206e-06,
"loss": 0.0001,
"step": 420
},
{
"epoch": 0.15793413173652696,
"grad_norm": 0.20095159113407135,
"learning_rate": 9.862034740344673e-06,
"loss": 0.0002,
"step": 422
},
{
"epoch": 0.15868263473053892,
"grad_norm": 0.14772972464561462,
"learning_rate": 9.860550206638881e-06,
"loss": 0.0002,
"step": 424
},
{
"epoch": 0.1594311377245509,
"grad_norm": 0.15753412246704102,
"learning_rate": 9.859057841617709e-06,
"loss": 0.0002,
"step": 426
},
{
"epoch": 0.1601796407185629,
"grad_norm": 0.08705739676952362,
"learning_rate": 9.857557647685657e-06,
"loss": 0.0002,
"step": 428
},
{
"epoch": 0.16092814371257486,
"grad_norm": 0.32878294587135315,
"learning_rate": 9.856049627259833e-06,
"loss": 0.0006,
"step": 430
},
{
"epoch": 0.16167664670658682,
"grad_norm": 0.19281232357025146,
"learning_rate": 9.85453378276996e-06,
"loss": 0.0001,
"step": 432
},
{
"epoch": 0.1624251497005988,
"grad_norm": 0.4002825617790222,
"learning_rate": 9.853010116658368e-06,
"loss": 0.0009,
"step": 434
},
{
"epoch": 0.1631736526946108,
"grad_norm": 0.15032881498336792,
"learning_rate": 9.851478631379982e-06,
"loss": 0.0002,
"step": 436
},
{
"epoch": 0.16392215568862276,
"grad_norm": 0.46663233637809753,
"learning_rate": 9.849939329402337e-06,
"loss": 0.0009,
"step": 438
},
{
"epoch": 0.16467065868263472,
"grad_norm": 0.032840508967638016,
"learning_rate": 9.848392213205549e-06,
"loss": 0.0003,
"step": 440
},
{
"epoch": 0.16467065868263472,
"eval_accuracy": 0.9997846752245753,
"eval_loss": 0.0007238321122713387,
"eval_runtime": 154.7968,
"eval_samples_per_second": 32.3,
"eval_steps_per_second": 8.075,
"step": 440
},
{
"epoch": 0.16541916167664672,
"grad_norm": 0.17962802946567535,
"learning_rate": 9.846837285282331e-06,
"loss": 0.0006,
"step": 442
},
{
"epoch": 0.1661676646706587,
"grad_norm": 0.03923157975077629,
"learning_rate": 9.845274548137986e-06,
"loss": 0.0002,
"step": 444
},
{
"epoch": 0.16691616766467066,
"grad_norm": 0.07774964720010757,
"learning_rate": 9.843704004290393e-06,
"loss": 0.0002,
"step": 446
},
{
"epoch": 0.16766467065868262,
"grad_norm": 0.2827122211456299,
"learning_rate": 9.842125656270011e-06,
"loss": 0.0006,
"step": 448
},
{
"epoch": 0.16841317365269462,
"grad_norm": 0.30080848932266235,
"learning_rate": 9.840539506619874e-06,
"loss": 0.0003,
"step": 450
},
{
"epoch": 0.1691616766467066,
"grad_norm": 0.19179034233093262,
"learning_rate": 9.838945557895586e-06,
"loss": 0.0002,
"step": 452
},
{
"epoch": 0.16991017964071856,
"grad_norm": 0.044639382511377335,
"learning_rate": 9.837343812665311e-06,
"loss": 0.0002,
"step": 454
},
{
"epoch": 0.17065868263473055,
"grad_norm": 0.14254966378211975,
"learning_rate": 9.835734273509787e-06,
"loss": 0.0007,
"step": 456
},
{
"epoch": 0.17140718562874252,
"grad_norm": 0.10285581648349762,
"learning_rate": 9.834116943022299e-06,
"loss": 0.0003,
"step": 458
},
{
"epoch": 0.1721556886227545,
"grad_norm": 0.12203399091959,
"learning_rate": 9.832491823808688e-06,
"loss": 0.0003,
"step": 460
},
{
"epoch": 0.17290419161676646,
"grad_norm": 0.10512761771678925,
"learning_rate": 9.830858918487347e-06,
"loss": 0.0001,
"step": 462
},
{
"epoch": 0.17365269461077845,
"grad_norm": 0.14217980206012726,
"learning_rate": 9.829218229689211e-06,
"loss": 0.0004,
"step": 464
},
{
"epoch": 0.17440119760479042,
"grad_norm": 0.05573190748691559,
"learning_rate": 9.827569760057755e-06,
"loss": 0.0002,
"step": 466
},
{
"epoch": 0.1751497005988024,
"grad_norm": 0.1435333788394928,
"learning_rate": 9.825913512248996e-06,
"loss": 0.0002,
"step": 468
},
{
"epoch": 0.17589820359281438,
"grad_norm": 0.14290957152843475,
"learning_rate": 9.824249488931477e-06,
"loss": 0.0005,
"step": 470
},
{
"epoch": 0.17664670658682635,
"grad_norm": 0.0923268049955368,
"learning_rate": 9.822577692786272e-06,
"loss": 0.0003,
"step": 472
},
{
"epoch": 0.17739520958083832,
"grad_norm": 0.0938134640455246,
"learning_rate": 9.820898126506978e-06,
"loss": 0.0002,
"step": 474
},
{
"epoch": 0.1781437125748503,
"grad_norm": 0.09895174205303192,
"learning_rate": 9.819210792799711e-06,
"loss": 0.0003,
"step": 476
},
{
"epoch": 0.17889221556886228,
"grad_norm": 0.010202400386333466,
"learning_rate": 9.817515694383102e-06,
"loss": 0.0001,
"step": 478
},
{
"epoch": 0.17964071856287425,
"grad_norm": 0.045472726225852966,
"learning_rate": 9.815812833988292e-06,
"loss": 0.0001,
"step": 480
},
{
"epoch": 0.17964071856287425,
"eval_accuracy": 0.9998838001657226,
"eval_loss": 0.000438039394794032,
"eval_runtime": 154.4563,
"eval_samples_per_second": 32.372,
"eval_steps_per_second": 8.093,
"step": 480
},
{
"epoch": 0.18038922155688622,
"grad_norm": 0.1489792764186859,
"learning_rate": 9.814102214358928e-06,
"loss": 0.0002,
"step": 482
},
{
"epoch": 0.18113772455089822,
"grad_norm": 0.15599974989891052,
"learning_rate": 9.81238383825116e-06,
"loss": 0.0005,
"step": 484
},
{
"epoch": 0.18188622754491018,
"grad_norm": 0.03606925159692764,
"learning_rate": 9.810657708433637e-06,
"loss": 0.0004,
"step": 486
},
{
"epoch": 0.18263473053892215,
"grad_norm": 0.04655231162905693,
"learning_rate": 9.808923827687494e-06,
"loss": 0.0001,
"step": 488
},
{
"epoch": 0.18338323353293412,
"grad_norm": 0.2198714017868042,
"learning_rate": 9.807182198806362e-06,
"loss": 0.0002,
"step": 490
},
{
"epoch": 0.18413173652694612,
"grad_norm": 0.05768256261944771,
"learning_rate": 9.805432824596347e-06,
"loss": 0.0003,
"step": 492
},
{
"epoch": 0.18488023952095808,
"grad_norm": 0.17893020808696747,
"learning_rate": 9.803675707876048e-06,
"loss": 0.0005,
"step": 494
},
{
"epoch": 0.18562874251497005,
"grad_norm": 0.12833981215953827,
"learning_rate": 9.801910851476524e-06,
"loss": 0.0002,
"step": 496
},
{
"epoch": 0.18637724550898205,
"grad_norm": 0.03174396604299545,
"learning_rate": 9.800138258241311e-06,
"loss": 0.0001,
"step": 498
},
{
"epoch": 0.18712574850299402,
"grad_norm": 0.11265647411346436,
"learning_rate": 9.798357931026411e-06,
"loss": 0.0002,
"step": 500
},
{
"epoch": 0.18787425149700598,
"grad_norm": 0.10834460705518723,
"learning_rate": 9.796569872700287e-06,
"loss": 0.0004,
"step": 502
},
{
"epoch": 0.18862275449101795,
"grad_norm": 0.061082735657691956,
"learning_rate": 9.79477408614386e-06,
"loss": 0.0001,
"step": 504
},
{
"epoch": 0.18937125748502995,
"grad_norm": 0.16802391409873962,
"learning_rate": 9.792970574250493e-06,
"loss": 0.0002,
"step": 506
},
{
"epoch": 0.19011976047904192,
"grad_norm": 0.11000331491231918,
"learning_rate": 9.791159339926009e-06,
"loss": 0.0001,
"step": 508
},
{
"epoch": 0.19086826347305388,
"grad_norm": 0.06801439821720123,
"learning_rate": 9.789340386088663e-06,
"loss": 0.0002,
"step": 510
},
{
"epoch": 0.19161676646706588,
"grad_norm": 0.012815337628126144,
"learning_rate": 9.787513715669158e-06,
"loss": 0.0,
"step": 512
},
{
"epoch": 0.19236526946107785,
"grad_norm": 0.011311142705380917,
"learning_rate": 9.78567933161062e-06,
"loss": 0.0,
"step": 514
},
{
"epoch": 0.19311377245508982,
"grad_norm": 0.06330162286758423,
"learning_rate": 9.78383723686861e-06,
"loss": 0.0,
"step": 516
},
{
"epoch": 0.19386227544910178,
"grad_norm": 0.071534164249897,
"learning_rate": 9.781987434411106e-06,
"loss": 0.0001,
"step": 518
},
{
"epoch": 0.19461077844311378,
"grad_norm": 0.11816436052322388,
"learning_rate": 9.780129927218513e-06,
"loss": 0.0001,
"step": 520
},
{
"epoch": 0.19461077844311378,
"eval_accuracy": 0.9999860097407319,
"eval_loss": 5.2422070439206436e-05,
"eval_runtime": 155.6149,
"eval_samples_per_second": 32.131,
"eval_steps_per_second": 8.033,
"step": 520
},
{
"epoch": 0.19535928143712575,
"grad_norm": 0.06640541553497314,
"learning_rate": 9.778264718283644e-06,
"loss": 0.0,
"step": 522
},
{
"epoch": 0.19610778443113772,
"grad_norm": 0.026967424899339676,
"learning_rate": 9.776391810611719e-06,
"loss": 0.0,
"step": 524
},
{
"epoch": 0.1968562874251497,
"grad_norm": 0.11123115569353104,
"learning_rate": 9.774511207220369e-06,
"loss": 0.0001,
"step": 526
},
{
"epoch": 0.19760479041916168,
"grad_norm": 0.13741283118724823,
"learning_rate": 9.772622911139622e-06,
"loss": 0.0001,
"step": 528
},
{
"epoch": 0.19835329341317365,
"grad_norm": 0.009464044123888016,
"learning_rate": 9.770726925411898e-06,
"loss": 0.0,
"step": 530
},
{
"epoch": 0.19910179640718562,
"grad_norm": 0.0769435316324234,
"learning_rate": 9.768823253092008e-06,
"loss": 0.0001,
"step": 532
},
{
"epoch": 0.1998502994011976,
"grad_norm": 0.046003557741642,
"learning_rate": 9.766911897247147e-06,
"loss": 0.0001,
"step": 534
},
{
"epoch": 0.20059880239520958,
"grad_norm": 0.10196753591299057,
"learning_rate": 9.76499286095689e-06,
"loss": 0.0002,
"step": 536
},
{
"epoch": 0.20134730538922155,
"grad_norm": 0.020359348505735397,
"learning_rate": 9.763066147313189e-06,
"loss": 0.0,
"step": 538
},
{
"epoch": 0.20209580838323354,
"grad_norm": 0.20479270815849304,
"learning_rate": 9.76113175942036e-06,
"loss": 0.0001,
"step": 540
},
{
"epoch": 0.2028443113772455,
"grad_norm": 0.11673811078071594,
"learning_rate": 9.759189700395096e-06,
"loss": 0.0001,
"step": 542
},
{
"epoch": 0.20359281437125748,
"grad_norm": 0.04004862159490585,
"learning_rate": 9.75723997336643e-06,
"loss": 0.0001,
"step": 544
},
{
"epoch": 0.20434131736526945,
"grad_norm": 0.13865888118743896,
"learning_rate": 9.755282581475769e-06,
"loss": 0.0004,
"step": 546
},
{
"epoch": 0.20508982035928144,
"grad_norm": 0.08988627046346664,
"learning_rate": 9.753317527876857e-06,
"loss": 0.0002,
"step": 548
},
{
"epoch": 0.2058383233532934,
"grad_norm": 0.09014202654361725,
"learning_rate": 9.751344815735791e-06,
"loss": 0.0003,
"step": 550
},
{
"epoch": 0.20658682634730538,
"grad_norm": 0.17278143763542175,
"learning_rate": 9.749364448231001e-06,
"loss": 0.0003,
"step": 552
},
{
"epoch": 0.20733532934131738,
"grad_norm": 0.07624712586402893,
"learning_rate": 9.747376428553255e-06,
"loss": 0.0002,
"step": 554
},
{
"epoch": 0.20808383233532934,
"grad_norm": 0.02646615356206894,
"learning_rate": 9.745380759905648e-06,
"loss": 0.0005,
"step": 556
},
{
"epoch": 0.2088323353293413,
"grad_norm": 0.1350707858800888,
"learning_rate": 9.743377445503598e-06,
"loss": 0.0005,
"step": 558
},
{
"epoch": 0.20958083832335328,
"grad_norm": 0.045723576098680496,
"learning_rate": 9.74136648857485e-06,
"loss": 0.0004,
"step": 560
},
{
"epoch": 0.20958083832335328,
"eval_accuracy": 0.9998195029826557,
"eval_loss": 0.0005425158306024969,
"eval_runtime": 155.8793,
"eval_samples_per_second": 32.076,
"eval_steps_per_second": 8.019,
"step": 560
},
{
"epoch": 0.21032934131736528,
"grad_norm": 0.11474994570016861,
"learning_rate": 9.739347892359453e-06,
"loss": 0.0003,
"step": 562
},
{
"epoch": 0.21107784431137724,
"grad_norm": 0.0819924846291542,
"learning_rate": 9.737321660109767e-06,
"loss": 0.0002,
"step": 564
},
{
"epoch": 0.2118263473053892,
"grad_norm": 0.098919577896595,
"learning_rate": 9.735287795090455e-06,
"loss": 0.0004,
"step": 566
},
{
"epoch": 0.2125748502994012,
"grad_norm": 0.034899163991212845,
"learning_rate": 9.733246300578482e-06,
"loss": 0.0004,
"step": 568
},
{
"epoch": 0.21332335329341318,
"grad_norm": 0.10499320924282074,
"learning_rate": 9.731197179863104e-06,
"loss": 0.0003,
"step": 570
},
{
"epoch": 0.21407185628742514,
"grad_norm": 0.078518345952034,
"learning_rate": 9.729140436245857e-06,
"loss": 0.0001,
"step": 572
},
{
"epoch": 0.2148203592814371,
"grad_norm": 0.04776620492339134,
"learning_rate": 9.72707607304057e-06,
"loss": 0.0002,
"step": 574
},
{
"epoch": 0.2155688622754491,
"grad_norm": 0.043205343186855316,
"learning_rate": 9.725004093573343e-06,
"loss": 0.0001,
"step": 576
},
{
"epoch": 0.21631736526946108,
"grad_norm": 0.0973254144191742,
"learning_rate": 9.722924501182546e-06,
"loss": 0.0002,
"step": 578
},
{
"epoch": 0.21706586826347304,
"grad_norm": 0.07782719284296036,
"learning_rate": 9.72083729921882e-06,
"loss": 0.0001,
"step": 580
},
{
"epoch": 0.21781437125748504,
"grad_norm": 0.04242849349975586,
"learning_rate": 9.718742491045061e-06,
"loss": 0.0001,
"step": 582
},
{
"epoch": 0.218562874251497,
"grad_norm": 0.04837155342102051,
"learning_rate": 9.716640080036423e-06,
"loss": 0.0001,
"step": 584
},
{
"epoch": 0.21931137724550898,
"grad_norm": 0.0814133882522583,
"learning_rate": 9.71453006958031e-06,
"loss": 0.0002,
"step": 586
},
{
"epoch": 0.22005988023952097,
"grad_norm": 0.047387998551130295,
"learning_rate": 9.712412463076368e-06,
"loss": 0.0,
"step": 588
},
{
"epoch": 0.22080838323353294,
"grad_norm": 0.017673810943961143,
"learning_rate": 9.710287263936485e-06,
"loss": 0.0,
"step": 590
},
{
"epoch": 0.2215568862275449,
"grad_norm": 0.021801825612783432,
"learning_rate": 9.708154475584779e-06,
"loss": 0.0001,
"step": 592
},
{
"epoch": 0.22230538922155688,
"grad_norm": 0.03839518874883652,
"learning_rate": 9.7060141014576e-06,
"loss": 0.0002,
"step": 594
},
{
"epoch": 0.22305389221556887,
"grad_norm": 0.007782716304063797,
"learning_rate": 9.703866145003512e-06,
"loss": 0.0001,
"step": 596
},
{
"epoch": 0.22380239520958084,
"grad_norm": 0.02108747325837612,
"learning_rate": 9.701710609683305e-06,
"loss": 0.0001,
"step": 598
},
{
"epoch": 0.2245508982035928,
"grad_norm": 0.0026378484908491373,
"learning_rate": 9.699547498969978e-06,
"loss": 0.0,
"step": 600
},
{
"epoch": 0.2245508982035928,
"eval_accuracy": 0.9999875145529564,
"eval_loss": 4.2638039303710684e-05,
"eval_runtime": 160.7051,
"eval_samples_per_second": 31.113,
"eval_steps_per_second": 7.778,
"step": 600
},
{
"epoch": 0.2252994011976048,
"grad_norm": 0.02909325808286667,
"learning_rate": 9.697376816348732e-06,
"loss": 0.0001,
"step": 602
},
{
"epoch": 0.22604790419161677,
"grad_norm": 0.0025581123773008585,
"learning_rate": 9.695198565316966e-06,
"loss": 0.0001,
"step": 604
},
{
"epoch": 0.22679640718562874,
"grad_norm": 0.02005714178085327,
"learning_rate": 9.69301274938428e-06,
"loss": 0.0,
"step": 606
},
{
"epoch": 0.2275449101796407,
"grad_norm": 0.0037004246842116117,
"learning_rate": 9.690819372072457e-06,
"loss": 0.0,
"step": 608
},
{
"epoch": 0.2282934131736527,
"grad_norm": 0.032148100435733795,
"learning_rate": 9.68861843691547e-06,
"loss": 0.0001,
"step": 610
},
{
"epoch": 0.22904191616766467,
"grad_norm": 0.014080125838518143,
"learning_rate": 9.68640994745946e-06,
"loss": 0.0002,
"step": 612
},
{
"epoch": 0.22979041916167664,
"grad_norm": 0.010853869840502739,
"learning_rate": 9.684193907262742e-06,
"loss": 0.0,
"step": 614
},
{
"epoch": 0.23053892215568864,
"grad_norm": 0.032357871532440186,
"learning_rate": 9.681970319895804e-06,
"loss": 0.0,
"step": 616
},
{
"epoch": 0.2312874251497006,
"grad_norm": 0.008318758569657803,
"learning_rate": 9.679739188941283e-06,
"loss": 0.0,
"step": 618
},
{
"epoch": 0.23203592814371257,
"grad_norm": 0.037990834563970566,
"learning_rate": 9.677500517993983e-06,
"loss": 0.0,
"step": 620
},
{
"epoch": 0.23278443113772454,
"grad_norm": 0.00843075942248106,
"learning_rate": 9.675254310660842e-06,
"loss": 0.0001,
"step": 622
},
{
"epoch": 0.23353293413173654,
"grad_norm": 0.05007459223270416,
"learning_rate": 9.673000570560952e-06,
"loss": 0.0,
"step": 624
},
{
"epoch": 0.2342814371257485,
"grad_norm": 0.0009229404386132956,
"learning_rate": 9.670739301325534e-06,
"loss": 0.0,
"step": 626
},
{
"epoch": 0.23502994011976047,
"grad_norm": 0.02507946826517582,
"learning_rate": 9.668470506597946e-06,
"loss": 0.0,
"step": 628
},
{
"epoch": 0.23577844311377247,
"grad_norm": 0.09565775096416473,
"learning_rate": 9.66619419003367e-06,
"loss": 0.0002,
"step": 630
},
{
"epoch": 0.23652694610778444,
"grad_norm": 0.0022729237098246813,
"learning_rate": 9.663910355300306e-06,
"loss": 0.0,
"step": 632
},
{
"epoch": 0.2372754491017964,
"grad_norm": 0.0015811071498319507,
"learning_rate": 9.661619006077562e-06,
"loss": 0.0,
"step": 634
},
{
"epoch": 0.23802395209580837,
"grad_norm": 0.10619401931762695,
"learning_rate": 9.659320146057263e-06,
"loss": 0.0001,
"step": 636
},
{
"epoch": 0.23877245508982037,
"grad_norm": 0.0017936922376975417,
"learning_rate": 9.657013778943328e-06,
"loss": 0.0,
"step": 638
},
{
"epoch": 0.23952095808383234,
"grad_norm": 0.00227470719255507,
"learning_rate": 9.654699908451777e-06,
"loss": 0.0,
"step": 640
},
{
"epoch": 0.23952095808383234,
"eval_accuracy": 0.9999895288063639,
"eval_loss": 2.794685133267194e-05,
"eval_runtime": 156.1163,
"eval_samples_per_second": 32.027,
"eval_steps_per_second": 8.007,
"step": 640
},
{
"epoch": 0.2402694610778443,
"grad_norm": 0.007332763634622097,
"learning_rate": 9.652378538310715e-06,
"loss": 0.0,
"step": 642
},
{
"epoch": 0.2410179640718563,
"grad_norm": 0.08600316196680069,
"learning_rate": 9.650049672260333e-06,
"loss": 0.0,
"step": 644
},
{
"epoch": 0.24176646706586827,
"grad_norm": 0.005560212302953005,
"learning_rate": 9.647713314052896e-06,
"loss": 0.0,
"step": 646
},
{
"epoch": 0.24251497005988024,
"grad_norm": 0.00411292864009738,
"learning_rate": 9.645369467452746e-06,
"loss": 0.0,
"step": 648
},
{
"epoch": 0.2432634730538922,
"grad_norm": 0.0018659079214558005,
"learning_rate": 9.643018136236286e-06,
"loss": 0.0,
"step": 650
},
{
"epoch": 0.2440119760479042,
"grad_norm": 0.004269044380635023,
"learning_rate": 9.64065932419198e-06,
"loss": 0.0,
"step": 652
},
{
"epoch": 0.24476047904191617,
"grad_norm": 0.00309938658028841,
"learning_rate": 9.638293035120342e-06,
"loss": 0.0,
"step": 654
},
{
"epoch": 0.24550898203592814,
"grad_norm": 0.0024809043388813734,
"learning_rate": 9.635919272833938e-06,
"loss": 0.0,
"step": 656
},
{
"epoch": 0.24625748502994013,
"grad_norm": 0.003469419199973345,
"learning_rate": 9.63353804115737e-06,
"loss": 0.0,
"step": 658
},
{
"epoch": 0.2470059880239521,
"grad_norm": 0.0016053810250014067,
"learning_rate": 9.63114934392728e-06,
"loss": 0.0,
"step": 660
},
{
"epoch": 0.24775449101796407,
"grad_norm": 0.02661885879933834,
"learning_rate": 9.628753184992334e-06,
"loss": 0.0,
"step": 662
},
{
"epoch": 0.24850299401197604,
"grad_norm": 0.0016741787549108267,
"learning_rate": 9.62634956821322e-06,
"loss": 0.0001,
"step": 664
},
{
"epoch": 0.24925149700598803,
"grad_norm": 0.0019377709832042456,
"learning_rate": 9.623938497462647e-06,
"loss": 0.0,
"step": 666
},
{
"epoch": 0.25,
"grad_norm": 0.0012623120564967394,
"learning_rate": 9.621519976625327e-06,
"loss": 0.0,
"step": 668
},
{
"epoch": 0.25074850299401197,
"grad_norm": 0.0024038818664848804,
"learning_rate": 9.619094009597982e-06,
"loss": 0.0,
"step": 670
},
{
"epoch": 0.25149700598802394,
"grad_norm": 0.006172757130116224,
"learning_rate": 9.616660600289329e-06,
"loss": 0.0,
"step": 672
},
{
"epoch": 0.2522455089820359,
"grad_norm": 0.0028510144911706448,
"learning_rate": 9.614219752620074e-06,
"loss": 0.0,
"step": 674
},
{
"epoch": 0.25299401197604793,
"grad_norm": 0.02679716795682907,
"learning_rate": 9.611771470522908e-06,
"loss": 0.0,
"step": 676
},
{
"epoch": 0.2537425149700599,
"grad_norm": 0.02851109206676483,
"learning_rate": 9.609315757942504e-06,
"loss": 0.0,
"step": 678
},
{
"epoch": 0.25449101796407186,
"grad_norm": 0.0017305930377915502,
"learning_rate": 9.606852618835503e-06,
"loss": 0.0001,
"step": 680
},
{
"epoch": 0.25449101796407186,
"eval_accuracy": 0.9999997747747748,
"eval_loss": 8.644859917694703e-06,
"eval_runtime": 159.0892,
"eval_samples_per_second": 31.429,
"eval_steps_per_second": 7.857,
"step": 680
},
{
"epoch": 0.25523952095808383,
"grad_norm": 0.00403413875028491,
"learning_rate": 9.604382057170514e-06,
"loss": 0.0,
"step": 682
},
{
"epoch": 0.2559880239520958,
"grad_norm": 0.0027754653710871935,
"learning_rate": 9.601904076928103e-06,
"loss": 0.0,
"step": 684
},
{
"epoch": 0.25673652694610777,
"grad_norm": 0.0013081474462524056,
"learning_rate": 9.599418682100793e-06,
"loss": 0.0,
"step": 686
},
{
"epoch": 0.25748502994011974,
"grad_norm": 0.05064619705080986,
"learning_rate": 9.596925876693047e-06,
"loss": 0.0,
"step": 688
},
{
"epoch": 0.25823353293413176,
"grad_norm": 0.002823168644681573,
"learning_rate": 9.594425664721275e-06,
"loss": 0.0,
"step": 690
},
{
"epoch": 0.25898203592814373,
"grad_norm": 0.030349284410476685,
"learning_rate": 9.591918050213814e-06,
"loss": 0.0,
"step": 692
},
{
"epoch": 0.2597305389221557,
"grad_norm": 0.001790383132174611,
"learning_rate": 9.589403037210933e-06,
"loss": 0.0001,
"step": 694
},
{
"epoch": 0.26047904191616766,
"grad_norm": 0.010972312651574612,
"learning_rate": 9.586880629764817e-06,
"loss": 0.0,
"step": 696
},
{
"epoch": 0.26122754491017963,
"grad_norm": 0.06688281893730164,
"learning_rate": 9.584350831939571e-06,
"loss": 0.0001,
"step": 698
},
{
"epoch": 0.2619760479041916,
"grad_norm": 0.149211123585701,
"learning_rate": 9.581813647811199e-06,
"loss": 0.0001,
"step": 700
},
{
"epoch": 0.26272455089820357,
"grad_norm": 0.00245782732963562,
"learning_rate": 9.579269081467614e-06,
"loss": 0.0,
"step": 702
},
{
"epoch": 0.2634730538922156,
"grad_norm": 0.01430213451385498,
"learning_rate": 9.576717137008617e-06,
"loss": 0.0001,
"step": 704
},
{
"epoch": 0.26422155688622756,
"grad_norm": 0.013654684647917747,
"learning_rate": 9.574157818545902e-06,
"loss": 0.0,
"step": 706
},
{
"epoch": 0.26497005988023953,
"grad_norm": 0.015040101483464241,
"learning_rate": 9.57159113020304e-06,
"loss": 0.0,
"step": 708
},
{
"epoch": 0.2657185628742515,
"grad_norm": 0.01307929027825594,
"learning_rate": 9.569017076115476e-06,
"loss": 0.0001,
"step": 710
},
{
"epoch": 0.26646706586826346,
"grad_norm": 0.02330423705279827,
"learning_rate": 9.566435660430528e-06,
"loss": 0.0,
"step": 712
},
{
"epoch": 0.26721556886227543,
"grad_norm": 0.002268057782202959,
"learning_rate": 9.563846887307369e-06,
"loss": 0.0,
"step": 714
},
{
"epoch": 0.2679640718562874,
"grad_norm": 0.011261685751378536,
"learning_rate": 9.561250760917026e-06,
"loss": 0.0001,
"step": 716
},
{
"epoch": 0.2687125748502994,
"grad_norm": 0.03315627574920654,
"learning_rate": 9.558647285442382e-06,
"loss": 0.0,
"step": 718
},
{
"epoch": 0.2694610778443114,
"grad_norm": 0.002093307441100478,
"learning_rate": 9.55603646507815e-06,
"loss": 0.0,
"step": 720
},
{
"epoch": 0.2694610778443114,
"eval_accuracy": 0.9999976841259713,
"eval_loss": 9.612030225980561e-06,
"eval_runtime": 155.2847,
"eval_samples_per_second": 32.199,
"eval_steps_per_second": 8.05,
"step": 720
},
{
"epoch": 0.27020958083832336,
"grad_norm": 0.001716041355393827,
"learning_rate": 9.553418304030886e-06,
"loss": 0.0,
"step": 722
},
{
"epoch": 0.27095808383233533,
"grad_norm": 0.0027342389803379774,
"learning_rate": 9.550792806518967e-06,
"loss": 0.0,
"step": 724
},
{
"epoch": 0.2717065868263473,
"grad_norm": 0.1821688860654831,
"learning_rate": 9.548159976772593e-06,
"loss": 0.0001,
"step": 726
},
{
"epoch": 0.27245508982035926,
"grad_norm": 0.0016638662200421095,
"learning_rate": 9.545519819033777e-06,
"loss": 0.0001,
"step": 728
},
{
"epoch": 0.27320359281437123,
"grad_norm": 0.021991174668073654,
"learning_rate": 9.542872337556341e-06,
"loss": 0.0,
"step": 730
},
{
"epoch": 0.27395209580838326,
"grad_norm": 0.0012851693900302052,
"learning_rate": 9.540217536605906e-06,
"loss": 0.0,
"step": 732
},
{
"epoch": 0.2747005988023952,
"grad_norm": 0.0014544121222570539,
"learning_rate": 9.537555420459883e-06,
"loss": 0.0,
"step": 734
},
{
"epoch": 0.2754491017964072,
"grad_norm": 0.009950781241059303,
"learning_rate": 9.534885993407474e-06,
"loss": 0.0,
"step": 736
},
{
"epoch": 0.27619760479041916,
"grad_norm": 0.00411807419732213,
"learning_rate": 9.532209259749658e-06,
"loss": 0.0,
"step": 738
},
{
"epoch": 0.27694610778443113,
"grad_norm": 0.006487260106950998,
"learning_rate": 9.529525223799185e-06,
"loss": 0.0,
"step": 740
},
{
"epoch": 0.2776946107784431,
"grad_norm": 0.007635013200342655,
"learning_rate": 9.526833889880573e-06,
"loss": 0.0,
"step": 742
},
{
"epoch": 0.27844311377245506,
"grad_norm": 0.000996310613118112,
"learning_rate": 9.524135262330098e-06,
"loss": 0.0,
"step": 744
},
{
"epoch": 0.2791916167664671,
"grad_norm": 0.0031566142570227385,
"learning_rate": 9.521429345495787e-06,
"loss": 0.0001,
"step": 746
},
{
"epoch": 0.27994011976047906,
"grad_norm": 0.002025540452450514,
"learning_rate": 9.51871614373741e-06,
"loss": 0.0,
"step": 748
},
{
"epoch": 0.280688622754491,
"grad_norm": 0.1011413112282753,
"learning_rate": 9.515995661426478e-06,
"loss": 0.0001,
"step": 750
},
{
"epoch": 0.281437125748503,
"grad_norm": 0.021610310301184654,
"learning_rate": 9.513267902946228e-06,
"loss": 0.0,
"step": 752
},
{
"epoch": 0.28218562874251496,
"grad_norm": 0.0016732689691707492,
"learning_rate": 9.510532872691624e-06,
"loss": 0.0,
"step": 754
},
{
"epoch": 0.28293413173652693,
"grad_norm": 0.11272062361240387,
"learning_rate": 9.507790575069347e-06,
"loss": 0.0001,
"step": 756
},
{
"epoch": 0.2836826347305389,
"grad_norm": 0.0009099426679313183,
"learning_rate": 9.50504101449778e-06,
"loss": 0.0,
"step": 758
},
{
"epoch": 0.2844311377245509,
"grad_norm": 0.0009794794023036957,
"learning_rate": 9.50228419540702e-06,
"loss": 0.0,
"step": 760
},
{
"epoch": 0.2844311377245509,
"eval_accuracy": 0.9999983934801854,
"eval_loss": 9.44385647017043e-06,
"eval_runtime": 156.9097,
"eval_samples_per_second": 31.865,
"eval_steps_per_second": 7.966,
"step": 760
},
{
"epoch": 0.2851796407185629,
"grad_norm": 0.03243451938033104,
"learning_rate": 9.499520122238846e-06,
"loss": 0.0,
"step": 762
},
{
"epoch": 0.28592814371257486,
"grad_norm": 0.02839779108762741,
"learning_rate": 9.496748799446733e-06,
"loss": 0.0001,
"step": 764
},
{
"epoch": 0.2866766467065868,
"grad_norm": 0.0816827118396759,
"learning_rate": 9.493970231495836e-06,
"loss": 0.0,
"step": 766
},
{
"epoch": 0.2874251497005988,
"grad_norm": 0.0025276602245867252,
"learning_rate": 9.49118442286298e-06,
"loss": 0.0,
"step": 768
},
{
"epoch": 0.28817365269461076,
"grad_norm": 0.0015131831169128418,
"learning_rate": 9.488391378036662e-06,
"loss": 0.0,
"step": 770
},
{
"epoch": 0.28892215568862273,
"grad_norm": 0.001832049572840333,
"learning_rate": 9.485591101517027e-06,
"loss": 0.0,
"step": 772
},
{
"epoch": 0.28967065868263475,
"grad_norm": 0.047806382179260254,
"learning_rate": 9.482783597815883e-06,
"loss": 0.0,
"step": 774
},
{
"epoch": 0.2904191616766467,
"grad_norm": 0.03347828611731529,
"learning_rate": 9.47996887145668e-06,
"loss": 0.0,
"step": 776
},
{
"epoch": 0.2911676646706587,
"grad_norm": 0.0017931102775037289,
"learning_rate": 9.477146926974501e-06,
"loss": 0.0,
"step": 778
},
{
"epoch": 0.29191616766467066,
"grad_norm": 0.009210226126015186,
"learning_rate": 9.47431776891606e-06,
"loss": 0.0,
"step": 780
},
{
"epoch": 0.2926646706586826,
"grad_norm": 0.0013418138260021806,
"learning_rate": 9.471481401839696e-06,
"loss": 0.0,
"step": 782
},
{
"epoch": 0.2934131736526946,
"grad_norm": 0.0009674608591012657,
"learning_rate": 9.468637830315364e-06,
"loss": 0.0,
"step": 784
},
{
"epoch": 0.29416167664670656,
"grad_norm": 0.0006195507594384253,
"learning_rate": 9.46578705892462e-06,
"loss": 0.0,
"step": 786
},
{
"epoch": 0.2949101796407186,
"grad_norm": 0.0013804810587316751,
"learning_rate": 9.46292909226063e-06,
"loss": 0.0,
"step": 788
},
{
"epoch": 0.29565868263473055,
"grad_norm": 0.0004127651918679476,
"learning_rate": 9.460063934928142e-06,
"loss": 0.0,
"step": 790
},
{
"epoch": 0.2964071856287425,
"grad_norm": 0.0004895281745120883,
"learning_rate": 9.4571915915435e-06,
"loss": 0.0,
"step": 792
},
{
"epoch": 0.2971556886227545,
"grad_norm": 0.00033658542088232934,
"learning_rate": 9.454312066734624e-06,
"loss": 0.0,
"step": 794
},
{
"epoch": 0.29790419161676646,
"grad_norm": 0.07587553560733795,
"learning_rate": 9.451425365140997e-06,
"loss": 0.0,
"step": 796
},
{
"epoch": 0.2986526946107784,
"grad_norm": 0.00075916713103652,
"learning_rate": 9.448531491413673e-06,
"loss": 0.0,
"step": 798
},
{
"epoch": 0.2994011976047904,
"grad_norm": 0.0008038659580051899,
"learning_rate": 9.445630450215259e-06,
"loss": 0.0,
"step": 800
},
{
"epoch": 0.2994011976047904,
"eval_accuracy": 0.9999963675587793,
"eval_loss": 1.0820390343724284e-05,
"eval_runtime": 155.3675,
"eval_samples_per_second": 32.182,
"eval_steps_per_second": 8.045,
"step": 800
},
{
"epoch": 0.3001497005988024,
"grad_norm": 0.0022112810984253883,
"learning_rate": 9.442722246219915e-06,
"loss": 0.0,
"step": 802
},
{
"epoch": 0.3008982035928144,
"grad_norm": 0.0013486716197803617,
"learning_rate": 9.439806884113331e-06,
"loss": 0.0,
"step": 804
},
{
"epoch": 0.30164670658682635,
"grad_norm": 0.005311549641191959,
"learning_rate": 9.43688436859274e-06,
"loss": 0.0002,
"step": 806
},
{
"epoch": 0.3023952095808383,
"grad_norm": 0.000981526798568666,
"learning_rate": 9.433954704366897e-06,
"loss": 0.0,
"step": 808
},
{
"epoch": 0.3031437125748503,
"grad_norm": 0.09638898819684982,
"learning_rate": 9.431017896156074e-06,
"loss": 0.0,
"step": 810
},
{
"epoch": 0.30389221556886226,
"grad_norm": 0.04560961201786995,
"learning_rate": 9.428073948692056e-06,
"loss": 0.0001,
"step": 812
},
{
"epoch": 0.3046407185628742,
"grad_norm": 0.040918540209531784,
"learning_rate": 9.425122866718128e-06,
"loss": 0.0003,
"step": 814
},
{
"epoch": 0.30538922155688625,
"grad_norm": 0.03442908823490143,
"learning_rate": 9.422164654989073e-06,
"loss": 0.0,
"step": 816
},
{
"epoch": 0.3061377245508982,
"grad_norm": 0.13045716285705566,
"learning_rate": 9.419199318271158e-06,
"loss": 0.0001,
"step": 818
},
{
"epoch": 0.3068862275449102,
"grad_norm": 0.027492402121424675,
"learning_rate": 9.416226861342132e-06,
"loss": 0.0001,
"step": 820
},
{
"epoch": 0.30763473053892215,
"grad_norm": 0.003682814771309495,
"learning_rate": 9.413247288991216e-06,
"loss": 0.0,
"step": 822
},
{
"epoch": 0.3083832335329341,
"grad_norm": 0.10141133517026901,
"learning_rate": 9.410260606019095e-06,
"loss": 0.0002,
"step": 824
},
{
"epoch": 0.3091317365269461,
"grad_norm": 0.0007527911802753806,
"learning_rate": 9.40726681723791e-06,
"loss": 0.0001,
"step": 826
},
{
"epoch": 0.30988023952095806,
"grad_norm": 0.005670532584190369,
"learning_rate": 9.404265927471255e-06,
"loss": 0.0,
"step": 828
},
{
"epoch": 0.3106287425149701,
"grad_norm": 0.03817495331168175,
"learning_rate": 9.401257941554157e-06,
"loss": 0.0,
"step": 830
},
{
"epoch": 0.31137724550898205,
"grad_norm": 0.009812482632696629,
"learning_rate": 9.398242864333084e-06,
"loss": 0.0,
"step": 832
},
{
"epoch": 0.312125748502994,
"grad_norm": 0.007045481353998184,
"learning_rate": 9.395220700665924e-06,
"loss": 0.0,
"step": 834
},
{
"epoch": 0.312874251497006,
"grad_norm": 0.003998721018433571,
"learning_rate": 9.392191455421989e-06,
"loss": 0.0,
"step": 836
},
{
"epoch": 0.31362275449101795,
"grad_norm": 0.031697846949100494,
"learning_rate": 9.389155133481993e-06,
"loss": 0.0001,
"step": 838
},
{
"epoch": 0.3143712574850299,
"grad_norm": 0.0006167310057207942,
"learning_rate": 9.386111739738057e-06,
"loss": 0.0,
"step": 840
},
{
"epoch": 0.3143712574850299,
"eval_accuracy": 0.9999984969179706,
"eval_loss": 6.494924491562415e-06,
"eval_runtime": 155.2786,
"eval_samples_per_second": 32.2,
"eval_steps_per_second": 8.05,
"step": 840
},
{
"epoch": 0.31511976047904194,
"grad_norm": 0.47339311242103577,
"learning_rate": 9.383061279093697e-06,
"loss": 0.0002,
"step": 842
},
{
"epoch": 0.3158682634730539,
"grad_norm": 0.0039043284486979246,
"learning_rate": 9.380003756463812e-06,
"loss": 0.0,
"step": 844
},
{
"epoch": 0.3166167664670659,
"grad_norm": 0.18402549624443054,
"learning_rate": 9.376939176774678e-06,
"loss": 0.0001,
"step": 846
},
{
"epoch": 0.31736526946107785,
"grad_norm": 0.03785166144371033,
"learning_rate": 9.373867544963949e-06,
"loss": 0.0004,
"step": 848
},
{
"epoch": 0.3181137724550898,
"grad_norm": 0.07002092897891998,
"learning_rate": 9.370788865980633e-06,
"loss": 0.0001,
"step": 850
},
{
"epoch": 0.3188622754491018,
"grad_norm": 0.009300635196268559,
"learning_rate": 9.367703144785097e-06,
"loss": 0.0,
"step": 852
},
{
"epoch": 0.31961077844311375,
"grad_norm": 0.2740118205547333,
"learning_rate": 9.364610386349048e-06,
"loss": 0.0003,
"step": 854
},
{
"epoch": 0.3203592814371258,
"grad_norm": 0.023412982001900673,
"learning_rate": 9.361510595655545e-06,
"loss": 0.0001,
"step": 856
},
{
"epoch": 0.32110778443113774,
"grad_norm": 0.10502910614013672,
"learning_rate": 9.358403777698962e-06,
"loss": 0.0001,
"step": 858
},
{
"epoch": 0.3218562874251497,
"grad_norm": 0.17004919052124023,
"learning_rate": 9.355289937485005e-06,
"loss": 0.0001,
"step": 860
},
{
"epoch": 0.3226047904191617,
"grad_norm": 0.020658617839217186,
"learning_rate": 9.35216908003069e-06,
"loss": 0.0,
"step": 862
},
{
"epoch": 0.32335329341317365,
"grad_norm": 0.2423926740884781,
"learning_rate": 9.349041210364343e-06,
"loss": 0.0003,
"step": 864
},
{
"epoch": 0.3241017964071856,
"grad_norm": 0.02749599702656269,
"learning_rate": 9.345906333525582e-06,
"loss": 0.0001,
"step": 866
},
{
"epoch": 0.3248502994011976,
"grad_norm": 0.10116691887378693,
"learning_rate": 9.342764454565321e-06,
"loss": 0.0001,
"step": 868
},
{
"epoch": 0.3255988023952096,
"grad_norm": 0.09531649202108383,
"learning_rate": 9.339615578545753e-06,
"loss": 0.0001,
"step": 870
},
{
"epoch": 0.3263473053892216,
"grad_norm": 0.023796789348125458,
"learning_rate": 9.336459710540344e-06,
"loss": 0.0,
"step": 872
},
{
"epoch": 0.32709580838323354,
"grad_norm": 0.08885123580694199,
"learning_rate": 9.333296855633828e-06,
"loss": 0.0001,
"step": 874
},
{
"epoch": 0.3278443113772455,
"grad_norm": 0.13661184906959534,
"learning_rate": 9.330127018922195e-06,
"loss": 0.0001,
"step": 876
},
{
"epoch": 0.3285928143712575,
"grad_norm": 0.009723243303596973,
"learning_rate": 9.326950205512682e-06,
"loss": 0.0001,
"step": 878
},
{
"epoch": 0.32934131736526945,
"grad_norm": 0.017450012266635895,
"learning_rate": 9.323766420523768e-06,
"loss": 0.0001,
"step": 880
},
{
"epoch": 0.32934131736526945,
"eval_accuracy": 0.9999853499863853,
"eval_loss": 5.076894740341231e-05,
"eval_runtime": 154.2114,
"eval_samples_per_second": 32.423,
"eval_steps_per_second": 8.106,
"step": 880
},
{
"epoch": 0.3300898203592814,
"grad_norm": 0.09020084142684937,
"learning_rate": 9.32057566908517e-06,
"loss": 0.0001,
"step": 882
},
{
"epoch": 0.33083832335329344,
"grad_norm": 0.014794589951634407,
"learning_rate": 9.31737795633782e-06,
"loss": 0.0,
"step": 884
},
{
"epoch": 0.3315868263473054,
"grad_norm": 0.1351051777601242,
"learning_rate": 9.314173287433874e-06,
"loss": 0.0001,
"step": 886
},
{
"epoch": 0.3323353293413174,
"grad_norm": 0.02759048528969288,
"learning_rate": 9.310961667536689e-06,
"loss": 0.0,
"step": 888
},
{
"epoch": 0.33308383233532934,
"grad_norm": 0.006297203712165356,
"learning_rate": 9.307743101820828e-06,
"loss": 0.0,
"step": 890
},
{
"epoch": 0.3338323353293413,
"grad_norm": 0.1679803431034088,
"learning_rate": 9.30451759547204e-06,
"loss": 0.0004,
"step": 892
},
{
"epoch": 0.3345808383233533,
"grad_norm": 0.018898937851190567,
"learning_rate": 9.301285153687261e-06,
"loss": 0.0001,
"step": 894
},
{
"epoch": 0.33532934131736525,
"grad_norm": 0.010490099899470806,
"learning_rate": 9.298045781674595e-06,
"loss": 0.0001,
"step": 896
},
{
"epoch": 0.33607784431137727,
"grad_norm": 0.08461616188287735,
"learning_rate": 9.294799484653323e-06,
"loss": 0.0002,
"step": 898
},
{
"epoch": 0.33682634730538924,
"grad_norm": 0.009152884595096111,
"learning_rate": 9.291546267853871e-06,
"loss": 0.0001,
"step": 900
},
{
"epoch": 0.3375748502994012,
"grad_norm": 0.04316161200404167,
"learning_rate": 9.28828613651782e-06,
"loss": 0.0001,
"step": 902
},
{
"epoch": 0.3383233532934132,
"grad_norm": 0.04677840694785118,
"learning_rate": 9.285019095897894e-06,
"loss": 0.0,
"step": 904
},
{
"epoch": 0.33907185628742514,
"grad_norm": 0.006453138776123524,
"learning_rate": 9.281745151257946e-06,
"loss": 0.0002,
"step": 906
},
{
"epoch": 0.3398203592814371,
"grad_norm": 0.00727870361879468,
"learning_rate": 9.278464307872952e-06,
"loss": 0.0,
"step": 908
},
{
"epoch": 0.3405688622754491,
"grad_norm": 0.15015535056591034,
"learning_rate": 9.275176571029008e-06,
"loss": 0.0002,
"step": 910
},
{
"epoch": 0.3413173652694611,
"grad_norm": 0.01364520750939846,
"learning_rate": 9.271881946023309e-06,
"loss": 0.0,
"step": 912
},
{
"epoch": 0.34206586826347307,
"grad_norm": 0.13824740052223206,
"learning_rate": 9.268580438164157e-06,
"loss": 0.0001,
"step": 914
},
{
"epoch": 0.34281437125748504,
"grad_norm": 0.02371104806661606,
"learning_rate": 9.265272052770936e-06,
"loss": 0.0,
"step": 916
},
{
"epoch": 0.343562874251497,
"grad_norm": 0.07769843935966492,
"learning_rate": 9.261956795174116e-06,
"loss": 0.0002,
"step": 918
},
{
"epoch": 0.344311377245509,
"grad_norm": 0.0038983135018497705,
"learning_rate": 9.25863467071524e-06,
"loss": 0.0,
"step": 920
},
{
"epoch": 0.344311377245509,
"eval_accuracy": 0.9999698145371103,
"eval_loss": 0.0001175394281744957,
"eval_runtime": 154.4022,
"eval_samples_per_second": 32.383,
"eval_steps_per_second": 8.096,
"step": 920
},
{
"epoch": 0.34505988023952094,
"grad_norm": 0.04528482258319855,
"learning_rate": 9.255305684746908e-06,
"loss": 0.0001,
"step": 922
},
{
"epoch": 0.3458083832335329,
"grad_norm": 0.04112999513745308,
"learning_rate": 9.251969842632785e-06,
"loss": 0.0,
"step": 924
},
{
"epoch": 0.34655688622754494,
"grad_norm": 0.01982693374156952,
"learning_rate": 9.248627149747573e-06,
"loss": 0.0,
"step": 926
},
{
"epoch": 0.3473053892215569,
"grad_norm": 0.002507114317268133,
"learning_rate": 9.24527761147702e-06,
"loss": 0.0,
"step": 928
},
{
"epoch": 0.34805389221556887,
"grad_norm": 0.018373820930719376,
"learning_rate": 9.241921233217899e-06,
"loss": 0.0,
"step": 930
},
{
"epoch": 0.34880239520958084,
"grad_norm": 0.015127432532608509,
"learning_rate": 9.238558020378003e-06,
"loss": 0.0,
"step": 932
},
{
"epoch": 0.3495508982035928,
"grad_norm": 0.006092644762247801,
"learning_rate": 9.235187978376141e-06,
"loss": 0.0001,
"step": 934
},
{
"epoch": 0.3502994011976048,
"grad_norm": 0.14546248316764832,
"learning_rate": 9.231811112642121e-06,
"loss": 0.0002,
"step": 936
},
{
"epoch": 0.35104790419161674,
"grad_norm": 0.003949570469558239,
"learning_rate": 9.228427428616749e-06,
"loss": 0.0001,
"step": 938
},
{
"epoch": 0.35179640718562877,
"grad_norm": 0.008468257263302803,
"learning_rate": 9.225036931751811e-06,
"loss": 0.0002,
"step": 940
},
{
"epoch": 0.35254491017964074,
"grad_norm": 0.10494138300418854,
"learning_rate": 9.221639627510076e-06,
"loss": 0.0002,
"step": 942
},
{
"epoch": 0.3532934131736527,
"grad_norm": 0.06659938395023346,
"learning_rate": 9.218235521365278e-06,
"loss": 0.0004,
"step": 944
},
{
"epoch": 0.35404191616766467,
"grad_norm": 0.09659219533205032,
"learning_rate": 9.214824618802108e-06,
"loss": 0.0001,
"step": 946
},
{
"epoch": 0.35479041916167664,
"grad_norm": 0.022609582170844078,
"learning_rate": 9.211406925316214e-06,
"loss": 0.0001,
"step": 948
},
{
"epoch": 0.3555389221556886,
"grad_norm": 0.017719948664307594,
"learning_rate": 9.20798244641418e-06,
"loss": 0.0001,
"step": 950
},
{
"epoch": 0.3562874251497006,
"grad_norm": 0.06319057196378708,
"learning_rate": 9.204551187613521e-06,
"loss": 0.0002,
"step": 952
},
{
"epoch": 0.3570359281437126,
"grad_norm": 0.03745066374540329,
"learning_rate": 9.201113154442685e-06,
"loss": 0.0001,
"step": 954
},
{
"epoch": 0.35778443113772457,
"grad_norm": 0.021028850227594376,
"learning_rate": 9.197668352441025e-06,
"loss": 0.0,
"step": 956
},
{
"epoch": 0.35853293413173654,
"grad_norm": 0.02389431931078434,
"learning_rate": 9.194216787158805e-06,
"loss": 0.0001,
"step": 958
},
{
"epoch": 0.3592814371257485,
"grad_norm": 0.03340911120176315,
"learning_rate": 9.190758464157184e-06,
"loss": 0.0002,
"step": 960
},
{
"epoch": 0.3592814371257485,
"eval_accuracy": 0.9999895989441676,
"eval_loss": 5.814629912492819e-05,
"eval_runtime": 155.1915,
"eval_samples_per_second": 32.218,
"eval_steps_per_second": 8.055,
"step": 960
},
{
"epoch": 0.36002994011976047,
"grad_norm": 0.016582539305090904,
"learning_rate": 9.18729338900821e-06,
"loss": 0.0,
"step": 962
},
{
"epoch": 0.36077844311377244,
"grad_norm": 0.009625283069908619,
"learning_rate": 9.18382156729481e-06,
"loss": 0.0,
"step": 964
},
{
"epoch": 0.3615269461077844,
"grad_norm": 0.0010095112957060337,
"learning_rate": 9.18034300461078e-06,
"loss": 0.0,
"step": 966
},
{
"epoch": 0.36227544910179643,
"grad_norm": 0.0017203768948093057,
"learning_rate": 9.17685770656078e-06,
"loss": 0.0,
"step": 968
},
{
"epoch": 0.3630239520958084,
"grad_norm": 0.03041454404592514,
"learning_rate": 9.173365678760318e-06,
"loss": 0.0,
"step": 970
},
{
"epoch": 0.36377245508982037,
"grad_norm": 0.3855910897254944,
"learning_rate": 9.169866926835749e-06,
"loss": 0.0002,
"step": 972
},
{
"epoch": 0.36452095808383234,
"grad_norm": 0.04365074634552002,
"learning_rate": 9.166361456424257e-06,
"loss": 0.0001,
"step": 974
},
{
"epoch": 0.3652694610778443,
"grad_norm": 0.007284363266080618,
"learning_rate": 9.162849273173857e-06,
"loss": 0.0,
"step": 976
},
{
"epoch": 0.36601796407185627,
"grad_norm": 0.043903883546590805,
"learning_rate": 9.159330382743375e-06,
"loss": 0.0,
"step": 978
},
{
"epoch": 0.36676646706586824,
"grad_norm": 0.016840385273098946,
"learning_rate": 9.155804790802444e-06,
"loss": 0.0,
"step": 980
},
{
"epoch": 0.36751497005988026,
"grad_norm": 0.11350879073143005,
"learning_rate": 9.152272503031496e-06,
"loss": 0.0,
"step": 982
},
{
"epoch": 0.36826347305389223,
"grad_norm": 0.06382304430007935,
"learning_rate": 9.148733525121751e-06,
"loss": 0.0002,
"step": 984
},
{
"epoch": 0.3690119760479042,
"grad_norm": 0.09389964491128922,
"learning_rate": 9.145187862775208e-06,
"loss": 0.0001,
"step": 986
},
{
"epoch": 0.36976047904191617,
"grad_norm": 0.002736086491495371,
"learning_rate": 9.141635521704638e-06,
"loss": 0.0001,
"step": 988
},
{
"epoch": 0.37050898203592814,
"grad_norm": 0.07442247867584229,
"learning_rate": 9.138076507633566e-06,
"loss": 0.0001,
"step": 990
},
{
"epoch": 0.3712574850299401,
"grad_norm": 0.026373956352472305,
"learning_rate": 9.134510826296277e-06,
"loss": 0.0,
"step": 992
},
{
"epoch": 0.37200598802395207,
"grad_norm": 0.0026233713142573833,
"learning_rate": 9.130938483437792e-06,
"loss": 0.0001,
"step": 994
},
{
"epoch": 0.3727544910179641,
"grad_norm": 0.1102319285273552,
"learning_rate": 9.12735948481387e-06,
"loss": 0.0001,
"step": 996
},
{
"epoch": 0.37350299401197606,
"grad_norm": 0.08953434228897095,
"learning_rate": 9.12377383619099e-06,
"loss": 0.0,
"step": 998
},
{
"epoch": 0.37425149700598803,
"grad_norm": 0.026983065530657768,
"learning_rate": 9.120181543346348e-06,
"loss": 0.0,
"step": 1000
},
{
"epoch": 0.37425149700598803,
"eval_accuracy": 0.9999914303936028,
"eval_loss": 4.0267019357997924e-05,
"eval_runtime": 154.1351,
"eval_samples_per_second": 32.439,
"eval_steps_per_second": 8.11,
"step": 1000
},
{
"epoch": 0.375,
"grad_norm": 0.03703652322292328,
"learning_rate": 9.11658261206784e-06,
"loss": 0.0,
"step": 1002
},
{
"epoch": 0.37574850299401197,
"grad_norm": 0.06015906482934952,
"learning_rate": 9.112977048154066e-06,
"loss": 0.0,
"step": 1004
},
{
"epoch": 0.37649700598802394,
"grad_norm": 0.171669602394104,
"learning_rate": 9.109364857414306e-06,
"loss": 0.0001,
"step": 1006
},
{
"epoch": 0.3772455089820359,
"grad_norm": 0.027005095034837723,
"learning_rate": 9.10574604566852e-06,
"loss": 0.0,
"step": 1008
},
{
"epoch": 0.37799401197604793,
"grad_norm": 0.06816914677619934,
"learning_rate": 9.102120618747336e-06,
"loss": 0.0,
"step": 1010
},
{
"epoch": 0.3787425149700599,
"grad_norm": 0.029688792303204536,
"learning_rate": 9.09848858249204e-06,
"loss": 0.0,
"step": 1012
},
{
"epoch": 0.37949101796407186,
"grad_norm": 0.0352199524641037,
"learning_rate": 9.094849942754564e-06,
"loss": 0.0,
"step": 1014
},
{
"epoch": 0.38023952095808383,
"grad_norm": 0.42947277426719666,
"learning_rate": 9.091204705397485e-06,
"loss": 0.0002,
"step": 1016
},
{
"epoch": 0.3809880239520958,
"grad_norm": 0.038584258407354355,
"learning_rate": 9.087552876294003e-06,
"loss": 0.0,
"step": 1018
},
{
"epoch": 0.38173652694610777,
"grad_norm": 0.2603873312473297,
"learning_rate": 9.083894461327946e-06,
"loss": 0.0015,
"step": 1020
},
{
"epoch": 0.38248502994011974,
"grad_norm": 0.13592231273651123,
"learning_rate": 9.08022946639375e-06,
"loss": 0.0002,
"step": 1022
},
{
"epoch": 0.38323353293413176,
"grad_norm": 0.013513598591089249,
"learning_rate": 9.076557897396452e-06,
"loss": 0.0001,
"step": 1024
},
{
"epoch": 0.38398203592814373,
"grad_norm": 0.06492534279823303,
"learning_rate": 9.07287976025168e-06,
"loss": 0.0001,
"step": 1026
},
{
"epoch": 0.3847305389221557,
"grad_norm": 0.04138237237930298,
"learning_rate": 9.069195060885647e-06,
"loss": 0.0002,
"step": 1028
},
{
"epoch": 0.38547904191616766,
"grad_norm": 0.013964397832751274,
"learning_rate": 9.065503805235139e-06,
"loss": 0.0001,
"step": 1030
},
{
"epoch": 0.38622754491017963,
"grad_norm": 0.1758122593164444,
"learning_rate": 9.061805999247504e-06,
"loss": 0.0001,
"step": 1032
},
{
"epoch": 0.3869760479041916,
"grad_norm": 0.185356006026268,
"learning_rate": 9.058101648880646e-06,
"loss": 0.0003,
"step": 1034
},
{
"epoch": 0.38772455089820357,
"grad_norm": 0.020207742229104042,
"learning_rate": 9.05439076010301e-06,
"loss": 0.0003,
"step": 1036
},
{
"epoch": 0.3884730538922156,
"grad_norm": 0.07574658840894699,
"learning_rate": 9.050673338893578e-06,
"loss": 0.0002,
"step": 1038
},
{
"epoch": 0.38922155688622756,
"grad_norm": 0.15675880014896393,
"learning_rate": 9.046949391241859e-06,
"loss": 0.0003,
"step": 1040
},
{
"epoch": 0.38922155688622756,
"eval_accuracy": 0.9999522992784509,
"eval_loss": 0.00014203271712176502,
"eval_runtime": 154.4084,
"eval_samples_per_second": 32.382,
"eval_steps_per_second": 8.095,
"step": 1040
},
{
"epoch": 0.38997005988023953,
"grad_norm": 0.10081563144922256,
"learning_rate": 9.043218923147874e-06,
"loss": 0.0001,
"step": 1042
},
{
"epoch": 0.3907185628742515,
"grad_norm": 0.028760971501469612,
"learning_rate": 9.039481940622148e-06,
"loss": 0.0003,
"step": 1044
},
{
"epoch": 0.39146706586826346,
"grad_norm": 0.37775400280952454,
"learning_rate": 9.035738449685707e-06,
"loss": 0.0007,
"step": 1046
},
{
"epoch": 0.39221556886227543,
"grad_norm": 0.14730341732501984,
"learning_rate": 9.031988456370062e-06,
"loss": 0.0003,
"step": 1048
},
{
"epoch": 0.3929640718562874,
"grad_norm": 0.16259920597076416,
"learning_rate": 9.0282319667172e-06,
"loss": 0.0006,
"step": 1050
},
{
"epoch": 0.3937125748502994,
"grad_norm": 0.11165869981050491,
"learning_rate": 9.02446898677957e-06,
"loss": 0.0002,
"step": 1052
},
{
"epoch": 0.3944610778443114,
"grad_norm": 0.236286461353302,
"learning_rate": 9.020699522620091e-06,
"loss": 0.0006,
"step": 1054
},
{
"epoch": 0.39520958083832336,
"grad_norm": 0.17146489024162292,
"learning_rate": 9.016923580312114e-06,
"loss": 0.0006,
"step": 1056
},
{
"epoch": 0.39595808383233533,
"grad_norm": 0.13749942183494568,
"learning_rate": 9.013141165939439e-06,
"loss": 0.0005,
"step": 1058
},
{
"epoch": 0.3967065868263473,
"grad_norm": 0.0854322612285614,
"learning_rate": 9.009352285596287e-06,
"loss": 0.0004,
"step": 1060
},
{
"epoch": 0.39745508982035926,
"grad_norm": 0.3005140423774719,
"learning_rate": 9.005556945387301e-06,
"loss": 0.0009,
"step": 1062
},
{
"epoch": 0.39820359281437123,
"grad_norm": 0.061198897659778595,
"learning_rate": 9.001755151427532e-06,
"loss": 0.0002,
"step": 1064
},
{
"epoch": 0.39895209580838326,
"grad_norm": 0.13300061225891113,
"learning_rate": 8.997946909842426e-06,
"loss": 0.0003,
"step": 1066
},
{
"epoch": 0.3997005988023952,
"grad_norm": 0.05639196187257767,
"learning_rate": 8.99413222676782e-06,
"loss": 0.0002,
"step": 1068
},
{
"epoch": 0.4004491017964072,
"grad_norm": 0.0920565128326416,
"learning_rate": 8.990311108349926e-06,
"loss": 0.0002,
"step": 1070
},
{
"epoch": 0.40119760479041916,
"grad_norm": 0.2794632613658905,
"learning_rate": 8.986483560745335e-06,
"loss": 0.0003,
"step": 1072
},
{
"epoch": 0.40194610778443113,
"grad_norm": 0.05511578544974327,
"learning_rate": 8.982649590120982e-06,
"loss": 0.0001,
"step": 1074
},
{
"epoch": 0.4026946107784431,
"grad_norm": 0.11161552369594574,
"learning_rate": 8.978809202654161e-06,
"loss": 0.0003,
"step": 1076
},
{
"epoch": 0.40344311377245506,
"grad_norm": 0.04912755638360977,
"learning_rate": 8.974962404532503e-06,
"loss": 0.0002,
"step": 1078
},
{
"epoch": 0.4041916167664671,
"grad_norm": 0.130497545003891,
"learning_rate": 8.971109201953962e-06,
"loss": 0.0002,
"step": 1080
},
{
"epoch": 0.4041916167664671,
"eval_accuracy": 0.9999309907446557,
"eval_loss": 0.00030308307032100856,
"eval_runtime": 157.0526,
"eval_samples_per_second": 31.836,
"eval_steps_per_second": 7.959,
"step": 1080
},
{
"epoch": 0.40494011976047906,
"grad_norm": 0.06516057252883911,
"learning_rate": 8.967249601126821e-06,
"loss": 0.0001,
"step": 1082
},
{
"epoch": 0.405688622754491,
"grad_norm": 0.0653974711894989,
"learning_rate": 8.963383608269665e-06,
"loss": 0.0001,
"step": 1084
},
{
"epoch": 0.406437125748503,
"grad_norm": 0.1652081459760666,
"learning_rate": 8.959511229611377e-06,
"loss": 0.0005,
"step": 1086
},
{
"epoch": 0.40718562874251496,
"grad_norm": 0.2547818720340729,
"learning_rate": 8.955632471391132e-06,
"loss": 0.0004,
"step": 1088
},
{
"epoch": 0.40793413173652693,
"grad_norm": 0.11153703182935715,
"learning_rate": 8.951747339858383e-06,
"loss": 0.0001,
"step": 1090
},
{
"epoch": 0.4086826347305389,
"grad_norm": 0.20618999004364014,
"learning_rate": 8.947855841272852e-06,
"loss": 0.0004,
"step": 1092
},
{
"epoch": 0.4094311377245509,
"grad_norm": 0.06252986937761307,
"learning_rate": 8.943957981904518e-06,
"loss": 0.0003,
"step": 1094
},
{
"epoch": 0.4101796407185629,
"grad_norm": 0.12335634976625443,
"learning_rate": 8.94005376803361e-06,
"loss": 0.0002,
"step": 1096
},
{
"epoch": 0.41092814371257486,
"grad_norm": 0.15102048218250275,
"learning_rate": 8.936143205950596e-06,
"loss": 0.0003,
"step": 1098
},
{
"epoch": 0.4116766467065868,
"grad_norm": 0.2645941376686096,
"learning_rate": 8.93222630195617e-06,
"loss": 0.0002,
"step": 1100
},
{
"epoch": 0.4124251497005988,
"grad_norm": 0.16175216436386108,
"learning_rate": 8.928303062361244e-06,
"loss": 0.0002,
"step": 1102
},
{
"epoch": 0.41317365269461076,
"grad_norm": 0.390656977891922,
"learning_rate": 8.924373493486941e-06,
"loss": 0.0008,
"step": 1104
},
{
"epoch": 0.41392215568862273,
"grad_norm": 0.19943471252918243,
"learning_rate": 8.92043760166458e-06,
"loss": 0.0006,
"step": 1106
},
{
"epoch": 0.41467065868263475,
"grad_norm": 0.08877554535865784,
"learning_rate": 8.916495393235666e-06,
"loss": 0.0003,
"step": 1108
},
{
"epoch": 0.4154191616766467,
"grad_norm": 0.02073746733367443,
"learning_rate": 8.912546874551883e-06,
"loss": 0.0003,
"step": 1110
},
{
"epoch": 0.4161676646706587,
"grad_norm": 0.229649618268013,
"learning_rate": 8.908592051975083e-06,
"loss": 0.0003,
"step": 1112
},
{
"epoch": 0.41691616766467066,
"grad_norm": 0.2585594952106476,
"learning_rate": 8.904630931877271e-06,
"loss": 0.0005,
"step": 1114
},
{
"epoch": 0.4176646706586826,
"grad_norm": 0.09236887842416763,
"learning_rate": 8.900663520640605e-06,
"loss": 0.0003,
"step": 1116
},
{
"epoch": 0.4184131736526946,
"grad_norm": 0.1604318916797638,
"learning_rate": 8.896689824657371e-06,
"loss": 0.0008,
"step": 1118
},
{
"epoch": 0.41916167664670656,
"grad_norm": 0.1640581637620926,
"learning_rate": 8.892709850329991e-06,
"loss": 0.0009,
"step": 1120
},
{
"epoch": 0.41916167664670656,
"eval_accuracy": 0.9998298540459971,
"eval_loss": 0.0006637079059146345,
"eval_runtime": 158.0379,
"eval_samples_per_second": 31.638,
"eval_steps_per_second": 7.909,
"step": 1120
},
{
"epoch": 0.4199101796407186,
"grad_norm": 0.1830235719680786,
"learning_rate": 8.88872360407099e-06,
"loss": 0.0007,
"step": 1122
},
{
"epoch": 0.42065868263473055,
"grad_norm": 0.15978126227855682,
"learning_rate": 8.884731092303011e-06,
"loss": 0.0008,
"step": 1124
},
{
"epoch": 0.4214071856287425,
"grad_norm": 0.07531040906906128,
"learning_rate": 8.880732321458785e-06,
"loss": 0.0004,
"step": 1126
},
{
"epoch": 0.4221556886227545,
"grad_norm": 0.07047852128744125,
"learning_rate": 8.876727297981129e-06,
"loss": 0.0004,
"step": 1128
},
{
"epoch": 0.42290419161676646,
"grad_norm": 0.11832007020711899,
"learning_rate": 8.872716028322931e-06,
"loss": 0.0006,
"step": 1130
},
{
"epoch": 0.4236526946107784,
"grad_norm": 0.11789973080158234,
"learning_rate": 8.868698518947152e-06,
"loss": 0.0003,
"step": 1132
},
{
"epoch": 0.4244011976047904,
"grad_norm": 0.06593231111764908,
"learning_rate": 8.864674776326798e-06,
"loss": 0.0003,
"step": 1134
},
{
"epoch": 0.4251497005988024,
"grad_norm": 0.12147919833660126,
"learning_rate": 8.860644806944917e-06,
"loss": 0.0003,
"step": 1136
},
{
"epoch": 0.4258982035928144,
"grad_norm": 0.014330295845866203,
"learning_rate": 8.8566086172946e-06,
"loss": 0.0001,
"step": 1138
},
{
"epoch": 0.42664670658682635,
"grad_norm": 0.13002386689186096,
"learning_rate": 8.852566213878947e-06,
"loss": 0.0002,
"step": 1140
},
{
"epoch": 0.4273952095808383,
"grad_norm": 0.028262050822377205,
"learning_rate": 8.84851760321108e-06,
"loss": 0.0001,
"step": 1142
},
{
"epoch": 0.4281437125748503,
"grad_norm": 0.058746110647916794,
"learning_rate": 8.844462791814113e-06,
"loss": 0.0002,
"step": 1144
},
{
"epoch": 0.42889221556886226,
"grad_norm": 0.006739677395671606,
"learning_rate": 8.84040178622116e-06,
"loss": 0.0,
"step": 1146
},
{
"epoch": 0.4296407185628742,
"grad_norm": 0.0508301667869091,
"learning_rate": 8.83633459297531e-06,
"loss": 0.0,
"step": 1148
},
{
"epoch": 0.43038922155688625,
"grad_norm": 0.06738423556089401,
"learning_rate": 8.83226121862962e-06,
"loss": 0.0,
"step": 1150
},
{
"epoch": 0.4311377245508982,
"grad_norm": 0.093570277094841,
"learning_rate": 8.828181669747111e-06,
"loss": 0.0002,
"step": 1152
},
{
"epoch": 0.4318862275449102,
"grad_norm": 0.22780318558216095,
"learning_rate": 8.824095952900746e-06,
"loss": 0.0003,
"step": 1154
},
{
"epoch": 0.43263473053892215,
"grad_norm": 0.006822109688073397,
"learning_rate": 8.820004074673433e-06,
"loss": 0.0,
"step": 1156
},
{
"epoch": 0.4333832335329341,
"grad_norm": 0.03218008950352669,
"learning_rate": 8.815906041658001e-06,
"loss": 0.0,
"step": 1158
},
{
"epoch": 0.4341317365269461,
"grad_norm": 0.021438656374812126,
"learning_rate": 8.8118018604572e-06,
"loss": 0.0,
"step": 1160
},
{
"epoch": 0.4341317365269461,
"eval_accuracy": 0.9999750413955637,
"eval_loss": 0.000117507777758874,
"eval_runtime": 153.4917,
"eval_samples_per_second": 32.575,
"eval_steps_per_second": 8.144,
"step": 1160
},
{
"epoch": 0.43488023952095806,
"grad_norm": 0.003635610453784466,
"learning_rate": 8.807691537683685e-06,
"loss": 0.0001,
"step": 1162
},
{
"epoch": 0.4356287425149701,
"grad_norm": 0.03301112353801727,
"learning_rate": 8.80357507996e-06,
"loss": 0.0,
"step": 1164
},
{
"epoch": 0.43637724550898205,
"grad_norm": 0.09721909463405609,
"learning_rate": 8.799452493918586e-06,
"loss": 0.0002,
"step": 1166
},
{
"epoch": 0.437125748502994,
"grad_norm": 0.042455609887838364,
"learning_rate": 8.795323786201746e-06,
"loss": 0.0,
"step": 1168
},
{
"epoch": 0.437874251497006,
"grad_norm": 0.022428715601563454,
"learning_rate": 8.791188963461653e-06,
"loss": 0.0001,
"step": 1170
},
{
"epoch": 0.43862275449101795,
"grad_norm": 0.0063305930234491825,
"learning_rate": 8.787048032360332e-06,
"loss": 0.0,
"step": 1172
},
{
"epoch": 0.4393712574850299,
"grad_norm": 0.027178645133972168,
"learning_rate": 8.782900999569646e-06,
"loss": 0.0001,
"step": 1174
},
{
"epoch": 0.44011976047904194,
"grad_norm": 0.016478972509503365,
"learning_rate": 8.778747871771293e-06,
"loss": 0.0001,
"step": 1176
},
{
"epoch": 0.4408682634730539,
"grad_norm": 0.0024430316407233477,
"learning_rate": 8.774588655656787e-06,
"loss": 0.0,
"step": 1178
},
{
"epoch": 0.4416167664670659,
"grad_norm": 0.0025271910708397627,
"learning_rate": 8.770423357927463e-06,
"loss": 0.0,
"step": 1180
},
{
"epoch": 0.44236526946107785,
"grad_norm": 0.003522343933582306,
"learning_rate": 8.766251985294435e-06,
"loss": 0.0001,
"step": 1182
},
{
"epoch": 0.4431137724550898,
"grad_norm": 0.0037217664066702127,
"learning_rate": 8.762074544478622e-06,
"loss": 0.0002,
"step": 1184
},
{
"epoch": 0.4438622754491018,
"grad_norm": 0.003973813261836767,
"learning_rate": 8.757891042210713e-06,
"loss": 0.0,
"step": 1186
},
{
"epoch": 0.44461077844311375,
"grad_norm": 0.005763462278991938,
"learning_rate": 8.753701485231165e-06,
"loss": 0.0,
"step": 1188
},
{
"epoch": 0.4453592814371258,
"grad_norm": 0.005196116399019957,
"learning_rate": 8.749505880290188e-06,
"loss": 0.0,
"step": 1190
},
{
"epoch": 0.44610778443113774,
"grad_norm": 0.0032948690932244062,
"learning_rate": 8.74530423414774e-06,
"loss": 0.0,
"step": 1192
},
{
"epoch": 0.4468562874251497,
"grad_norm": 0.009100310504436493,
"learning_rate": 8.741096553573506e-06,
"loss": 0.0,
"step": 1194
},
{
"epoch": 0.4476047904191617,
"grad_norm": 0.0061983345076441765,
"learning_rate": 8.736882845346906e-06,
"loss": 0.0,
"step": 1196
},
{
"epoch": 0.44835329341317365,
"grad_norm": 0.0011341345962136984,
"learning_rate": 8.732663116257057e-06,
"loss": 0.0,
"step": 1198
},
{
"epoch": 0.4491017964071856,
"grad_norm": 0.0010620451066643,
"learning_rate": 8.728437373102784e-06,
"loss": 0.0,
"step": 1200
},
{
"epoch": 0.4491017964071856,
"eval_accuracy": 0.9999988700564972,
"eval_loss": 9.44121893553529e-06,
"eval_runtime": 153.9703,
"eval_samples_per_second": 32.474,
"eval_steps_per_second": 8.118,
"step": 1200
},
{
"epoch": 0.4498502994011976,
"grad_norm": 0.0015724517870694399,
"learning_rate": 8.724205622692608e-06,
"loss": 0.0001,
"step": 1202
},
{
"epoch": 0.4505988023952096,
"grad_norm": 0.0011631123488768935,
"learning_rate": 8.719967871844715e-06,
"loss": 0.0,
"step": 1204
},
{
"epoch": 0.4513473053892216,
"grad_norm": 0.003892699722200632,
"learning_rate": 8.715724127386971e-06,
"loss": 0.0,
"step": 1206
},
{
"epoch": 0.45209580838323354,
"grad_norm": 0.001457210979424417,
"learning_rate": 8.711474396156894e-06,
"loss": 0.0,
"step": 1208
},
{
"epoch": 0.4528443113772455,
"grad_norm": 0.04398849606513977,
"learning_rate": 8.707218685001648e-06,
"loss": 0.0,
"step": 1210
},
{
"epoch": 0.4535928143712575,
"grad_norm": 0.0016321828588843346,
"learning_rate": 8.702957000778029e-06,
"loss": 0.0,
"step": 1212
},
{
"epoch": 0.45434131736526945,
"grad_norm": 0.006306509952992201,
"learning_rate": 8.698689350352465e-06,
"loss": 0.0,
"step": 1214
},
{
"epoch": 0.4550898203592814,
"grad_norm": 0.003088061697781086,
"learning_rate": 8.69441574060099e-06,
"loss": 0.0,
"step": 1216
},
{
"epoch": 0.45583832335329344,
"grad_norm": 0.0059152874164283276,
"learning_rate": 8.690136178409237e-06,
"loss": 0.0,
"step": 1218
},
{
"epoch": 0.4565868263473054,
"grad_norm": 0.0006292685866355896,
"learning_rate": 8.685850670672438e-06,
"loss": 0.0,
"step": 1220
},
{
"epoch": 0.4573353293413174,
"grad_norm": 0.001894032466225326,
"learning_rate": 8.681559224295401e-06,
"loss": 0.0,
"step": 1222
},
{
"epoch": 0.45808383233532934,
"grad_norm": 0.000890376337338239,
"learning_rate": 8.6772618461925e-06,
"loss": 0.0,
"step": 1224
},
{
"epoch": 0.4588323353293413,
"grad_norm": 0.047846511006355286,
"learning_rate": 8.672958543287666e-06,
"loss": 0.0,
"step": 1226
},
{
"epoch": 0.4595808383233533,
"grad_norm": 0.0039296639151871204,
"learning_rate": 8.668649322514382e-06,
"loss": 0.0,
"step": 1228
},
{
"epoch": 0.46032934131736525,
"grad_norm": 0.0010013898136094213,
"learning_rate": 8.66433419081566e-06,
"loss": 0.0,
"step": 1230
},
{
"epoch": 0.46107784431137727,
"grad_norm": 0.0013401862233877182,
"learning_rate": 8.660013155144036e-06,
"loss": 0.0,
"step": 1232
},
{
"epoch": 0.46182634730538924,
"grad_norm": 0.01721956580877304,
"learning_rate": 8.655686222461561e-06,
"loss": 0.0,
"step": 1234
},
{
"epoch": 0.4625748502994012,
"grad_norm": 0.10839847475290298,
"learning_rate": 8.651353399739787e-06,
"loss": 0.0,
"step": 1236
},
{
"epoch": 0.4633233532934132,
"grad_norm": 0.0019374943803995848,
"learning_rate": 8.647014693959754e-06,
"loss": 0.0,
"step": 1238
},
{
"epoch": 0.46407185628742514,
"grad_norm": 0.0010845274664461613,
"learning_rate": 8.642670112111982e-06,
"loss": 0.0,
"step": 1240
},
{
"epoch": 0.46407185628742514,
"eval_accuracy": 0.999998779749899,
"eval_loss": 7.805577297403943e-06,
"eval_runtime": 153.998,
"eval_samples_per_second": 32.468,
"eval_steps_per_second": 8.117,
"step": 1240
},
{
"epoch": 0.4648203592814371,
"grad_norm": 0.14908014237880707,
"learning_rate": 8.63831966119646e-06,
"loss": 0.0002,
"step": 1242
},
{
"epoch": 0.4655688622754491,
"grad_norm": 0.0005565496394410729,
"learning_rate": 8.633963348222628e-06,
"loss": 0.0,
"step": 1244
},
{
"epoch": 0.4663173652694611,
"grad_norm": 0.016671478748321533,
"learning_rate": 8.629601180209382e-06,
"loss": 0.0,
"step": 1246
},
{
"epoch": 0.46706586826347307,
"grad_norm": 0.0019113154849037528,
"learning_rate": 8.625233164185035e-06,
"loss": 0.0,
"step": 1248
},
{
"epoch": 0.46781437125748504,
"grad_norm": 0.24114775657653809,
"learning_rate": 8.620859307187339e-06,
"loss": 0.0002,
"step": 1250
},
{
"epoch": 0.468562874251497,
"grad_norm": 0.12908697128295898,
"learning_rate": 8.616479616263444e-06,
"loss": 0.0001,
"step": 1252
},
{
"epoch": 0.469311377245509,
"grad_norm": 0.04974567890167236,
"learning_rate": 8.61209409846991e-06,
"loss": 0.0,
"step": 1254
},
{
"epoch": 0.47005988023952094,
"grad_norm": 0.14523474872112274,
"learning_rate": 8.607702760872679e-06,
"loss": 0.0005,
"step": 1256
},
{
"epoch": 0.4708083832335329,
"grad_norm": 0.0028332837391644716,
"learning_rate": 8.60330561054707e-06,
"loss": 0.0003,
"step": 1258
},
{
"epoch": 0.47155688622754494,
"grad_norm": 0.09443452209234238,
"learning_rate": 8.598902654577768e-06,
"loss": 0.0002,
"step": 1260
},
{
"epoch": 0.4723053892215569,
"grad_norm": 0.1735847145318985,
"learning_rate": 8.594493900058817e-06,
"loss": 0.0005,
"step": 1262
},
{
"epoch": 0.47305389221556887,
"grad_norm": 0.09729497134685516,
"learning_rate": 8.590079354093594e-06,
"loss": 0.0001,
"step": 1264
},
{
"epoch": 0.47380239520958084,
"grad_norm": 0.014571278356015682,
"learning_rate": 8.585659023794818e-06,
"loss": 0.0001,
"step": 1266
},
{
"epoch": 0.4745508982035928,
"grad_norm": 0.08880387991666794,
"learning_rate": 8.581232916284519e-06,
"loss": 0.0002,
"step": 1268
},
{
"epoch": 0.4752994011976048,
"grad_norm": 0.056134164333343506,
"learning_rate": 8.57680103869404e-06,
"loss": 0.0001,
"step": 1270
},
{
"epoch": 0.47604790419161674,
"grad_norm": 0.06601478904485703,
"learning_rate": 8.572363398164017e-06,
"loss": 0.0001,
"step": 1272
},
{
"epoch": 0.47679640718562877,
"grad_norm": 0.04634417966008186,
"learning_rate": 8.567920001844376e-06,
"loss": 0.0001,
"step": 1274
},
{
"epoch": 0.47754491017964074,
"grad_norm": 0.06786518543958664,
"learning_rate": 8.563470856894316e-06,
"loss": 0.0002,
"step": 1276
},
{
"epoch": 0.4782934131736527,
"grad_norm": 0.0038419270422309637,
"learning_rate": 8.559015970482292e-06,
"loss": 0.0,
"step": 1278
},
{
"epoch": 0.47904191616766467,
"grad_norm": 0.005290990229696035,
"learning_rate": 8.554555349786016e-06,
"loss": 0.0,
"step": 1280
},
{
"epoch": 0.47904191616766467,
"eval_accuracy": 1.0,
"eval_loss": 1.6949796190601774e-05,
"eval_runtime": 162.9441,
"eval_samples_per_second": 30.685,
"eval_steps_per_second": 7.671,
"step": 1280
},
{
"epoch": 0.47979041916167664,
"grad_norm": 0.015714962035417557,
"learning_rate": 8.550089001992438e-06,
"loss": 0.0,
"step": 1282
},
{
"epoch": 0.4805389221556886,
"grad_norm": 0.016865752637386322,
"learning_rate": 8.545616934297733e-06,
"loss": 0.0,
"step": 1284
},
{
"epoch": 0.4812874251497006,
"grad_norm": 0.0030540430452674627,
"learning_rate": 8.541139153907296e-06,
"loss": 0.0,
"step": 1286
},
{
"epoch": 0.4820359281437126,
"grad_norm": 0.004076346755027771,
"learning_rate": 8.536655668035723e-06,
"loss": 0.0,
"step": 1288
},
{
"epoch": 0.48278443113772457,
"grad_norm": 0.0015489828074350953,
"learning_rate": 8.532166483906804e-06,
"loss": 0.0,
"step": 1290
},
{
"epoch": 0.48353293413173654,
"grad_norm": 0.0032020832877606153,
"learning_rate": 8.527671608753508e-06,
"loss": 0.0,
"step": 1292
},
{
"epoch": 0.4842814371257485,
"grad_norm": 0.0029930637683719397,
"learning_rate": 8.523171049817974e-06,
"loss": 0.0,
"step": 1294
},
{
"epoch": 0.48502994011976047,
"grad_norm": 0.00045903853606432676,
"learning_rate": 8.518664814351502e-06,
"loss": 0.0,
"step": 1296
},
{
"epoch": 0.48577844311377244,
"grad_norm": 0.002175545785576105,
"learning_rate": 8.514152909614538e-06,
"loss": 0.0,
"step": 1298
},
{
"epoch": 0.4865269461077844,
"grad_norm": 0.00034521182533353567,
"learning_rate": 8.509635342876655e-06,
"loss": 0.0,
"step": 1300
},
{
"epoch": 0.48727544910179643,
"grad_norm": 0.0007213663193397224,
"learning_rate": 8.505112121416554e-06,
"loss": 0.0,
"step": 1302
},
{
"epoch": 0.4880239520958084,
"grad_norm": 0.000988309970125556,
"learning_rate": 8.500583252522053e-06,
"loss": 0.0,
"step": 1304
},
{
"epoch": 0.48877245508982037,
"grad_norm": 0.0006475381087511778,
"learning_rate": 8.496048743490053e-06,
"loss": 0.0,
"step": 1306
},
{
"epoch": 0.48952095808383234,
"grad_norm": 0.004469580017030239,
"learning_rate": 8.49150860162656e-06,
"loss": 0.0,
"step": 1308
},
{
"epoch": 0.4902694610778443,
"grad_norm": 0.0006323789712041616,
"learning_rate": 8.486962834246646e-06,
"loss": 0.0,
"step": 1310
},
{
"epoch": 0.49101796407185627,
"grad_norm": 0.0003597615868784487,
"learning_rate": 8.482411448674445e-06,
"loss": 0.0,
"step": 1312
},
{
"epoch": 0.49176646706586824,
"grad_norm": 0.0009737180080264807,
"learning_rate": 8.477854452243149e-06,
"loss": 0.0,
"step": 1314
},
{
"epoch": 0.49251497005988026,
"grad_norm": 0.00047102788812480867,
"learning_rate": 8.473291852294986e-06,
"loss": 0.0,
"step": 1316
},
{
"epoch": 0.49326347305389223,
"grad_norm": 0.0006197803886607289,
"learning_rate": 8.468723656181219e-06,
"loss": 0.0,
"step": 1318
},
{
"epoch": 0.4940119760479042,
"grad_norm": 0.0006234439788386226,
"learning_rate": 8.464149871262118e-06,
"loss": 0.0,
"step": 1320
},
{
"epoch": 0.4940119760479042,
"eval_accuracy": 1.0,
"eval_loss": 1.9553074253053637e-06,
"eval_runtime": 156.0817,
"eval_samples_per_second": 32.035,
"eval_steps_per_second": 8.009,
"step": 1320
},
{
"epoch": 0.49476047904191617,
"grad_norm": 0.0012577202869579196,
"learning_rate": 8.459570504906962e-06,
"loss": 0.0,
"step": 1322
},
{
"epoch": 0.49550898203592814,
"grad_norm": 0.0013276775134727359,
"learning_rate": 8.454985564494025e-06,
"loss": 0.0,
"step": 1324
},
{
"epoch": 0.4962574850299401,
"grad_norm": 0.0007339988951571286,
"learning_rate": 8.450395057410561e-06,
"loss": 0.0,
"step": 1326
},
{
"epoch": 0.49700598802395207,
"grad_norm": 0.00023488645092584193,
"learning_rate": 8.445798991052791e-06,
"loss": 0.0,
"step": 1328
},
{
"epoch": 0.4977544910179641,
"grad_norm": 0.008007602766156197,
"learning_rate": 8.441197372825892e-06,
"loss": 0.0,
"step": 1330
},
{
"epoch": 0.49850299401197606,
"grad_norm": 0.00031445815693587065,
"learning_rate": 8.436590210143991e-06,
"loss": 0.0,
"step": 1332
},
{
"epoch": 0.49925149700598803,
"grad_norm": 0.000408270803745836,
"learning_rate": 8.431977510430145e-06,
"loss": 0.0,
"step": 1334
},
{
"epoch": 0.5,
"grad_norm": 0.00037585021345876157,
"learning_rate": 8.427359281116335e-06,
"loss": 0.0,
"step": 1336
},
{
"epoch": 0.500748502994012,
"grad_norm": 0.00020924248383380473,
"learning_rate": 8.422735529643445e-06,
"loss": 0.0,
"step": 1338
},
{
"epoch": 0.5014970059880239,
"grad_norm": 0.00034080087789334357,
"learning_rate": 8.418106263461261e-06,
"loss": 0.0,
"step": 1340
},
{
"epoch": 0.5022455089820359,
"grad_norm": 0.001094786450266838,
"learning_rate": 8.413471490028456e-06,
"loss": 0.0,
"step": 1342
},
{
"epoch": 0.5029940119760479,
"grad_norm": 0.00020074410713277757,
"learning_rate": 8.408831216812574e-06,
"loss": 0.0,
"step": 1344
},
{
"epoch": 0.5037425149700598,
"grad_norm": 0.00031989437411539257,
"learning_rate": 8.404185451290017e-06,
"loss": 0.0,
"step": 1346
},
{
"epoch": 0.5044910179640718,
"grad_norm": 0.0004511797451414168,
"learning_rate": 8.399534200946044e-06,
"loss": 0.0,
"step": 1348
},
{
"epoch": 0.5052395209580839,
"grad_norm": 0.0033039976842701435,
"learning_rate": 8.394877473274743e-06,
"loss": 0.0,
"step": 1350
},
{
"epoch": 0.5059880239520959,
"grad_norm": 0.00033330474980175495,
"learning_rate": 8.39021527577903e-06,
"loss": 0.0,
"step": 1352
},
{
"epoch": 0.5067365269461078,
"grad_norm": 0.00710050156340003,
"learning_rate": 8.38554761597064e-06,
"loss": 0.0,
"step": 1354
},
{
"epoch": 0.5074850299401198,
"grad_norm": 0.00019886674999725074,
"learning_rate": 8.380874501370098e-06,
"loss": 0.0,
"step": 1356
},
{
"epoch": 0.5082335329341318,
"grad_norm": 0.0007205790607258677,
"learning_rate": 8.376195939506727e-06,
"loss": 0.0,
"step": 1358
},
{
"epoch": 0.5089820359281437,
"grad_norm": 0.00022511072165798396,
"learning_rate": 8.371511937918616e-06,
"loss": 0.0,
"step": 1360
},
{
"epoch": 0.5089820359281437,
"eval_accuracy": 1.0,
"eval_loss": 1.2852336794821895e-06,
"eval_runtime": 155.6867,
"eval_samples_per_second": 32.116,
"eval_steps_per_second": 8.029,
"step": 1360
},
{
"epoch": 0.5097305389221557,
"grad_norm": 0.000407641549827531,
"learning_rate": 8.366822504152636e-06,
"loss": 0.0,
"step": 1362
},
{
"epoch": 0.5104790419161677,
"grad_norm": 0.001749478979036212,
"learning_rate": 8.362127645764392e-06,
"loss": 0.0,
"step": 1364
},
{
"epoch": 0.5112275449101796,
"grad_norm": 0.00031519352342002094,
"learning_rate": 8.357427370318239e-06,
"loss": 0.0,
"step": 1366
},
{
"epoch": 0.5119760479041916,
"grad_norm": 0.0004909691051580012,
"learning_rate": 8.352721685387258e-06,
"loss": 0.0,
"step": 1368
},
{
"epoch": 0.5127245508982036,
"grad_norm": 0.0003617761831264943,
"learning_rate": 8.348010598553245e-06,
"loss": 0.0,
"step": 1370
},
{
"epoch": 0.5134730538922155,
"grad_norm": 0.0003041178279090673,
"learning_rate": 8.3432941174067e-06,
"loss": 0.0,
"step": 1372
},
{
"epoch": 0.5142215568862275,
"grad_norm": 0.0009227189584635198,
"learning_rate": 8.338572249546813e-06,
"loss": 0.0,
"step": 1374
},
{
"epoch": 0.5149700598802395,
"grad_norm": 0.00037840052391402423,
"learning_rate": 8.33384500258146e-06,
"loss": 0.0,
"step": 1376
},
{
"epoch": 0.5157185628742516,
"grad_norm": 0.0003028454084414989,
"learning_rate": 8.329112384127172e-06,
"loss": 0.0,
"step": 1378
},
{
"epoch": 0.5164670658682635,
"grad_norm": 0.00024328533618245274,
"learning_rate": 8.324374401809144e-06,
"loss": 0.0,
"step": 1380
},
{
"epoch": 0.5172155688622755,
"grad_norm": 0.00041966489516198635,
"learning_rate": 8.319631063261209e-06,
"loss": 0.0,
"step": 1382
},
{
"epoch": 0.5179640718562875,
"grad_norm": 0.0001933051535161212,
"learning_rate": 8.314882376125832e-06,
"loss": 0.0,
"step": 1384
},
{
"epoch": 0.5187125748502994,
"grad_norm": 0.00017571724310982972,
"learning_rate": 8.310128348054093e-06,
"loss": 0.0,
"step": 1386
},
{
"epoch": 0.5194610778443114,
"grad_norm": 0.00010965206456603482,
"learning_rate": 8.305368986705683e-06,
"loss": 0.0,
"step": 1388
},
{
"epoch": 0.5202095808383234,
"grad_norm": 0.00016223240527324378,
"learning_rate": 8.300604299748876e-06,
"loss": 0.0,
"step": 1390
},
{
"epoch": 0.5209580838323353,
"grad_norm": 0.00020105067233089358,
"learning_rate": 8.295834294860535e-06,
"loss": 0.0,
"step": 1392
},
{
"epoch": 0.5217065868263473,
"grad_norm": 0.00012361881090328097,
"learning_rate": 8.291058979726092e-06,
"loss": 0.0,
"step": 1394
},
{
"epoch": 0.5224550898203593,
"grad_norm": 0.00031712997588329017,
"learning_rate": 8.286278362039527e-06,
"loss": 0.0,
"step": 1396
},
{
"epoch": 0.5232035928143712,
"grad_norm": 0.0007049996056593955,
"learning_rate": 8.281492449503372e-06,
"loss": 0.0,
"step": 1398
},
{
"epoch": 0.5239520958083832,
"grad_norm": 0.00017834010941442102,
"learning_rate": 8.276701249828684e-06,
"loss": 0.0,
"step": 1400
},
{
"epoch": 0.5239520958083832,
"eval_accuracy": 1.0,
"eval_loss": 1.0108310561918188e-06,
"eval_runtime": 164.1638,
"eval_samples_per_second": 30.457,
"eval_steps_per_second": 7.614,
"step": 1400
},
{
"epoch": 0.5247005988023952,
"grad_norm": 0.00016547701670788229,
"learning_rate": 8.271904770735042e-06,
"loss": 0.0,
"step": 1402
},
{
"epoch": 0.5254491017964071,
"grad_norm": 0.00018670795543584973,
"learning_rate": 8.267103019950529e-06,
"loss": 0.0,
"step": 1404
},
{
"epoch": 0.5261976047904192,
"grad_norm": 0.0005327853723429143,
"learning_rate": 8.262296005211722e-06,
"loss": 0.0,
"step": 1406
},
{
"epoch": 0.5269461077844312,
"grad_norm": 0.00011932725465158,
"learning_rate": 8.257483734263682e-06,
"loss": 0.0,
"step": 1408
},
{
"epoch": 0.5276946107784432,
"grad_norm": 0.0010636606020852923,
"learning_rate": 8.252666214859936e-06,
"loss": 0.0,
"step": 1410
},
{
"epoch": 0.5284431137724551,
"grad_norm": 0.00017524044960737228,
"learning_rate": 8.247843454762467e-06,
"loss": 0.0,
"step": 1412
},
{
"epoch": 0.5291916167664671,
"grad_norm": 0.00048075238009914756,
"learning_rate": 8.243015461741707e-06,
"loss": 0.0,
"step": 1414
},
{
"epoch": 0.5299401197604791,
"grad_norm": 0.000147451224620454,
"learning_rate": 8.238182243576512e-06,
"loss": 0.0,
"step": 1416
},
{
"epoch": 0.530688622754491,
"grad_norm": 0.00012748232984449714,
"learning_rate": 8.233343808054159e-06,
"loss": 0.0,
"step": 1418
},
{
"epoch": 0.531437125748503,
"grad_norm": 0.00022851829999126494,
"learning_rate": 8.228500162970333e-06,
"loss": 0.0,
"step": 1420
},
{
"epoch": 0.532185628742515,
"grad_norm": 0.00014870429004076868,
"learning_rate": 8.223651316129115e-06,
"loss": 0.0,
"step": 1422
},
{
"epoch": 0.5329341317365269,
"grad_norm": 0.00023895545746199787,
"learning_rate": 8.21879727534296e-06,
"loss": 0.0,
"step": 1424
},
{
"epoch": 0.5336826347305389,
"grad_norm": 0.0001619049144210294,
"learning_rate": 8.213938048432697e-06,
"loss": 0.0,
"step": 1426
},
{
"epoch": 0.5344311377245509,
"grad_norm": 0.0002205699129262939,
"learning_rate": 8.20907364322751e-06,
"loss": 0.0,
"step": 1428
},
{
"epoch": 0.5351796407185628,
"grad_norm": 0.0001235086820088327,
"learning_rate": 8.204204067564924e-06,
"loss": 0.0,
"step": 1430
},
{
"epoch": 0.5359281437125748,
"grad_norm": 0.00023735742433927953,
"learning_rate": 8.199329329290798e-06,
"loss": 0.0,
"step": 1432
},
{
"epoch": 0.5366766467065869,
"grad_norm": 0.00017734240100253373,
"learning_rate": 8.194449436259305e-06,
"loss": 0.0,
"step": 1434
},
{
"epoch": 0.5374251497005988,
"grad_norm": 0.00017838150961324573,
"learning_rate": 8.189564396332927e-06,
"loss": 0.0,
"step": 1436
},
{
"epoch": 0.5381736526946108,
"grad_norm": 0.0001554026093799621,
"learning_rate": 8.184674217382438e-06,
"loss": 0.0,
"step": 1438
},
{
"epoch": 0.5389221556886228,
"grad_norm": 0.00021117751020938158,
"learning_rate": 8.179778907286889e-06,
"loss": 0.0,
"step": 1440
},
{
"epoch": 0.5389221556886228,
"eval_accuracy": 1.0,
"eval_loss": 8.898123837752792e-07,
"eval_runtime": 163.5535,
"eval_samples_per_second": 30.571,
"eval_steps_per_second": 7.643,
"step": 1440
},
{
"epoch": 0.5396706586826348,
"grad_norm": 0.0002229697274742648,
"learning_rate": 8.174878473933601e-06,
"loss": 0.0,
"step": 1442
},
{
"epoch": 0.5404191616766467,
"grad_norm": 9.283604595111683e-05,
"learning_rate": 8.16997292521815e-06,
"loss": 0.0,
"step": 1444
},
{
"epoch": 0.5411676646706587,
"grad_norm": 0.0001886676182039082,
"learning_rate": 8.165062269044353e-06,
"loss": 0.0,
"step": 1446
},
{
"epoch": 0.5419161676646707,
"grad_norm": 0.0001873478468041867,
"learning_rate": 8.160146513324256e-06,
"loss": 0.0,
"step": 1448
},
{
"epoch": 0.5426646706586826,
"grad_norm": 0.00011518682003952563,
"learning_rate": 8.15522566597812e-06,
"loss": 0.0,
"step": 1450
},
{
"epoch": 0.5434131736526946,
"grad_norm": 0.0001653393410379067,
"learning_rate": 8.150299734934413e-06,
"loss": 0.0,
"step": 1452
},
{
"epoch": 0.5441616766467066,
"grad_norm": 0.0001261346333194524,
"learning_rate": 8.14536872812979e-06,
"loss": 0.0,
"step": 1454
},
{
"epoch": 0.5449101796407185,
"grad_norm": 0.00019827600044663996,
"learning_rate": 8.140432653509089e-06,
"loss": 0.0,
"step": 1456
},
{
"epoch": 0.5456586826347305,
"grad_norm": 0.00042794988257810473,
"learning_rate": 8.135491519025307e-06,
"loss": 0.0,
"step": 1458
},
{
"epoch": 0.5464071856287425,
"grad_norm": 0.00011408683349145576,
"learning_rate": 8.130545332639599e-06,
"loss": 0.0,
"step": 1460
},
{
"epoch": 0.5471556886227545,
"grad_norm": 0.0001599421666469425,
"learning_rate": 8.125594102321256e-06,
"loss": 0.0,
"step": 1462
},
{
"epoch": 0.5479041916167665,
"grad_norm": 0.00013200732064433396,
"learning_rate": 8.120637836047698e-06,
"loss": 0.0,
"step": 1464
},
{
"epoch": 0.5486526946107785,
"grad_norm": 0.00012747867731377482,
"learning_rate": 8.115676541804456e-06,
"loss": 0.0,
"step": 1466
},
{
"epoch": 0.5494011976047904,
"grad_norm": 0.00013039227633271366,
"learning_rate": 8.110710227585169e-06,
"loss": 0.0,
"step": 1468
},
{
"epoch": 0.5501497005988024,
"grad_norm": 0.0001378858432872221,
"learning_rate": 8.105738901391553e-06,
"loss": 0.0,
"step": 1470
},
{
"epoch": 0.5508982035928144,
"grad_norm": 0.00019915043958462775,
"learning_rate": 8.100762571233409e-06,
"loss": 0.0,
"step": 1472
},
{
"epoch": 0.5516467065868264,
"grad_norm": 0.00013786421914119273,
"learning_rate": 8.095781245128598e-06,
"loss": 0.0,
"step": 1474
},
{
"epoch": 0.5523952095808383,
"grad_norm": 0.00011769870616262779,
"learning_rate": 8.090794931103026e-06,
"loss": 0.0,
"step": 1476
},
{
"epoch": 0.5531437125748503,
"grad_norm": 0.0001314223773078993,
"learning_rate": 8.085803637190643e-06,
"loss": 0.0,
"step": 1478
},
{
"epoch": 0.5538922155688623,
"grad_norm": 0.0017582617001608014,
"learning_rate": 8.080807371433415e-06,
"loss": 0.0,
"step": 1480
},
{
"epoch": 0.5538922155688623,
"eval_accuracy": 1.0,
"eval_loss": 7.754564421702526e-07,
"eval_runtime": 165.5806,
"eval_samples_per_second": 30.197,
"eval_steps_per_second": 7.549,
"step": 1480
},
{
"epoch": 0.5546407185628742,
"grad_norm": 0.0001497814228059724,
"learning_rate": 8.075806141881327e-06,
"loss": 0.0001,
"step": 1482
},
{
"epoch": 0.5553892215568862,
"grad_norm": 0.022318042814731598,
"learning_rate": 8.07079995659235e-06,
"loss": 0.0,
"step": 1484
},
{
"epoch": 0.5561377245508982,
"grad_norm": 0.00037483617779798806,
"learning_rate": 8.065788823632451e-06,
"loss": 0.0,
"step": 1486
},
{
"epoch": 0.5568862275449101,
"grad_norm": 0.017027398571372032,
"learning_rate": 8.060772751075564e-06,
"loss": 0.0,
"step": 1488
},
{
"epoch": 0.5576347305389222,
"grad_norm": 0.0005349895800463855,
"learning_rate": 8.05575174700358e-06,
"loss": 0.0,
"step": 1490
},
{
"epoch": 0.5583832335329342,
"grad_norm": 9.526366193313152e-05,
"learning_rate": 8.05072581950634e-06,
"loss": 0.0,
"step": 1492
},
{
"epoch": 0.5591317365269461,
"grad_norm": 0.0018775092903524637,
"learning_rate": 8.045694976681613e-06,
"loss": 0.0,
"step": 1494
},
{
"epoch": 0.5598802395209581,
"grad_norm": 0.006572918966412544,
"learning_rate": 8.04065922663509e-06,
"loss": 0.0,
"step": 1496
},
{
"epoch": 0.5606287425149701,
"grad_norm": 0.00012014804087812081,
"learning_rate": 8.035618577480369e-06,
"loss": 0.0,
"step": 1498
},
{
"epoch": 0.561377245508982,
"grad_norm": 0.00019066958338953555,
"learning_rate": 8.030573037338942e-06,
"loss": 0.0,
"step": 1500
},
{
"epoch": 0.562125748502994,
"grad_norm": 0.000158317168825306,
"learning_rate": 8.025522614340177e-06,
"loss": 0.0,
"step": 1502
},
{
"epoch": 0.562874251497006,
"grad_norm": 0.0006968253292143345,
"learning_rate": 8.020467316621316e-06,
"loss": 0.0,
"step": 1504
},
{
"epoch": 0.563622754491018,
"grad_norm": 0.00015155051369220018,
"learning_rate": 8.015407152327448e-06,
"loss": 0.0,
"step": 1506
},
{
"epoch": 0.5643712574850299,
"grad_norm": 0.00017856295744422823,
"learning_rate": 8.010342129611508e-06,
"loss": 0.0,
"step": 1508
},
{
"epoch": 0.5651197604790419,
"grad_norm": 0.0007032952271401882,
"learning_rate": 8.005272256634257e-06,
"loss": 0.0,
"step": 1510
},
{
"epoch": 0.5658682634730539,
"grad_norm": 0.0001873714500106871,
"learning_rate": 8.000197541564273e-06,
"loss": 0.0,
"step": 1512
},
{
"epoch": 0.5666167664670658,
"grad_norm": 0.00024626540835015476,
"learning_rate": 7.99511799257793e-06,
"loss": 0.0,
"step": 1514
},
{
"epoch": 0.5673652694610778,
"grad_norm": 0.00011799616186181083,
"learning_rate": 7.990033617859396e-06,
"loss": 0.0,
"step": 1516
},
{
"epoch": 0.5681137724550899,
"grad_norm": 0.00017817386833485216,
"learning_rate": 7.984944425600614e-06,
"loss": 0.0,
"step": 1518
},
{
"epoch": 0.5688622754491018,
"grad_norm": 0.00012566296209115535,
"learning_rate": 7.979850424001283e-06,
"loss": 0.0,
"step": 1520
},
{
"epoch": 0.5688622754491018,
"eval_accuracy": 0.9999997747747748,
"eval_loss": 2.201959887315752e-06,
"eval_runtime": 164.4185,
"eval_samples_per_second": 30.41,
"eval_steps_per_second": 7.603,
"step": 1520
},
{
"epoch": 0.5696107784431138,
"grad_norm": 0.00012715034245047718,
"learning_rate": 7.97475162126886e-06,
"loss": 0.0,
"step": 1522
},
{
"epoch": 0.5703592814371258,
"grad_norm": 0.0005135888350196183,
"learning_rate": 7.96964802561853e-06,
"loss": 0.0,
"step": 1524
},
{
"epoch": 0.5711077844311377,
"grad_norm": 0.0008899507811293006,
"learning_rate": 7.964539645273204e-06,
"loss": 0.0,
"step": 1526
},
{
"epoch": 0.5718562874251497,
"grad_norm": 0.00014234622358344495,
"learning_rate": 7.9594264884635e-06,
"loss": 0.0,
"step": 1528
},
{
"epoch": 0.5726047904191617,
"grad_norm": 0.00044490184518508613,
"learning_rate": 7.954308563427732e-06,
"loss": 0.0,
"step": 1530
},
{
"epoch": 0.5733532934131736,
"grad_norm": 0.0004495025204960257,
"learning_rate": 7.9491858784119e-06,
"loss": 0.0,
"step": 1532
},
{
"epoch": 0.5741017964071856,
"grad_norm": 0.0003148759133182466,
"learning_rate": 7.944058441669671e-06,
"loss": 0.0,
"step": 1534
},
{
"epoch": 0.5748502994011976,
"grad_norm": 0.01583685912191868,
"learning_rate": 7.938926261462366e-06,
"loss": 0.0,
"step": 1536
},
{
"epoch": 0.5755988023952096,
"grad_norm": 0.0001242299476871267,
"learning_rate": 7.933789346058951e-06,
"loss": 0.0,
"step": 1538
},
{
"epoch": 0.5763473053892215,
"grad_norm": 0.0006354754441417754,
"learning_rate": 7.928647703736024e-06,
"loss": 0.0,
"step": 1540
},
{
"epoch": 0.5770958083832335,
"grad_norm": 0.00038198617403395474,
"learning_rate": 7.923501342777788e-06,
"loss": 0.0,
"step": 1542
},
{
"epoch": 0.5778443113772455,
"grad_norm": 0.0022715749219059944,
"learning_rate": 7.918350271476064e-06,
"loss": 0.0,
"step": 1544
},
{
"epoch": 0.5785928143712575,
"grad_norm": 7.122563692973927e-05,
"learning_rate": 7.913194498130252e-06,
"loss": 0.0,
"step": 1546
},
{
"epoch": 0.5793413173652695,
"grad_norm": 0.00045244794455356896,
"learning_rate": 7.90803403104733e-06,
"loss": 0.0,
"step": 1548
},
{
"epoch": 0.5800898203592815,
"grad_norm": 0.00015517730207648128,
"learning_rate": 7.90286887854184e-06,
"loss": 0.0,
"step": 1550
},
{
"epoch": 0.5808383233532934,
"grad_norm": 0.0001861519122030586,
"learning_rate": 7.897699048935875e-06,
"loss": 0.0,
"step": 1552
},
{
"epoch": 0.5815868263473054,
"grad_norm": 0.00018396999803371727,
"learning_rate": 7.892524550559056e-06,
"loss": 0.0,
"step": 1554
},
{
"epoch": 0.5823353293413174,
"grad_norm": 0.000191383485798724,
"learning_rate": 7.887345391748533e-06,
"loss": 0.0,
"step": 1556
},
{
"epoch": 0.5830838323353293,
"grad_norm": 0.0018197696190327406,
"learning_rate": 7.882161580848966e-06,
"loss": 0.0,
"step": 1558
},
{
"epoch": 0.5838323353293413,
"grad_norm": 0.0004387347144074738,
"learning_rate": 7.876973126212507e-06,
"loss": 0.0,
"step": 1560
},
{
"epoch": 0.5838323353293413,
"eval_accuracy": 1.0,
"eval_loss": 1.0737475122368778e-06,
"eval_runtime": 162.5348,
"eval_samples_per_second": 30.763,
"eval_steps_per_second": 7.691,
"step": 1560
},
{
"epoch": 0.5845808383233533,
"grad_norm": 0.0022077385801821947,
"learning_rate": 7.87178003619879e-06,
"loss": 0.0,
"step": 1562
},
{
"epoch": 0.5853293413173652,
"grad_norm": 0.0002099570701830089,
"learning_rate": 7.866582319174918e-06,
"loss": 0.0,
"step": 1564
},
{
"epoch": 0.5860778443113772,
"grad_norm": 0.00012316112406551838,
"learning_rate": 7.861379983515449e-06,
"loss": 0.0,
"step": 1566
},
{
"epoch": 0.5868263473053892,
"grad_norm": 0.00024173619749490172,
"learning_rate": 7.856173037602383e-06,
"loss": 0.0,
"step": 1568
},
{
"epoch": 0.5875748502994012,
"grad_norm": 8.62416927702725e-05,
"learning_rate": 7.85096148982515e-06,
"loss": 0.0,
"step": 1570
},
{
"epoch": 0.5883233532934131,
"grad_norm": 0.0014817145420238376,
"learning_rate": 7.845745348580592e-06,
"loss": 0.0,
"step": 1572
},
{
"epoch": 0.5890718562874252,
"grad_norm": 0.0001866584934759885,
"learning_rate": 7.840524622272949e-06,
"loss": 0.0,
"step": 1574
},
{
"epoch": 0.5898203592814372,
"grad_norm": 0.00014589863712899387,
"learning_rate": 7.835299319313854e-06,
"loss": 0.0,
"step": 1576
},
{
"epoch": 0.5905688622754491,
"grad_norm": 0.0001201587583636865,
"learning_rate": 7.830069448122313e-06,
"loss": 0.0,
"step": 1578
},
{
"epoch": 0.5913173652694611,
"grad_norm": 0.0004075410251971334,
"learning_rate": 7.82483501712469e-06,
"loss": 0.0,
"step": 1580
},
{
"epoch": 0.5920658682634731,
"grad_norm": 0.0007497837650589645,
"learning_rate": 7.819596034754696e-06,
"loss": 0.0,
"step": 1582
},
{
"epoch": 0.592814371257485,
"grad_norm": 0.0001937482156790793,
"learning_rate": 7.81435250945338e-06,
"loss": 0.0,
"step": 1584
},
{
"epoch": 0.593562874251497,
"grad_norm": 0.00015605123189743608,
"learning_rate": 7.8091044496691e-06,
"loss": 0.0,
"step": 1586
},
{
"epoch": 0.594311377245509,
"grad_norm": 0.0008134747622534633,
"learning_rate": 7.803851863857533e-06,
"loss": 0.0,
"step": 1588
},
{
"epoch": 0.5950598802395209,
"grad_norm": 0.016990555450320244,
"learning_rate": 7.798594760481639e-06,
"loss": 0.0,
"step": 1590
},
{
"epoch": 0.5958083832335329,
"grad_norm": 8.968533074948937e-05,
"learning_rate": 7.793333148011658e-06,
"loss": 0.0,
"step": 1592
},
{
"epoch": 0.5965568862275449,
"grad_norm": 0.00021849350014235824,
"learning_rate": 7.7880670349251e-06,
"loss": 0.0,
"step": 1594
},
{
"epoch": 0.5973053892215568,
"grad_norm": 0.00010406466026324779,
"learning_rate": 7.782796429706721e-06,
"loss": 0.0,
"step": 1596
},
{
"epoch": 0.5980538922155688,
"grad_norm": 0.0001633252395549789,
"learning_rate": 7.777521340848515e-06,
"loss": 0.0,
"step": 1598
},
{
"epoch": 0.5988023952095808,
"grad_norm": 0.00010362563625676557,
"learning_rate": 7.772241776849705e-06,
"loss": 0.0,
"step": 1600
},
{
"epoch": 0.5988023952095808,
"eval_accuracy": 1.0,
"eval_loss": 8.31741829188104e-07,
"eval_runtime": 160.7648,
"eval_samples_per_second": 31.101,
"eval_steps_per_second": 7.775,
"step": 1600
},
{
"epoch": 0.5995508982035929,
"grad_norm": 0.0001440924679627642,
"learning_rate": 7.76695774621672e-06,
"loss": 0.0,
"step": 1602
},
{
"epoch": 0.6002994011976048,
"grad_norm": 0.00024095825210679322,
"learning_rate": 7.761669257463188e-06,
"loss": 0.0,
"step": 1604
},
{
"epoch": 0.6010479041916168,
"grad_norm": 0.0010255592642351985,
"learning_rate": 7.756376319109917e-06,
"loss": 0.0,
"step": 1606
},
{
"epoch": 0.6017964071856288,
"grad_norm": 0.0003512499970383942,
"learning_rate": 7.751078939684886e-06,
"loss": 0.0,
"step": 1608
},
{
"epoch": 0.6025449101796407,
"grad_norm": 0.0001344636984867975,
"learning_rate": 7.74577712772323e-06,
"loss": 0.0,
"step": 1610
},
{
"epoch": 0.6032934131736527,
"grad_norm": 0.00015981386241037399,
"learning_rate": 7.740470891767225e-06,
"loss": 0.0,
"step": 1612
},
{
"epoch": 0.6040419161676647,
"grad_norm": 0.00013109891733620316,
"learning_rate": 7.735160240366276e-06,
"loss": 0.0,
"step": 1614
},
{
"epoch": 0.6047904191616766,
"grad_norm": 0.00010152284085052088,
"learning_rate": 7.729845182076896e-06,
"loss": 0.0,
"step": 1616
},
{
"epoch": 0.6055389221556886,
"grad_norm": 0.0010211137123405933,
"learning_rate": 7.72452572546271e-06,
"loss": 0.0,
"step": 1618
},
{
"epoch": 0.6062874251497006,
"grad_norm": 0.00015486738993786275,
"learning_rate": 7.71920187909442e-06,
"loss": 0.0,
"step": 1620
},
{
"epoch": 0.6070359281437125,
"grad_norm": 0.00011970350897172466,
"learning_rate": 7.713873651549805e-06,
"loss": 0.0,
"step": 1622
},
{
"epoch": 0.6077844311377245,
"grad_norm": 7.165825081756338e-05,
"learning_rate": 7.7085410514137e-06,
"loss": 0.0,
"step": 1624
},
{
"epoch": 0.6085329341317365,
"grad_norm": 0.00015439889102708548,
"learning_rate": 7.703204087277989e-06,
"loss": 0.0,
"step": 1626
},
{
"epoch": 0.6092814371257484,
"grad_norm": 9.300145757151768e-05,
"learning_rate": 7.697862767741584e-06,
"loss": 0.0,
"step": 1628
},
{
"epoch": 0.6100299401197605,
"grad_norm": 0.00014098809333518147,
"learning_rate": 7.692517101410414e-06,
"loss": 0.0,
"step": 1630
},
{
"epoch": 0.6107784431137725,
"grad_norm": 0.00011235267447773367,
"learning_rate": 7.68716709689742e-06,
"loss": 0.0,
"step": 1632
},
{
"epoch": 0.6115269461077845,
"grad_norm": 0.00014485006977338344,
"learning_rate": 7.681812762822517e-06,
"loss": 0.0,
"step": 1634
},
{
"epoch": 0.6122754491017964,
"grad_norm": 8.417559729423374e-05,
"learning_rate": 7.676454107812608e-06,
"loss": 0.0,
"step": 1636
},
{
"epoch": 0.6130239520958084,
"grad_norm": 0.00010664058936526999,
"learning_rate": 7.671091140501557e-06,
"loss": 0.0,
"step": 1638
},
{
"epoch": 0.6137724550898204,
"grad_norm": 0.00019991857698187232,
"learning_rate": 7.66572386953017e-06,
"loss": 0.0,
"step": 1640
},
{
"epoch": 0.6137724550898204,
"eval_accuracy": 1.0,
"eval_loss": 6.937613648005936e-07,
"eval_runtime": 164.9592,
"eval_samples_per_second": 30.311,
"eval_steps_per_second": 7.578,
"step": 1640
},
{
"epoch": 0.6145209580838323,
"grad_norm": 0.00013954796304460615,
"learning_rate": 7.660352303546192e-06,
"loss": 0.0,
"step": 1642
},
{
"epoch": 0.6152694610778443,
"grad_norm": 0.00022430805256590247,
"learning_rate": 7.654976451204288e-06,
"loss": 0.0,
"step": 1644
},
{
"epoch": 0.6160179640718563,
"grad_norm": 8.612870442448184e-05,
"learning_rate": 7.649596321166024e-06,
"loss": 0.0,
"step": 1646
},
{
"epoch": 0.6167664670658682,
"grad_norm": 0.00018191659182775766,
"learning_rate": 7.644211922099867e-06,
"loss": 0.0,
"step": 1648
},
{
"epoch": 0.6175149700598802,
"grad_norm": 0.00012960025924257934,
"learning_rate": 7.638823262681155e-06,
"loss": 0.0,
"step": 1650
},
{
"epoch": 0.6182634730538922,
"grad_norm": 0.00048115866957232356,
"learning_rate": 7.633430351592093e-06,
"loss": 0.0,
"step": 1652
},
{
"epoch": 0.6190119760479041,
"grad_norm": 0.004453903064131737,
"learning_rate": 7.6280331975217356e-06,
"loss": 0.0,
"step": 1654
},
{
"epoch": 0.6197604790419161,
"grad_norm": 0.00019366122432984412,
"learning_rate": 7.622631809165972e-06,
"loss": 0.0,
"step": 1656
},
{
"epoch": 0.6205089820359282,
"grad_norm": 0.0001328593207290396,
"learning_rate": 7.617226195227518e-06,
"loss": 0.0,
"step": 1658
},
{
"epoch": 0.6212574850299402,
"grad_norm": 0.00019288925977889448,
"learning_rate": 7.611816364415896e-06,
"loss": 0.0,
"step": 1660
},
{
"epoch": 0.6220059880239521,
"grad_norm": 0.00020607073383871466,
"learning_rate": 7.606402325447421e-06,
"loss": 0.0,
"step": 1662
},
{
"epoch": 0.6227544910179641,
"grad_norm": 0.0010563160758465528,
"learning_rate": 7.600984087045187e-06,
"loss": 0.0,
"step": 1664
},
{
"epoch": 0.6235029940119761,
"grad_norm": 7.731228834018111e-05,
"learning_rate": 7.595561657939061e-06,
"loss": 0.0,
"step": 1666
},
{
"epoch": 0.624251497005988,
"grad_norm": 0.00039579844451509416,
"learning_rate": 7.590135046865652e-06,
"loss": 0.0,
"step": 1668
},
{
"epoch": 0.625,
"grad_norm": 0.00012335921928752214,
"learning_rate": 7.584704262568315e-06,
"loss": 0.0,
"step": 1670
},
{
"epoch": 0.625748502994012,
"grad_norm": 8.330697164637968e-05,
"learning_rate": 7.579269313797126e-06,
"loss": 0.0,
"step": 1672
},
{
"epoch": 0.6264970059880239,
"grad_norm": 0.00023956908262334764,
"learning_rate": 7.573830209308872e-06,
"loss": 0.0,
"step": 1674
},
{
"epoch": 0.6272455089820359,
"grad_norm": 0.00013747882621828467,
"learning_rate": 7.568386957867033e-06,
"loss": 0.0,
"step": 1676
},
{
"epoch": 0.6279940119760479,
"grad_norm": 0.00012644138769246638,
"learning_rate": 7.562939568241772e-06,
"loss": 0.0,
"step": 1678
},
{
"epoch": 0.6287425149700598,
"grad_norm": 0.00021938206919003278,
"learning_rate": 7.557488049209921e-06,
"loss": 0.0,
"step": 1680
},
{
"epoch": 0.6287425149700598,
"eval_accuracy": 1.0,
"eval_loss": 6.026603500686178e-07,
"eval_runtime": 164.079,
"eval_samples_per_second": 30.473,
"eval_steps_per_second": 7.618,
"step": 1680
}
],
"logging_steps": 2,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 40,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5997706414881505e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}