OmniRewardModel2 / trainer_state.json
jinzhuoran's picture
OmniRewardModel checkpoint upload
db397c0 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9997052321296978,
"eval_steps": 100,
"global_step": 7632,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013100794235650537,
"grad_norm": 65.25566997593643,
"learning_rate": 2.6178010471204188e-08,
"loss": 0.8756,
"step": 5
},
{
"epoch": 0.0026201588471301074,
"grad_norm": 62.55868359719172,
"learning_rate": 5.2356020942408376e-08,
"loss": 0.8355,
"step": 10
},
{
"epoch": 0.003930238270695161,
"grad_norm": 72.36291944323244,
"learning_rate": 7.853403141361257e-08,
"loss": 0.9024,
"step": 15
},
{
"epoch": 0.005240317694260215,
"grad_norm": 78.2927923163889,
"learning_rate": 1.0471204188481675e-07,
"loss": 0.9112,
"step": 20
},
{
"epoch": 0.006550397117825268,
"grad_norm": 73.20997266140758,
"learning_rate": 1.3089005235602092e-07,
"loss": 0.9038,
"step": 25
},
{
"epoch": 0.007860476541390321,
"grad_norm": 79.5698279926454,
"learning_rate": 1.5706806282722514e-07,
"loss": 0.7385,
"step": 30
},
{
"epoch": 0.009170555964955375,
"grad_norm": 62.25008705240314,
"learning_rate": 1.8324607329842932e-07,
"loss": 0.8842,
"step": 35
},
{
"epoch": 0.01048063538852043,
"grad_norm": 71.93212371078113,
"learning_rate": 2.094240837696335e-07,
"loss": 0.8456,
"step": 40
},
{
"epoch": 0.011790714812085483,
"grad_norm": 76.81854898396818,
"learning_rate": 2.356020942408377e-07,
"loss": 0.8637,
"step": 45
},
{
"epoch": 0.013100794235650536,
"grad_norm": 72.77223708616008,
"learning_rate": 2.6178010471204185e-07,
"loss": 0.8457,
"step": 50
},
{
"epoch": 0.01441087365921559,
"grad_norm": 82.53172515882576,
"learning_rate": 2.879581151832461e-07,
"loss": 0.809,
"step": 55
},
{
"epoch": 0.015720953082780643,
"grad_norm": 63.36114555923586,
"learning_rate": 3.1413612565445027e-07,
"loss": 0.7974,
"step": 60
},
{
"epoch": 0.017031032506345696,
"grad_norm": 67.13005941209079,
"learning_rate": 3.4031413612565446e-07,
"loss": 0.7941,
"step": 65
},
{
"epoch": 0.01834111192991075,
"grad_norm": 62.35455050500593,
"learning_rate": 3.6649214659685864e-07,
"loss": 0.8121,
"step": 70
},
{
"epoch": 0.019651191353475806,
"grad_norm": 61.13921963104784,
"learning_rate": 3.926701570680628e-07,
"loss": 0.6837,
"step": 75
},
{
"epoch": 0.02096127077704086,
"grad_norm": 60.734863824940575,
"learning_rate": 4.18848167539267e-07,
"loss": 0.6569,
"step": 80
},
{
"epoch": 0.022271350200605913,
"grad_norm": 68.43319499092718,
"learning_rate": 4.450261780104712e-07,
"loss": 0.7585,
"step": 85
},
{
"epoch": 0.023581429624170966,
"grad_norm": 52.09969303256668,
"learning_rate": 4.712041884816754e-07,
"loss": 0.6591,
"step": 90
},
{
"epoch": 0.02489150904773602,
"grad_norm": 60.01362166751946,
"learning_rate": 4.973821989528796e-07,
"loss": 0.625,
"step": 95
},
{
"epoch": 0.026201588471301072,
"grad_norm": 58.796731028708756,
"learning_rate": 5.235602094240837e-07,
"loss": 0.687,
"step": 100
},
{
"epoch": 0.026201588471301072,
"eval_accuracy": 0.5504,
"eval_loss": 0.7788666486740112,
"eval_runtime": 138.8257,
"eval_samples_per_second": 9.004,
"eval_steps_per_second": 2.255,
"step": 100
},
{
"epoch": 0.027511667894866126,
"grad_norm": 45.6814258154,
"learning_rate": 5.497382198952879e-07,
"loss": 0.6697,
"step": 105
},
{
"epoch": 0.02882174731843118,
"grad_norm": 55.768935605768114,
"learning_rate": 5.759162303664922e-07,
"loss": 0.6187,
"step": 110
},
{
"epoch": 0.030131826741996232,
"grad_norm": 44.123709497680935,
"learning_rate": 6.020942408376963e-07,
"loss": 0.59,
"step": 115
},
{
"epoch": 0.031441906165561286,
"grad_norm": 40.077067899984414,
"learning_rate": 6.282722513089005e-07,
"loss": 0.6216,
"step": 120
},
{
"epoch": 0.03275198558912634,
"grad_norm": 39.216463798971496,
"learning_rate": 6.544502617801047e-07,
"loss": 0.6603,
"step": 125
},
{
"epoch": 0.03406206501269139,
"grad_norm": 34.95260150433843,
"learning_rate": 6.806282722513089e-07,
"loss": 0.5664,
"step": 130
},
{
"epoch": 0.03537214443625645,
"grad_norm": 37.834491450835614,
"learning_rate": 7.06806282722513e-07,
"loss": 0.5947,
"step": 135
},
{
"epoch": 0.0366822238598215,
"grad_norm": 32.63280938760141,
"learning_rate": 7.329842931937173e-07,
"loss": 0.6015,
"step": 140
},
{
"epoch": 0.037992303283386555,
"grad_norm": 37.4913705926391,
"learning_rate": 7.591623036649214e-07,
"loss": 0.5922,
"step": 145
},
{
"epoch": 0.03930238270695161,
"grad_norm": 26.355681906302205,
"learning_rate": 7.853403141361256e-07,
"loss": 0.5681,
"step": 150
},
{
"epoch": 0.04061246213051666,
"grad_norm": 31.880949398269596,
"learning_rate": 8.115183246073298e-07,
"loss": 0.5664,
"step": 155
},
{
"epoch": 0.04192254155408172,
"grad_norm": 20.65782299342999,
"learning_rate": 8.37696335078534e-07,
"loss": 0.5211,
"step": 160
},
{
"epoch": 0.04323262097764677,
"grad_norm": 22.258499509814012,
"learning_rate": 8.638743455497382e-07,
"loss": 0.5219,
"step": 165
},
{
"epoch": 0.044542700401211825,
"grad_norm": 21.846834289704923,
"learning_rate": 8.900523560209424e-07,
"loss": 0.5059,
"step": 170
},
{
"epoch": 0.045852779824776875,
"grad_norm": 23.18302368260191,
"learning_rate": 9.162303664921466e-07,
"loss": 0.518,
"step": 175
},
{
"epoch": 0.04716285924834193,
"grad_norm": 19.652604647477776,
"learning_rate": 9.424083769633508e-07,
"loss": 0.5271,
"step": 180
},
{
"epoch": 0.04847293867190698,
"grad_norm": 17.965969880689265,
"learning_rate": 9.68586387434555e-07,
"loss": 0.5313,
"step": 185
},
{
"epoch": 0.04978301809547204,
"grad_norm": 17.265465595838617,
"learning_rate": 9.947643979057591e-07,
"loss": 0.4635,
"step": 190
},
{
"epoch": 0.051093097519037095,
"grad_norm": 12.731564683260206,
"learning_rate": 1.0209424083769633e-06,
"loss": 0.4914,
"step": 195
},
{
"epoch": 0.052403176942602145,
"grad_norm": 14.240010539299435,
"learning_rate": 1.0471204188481674e-06,
"loss": 0.5163,
"step": 200
},
{
"epoch": 0.052403176942602145,
"eval_accuracy": 0.4488,
"eval_loss": 0.773366391658783,
"eval_runtime": 139.5762,
"eval_samples_per_second": 8.956,
"eval_steps_per_second": 2.243,
"step": 200
},
{
"epoch": 0.0537132563661672,
"grad_norm": 19.63370884990648,
"learning_rate": 1.0732984293193717e-06,
"loss": 0.5581,
"step": 205
},
{
"epoch": 0.05502333578973225,
"grad_norm": 10.618120041300497,
"learning_rate": 1.0994764397905759e-06,
"loss": 0.444,
"step": 210
},
{
"epoch": 0.05633341521329731,
"grad_norm": 17.44393406285321,
"learning_rate": 1.12565445026178e-06,
"loss": 0.5224,
"step": 215
},
{
"epoch": 0.05764349463686236,
"grad_norm": 12.189359941143175,
"learning_rate": 1.1518324607329843e-06,
"loss": 0.4787,
"step": 220
},
{
"epoch": 0.058953574060427415,
"grad_norm": 12.078094446605022,
"learning_rate": 1.1780104712041885e-06,
"loss": 0.5046,
"step": 225
},
{
"epoch": 0.060263653483992465,
"grad_norm": 12.745981182030365,
"learning_rate": 1.2041884816753926e-06,
"loss": 0.5234,
"step": 230
},
{
"epoch": 0.06157373290755752,
"grad_norm": 9.685521970587265,
"learning_rate": 1.2303664921465967e-06,
"loss": 0.4313,
"step": 235
},
{
"epoch": 0.06288381233112257,
"grad_norm": 14.736633585803254,
"learning_rate": 1.256544502617801e-06,
"loss": 0.4547,
"step": 240
},
{
"epoch": 0.06419389175468763,
"grad_norm": 8.794949376696293,
"learning_rate": 1.2827225130890052e-06,
"loss": 0.4405,
"step": 245
},
{
"epoch": 0.06550397117825268,
"grad_norm": 11.249829085031893,
"learning_rate": 1.3089005235602093e-06,
"loss": 0.4307,
"step": 250
},
{
"epoch": 0.06681405060181773,
"grad_norm": 13.390941109258982,
"learning_rate": 1.3350785340314135e-06,
"loss": 0.4998,
"step": 255
},
{
"epoch": 0.06812413002538278,
"grad_norm": 9.534912328167007,
"learning_rate": 1.3612565445026178e-06,
"loss": 0.472,
"step": 260
},
{
"epoch": 0.06943420944894785,
"grad_norm": 8.397391075769068,
"learning_rate": 1.387434554973822e-06,
"loss": 0.4433,
"step": 265
},
{
"epoch": 0.0707442888725129,
"grad_norm": 10.746633363582351,
"learning_rate": 1.413612565445026e-06,
"loss": 0.4567,
"step": 270
},
{
"epoch": 0.07205436829607795,
"grad_norm": 12.05720859118597,
"learning_rate": 1.4397905759162302e-06,
"loss": 0.4549,
"step": 275
},
{
"epoch": 0.073364447719643,
"grad_norm": 10.006378930278041,
"learning_rate": 1.4659685863874346e-06,
"loss": 0.4329,
"step": 280
},
{
"epoch": 0.07467452714320806,
"grad_norm": 9.597335107124772,
"learning_rate": 1.4921465968586387e-06,
"loss": 0.4084,
"step": 285
},
{
"epoch": 0.07598460656677311,
"grad_norm": 11.481222700352367,
"learning_rate": 1.5183246073298428e-06,
"loss": 0.4149,
"step": 290
},
{
"epoch": 0.07729468599033816,
"grad_norm": 19.43866947694484,
"learning_rate": 1.544502617801047e-06,
"loss": 0.4507,
"step": 295
},
{
"epoch": 0.07860476541390322,
"grad_norm": 12.044720730463665,
"learning_rate": 1.5706806282722513e-06,
"loss": 0.4566,
"step": 300
},
{
"epoch": 0.07860476541390322,
"eval_accuracy": 0.5296,
"eval_loss": 0.8311891555786133,
"eval_runtime": 139.1725,
"eval_samples_per_second": 8.982,
"eval_steps_per_second": 2.249,
"step": 300
},
{
"epoch": 0.07991484483746827,
"grad_norm": 11.917516090251366,
"learning_rate": 1.5968586387434554e-06,
"loss": 0.4058,
"step": 305
},
{
"epoch": 0.08122492426103332,
"grad_norm": 12.158737920925903,
"learning_rate": 1.6230366492146596e-06,
"loss": 0.3985,
"step": 310
},
{
"epoch": 0.08253500368459837,
"grad_norm": 11.207695168487689,
"learning_rate": 1.649214659685864e-06,
"loss": 0.3668,
"step": 315
},
{
"epoch": 0.08384508310816344,
"grad_norm": 14.299364939902476,
"learning_rate": 1.675392670157068e-06,
"loss": 0.4118,
"step": 320
},
{
"epoch": 0.08515516253172849,
"grad_norm": 8.796874162358842,
"learning_rate": 1.7015706806282722e-06,
"loss": 0.417,
"step": 325
},
{
"epoch": 0.08646524195529354,
"grad_norm": 6.423994778367414,
"learning_rate": 1.7277486910994763e-06,
"loss": 0.3693,
"step": 330
},
{
"epoch": 0.08777532137885859,
"grad_norm": 14.257916893772826,
"learning_rate": 1.7539267015706804e-06,
"loss": 0.4209,
"step": 335
},
{
"epoch": 0.08908540080242365,
"grad_norm": 10.247378725750938,
"learning_rate": 1.7801047120418848e-06,
"loss": 0.4086,
"step": 340
},
{
"epoch": 0.0903954802259887,
"grad_norm": 13.028352515928068,
"learning_rate": 1.806282722513089e-06,
"loss": 0.4366,
"step": 345
},
{
"epoch": 0.09170555964955375,
"grad_norm": 8.037401497631812,
"learning_rate": 1.8324607329842933e-06,
"loss": 0.3272,
"step": 350
},
{
"epoch": 0.09301563907311881,
"grad_norm": 6.9081571017698655,
"learning_rate": 1.8586387434554974e-06,
"loss": 0.3677,
"step": 355
},
{
"epoch": 0.09432571849668386,
"grad_norm": 8.726274241413915,
"learning_rate": 1.8848167539267015e-06,
"loss": 0.3582,
"step": 360
},
{
"epoch": 0.09563579792024891,
"grad_norm": 8.438818851906133,
"learning_rate": 1.9109947643979056e-06,
"loss": 0.3923,
"step": 365
},
{
"epoch": 0.09694587734381396,
"grad_norm": 9.975870802252798,
"learning_rate": 1.93717277486911e-06,
"loss": 0.3755,
"step": 370
},
{
"epoch": 0.09825595676737903,
"grad_norm": 11.07336790257551,
"learning_rate": 1.963350785340314e-06,
"loss": 0.3931,
"step": 375
},
{
"epoch": 0.09956603619094408,
"grad_norm": 6.87508978730872,
"learning_rate": 1.9895287958115183e-06,
"loss": 0.3723,
"step": 380
},
{
"epoch": 0.10087611561450913,
"grad_norm": 7.299612626576022,
"learning_rate": 1.999999155039932e-06,
"loss": 0.3936,
"step": 385
},
{
"epoch": 0.10218619503807419,
"grad_norm": 8.19359622835764,
"learning_rate": 1.999993991400246e-06,
"loss": 0.3347,
"step": 390
},
{
"epoch": 0.10349627446163924,
"grad_norm": 7.35341462576354,
"learning_rate": 1.9999841335673434e-06,
"loss": 0.3843,
"step": 395
},
{
"epoch": 0.10480635388520429,
"grad_norm": 6.711354211930067,
"learning_rate": 1.999969581587499e-06,
"loss": 0.3568,
"step": 400
},
{
"epoch": 0.10480635388520429,
"eval_accuracy": 0.6608,
"eval_loss": 0.739296019077301,
"eval_runtime": 142.7751,
"eval_samples_per_second": 8.755,
"eval_steps_per_second": 2.192,
"step": 400
},
{
"epoch": 0.10611643330876934,
"grad_norm": 10.024329644547777,
"learning_rate": 1.999950335529023e-06,
"loss": 0.377,
"step": 405
},
{
"epoch": 0.1074265127323344,
"grad_norm": 9.78542639003509,
"learning_rate": 1.999926395482261e-06,
"loss": 0.3109,
"step": 410
},
{
"epoch": 0.10873659215589945,
"grad_norm": 5.4448738851908445,
"learning_rate": 1.999897761559593e-06,
"loss": 0.3238,
"step": 415
},
{
"epoch": 0.1100466715794645,
"grad_norm": 5.403174975957244,
"learning_rate": 1.999864433895432e-06,
"loss": 0.317,
"step": 420
},
{
"epoch": 0.11135675100302955,
"grad_norm": 14.489233866558225,
"learning_rate": 1.9998264126462264e-06,
"loss": 0.3485,
"step": 425
},
{
"epoch": 0.11266683042659462,
"grad_norm": 7.983108029464349,
"learning_rate": 1.999783697990456e-06,
"loss": 0.3745,
"step": 430
},
{
"epoch": 0.11397690985015967,
"grad_norm": 6.210263065691716,
"learning_rate": 1.9997362901286328e-06,
"loss": 0.3134,
"step": 435
},
{
"epoch": 0.11528698927372472,
"grad_norm": 5.7095639740154445,
"learning_rate": 1.9996841892832997e-06,
"loss": 0.2956,
"step": 440
},
{
"epoch": 0.11659706869728978,
"grad_norm": 5.926962115905033,
"learning_rate": 1.9996273956990303e-06,
"loss": 0.3558,
"step": 445
},
{
"epoch": 0.11790714812085483,
"grad_norm": 7.303119154977969,
"learning_rate": 1.999565909642425e-06,
"loss": 0.3417,
"step": 450
},
{
"epoch": 0.11921722754441988,
"grad_norm": 10.835920993927127,
"learning_rate": 1.9994997314021146e-06,
"loss": 0.4127,
"step": 455
},
{
"epoch": 0.12052730696798493,
"grad_norm": 7.096230008529465,
"learning_rate": 1.999428861288753e-06,
"loss": 0.3652,
"step": 460
},
{
"epoch": 0.12183738639154999,
"grad_norm": 7.856938952483924,
"learning_rate": 1.999353299635021e-06,
"loss": 0.3319,
"step": 465
},
{
"epoch": 0.12314746581511504,
"grad_norm": 5.163875987837063,
"learning_rate": 1.9992730467956218e-06,
"loss": 0.3274,
"step": 470
},
{
"epoch": 0.12445754523868009,
"grad_norm": 5.096460744829759,
"learning_rate": 1.9991881031472787e-06,
"loss": 0.3369,
"step": 475
},
{
"epoch": 0.12576762466224514,
"grad_norm": 4.898833760747336,
"learning_rate": 1.9990984690887376e-06,
"loss": 0.3342,
"step": 480
},
{
"epoch": 0.1270777040858102,
"grad_norm": 8.546538011577958,
"learning_rate": 1.99900414504076e-06,
"loss": 0.3675,
"step": 485
},
{
"epoch": 0.12838778350937527,
"grad_norm": 9.545619233480027,
"learning_rate": 1.998905131446124e-06,
"loss": 0.2993,
"step": 490
},
{
"epoch": 0.1296978629329403,
"grad_norm": 6.0468014618988,
"learning_rate": 1.998801428769621e-06,
"loss": 0.3337,
"step": 495
},
{
"epoch": 0.13100794235650537,
"grad_norm": 6.525761511275149,
"learning_rate": 1.998693037498054e-06,
"loss": 0.3504,
"step": 500
},
{
"epoch": 0.13100794235650537,
"eval_accuracy": 0.6496,
"eval_loss": 0.7146463394165039,
"eval_runtime": 141.9904,
"eval_samples_per_second": 8.803,
"eval_steps_per_second": 2.204,
"step": 500
},
{
"epoch": 0.1323180217800704,
"grad_norm": 6.303773120304537,
"learning_rate": 1.9985799581402366e-06,
"loss": 0.3254,
"step": 505
},
{
"epoch": 0.13362810120363547,
"grad_norm": 7.995893571860963,
"learning_rate": 1.998462191226988e-06,
"loss": 0.3392,
"step": 510
},
{
"epoch": 0.13493818062720053,
"grad_norm": 9.171359972165128,
"learning_rate": 1.9983397373111318e-06,
"loss": 0.3223,
"step": 515
},
{
"epoch": 0.13624826005076557,
"grad_norm": 5.152252573879665,
"learning_rate": 1.9982125969674943e-06,
"loss": 0.3214,
"step": 520
},
{
"epoch": 0.13755833947433063,
"grad_norm": 6.810478020843169,
"learning_rate": 1.9980807707929e-06,
"loss": 0.3643,
"step": 525
},
{
"epoch": 0.1388684188978957,
"grad_norm": 6.555447149076346,
"learning_rate": 1.99794425940617e-06,
"loss": 0.3173,
"step": 530
},
{
"epoch": 0.14017849832146073,
"grad_norm": 7.460824741390232,
"learning_rate": 1.99780306344812e-06,
"loss": 0.362,
"step": 535
},
{
"epoch": 0.1414885777450258,
"grad_norm": 4.465161138093627,
"learning_rate": 1.997657183581554e-06,
"loss": 0.2876,
"step": 540
},
{
"epoch": 0.14279865716859086,
"grad_norm": 6.492929729490839,
"learning_rate": 1.997506620491265e-06,
"loss": 0.3412,
"step": 545
},
{
"epoch": 0.1441087365921559,
"grad_norm": 6.323199580280994,
"learning_rate": 1.9973513748840294e-06,
"loss": 0.2913,
"step": 550
},
{
"epoch": 0.14541881601572096,
"grad_norm": 6.860707779728365,
"learning_rate": 1.997191447488604e-06,
"loss": 0.2841,
"step": 555
},
{
"epoch": 0.146728895439286,
"grad_norm": 7.428003430414849,
"learning_rate": 1.9970268390557235e-06,
"loss": 0.3296,
"step": 560
},
{
"epoch": 0.14803897486285106,
"grad_norm": 9.809910396455075,
"learning_rate": 1.996857550358097e-06,
"loss": 0.3316,
"step": 565
},
{
"epoch": 0.14934905428641612,
"grad_norm": 6.120283708913698,
"learning_rate": 1.9966835821904022e-06,
"loss": 0.3227,
"step": 570
},
{
"epoch": 0.15065913370998116,
"grad_norm": 5.518190183534834,
"learning_rate": 1.9965049353692853e-06,
"loss": 0.3271,
"step": 575
},
{
"epoch": 0.15196921313354622,
"grad_norm": 4.956536091880624,
"learning_rate": 1.996321610733353e-06,
"loss": 0.3677,
"step": 580
},
{
"epoch": 0.15327929255711129,
"grad_norm": 6.782512343216961,
"learning_rate": 1.9961336091431724e-06,
"loss": 0.3538,
"step": 585
},
{
"epoch": 0.15458937198067632,
"grad_norm": 4.957135478968806,
"learning_rate": 1.995940931481264e-06,
"loss": 0.3716,
"step": 590
},
{
"epoch": 0.15589945140424138,
"grad_norm": 5.89621538406006,
"learning_rate": 1.9957435786521003e-06,
"loss": 0.3211,
"step": 595
},
{
"epoch": 0.15720953082780645,
"grad_norm": 3.4716470492850133,
"learning_rate": 1.9955415515820982e-06,
"loss": 0.3335,
"step": 600
},
{
"epoch": 0.15720953082780645,
"eval_accuracy": 0.74,
"eval_loss": 0.6648128628730774,
"eval_runtime": 143.2818,
"eval_samples_per_second": 8.724,
"eval_steps_per_second": 2.185,
"step": 600
},
{
"epoch": 0.15851961025137148,
"grad_norm": 4.420746512696032,
"learning_rate": 1.9953348512196184e-06,
"loss": 0.3074,
"step": 605
},
{
"epoch": 0.15982968967493655,
"grad_norm": 5.278777879896076,
"learning_rate": 1.9951234785349572e-06,
"loss": 0.3338,
"step": 610
},
{
"epoch": 0.16113976909850158,
"grad_norm": 7.753730093542852,
"learning_rate": 1.9949074345203457e-06,
"loss": 0.3409,
"step": 615
},
{
"epoch": 0.16244984852206665,
"grad_norm": 4.666850591356625,
"learning_rate": 1.9946867201899415e-06,
"loss": 0.3368,
"step": 620
},
{
"epoch": 0.1637599279456317,
"grad_norm": 3.574785990989966,
"learning_rate": 1.994461336579827e-06,
"loss": 0.2872,
"step": 625
},
{
"epoch": 0.16507000736919675,
"grad_norm": 6.370783353900673,
"learning_rate": 1.9942312847480032e-06,
"loss": 0.3223,
"step": 630
},
{
"epoch": 0.1663800867927618,
"grad_norm": 6.091101766679421,
"learning_rate": 1.993996565774384e-06,
"loss": 0.3247,
"step": 635
},
{
"epoch": 0.16769016621632687,
"grad_norm": 4.963319059049603,
"learning_rate": 1.9937571807607914e-06,
"loss": 0.3035,
"step": 640
},
{
"epoch": 0.1690002456398919,
"grad_norm": 5.763955927215438,
"learning_rate": 1.993513130830953e-06,
"loss": 0.3207,
"step": 645
},
{
"epoch": 0.17031032506345697,
"grad_norm": 5.873434630553368,
"learning_rate": 1.9932644171304922e-06,
"loss": 0.2886,
"step": 650
},
{
"epoch": 0.17162040448702204,
"grad_norm": 6.2646805543143165,
"learning_rate": 1.9930110408269265e-06,
"loss": 0.2844,
"step": 655
},
{
"epoch": 0.17293048391058707,
"grad_norm": 6.593113180342127,
"learning_rate": 1.992753003109661e-06,
"loss": 0.3156,
"step": 660
},
{
"epoch": 0.17424056333415214,
"grad_norm": 7.157645021880429,
"learning_rate": 1.9924903051899805e-06,
"loss": 0.2825,
"step": 665
},
{
"epoch": 0.17555064275771717,
"grad_norm": 8.060379496602742,
"learning_rate": 1.9922229483010486e-06,
"loss": 0.2938,
"step": 670
},
{
"epoch": 0.17686072218128224,
"grad_norm": 3.9961852237294413,
"learning_rate": 1.9919509336978966e-06,
"loss": 0.3503,
"step": 675
},
{
"epoch": 0.1781708016048473,
"grad_norm": 4.475800758356923,
"learning_rate": 1.9916742626574224e-06,
"loss": 0.3459,
"step": 680
},
{
"epoch": 0.17948088102841234,
"grad_norm": 4.593693404230203,
"learning_rate": 1.9913929364783804e-06,
"loss": 0.33,
"step": 685
},
{
"epoch": 0.1807909604519774,
"grad_norm": 5.413425332374862,
"learning_rate": 1.9911069564813783e-06,
"loss": 0.3051,
"step": 690
},
{
"epoch": 0.18210103987554246,
"grad_norm": 8.008275090324018,
"learning_rate": 1.9908163240088693e-06,
"loss": 0.3699,
"step": 695
},
{
"epoch": 0.1834111192991075,
"grad_norm": 5.528157261942457,
"learning_rate": 1.9905210404251465e-06,
"loss": 0.2891,
"step": 700
},
{
"epoch": 0.1834111192991075,
"eval_accuracy": 0.7104,
"eval_loss": 0.6656551957130432,
"eval_runtime": 142.4624,
"eval_samples_per_second": 8.774,
"eval_steps_per_second": 2.197,
"step": 700
},
{
"epoch": 0.18472119872267256,
"grad_norm": 3.6403780225909403,
"learning_rate": 1.9902211071163366e-06,
"loss": 0.287,
"step": 705
},
{
"epoch": 0.18603127814623763,
"grad_norm": 6.998780707143166,
"learning_rate": 1.989916525490393e-06,
"loss": 0.2794,
"step": 710
},
{
"epoch": 0.18734135756980266,
"grad_norm": 4.8809190411029055,
"learning_rate": 1.989607296977089e-06,
"loss": 0.3102,
"step": 715
},
{
"epoch": 0.18865143699336773,
"grad_norm": 4.433896161785547,
"learning_rate": 1.989293423028012e-06,
"loss": 0.3142,
"step": 720
},
{
"epoch": 0.18996151641693276,
"grad_norm": 4.968740406829342,
"learning_rate": 1.988974905116556e-06,
"loss": 0.2907,
"step": 725
},
{
"epoch": 0.19127159584049783,
"grad_norm": 5.0219581203222505,
"learning_rate": 1.988651744737914e-06,
"loss": 0.3004,
"step": 730
},
{
"epoch": 0.1925816752640629,
"grad_norm": 5.9858192432913215,
"learning_rate": 1.9883239434090727e-06,
"loss": 0.3099,
"step": 735
},
{
"epoch": 0.19389175468762793,
"grad_norm": 6.069766314309123,
"learning_rate": 1.9879915026688042e-06,
"loss": 0.3456,
"step": 740
},
{
"epoch": 0.195201834111193,
"grad_norm": 3.8932647987051365,
"learning_rate": 1.9876544240776593e-06,
"loss": 0.2827,
"step": 745
},
{
"epoch": 0.19651191353475805,
"grad_norm": 4.970870187009217,
"learning_rate": 1.987312709217959e-06,
"loss": 0.2862,
"step": 750
},
{
"epoch": 0.1978219929583231,
"grad_norm": 7.016260113036865,
"learning_rate": 1.9869663596937884e-06,
"loss": 0.2776,
"step": 755
},
{
"epoch": 0.19913207238188815,
"grad_norm": 9.66509560172156,
"learning_rate": 1.986615377130989e-06,
"loss": 0.2772,
"step": 760
},
{
"epoch": 0.20044215180545322,
"grad_norm": 5.77891746002653,
"learning_rate": 1.9862597631771508e-06,
"loss": 0.353,
"step": 765
},
{
"epoch": 0.20175223122901825,
"grad_norm": 6.520911408063365,
"learning_rate": 1.9858995195016044e-06,
"loss": 0.3101,
"step": 770
},
{
"epoch": 0.20306231065258332,
"grad_norm": 3.484594315471376,
"learning_rate": 1.9855346477954142e-06,
"loss": 0.2896,
"step": 775
},
{
"epoch": 0.20437239007614838,
"grad_norm": 4.684870944655888,
"learning_rate": 1.9851651497713672e-06,
"loss": 0.2596,
"step": 780
},
{
"epoch": 0.20568246949971342,
"grad_norm": 4.808149471025438,
"learning_rate": 1.9847910271639697e-06,
"loss": 0.3015,
"step": 785
},
{
"epoch": 0.20699254892327848,
"grad_norm": 4.156391998918141,
"learning_rate": 1.984412281729436e-06,
"loss": 0.2871,
"step": 790
},
{
"epoch": 0.20830262834684352,
"grad_norm": 8.382532262606357,
"learning_rate": 1.9840289152456814e-06,
"loss": 0.375,
"step": 795
},
{
"epoch": 0.20961270777040858,
"grad_norm": 4.571411739513223,
"learning_rate": 1.9836409295123127e-06,
"loss": 0.3006,
"step": 800
},
{
"epoch": 0.20961270777040858,
"eval_accuracy": 0.6704,
"eval_loss": 0.7644935250282288,
"eval_runtime": 138.0522,
"eval_samples_per_second": 9.055,
"eval_steps_per_second": 2.267,
"step": 800
},
{
"epoch": 0.21092278719397364,
"grad_norm": 2.825880146573424,
"learning_rate": 1.983248326350621e-06,
"loss": 0.2792,
"step": 805
},
{
"epoch": 0.21223286661753868,
"grad_norm": 4.072431366106783,
"learning_rate": 1.982851107603572e-06,
"loss": 0.256,
"step": 810
},
{
"epoch": 0.21354294604110374,
"grad_norm": 8.315895632884684,
"learning_rate": 1.982449275135799e-06,
"loss": 0.288,
"step": 815
},
{
"epoch": 0.2148530254646688,
"grad_norm": 6.617341511261604,
"learning_rate": 1.982042830833592e-06,
"loss": 0.2574,
"step": 820
},
{
"epoch": 0.21616310488823384,
"grad_norm": 7.4121422793295055,
"learning_rate": 1.981631776604892e-06,
"loss": 0.3751,
"step": 825
},
{
"epoch": 0.2174731843117989,
"grad_norm": 5.462630509848653,
"learning_rate": 1.9812161143792764e-06,
"loss": 0.3347,
"step": 830
},
{
"epoch": 0.21878326373536397,
"grad_norm": 4.397947143261118,
"learning_rate": 1.9807958461079574e-06,
"loss": 0.318,
"step": 835
},
{
"epoch": 0.220093343158929,
"grad_norm": 3.733736827246703,
"learning_rate": 1.980370973763767e-06,
"loss": 0.2653,
"step": 840
},
{
"epoch": 0.22140342258249407,
"grad_norm": 4.754151304483215,
"learning_rate": 1.9799414993411495e-06,
"loss": 0.2822,
"step": 845
},
{
"epoch": 0.2227135020060591,
"grad_norm": 5.574460765166766,
"learning_rate": 1.979507424856153e-06,
"loss": 0.336,
"step": 850
},
{
"epoch": 0.22402358142962417,
"grad_norm": 4.505363713646319,
"learning_rate": 1.97906875234642e-06,
"loss": 0.2928,
"step": 855
},
{
"epoch": 0.22533366085318923,
"grad_norm": 6.222786448829383,
"learning_rate": 1.9786254838711757e-06,
"loss": 0.2989,
"step": 860
},
{
"epoch": 0.22664374027675427,
"grad_norm": 6.1340912560899525,
"learning_rate": 1.9781776215112204e-06,
"loss": 0.2904,
"step": 865
},
{
"epoch": 0.22795381970031933,
"grad_norm": 4.747377998160214,
"learning_rate": 1.9777251673689198e-06,
"loss": 0.2786,
"step": 870
},
{
"epoch": 0.2292638991238844,
"grad_norm": 7.702780817923823,
"learning_rate": 1.9772681235681933e-06,
"loss": 0.3207,
"step": 875
},
{
"epoch": 0.23057397854744943,
"grad_norm": 3.7484591701091077,
"learning_rate": 1.976806492254506e-06,
"loss": 0.272,
"step": 880
},
{
"epoch": 0.2318840579710145,
"grad_norm": 6.027425431373605,
"learning_rate": 1.9763402755948574e-06,
"loss": 0.2878,
"step": 885
},
{
"epoch": 0.23319413739457956,
"grad_norm": 4.420243905346968,
"learning_rate": 1.975869475777772e-06,
"loss": 0.3112,
"step": 890
},
{
"epoch": 0.2345042168181446,
"grad_norm": 3.2086505540675474,
"learning_rate": 1.9753940950132874e-06,
"loss": 0.3328,
"step": 895
},
{
"epoch": 0.23581429624170966,
"grad_norm": 4.92351892053512,
"learning_rate": 1.9749141355329473e-06,
"loss": 0.3039,
"step": 900
},
{
"epoch": 0.23581429624170966,
"eval_accuracy": 0.7176,
"eval_loss": 0.6235886812210083,
"eval_runtime": 134.5821,
"eval_samples_per_second": 9.288,
"eval_steps_per_second": 2.326,
"step": 900
},
{
"epoch": 0.2371243756652747,
"grad_norm": 4.595098138514267,
"learning_rate": 1.9744295995897874e-06,
"loss": 0.3384,
"step": 905
},
{
"epoch": 0.23843445508883976,
"grad_norm": 5.875594511033142,
"learning_rate": 1.9739404894583262e-06,
"loss": 0.2493,
"step": 910
},
{
"epoch": 0.23974453451240482,
"grad_norm": 4.943964557160407,
"learning_rate": 1.9734468074345555e-06,
"loss": 0.3264,
"step": 915
},
{
"epoch": 0.24105461393596986,
"grad_norm": 3.4784751132842167,
"learning_rate": 1.9729485558359286e-06,
"loss": 0.2736,
"step": 920
},
{
"epoch": 0.24236469335953492,
"grad_norm": 5.689006473597506,
"learning_rate": 1.9724457370013474e-06,
"loss": 0.2991,
"step": 925
},
{
"epoch": 0.24367477278309999,
"grad_norm": 4.95697152099517,
"learning_rate": 1.971938353291156e-06,
"loss": 0.3023,
"step": 930
},
{
"epoch": 0.24498485220666502,
"grad_norm": 3.698150841899642,
"learning_rate": 1.9714264070871254e-06,
"loss": 0.3104,
"step": 935
},
{
"epoch": 0.24629493163023009,
"grad_norm": 3.485873023817148,
"learning_rate": 1.970909900792444e-06,
"loss": 0.296,
"step": 940
},
{
"epoch": 0.24760501105379515,
"grad_norm": 4.428297690192133,
"learning_rate": 1.9703888368317084e-06,
"loss": 0.349,
"step": 945
},
{
"epoch": 0.24891509047736018,
"grad_norm": 3.1990684592166447,
"learning_rate": 1.969863217650906e-06,
"loss": 0.2494,
"step": 950
},
{
"epoch": 0.25022516990092525,
"grad_norm": 5.662300170468204,
"learning_rate": 1.9693330457174113e-06,
"loss": 0.3193,
"step": 955
},
{
"epoch": 0.2515352493244903,
"grad_norm": 3.389632162628184,
"learning_rate": 1.968798323519968e-06,
"loss": 0.3378,
"step": 960
},
{
"epoch": 0.2528453287480554,
"grad_norm": 3.776539689852539,
"learning_rate": 1.9682590535686804e-06,
"loss": 0.2909,
"step": 965
},
{
"epoch": 0.2541554081716204,
"grad_norm": 2.739785839636218,
"learning_rate": 1.9677152383950014e-06,
"loss": 0.2877,
"step": 970
},
{
"epoch": 0.25546548759518545,
"grad_norm": 4.226352508831814,
"learning_rate": 1.9671668805517197e-06,
"loss": 0.2917,
"step": 975
},
{
"epoch": 0.25677556701875054,
"grad_norm": 4.092459875987079,
"learning_rate": 1.9666139826129482e-06,
"loss": 0.3101,
"step": 980
},
{
"epoch": 0.2580856464423156,
"grad_norm": 3.022031693799492,
"learning_rate": 1.9660565471741133e-06,
"loss": 0.2451,
"step": 985
},
{
"epoch": 0.2593957258658806,
"grad_norm": 5.445606320908549,
"learning_rate": 1.965494576851939e-06,
"loss": 0.2803,
"step": 990
},
{
"epoch": 0.26070580528944565,
"grad_norm": 6.789604538228405,
"learning_rate": 1.9649280742844383e-06,
"loss": 0.3155,
"step": 995
},
{
"epoch": 0.26201588471301074,
"grad_norm": 9.717312792399483,
"learning_rate": 1.9643570421309013e-06,
"loss": 0.354,
"step": 1000
},
{
"epoch": 0.26201588471301074,
"eval_accuracy": 0.7272,
"eval_loss": 0.641910970211029,
"eval_runtime": 135.3149,
"eval_samples_per_second": 9.238,
"eval_steps_per_second": 2.313,
"step": 1000
},
{
"epoch": 0.2633259641365758,
"grad_norm": 3.570212334750135,
"learning_rate": 1.9637814830718784e-06,
"loss": 0.2197,
"step": 1005
},
{
"epoch": 0.2646360435601408,
"grad_norm": 5.497952454503209,
"learning_rate": 1.9632013998091708e-06,
"loss": 0.2843,
"step": 1010
},
{
"epoch": 0.2659461229837059,
"grad_norm": 4.3351899402575595,
"learning_rate": 1.962616795065819e-06,
"loss": 0.3116,
"step": 1015
},
{
"epoch": 0.26725620240727094,
"grad_norm": 5.059296767345353,
"learning_rate": 1.962027671586086e-06,
"loss": 0.2732,
"step": 1020
},
{
"epoch": 0.268566281830836,
"grad_norm": 3.636972491245334,
"learning_rate": 1.961434032135448e-06,
"loss": 0.3015,
"step": 1025
},
{
"epoch": 0.26987636125440106,
"grad_norm": 2.886855538209043,
"learning_rate": 1.9608358795005805e-06,
"loss": 0.271,
"step": 1030
},
{
"epoch": 0.2711864406779661,
"grad_norm": 4.518933069509563,
"learning_rate": 1.960233216489344e-06,
"loss": 0.2875,
"step": 1035
},
{
"epoch": 0.27249652010153114,
"grad_norm": 6.776126720418537,
"learning_rate": 1.959626045930773e-06,
"loss": 0.3297,
"step": 1040
},
{
"epoch": 0.27380659952509623,
"grad_norm": 3.3329344336767175,
"learning_rate": 1.9590143706750595e-06,
"loss": 0.3023,
"step": 1045
},
{
"epoch": 0.27511667894866126,
"grad_norm": 8.048171208434075,
"learning_rate": 1.958398193593543e-06,
"loss": 0.334,
"step": 1050
},
{
"epoch": 0.2764267583722263,
"grad_norm": 3.78319855061,
"learning_rate": 1.9577775175786944e-06,
"loss": 0.2919,
"step": 1055
},
{
"epoch": 0.2777368377957914,
"grad_norm": 3.3427967602303164,
"learning_rate": 1.957152345544106e-06,
"loss": 0.3142,
"step": 1060
},
{
"epoch": 0.2790469172193564,
"grad_norm": 5.104886446268955,
"learning_rate": 1.9565226804244723e-06,
"loss": 0.3025,
"step": 1065
},
{
"epoch": 0.28035699664292146,
"grad_norm": 2.0521842311663114,
"learning_rate": 1.9558885251755814e-06,
"loss": 0.2591,
"step": 1070
},
{
"epoch": 0.28166707606648655,
"grad_norm": 2.77191138245992,
"learning_rate": 1.955249882774298e-06,
"loss": 0.3224,
"step": 1075
},
{
"epoch": 0.2829771554900516,
"grad_norm": 5.0369322317805185,
"learning_rate": 1.954606756218552e-06,
"loss": 0.3104,
"step": 1080
},
{
"epoch": 0.2842872349136166,
"grad_norm": 7.9829960694246624,
"learning_rate": 1.9539591485273207e-06,
"loss": 0.2774,
"step": 1085
},
{
"epoch": 0.2855973143371817,
"grad_norm": 6.1487449662425835,
"learning_rate": 1.953307062740619e-06,
"loss": 0.3271,
"step": 1090
},
{
"epoch": 0.28690739376074675,
"grad_norm": 4.902123463001405,
"learning_rate": 1.952650501919481e-06,
"loss": 0.3087,
"step": 1095
},
{
"epoch": 0.2882174731843118,
"grad_norm": 5.005642536094271,
"learning_rate": 1.9519894691459488e-06,
"loss": 0.3698,
"step": 1100
},
{
"epoch": 0.2882174731843118,
"eval_accuracy": 0.7104,
"eval_loss": 0.7176594734191895,
"eval_runtime": 136.7009,
"eval_samples_per_second": 9.144,
"eval_steps_per_second": 2.29,
"step": 1100
},
{
"epoch": 0.2895275526078768,
"grad_norm": 3.1207339769636837,
"learning_rate": 1.951323967523057e-06,
"loss": 0.3232,
"step": 1105
},
{
"epoch": 0.2908376320314419,
"grad_norm": 3.1159736941377147,
"learning_rate": 1.9506540001748172e-06,
"loss": 0.2797,
"step": 1110
},
{
"epoch": 0.29214771145500695,
"grad_norm": 4.293077428446999,
"learning_rate": 1.9499795702462047e-06,
"loss": 0.3155,
"step": 1115
},
{
"epoch": 0.293457790878572,
"grad_norm": 3.8028097270147794,
"learning_rate": 1.949300680903143e-06,
"loss": 0.21,
"step": 1120
},
{
"epoch": 0.2947678703021371,
"grad_norm": 4.84671496698013,
"learning_rate": 1.948617335332489e-06,
"loss": 0.2633,
"step": 1125
},
{
"epoch": 0.2960779497257021,
"grad_norm": 3.960211317187865,
"learning_rate": 1.947929536742018e-06,
"loss": 0.2954,
"step": 1130
},
{
"epoch": 0.29738802914926715,
"grad_norm": 6.0168036176339985,
"learning_rate": 1.947237288360408e-06,
"loss": 0.284,
"step": 1135
},
{
"epoch": 0.29869810857283224,
"grad_norm": 3.518422321473526,
"learning_rate": 1.946540593437228e-06,
"loss": 0.2759,
"step": 1140
},
{
"epoch": 0.3000081879963973,
"grad_norm": 5.514253446140628,
"learning_rate": 1.945839455242917e-06,
"loss": 0.2304,
"step": 1145
},
{
"epoch": 0.3013182674199623,
"grad_norm": 8.396945191804443,
"learning_rate": 1.945133877068773e-06,
"loss": 0.3492,
"step": 1150
},
{
"epoch": 0.3026283468435274,
"grad_norm": 2.9344408033676497,
"learning_rate": 1.9444238622269366e-06,
"loss": 0.2529,
"step": 1155
},
{
"epoch": 0.30393842626709244,
"grad_norm": 4.341884742564996,
"learning_rate": 1.9437094140503745e-06,
"loss": 0.2763,
"step": 1160
},
{
"epoch": 0.3052485056906575,
"grad_norm": 4.144835095588702,
"learning_rate": 1.9429905358928646e-06,
"loss": 0.2992,
"step": 1165
},
{
"epoch": 0.30655858511422257,
"grad_norm": 4.571896652829111,
"learning_rate": 1.9422672311289797e-06,
"loss": 0.2094,
"step": 1170
},
{
"epoch": 0.3078686645377876,
"grad_norm": 6.487182515831676,
"learning_rate": 1.9415395031540734e-06,
"loss": 0.3184,
"step": 1175
},
{
"epoch": 0.30917874396135264,
"grad_norm": 6.2890432701053784,
"learning_rate": 1.9408073553842614e-06,
"loss": 0.2885,
"step": 1180
},
{
"epoch": 0.31048882338491773,
"grad_norm": 5.261502639917455,
"learning_rate": 1.9400707912564078e-06,
"loss": 0.2425,
"step": 1185
},
{
"epoch": 0.31179890280848277,
"grad_norm": 5.4319235826756715,
"learning_rate": 1.939329814228107e-06,
"loss": 0.3138,
"step": 1190
},
{
"epoch": 0.3131089822320478,
"grad_norm": 4.569463003103509,
"learning_rate": 1.93858442777767e-06,
"loss": 0.2905,
"step": 1195
},
{
"epoch": 0.3144190616556129,
"grad_norm": 6.51841356160416,
"learning_rate": 1.9378346354041057e-06,
"loss": 0.2544,
"step": 1200
},
{
"epoch": 0.3144190616556129,
"eval_accuracy": 0.7032,
"eval_loss": 0.7195152640342712,
"eval_runtime": 139.9321,
"eval_samples_per_second": 8.933,
"eval_steps_per_second": 2.237,
"step": 1200
},
{
"epoch": 0.31572914107917793,
"grad_norm": 6.07329817723254,
"learning_rate": 1.9370804406271053e-06,
"loss": 0.3082,
"step": 1205
},
{
"epoch": 0.31703922050274297,
"grad_norm": 5.5739656052051565,
"learning_rate": 1.936321846987026e-06,
"loss": 0.2982,
"step": 1210
},
{
"epoch": 0.318349299926308,
"grad_norm": 3.19437710777185,
"learning_rate": 1.9355588580448743e-06,
"loss": 0.2404,
"step": 1215
},
{
"epoch": 0.3196593793498731,
"grad_norm": 4.020281104033066,
"learning_rate": 1.9347914773822897e-06,
"loss": 0.3113,
"step": 1220
},
{
"epoch": 0.32096945877343813,
"grad_norm": 4.494719920151199,
"learning_rate": 1.9340197086015267e-06,
"loss": 0.3129,
"step": 1225
},
{
"epoch": 0.32227953819700317,
"grad_norm": 4.154261834382855,
"learning_rate": 1.9332435553254386e-06,
"loss": 0.3315,
"step": 1230
},
{
"epoch": 0.32358961762056826,
"grad_norm": 4.748202443565547,
"learning_rate": 1.932463021197461e-06,
"loss": 0.2484,
"step": 1235
},
{
"epoch": 0.3248996970441333,
"grad_norm": 3.4246878831109404,
"learning_rate": 1.9316781098815938e-06,
"loss": 0.2892,
"step": 1240
},
{
"epoch": 0.32620977646769833,
"grad_norm": 2.680374700010348,
"learning_rate": 1.930888825062385e-06,
"loss": 0.2731,
"step": 1245
},
{
"epoch": 0.3275198558912634,
"grad_norm": 5.622466682481238,
"learning_rate": 1.9300951704449113e-06,
"loss": 0.3281,
"step": 1250
},
{
"epoch": 0.32882993531482846,
"grad_norm": 4.573189599074694,
"learning_rate": 1.929297149754764e-06,
"loss": 0.3044,
"step": 1255
},
{
"epoch": 0.3301400147383935,
"grad_norm": 3.977565315266715,
"learning_rate": 1.928494766738029e-06,
"loss": 0.3347,
"step": 1260
},
{
"epoch": 0.3314500941619586,
"grad_norm": 3.267696615745348,
"learning_rate": 1.927688025161269e-06,
"loss": 0.273,
"step": 1265
},
{
"epoch": 0.3327601735855236,
"grad_norm": 3.7685478337164535,
"learning_rate": 1.9268769288115083e-06,
"loss": 0.308,
"step": 1270
},
{
"epoch": 0.33407025300908866,
"grad_norm": 4.733168908675778,
"learning_rate": 1.9260614814962127e-06,
"loss": 0.2864,
"step": 1275
},
{
"epoch": 0.33538033243265375,
"grad_norm": 3.7829911630990756,
"learning_rate": 1.9252416870432723e-06,
"loss": 0.2763,
"step": 1280
},
{
"epoch": 0.3366904118562188,
"grad_norm": 4.817098610717063,
"learning_rate": 1.9244175493009836e-06,
"loss": 0.2661,
"step": 1285
},
{
"epoch": 0.3380004912797838,
"grad_norm": 5.155494294997122,
"learning_rate": 1.9235890721380323e-06,
"loss": 0.3272,
"step": 1290
},
{
"epoch": 0.3393105707033489,
"grad_norm": 3.9443526122441295,
"learning_rate": 1.9227562594434733e-06,
"loss": 0.3294,
"step": 1295
},
{
"epoch": 0.34062065012691395,
"grad_norm": 3.268646227982553,
"learning_rate": 1.9219191151267133e-06,
"loss": 0.2571,
"step": 1300
},
{
"epoch": 0.34062065012691395,
"eval_accuracy": 0.712,
"eval_loss": 0.7290279269218445,
"eval_runtime": 139.9623,
"eval_samples_per_second": 8.931,
"eval_steps_per_second": 2.236,
"step": 1300
},
{
"epoch": 0.341930729550479,
"grad_norm": 4.156705015549692,
"learning_rate": 1.9210776431174937e-06,
"loss": 0.296,
"step": 1305
},
{
"epoch": 0.3432408089740441,
"grad_norm": 4.1937292159776405,
"learning_rate": 1.9202318473658702e-06,
"loss": 0.2799,
"step": 1310
},
{
"epoch": 0.3445508883976091,
"grad_norm": 3.5322347963356866,
"learning_rate": 1.9193817318421952e-06,
"loss": 0.2803,
"step": 1315
},
{
"epoch": 0.34586096782117415,
"grad_norm": 5.317835964163557,
"learning_rate": 1.9185273005371e-06,
"loss": 0.2849,
"step": 1320
},
{
"epoch": 0.34717104724473924,
"grad_norm": 5.169820633932056,
"learning_rate": 1.9176685574614733e-06,
"loss": 0.2987,
"step": 1325
},
{
"epoch": 0.3484811266683043,
"grad_norm": 4.982709983606647,
"learning_rate": 1.9168055066464457e-06,
"loss": 0.2716,
"step": 1330
},
{
"epoch": 0.3497912060918693,
"grad_norm": 4.866013018973415,
"learning_rate": 1.9159381521433684e-06,
"loss": 0.2766,
"step": 1335
},
{
"epoch": 0.35110128551543435,
"grad_norm": 4.025011433913149,
"learning_rate": 1.9150664980237964e-06,
"loss": 0.2584,
"step": 1340
},
{
"epoch": 0.35241136493899944,
"grad_norm": 3.8599220124227545,
"learning_rate": 1.9141905483794664e-06,
"loss": 0.3204,
"step": 1345
},
{
"epoch": 0.3537214443625645,
"grad_norm": 3.79972737879995,
"learning_rate": 1.91331030732228e-06,
"loss": 0.2836,
"step": 1350
},
{
"epoch": 0.3550315237861295,
"grad_norm": 2.919813006103404,
"learning_rate": 1.9124257789842843e-06,
"loss": 0.2587,
"step": 1355
},
{
"epoch": 0.3563416032096946,
"grad_norm": 5.170605350757861,
"learning_rate": 1.9115369675176504e-06,
"loss": 0.3065,
"step": 1360
},
{
"epoch": 0.35765168263325964,
"grad_norm": 6.1588988469192065,
"learning_rate": 1.910643877094656e-06,
"loss": 0.3447,
"step": 1365
},
{
"epoch": 0.3589617620568247,
"grad_norm": 2.6672804726236303,
"learning_rate": 1.9097465119076665e-06,
"loss": 0.3036,
"step": 1370
},
{
"epoch": 0.36027184148038977,
"grad_norm": 3.4333435314335983,
"learning_rate": 1.908844876169112e-06,
"loss": 0.2682,
"step": 1375
},
{
"epoch": 0.3615819209039548,
"grad_norm": 2.6540494040134153,
"learning_rate": 1.9079389741114696e-06,
"loss": 0.2592,
"step": 1380
},
{
"epoch": 0.36289200032751984,
"grad_norm": 4.249055358619522,
"learning_rate": 1.9070288099872452e-06,
"loss": 0.2605,
"step": 1385
},
{
"epoch": 0.36420207975108493,
"grad_norm": 4.836139691290449,
"learning_rate": 1.9061143880689503e-06,
"loss": 0.2977,
"step": 1390
},
{
"epoch": 0.36551215917464996,
"grad_norm": 4.705615420915993,
"learning_rate": 1.905195712649084e-06,
"loss": 0.3444,
"step": 1395
},
{
"epoch": 0.366822238598215,
"grad_norm": 3.2537510054626515,
"learning_rate": 1.9042727880401122e-06,
"loss": 0.3558,
"step": 1400
},
{
"epoch": 0.366822238598215,
"eval_accuracy": 0.676,
"eval_loss": 0.7812010049819946,
"eval_runtime": 137.3509,
"eval_samples_per_second": 9.101,
"eval_steps_per_second": 2.279,
"step": 1400
},
{
"epoch": 0.3681323180217801,
"grad_norm": 2.7663311029982856,
"learning_rate": 1.9033456185744469e-06,
"loss": 0.2985,
"step": 1405
},
{
"epoch": 0.3694423974453451,
"grad_norm": 2.6580405529803808,
"learning_rate": 1.9024142086044277e-06,
"loss": 0.2834,
"step": 1410
},
{
"epoch": 0.37075247686891016,
"grad_norm": 3.6849674325031407,
"learning_rate": 1.9014785625022985e-06,
"loss": 0.2779,
"step": 1415
},
{
"epoch": 0.37206255629247525,
"grad_norm": 4.020792652442759,
"learning_rate": 1.9005386846601893e-06,
"loss": 0.2472,
"step": 1420
},
{
"epoch": 0.3733726357160403,
"grad_norm": 4.526317952946907,
"learning_rate": 1.8995945794900953e-06,
"loss": 0.2786,
"step": 1425
},
{
"epoch": 0.3746827151396053,
"grad_norm": 4.123722032882601,
"learning_rate": 1.8986462514238547e-06,
"loss": 0.2833,
"step": 1430
},
{
"epoch": 0.3759927945631704,
"grad_norm": 6.744679094753499,
"learning_rate": 1.8976937049131298e-06,
"loss": 0.3072,
"step": 1435
},
{
"epoch": 0.37730287398673545,
"grad_norm": 4.370200879451724,
"learning_rate": 1.8967369444293847e-06,
"loss": 0.25,
"step": 1440
},
{
"epoch": 0.3786129534103005,
"grad_norm": 2.6567231964667424,
"learning_rate": 1.8957759744638651e-06,
"loss": 0.2461,
"step": 1445
},
{
"epoch": 0.3799230328338655,
"grad_norm": 4.993924185098615,
"learning_rate": 1.8948107995275761e-06,
"loss": 0.2457,
"step": 1450
},
{
"epoch": 0.3812331122574306,
"grad_norm": 5.890536303510147,
"learning_rate": 1.8938414241512637e-06,
"loss": 0.3337,
"step": 1455
},
{
"epoch": 0.38254319168099565,
"grad_norm": 6.086658941241191,
"learning_rate": 1.8928678528853895e-06,
"loss": 0.261,
"step": 1460
},
{
"epoch": 0.3838532711045607,
"grad_norm": 2.5029858692820444,
"learning_rate": 1.8918900903001136e-06,
"loss": 0.2623,
"step": 1465
},
{
"epoch": 0.3851633505281258,
"grad_norm": 3.1670655034154347,
"learning_rate": 1.8909081409852692e-06,
"loss": 0.3239,
"step": 1470
},
{
"epoch": 0.3864734299516908,
"grad_norm": 6.92258040722732,
"learning_rate": 1.8899220095503442e-06,
"loss": 0.3251,
"step": 1475
},
{
"epoch": 0.38778350937525585,
"grad_norm": 2.459950138600639,
"learning_rate": 1.888931700624458e-06,
"loss": 0.2865,
"step": 1480
},
{
"epoch": 0.38909358879882094,
"grad_norm": 4.566006943218545,
"learning_rate": 1.8879372188563396e-06,
"loss": 0.2919,
"step": 1485
},
{
"epoch": 0.390403668222386,
"grad_norm": 6.1169971997341515,
"learning_rate": 1.8869385689143069e-06,
"loss": 0.3248,
"step": 1490
},
{
"epoch": 0.391713747645951,
"grad_norm": 4.321962895847691,
"learning_rate": 1.885935755486244e-06,
"loss": 0.2497,
"step": 1495
},
{
"epoch": 0.3930238270695161,
"grad_norm": 3.786195016300289,
"learning_rate": 1.8849287832795785e-06,
"loss": 0.2842,
"step": 1500
},
{
"epoch": 0.3930238270695161,
"eval_accuracy": 0.6952,
"eval_loss": 0.7388889789581299,
"eval_runtime": 140.4298,
"eval_samples_per_second": 8.901,
"eval_steps_per_second": 2.229,
"step": 1500
},
{
"epoch": 0.39433390649308114,
"grad_norm": 4.144022239569388,
"learning_rate": 1.8839176570212619e-06,
"loss": 0.2776,
"step": 1505
},
{
"epoch": 0.3956439859166462,
"grad_norm": 3.254154282020282,
"learning_rate": 1.882902381457744e-06,
"loss": 0.3046,
"step": 1510
},
{
"epoch": 0.39695406534021127,
"grad_norm": 3.7972516111759465,
"learning_rate": 1.8818829613549532e-06,
"loss": 0.3571,
"step": 1515
},
{
"epoch": 0.3982641447637763,
"grad_norm": 2.7098946167159217,
"learning_rate": 1.8808594014982736e-06,
"loss": 0.3086,
"step": 1520
},
{
"epoch": 0.39957422418734134,
"grad_norm": 2.395987116328653,
"learning_rate": 1.879831706692521e-06,
"loss": 0.2955,
"step": 1525
},
{
"epoch": 0.40088430361090643,
"grad_norm": 5.567909535649407,
"learning_rate": 1.8787998817619233e-06,
"loss": 0.3045,
"step": 1530
},
{
"epoch": 0.40219438303447147,
"grad_norm": 4.680059669737792,
"learning_rate": 1.8777639315500945e-06,
"loss": 0.2648,
"step": 1535
},
{
"epoch": 0.4035044624580365,
"grad_norm": 8.387574343664538,
"learning_rate": 1.876723860920015e-06,
"loss": 0.3123,
"step": 1540
},
{
"epoch": 0.4048145418816016,
"grad_norm": 2.023022886159982,
"learning_rate": 1.8756796747540057e-06,
"loss": 0.2561,
"step": 1545
},
{
"epoch": 0.40612462130516663,
"grad_norm": 4.634489921732028,
"learning_rate": 1.8746313779537087e-06,
"loss": 0.3115,
"step": 1550
},
{
"epoch": 0.40743470072873167,
"grad_norm": 3.401314650673904,
"learning_rate": 1.8735789754400603e-06,
"loss": 0.2493,
"step": 1555
},
{
"epoch": 0.40874478015229676,
"grad_norm": 2.586199787021852,
"learning_rate": 1.8725224721532715e-06,
"loss": 0.2521,
"step": 1560
},
{
"epoch": 0.4100548595758618,
"grad_norm": 4.393631624962878,
"learning_rate": 1.8714618730528024e-06,
"loss": 0.2817,
"step": 1565
},
{
"epoch": 0.41136493899942683,
"grad_norm": 5.57797489979425,
"learning_rate": 1.8703971831173405e-06,
"loss": 0.2937,
"step": 1570
},
{
"epoch": 0.41267501842299187,
"grad_norm": 3.4687053444228235,
"learning_rate": 1.8693284073447755e-06,
"loss": 0.3344,
"step": 1575
},
{
"epoch": 0.41398509784655696,
"grad_norm": 4.141659335286871,
"learning_rate": 1.868255550752178e-06,
"loss": 0.2546,
"step": 1580
},
{
"epoch": 0.415295177270122,
"grad_norm": 7.531913864092293,
"learning_rate": 1.8671786183757741e-06,
"loss": 0.2992,
"step": 1585
},
{
"epoch": 0.41660525669368703,
"grad_norm": 5.209397304260616,
"learning_rate": 1.866097615270923e-06,
"loss": 0.2978,
"step": 1590
},
{
"epoch": 0.4179153361172521,
"grad_norm": 4.9952265985686966,
"learning_rate": 1.865012546512092e-06,
"loss": 0.2386,
"step": 1595
},
{
"epoch": 0.41922541554081716,
"grad_norm": 4.620451674592193,
"learning_rate": 1.863923417192835e-06,
"loss": 0.3012,
"step": 1600
},
{
"epoch": 0.41922541554081716,
"eval_accuracy": 0.7088,
"eval_loss": 0.7305626273155212,
"eval_runtime": 137.487,
"eval_samples_per_second": 9.092,
"eval_steps_per_second": 2.277,
"step": 1600
},
{
"epoch": 0.4205354949643822,
"grad_norm": 3.8216787722513335,
"learning_rate": 1.8628302324257664e-06,
"loss": 0.2886,
"step": 1605
},
{
"epoch": 0.4218455743879473,
"grad_norm": 4.302516444293675,
"learning_rate": 1.8617329973425364e-06,
"loss": 0.2986,
"step": 1610
},
{
"epoch": 0.4231556538115123,
"grad_norm": 2.5052399973675823,
"learning_rate": 1.86063171709381e-06,
"loss": 0.2977,
"step": 1615
},
{
"epoch": 0.42446573323507736,
"grad_norm": 4.561793077013278,
"learning_rate": 1.8595263968492407e-06,
"loss": 0.3231,
"step": 1620
},
{
"epoch": 0.42577581265864245,
"grad_norm": 7.8865966163916426,
"learning_rate": 1.8584170417974465e-06,
"loss": 0.3202,
"step": 1625
},
{
"epoch": 0.4270858920822075,
"grad_norm": 3.674689043482507,
"learning_rate": 1.857303657145985e-06,
"loss": 0.2683,
"step": 1630
},
{
"epoch": 0.4283959715057725,
"grad_norm": 2.8123441864934,
"learning_rate": 1.8561862481213313e-06,
"loss": 0.2893,
"step": 1635
},
{
"epoch": 0.4297060509293376,
"grad_norm": 2.5957429423912854,
"learning_rate": 1.85506481996885e-06,
"loss": 0.3001,
"step": 1640
},
{
"epoch": 0.43101613035290265,
"grad_norm": 6.272609371298048,
"learning_rate": 1.8539393779527735e-06,
"loss": 0.2944,
"step": 1645
},
{
"epoch": 0.4323262097764677,
"grad_norm": 5.300171596731147,
"learning_rate": 1.8528099273561754e-06,
"loss": 0.2443,
"step": 1650
},
{
"epoch": 0.4336362892000328,
"grad_norm": 4.309301759126504,
"learning_rate": 1.8516764734809475e-06,
"loss": 0.2504,
"step": 1655
},
{
"epoch": 0.4349463686235978,
"grad_norm": 2.716951228905198,
"learning_rate": 1.8505390216477732e-06,
"loss": 0.2625,
"step": 1660
},
{
"epoch": 0.43625644804716285,
"grad_norm": 4.606417589142611,
"learning_rate": 1.8493975771961026e-06,
"loss": 0.2715,
"step": 1665
},
{
"epoch": 0.43756652747072794,
"grad_norm": 3.7628490993032444,
"learning_rate": 1.8482521454841296e-06,
"loss": 0.3187,
"step": 1670
},
{
"epoch": 0.438876606894293,
"grad_norm": 3.6806362340314878,
"learning_rate": 1.8471027318887632e-06,
"loss": 0.2446,
"step": 1675
},
{
"epoch": 0.440186686317858,
"grad_norm": 2.8358474618960554,
"learning_rate": 1.8459493418056064e-06,
"loss": 0.2803,
"step": 1680
},
{
"epoch": 0.44149676574142305,
"grad_norm": 3.559305323246588,
"learning_rate": 1.8447919806489272e-06,
"loss": 0.3376,
"step": 1685
},
{
"epoch": 0.44280684516498814,
"grad_norm": 3.18546710084024,
"learning_rate": 1.8436306538516348e-06,
"loss": 0.2526,
"step": 1690
},
{
"epoch": 0.4441169245885532,
"grad_norm": 2.2770988367831317,
"learning_rate": 1.8424653668652548e-06,
"loss": 0.2878,
"step": 1695
},
{
"epoch": 0.4454270040121182,
"grad_norm": 2.6311103553113186,
"learning_rate": 1.8412961251599021e-06,
"loss": 0.323,
"step": 1700
},
{
"epoch": 0.4454270040121182,
"eval_accuracy": 0.7104,
"eval_loss": 0.7182445526123047,
"eval_runtime": 139.4262,
"eval_samples_per_second": 8.965,
"eval_steps_per_second": 2.245,
"step": 1700
},
{
"epoch": 0.4467370834356833,
"grad_norm": 2.7960747447704275,
"learning_rate": 1.8401229342242564e-06,
"loss": 0.3345,
"step": 1705
},
{
"epoch": 0.44804716285924834,
"grad_norm": 2.303028253758041,
"learning_rate": 1.8389457995655354e-06,
"loss": 0.2837,
"step": 1710
},
{
"epoch": 0.4493572422828134,
"grad_norm": 3.6261923466611763,
"learning_rate": 1.8377647267094699e-06,
"loss": 0.2656,
"step": 1715
},
{
"epoch": 0.45066732170637847,
"grad_norm": 4.89861955448886,
"learning_rate": 1.8365797212002777e-06,
"loss": 0.276,
"step": 1720
},
{
"epoch": 0.4519774011299435,
"grad_norm": 6.264789410236653,
"learning_rate": 1.8353907886006369e-06,
"loss": 0.3056,
"step": 1725
},
{
"epoch": 0.45328748055350854,
"grad_norm": 2.51930771111435,
"learning_rate": 1.8341979344916601e-06,
"loss": 0.2885,
"step": 1730
},
{
"epoch": 0.45459755997707363,
"grad_norm": 6.288406487126684,
"learning_rate": 1.833001164472869e-06,
"loss": 0.3229,
"step": 1735
},
{
"epoch": 0.45590763940063866,
"grad_norm": 5.444204531738827,
"learning_rate": 1.8318004841621666e-06,
"loss": 0.2589,
"step": 1740
},
{
"epoch": 0.4572177188242037,
"grad_norm": 6.474274016450882,
"learning_rate": 1.8305958991958126e-06,
"loss": 0.2912,
"step": 1745
},
{
"epoch": 0.4585277982477688,
"grad_norm": 6.449122812307309,
"learning_rate": 1.8293874152283952e-06,
"loss": 0.2992,
"step": 1750
},
{
"epoch": 0.45983787767133383,
"grad_norm": 4.641695980313187,
"learning_rate": 1.8281750379328061e-06,
"loss": 0.3278,
"step": 1755
},
{
"epoch": 0.46114795709489886,
"grad_norm": 2.681087051955473,
"learning_rate": 1.8269587730002125e-06,
"loss": 0.255,
"step": 1760
},
{
"epoch": 0.46245803651846396,
"grad_norm": 6.151626591739763,
"learning_rate": 1.8257386261400316e-06,
"loss": 0.2494,
"step": 1765
},
{
"epoch": 0.463768115942029,
"grad_norm": 4.505233549633642,
"learning_rate": 1.8245146030799025e-06,
"loss": 0.3442,
"step": 1770
},
{
"epoch": 0.465078195365594,
"grad_norm": 4.205788305228986,
"learning_rate": 1.8232867095656608e-06,
"loss": 0.3093,
"step": 1775
},
{
"epoch": 0.4663882747891591,
"grad_norm": 3.5089694683317174,
"learning_rate": 1.8220549513613104e-06,
"loss": 0.2846,
"step": 1780
},
{
"epoch": 0.46769835421272415,
"grad_norm": 4.514848486626711,
"learning_rate": 1.820819334248997e-06,
"loss": 0.3689,
"step": 1785
},
{
"epoch": 0.4690084336362892,
"grad_norm": 2.320338638195368,
"learning_rate": 1.8195798640289807e-06,
"loss": 0.2559,
"step": 1790
},
{
"epoch": 0.4703185130598542,
"grad_norm": 4.819290066996436,
"learning_rate": 1.8183365465196099e-06,
"loss": 0.2729,
"step": 1795
},
{
"epoch": 0.4716285924834193,
"grad_norm": 2.8951610503894734,
"learning_rate": 1.8170893875572916e-06,
"loss": 0.2502,
"step": 1800
},
{
"epoch": 0.4716285924834193,
"eval_accuracy": 0.7248,
"eval_loss": 0.6544848680496216,
"eval_runtime": 140.79,
"eval_samples_per_second": 8.878,
"eval_steps_per_second": 2.223,
"step": 1800
},
{
"epoch": 0.47293867190698435,
"grad_norm": 6.1281331169132045,
"learning_rate": 1.8158383929964665e-06,
"loss": 0.2792,
"step": 1805
},
{
"epoch": 0.4742487513305494,
"grad_norm": 4.051837237000961,
"learning_rate": 1.8145835687095797e-06,
"loss": 0.3106,
"step": 1810
},
{
"epoch": 0.4755588307541145,
"grad_norm": 5.166640184431374,
"learning_rate": 1.8133249205870547e-06,
"loss": 0.3153,
"step": 1815
},
{
"epoch": 0.4768689101776795,
"grad_norm": 4.026299173963558,
"learning_rate": 1.8120624545372643e-06,
"loss": 0.2343,
"step": 1820
},
{
"epoch": 0.47817898960124455,
"grad_norm": 4.295951177165347,
"learning_rate": 1.8107961764865033e-06,
"loss": 0.2883,
"step": 1825
},
{
"epoch": 0.47948906902480964,
"grad_norm": 3.4331157371118945,
"learning_rate": 1.8095260923789617e-06,
"loss": 0.2696,
"step": 1830
},
{
"epoch": 0.4807991484483747,
"grad_norm": 4.813504074638746,
"learning_rate": 1.8082522081766953e-06,
"loss": 0.3209,
"step": 1835
},
{
"epoch": 0.4821092278719397,
"grad_norm": 3.589632561094004,
"learning_rate": 1.8069745298595992e-06,
"loss": 0.2516,
"step": 1840
},
{
"epoch": 0.4834193072955048,
"grad_norm": 3.7263852570627143,
"learning_rate": 1.805693063425377e-06,
"loss": 0.3106,
"step": 1845
},
{
"epoch": 0.48472938671906984,
"grad_norm": 4.05564679184159,
"learning_rate": 1.8044078148895174e-06,
"loss": 0.2901,
"step": 1850
},
{
"epoch": 0.4860394661426349,
"grad_norm": 3.53180167912472,
"learning_rate": 1.8031187902852607e-06,
"loss": 0.2981,
"step": 1855
},
{
"epoch": 0.48734954556619997,
"grad_norm": 3.70131644140251,
"learning_rate": 1.801825995663574e-06,
"loss": 0.266,
"step": 1860
},
{
"epoch": 0.488659624989765,
"grad_norm": 3.9187161597018214,
"learning_rate": 1.8005294370931217e-06,
"loss": 0.2921,
"step": 1865
},
{
"epoch": 0.48996970441333004,
"grad_norm": 2.365181839837511,
"learning_rate": 1.7992291206602366e-06,
"loss": 0.292,
"step": 1870
},
{
"epoch": 0.49127978383689513,
"grad_norm": 3.6772154023031014,
"learning_rate": 1.797925052468892e-06,
"loss": 0.2926,
"step": 1875
},
{
"epoch": 0.49258986326046017,
"grad_norm": 3.169897885563791,
"learning_rate": 1.7966172386406728e-06,
"loss": 0.3069,
"step": 1880
},
{
"epoch": 0.4938999426840252,
"grad_norm": 3.623768040249494,
"learning_rate": 1.7953056853147466e-06,
"loss": 0.2728,
"step": 1885
},
{
"epoch": 0.4952100221075903,
"grad_norm": 3.8930495139893884,
"learning_rate": 1.7939903986478354e-06,
"loss": 0.2497,
"step": 1890
},
{
"epoch": 0.49652010153115533,
"grad_norm": 3.054365703792985,
"learning_rate": 1.7926713848141856e-06,
"loss": 0.2798,
"step": 1895
},
{
"epoch": 0.49783018095472037,
"grad_norm": 5.479009666538431,
"learning_rate": 1.7913486500055402e-06,
"loss": 0.3357,
"step": 1900
},
{
"epoch": 0.49783018095472037,
"eval_accuracy": 0.7184,
"eval_loss": 0.6975212097167969,
"eval_runtime": 134.9313,
"eval_samples_per_second": 9.264,
"eval_steps_per_second": 2.32,
"step": 1900
},
{
"epoch": 0.49914026037828546,
"grad_norm": 2.433308128925003,
"learning_rate": 1.7900222004311098e-06,
"loss": 0.28,
"step": 1905
},
{
"epoch": 0.5004503398018505,
"grad_norm": 3.0724802951664896,
"learning_rate": 1.788692042317542e-06,
"loss": 0.2741,
"step": 1910
},
{
"epoch": 0.5017604192254156,
"grad_norm": 2.9281942193789563,
"learning_rate": 1.7873581819088937e-06,
"loss": 0.2622,
"step": 1915
},
{
"epoch": 0.5030704986489806,
"grad_norm": 3.2439911646866415,
"learning_rate": 1.786020625466601e-06,
"loss": 0.2706,
"step": 1920
},
{
"epoch": 0.5043805780725457,
"grad_norm": 3.4446296154345175,
"learning_rate": 1.7846793792694497e-06,
"loss": 0.2596,
"step": 1925
},
{
"epoch": 0.5056906574961108,
"grad_norm": 3.7556256418902905,
"learning_rate": 1.7833344496135467e-06,
"loss": 0.3073,
"step": 1930
},
{
"epoch": 0.5070007369196757,
"grad_norm": 4.400597629681425,
"learning_rate": 1.7819858428122893e-06,
"loss": 0.2764,
"step": 1935
},
{
"epoch": 0.5083108163432408,
"grad_norm": 3.7380324234060143,
"learning_rate": 1.7806335651963372e-06,
"loss": 0.2906,
"step": 1940
},
{
"epoch": 0.5096208957668059,
"grad_norm": 4.614102737000705,
"learning_rate": 1.7792776231135802e-06,
"loss": 0.2898,
"step": 1945
},
{
"epoch": 0.5109309751903709,
"grad_norm": 2.7297343017049287,
"learning_rate": 1.7779180229291105e-06,
"loss": 0.23,
"step": 1950
},
{
"epoch": 0.512241054613936,
"grad_norm": 5.463732251912146,
"learning_rate": 1.7765547710251935e-06,
"loss": 0.2813,
"step": 1955
},
{
"epoch": 0.5135511340375011,
"grad_norm": 2.478093229460874,
"learning_rate": 1.7751878738012346e-06,
"loss": 0.2119,
"step": 1960
},
{
"epoch": 0.5148612134610661,
"grad_norm": 5.4823008219173595,
"learning_rate": 1.7738173376737522e-06,
"loss": 0.2642,
"step": 1965
},
{
"epoch": 0.5161712928846311,
"grad_norm": 3.9944643614596638,
"learning_rate": 1.7724431690763462e-06,
"loss": 0.2575,
"step": 1970
},
{
"epoch": 0.5174813723081961,
"grad_norm": 5.526636384041946,
"learning_rate": 1.7710653744596687e-06,
"loss": 0.3462,
"step": 1975
},
{
"epoch": 0.5187914517317612,
"grad_norm": 5.965066611467832,
"learning_rate": 1.7696839602913925e-06,
"loss": 0.3024,
"step": 1980
},
{
"epoch": 0.5201015311553263,
"grad_norm": 2.9808391808623247,
"learning_rate": 1.7682989330561813e-06,
"loss": 0.2729,
"step": 1985
},
{
"epoch": 0.5214116105788913,
"grad_norm": 3.6106553367793746,
"learning_rate": 1.7669102992556601e-06,
"loss": 0.2461,
"step": 1990
},
{
"epoch": 0.5227216900024564,
"grad_norm": 3.2772498647327732,
"learning_rate": 1.7655180654083832e-06,
"loss": 0.2842,
"step": 1995
},
{
"epoch": 0.5240317694260215,
"grad_norm": 6.885168575642456,
"learning_rate": 1.7641222380498044e-06,
"loss": 0.3379,
"step": 2000
},
{
"epoch": 0.5240317694260215,
"eval_accuracy": 0.7288,
"eval_loss": 0.673081636428833,
"eval_runtime": 136.835,
"eval_samples_per_second": 9.135,
"eval_steps_per_second": 2.287,
"step": 2000
},
{
"epoch": 0.5253418488495865,
"grad_norm": 3.6157723487154128,
"learning_rate": 1.7627228237322466e-06,
"loss": 0.2985,
"step": 2005
},
{
"epoch": 0.5266519282731515,
"grad_norm": 2.6275278120467105,
"learning_rate": 1.7613198290248706e-06,
"loss": 0.2281,
"step": 2010
},
{
"epoch": 0.5279620076967166,
"grad_norm": 3.3363602312332175,
"learning_rate": 1.7599132605136436e-06,
"loss": 0.3043,
"step": 2015
},
{
"epoch": 0.5292720871202816,
"grad_norm": 2.4277559410443175,
"learning_rate": 1.7585031248013106e-06,
"loss": 0.202,
"step": 2020
},
{
"epoch": 0.5305821665438467,
"grad_norm": 5.072153681054817,
"learning_rate": 1.7570894285073599e-06,
"loss": 0.2483,
"step": 2025
},
{
"epoch": 0.5318922459674118,
"grad_norm": 5.204973135313382,
"learning_rate": 1.7556721782679956e-06,
"loss": 0.3329,
"step": 2030
},
{
"epoch": 0.5332023253909768,
"grad_norm": 2.806879239003485,
"learning_rate": 1.7542513807361037e-06,
"loss": 0.2548,
"step": 2035
},
{
"epoch": 0.5345124048145419,
"grad_norm": 3.0228433116007327,
"learning_rate": 1.7528270425812228e-06,
"loss": 0.2651,
"step": 2040
},
{
"epoch": 0.535822484238107,
"grad_norm": 2.6869261489897736,
"learning_rate": 1.7513991704895112e-06,
"loss": 0.2844,
"step": 2045
},
{
"epoch": 0.537132563661672,
"grad_norm": 4.5723366454832215,
"learning_rate": 1.7499677711637171e-06,
"loss": 0.3071,
"step": 2050
},
{
"epoch": 0.538442643085237,
"grad_norm": 4.179299164940856,
"learning_rate": 1.7485328513231453e-06,
"loss": 0.2774,
"step": 2055
},
{
"epoch": 0.5397527225088021,
"grad_norm": 3.356708836608762,
"learning_rate": 1.7470944177036277e-06,
"loss": 0.2927,
"step": 2060
},
{
"epoch": 0.5410628019323671,
"grad_norm": 4.017539637774881,
"learning_rate": 1.74565247705749e-06,
"loss": 0.3008,
"step": 2065
},
{
"epoch": 0.5423728813559322,
"grad_norm": 3.974226727917273,
"learning_rate": 1.744207036153521e-06,
"loss": 0.2742,
"step": 2070
},
{
"epoch": 0.5436829607794973,
"grad_norm": 3.160288278087426,
"learning_rate": 1.7427581017769404e-06,
"loss": 0.3134,
"step": 2075
},
{
"epoch": 0.5449930402030623,
"grad_norm": 4.995568511985773,
"learning_rate": 1.741305680729367e-06,
"loss": 0.2927,
"step": 2080
},
{
"epoch": 0.5463031196266274,
"grad_norm": 2.5038111120195192,
"learning_rate": 1.7398497798287863e-06,
"loss": 0.2442,
"step": 2085
},
{
"epoch": 0.5476131990501925,
"grad_norm": 3.4536557863053035,
"learning_rate": 1.7383904059095202e-06,
"loss": 0.2592,
"step": 2090
},
{
"epoch": 0.5489232784737574,
"grad_norm": 4.410374261613443,
"learning_rate": 1.7369275658221926e-06,
"loss": 0.3117,
"step": 2095
},
{
"epoch": 0.5502333578973225,
"grad_norm": 2.538812009761459,
"learning_rate": 1.735461266433699e-06,
"loss": 0.2717,
"step": 2100
},
{
"epoch": 0.5502333578973225,
"eval_accuracy": 0.7424,
"eval_loss": 0.671869158744812,
"eval_runtime": 135.9633,
"eval_samples_per_second": 9.194,
"eval_steps_per_second": 2.302,
"step": 2100
},
{
"epoch": 0.5515434373208876,
"grad_norm": 3.627287673105934,
"learning_rate": 1.7339915146271732e-06,
"loss": 0.269,
"step": 2105
},
{
"epoch": 0.5528535167444526,
"grad_norm": 2.889042058590513,
"learning_rate": 1.7325183173019556e-06,
"loss": 0.2357,
"step": 2110
},
{
"epoch": 0.5541635961680177,
"grad_norm": 3.5905021176878775,
"learning_rate": 1.731041681373561e-06,
"loss": 0.2349,
"step": 2115
},
{
"epoch": 0.5554736755915828,
"grad_norm": 4.874519688988458,
"learning_rate": 1.729561613773645e-06,
"loss": 0.2886,
"step": 2120
},
{
"epoch": 0.5567837550151478,
"grad_norm": 3.281500493007926,
"learning_rate": 1.7280781214499727e-06,
"loss": 0.282,
"step": 2125
},
{
"epoch": 0.5580938344387129,
"grad_norm": 1.90376998294021,
"learning_rate": 1.7265912113663857e-06,
"loss": 0.2952,
"step": 2130
},
{
"epoch": 0.559403913862278,
"grad_norm": 4.157189646380671,
"learning_rate": 1.7251008905027692e-06,
"loss": 0.2913,
"step": 2135
},
{
"epoch": 0.5607139932858429,
"grad_norm": 7.357071844148765,
"learning_rate": 1.7236071658550191e-06,
"loss": 0.3016,
"step": 2140
},
{
"epoch": 0.562024072709408,
"grad_norm": 3.3548044431409214,
"learning_rate": 1.7221100444350099e-06,
"loss": 0.2526,
"step": 2145
},
{
"epoch": 0.5633341521329731,
"grad_norm": 2.070755143349665,
"learning_rate": 1.7206095332705608e-06,
"loss": 0.2859,
"step": 2150
},
{
"epoch": 0.5646442315565381,
"grad_norm": 2.9975997234107123,
"learning_rate": 1.7191056394054035e-06,
"loss": 0.2739,
"step": 2155
},
{
"epoch": 0.5659543109801032,
"grad_norm": 4.385338795716017,
"learning_rate": 1.7175983698991488e-06,
"loss": 0.312,
"step": 2160
},
{
"epoch": 0.5672643904036683,
"grad_norm": 3.112159603233555,
"learning_rate": 1.7160877318272537e-06,
"loss": 0.272,
"step": 2165
},
{
"epoch": 0.5685744698272333,
"grad_norm": 2.353537092467099,
"learning_rate": 1.7145737322809876e-06,
"loss": 0.2534,
"step": 2170
},
{
"epoch": 0.5698845492507983,
"grad_norm": 2.6947118285924385,
"learning_rate": 1.7130563783674e-06,
"loss": 0.2702,
"step": 2175
},
{
"epoch": 0.5711946286743634,
"grad_norm": 4.573113056178734,
"learning_rate": 1.7115356772092855e-06,
"loss": 0.3059,
"step": 2180
},
{
"epoch": 0.5725047080979284,
"grad_norm": 2.465542000370307,
"learning_rate": 1.7100116359451523e-06,
"loss": 0.2602,
"step": 2185
},
{
"epoch": 0.5738147875214935,
"grad_norm": 3.340565744796501,
"learning_rate": 1.7084842617291874e-06,
"loss": 0.2824,
"step": 2190
},
{
"epoch": 0.5751248669450586,
"grad_norm": 2.292406930546328,
"learning_rate": 1.706953561731224e-06,
"loss": 0.2696,
"step": 2195
},
{
"epoch": 0.5764349463686236,
"grad_norm": 3.3346832292412967,
"learning_rate": 1.705419543136707e-06,
"loss": 0.317,
"step": 2200
},
{
"epoch": 0.5764349463686236,
"eval_accuracy": 0.724,
"eval_loss": 0.6991069912910461,
"eval_runtime": 140.893,
"eval_samples_per_second": 8.872,
"eval_steps_per_second": 2.222,
"step": 2200
},
{
"epoch": 0.5777450257921887,
"grad_norm": 2.3310945353240435,
"learning_rate": 1.7038822131466583e-06,
"loss": 0.2504,
"step": 2205
},
{
"epoch": 0.5790551052157537,
"grad_norm": 2.3747884427134713,
"learning_rate": 1.7023415789776463e-06,
"loss": 0.298,
"step": 2210
},
{
"epoch": 0.5803651846393187,
"grad_norm": 3.133493758446318,
"learning_rate": 1.7007976478617484e-06,
"loss": 0.2376,
"step": 2215
},
{
"epoch": 0.5816752640628838,
"grad_norm": 4.12633676209407,
"learning_rate": 1.6992504270465193e-06,
"loss": 0.2944,
"step": 2220
},
{
"epoch": 0.5829853434864488,
"grad_norm": 3.9130822922585144,
"learning_rate": 1.697699923794956e-06,
"loss": 0.2993,
"step": 2225
},
{
"epoch": 0.5842954229100139,
"grad_norm": 2.7907750454465203,
"learning_rate": 1.696146145385464e-06,
"loss": 0.2868,
"step": 2230
},
{
"epoch": 0.585605502333579,
"grad_norm": 3.8858008383873397,
"learning_rate": 1.6945890991118236e-06,
"loss": 0.3234,
"step": 2235
},
{
"epoch": 0.586915581757144,
"grad_norm": 2.4869248403820308,
"learning_rate": 1.6930287922831546e-06,
"loss": 0.2584,
"step": 2240
},
{
"epoch": 0.5882256611807091,
"grad_norm": 2.9309269678483525,
"learning_rate": 1.6914652322238824e-06,
"loss": 0.2303,
"step": 2245
},
{
"epoch": 0.5895357406042742,
"grad_norm": 4.8721568277124625,
"learning_rate": 1.6898984262737046e-06,
"loss": 0.2216,
"step": 2250
},
{
"epoch": 0.5908458200278391,
"grad_norm": 4.919818554906796,
"learning_rate": 1.6883283817875546e-06,
"loss": 0.2742,
"step": 2255
},
{
"epoch": 0.5921558994514042,
"grad_norm": 7.35406987379939,
"learning_rate": 1.6867551061355696e-06,
"loss": 0.2984,
"step": 2260
},
{
"epoch": 0.5934659788749693,
"grad_norm": 2.5259386632956864,
"learning_rate": 1.6851786067030535e-06,
"loss": 0.2001,
"step": 2265
},
{
"epoch": 0.5947760582985343,
"grad_norm": 4.776608491229436,
"learning_rate": 1.6835988908904437e-06,
"loss": 0.3169,
"step": 2270
},
{
"epoch": 0.5960861377220994,
"grad_norm": 2.473422082161941,
"learning_rate": 1.6820159661132763e-06,
"loss": 0.2355,
"step": 2275
},
{
"epoch": 0.5973962171456645,
"grad_norm": 3.0778943971784165,
"learning_rate": 1.6804298398021501e-06,
"loss": 0.2308,
"step": 2280
},
{
"epoch": 0.5987062965692295,
"grad_norm": 3.5878133464375646,
"learning_rate": 1.6788405194026937e-06,
"loss": 0.2586,
"step": 2285
},
{
"epoch": 0.6000163759927946,
"grad_norm": 3.9888619741299194,
"learning_rate": 1.6772480123755288e-06,
"loss": 0.3039,
"step": 2290
},
{
"epoch": 0.6013264554163597,
"grad_norm": 4.413706147317179,
"learning_rate": 1.6756523261962361e-06,
"loss": 0.3061,
"step": 2295
},
{
"epoch": 0.6026365348399246,
"grad_norm": 6.997366587082385,
"learning_rate": 1.6740534683553197e-06,
"loss": 0.2696,
"step": 2300
},
{
"epoch": 0.6026365348399246,
"eval_accuracy": 0.7272,
"eval_loss": 0.7597007751464844,
"eval_runtime": 141.0016,
"eval_samples_per_second": 8.865,
"eval_steps_per_second": 2.22,
"step": 2300
},
{
"epoch": 0.6039466142634897,
"grad_norm": 5.753280459621215,
"learning_rate": 1.6724514463581727e-06,
"loss": 0.2935,
"step": 2305
},
{
"epoch": 0.6052566936870548,
"grad_norm": 3.6663692285540996,
"learning_rate": 1.6708462677250405e-06,
"loss": 0.2493,
"step": 2310
},
{
"epoch": 0.6065667731106198,
"grad_norm": 4.557277515775418,
"learning_rate": 1.6692379399909876e-06,
"loss": 0.3299,
"step": 2315
},
{
"epoch": 0.6078768525341849,
"grad_norm": 2.960623480418135,
"learning_rate": 1.6676264707058599e-06,
"loss": 0.3056,
"step": 2320
},
{
"epoch": 0.60918693195775,
"grad_norm": 5.423095170753771,
"learning_rate": 1.6660118674342515e-06,
"loss": 0.341,
"step": 2325
},
{
"epoch": 0.610497011381315,
"grad_norm": 2.6332110051555113,
"learning_rate": 1.6643941377554675e-06,
"loss": 0.2743,
"step": 2330
},
{
"epoch": 0.61180709080488,
"grad_norm": 2.2505359636330207,
"learning_rate": 1.6627732892634893e-06,
"loss": 0.2578,
"step": 2335
},
{
"epoch": 0.6131171702284451,
"grad_norm": 4.154284637955047,
"learning_rate": 1.6611493295669386e-06,
"loss": 0.3286,
"step": 2340
},
{
"epoch": 0.6144272496520101,
"grad_norm": 4.577727996526487,
"learning_rate": 1.6595222662890418e-06,
"loss": 0.2868,
"step": 2345
},
{
"epoch": 0.6157373290755752,
"grad_norm": 4.653567093852986,
"learning_rate": 1.657892107067594e-06,
"loss": 0.2551,
"step": 2350
},
{
"epoch": 0.6170474084991403,
"grad_norm": 4.19440150981268,
"learning_rate": 1.6562588595549235e-06,
"loss": 0.2847,
"step": 2355
},
{
"epoch": 0.6183574879227053,
"grad_norm": 4.490108240777686,
"learning_rate": 1.654622531417856e-06,
"loss": 0.319,
"step": 2360
},
{
"epoch": 0.6196675673462704,
"grad_norm": 3.896691949712352,
"learning_rate": 1.6529831303376787e-06,
"loss": 0.2833,
"step": 2365
},
{
"epoch": 0.6209776467698355,
"grad_norm": 4.158481682638508,
"learning_rate": 1.651340664010102e-06,
"loss": 0.2759,
"step": 2370
},
{
"epoch": 0.6222877261934004,
"grad_norm": 2.4673087844419337,
"learning_rate": 1.6496951401452272e-06,
"loss": 0.2068,
"step": 2375
},
{
"epoch": 0.6235978056169655,
"grad_norm": 4.873523893306045,
"learning_rate": 1.6480465664675078e-06,
"loss": 0.2822,
"step": 2380
},
{
"epoch": 0.6249078850405306,
"grad_norm": 4.513011014916301,
"learning_rate": 1.6463949507157131e-06,
"loss": 0.311,
"step": 2385
},
{
"epoch": 0.6262179644640956,
"grad_norm": 4.41040904711801,
"learning_rate": 1.644740300642894e-06,
"loss": 0.2894,
"step": 2390
},
{
"epoch": 0.6275280438876607,
"grad_norm": 7.808507472292027,
"learning_rate": 1.6430826240163436e-06,
"loss": 0.3345,
"step": 2395
},
{
"epoch": 0.6288381233112258,
"grad_norm": 2.615243925080016,
"learning_rate": 1.6414219286175635e-06,
"loss": 0.2465,
"step": 2400
},
{
"epoch": 0.6288381233112258,
"eval_accuracy": 0.7408,
"eval_loss": 0.7380235195159912,
"eval_runtime": 136.2377,
"eval_samples_per_second": 9.175,
"eval_steps_per_second": 2.297,
"step": 2400
},
{
"epoch": 0.6301482027347908,
"grad_norm": 3.567322311755078,
"learning_rate": 1.639758222242225e-06,
"loss": 0.2349,
"step": 2405
},
{
"epoch": 0.6314582821583559,
"grad_norm": 5.503722999681643,
"learning_rate": 1.638091512700135e-06,
"loss": 0.2486,
"step": 2410
},
{
"epoch": 0.632768361581921,
"grad_norm": 5.060199812880774,
"learning_rate": 1.6364218078151963e-06,
"loss": 0.3254,
"step": 2415
},
{
"epoch": 0.6340784410054859,
"grad_norm": 2.967797600289104,
"learning_rate": 1.6347491154253738e-06,
"loss": 0.3049,
"step": 2420
},
{
"epoch": 0.635388520429051,
"grad_norm": 3.6123691523694297,
"learning_rate": 1.6330734433826562e-06,
"loss": 0.3079,
"step": 2425
},
{
"epoch": 0.636698599852616,
"grad_norm": 2.520240254040695,
"learning_rate": 1.6313947995530187e-06,
"loss": 0.2677,
"step": 2430
},
{
"epoch": 0.6380086792761811,
"grad_norm": 3.700192416490241,
"learning_rate": 1.6297131918163874e-06,
"loss": 0.2393,
"step": 2435
},
{
"epoch": 0.6393187586997462,
"grad_norm": 3.4585291941554797,
"learning_rate": 1.6280286280666011e-06,
"loss": 0.253,
"step": 2440
},
{
"epoch": 0.6406288381233112,
"grad_norm": 5.9407205540242884,
"learning_rate": 1.6263411162113752e-06,
"loss": 0.2991,
"step": 2445
},
{
"epoch": 0.6419389175468763,
"grad_norm": 3.7257997296487546,
"learning_rate": 1.624650664172264e-06,
"loss": 0.3,
"step": 2450
},
{
"epoch": 0.6432489969704414,
"grad_norm": 7.091164226358698,
"learning_rate": 1.6229572798846233e-06,
"loss": 0.2964,
"step": 2455
},
{
"epoch": 0.6445590763940063,
"grad_norm": 3.945065293498854,
"learning_rate": 1.6212609712975746e-06,
"loss": 0.3003,
"step": 2460
},
{
"epoch": 0.6458691558175714,
"grad_norm": 4.547862400421276,
"learning_rate": 1.6195617463739657e-06,
"loss": 0.312,
"step": 2465
},
{
"epoch": 0.6471792352411365,
"grad_norm": 3.792620720497921,
"learning_rate": 1.6178596130903343e-06,
"loss": 0.2689,
"step": 2470
},
{
"epoch": 0.6484893146647015,
"grad_norm": 4.549047060259805,
"learning_rate": 1.6161545794368712e-06,
"loss": 0.3019,
"step": 2475
},
{
"epoch": 0.6497993940882666,
"grad_norm": 2.3008964624889114,
"learning_rate": 1.614446653417382e-06,
"loss": 0.2427,
"step": 2480
},
{
"epoch": 0.6511094735118317,
"grad_norm": 3.448390528740768,
"learning_rate": 1.6127358430492496e-06,
"loss": 0.2733,
"step": 2485
},
{
"epoch": 0.6524195529353967,
"grad_norm": 3.115685764931888,
"learning_rate": 1.6110221563633966e-06,
"loss": 0.2813,
"step": 2490
},
{
"epoch": 0.6537296323589618,
"grad_norm": 2.7637030640180056,
"learning_rate": 1.6093056014042476e-06,
"loss": 0.316,
"step": 2495
},
{
"epoch": 0.6550397117825268,
"grad_norm": 2.8028721777171763,
"learning_rate": 1.6075861862296918e-06,
"loss": 0.2465,
"step": 2500
},
{
"epoch": 0.6550397117825268,
"eval_accuracy": 0.7504,
"eval_loss": 0.7594350576400757,
"eval_runtime": 136.5188,
"eval_samples_per_second": 9.156,
"eval_steps_per_second": 2.293,
"step": 2500
},
{
"epoch": 0.6563497912060918,
"grad_norm": 2.201406591183069,
"learning_rate": 1.6058639189110448e-06,
"loss": 0.2579,
"step": 2505
},
{
"epoch": 0.6576598706296569,
"grad_norm": 2.743123705211622,
"learning_rate": 1.6041388075330104e-06,
"loss": 0.2671,
"step": 2510
},
{
"epoch": 0.658969950053222,
"grad_norm": 3.35926627410109,
"learning_rate": 1.6024108601936441e-06,
"loss": 0.2722,
"step": 2515
},
{
"epoch": 0.660280029476787,
"grad_norm": 4.177142712614172,
"learning_rate": 1.600680085004313e-06,
"loss": 0.255,
"step": 2520
},
{
"epoch": 0.6615901089003521,
"grad_norm": 5.607535570859494,
"learning_rate": 1.5989464900896584e-06,
"loss": 0.2808,
"step": 2525
},
{
"epoch": 0.6629001883239172,
"grad_norm": 4.111155214094161,
"learning_rate": 1.5972100835875596e-06,
"loss": 0.2749,
"step": 2530
},
{
"epoch": 0.6642102677474822,
"grad_norm": 3.852938068841201,
"learning_rate": 1.5954708736490927e-06,
"loss": 0.374,
"step": 2535
},
{
"epoch": 0.6655203471710472,
"grad_norm": 4.998114409517782,
"learning_rate": 1.5937288684384948e-06,
"loss": 0.2988,
"step": 2540
},
{
"epoch": 0.6668304265946123,
"grad_norm": 3.2234837676131036,
"learning_rate": 1.5919840761331233e-06,
"loss": 0.2926,
"step": 2545
},
{
"epoch": 0.6681405060181773,
"grad_norm": 2.6128145021675135,
"learning_rate": 1.59023650492342e-06,
"loss": 0.2685,
"step": 2550
},
{
"epoch": 0.6694505854417424,
"grad_norm": 2.746049149593303,
"learning_rate": 1.588486163012871e-06,
"loss": 0.276,
"step": 2555
},
{
"epoch": 0.6707606648653075,
"grad_norm": 12.230075077506218,
"learning_rate": 1.5867330586179692e-06,
"loss": 0.3356,
"step": 2560
},
{
"epoch": 0.6720707442888725,
"grad_norm": 3.609150884738277,
"learning_rate": 1.5849771999681744e-06,
"loss": 0.2876,
"step": 2565
},
{
"epoch": 0.6733808237124376,
"grad_norm": 3.696663482780853,
"learning_rate": 1.583218595305876e-06,
"loss": 0.2801,
"step": 2570
},
{
"epoch": 0.6746909031360027,
"grad_norm": 2.4314233621566674,
"learning_rate": 1.5814572528863537e-06,
"loss": 0.246,
"step": 2575
},
{
"epoch": 0.6760009825595676,
"grad_norm": 3.410781191475223,
"learning_rate": 1.5796931809777387e-06,
"loss": 0.2854,
"step": 2580
},
{
"epoch": 0.6773110619831327,
"grad_norm": 7.965165743041762,
"learning_rate": 1.5779263878609752e-06,
"loss": 0.3286,
"step": 2585
},
{
"epoch": 0.6786211414066978,
"grad_norm": 2.8880233729881333,
"learning_rate": 1.5761568818297814e-06,
"loss": 0.3273,
"step": 2590
},
{
"epoch": 0.6799312208302628,
"grad_norm": 4.279782991468597,
"learning_rate": 1.5743846711906103e-06,
"loss": 0.2907,
"step": 2595
},
{
"epoch": 0.6812413002538279,
"grad_norm": 4.080828216313647,
"learning_rate": 1.5726097642626112e-06,
"loss": 0.3034,
"step": 2600
},
{
"epoch": 0.6812413002538279,
"eval_accuracy": 0.7576,
"eval_loss": 0.6795002818107605,
"eval_runtime": 138.009,
"eval_samples_per_second": 9.057,
"eval_steps_per_second": 2.268,
"step": 2600
},
{
"epoch": 0.682551379677393,
"grad_norm": 3.704947001007602,
"learning_rate": 1.5708321693775901e-06,
"loss": 0.2779,
"step": 2605
},
{
"epoch": 0.683861459100958,
"grad_norm": 3.7633025557802045,
"learning_rate": 1.569051894879971e-06,
"loss": 0.2513,
"step": 2610
},
{
"epoch": 0.6851715385245231,
"grad_norm": 4.533908569894921,
"learning_rate": 1.5672689491267565e-06,
"loss": 0.2519,
"step": 2615
},
{
"epoch": 0.6864816179480882,
"grad_norm": 7.825654578480919,
"learning_rate": 1.5654833404874889e-06,
"loss": 0.3064,
"step": 2620
},
{
"epoch": 0.6877916973716531,
"grad_norm": 6.701186216511913,
"learning_rate": 1.5636950773442107e-06,
"loss": 0.2888,
"step": 2625
},
{
"epoch": 0.6891017767952182,
"grad_norm": 5.08064257662279,
"learning_rate": 1.5619041680914244e-06,
"loss": 0.2841,
"step": 2630
},
{
"epoch": 0.6904118562187833,
"grad_norm": 3.3431125663033403,
"learning_rate": 1.560110621136055e-06,
"loss": 0.33,
"step": 2635
},
{
"epoch": 0.6917219356423483,
"grad_norm": 3.348150218520968,
"learning_rate": 1.5583144448974092e-06,
"loss": 0.2425,
"step": 2640
},
{
"epoch": 0.6930320150659134,
"grad_norm": 2.6434575784544485,
"learning_rate": 1.556515647807136e-06,
"loss": 0.2892,
"step": 2645
},
{
"epoch": 0.6943420944894785,
"grad_norm": 3.4776108132319514,
"learning_rate": 1.5547142383091868e-06,
"loss": 0.2468,
"step": 2650
},
{
"epoch": 0.6956521739130435,
"grad_norm": 4.831568463736758,
"learning_rate": 1.5529102248597772e-06,
"loss": 0.2789,
"step": 2655
},
{
"epoch": 0.6969622533366086,
"grad_norm": 2.093319788857653,
"learning_rate": 1.5511036159273452e-06,
"loss": 0.287,
"step": 2660
},
{
"epoch": 0.6982723327601735,
"grad_norm": 3.5718053321237213,
"learning_rate": 1.5492944199925133e-06,
"loss": 0.2576,
"step": 2665
},
{
"epoch": 0.6995824121837386,
"grad_norm": 2.6423916949191173,
"learning_rate": 1.5474826455480486e-06,
"loss": 0.3232,
"step": 2670
},
{
"epoch": 0.7008924916073037,
"grad_norm": 2.4967928837848996,
"learning_rate": 1.5456683010988203e-06,
"loss": 0.2656,
"step": 2675
},
{
"epoch": 0.7022025710308687,
"grad_norm": 2.280284760261213,
"learning_rate": 1.5438513951617637e-06,
"loss": 0.223,
"step": 2680
},
{
"epoch": 0.7035126504544338,
"grad_norm": 3.9018364135960497,
"learning_rate": 1.5420319362658373e-06,
"loss": 0.2352,
"step": 2685
},
{
"epoch": 0.7048227298779989,
"grad_norm": 4.281704624864632,
"learning_rate": 1.5402099329519845e-06,
"loss": 0.2683,
"step": 2690
},
{
"epoch": 0.7061328093015639,
"grad_norm": 6.450930230248805,
"learning_rate": 1.5383853937730916e-06,
"loss": 0.2804,
"step": 2695
},
{
"epoch": 0.707442888725129,
"grad_norm": 3.6301375705851835,
"learning_rate": 1.53655832729395e-06,
"loss": 0.256,
"step": 2700
},
{
"epoch": 0.707442888725129,
"eval_accuracy": 0.7624,
"eval_loss": 0.7787925004959106,
"eval_runtime": 138.9559,
"eval_samples_per_second": 8.996,
"eval_steps_per_second": 2.253,
"step": 2700
},
{
"epoch": 0.708752968148694,
"grad_norm": 4.1331580241606725,
"learning_rate": 1.534728742091214e-06,
"loss": 0.3178,
"step": 2705
},
{
"epoch": 0.710063047572259,
"grad_norm": 4.1609025912552005,
"learning_rate": 1.532896646753362e-06,
"loss": 0.2764,
"step": 2710
},
{
"epoch": 0.7113731269958241,
"grad_norm": 2.4782882085210884,
"learning_rate": 1.5310620498806548e-06,
"loss": 0.2497,
"step": 2715
},
{
"epoch": 0.7126832064193892,
"grad_norm": 4.503219440050312,
"learning_rate": 1.5292249600850966e-06,
"loss": 0.2618,
"step": 2720
},
{
"epoch": 0.7139932858429542,
"grad_norm": 4.86090545111869,
"learning_rate": 1.5273853859903935e-06,
"loss": 0.2522,
"step": 2725
},
{
"epoch": 0.7153033652665193,
"grad_norm": 4.018354852882808,
"learning_rate": 1.525543336231914e-06,
"loss": 0.3052,
"step": 2730
},
{
"epoch": 0.7166134446900844,
"grad_norm": 4.797568374404226,
"learning_rate": 1.5236988194566469e-06,
"loss": 0.3183,
"step": 2735
},
{
"epoch": 0.7179235241136493,
"grad_norm": 4.8386270061207055,
"learning_rate": 1.5218518443231628e-06,
"loss": 0.2763,
"step": 2740
},
{
"epoch": 0.7192336035372144,
"grad_norm": 4.215400128326543,
"learning_rate": 1.5200024195015719e-06,
"loss": 0.2661,
"step": 2745
},
{
"epoch": 0.7205436829607795,
"grad_norm": 4.56588429028685,
"learning_rate": 1.5181505536734835e-06,
"loss": 0.283,
"step": 2750
},
{
"epoch": 0.7218537623843445,
"grad_norm": 6.619608414847504,
"learning_rate": 1.5162962555319664e-06,
"loss": 0.271,
"step": 2755
},
{
"epoch": 0.7231638418079096,
"grad_norm": 2.4274939604447385,
"learning_rate": 1.5144395337815063e-06,
"loss": 0.313,
"step": 2760
},
{
"epoch": 0.7244739212314747,
"grad_norm": 5.626984953335138,
"learning_rate": 1.5125803971379665e-06,
"loss": 0.2866,
"step": 2765
},
{
"epoch": 0.7257840006550397,
"grad_norm": 4.285823933441923,
"learning_rate": 1.5107188543285454e-06,
"loss": 0.2603,
"step": 2770
},
{
"epoch": 0.7270940800786048,
"grad_norm": 4.38863656110863,
"learning_rate": 1.5088549140917381e-06,
"loss": 0.3184,
"step": 2775
},
{
"epoch": 0.7284041595021699,
"grad_norm": 2.9328782019117465,
"learning_rate": 1.506988585177292e-06,
"loss": 0.2389,
"step": 2780
},
{
"epoch": 0.7297142389257348,
"grad_norm": 3.483606480673357,
"learning_rate": 1.505119876346168e-06,
"loss": 0.276,
"step": 2785
},
{
"epoch": 0.7310243183492999,
"grad_norm": 4.504190498010961,
"learning_rate": 1.5032487963705003e-06,
"loss": 0.1977,
"step": 2790
},
{
"epoch": 0.732334397772865,
"grad_norm": 4.184926339697806,
"learning_rate": 1.5013753540335517e-06,
"loss": 0.2972,
"step": 2795
},
{
"epoch": 0.73364447719643,
"grad_norm": 3.8006093754774195,
"learning_rate": 1.499499558129676e-06,
"loss": 0.2776,
"step": 2800
},
{
"epoch": 0.73364447719643,
"eval_accuracy": 0.7504,
"eval_loss": 0.7540197372436523,
"eval_runtime": 142.6507,
"eval_samples_per_second": 8.763,
"eval_steps_per_second": 2.194,
"step": 2800
},
{
"epoch": 0.7349545566199951,
"grad_norm": 3.614639963171112,
"learning_rate": 1.497621417464274e-06,
"loss": 0.2199,
"step": 2805
},
{
"epoch": 0.7362646360435602,
"grad_norm": 3.4753724939982367,
"learning_rate": 1.4957409408537535e-06,
"loss": 0.2842,
"step": 2810
},
{
"epoch": 0.7375747154671252,
"grad_norm": 2.829347092202445,
"learning_rate": 1.493858137125489e-06,
"loss": 0.2054,
"step": 2815
},
{
"epoch": 0.7388847948906903,
"grad_norm": 4.607528640210262,
"learning_rate": 1.4919730151177773e-06,
"loss": 0.2488,
"step": 2820
},
{
"epoch": 0.7401948743142553,
"grad_norm": 4.424154310853472,
"learning_rate": 1.4900855836797995e-06,
"loss": 0.3079,
"step": 2825
},
{
"epoch": 0.7415049537378203,
"grad_norm": 2.8286263481877434,
"learning_rate": 1.4881958516715757e-06,
"loss": 0.267,
"step": 2830
},
{
"epoch": 0.7428150331613854,
"grad_norm": 4.694968243877861,
"learning_rate": 1.4863038279639268e-06,
"loss": 0.2903,
"step": 2835
},
{
"epoch": 0.7441251125849505,
"grad_norm": 3.672549062096689,
"learning_rate": 1.4844095214384309e-06,
"loss": 0.2583,
"step": 2840
},
{
"epoch": 0.7454351920085155,
"grad_norm": 3.4410420172535887,
"learning_rate": 1.4825129409873822e-06,
"loss": 0.3213,
"step": 2845
},
{
"epoch": 0.7467452714320806,
"grad_norm": 3.4101727347068382,
"learning_rate": 1.4806140955137495e-06,
"loss": 0.2537,
"step": 2850
},
{
"epoch": 0.7480553508556457,
"grad_norm": 4.519383622184218,
"learning_rate": 1.4787129939311337e-06,
"loss": 0.2929,
"step": 2855
},
{
"epoch": 0.7493654302792107,
"grad_norm": 3.4774712459804404,
"learning_rate": 1.4768096451637272e-06,
"loss": 0.2682,
"step": 2860
},
{
"epoch": 0.7506755097027757,
"grad_norm": 2.6479749188555575,
"learning_rate": 1.4749040581462694e-06,
"loss": 0.2519,
"step": 2865
},
{
"epoch": 0.7519855891263408,
"grad_norm": 4.16913561566471,
"learning_rate": 1.4729962418240086e-06,
"loss": 0.2619,
"step": 2870
},
{
"epoch": 0.7532956685499058,
"grad_norm": 2.0678666370348324,
"learning_rate": 1.471086205152657e-06,
"loss": 0.319,
"step": 2875
},
{
"epoch": 0.7546057479734709,
"grad_norm": 2.976384517477917,
"learning_rate": 1.469173957098349e-06,
"loss": 0.3259,
"step": 2880
},
{
"epoch": 0.755915827397036,
"grad_norm": 3.658100772623381,
"learning_rate": 1.4672595066376015e-06,
"loss": 0.2506,
"step": 2885
},
{
"epoch": 0.757225906820601,
"grad_norm": 6.101815938265203,
"learning_rate": 1.4653428627572674e-06,
"loss": 0.2655,
"step": 2890
},
{
"epoch": 0.7585359862441661,
"grad_norm": 2.8143348607782337,
"learning_rate": 1.4634240344544988e-06,
"loss": 0.2684,
"step": 2895
},
{
"epoch": 0.759846065667731,
"grad_norm": 2.105144871048026,
"learning_rate": 1.4615030307366998e-06,
"loss": 0.2804,
"step": 2900
},
{
"epoch": 0.759846065667731,
"eval_accuracy": 0.748,
"eval_loss": 0.7601897716522217,
"eval_runtime": 139.2011,
"eval_samples_per_second": 8.98,
"eval_steps_per_second": 2.249,
"step": 2900
},
{
"epoch": 0.7611561450912961,
"grad_norm": 3.998393869040855,
"learning_rate": 1.459579860621488e-06,
"loss": 0.2674,
"step": 2905
},
{
"epoch": 0.7624662245148612,
"grad_norm": 2.2268642108185053,
"learning_rate": 1.4576545331366488e-06,
"loss": 0.2702,
"step": 2910
},
{
"epoch": 0.7637763039384262,
"grad_norm": 6.218849876814229,
"learning_rate": 1.4557270573200962e-06,
"loss": 0.2864,
"step": 2915
},
{
"epoch": 0.7650863833619913,
"grad_norm": 4.323897051065836,
"learning_rate": 1.4537974422198285e-06,
"loss": 0.2636,
"step": 2920
},
{
"epoch": 0.7663964627855564,
"grad_norm": 5.750558481033307,
"learning_rate": 1.451865696893886e-06,
"loss": 0.2319,
"step": 2925
},
{
"epoch": 0.7677065422091214,
"grad_norm": 6.231068587651604,
"learning_rate": 1.4499318304103097e-06,
"loss": 0.2912,
"step": 2930
},
{
"epoch": 0.7690166216326865,
"grad_norm": 3.8621492036274545,
"learning_rate": 1.447995851847096e-06,
"loss": 0.2594,
"step": 2935
},
{
"epoch": 0.7703267010562516,
"grad_norm": 4.8680458967049285,
"learning_rate": 1.4460577702921577e-06,
"loss": 0.2787,
"step": 2940
},
{
"epoch": 0.7716367804798165,
"grad_norm": 2.277649540761437,
"learning_rate": 1.4441175948432784e-06,
"loss": 0.2722,
"step": 2945
},
{
"epoch": 0.7729468599033816,
"grad_norm": 3.335412545156817,
"learning_rate": 1.4421753346080714e-06,
"loss": 0.2614,
"step": 2950
},
{
"epoch": 0.7742569393269467,
"grad_norm": 3.194957371530983,
"learning_rate": 1.4402309987039365e-06,
"loss": 0.3021,
"step": 2955
},
{
"epoch": 0.7755670187505117,
"grad_norm": 4.092383074593162,
"learning_rate": 1.4382845962580165e-06,
"loss": 0.2532,
"step": 2960
},
{
"epoch": 0.7768770981740768,
"grad_norm": 2.9160857878852595,
"learning_rate": 1.436336136407156e-06,
"loss": 0.3102,
"step": 2965
},
{
"epoch": 0.7781871775976419,
"grad_norm": 2.6117926566384377,
"learning_rate": 1.4343856282978565e-06,
"loss": 0.2532,
"step": 2970
},
{
"epoch": 0.7794972570212069,
"grad_norm": 2.514471217178964,
"learning_rate": 1.4324330810862354e-06,
"loss": 0.2709,
"step": 2975
},
{
"epoch": 0.780807336444772,
"grad_norm": 2.391957064033006,
"learning_rate": 1.430478503937981e-06,
"loss": 0.2655,
"step": 2980
},
{
"epoch": 0.782117415868337,
"grad_norm": 8.037406108008932,
"learning_rate": 1.4285219060283119e-06,
"loss": 0.3229,
"step": 2985
},
{
"epoch": 0.783427495291902,
"grad_norm": 2.024818465241261,
"learning_rate": 1.4265632965419311e-06,
"loss": 0.2476,
"step": 2990
},
{
"epoch": 0.7847375747154671,
"grad_norm": 4.525940151913456,
"learning_rate": 1.4246026846729864e-06,
"loss": 0.2801,
"step": 2995
},
{
"epoch": 0.7860476541390322,
"grad_norm": 5.616203546892296,
"learning_rate": 1.422640079625023e-06,
"loss": 0.2893,
"step": 3000
},
{
"epoch": 0.7860476541390322,
"eval_accuracy": 0.7448,
"eval_loss": 0.8274851441383362,
"eval_runtime": 139.0761,
"eval_samples_per_second": 8.988,
"eval_steps_per_second": 2.251,
"step": 3000
},
{
"epoch": 0.7873577335625972,
"grad_norm": 3.641490270919579,
"learning_rate": 1.420675490610944e-06,
"loss": 0.2927,
"step": 3005
},
{
"epoch": 0.7886678129861623,
"grad_norm": 2.8550119558130884,
"learning_rate": 1.418708926852965e-06,
"loss": 0.2525,
"step": 3010
},
{
"epoch": 0.7899778924097274,
"grad_norm": 3.081716905966947,
"learning_rate": 1.4167403975825726e-06,
"loss": 0.2494,
"step": 3015
},
{
"epoch": 0.7912879718332924,
"grad_norm": 4.676494817613609,
"learning_rate": 1.4147699120404775e-06,
"loss": 0.2858,
"step": 3020
},
{
"epoch": 0.7925980512568574,
"grad_norm": 1.9991869906134794,
"learning_rate": 1.4127974794765764e-06,
"loss": 0.2937,
"step": 3025
},
{
"epoch": 0.7939081306804225,
"grad_norm": 2.6233135206447398,
"learning_rate": 1.410823109149904e-06,
"loss": 0.2932,
"step": 3030
},
{
"epoch": 0.7952182101039875,
"grad_norm": 2.2451673120874176,
"learning_rate": 1.408846810328592e-06,
"loss": 0.2594,
"step": 3035
},
{
"epoch": 0.7965282895275526,
"grad_norm": 2.7009560292084336,
"learning_rate": 1.4068685922898244e-06,
"loss": 0.3115,
"step": 3040
},
{
"epoch": 0.7978383689511177,
"grad_norm": 2.433826115411649,
"learning_rate": 1.4048884643197947e-06,
"loss": 0.268,
"step": 3045
},
{
"epoch": 0.7991484483746827,
"grad_norm": 3.5050872305744663,
"learning_rate": 1.4029064357136626e-06,
"loss": 0.266,
"step": 3050
},
{
"epoch": 0.8004585277982478,
"grad_norm": 3.1713770891943462,
"learning_rate": 1.4009225157755085e-06,
"loss": 0.2807,
"step": 3055
},
{
"epoch": 0.8017686072218129,
"grad_norm": 3.3467606419772697,
"learning_rate": 1.3989367138182924e-06,
"loss": 0.2641,
"step": 3060
},
{
"epoch": 0.8030786866453778,
"grad_norm": 3.1069787059795946,
"learning_rate": 1.396949039163808e-06,
"loss": 0.277,
"step": 3065
},
{
"epoch": 0.8043887660689429,
"grad_norm": 4.763920028748174,
"learning_rate": 1.3949595011426407e-06,
"loss": 0.2625,
"step": 3070
},
{
"epoch": 0.805698845492508,
"grad_norm": 2.8182907657903606,
"learning_rate": 1.392968109094122e-06,
"loss": 0.2487,
"step": 3075
},
{
"epoch": 0.807008924916073,
"grad_norm": 2.72524792612828,
"learning_rate": 1.3909748723662871e-06,
"loss": 0.2513,
"step": 3080
},
{
"epoch": 0.8083190043396381,
"grad_norm": 3.3492249191737677,
"learning_rate": 1.3889798003158312e-06,
"loss": 0.2844,
"step": 3085
},
{
"epoch": 0.8096290837632032,
"grad_norm": 4.142567741582466,
"learning_rate": 1.3869829023080636e-06,
"loss": 0.2978,
"step": 3090
},
{
"epoch": 0.8109391631867682,
"grad_norm": 4.435078753474168,
"learning_rate": 1.384984187716866e-06,
"loss": 0.217,
"step": 3095
},
{
"epoch": 0.8122492426103333,
"grad_norm": 6.294316726095601,
"learning_rate": 1.3829836659246473e-06,
"loss": 0.3141,
"step": 3100
},
{
"epoch": 0.8122492426103333,
"eval_accuracy": 0.7392,
"eval_loss": 0.7475783824920654,
"eval_runtime": 139.032,
"eval_samples_per_second": 8.991,
"eval_steps_per_second": 2.251,
"step": 3100
},
{
"epoch": 0.8135593220338984,
"grad_norm": 2.2935780929180707,
"learning_rate": 1.3809813463222995e-06,
"loss": 0.2432,
"step": 3105
},
{
"epoch": 0.8148694014574633,
"grad_norm": 3.0405694359199367,
"learning_rate": 1.3789772383091542e-06,
"loss": 0.234,
"step": 3110
},
{
"epoch": 0.8161794808810284,
"grad_norm": 2.6036641756583974,
"learning_rate": 1.3769713512929384e-06,
"loss": 0.2513,
"step": 3115
},
{
"epoch": 0.8174895603045935,
"grad_norm": 3.0854001671426166,
"learning_rate": 1.37496369468973e-06,
"loss": 0.3248,
"step": 3120
},
{
"epoch": 0.8187996397281585,
"grad_norm": 2.7741805327089253,
"learning_rate": 1.3729542779239133e-06,
"loss": 0.2183,
"step": 3125
},
{
"epoch": 0.8201097191517236,
"grad_norm": 3.154383132924369,
"learning_rate": 1.370943110428136e-06,
"loss": 0.2318,
"step": 3130
},
{
"epoch": 0.8214197985752886,
"grad_norm": 2.728987342059349,
"learning_rate": 1.3689302016432628e-06,
"loss": 0.2505,
"step": 3135
},
{
"epoch": 0.8227298779988537,
"grad_norm": 4.260828009892338,
"learning_rate": 1.3669155610183336e-06,
"loss": 0.2859,
"step": 3140
},
{
"epoch": 0.8240399574224188,
"grad_norm": 4.698119734994208,
"learning_rate": 1.364899198010518e-06,
"loss": 0.3126,
"step": 3145
},
{
"epoch": 0.8253500368459837,
"grad_norm": 2.9266861851773336,
"learning_rate": 1.3628811220850703e-06,
"loss": 0.2524,
"step": 3150
},
{
"epoch": 0.8266601162695488,
"grad_norm": 3.3660376635311744,
"learning_rate": 1.3608613427152854e-06,
"loss": 0.26,
"step": 3155
},
{
"epoch": 0.8279701956931139,
"grad_norm": 2.7452509590309027,
"learning_rate": 1.358839869382455e-06,
"loss": 0.2787,
"step": 3160
},
{
"epoch": 0.8292802751166789,
"grad_norm": 2.2871077739708348,
"learning_rate": 1.356816711575823e-06,
"loss": 0.2774,
"step": 3165
},
{
"epoch": 0.830590354540244,
"grad_norm": 2.353498321089704,
"learning_rate": 1.3547918787925392e-06,
"loss": 0.1922,
"step": 3170
},
{
"epoch": 0.8319004339638091,
"grad_norm": 2.8043846357680895,
"learning_rate": 1.352765380537618e-06,
"loss": 0.2457,
"step": 3175
},
{
"epoch": 0.8332105133873741,
"grad_norm": 6.287288851930004,
"learning_rate": 1.3507372263238901e-06,
"loss": 0.2882,
"step": 3180
},
{
"epoch": 0.8345205928109392,
"grad_norm": 5.293879458072892,
"learning_rate": 1.3487074256719608e-06,
"loss": 0.2908,
"step": 3185
},
{
"epoch": 0.8358306722345042,
"grad_norm": 3.70662303230532,
"learning_rate": 1.3466759881101637e-06,
"loss": 0.2343,
"step": 3190
},
{
"epoch": 0.8371407516580692,
"grad_norm": 5.617247281731303,
"learning_rate": 1.344642923174517e-06,
"loss": 0.3469,
"step": 3195
},
{
"epoch": 0.8384508310816343,
"grad_norm": 5.831422377330226,
"learning_rate": 1.3426082404086772e-06,
"loss": 0.3464,
"step": 3200
},
{
"epoch": 0.8384508310816343,
"eval_accuracy": 0.7464,
"eval_loss": 0.6823216080665588,
"eval_runtime": 137.8591,
"eval_samples_per_second": 9.067,
"eval_steps_per_second": 2.27,
"step": 3200
},
{
"epoch": 0.8397609105051994,
"grad_norm": 5.229782066538766,
"learning_rate": 1.3405719493638959e-06,
"loss": 0.2926,
"step": 3205
},
{
"epoch": 0.8410709899287644,
"grad_norm": 1.8257926894676517,
"learning_rate": 1.3385340595989738e-06,
"loss": 0.2532,
"step": 3210
},
{
"epoch": 0.8423810693523295,
"grad_norm": 2.1256870704370434,
"learning_rate": 1.3364945806802173e-06,
"loss": 0.2456,
"step": 3215
},
{
"epoch": 0.8436911487758946,
"grad_norm": 2.7507619238311065,
"learning_rate": 1.3344535221813915e-06,
"loss": 0.2556,
"step": 3220
},
{
"epoch": 0.8450012281994596,
"grad_norm": 3.313724807442175,
"learning_rate": 1.3324108936836775e-06,
"loss": 0.2604,
"step": 3225
},
{
"epoch": 0.8463113076230246,
"grad_norm": 3.2583479898589385,
"learning_rate": 1.330366704775625e-06,
"loss": 0.2566,
"step": 3230
},
{
"epoch": 0.8476213870465897,
"grad_norm": 4.463854161721075,
"learning_rate": 1.3283209650531098e-06,
"loss": 0.3077,
"step": 3235
},
{
"epoch": 0.8489314664701547,
"grad_norm": 2.7758922633618868,
"learning_rate": 1.326273684119287e-06,
"loss": 0.2555,
"step": 3240
},
{
"epoch": 0.8502415458937198,
"grad_norm": 6.067311625889626,
"learning_rate": 1.3242248715845468e-06,
"loss": 0.3606,
"step": 3245
},
{
"epoch": 0.8515516253172849,
"grad_norm": 2.7176945371959658,
"learning_rate": 1.3221745370664689e-06,
"loss": 0.2035,
"step": 3250
},
{
"epoch": 0.8528617047408499,
"grad_norm": 5.014963951139648,
"learning_rate": 1.3201226901897773e-06,
"loss": 0.3122,
"step": 3255
},
{
"epoch": 0.854171784164415,
"grad_norm": 3.1409689213203262,
"learning_rate": 1.318069340586296e-06,
"loss": 0.2756,
"step": 3260
},
{
"epoch": 0.8554818635879801,
"grad_norm": 2.6726747362164613,
"learning_rate": 1.316014497894902e-06,
"loss": 0.2037,
"step": 3265
},
{
"epoch": 0.856791943011545,
"grad_norm": 4.397327066527509,
"learning_rate": 1.3139581717614822e-06,
"loss": 0.2166,
"step": 3270
},
{
"epoch": 0.8581020224351101,
"grad_norm": 5.515919631566852,
"learning_rate": 1.311900371838887e-06,
"loss": 0.3015,
"step": 3275
},
{
"epoch": 0.8594121018586752,
"grad_norm": 2.8756311830991206,
"learning_rate": 1.3098411077868846e-06,
"loss": 0.2597,
"step": 3280
},
{
"epoch": 0.8607221812822402,
"grad_norm": 5.970001237473167,
"learning_rate": 1.3077803892721166e-06,
"loss": 0.2328,
"step": 3285
},
{
"epoch": 0.8620322607058053,
"grad_norm": 5.921050843170067,
"learning_rate": 1.3057182259680517e-06,
"loss": 0.235,
"step": 3290
},
{
"epoch": 0.8633423401293704,
"grad_norm": 3.740850134478578,
"learning_rate": 1.3036546275549416e-06,
"loss": 0.2827,
"step": 3295
},
{
"epoch": 0.8646524195529354,
"grad_norm": 3.5537486719044873,
"learning_rate": 1.3015896037197737e-06,
"loss": 0.2382,
"step": 3300
},
{
"epoch": 0.8646524195529354,
"eval_accuracy": 0.7336,
"eval_loss": 0.791848361492157,
"eval_runtime": 137.8386,
"eval_samples_per_second": 9.069,
"eval_steps_per_second": 2.271,
"step": 3300
},
{
"epoch": 0.8659624989765005,
"grad_norm": 3.7119905199979706,
"learning_rate": 1.2995231641562276e-06,
"loss": 0.255,
"step": 3305
},
{
"epoch": 0.8672725784000656,
"grad_norm": 3.383107638588926,
"learning_rate": 1.2974553185646275e-06,
"loss": 0.2459,
"step": 3310
},
{
"epoch": 0.8685826578236305,
"grad_norm": 3.48789577540892,
"learning_rate": 1.295386076651899e-06,
"loss": 0.2969,
"step": 3315
},
{
"epoch": 0.8698927372471956,
"grad_norm": 3.9640319112857556,
"learning_rate": 1.2933154481315219e-06,
"loss": 0.2857,
"step": 3320
},
{
"epoch": 0.8712028166707607,
"grad_norm": 3.1582073938250077,
"learning_rate": 1.2912434427234841e-06,
"loss": 0.254,
"step": 3325
},
{
"epoch": 0.8725128960943257,
"grad_norm": 2.276234259584371,
"learning_rate": 1.289170070154239e-06,
"loss": 0.2445,
"step": 3330
},
{
"epoch": 0.8738229755178908,
"grad_norm": 5.00541090993625,
"learning_rate": 1.2870953401566555e-06,
"loss": 0.2843,
"step": 3335
},
{
"epoch": 0.8751330549414559,
"grad_norm": 3.159470503361849,
"learning_rate": 1.285019262469976e-06,
"loss": 0.2521,
"step": 3340
},
{
"epoch": 0.8764431343650209,
"grad_norm": 5.2597276914314435,
"learning_rate": 1.282941846839769e-06,
"loss": 0.2499,
"step": 3345
},
{
"epoch": 0.877753213788586,
"grad_norm": 3.595455023325043,
"learning_rate": 1.2808631030178834e-06,
"loss": 0.2818,
"step": 3350
},
{
"epoch": 0.8790632932121509,
"grad_norm": 3.5145264435052934,
"learning_rate": 1.278783040762403e-06,
"loss": 0.3035,
"step": 3355
},
{
"epoch": 0.880373372635716,
"grad_norm": 1.9251613140804913,
"learning_rate": 1.2767016698376002e-06,
"loss": 0.2244,
"step": 3360
},
{
"epoch": 0.8816834520592811,
"grad_norm": 4.237713316911567,
"learning_rate": 1.2746190000138915e-06,
"loss": 0.2627,
"step": 3365
},
{
"epoch": 0.8829935314828461,
"grad_norm": 2.6561528116215474,
"learning_rate": 1.27253504106779e-06,
"loss": 0.273,
"step": 3370
},
{
"epoch": 0.8843036109064112,
"grad_norm": 3.109317673581231,
"learning_rate": 1.2704498027818603e-06,
"loss": 0.2651,
"step": 3375
},
{
"epoch": 0.8856136903299763,
"grad_norm": 2.0153319806341403,
"learning_rate": 1.2683632949446726e-06,
"loss": 0.2476,
"step": 3380
},
{
"epoch": 0.8869237697535413,
"grad_norm": 5.576097183261757,
"learning_rate": 1.266275527350757e-06,
"loss": 0.235,
"step": 3385
},
{
"epoch": 0.8882338491771063,
"grad_norm": 3.7365883269489784,
"learning_rate": 1.2641865098005564e-06,
"loss": 0.2446,
"step": 3390
},
{
"epoch": 0.8895439286006714,
"grad_norm": 2.5559798536789153,
"learning_rate": 1.2620962521003824e-06,
"loss": 0.2616,
"step": 3395
},
{
"epoch": 0.8908540080242364,
"grad_norm": 3.7982545775172545,
"learning_rate": 1.260004764062367e-06,
"loss": 0.3298,
"step": 3400
},
{
"epoch": 0.8908540080242364,
"eval_accuracy": 0.7328,
"eval_loss": 0.8060081601142883,
"eval_runtime": 137.0834,
"eval_samples_per_second": 9.119,
"eval_steps_per_second": 2.283,
"step": 3400
},
{
"epoch": 0.8921640874478015,
"grad_norm": 3.7011223316651165,
"learning_rate": 1.2579120555044183e-06,
"loss": 0.2734,
"step": 3405
},
{
"epoch": 0.8934741668713666,
"grad_norm": 3.1941707471283496,
"learning_rate": 1.2558181362501733e-06,
"loss": 0.2535,
"step": 3410
},
{
"epoch": 0.8947842462949316,
"grad_norm": 2.791095240654008,
"learning_rate": 1.2537230161289536e-06,
"loss": 0.264,
"step": 3415
},
{
"epoch": 0.8960943257184967,
"grad_norm": 3.6185337902204244,
"learning_rate": 1.2516267049757156e-06,
"loss": 0.2472,
"step": 3420
},
{
"epoch": 0.8974044051420618,
"grad_norm": 3.85855921456429,
"learning_rate": 1.249529212631009e-06,
"loss": 0.3052,
"step": 3425
},
{
"epoch": 0.8987144845656267,
"grad_norm": 3.6343729382527146,
"learning_rate": 1.247430548940927e-06,
"loss": 0.2441,
"step": 3430
},
{
"epoch": 0.9000245639891918,
"grad_norm": 3.2162815125864164,
"learning_rate": 1.2453307237570617e-06,
"loss": 0.2659,
"step": 3435
},
{
"epoch": 0.9013346434127569,
"grad_norm": 3.1960651267976896,
"learning_rate": 1.2432297469364569e-06,
"loss": 0.2555,
"step": 3440
},
{
"epoch": 0.9026447228363219,
"grad_norm": 3.523205821552062,
"learning_rate": 1.2411276283415638e-06,
"loss": 0.2867,
"step": 3445
},
{
"epoch": 0.903954802259887,
"grad_norm": 3.455735398408258,
"learning_rate": 1.2390243778401927e-06,
"loss": 0.2998,
"step": 3450
},
{
"epoch": 0.9052648816834521,
"grad_norm": 2.6520324497661822,
"learning_rate": 1.2369200053054663e-06,
"loss": 0.2581,
"step": 3455
},
{
"epoch": 0.9065749611070171,
"grad_norm": 2.2579348470775655,
"learning_rate": 1.2348145206157758e-06,
"loss": 0.2196,
"step": 3460
},
{
"epoch": 0.9078850405305822,
"grad_norm": 6.681015179121082,
"learning_rate": 1.232707933654732e-06,
"loss": 0.3075,
"step": 3465
},
{
"epoch": 0.9091951199541473,
"grad_norm": 4.24745491945866,
"learning_rate": 1.2306002543111215e-06,
"loss": 0.2822,
"step": 3470
},
{
"epoch": 0.9105051993777122,
"grad_norm": 2.769756448881865,
"learning_rate": 1.2284914924788568e-06,
"loss": 0.2628,
"step": 3475
},
{
"epoch": 0.9118152788012773,
"grad_norm": 2.5369009573995407,
"learning_rate": 1.2263816580569333e-06,
"loss": 0.2338,
"step": 3480
},
{
"epoch": 0.9131253582248424,
"grad_norm": 3.165436055957326,
"learning_rate": 1.224270760949381e-06,
"loss": 0.3067,
"step": 3485
},
{
"epoch": 0.9144354376484074,
"grad_norm": 3.351223367009085,
"learning_rate": 1.2221588110652183e-06,
"loss": 0.3004,
"step": 3490
},
{
"epoch": 0.9157455170719725,
"grad_norm": 3.3328215960308305,
"learning_rate": 1.220045818318406e-06,
"loss": 0.2857,
"step": 3495
},
{
"epoch": 0.9170555964955376,
"grad_norm": 5.5246139033024635,
"learning_rate": 1.2179317926277987e-06,
"loss": 0.2715,
"step": 3500
},
{
"epoch": 0.9170555964955376,
"eval_accuracy": 0.7488,
"eval_loss": 0.7673412561416626,
"eval_runtime": 137.1036,
"eval_samples_per_second": 9.117,
"eval_steps_per_second": 2.283,
"step": 3500
},
{
"epoch": 0.9183656759191026,
"grad_norm": 2.8554155032311366,
"learning_rate": 1.2158167439171026e-06,
"loss": 0.2767,
"step": 3505
},
{
"epoch": 0.9196757553426677,
"grad_norm": 3.134856252459499,
"learning_rate": 1.2137006821148234e-06,
"loss": 0.296,
"step": 3510
},
{
"epoch": 0.9209858347662327,
"grad_norm": 3.993316808863827,
"learning_rate": 1.2115836171542243e-06,
"loss": 0.3058,
"step": 3515
},
{
"epoch": 0.9222959141897977,
"grad_norm": 3.939068373566409,
"learning_rate": 1.2094655589732773e-06,
"loss": 0.2605,
"step": 3520
},
{
"epoch": 0.9236059936133628,
"grad_norm": 2.3253660721101825,
"learning_rate": 1.2073465175146159e-06,
"loss": 0.2342,
"step": 3525
},
{
"epoch": 0.9249160730369279,
"grad_norm": 4.9098246531853675,
"learning_rate": 1.2052265027254904e-06,
"loss": 0.2824,
"step": 3530
},
{
"epoch": 0.9262261524604929,
"grad_norm": 4.153863783851212,
"learning_rate": 1.203105524557719e-06,
"loss": 0.2884,
"step": 3535
},
{
"epoch": 0.927536231884058,
"grad_norm": 2.8597337529967786,
"learning_rate": 1.2009835929676435e-06,
"loss": 0.2527,
"step": 3540
},
{
"epoch": 0.9288463113076231,
"grad_norm": 3.7214300730819603,
"learning_rate": 1.19886071791608e-06,
"loss": 0.2642,
"step": 3545
},
{
"epoch": 0.930156390731188,
"grad_norm": 2.2141165902052826,
"learning_rate": 1.196736909368275e-06,
"loss": 0.1855,
"step": 3550
},
{
"epoch": 0.9314664701547531,
"grad_norm": 4.015370663796761,
"learning_rate": 1.1946121772938554e-06,
"loss": 0.2747,
"step": 3555
},
{
"epoch": 0.9327765495783182,
"grad_norm": 2.7879854190831366,
"learning_rate": 1.1924865316667839e-06,
"loss": 0.2768,
"step": 3560
},
{
"epoch": 0.9340866290018832,
"grad_norm": 10.049756094566998,
"learning_rate": 1.190359982465312e-06,
"loss": 0.2436,
"step": 3565
},
{
"epoch": 0.9353967084254483,
"grad_norm": 3.644259827300173,
"learning_rate": 1.1882325396719323e-06,
"loss": 0.2508,
"step": 3570
},
{
"epoch": 0.9367067878490134,
"grad_norm": 3.612418668345567,
"learning_rate": 1.1861042132733328e-06,
"loss": 0.269,
"step": 3575
},
{
"epoch": 0.9380168672725784,
"grad_norm": 3.7040505997852735,
"learning_rate": 1.1839750132603486e-06,
"loss": 0.2481,
"step": 3580
},
{
"epoch": 0.9393269466961435,
"grad_norm": 4.288075933572893,
"learning_rate": 1.1818449496279159e-06,
"loss": 0.2708,
"step": 3585
},
{
"epoch": 0.9406370261197085,
"grad_norm": 2.940632362279229,
"learning_rate": 1.1797140323750249e-06,
"loss": 0.2669,
"step": 3590
},
{
"epoch": 0.9419471055432735,
"grad_norm": 3.7558809010488394,
"learning_rate": 1.1775822715046736e-06,
"loss": 0.2544,
"step": 3595
},
{
"epoch": 0.9432571849668386,
"grad_norm": 3.414999704185975,
"learning_rate": 1.175449677023819e-06,
"loss": 0.2229,
"step": 3600
},
{
"epoch": 0.9432571849668386,
"eval_accuracy": 0.7704,
"eval_loss": 0.7259599566459656,
"eval_runtime": 137.2184,
"eval_samples_per_second": 9.11,
"eval_steps_per_second": 2.281,
"step": 3600
},
{
"epoch": 0.9445672643904036,
"grad_norm": 3.8559190297316244,
"learning_rate": 1.173316258943332e-06,
"loss": 0.2573,
"step": 3605
},
{
"epoch": 0.9458773438139687,
"grad_norm": 3.862447153928952,
"learning_rate": 1.1711820272779497e-06,
"loss": 0.2706,
"step": 3610
},
{
"epoch": 0.9471874232375338,
"grad_norm": 3.11275213461627,
"learning_rate": 1.1690469920462276e-06,
"loss": 0.242,
"step": 3615
},
{
"epoch": 0.9484975026610988,
"grad_norm": 2.7086439962254265,
"learning_rate": 1.166911163270494e-06,
"loss": 0.2639,
"step": 3620
},
{
"epoch": 0.9498075820846639,
"grad_norm": 2.7066495290470205,
"learning_rate": 1.1647745509768025e-06,
"loss": 0.2526,
"step": 3625
},
{
"epoch": 0.951117661508229,
"grad_norm": 3.9839085079101735,
"learning_rate": 1.1626371651948836e-06,
"loss": 0.2045,
"step": 3630
},
{
"epoch": 0.9524277409317939,
"grad_norm": 4.411550498275121,
"learning_rate": 1.1604990159580998e-06,
"loss": 0.2613,
"step": 3635
},
{
"epoch": 0.953737820355359,
"grad_norm": 3.830942956470282,
"learning_rate": 1.1583601133033973e-06,
"loss": 0.3089,
"step": 3640
},
{
"epoch": 0.9550478997789241,
"grad_norm": 4.664311559495326,
"learning_rate": 1.1562204672712583e-06,
"loss": 0.2669,
"step": 3645
},
{
"epoch": 0.9563579792024891,
"grad_norm": 2.3992079061884515,
"learning_rate": 1.1540800879056554e-06,
"loss": 0.2524,
"step": 3650
},
{
"epoch": 0.9576680586260542,
"grad_norm": 2.8182995787216627,
"learning_rate": 1.1519389852540032e-06,
"loss": 0.2641,
"step": 3655
},
{
"epoch": 0.9589781380496193,
"grad_norm": 4.0781942860428675,
"learning_rate": 1.1497971693671113e-06,
"loss": 0.2646,
"step": 3660
},
{
"epoch": 0.9602882174731843,
"grad_norm": 4.213894313755887,
"learning_rate": 1.147654650299138e-06,
"loss": 0.239,
"step": 3665
},
{
"epoch": 0.9615982968967494,
"grad_norm": 2.8090197504712737,
"learning_rate": 1.1455114381075423e-06,
"loss": 0.2587,
"step": 3670
},
{
"epoch": 0.9629083763203145,
"grad_norm": 3.858779958604277,
"learning_rate": 1.1433675428530366e-06,
"loss": 0.2865,
"step": 3675
},
{
"epoch": 0.9642184557438794,
"grad_norm": 3.865527824121575,
"learning_rate": 1.14122297459954e-06,
"loss": 0.2328,
"step": 3680
},
{
"epoch": 0.9655285351674445,
"grad_norm": 1.6264730559645344,
"learning_rate": 1.1390777434141306e-06,
"loss": 0.2631,
"step": 3685
},
{
"epoch": 0.9668386145910096,
"grad_norm": 3.1179917555516834,
"learning_rate": 1.1369318593669988e-06,
"loss": 0.2577,
"step": 3690
},
{
"epoch": 0.9681486940145746,
"grad_norm": 4.097463115035321,
"learning_rate": 1.1347853325313993e-06,
"loss": 0.2727,
"step": 3695
},
{
"epoch": 0.9694587734381397,
"grad_norm": 2.5547154186995145,
"learning_rate": 1.1326381729836045e-06,
"loss": 0.225,
"step": 3700
},
{
"epoch": 0.9694587734381397,
"eval_accuracy": 0.7512,
"eval_loss": 0.6980738043785095,
"eval_runtime": 138.6178,
"eval_samples_per_second": 9.018,
"eval_steps_per_second": 2.258,
"step": 3700
},
{
"epoch": 0.9707688528617048,
"grad_norm": 2.383269216589878,
"learning_rate": 1.1304903908028569e-06,
"loss": 0.2568,
"step": 3705
},
{
"epoch": 0.9720789322852698,
"grad_norm": 4.510012606255057,
"learning_rate": 1.1283419960713212e-06,
"loss": 0.3083,
"step": 3710
},
{
"epoch": 0.9733890117088349,
"grad_norm": 2.280997080205734,
"learning_rate": 1.126192998874038e-06,
"loss": 0.2523,
"step": 3715
},
{
"epoch": 0.9746990911323999,
"grad_norm": 6.594544711402591,
"learning_rate": 1.1240434092988764e-06,
"loss": 0.2407,
"step": 3720
},
{
"epoch": 0.9760091705559649,
"grad_norm": 4.135213461207541,
"learning_rate": 1.1218932374364855e-06,
"loss": 0.2893,
"step": 3725
},
{
"epoch": 0.97731924997953,
"grad_norm": 2.861868089040975,
"learning_rate": 1.1197424933802485e-06,
"loss": 0.2204,
"step": 3730
},
{
"epoch": 0.9786293294030951,
"grad_norm": 5.266728439797464,
"learning_rate": 1.1175911872262332e-06,
"loss": 0.3179,
"step": 3735
},
{
"epoch": 0.9799394088266601,
"grad_norm": 3.5669822038480405,
"learning_rate": 1.1154393290731483e-06,
"loss": 0.2392,
"step": 3740
},
{
"epoch": 0.9812494882502252,
"grad_norm": 3.360906639915107,
"learning_rate": 1.1132869290222917e-06,
"loss": 0.2802,
"step": 3745
},
{
"epoch": 0.9825595676737903,
"grad_norm": 4.200005976718456,
"learning_rate": 1.111133997177506e-06,
"loss": 0.3154,
"step": 3750
},
{
"epoch": 0.9838696470973552,
"grad_norm": 2.262745756657975,
"learning_rate": 1.1089805436451303e-06,
"loss": 0.2222,
"step": 3755
},
{
"epoch": 0.9851797265209203,
"grad_norm": 2.6653368398425723,
"learning_rate": 1.1068265785339518e-06,
"loss": 0.2718,
"step": 3760
},
{
"epoch": 0.9864898059444854,
"grad_norm": 4.296893472198315,
"learning_rate": 1.1046721119551598e-06,
"loss": 0.3262,
"step": 3765
},
{
"epoch": 0.9877998853680504,
"grad_norm": 4.189321257109803,
"learning_rate": 1.1025171540222977e-06,
"loss": 0.2656,
"step": 3770
},
{
"epoch": 0.9891099647916155,
"grad_norm": 4.22478244416407,
"learning_rate": 1.1003617148512149e-06,
"loss": 0.2863,
"step": 3775
},
{
"epoch": 0.9904200442151806,
"grad_norm": 3.0639361862726995,
"learning_rate": 1.0982058045600205e-06,
"loss": 0.2578,
"step": 3780
},
{
"epoch": 0.9917301236387456,
"grad_norm": 3.242022108834711,
"learning_rate": 1.0960494332690342e-06,
"loss": 0.2316,
"step": 3785
},
{
"epoch": 0.9930402030623107,
"grad_norm": 3.563673193533161,
"learning_rate": 1.093892611100741e-06,
"loss": 0.2838,
"step": 3790
},
{
"epoch": 0.9943502824858758,
"grad_norm": 4.493703507987354,
"learning_rate": 1.0917353481797412e-06,
"loss": 0.2579,
"step": 3795
},
{
"epoch": 0.9956603619094407,
"grad_norm": 6.8325710898527054,
"learning_rate": 1.089577654632705e-06,
"loss": 0.2317,
"step": 3800
},
{
"epoch": 0.9956603619094407,
"eval_accuracy": 0.7432,
"eval_loss": 0.6873839497566223,
"eval_runtime": 147.4303,
"eval_samples_per_second": 8.479,
"eval_steps_per_second": 2.123,
"step": 3800
},
{
"epoch": 0.9969704413330058,
"grad_norm": 3.8200894747277454,
"learning_rate": 1.0874195405883231e-06,
"loss": 0.2404,
"step": 3805
},
{
"epoch": 0.9982805207565709,
"grad_norm": 2.60177554342581,
"learning_rate": 1.085261016177261e-06,
"loss": 0.2528,
"step": 3810
},
{
"epoch": 0.9995906001801359,
"grad_norm": 2.373688391522338,
"learning_rate": 1.0831020915321109e-06,
"loss": 0.2214,
"step": 3815
},
{
"epoch": 1.000900679603701,
"grad_norm": 2.2734563557794907,
"learning_rate": 1.080942776787342e-06,
"loss": 0.1921,
"step": 3820
},
{
"epoch": 1.002210759027266,
"grad_norm": 2.3069259940839606,
"learning_rate": 1.0787830820792566e-06,
"loss": 0.2056,
"step": 3825
},
{
"epoch": 1.0035208384508312,
"grad_norm": 6.478254156842445,
"learning_rate": 1.0766230175459394e-06,
"loss": 0.1716,
"step": 3830
},
{
"epoch": 1.0048309178743962,
"grad_norm": 2.849832491085133,
"learning_rate": 1.0744625933272118e-06,
"loss": 0.1632,
"step": 3835
},
{
"epoch": 1.0061409972979611,
"grad_norm": 3.009110354194684,
"learning_rate": 1.0723018195645835e-06,
"loss": 0.1915,
"step": 3840
},
{
"epoch": 1.0074510767215263,
"grad_norm": 7.183583057327153,
"learning_rate": 1.070140706401205e-06,
"loss": 0.1776,
"step": 3845
},
{
"epoch": 1.0087611561450913,
"grad_norm": 7.645123151827477,
"learning_rate": 1.0679792639818199e-06,
"loss": 0.2206,
"step": 3850
},
{
"epoch": 1.0100712355686563,
"grad_norm": 2.0069646269660875,
"learning_rate": 1.0658175024527175e-06,
"loss": 0.1073,
"step": 3855
},
{
"epoch": 1.0113813149922215,
"grad_norm": 2.0486815490375325,
"learning_rate": 1.0636554319616853e-06,
"loss": 0.1817,
"step": 3860
},
{
"epoch": 1.0126913944157865,
"grad_norm": 7.2346007294382435,
"learning_rate": 1.0614930626579603e-06,
"loss": 0.2206,
"step": 3865
},
{
"epoch": 1.0140014738393515,
"grad_norm": 4.437819204972086,
"learning_rate": 1.0593304046921838e-06,
"loss": 0.1944,
"step": 3870
},
{
"epoch": 1.0153115532629167,
"grad_norm": 4.917690349323733,
"learning_rate": 1.0571674682163504e-06,
"loss": 0.1716,
"step": 3875
},
{
"epoch": 1.0166216326864816,
"grad_norm": 4.465444787397066,
"learning_rate": 1.0550042633837629e-06,
"loss": 0.1873,
"step": 3880
},
{
"epoch": 1.0179317121100466,
"grad_norm": 5.034120197481139,
"learning_rate": 1.052840800348984e-06,
"loss": 0.1971,
"step": 3885
},
{
"epoch": 1.0192417915336118,
"grad_norm": 4.650074157777241,
"learning_rate": 1.050677089267788e-06,
"loss": 0.1936,
"step": 3890
},
{
"epoch": 1.0205518709571768,
"grad_norm": 2.482941440321424,
"learning_rate": 1.0485131402971142e-06,
"loss": 0.1653,
"step": 3895
},
{
"epoch": 1.0218619503807418,
"grad_norm": 2.852065540223939,
"learning_rate": 1.0463489635950179e-06,
"loss": 0.1846,
"step": 3900
},
{
"epoch": 1.0218619503807418,
"eval_accuracy": 0.7488,
"eval_loss": 0.8795240521430969,
"eval_runtime": 142.5175,
"eval_samples_per_second": 8.771,
"eval_steps_per_second": 2.196,
"step": 3900
},
{
"epoch": 1.023172029804307,
"grad_norm": 1.8616921778346158,
"learning_rate": 1.0441845693206237e-06,
"loss": 0.1646,
"step": 3905
},
{
"epoch": 1.024482109227872,
"grad_norm": 2.432644973681949,
"learning_rate": 1.0420199676340777e-06,
"loss": 0.1653,
"step": 3910
},
{
"epoch": 1.025792188651437,
"grad_norm": 6.272988090712415,
"learning_rate": 1.0398551686964993e-06,
"loss": 0.181,
"step": 3915
},
{
"epoch": 1.0271022680750022,
"grad_norm": 5.836433358300422,
"learning_rate": 1.0376901826699347e-06,
"loss": 0.225,
"step": 3920
},
{
"epoch": 1.0284123474985671,
"grad_norm": 4.8783777162924205,
"learning_rate": 1.0355250197173066e-06,
"loss": 0.193,
"step": 3925
},
{
"epoch": 1.0297224269221321,
"grad_norm": 3.7860774323233537,
"learning_rate": 1.0333596900023702e-06,
"loss": 0.1351,
"step": 3930
},
{
"epoch": 1.031032506345697,
"grad_norm": 3.3891904081042243,
"learning_rate": 1.0311942036896623e-06,
"loss": 0.1365,
"step": 3935
},
{
"epoch": 1.0323425857692623,
"grad_norm": 4.255144512646531,
"learning_rate": 1.0290285709444556e-06,
"loss": 0.1947,
"step": 3940
},
{
"epoch": 1.0336526651928273,
"grad_norm": 3.551773880971244,
"learning_rate": 1.0268628019327088e-06,
"loss": 0.1691,
"step": 3945
},
{
"epoch": 1.0349627446163923,
"grad_norm": 3.21813090542831,
"learning_rate": 1.0246969068210217e-06,
"loss": 0.1839,
"step": 3950
},
{
"epoch": 1.0362728240399575,
"grad_norm": 2.166114985115052,
"learning_rate": 1.022530895776586e-06,
"loss": 0.1386,
"step": 3955
},
{
"epoch": 1.0375829034635224,
"grad_norm": 3.7664288882285573,
"learning_rate": 1.0203647789671364e-06,
"loss": 0.1829,
"step": 3960
},
{
"epoch": 1.0388929828870874,
"grad_norm": 4.568624206840217,
"learning_rate": 1.0181985665609051e-06,
"loss": 0.1606,
"step": 3965
},
{
"epoch": 1.0402030623106526,
"grad_norm": 4.12795033279393,
"learning_rate": 1.0160322687265728e-06,
"loss": 0.2144,
"step": 3970
},
{
"epoch": 1.0415131417342176,
"grad_norm": 6.066279389724659,
"learning_rate": 1.013865895633221e-06,
"loss": 0.153,
"step": 3975
},
{
"epoch": 1.0428232211577826,
"grad_norm": 5.99335728744553,
"learning_rate": 1.0116994574502853e-06,
"loss": 0.1776,
"step": 3980
},
{
"epoch": 1.0441333005813478,
"grad_norm": 3.323982778252669,
"learning_rate": 1.0095329643475056e-06,
"loss": 0.1258,
"step": 3985
},
{
"epoch": 1.0454433800049128,
"grad_norm": 4.0821616859221415,
"learning_rate": 1.0073664264948803e-06,
"loss": 0.141,
"step": 3990
},
{
"epoch": 1.0467534594284778,
"grad_norm": 4.303578582045898,
"learning_rate": 1.005199854062618e-06,
"loss": 0.1888,
"step": 3995
},
{
"epoch": 1.048063538852043,
"grad_norm": 6.9723723361771865,
"learning_rate": 1.0030332572210896e-06,
"loss": 0.1624,
"step": 4000
},
{
"epoch": 1.048063538852043,
"eval_accuracy": 0.748,
"eval_loss": 1.1003224849700928,
"eval_runtime": 138.6989,
"eval_samples_per_second": 9.012,
"eval_steps_per_second": 2.257,
"step": 4000
},
{
"epoch": 1.049373618275608,
"grad_norm": 4.415041744009451,
"learning_rate": 1.00086664614078e-06,
"loss": 0.167,
"step": 4005
},
{
"epoch": 1.050683697699173,
"grad_norm": 1.7135670096102869,
"learning_rate": 9.987000309922417e-07,
"loss": 0.1711,
"step": 4010
},
{
"epoch": 1.0519937771227381,
"grad_norm": 7.478625018041275,
"learning_rate": 9.965334219460455e-07,
"loss": 0.1731,
"step": 4015
},
{
"epoch": 1.053303856546303,
"grad_norm": 2.7314624626438206,
"learning_rate": 9.943668291727344e-07,
"loss": 0.1859,
"step": 4020
},
{
"epoch": 1.054613935969868,
"grad_norm": 5.770554889506936,
"learning_rate": 9.922002628427742e-07,
"loss": 0.1597,
"step": 4025
},
{
"epoch": 1.0559240153934333,
"grad_norm": 3.3610794660047834,
"learning_rate": 9.900337331265077e-07,
"loss": 0.187,
"step": 4030
},
{
"epoch": 1.0572340948169983,
"grad_norm": 5.42834738035381,
"learning_rate": 9.878672501941045e-07,
"loss": 0.1698,
"step": 4035
},
{
"epoch": 1.0585441742405632,
"grad_norm": 2.9976736613034665,
"learning_rate": 9.857008242155152e-07,
"loss": 0.1254,
"step": 4040
},
{
"epoch": 1.0598542536641284,
"grad_norm": 4.911495425759969,
"learning_rate": 9.83534465360423e-07,
"loss": 0.136,
"step": 4045
},
{
"epoch": 1.0611643330876934,
"grad_norm": 3.02741302534027,
"learning_rate": 9.813681837981966e-07,
"loss": 0.1938,
"step": 4050
},
{
"epoch": 1.0624744125112584,
"grad_norm": 11.241222743138435,
"learning_rate": 9.792019896978412e-07,
"loss": 0.1745,
"step": 4055
},
{
"epoch": 1.0637844919348236,
"grad_norm": 3.613601159548242,
"learning_rate": 9.77035893227951e-07,
"loss": 0.1792,
"step": 4060
},
{
"epoch": 1.0650945713583886,
"grad_norm": 2.560108792032753,
"learning_rate": 9.748699045566625e-07,
"loss": 0.173,
"step": 4065
},
{
"epoch": 1.0664046507819536,
"grad_norm": 4.2940503388019495,
"learning_rate": 9.727040338516066e-07,
"loss": 0.1496,
"step": 4070
},
{
"epoch": 1.0677147302055188,
"grad_norm": 4.675724374175031,
"learning_rate": 9.705382912798596e-07,
"loss": 0.2138,
"step": 4075
},
{
"epoch": 1.0690248096290837,
"grad_norm": 5.628200856430463,
"learning_rate": 9.683726870078971e-07,
"loss": 0.2194,
"step": 4080
},
{
"epoch": 1.0703348890526487,
"grad_norm": 5.2076046235205125,
"learning_rate": 9.662072312015445e-07,
"loss": 0.2401,
"step": 4085
},
{
"epoch": 1.071644968476214,
"grad_norm": 1.9418784608600421,
"learning_rate": 9.640419340259311e-07,
"loss": 0.1514,
"step": 4090
},
{
"epoch": 1.072955047899779,
"grad_norm": 7.120141596864124,
"learning_rate": 9.618768056454415e-07,
"loss": 0.157,
"step": 4095
},
{
"epoch": 1.074265127323344,
"grad_norm": 3.705216890309473,
"learning_rate": 9.597118562236679e-07,
"loss": 0.1456,
"step": 4100
},
{
"epoch": 1.074265127323344,
"eval_accuracy": 0.7608,
"eval_loss": 0.9698547124862671,
"eval_runtime": 139.0187,
"eval_samples_per_second": 8.992,
"eval_steps_per_second": 2.251,
"step": 4100
},
{
"epoch": 1.075575206746909,
"grad_norm": 3.085456133975227,
"learning_rate": 9.575470959233612e-07,
"loss": 0.1856,
"step": 4105
},
{
"epoch": 1.076885286170474,
"grad_norm": 1.823222530695256,
"learning_rate": 9.553825349063864e-07,
"loss": 0.1667,
"step": 4110
},
{
"epoch": 1.078195365594039,
"grad_norm": 3.7383958453668096,
"learning_rate": 9.532181833336721e-07,
"loss": 0.1391,
"step": 4115
},
{
"epoch": 1.0795054450176043,
"grad_norm": 3.8448773992307514,
"learning_rate": 9.510540513651637e-07,
"loss": 0.1542,
"step": 4120
},
{
"epoch": 1.0808155244411692,
"grad_norm": 4.78668688388694,
"learning_rate": 9.488901491597761e-07,
"loss": 0.1696,
"step": 4125
},
{
"epoch": 1.0821256038647342,
"grad_norm": 3.2055677447348923,
"learning_rate": 9.46726486875345e-07,
"loss": 0.2188,
"step": 4130
},
{
"epoch": 1.0834356832882994,
"grad_norm": 5.891308978767123,
"learning_rate": 9.445630746685806e-07,
"loss": 0.1885,
"step": 4135
},
{
"epoch": 1.0847457627118644,
"grad_norm": 4.8421132337125545,
"learning_rate": 9.423999226950185e-07,
"loss": 0.1609,
"step": 4140
},
{
"epoch": 1.0860558421354294,
"grad_norm": 5.9154276009344136,
"learning_rate": 9.402370411089732e-07,
"loss": 0.1527,
"step": 4145
},
{
"epoch": 1.0873659215589946,
"grad_norm": 7.1723306345970865,
"learning_rate": 9.380744400634903e-07,
"loss": 0.1594,
"step": 4150
},
{
"epoch": 1.0886760009825596,
"grad_norm": 3.6603352773642106,
"learning_rate": 9.35912129710297e-07,
"loss": 0.1706,
"step": 4155
},
{
"epoch": 1.0899860804061245,
"grad_norm": 1.8190671058733112,
"learning_rate": 9.337501201997573e-07,
"loss": 0.1687,
"step": 4160
},
{
"epoch": 1.0912961598296897,
"grad_norm": 6.103828451150774,
"learning_rate": 9.315884216808226e-07,
"loss": 0.1543,
"step": 4165
},
{
"epoch": 1.0926062392532547,
"grad_norm": 3.51452822849322,
"learning_rate": 9.294270443009847e-07,
"loss": 0.168,
"step": 4170
},
{
"epoch": 1.0939163186768197,
"grad_norm": 3.9256660164579267,
"learning_rate": 9.27265998206227e-07,
"loss": 0.1447,
"step": 4175
},
{
"epoch": 1.095226398100385,
"grad_norm": 6.647259650468985,
"learning_rate": 9.251052935409783e-07,
"loss": 0.219,
"step": 4180
},
{
"epoch": 1.09653647752395,
"grad_norm": 2.8205270117655767,
"learning_rate": 9.229449404480653e-07,
"loss": 0.1496,
"step": 4185
},
{
"epoch": 1.0978465569475149,
"grad_norm": 6.128991520594209,
"learning_rate": 9.207849490686636e-07,
"loss": 0.2047,
"step": 4190
},
{
"epoch": 1.09915663637108,
"grad_norm": 4.140511690244823,
"learning_rate": 9.186253295422514e-07,
"loss": 0.2245,
"step": 4195
},
{
"epoch": 1.100466715794645,
"grad_norm": 2.0987997572217303,
"learning_rate": 9.1646609200656e-07,
"loss": 0.1966,
"step": 4200
},
{
"epoch": 1.100466715794645,
"eval_accuracy": 0.7504,
"eval_loss": 0.9615470767021179,
"eval_runtime": 145.8533,
"eval_samples_per_second": 8.57,
"eval_steps_per_second": 2.146,
"step": 4200
},
{
"epoch": 1.10177679521821,
"grad_norm": 6.7544463505960195,
"learning_rate": 9.14307246597529e-07,
"loss": 0.1695,
"step": 4205
},
{
"epoch": 1.1030868746417752,
"grad_norm": 3.881058148850053,
"learning_rate": 9.121488034492568e-07,
"loss": 0.1736,
"step": 4210
},
{
"epoch": 1.1043969540653402,
"grad_norm": 5.101818081783207,
"learning_rate": 9.099907726939533e-07,
"loss": 0.2124,
"step": 4215
},
{
"epoch": 1.1057070334889052,
"grad_norm": 4.13924770593388,
"learning_rate": 9.078331644618934e-07,
"loss": 0.149,
"step": 4220
},
{
"epoch": 1.1070171129124704,
"grad_norm": 4.56496678907466,
"learning_rate": 9.056759888813668e-07,
"loss": 0.1696,
"step": 4225
},
{
"epoch": 1.1083271923360354,
"grad_norm": 6.020881682990206,
"learning_rate": 9.035192560786338e-07,
"loss": 0.2085,
"step": 4230
},
{
"epoch": 1.1096372717596004,
"grad_norm": 3.020245719833497,
"learning_rate": 9.013629761778757e-07,
"loss": 0.1503,
"step": 4235
},
{
"epoch": 1.1109473511831656,
"grad_norm": 5.745920685976253,
"learning_rate": 8.99207159301148e-07,
"loss": 0.1883,
"step": 4240
},
{
"epoch": 1.1122574306067305,
"grad_norm": 6.338038152438336,
"learning_rate": 8.970518155683324e-07,
"loss": 0.1612,
"step": 4245
},
{
"epoch": 1.1135675100302955,
"grad_norm": 4.337625038324552,
"learning_rate": 8.948969550970894e-07,
"loss": 0.1276,
"step": 4250
},
{
"epoch": 1.1148775894538607,
"grad_norm": 5.983331324725646,
"learning_rate": 8.927425880028113e-07,
"loss": 0.1572,
"step": 4255
},
{
"epoch": 1.1161876688774257,
"grad_norm": 5.471826595967007,
"learning_rate": 8.905887243985743e-07,
"loss": 0.1733,
"step": 4260
},
{
"epoch": 1.1174977483009907,
"grad_norm": 9.369736563628495,
"learning_rate": 8.884353743950915e-07,
"loss": 0.1768,
"step": 4265
},
{
"epoch": 1.118807827724556,
"grad_norm": 4.302075004357904,
"learning_rate": 8.862825481006637e-07,
"loss": 0.1676,
"step": 4270
},
{
"epoch": 1.1201179071481209,
"grad_norm": 3.995934439864761,
"learning_rate": 8.841302556211348e-07,
"loss": 0.1556,
"step": 4275
},
{
"epoch": 1.1214279865716859,
"grad_norm": 5.71645655203888,
"learning_rate": 8.81978507059842e-07,
"loss": 0.173,
"step": 4280
},
{
"epoch": 1.122738065995251,
"grad_norm": 3.863778158057957,
"learning_rate": 8.798273125175697e-07,
"loss": 0.1905,
"step": 4285
},
{
"epoch": 1.124048145418816,
"grad_norm": 8.652156181110675,
"learning_rate": 8.776766820925016e-07,
"loss": 0.2137,
"step": 4290
},
{
"epoch": 1.125358224842381,
"grad_norm": 4.502859011012491,
"learning_rate": 8.755266258801725e-07,
"loss": 0.1615,
"step": 4295
},
{
"epoch": 1.1266683042659462,
"grad_norm": 6.956098402947135,
"learning_rate": 8.73377153973423e-07,
"loss": 0.203,
"step": 4300
},
{
"epoch": 1.1266683042659462,
"eval_accuracy": 0.7576,
"eval_loss": 1.0583382844924927,
"eval_runtime": 149.7241,
"eval_samples_per_second": 8.349,
"eval_steps_per_second": 2.091,
"step": 4300
},
{
"epoch": 1.1279783836895112,
"grad_norm": 4.881012007634449,
"learning_rate": 8.712282764623495e-07,
"loss": 0.1625,
"step": 4305
},
{
"epoch": 1.1292884631130762,
"grad_norm": 2.1915201124191523,
"learning_rate": 8.690800034342593e-07,
"loss": 0.1598,
"step": 4310
},
{
"epoch": 1.1305985425366414,
"grad_norm": 6.988605240677998,
"learning_rate": 8.669323449736223e-07,
"loss": 0.1763,
"step": 4315
},
{
"epoch": 1.1319086219602064,
"grad_norm": 8.099073728074591,
"learning_rate": 8.647853111620213e-07,
"loss": 0.2026,
"step": 4320
},
{
"epoch": 1.1332187013837713,
"grad_norm": 6.4285970895379,
"learning_rate": 8.626389120781096e-07,
"loss": 0.1622,
"step": 4325
},
{
"epoch": 1.1345287808073365,
"grad_norm": 2.9631902369392145,
"learning_rate": 8.604931577975591e-07,
"loss": 0.1983,
"step": 4330
},
{
"epoch": 1.1358388602309015,
"grad_norm": 1.8505065261163445,
"learning_rate": 8.583480583930162e-07,
"loss": 0.1276,
"step": 4335
},
{
"epoch": 1.1371489396544665,
"grad_norm": 3.4127484907896175,
"learning_rate": 8.562036239340519e-07,
"loss": 0.1559,
"step": 4340
},
{
"epoch": 1.1384590190780317,
"grad_norm": 4.817799074804773,
"learning_rate": 8.540598644871166e-07,
"loss": 0.2032,
"step": 4345
},
{
"epoch": 1.1397690985015967,
"grad_norm": 3.9208306819766907,
"learning_rate": 8.519167901154915e-07,
"loss": 0.1249,
"step": 4350
},
{
"epoch": 1.1410791779251617,
"grad_norm": 3.258083851351543,
"learning_rate": 8.497744108792429e-07,
"loss": 0.167,
"step": 4355
},
{
"epoch": 1.1423892573487269,
"grad_norm": 5.712624458894274,
"learning_rate": 8.476327368351731e-07,
"loss": 0.1821,
"step": 4360
},
{
"epoch": 1.1436993367722919,
"grad_norm": 5.622711654282518,
"learning_rate": 8.454917780367738e-07,
"loss": 0.1426,
"step": 4365
},
{
"epoch": 1.1450094161958568,
"grad_norm": 2.9714920706791603,
"learning_rate": 8.433515445341798e-07,
"loss": 0.1508,
"step": 4370
},
{
"epoch": 1.146319495619422,
"grad_norm": 8.145950090881742,
"learning_rate": 8.412120463741213e-07,
"loss": 0.1911,
"step": 4375
},
{
"epoch": 1.147629575042987,
"grad_norm": 7.816310979213919,
"learning_rate": 8.390732935998762e-07,
"loss": 0.1972,
"step": 4380
},
{
"epoch": 1.148939654466552,
"grad_norm": 10.997788887744328,
"learning_rate": 8.369352962512241e-07,
"loss": 0.2195,
"step": 4385
},
{
"epoch": 1.1502497338901172,
"grad_norm": 2.7493033712936668,
"learning_rate": 8.347980643643972e-07,
"loss": 0.1853,
"step": 4390
},
{
"epoch": 1.1515598133136822,
"grad_norm": 3.181837621729024,
"learning_rate": 8.326616079720356e-07,
"loss": 0.1779,
"step": 4395
},
{
"epoch": 1.1528698927372472,
"grad_norm": 3.4603211671345084,
"learning_rate": 8.305259371031385e-07,
"loss": 0.1975,
"step": 4400
},
{
"epoch": 1.1528698927372472,
"eval_accuracy": 0.756,
"eval_loss": 0.9896759986877441,
"eval_runtime": 146.8037,
"eval_samples_per_second": 8.515,
"eval_steps_per_second": 2.132,
"step": 4400
},
{
"epoch": 1.1541799721608124,
"grad_norm": 6.84507170824882,
"learning_rate": 8.283910617830185e-07,
"loss": 0.2055,
"step": 4405
},
{
"epoch": 1.1554900515843773,
"grad_norm": 2.9623090673688295,
"learning_rate": 8.262569920332522e-07,
"loss": 0.1344,
"step": 4410
},
{
"epoch": 1.1568001310079423,
"grad_norm": 2.676649590370874,
"learning_rate": 8.241237378716357e-07,
"loss": 0.1341,
"step": 4415
},
{
"epoch": 1.1581102104315075,
"grad_norm": 3.3462527937569284,
"learning_rate": 8.219913093121367e-07,
"loss": 0.1479,
"step": 4420
},
{
"epoch": 1.1594202898550725,
"grad_norm": 4.922809803079367,
"learning_rate": 8.198597163648466e-07,
"loss": 0.1377,
"step": 4425
},
{
"epoch": 1.1607303692786375,
"grad_norm": 7.567987448628956,
"learning_rate": 8.177289690359354e-07,
"loss": 0.2551,
"step": 4430
},
{
"epoch": 1.1620404487022027,
"grad_norm": 4.269347526317818,
"learning_rate": 8.155990773276022e-07,
"loss": 0.1511,
"step": 4435
},
{
"epoch": 1.1633505281257677,
"grad_norm": 5.196415940319941,
"learning_rate": 8.134700512380304e-07,
"loss": 0.2124,
"step": 4440
},
{
"epoch": 1.1646606075493326,
"grad_norm": 4.346263990105768,
"learning_rate": 8.113419007613399e-07,
"loss": 0.1708,
"step": 4445
},
{
"epoch": 1.1659706869728979,
"grad_norm": 4.993849508082759,
"learning_rate": 8.092146358875405e-07,
"loss": 0.147,
"step": 4450
},
{
"epoch": 1.1672807663964628,
"grad_norm": 4.5209534974011945,
"learning_rate": 8.070882666024847e-07,
"loss": 0.1311,
"step": 4455
},
{
"epoch": 1.1685908458200278,
"grad_norm": 9.122486214875625,
"learning_rate": 8.049628028878199e-07,
"loss": 0.179,
"step": 4460
},
{
"epoch": 1.169900925243593,
"grad_norm": 4.302140098325049,
"learning_rate": 8.02838254720944e-07,
"loss": 0.1912,
"step": 4465
},
{
"epoch": 1.171211004667158,
"grad_norm": 5.885855438044166,
"learning_rate": 8.007146320749565e-07,
"loss": 0.209,
"step": 4470
},
{
"epoch": 1.172521084090723,
"grad_norm": 5.113483218057625,
"learning_rate": 7.985919449186122e-07,
"loss": 0.138,
"step": 4475
},
{
"epoch": 1.1738311635142882,
"grad_norm": 7.107404436549728,
"learning_rate": 7.964702032162748e-07,
"loss": 0.1443,
"step": 4480
},
{
"epoch": 1.1751412429378532,
"grad_norm": 5.613324141288739,
"learning_rate": 7.943494169278694e-07,
"loss": 0.1659,
"step": 4485
},
{
"epoch": 1.1764513223614181,
"grad_norm": 7.621228727692981,
"learning_rate": 7.922295960088366e-07,
"loss": 0.2055,
"step": 4490
},
{
"epoch": 1.1777614017849831,
"grad_norm": 5.157419189737596,
"learning_rate": 7.901107504100851e-07,
"loss": 0.1951,
"step": 4495
},
{
"epoch": 1.1790714812085483,
"grad_norm": 6.255459111664694,
"learning_rate": 7.879928900779455e-07,
"loss": 0.1878,
"step": 4500
},
{
"epoch": 1.1790714812085483,
"eval_accuracy": 0.7544,
"eval_loss": 1.0830539464950562,
"eval_runtime": 147.9333,
"eval_samples_per_second": 8.45,
"eval_steps_per_second": 2.116,
"step": 4500
},
{
"epoch": 1.1803815606321133,
"grad_norm": 5.071030425233318,
"learning_rate": 7.858760249541227e-07,
"loss": 0.1376,
"step": 4505
},
{
"epoch": 1.1816916400556783,
"grad_norm": 4.613895284146258,
"learning_rate": 7.837601649756507e-07,
"loss": 0.1871,
"step": 4510
},
{
"epoch": 1.1830017194792435,
"grad_norm": 5.343062025394728,
"learning_rate": 7.816453200748445e-07,
"loss": 0.1557,
"step": 4515
},
{
"epoch": 1.1843117989028085,
"grad_norm": 2.3488119081864878,
"learning_rate": 7.795315001792545e-07,
"loss": 0.1275,
"step": 4520
},
{
"epoch": 1.1856218783263734,
"grad_norm": 7.211896790191278,
"learning_rate": 7.774187152116195e-07,
"loss": 0.1795,
"step": 4525
},
{
"epoch": 1.1869319577499386,
"grad_norm": 6.629116565849999,
"learning_rate": 7.753069750898195e-07,
"loss": 0.1694,
"step": 4530
},
{
"epoch": 1.1882420371735036,
"grad_norm": 6.7686458510507945,
"learning_rate": 7.731962897268304e-07,
"loss": 0.1823,
"step": 4535
},
{
"epoch": 1.1895521165970686,
"grad_norm": 9.499227541023666,
"learning_rate": 7.710866690306767e-07,
"loss": 0.1973,
"step": 4540
},
{
"epoch": 1.1908621960206338,
"grad_norm": 5.305703998887908,
"learning_rate": 7.689781229043852e-07,
"loss": 0.1417,
"step": 4545
},
{
"epoch": 1.1921722754441988,
"grad_norm": 6.807389274265624,
"learning_rate": 7.668706612459386e-07,
"loss": 0.1309,
"step": 4550
},
{
"epoch": 1.1934823548677638,
"grad_norm": 2.37846342555138,
"learning_rate": 7.647642939482276e-07,
"loss": 0.2224,
"step": 4555
},
{
"epoch": 1.194792434291329,
"grad_norm": 7.282824648401145,
"learning_rate": 7.626590308990073e-07,
"loss": 0.1746,
"step": 4560
},
{
"epoch": 1.196102513714894,
"grad_norm": 10.05223632714189,
"learning_rate": 7.605548819808485e-07,
"loss": 0.1777,
"step": 4565
},
{
"epoch": 1.197412593138459,
"grad_norm": 3.5055904042182653,
"learning_rate": 7.584518570710923e-07,
"loss": 0.182,
"step": 4570
},
{
"epoch": 1.1987226725620241,
"grad_norm": 5.58019032163587,
"learning_rate": 7.56349966041803e-07,
"loss": 0.1708,
"step": 4575
},
{
"epoch": 1.2000327519855891,
"grad_norm": 4.895119266207443,
"learning_rate": 7.542492187597227e-07,
"loss": 0.1614,
"step": 4580
},
{
"epoch": 1.201342831409154,
"grad_norm": 6.978583024065426,
"learning_rate": 7.52149625086224e-07,
"loss": 0.1561,
"step": 4585
},
{
"epoch": 1.2026529108327193,
"grad_norm": 6.610019216750305,
"learning_rate": 7.500511948772649e-07,
"loss": 0.1557,
"step": 4590
},
{
"epoch": 1.2039629902562843,
"grad_norm": 8.135947467914335,
"learning_rate": 7.479539379833417e-07,
"loss": 0.1616,
"step": 4595
},
{
"epoch": 1.2052730696798493,
"grad_norm": 3.5915184810255667,
"learning_rate": 7.458578642494417e-07,
"loss": 0.1177,
"step": 4600
},
{
"epoch": 1.2052730696798493,
"eval_accuracy": 0.7656,
"eval_loss": 1.0870707035064697,
"eval_runtime": 143.0548,
"eval_samples_per_second": 8.738,
"eval_steps_per_second": 2.188,
"step": 4600
},
{
"epoch": 1.2065831491034145,
"grad_norm": 8.379654070687748,
"learning_rate": 7.437629835149997e-07,
"loss": 0.1494,
"step": 4605
},
{
"epoch": 1.2078932285269794,
"grad_norm": 5.7531869850518165,
"learning_rate": 7.416693056138496e-07,
"loss": 0.15,
"step": 4610
},
{
"epoch": 1.2092033079505444,
"grad_norm": 4.807799748254105,
"learning_rate": 7.395768403741793e-07,
"loss": 0.1665,
"step": 4615
},
{
"epoch": 1.2105133873741096,
"grad_norm": 5.192570962379375,
"learning_rate": 7.37485597618484e-07,
"loss": 0.1866,
"step": 4620
},
{
"epoch": 1.2118234667976746,
"grad_norm": 7.151185425257774,
"learning_rate": 7.353955871635194e-07,
"loss": 0.1781,
"step": 4625
},
{
"epoch": 1.2131335462212396,
"grad_norm": 7.814709553590773,
"learning_rate": 7.33306818820258e-07,
"loss": 0.1362,
"step": 4630
},
{
"epoch": 1.2144436256448048,
"grad_norm": 4.846484187047676,
"learning_rate": 7.312193023938411e-07,
"loss": 0.1624,
"step": 4635
},
{
"epoch": 1.2157537050683698,
"grad_norm": 4.026530959170853,
"learning_rate": 7.291330476835327e-07,
"loss": 0.1428,
"step": 4640
},
{
"epoch": 1.2170637844919348,
"grad_norm": 4.135011637724928,
"learning_rate": 7.270480644826749e-07,
"loss": 0.1685,
"step": 4645
},
{
"epoch": 1.2183738639155,
"grad_norm": 4.129753177959848,
"learning_rate": 7.249643625786396e-07,
"loss": 0.1385,
"step": 4650
},
{
"epoch": 1.219683943339065,
"grad_norm": 3.5058969060009972,
"learning_rate": 7.228819517527853e-07,
"loss": 0.1573,
"step": 4655
},
{
"epoch": 1.22099402276263,
"grad_norm": 5.141143930253066,
"learning_rate": 7.208008417804097e-07,
"loss": 0.1667,
"step": 4660
},
{
"epoch": 1.2223041021861951,
"grad_norm": 5.713198603618221,
"learning_rate": 7.18721042430704e-07,
"loss": 0.1665,
"step": 4665
},
{
"epoch": 1.22361418160976,
"grad_norm": 6.881035052397601,
"learning_rate": 7.166425634667061e-07,
"loss": 0.0995,
"step": 4670
},
{
"epoch": 1.224924261033325,
"grad_norm": 9.101390365069749,
"learning_rate": 7.14565414645257e-07,
"loss": 0.1738,
"step": 4675
},
{
"epoch": 1.2262343404568903,
"grad_norm": 4.609339453067137,
"learning_rate": 7.124896057169532e-07,
"loss": 0.1568,
"step": 4680
},
{
"epoch": 1.2275444198804553,
"grad_norm": 4.95075728149686,
"learning_rate": 7.104151464261012e-07,
"loss": 0.1443,
"step": 4685
},
{
"epoch": 1.2288544993040202,
"grad_norm": 5.718250117967298,
"learning_rate": 7.083420465106727e-07,
"loss": 0.145,
"step": 4690
},
{
"epoch": 1.2301645787275854,
"grad_norm": 4.025790690698405,
"learning_rate": 7.062703157022571e-07,
"loss": 0.2297,
"step": 4695
},
{
"epoch": 1.2314746581511504,
"grad_norm": 5.85692684018953,
"learning_rate": 7.041999637260179e-07,
"loss": 0.1599,
"step": 4700
},
{
"epoch": 1.2314746581511504,
"eval_accuracy": 0.7528,
"eval_loss": 1.1270846128463745,
"eval_runtime": 144.8062,
"eval_samples_per_second": 8.632,
"eval_steps_per_second": 2.162,
"step": 4700
},
{
"epoch": 1.2327847375747154,
"grad_norm": 3.639396650071254,
"learning_rate": 7.021310003006458e-07,
"loss": 0.1767,
"step": 4705
},
{
"epoch": 1.2340948169982806,
"grad_norm": 4.992262473685122,
"learning_rate": 7.00063435138313e-07,
"loss": 0.1965,
"step": 4710
},
{
"epoch": 1.2354048964218456,
"grad_norm": 4.404778717860058,
"learning_rate": 6.979972779446288e-07,
"loss": 0.1772,
"step": 4715
},
{
"epoch": 1.2367149758454106,
"grad_norm": 6.471861850114097,
"learning_rate": 6.959325384185916e-07,
"loss": 0.1849,
"step": 4720
},
{
"epoch": 1.2380250552689758,
"grad_norm": 4.178248349318392,
"learning_rate": 6.938692262525463e-07,
"loss": 0.1845,
"step": 4725
},
{
"epoch": 1.2393351346925408,
"grad_norm": 3.328304867825855,
"learning_rate": 6.918073511321372e-07,
"loss": 0.1609,
"step": 4730
},
{
"epoch": 1.2406452141161057,
"grad_norm": 6.825787570815529,
"learning_rate": 6.897469227362626e-07,
"loss": 0.2165,
"step": 4735
},
{
"epoch": 1.241955293539671,
"grad_norm": 3.6473987849811573,
"learning_rate": 6.876879507370296e-07,
"loss": 0.1681,
"step": 4740
},
{
"epoch": 1.243265372963236,
"grad_norm": 5.7577805362937395,
"learning_rate": 6.856304447997087e-07,
"loss": 0.1393,
"step": 4745
},
{
"epoch": 1.244575452386801,
"grad_norm": 2.988983071415241,
"learning_rate": 6.835744145826883e-07,
"loss": 0.1293,
"step": 4750
},
{
"epoch": 1.245885531810366,
"grad_norm": 4.094951732166031,
"learning_rate": 6.815198697374295e-07,
"loss": 0.1986,
"step": 4755
},
{
"epoch": 1.247195611233931,
"grad_norm": 5.3643705834327555,
"learning_rate": 6.794668199084211e-07,
"loss": 0.1561,
"step": 4760
},
{
"epoch": 1.248505690657496,
"grad_norm": 6.488709549888938,
"learning_rate": 6.774152747331327e-07,
"loss": 0.1506,
"step": 4765
},
{
"epoch": 1.2498157700810613,
"grad_norm": 8.658130020703828,
"learning_rate": 6.753652438419724e-07,
"loss": 0.1462,
"step": 4770
},
{
"epoch": 1.2511258495046262,
"grad_norm": 3.6729089118316143,
"learning_rate": 6.733167368582387e-07,
"loss": 0.1754,
"step": 4775
},
{
"epoch": 1.2524359289281912,
"grad_norm": 4.412486038113313,
"learning_rate": 6.71269763398077e-07,
"loss": 0.1524,
"step": 4780
},
{
"epoch": 1.2537460083517562,
"grad_norm": 7.05308577872481,
"learning_rate": 6.692243330704345e-07,
"loss": 0.1955,
"step": 4785
},
{
"epoch": 1.2550560877753214,
"grad_norm": 3.707641518167412,
"learning_rate": 6.671804554770134e-07,
"loss": 0.1519,
"step": 4790
},
{
"epoch": 1.2563661671988864,
"grad_norm": 5.375891710975129,
"learning_rate": 6.651381402122279e-07,
"loss": 0.175,
"step": 4795
},
{
"epoch": 1.2576762466224514,
"grad_norm": 5.686007779957575,
"learning_rate": 6.630973968631582e-07,
"loss": 0.1541,
"step": 4800
},
{
"epoch": 1.2576762466224514,
"eval_accuracy": 0.7504,
"eval_loss": 1.1022791862487793,
"eval_runtime": 147.8534,
"eval_samples_per_second": 8.454,
"eval_steps_per_second": 2.117,
"step": 4800
},
{
"epoch": 1.2589863260460166,
"grad_norm": 4.67723472749942,
"learning_rate": 6.610582350095056e-07,
"loss": 0.1378,
"step": 4805
},
{
"epoch": 1.2602964054695815,
"grad_norm": 4.617066533461443,
"learning_rate": 6.590206642235469e-07,
"loss": 0.1512,
"step": 4810
},
{
"epoch": 1.2616064848931465,
"grad_norm": 6.057922955780923,
"learning_rate": 6.569846940700905e-07,
"loss": 0.1826,
"step": 4815
},
{
"epoch": 1.2629165643167117,
"grad_norm": 6.250001047253521,
"learning_rate": 6.549503341064315e-07,
"loss": 0.1458,
"step": 4820
},
{
"epoch": 1.2642266437402767,
"grad_norm": 4.27455152251786,
"learning_rate": 6.529175938823059e-07,
"loss": 0.1333,
"step": 4825
},
{
"epoch": 1.2655367231638417,
"grad_norm": 4.878557343525659,
"learning_rate": 6.508864829398464e-07,
"loss": 0.16,
"step": 4830
},
{
"epoch": 1.266846802587407,
"grad_norm": 5.406320166959371,
"learning_rate": 6.488570108135375e-07,
"loss": 0.1777,
"step": 4835
},
{
"epoch": 1.2681568820109719,
"grad_norm": 2.433179347244675,
"learning_rate": 6.468291870301707e-07,
"loss": 0.1715,
"step": 4840
},
{
"epoch": 1.2694669614345369,
"grad_norm": 9.875918507487867,
"learning_rate": 6.448030211087997e-07,
"loss": 0.1599,
"step": 4845
},
{
"epoch": 1.270777040858102,
"grad_norm": 6.6713245625990245,
"learning_rate": 6.427785225606961e-07,
"loss": 0.1406,
"step": 4850
},
{
"epoch": 1.272087120281667,
"grad_norm": 5.027821637791478,
"learning_rate": 6.40755700889305e-07,
"loss": 0.187,
"step": 4855
},
{
"epoch": 1.273397199705232,
"grad_norm": 2.7556993105477527,
"learning_rate": 6.38734565590198e-07,
"loss": 0.159,
"step": 4860
},
{
"epoch": 1.2747072791287972,
"grad_norm": 5.364669134362788,
"learning_rate": 6.367151261510324e-07,
"loss": 0.2186,
"step": 4865
},
{
"epoch": 1.2760173585523622,
"grad_norm": 3.5785337661276673,
"learning_rate": 6.346973920515039e-07,
"loss": 0.1364,
"step": 4870
},
{
"epoch": 1.2773274379759272,
"grad_norm": 5.414858492393773,
"learning_rate": 6.326813727633034e-07,
"loss": 0.1825,
"step": 4875
},
{
"epoch": 1.2786375173994924,
"grad_norm": 2.4129442164086536,
"learning_rate": 6.306670777500718e-07,
"loss": 0.1197,
"step": 4880
},
{
"epoch": 1.2799475968230574,
"grad_norm": 5.264909047367014,
"learning_rate": 6.286545164673555e-07,
"loss": 0.2254,
"step": 4885
},
{
"epoch": 1.2812576762466223,
"grad_norm": 2.0451302251453956,
"learning_rate": 6.26643698362563e-07,
"loss": 0.1347,
"step": 4890
},
{
"epoch": 1.2825677556701875,
"grad_norm": 3.7829205343945733,
"learning_rate": 6.246346328749199e-07,
"loss": 0.1552,
"step": 4895
},
{
"epoch": 1.2838778350937525,
"grad_norm": 5.594121118174163,
"learning_rate": 6.226273294354247e-07,
"loss": 0.1621,
"step": 4900
},
{
"epoch": 1.2838778350937525,
"eval_accuracy": 0.7496,
"eval_loss": 1.1256372928619385,
"eval_runtime": 143.3507,
"eval_samples_per_second": 8.72,
"eval_steps_per_second": 2.183,
"step": 4900
},
{
"epoch": 1.2851879145173175,
"grad_norm": 5.282433980211254,
"learning_rate": 6.206217974668034e-07,
"loss": 0.1379,
"step": 4905
},
{
"epoch": 1.2864979939408827,
"grad_norm": 3.277253081058706,
"learning_rate": 6.186180463834675e-07,
"loss": 0.1338,
"step": 4910
},
{
"epoch": 1.2878080733644477,
"grad_norm": 6.466429706589474,
"learning_rate": 6.166160855914683e-07,
"loss": 0.1542,
"step": 4915
},
{
"epoch": 1.2891181527880127,
"grad_norm": 4.3752571537021625,
"learning_rate": 6.146159244884533e-07,
"loss": 0.204,
"step": 4920
},
{
"epoch": 1.2904282322115779,
"grad_norm": 9.612960014494304,
"learning_rate": 6.126175724636213e-07,
"loss": 0.1666,
"step": 4925
},
{
"epoch": 1.2917383116351429,
"grad_norm": 7.388843981840605,
"learning_rate": 6.106210388976792e-07,
"loss": 0.1676,
"step": 4930
},
{
"epoch": 1.2930483910587078,
"grad_norm": 5.525130280779438,
"learning_rate": 6.086263331627975e-07,
"loss": 0.1371,
"step": 4935
},
{
"epoch": 1.294358470482273,
"grad_norm": 15.7136065256586,
"learning_rate": 6.066334646225669e-07,
"loss": 0.2647,
"step": 4940
},
{
"epoch": 1.295668549905838,
"grad_norm": 4.509787064035042,
"learning_rate": 6.046424426319534e-07,
"loss": 0.186,
"step": 4945
},
{
"epoch": 1.296978629329403,
"grad_norm": 3.034726602811886,
"learning_rate": 6.026532765372556e-07,
"loss": 0.1689,
"step": 4950
},
{
"epoch": 1.2982887087529682,
"grad_norm": 9.470147103767804,
"learning_rate": 6.006659756760587e-07,
"loss": 0.1738,
"step": 4955
},
{
"epoch": 1.2995987881765332,
"grad_norm": 3.723471628172551,
"learning_rate": 5.986805493771933e-07,
"loss": 0.1699,
"step": 4960
},
{
"epoch": 1.3009088676000982,
"grad_norm": 2.4954433173901998,
"learning_rate": 5.966970069606905e-07,
"loss": 0.1066,
"step": 4965
},
{
"epoch": 1.3022189470236634,
"grad_norm": 3.8824997751755332,
"learning_rate": 5.947153577377372e-07,
"loss": 0.1243,
"step": 4970
},
{
"epoch": 1.3035290264472283,
"grad_norm": 7.155179589969273,
"learning_rate": 5.927356110106335e-07,
"loss": 0.1868,
"step": 4975
},
{
"epoch": 1.3048391058707933,
"grad_norm": 6.426034228727986,
"learning_rate": 5.907577760727491e-07,
"loss": 0.1749,
"step": 4980
},
{
"epoch": 1.3061491852943585,
"grad_norm": 6.259326643585595,
"learning_rate": 5.887818622084792e-07,
"loss": 0.1687,
"step": 4985
},
{
"epoch": 1.3074592647179235,
"grad_norm": 7.118570215949986,
"learning_rate": 5.86807878693201e-07,
"loss": 0.1945,
"step": 4990
},
{
"epoch": 1.3087693441414885,
"grad_norm": 4.470114328159557,
"learning_rate": 5.848358347932305e-07,
"loss": 0.1279,
"step": 4995
},
{
"epoch": 1.3100794235650537,
"grad_norm": 4.518469800001269,
"learning_rate": 5.828657397657775e-07,
"loss": 0.1581,
"step": 5000
},
{
"epoch": 1.3100794235650537,
"eval_accuracy": 0.7664,
"eval_loss": 1.0690715312957764,
"eval_runtime": 138.0186,
"eval_samples_per_second": 9.057,
"eval_steps_per_second": 2.268,
"step": 5000
},
{
"epoch": 1.3113895029886187,
"grad_norm": 3.118035615186372,
"learning_rate": 5.808976028589052e-07,
"loss": 0.148,
"step": 5005
},
{
"epoch": 1.3126995824121837,
"grad_norm": 8.339043807140468,
"learning_rate": 5.789314333114832e-07,
"loss": 0.1599,
"step": 5010
},
{
"epoch": 1.3140096618357489,
"grad_norm": 3.979545884384439,
"learning_rate": 5.769672403531476e-07,
"loss": 0.1862,
"step": 5015
},
{
"epoch": 1.3153197412593138,
"grad_norm": 5.713823007971235,
"learning_rate": 5.750050332042546e-07,
"loss": 0.1493,
"step": 5020
},
{
"epoch": 1.3166298206828788,
"grad_norm": 3.975891070180634,
"learning_rate": 5.730448210758392e-07,
"loss": 0.1615,
"step": 5025
},
{
"epoch": 1.317939900106444,
"grad_norm": 9.895767819873718,
"learning_rate": 5.710866131695707e-07,
"loss": 0.1817,
"step": 5030
},
{
"epoch": 1.319249979530009,
"grad_norm": 3.9920763618019306,
"learning_rate": 5.691304186777112e-07,
"loss": 0.1139,
"step": 5035
},
{
"epoch": 1.320560058953574,
"grad_norm": 4.602626327223196,
"learning_rate": 5.671762467830701e-07,
"loss": 0.1388,
"step": 5040
},
{
"epoch": 1.3218701383771392,
"grad_norm": 3.504924250719407,
"learning_rate": 5.652241066589638e-07,
"loss": 0.1349,
"step": 5045
},
{
"epoch": 1.3231802178007042,
"grad_norm": 3.9727230916218694,
"learning_rate": 5.6327400746917e-07,
"loss": 0.1308,
"step": 5050
},
{
"epoch": 1.3244902972242691,
"grad_norm": 5.966820662181034,
"learning_rate": 5.613259583678855e-07,
"loss": 0.1937,
"step": 5055
},
{
"epoch": 1.3258003766478343,
"grad_norm": 4.982426614770313,
"learning_rate": 5.593799684996851e-07,
"loss": 0.0966,
"step": 5060
},
{
"epoch": 1.3271104560713993,
"grad_norm": 11.15113796656988,
"learning_rate": 5.574360469994755e-07,
"loss": 0.1868,
"step": 5065
},
{
"epoch": 1.3284205354949643,
"grad_norm": 4.032648149967243,
"learning_rate": 5.55494202992455e-07,
"loss": 0.1081,
"step": 5070
},
{
"epoch": 1.3297306149185295,
"grad_norm": 13.254457078365599,
"learning_rate": 5.535544455940685e-07,
"loss": 0.198,
"step": 5075
},
{
"epoch": 1.3310406943420945,
"grad_norm": 6.857347197358272,
"learning_rate": 5.51616783909968e-07,
"loss": 0.1458,
"step": 5080
},
{
"epoch": 1.3323507737656595,
"grad_norm": 4.4329913576651565,
"learning_rate": 5.496812270359651e-07,
"loss": 0.1764,
"step": 5085
},
{
"epoch": 1.3336608531892247,
"grad_norm": 1.9756285446680741,
"learning_rate": 5.477477840579941e-07,
"loss": 0.1328,
"step": 5090
},
{
"epoch": 1.3349709326127897,
"grad_norm": 16.33866926710285,
"learning_rate": 5.458164640520626e-07,
"loss": 0.1688,
"step": 5095
},
{
"epoch": 1.3362810120363546,
"grad_norm": 7.260662473486001,
"learning_rate": 5.438872760842155e-07,
"loss": 0.1475,
"step": 5100
},
{
"epoch": 1.3362810120363546,
"eval_accuracy": 0.7632,
"eval_loss": 1.166494607925415,
"eval_runtime": 139.3544,
"eval_samples_per_second": 8.97,
"eval_steps_per_second": 2.246,
"step": 5100
},
{
"epoch": 1.3375910914599198,
"grad_norm": 9.098282960735217,
"learning_rate": 5.419602292104877e-07,
"loss": 0.2249,
"step": 5105
},
{
"epoch": 1.3389011708834848,
"grad_norm": 14.59920166671128,
"learning_rate": 5.400353324768641e-07,
"loss": 0.2254,
"step": 5110
},
{
"epoch": 1.3402112503070498,
"grad_norm": 5.661845688109474,
"learning_rate": 5.381125949192369e-07,
"loss": 0.1491,
"step": 5115
},
{
"epoch": 1.341521329730615,
"grad_norm": 4.343788864841793,
"learning_rate": 5.361920255633608e-07,
"loss": 0.1416,
"step": 5120
},
{
"epoch": 1.34283140915418,
"grad_norm": 5.455096296340938,
"learning_rate": 5.342736334248142e-07,
"loss": 0.1591,
"step": 5125
},
{
"epoch": 1.344141488577745,
"grad_norm": 6.62802098246343,
"learning_rate": 5.323574275089542e-07,
"loss": 0.1631,
"step": 5130
},
{
"epoch": 1.3454515680013102,
"grad_norm": 5.475172079252899,
"learning_rate": 5.304434168108768e-07,
"loss": 0.1486,
"step": 5135
},
{
"epoch": 1.3467616474248751,
"grad_norm": 3.7823792470345867,
"learning_rate": 5.285316103153703e-07,
"loss": 0.162,
"step": 5140
},
{
"epoch": 1.3480717268484401,
"grad_norm": 3.950455595949725,
"learning_rate": 5.266220169968789e-07,
"loss": 0.1386,
"step": 5145
},
{
"epoch": 1.3493818062720053,
"grad_norm": 7.064128257879523,
"learning_rate": 5.247146458194558e-07,
"loss": 0.1265,
"step": 5150
},
{
"epoch": 1.3506918856955703,
"grad_norm": 3.451315722735635,
"learning_rate": 5.228095057367244e-07,
"loss": 0.1564,
"step": 5155
},
{
"epoch": 1.3520019651191353,
"grad_norm": 6.376390408082125,
"learning_rate": 5.209066056918336e-07,
"loss": 0.1408,
"step": 5160
},
{
"epoch": 1.3533120445427005,
"grad_norm": 10.020189583400871,
"learning_rate": 5.190059546174173e-07,
"loss": 0.1868,
"step": 5165
},
{
"epoch": 1.3546221239662655,
"grad_norm": 10.402422967695797,
"learning_rate": 5.171075614355531e-07,
"loss": 0.1567,
"step": 5170
},
{
"epoch": 1.3559322033898304,
"grad_norm": 6.325865551750877,
"learning_rate": 5.152114350577183e-07,
"loss": 0.1524,
"step": 5175
},
{
"epoch": 1.3572422828133957,
"grad_norm": 9.377120804911332,
"learning_rate": 5.133175843847507e-07,
"loss": 0.2113,
"step": 5180
},
{
"epoch": 1.3585523622369606,
"grad_norm": 5.0552106113831226,
"learning_rate": 5.114260183068043e-07,
"loss": 0.1793,
"step": 5185
},
{
"epoch": 1.3598624416605256,
"grad_norm": 7.793559280526845,
"learning_rate": 5.095367457033091e-07,
"loss": 0.2107,
"step": 5190
},
{
"epoch": 1.3611725210840908,
"grad_norm": 5.256871245122319,
"learning_rate": 5.076497754429286e-07,
"loss": 0.153,
"step": 5195
},
{
"epoch": 1.3624826005076558,
"grad_norm": 7.340368996139302,
"learning_rate": 5.0576511638352e-07,
"loss": 0.1562,
"step": 5200
},
{
"epoch": 1.3624826005076558,
"eval_accuracy": 0.7648,
"eval_loss": 1.0087817907333374,
"eval_runtime": 139.3617,
"eval_samples_per_second": 8.969,
"eval_steps_per_second": 2.246,
"step": 5200
},
{
"epoch": 1.3637926799312208,
"grad_norm": 4.87245584125935,
"learning_rate": 5.03882777372089e-07,
"loss": 0.1542,
"step": 5205
},
{
"epoch": 1.365102759354786,
"grad_norm": 4.198926991817569,
"learning_rate": 5.020027672447531e-07,
"loss": 0.1252,
"step": 5210
},
{
"epoch": 1.366412838778351,
"grad_norm": 6.952393202633123,
"learning_rate": 5.001250948266953e-07,
"loss": 0.1858,
"step": 5215
},
{
"epoch": 1.367722918201916,
"grad_norm": 2.4402795139976936,
"learning_rate": 4.982497689321254e-07,
"loss": 0.139,
"step": 5220
},
{
"epoch": 1.3690329976254811,
"grad_norm": 3.952799970755654,
"learning_rate": 4.963767983642391e-07,
"loss": 0.1942,
"step": 5225
},
{
"epoch": 1.3703430770490461,
"grad_norm": 3.1488846024801855,
"learning_rate": 4.945061919151748e-07,
"loss": 0.1268,
"step": 5230
},
{
"epoch": 1.371653156472611,
"grad_norm": 3.8295556788809533,
"learning_rate": 4.926379583659732e-07,
"loss": 0.1492,
"step": 5235
},
{
"epoch": 1.3729632358961763,
"grad_norm": 5.807740283147101,
"learning_rate": 4.907721064865358e-07,
"loss": 0.1764,
"step": 5240
},
{
"epoch": 1.3742733153197413,
"grad_norm": 6.049197583837082,
"learning_rate": 4.889086450355853e-07,
"loss": 0.1335,
"step": 5245
},
{
"epoch": 1.3755833947433063,
"grad_norm": 5.872325479624525,
"learning_rate": 4.870475827606218e-07,
"loss": 0.1875,
"step": 5250
},
{
"epoch": 1.3768934741668715,
"grad_norm": 7.484940602815659,
"learning_rate": 4.851889283978841e-07,
"loss": 0.2242,
"step": 5255
},
{
"epoch": 1.3782035535904364,
"grad_norm": 4.44039134698427,
"learning_rate": 4.833326906723071e-07,
"loss": 0.1884,
"step": 5260
},
{
"epoch": 1.3795136330140014,
"grad_norm": 2.9457269887282584,
"learning_rate": 4.814788782974814e-07,
"loss": 0.1575,
"step": 5265
},
{
"epoch": 1.3808237124375666,
"grad_norm": 4.142755818231746,
"learning_rate": 4.796274999756134e-07,
"loss": 0.1503,
"step": 5270
},
{
"epoch": 1.3821337918611316,
"grad_norm": 4.231068410579535,
"learning_rate": 4.777785643974822e-07,
"loss": 0.1296,
"step": 5275
},
{
"epoch": 1.3834438712846966,
"grad_norm": 7.276995612922505,
"learning_rate": 4.7593208024240196e-07,
"loss": 0.1793,
"step": 5280
},
{
"epoch": 1.3847539507082618,
"grad_norm": 2.8458411818811973,
"learning_rate": 4.740880561781766e-07,
"loss": 0.097,
"step": 5285
},
{
"epoch": 1.3860640301318268,
"grad_norm": 9.313608656005794,
"learning_rate": 4.7224650086106444e-07,
"loss": 0.1973,
"step": 5290
},
{
"epoch": 1.3873741095553918,
"grad_norm": 8.399153217159018,
"learning_rate": 4.7040742293573334e-07,
"loss": 0.1789,
"step": 5295
},
{
"epoch": 1.388684188978957,
"grad_norm": 5.871107449455294,
"learning_rate": 4.6857083103522277e-07,
"loss": 0.1899,
"step": 5300
},
{
"epoch": 1.388684188978957,
"eval_accuracy": 0.7624,
"eval_loss": 1.0121264457702637,
"eval_runtime": 139.5539,
"eval_samples_per_second": 8.957,
"eval_steps_per_second": 2.243,
"step": 5300
},
{
"epoch": 1.389994268402522,
"grad_norm": 3.8156581594965684,
"learning_rate": 4.667367337809016e-07,
"loss": 0.1204,
"step": 5305
},
{
"epoch": 1.391304347826087,
"grad_norm": 4.985992975846861,
"learning_rate": 4.6490513978242804e-07,
"loss": 0.1319,
"step": 5310
},
{
"epoch": 1.3926144272496521,
"grad_norm": 5.076467294818805,
"learning_rate": 4.6307605763771076e-07,
"loss": 0.1684,
"step": 5315
},
{
"epoch": 1.393924506673217,
"grad_norm": 5.593144516901242,
"learning_rate": 4.6124949593286523e-07,
"loss": 0.2016,
"step": 5320
},
{
"epoch": 1.395234586096782,
"grad_norm": 6.386164042680592,
"learning_rate": 4.5942546324217803e-07,
"loss": 0.1468,
"step": 5325
},
{
"epoch": 1.3965446655203473,
"grad_norm": 6.108862203835904,
"learning_rate": 4.576039681280608e-07,
"loss": 0.1441,
"step": 5330
},
{
"epoch": 1.3978547449439123,
"grad_norm": 3.9244812319321274,
"learning_rate": 4.557850191410161e-07,
"loss": 0.1768,
"step": 5335
},
{
"epoch": 1.3991648243674772,
"grad_norm": 4.15439436004101,
"learning_rate": 4.5396862481959243e-07,
"loss": 0.1338,
"step": 5340
},
{
"epoch": 1.4004749037910424,
"grad_norm": 5.897274197879647,
"learning_rate": 4.521547936903477e-07,
"loss": 0.1798,
"step": 5345
},
{
"epoch": 1.4017849832146074,
"grad_norm": 6.090071523418443,
"learning_rate": 4.5034353426780657e-07,
"loss": 0.1729,
"step": 5350
},
{
"epoch": 1.4030950626381724,
"grad_norm": 6.853061331028334,
"learning_rate": 4.4853485505442133e-07,
"loss": 0.1445,
"step": 5355
},
{
"epoch": 1.4044051420617376,
"grad_norm": 4.835112562982259,
"learning_rate": 4.4672876454053354e-07,
"loss": 0.1255,
"step": 5360
},
{
"epoch": 1.4057152214853026,
"grad_norm": 3.3603532935805713,
"learning_rate": 4.449252712043311e-07,
"loss": 0.1178,
"step": 5365
},
{
"epoch": 1.4070253009088676,
"grad_norm": 5.621068344363656,
"learning_rate": 4.431243835118124e-07,
"loss": 0.1521,
"step": 5370
},
{
"epoch": 1.4083353803324328,
"grad_norm": 6.177956693196569,
"learning_rate": 4.4132610991674123e-07,
"loss": 0.2011,
"step": 5375
},
{
"epoch": 1.4096454597559978,
"grad_norm": 3.8093768021526095,
"learning_rate": 4.3953045886061336e-07,
"loss": 0.1414,
"step": 5380
},
{
"epoch": 1.4109555391795627,
"grad_norm": 8.73859339392293,
"learning_rate": 4.377374387726116e-07,
"loss": 0.2335,
"step": 5385
},
{
"epoch": 1.412265618603128,
"grad_norm": 7.4975778108062965,
"learning_rate": 4.359470580695701e-07,
"loss": 0.1395,
"step": 5390
},
{
"epoch": 1.413575698026693,
"grad_norm": 8.260550346451351,
"learning_rate": 4.341593251559319e-07,
"loss": 0.1615,
"step": 5395
},
{
"epoch": 1.414885777450258,
"grad_norm": 3.552748594627843,
"learning_rate": 4.323742484237107e-07,
"loss": 0.1378,
"step": 5400
},
{
"epoch": 1.414885777450258,
"eval_accuracy": 0.7656,
"eval_loss": 1.022687554359436,
"eval_runtime": 142.9636,
"eval_samples_per_second": 8.743,
"eval_steps_per_second": 2.189,
"step": 5400
},
{
"epoch": 1.416195856873823,
"grad_norm": 3.658068549302559,
"learning_rate": 4.3059183625245275e-07,
"loss": 0.1878,
"step": 5405
},
{
"epoch": 1.417505936297388,
"grad_norm": 12.81739721678878,
"learning_rate": 4.288120970091947e-07,
"loss": 0.2519,
"step": 5410
},
{
"epoch": 1.418816015720953,
"grad_norm": 6.806589788632898,
"learning_rate": 4.270350390484274e-07,
"loss": 0.1387,
"step": 5415
},
{
"epoch": 1.4201260951445183,
"grad_norm": 3.558959244970237,
"learning_rate": 4.2526067071205394e-07,
"loss": 0.1574,
"step": 5420
},
{
"epoch": 1.4214361745680832,
"grad_norm": 4.1991124986912665,
"learning_rate": 4.234890003293522e-07,
"loss": 0.1533,
"step": 5425
},
{
"epoch": 1.4227462539916482,
"grad_norm": 5.57307236404189,
"learning_rate": 4.2172003621693495e-07,
"loss": 0.1435,
"step": 5430
},
{
"epoch": 1.4240563334152134,
"grad_norm": 9.98468287385797,
"learning_rate": 4.1995378667871206e-07,
"loss": 0.1221,
"step": 5435
},
{
"epoch": 1.4253664128387784,
"grad_norm": 4.021122095372401,
"learning_rate": 4.1819026000584935e-07,
"loss": 0.1356,
"step": 5440
},
{
"epoch": 1.4266764922623434,
"grad_norm": 2.539120750400961,
"learning_rate": 4.164294644767321e-07,
"loss": 0.1386,
"step": 5445
},
{
"epoch": 1.4279865716859086,
"grad_norm": 4.900068180108013,
"learning_rate": 4.1467140835692403e-07,
"loss": 0.1509,
"step": 5450
},
{
"epoch": 1.4292966511094736,
"grad_norm": 8.043017914794532,
"learning_rate": 4.1291609989912955e-07,
"loss": 0.1282,
"step": 5455
},
{
"epoch": 1.4306067305330386,
"grad_norm": 4.392158250344916,
"learning_rate": 4.1116354734315596e-07,
"loss": 0.1136,
"step": 5460
},
{
"epoch": 1.4319168099566038,
"grad_norm": 4.14190663170065,
"learning_rate": 4.0941375891587273e-07,
"loss": 0.1398,
"step": 5465
},
{
"epoch": 1.4332268893801687,
"grad_norm": 8.877219117463126,
"learning_rate": 4.076667428311739e-07,
"loss": 0.1529,
"step": 5470
},
{
"epoch": 1.4345369688037337,
"grad_norm": 10.951781738101637,
"learning_rate": 4.059225072899397e-07,
"loss": 0.1794,
"step": 5475
},
{
"epoch": 1.435847048227299,
"grad_norm": 3.5884378317825982,
"learning_rate": 4.041810604799986e-07,
"loss": 0.1287,
"step": 5480
},
{
"epoch": 1.437157127650864,
"grad_norm": 7.646485652411928,
"learning_rate": 4.0244241057608675e-07,
"loss": 0.1526,
"step": 5485
},
{
"epoch": 1.4384672070744289,
"grad_norm": 4.095820215840846,
"learning_rate": 4.0070656573981263e-07,
"loss": 0.1629,
"step": 5490
},
{
"epoch": 1.4397772864979939,
"grad_norm": 4.619960063809711,
"learning_rate": 3.9897353411961576e-07,
"loss": 0.1631,
"step": 5495
},
{
"epoch": 1.441087365921559,
"grad_norm": 7.65793128091754,
"learning_rate": 3.9724332385073e-07,
"loss": 0.1684,
"step": 5500
},
{
"epoch": 1.441087365921559,
"eval_accuracy": 0.7616,
"eval_loss": 1.1524358987808228,
"eval_runtime": 140.9725,
"eval_samples_per_second": 8.867,
"eval_steps_per_second": 2.22,
"step": 5500
},
{
"epoch": 1.442397445345124,
"grad_norm": 9.53033405655631,
"learning_rate": 3.955159430551462e-07,
"loss": 0.1856,
"step": 5505
},
{
"epoch": 1.443707524768689,
"grad_norm": 6.635703636489727,
"learning_rate": 3.937913998415716e-07,
"loss": 0.1173,
"step": 5510
},
{
"epoch": 1.4450176041922542,
"grad_norm": 2.8008935096487444,
"learning_rate": 3.9206970230539484e-07,
"loss": 0.1407,
"step": 5515
},
{
"epoch": 1.4463276836158192,
"grad_norm": 7.567404117719152,
"learning_rate": 3.90350858528644e-07,
"loss": 0.1339,
"step": 5520
},
{
"epoch": 1.4476377630393842,
"grad_norm": 5.684176798537522,
"learning_rate": 3.886348765799535e-07,
"loss": 0.1448,
"step": 5525
},
{
"epoch": 1.4489478424629494,
"grad_norm": 7.48394265324762,
"learning_rate": 3.8692176451452187e-07,
"loss": 0.1873,
"step": 5530
},
{
"epoch": 1.4502579218865144,
"grad_norm": 4.704500306984347,
"learning_rate": 3.852115303740775e-07,
"loss": 0.1384,
"step": 5535
},
{
"epoch": 1.4515680013100793,
"grad_norm": 7.782133396015602,
"learning_rate": 3.8350418218683656e-07,
"loss": 0.1678,
"step": 5540
},
{
"epoch": 1.4528780807336446,
"grad_norm": 7.950318860217571,
"learning_rate": 3.817997279674707e-07,
"loss": 0.1491,
"step": 5545
},
{
"epoch": 1.4541881601572095,
"grad_norm": 4.885181041366676,
"learning_rate": 3.800981757170647e-07,
"loss": 0.1333,
"step": 5550
},
{
"epoch": 1.4554982395807745,
"grad_norm": 4.5099336389227,
"learning_rate": 3.7839953342308195e-07,
"loss": 0.1649,
"step": 5555
},
{
"epoch": 1.4568083190043397,
"grad_norm": 1.3392143606297127,
"learning_rate": 3.767038090593262e-07,
"loss": 0.1196,
"step": 5560
},
{
"epoch": 1.4581183984279047,
"grad_norm": 6.801138336879372,
"learning_rate": 3.7501101058590156e-07,
"loss": 0.1303,
"step": 5565
},
{
"epoch": 1.4594284778514697,
"grad_norm": 6.231994683303187,
"learning_rate": 3.733211459491802e-07,
"loss": 0.1275,
"step": 5570
},
{
"epoch": 1.4607385572750349,
"grad_norm": 7.481895592566931,
"learning_rate": 3.716342230817598e-07,
"loss": 0.1563,
"step": 5575
},
{
"epoch": 1.4620486366985999,
"grad_norm": 5.294171079568994,
"learning_rate": 3.6995024990243097e-07,
"loss": 0.1615,
"step": 5580
},
{
"epoch": 1.4633587161221648,
"grad_norm": 4.889827410342588,
"learning_rate": 3.682692343161361e-07,
"loss": 0.1409,
"step": 5585
},
{
"epoch": 1.46466879554573,
"grad_norm": 4.755475192052327,
"learning_rate": 3.6659118421393454e-07,
"loss": 0.2151,
"step": 5590
},
{
"epoch": 1.465978874969295,
"grad_norm": 8.146609131839343,
"learning_rate": 3.6491610747296464e-07,
"loss": 0.167,
"step": 5595
},
{
"epoch": 1.46728895439286,
"grad_norm": 11.571129826043965,
"learning_rate": 3.632440119564084e-07,
"loss": 0.1526,
"step": 5600
},
{
"epoch": 1.46728895439286,
"eval_accuracy": 0.7632,
"eval_loss": 1.1522161960601807,
"eval_runtime": 140.4528,
"eval_samples_per_second": 8.9,
"eval_steps_per_second": 2.229,
"step": 5600
},
{
"epoch": 1.4685990338164252,
"grad_norm": 13.388610986617053,
"learning_rate": 3.615749055134516e-07,
"loss": 0.1434,
"step": 5605
},
{
"epoch": 1.4699091132399902,
"grad_norm": 4.611773903177833,
"learning_rate": 3.5990879597925015e-07,
"loss": 0.1593,
"step": 5610
},
{
"epoch": 1.4712191926635552,
"grad_norm": 5.209036065178781,
"learning_rate": 3.5824569117489087e-07,
"loss": 0.1589,
"step": 5615
},
{
"epoch": 1.4725292720871201,
"grad_norm": 5.985898828281977,
"learning_rate": 3.565855989073555e-07,
"loss": 0.2083,
"step": 5620
},
{
"epoch": 1.4738393515106853,
"grad_norm": 7.4907701541903355,
"learning_rate": 3.549285269694855e-07,
"loss": 0.2042,
"step": 5625
},
{
"epoch": 1.4751494309342503,
"grad_norm": 8.271025659121081,
"learning_rate": 3.53274483139943e-07,
"loss": 0.1482,
"step": 5630
},
{
"epoch": 1.4764595103578153,
"grad_norm": 8.918810219484554,
"learning_rate": 3.5162347518317614e-07,
"loss": 0.155,
"step": 5635
},
{
"epoch": 1.4777695897813805,
"grad_norm": 6.395860091750389,
"learning_rate": 3.499755108493814e-07,
"loss": 0.1675,
"step": 5640
},
{
"epoch": 1.4790796692049455,
"grad_norm": 4.314058968165375,
"learning_rate": 3.483305978744688e-07,
"loss": 0.1404,
"step": 5645
},
{
"epoch": 1.4803897486285105,
"grad_norm": 4.359611398643348,
"learning_rate": 3.4668874398002367e-07,
"loss": 0.1973,
"step": 5650
},
{
"epoch": 1.4816998280520757,
"grad_norm": 4.284018420029098,
"learning_rate": 3.450499568732722e-07,
"loss": 0.1673,
"step": 5655
},
{
"epoch": 1.4830099074756407,
"grad_norm": 11.418803588749793,
"learning_rate": 3.434142442470437e-07,
"loss": 0.1604,
"step": 5660
},
{
"epoch": 1.4843199868992056,
"grad_norm": 11.223038201848048,
"learning_rate": 3.41781613779735e-07,
"loss": 0.1685,
"step": 5665
},
{
"epoch": 1.4856300663227708,
"grad_norm": 2.587612812317313,
"learning_rate": 3.401520731352758e-07,
"loss": 0.136,
"step": 5670
},
{
"epoch": 1.4869401457463358,
"grad_norm": 3.3606721211871826,
"learning_rate": 3.385256299630901e-07,
"loss": 0.1451,
"step": 5675
},
{
"epoch": 1.4882502251699008,
"grad_norm": 4.563542445144167,
"learning_rate": 3.36902291898063e-07,
"loss": 0.1518,
"step": 5680
},
{
"epoch": 1.489560304593466,
"grad_norm": 4.828685381151716,
"learning_rate": 3.352820665605016e-07,
"loss": 0.1545,
"step": 5685
},
{
"epoch": 1.490870384017031,
"grad_norm": 6.374748369935473,
"learning_rate": 3.336649615561035e-07,
"loss": 0.1404,
"step": 5690
},
{
"epoch": 1.492180463440596,
"grad_norm": 6.1615180099066285,
"learning_rate": 3.320509844759168e-07,
"loss": 0.1522,
"step": 5695
},
{
"epoch": 1.4934905428641612,
"grad_norm": 3.212620440024653,
"learning_rate": 3.3044014289630827e-07,
"loss": 0.1852,
"step": 5700
},
{
"epoch": 1.4934905428641612,
"eval_accuracy": 0.7648,
"eval_loss": 1.1015968322753906,
"eval_runtime": 139.045,
"eval_samples_per_second": 8.99,
"eval_steps_per_second": 2.251,
"step": 5700
},
{
"epoch": 1.4948006222877261,
"grad_norm": 9.244280944426595,
"learning_rate": 3.288324443789243e-07,
"loss": 0.173,
"step": 5705
},
{
"epoch": 1.4961107017112911,
"grad_norm": 7.103864592003495,
"learning_rate": 3.272278964706575e-07,
"loss": 0.1468,
"step": 5710
},
{
"epoch": 1.4974207811348563,
"grad_norm": 9.814370133917347,
"learning_rate": 3.256265067036118e-07,
"loss": 0.2144,
"step": 5715
},
{
"epoch": 1.4987308605584213,
"grad_norm": 3.41872463628258,
"learning_rate": 3.2402828259506445e-07,
"loss": 0.1161,
"step": 5720
},
{
"epoch": 1.5000409399819863,
"grad_norm": 3.982224660417661,
"learning_rate": 3.2243323164743453e-07,
"loss": 0.1338,
"step": 5725
},
{
"epoch": 1.5013510194055515,
"grad_norm": 3.5236382432537052,
"learning_rate": 3.208413613482429e-07,
"loss": 0.1216,
"step": 5730
},
{
"epoch": 1.5026610988291165,
"grad_norm": 5.361802539263841,
"learning_rate": 3.1925267917008224e-07,
"loss": 0.1533,
"step": 5735
},
{
"epoch": 1.5039711782526815,
"grad_norm": 2.839792538080114,
"learning_rate": 3.1766719257057785e-07,
"loss": 0.1389,
"step": 5740
},
{
"epoch": 1.5052812576762467,
"grad_norm": 8.32225450006049,
"learning_rate": 3.160849089923555e-07,
"loss": 0.1513,
"step": 5745
},
{
"epoch": 1.5065913370998116,
"grad_norm": 7.675934502719418,
"learning_rate": 3.145058358630043e-07,
"loss": 0.1482,
"step": 5750
},
{
"epoch": 1.5079014165233766,
"grad_norm": 4.898667455406003,
"learning_rate": 3.1292998059504294e-07,
"loss": 0.1264,
"step": 5755
},
{
"epoch": 1.5092114959469418,
"grad_norm": 7.366536986513522,
"learning_rate": 3.113573505858855e-07,
"loss": 0.1777,
"step": 5760
},
{
"epoch": 1.5105215753705068,
"grad_norm": 6.704445471281222,
"learning_rate": 3.0978795321780506e-07,
"loss": 0.1492,
"step": 5765
},
{
"epoch": 1.5118316547940718,
"grad_norm": 7.187985717646523,
"learning_rate": 3.0822179585790063e-07,
"loss": 0.1358,
"step": 5770
},
{
"epoch": 1.513141734217637,
"grad_norm": 8.290411615555337,
"learning_rate": 3.0665888585806163e-07,
"loss": 0.2399,
"step": 5775
},
{
"epoch": 1.514451813641202,
"grad_norm": 4.958261863176042,
"learning_rate": 3.050992305549335e-07,
"loss": 0.1241,
"step": 5780
},
{
"epoch": 1.515761893064767,
"grad_norm": 5.902391740892916,
"learning_rate": 3.035428372698833e-07,
"loss": 0.1296,
"step": 5785
},
{
"epoch": 1.5170719724883321,
"grad_norm": 4.285558089259955,
"learning_rate": 3.0198971330896637e-07,
"loss": 0.183,
"step": 5790
},
{
"epoch": 1.5183820519118971,
"grad_norm": 5.273051514451481,
"learning_rate": 3.0043986596289027e-07,
"loss": 0.1311,
"step": 5795
},
{
"epoch": 1.519692131335462,
"grad_norm": 6.51925311343804,
"learning_rate": 2.988933025069811e-07,
"loss": 0.1358,
"step": 5800
},
{
"epoch": 1.519692131335462,
"eval_accuracy": 0.7624,
"eval_loss": 1.1159664392471313,
"eval_runtime": 141.0917,
"eval_samples_per_second": 8.859,
"eval_steps_per_second": 2.218,
"step": 5800
},
{
"epoch": 1.5210022107590273,
"grad_norm": 7.383268111667339,
"learning_rate": 2.973500302011509e-07,
"loss": 0.147,
"step": 5805
},
{
"epoch": 1.5223122901825923,
"grad_norm": 3.8912007914030182,
"learning_rate": 2.958100562898609e-07,
"loss": 0.1089,
"step": 5810
},
{
"epoch": 1.5236223696061573,
"grad_norm": 8.712734187230408,
"learning_rate": 2.9427338800209033e-07,
"loss": 0.2046,
"step": 5815
},
{
"epoch": 1.5249324490297225,
"grad_norm": 3.405653489283435,
"learning_rate": 2.927400325513001e-07,
"loss": 0.1524,
"step": 5820
},
{
"epoch": 1.5262425284532875,
"grad_norm": 9.562161269374949,
"learning_rate": 2.912099971354002e-07,
"loss": 0.1311,
"step": 5825
},
{
"epoch": 1.5275526078768524,
"grad_norm": 4.4220037412099495,
"learning_rate": 2.896832889367151e-07,
"loss": 0.1844,
"step": 5830
},
{
"epoch": 1.5288626873004176,
"grad_norm": 8.764506736183701,
"learning_rate": 2.8815991512195217e-07,
"loss": 0.1857,
"step": 5835
},
{
"epoch": 1.5301727667239826,
"grad_norm": 5.111081703573647,
"learning_rate": 2.8663988284216444e-07,
"loss": 0.1286,
"step": 5840
},
{
"epoch": 1.5314828461475476,
"grad_norm": 4.293980562734722,
"learning_rate": 2.851231992327208e-07,
"loss": 0.1363,
"step": 5845
},
{
"epoch": 1.5327929255711128,
"grad_norm": 8.711462310411664,
"learning_rate": 2.8360987141326954e-07,
"loss": 0.1606,
"step": 5850
},
{
"epoch": 1.5341030049946778,
"grad_norm": 8.990881900478213,
"learning_rate": 2.820999064877062e-07,
"loss": 0.2247,
"step": 5855
},
{
"epoch": 1.5354130844182428,
"grad_norm": 9.192897104652108,
"learning_rate": 2.805933115441412e-07,
"loss": 0.1715,
"step": 5860
},
{
"epoch": 1.536723163841808,
"grad_norm": 3.8288642962534567,
"learning_rate": 2.790900936548646e-07,
"loss": 0.1652,
"step": 5865
},
{
"epoch": 1.538033243265373,
"grad_norm": 12.038766998437053,
"learning_rate": 2.775902598763137e-07,
"loss": 0.2038,
"step": 5870
},
{
"epoch": 1.539343322688938,
"grad_norm": 4.827284945086436,
"learning_rate": 2.7609381724904024e-07,
"loss": 0.1901,
"step": 5875
},
{
"epoch": 1.5406534021125031,
"grad_norm": 4.331535469429681,
"learning_rate": 2.746007727976779e-07,
"loss": 0.1754,
"step": 5880
},
{
"epoch": 1.541963481536068,
"grad_norm": 1.900735020734344,
"learning_rate": 2.731111335309072e-07,
"loss": 0.1058,
"step": 5885
},
{
"epoch": 1.543273560959633,
"grad_norm": 2.961654173117568,
"learning_rate": 2.7162490644142545e-07,
"loss": 0.1598,
"step": 5890
},
{
"epoch": 1.5445836403831983,
"grad_norm": 10.176050102064952,
"learning_rate": 2.701420985059112e-07,
"loss": 0.1775,
"step": 5895
},
{
"epoch": 1.5458937198067633,
"grad_norm": 8.432950736971934,
"learning_rate": 2.686627166849931e-07,
"loss": 0.1664,
"step": 5900
},
{
"epoch": 1.5458937198067633,
"eval_accuracy": 0.7504,
"eval_loss": 1.094858169555664,
"eval_runtime": 139.504,
"eval_samples_per_second": 8.96,
"eval_steps_per_second": 2.244,
"step": 5900
},
{
"epoch": 1.5472037992303282,
"grad_norm": 3.4036278165694545,
"learning_rate": 2.671867679232175e-07,
"loss": 0.1702,
"step": 5905
},
{
"epoch": 1.5485138786538934,
"grad_norm": 2.46431216575455,
"learning_rate": 2.65714259149014e-07,
"loss": 0.2071,
"step": 5910
},
{
"epoch": 1.5498239580774584,
"grad_norm": 4.676488316765192,
"learning_rate": 2.64245197274666e-07,
"loss": 0.1668,
"step": 5915
},
{
"epoch": 1.5511340375010234,
"grad_norm": 2.776658179276032,
"learning_rate": 2.6277958919627386e-07,
"loss": 0.173,
"step": 5920
},
{
"epoch": 1.5524441169245886,
"grad_norm": 2.9881895057457424,
"learning_rate": 2.6131744179372725e-07,
"loss": 0.166,
"step": 5925
},
{
"epoch": 1.5537541963481536,
"grad_norm": 5.351569952228676,
"learning_rate": 2.5985876193066925e-07,
"loss": 0.1378,
"step": 5930
},
{
"epoch": 1.5550642757717186,
"grad_norm": 7.287593146087938,
"learning_rate": 2.5840355645446687e-07,
"loss": 0.18,
"step": 5935
},
{
"epoch": 1.5563743551952838,
"grad_norm": 5.162933527380095,
"learning_rate": 2.5695183219617644e-07,
"loss": 0.2142,
"step": 5940
},
{
"epoch": 1.5576844346188488,
"grad_norm": 4.473626526697219,
"learning_rate": 2.555035959705127e-07,
"loss": 0.1674,
"step": 5945
},
{
"epoch": 1.5589945140424137,
"grad_norm": 5.821815812779585,
"learning_rate": 2.540588545758179e-07,
"loss": 0.1876,
"step": 5950
},
{
"epoch": 1.560304593465979,
"grad_norm": 4.101891407721645,
"learning_rate": 2.5261761479402734e-07,
"loss": 0.1597,
"step": 5955
},
{
"epoch": 1.561614672889544,
"grad_norm": 15.605199269628502,
"learning_rate": 2.5117988339064053e-07,
"loss": 0.1509,
"step": 5960
},
{
"epoch": 1.562924752313109,
"grad_norm": 7.523818440166435,
"learning_rate": 2.49745667114686e-07,
"loss": 0.1557,
"step": 5965
},
{
"epoch": 1.564234831736674,
"grad_norm": 4.751911403623371,
"learning_rate": 2.483149726986934e-07,
"loss": 0.1609,
"step": 5970
},
{
"epoch": 1.565544911160239,
"grad_norm": 10.300208770198049,
"learning_rate": 2.468878068586583e-07,
"loss": 0.2317,
"step": 5975
},
{
"epoch": 1.566854990583804,
"grad_norm": 6.418645648934525,
"learning_rate": 2.4546417629401396e-07,
"loss": 0.1699,
"step": 5980
},
{
"epoch": 1.5681650700073693,
"grad_norm": 6.411061114094063,
"learning_rate": 2.440440876875971e-07,
"loss": 0.1921,
"step": 5985
},
{
"epoch": 1.5694751494309342,
"grad_norm": 7.188743967752872,
"learning_rate": 2.4262754770561777e-07,
"loss": 0.1515,
"step": 5990
},
{
"epoch": 1.5707852288544992,
"grad_norm": 5.309747838189995,
"learning_rate": 2.412145629976289e-07,
"loss": 0.1801,
"step": 5995
},
{
"epoch": 1.5720953082780644,
"grad_norm": 4.00510825001943,
"learning_rate": 2.39805140196493e-07,
"loss": 0.1731,
"step": 6000
},
{
"epoch": 1.5720953082780644,
"eval_accuracy": 0.76,
"eval_loss": 1.0234113931655884,
"eval_runtime": 139.1959,
"eval_samples_per_second": 8.98,
"eval_steps_per_second": 2.249,
"step": 6000
},
{
"epoch": 1.5734053877016294,
"grad_norm": 9.59492476008665,
"learning_rate": 2.3839928591835335e-07,
"loss": 0.1686,
"step": 6005
},
{
"epoch": 1.5747154671251944,
"grad_norm": 3.765377816124392,
"learning_rate": 2.3699700676260092e-07,
"loss": 0.1494,
"step": 6010
},
{
"epoch": 1.5760255465487596,
"grad_norm": 5.338406619519301,
"learning_rate": 2.3559830931184455e-07,
"loss": 0.1467,
"step": 6015
},
{
"epoch": 1.5773356259723246,
"grad_norm": 6.763713809000965,
"learning_rate": 2.3420320013187954e-07,
"loss": 0.1898,
"step": 6020
},
{
"epoch": 1.5786457053958896,
"grad_norm": 5.481130008134444,
"learning_rate": 2.328116857716579e-07,
"loss": 0.1548,
"step": 6025
},
{
"epoch": 1.5799557848194548,
"grad_norm": 3.557311963677506,
"learning_rate": 2.3142377276325563e-07,
"loss": 0.1443,
"step": 6030
},
{
"epoch": 1.5812658642430197,
"grad_norm": 5.246282698025739,
"learning_rate": 2.30039467621844e-07,
"loss": 0.1982,
"step": 6035
},
{
"epoch": 1.5825759436665847,
"grad_norm": 7.849841735411486,
"learning_rate": 2.286587768456575e-07,
"loss": 0.1785,
"step": 6040
},
{
"epoch": 1.58388602309015,
"grad_norm": 3.056662932671019,
"learning_rate": 2.272817069159647e-07,
"loss": 0.1452,
"step": 6045
},
{
"epoch": 1.585196102513715,
"grad_norm": 7.441075760718876,
"learning_rate": 2.2590826429703647e-07,
"loss": 0.1358,
"step": 6050
},
{
"epoch": 1.5865061819372799,
"grad_norm": 4.349747044723176,
"learning_rate": 2.2453845543611705e-07,
"loss": 0.126,
"step": 6055
},
{
"epoch": 1.587816261360845,
"grad_norm": 7.685604617278789,
"learning_rate": 2.2317228676339216e-07,
"loss": 0.1422,
"step": 6060
},
{
"epoch": 1.58912634078441,
"grad_norm": 5.9349721355409795,
"learning_rate": 2.218097646919599e-07,
"loss": 0.1861,
"step": 6065
},
{
"epoch": 1.590436420207975,
"grad_norm": 5.891145802679616,
"learning_rate": 2.2045089561780107e-07,
"loss": 0.1431,
"step": 6070
},
{
"epoch": 1.5917464996315402,
"grad_norm": 2.287356113721854,
"learning_rate": 2.1909568591974748e-07,
"loss": 0.1288,
"step": 6075
},
{
"epoch": 1.5930565790551052,
"grad_norm": 7.38109989814088,
"learning_rate": 2.1774414195945423e-07,
"loss": 0.1277,
"step": 6080
},
{
"epoch": 1.5943666584786702,
"grad_norm": 3.893434900862315,
"learning_rate": 2.1639627008136697e-07,
"loss": 0.1412,
"step": 6085
},
{
"epoch": 1.5956767379022354,
"grad_norm": 3.7517744969526183,
"learning_rate": 2.1505207661269554e-07,
"loss": 0.131,
"step": 6090
},
{
"epoch": 1.5969868173258004,
"grad_norm": 5.213568999899694,
"learning_rate": 2.1371156786338107e-07,
"loss": 0.1493,
"step": 6095
},
{
"epoch": 1.5982968967493654,
"grad_norm": 7.30638497105969,
"learning_rate": 2.123747501260691e-07,
"loss": 0.1427,
"step": 6100
},
{
"epoch": 1.5982968967493654,
"eval_accuracy": 0.7576,
"eval_loss": 1.1005467176437378,
"eval_runtime": 138.3031,
"eval_samples_per_second": 9.038,
"eval_steps_per_second": 2.263,
"step": 6100
},
{
"epoch": 1.5996069761729306,
"grad_norm": 4.455588136802878,
"learning_rate": 2.1104162967607774e-07,
"loss": 0.1667,
"step": 6105
},
{
"epoch": 1.6009170555964956,
"grad_norm": 5.978054630784655,
"learning_rate": 2.0971221277136942e-07,
"loss": 0.1548,
"step": 6110
},
{
"epoch": 1.6022271350200605,
"grad_norm": 6.935845367832056,
"learning_rate": 2.083865056525218e-07,
"loss": 0.1465,
"step": 6115
},
{
"epoch": 1.6035372144436257,
"grad_norm": 9.625156258887605,
"learning_rate": 2.0706451454269723e-07,
"loss": 0.1988,
"step": 6120
},
{
"epoch": 1.6048472938671907,
"grad_norm": 8.66757452370429,
"learning_rate": 2.0574624564761557e-07,
"loss": 0.1249,
"step": 6125
},
{
"epoch": 1.6061573732907557,
"grad_norm": 6.280401831803158,
"learning_rate": 2.0443170515552166e-07,
"loss": 0.1398,
"step": 6130
},
{
"epoch": 1.607467452714321,
"grad_norm": 7.078421243621821,
"learning_rate": 2.0312089923716058e-07,
"loss": 0.1744,
"step": 6135
},
{
"epoch": 1.6087775321378859,
"grad_norm": 3.5683942015597343,
"learning_rate": 2.0181383404574493e-07,
"loss": 0.1518,
"step": 6140
},
{
"epoch": 1.6100876115614509,
"grad_norm": 5.9348857695988135,
"learning_rate": 2.0051051571692866e-07,
"loss": 0.15,
"step": 6145
},
{
"epoch": 1.611397690985016,
"grad_norm": 2.6156284263853595,
"learning_rate": 1.9921095036877644e-07,
"loss": 0.1248,
"step": 6150
},
{
"epoch": 1.612707770408581,
"grad_norm": 5.4712881492494425,
"learning_rate": 1.9791514410173538e-07,
"loss": 0.1972,
"step": 6155
},
{
"epoch": 1.614017849832146,
"grad_norm": 5.1651116865695785,
"learning_rate": 1.966231029986075e-07,
"loss": 0.1164,
"step": 6160
},
{
"epoch": 1.6153279292557112,
"grad_norm": 7.989806082396054,
"learning_rate": 1.9533483312451959e-07,
"loss": 0.2138,
"step": 6165
},
{
"epoch": 1.6166380086792762,
"grad_norm": 5.514479603993972,
"learning_rate": 1.9405034052689585e-07,
"loss": 0.1346,
"step": 6170
},
{
"epoch": 1.6179480881028412,
"grad_norm": 5.995445272935516,
"learning_rate": 1.927696312354289e-07,
"loss": 0.1327,
"step": 6175
},
{
"epoch": 1.6192581675264064,
"grad_norm": 7.346676621492845,
"learning_rate": 1.9149271126205168e-07,
"loss": 0.172,
"step": 6180
},
{
"epoch": 1.6205682469499714,
"grad_norm": 7.02864110812697,
"learning_rate": 1.902195866009091e-07,
"loss": 0.1684,
"step": 6185
},
{
"epoch": 1.6218783263735363,
"grad_norm": 5.984903488092592,
"learning_rate": 1.8895026322833063e-07,
"loss": 0.1282,
"step": 6190
},
{
"epoch": 1.6231884057971016,
"grad_norm": 3.5054458379581197,
"learning_rate": 1.876847471028009e-07,
"loss": 0.1257,
"step": 6195
},
{
"epoch": 1.6244984852206665,
"grad_norm": 4.70529605927654,
"learning_rate": 1.8642304416493283e-07,
"loss": 0.1267,
"step": 6200
},
{
"epoch": 1.6244984852206665,
"eval_accuracy": 0.7552,
"eval_loss": 1.1194959878921509,
"eval_runtime": 142.0998,
"eval_samples_per_second": 8.797,
"eval_steps_per_second": 2.203,
"step": 6200
},
{
"epoch": 1.6258085646442315,
"grad_norm": 7.7424757434182006,
"learning_rate": 1.8516516033743956e-07,
"loss": 0.1575,
"step": 6205
},
{
"epoch": 1.6271186440677967,
"grad_norm": 5.852572753378986,
"learning_rate": 1.8391110152510615e-07,
"loss": 0.1466,
"step": 6210
},
{
"epoch": 1.6284287234913617,
"grad_norm": 9.711403077550843,
"learning_rate": 1.8266087361476258e-07,
"loss": 0.2505,
"step": 6215
},
{
"epoch": 1.6297388029149267,
"grad_norm": 7.233520202930265,
"learning_rate": 1.8141448247525527e-07,
"loss": 0.1326,
"step": 6220
},
{
"epoch": 1.6310488823384919,
"grad_norm": 7.089498480489309,
"learning_rate": 1.8017193395742024e-07,
"loss": 0.165,
"step": 6225
},
{
"epoch": 1.6323589617620569,
"grad_norm": 6.065667727984184,
"learning_rate": 1.7893323389405524e-07,
"loss": 0.1338,
"step": 6230
},
{
"epoch": 1.6336690411856218,
"grad_norm": 4.969225424762747,
"learning_rate": 1.776983880998929e-07,
"loss": 0.1625,
"step": 6235
},
{
"epoch": 1.634979120609187,
"grad_norm": 6.713374539142975,
"learning_rate": 1.7646740237157254e-07,
"loss": 0.1703,
"step": 6240
},
{
"epoch": 1.636289200032752,
"grad_norm": 5.594374782950924,
"learning_rate": 1.7524028248761401e-07,
"loss": 0.1917,
"step": 6245
},
{
"epoch": 1.637599279456317,
"grad_norm": 6.226467459107668,
"learning_rate": 1.7401703420838975e-07,
"loss": 0.1807,
"step": 6250
},
{
"epoch": 1.6389093588798822,
"grad_norm": 4.7490550372943074,
"learning_rate": 1.7279766327609757e-07,
"loss": 0.1605,
"step": 6255
},
{
"epoch": 1.6402194383034472,
"grad_norm": 6.40470417405122,
"learning_rate": 1.7158217541473518e-07,
"loss": 0.1279,
"step": 6260
},
{
"epoch": 1.6415295177270122,
"grad_norm": 9.101155921957748,
"learning_rate": 1.7037057633007157e-07,
"loss": 0.1125,
"step": 6265
},
{
"epoch": 1.6428395971505774,
"grad_norm": 6.16268053536328,
"learning_rate": 1.6916287170962107e-07,
"loss": 0.1575,
"step": 6270
},
{
"epoch": 1.6441496765741423,
"grad_norm": 6.511167097266026,
"learning_rate": 1.6795906722261644e-07,
"loss": 0.1668,
"step": 6275
},
{
"epoch": 1.6454597559977073,
"grad_norm": 5.236685660626339,
"learning_rate": 1.6675916851998272e-07,
"loss": 0.106,
"step": 6280
},
{
"epoch": 1.6467698354212725,
"grad_norm": 5.638441943629535,
"learning_rate": 1.6556318123430978e-07,
"loss": 0.1362,
"step": 6285
},
{
"epoch": 1.6480799148448375,
"grad_norm": 8.863766597675237,
"learning_rate": 1.6437111097982726e-07,
"loss": 0.1769,
"step": 6290
},
{
"epoch": 1.6493899942684025,
"grad_norm": 3.794500769944755,
"learning_rate": 1.631829633523767e-07,
"loss": 0.1522,
"step": 6295
},
{
"epoch": 1.6507000736919677,
"grad_norm": 3.9716392563806027,
"learning_rate": 1.6199874392938574e-07,
"loss": 0.1483,
"step": 6300
},
{
"epoch": 1.6507000736919677,
"eval_accuracy": 0.7576,
"eval_loss": 1.1004310846328735,
"eval_runtime": 141.7328,
"eval_samples_per_second": 8.819,
"eval_steps_per_second": 2.208,
"step": 6300
},
{
"epoch": 1.6520101531155327,
"grad_norm": 3.655291620709186,
"learning_rate": 1.6081845826984307e-07,
"loss": 0.1227,
"step": 6305
},
{
"epoch": 1.6533202325390977,
"grad_norm": 6.90496866662,
"learning_rate": 1.5964211191427058e-07,
"loss": 0.1756,
"step": 6310
},
{
"epoch": 1.6546303119626629,
"grad_norm": 9.388436741639218,
"learning_rate": 1.5846971038469915e-07,
"loss": 0.1361,
"step": 6315
},
{
"epoch": 1.6559403913862278,
"grad_norm": 6.255702654542568,
"learning_rate": 1.573012591846402e-07,
"loss": 0.1674,
"step": 6320
},
{
"epoch": 1.6572504708097928,
"grad_norm": 5.3552774475843945,
"learning_rate": 1.5613676379906315e-07,
"loss": 0.1525,
"step": 6325
},
{
"epoch": 1.658560550233358,
"grad_norm": 2.691722315737717,
"learning_rate": 1.5497622969436662e-07,
"loss": 0.1796,
"step": 6330
},
{
"epoch": 1.659870629656923,
"grad_norm": 6.025378541300572,
"learning_rate": 1.538196623183552e-07,
"loss": 0.183,
"step": 6335
},
{
"epoch": 1.661180709080488,
"grad_norm": 4.761564415573431,
"learning_rate": 1.5266706710021194e-07,
"loss": 0.1312,
"step": 6340
},
{
"epoch": 1.6624907885040532,
"grad_norm": 7.805446740567002,
"learning_rate": 1.51518449450474e-07,
"loss": 0.1651,
"step": 6345
},
{
"epoch": 1.6638008679276182,
"grad_norm": 5.256694126891557,
"learning_rate": 1.5037381476100707e-07,
"loss": 0.1294,
"step": 6350
},
{
"epoch": 1.6651109473511831,
"grad_norm": 7.755672196084709,
"learning_rate": 1.4923316840497968e-07,
"loss": 0.156,
"step": 6355
},
{
"epoch": 1.6664210267747483,
"grad_norm": 4.656789891974052,
"learning_rate": 1.480965157368389e-07,
"loss": 0.133,
"step": 6360
},
{
"epoch": 1.667731106198313,
"grad_norm": 5.4961595195828705,
"learning_rate": 1.4696386209228307e-07,
"loss": 0.1812,
"step": 6365
},
{
"epoch": 1.6690411856218783,
"grad_norm": 6.495919732500172,
"learning_rate": 1.4583521278824008e-07,
"loss": 0.1657,
"step": 6370
},
{
"epoch": 1.6703512650454435,
"grad_norm": 6.812774990594977,
"learning_rate": 1.4471057312283906e-07,
"loss": 0.1115,
"step": 6375
},
{
"epoch": 1.6716613444690083,
"grad_norm": 3.956134496828771,
"learning_rate": 1.4358994837538817e-07,
"loss": 0.2423,
"step": 6380
},
{
"epoch": 1.6729714238925735,
"grad_norm": 6.259621872344836,
"learning_rate": 1.424733438063479e-07,
"loss": 0.1022,
"step": 6385
},
{
"epoch": 1.6742815033161387,
"grad_norm": 6.072009290092665,
"learning_rate": 1.4136076465730695e-07,
"loss": 0.1832,
"step": 6390
},
{
"epoch": 1.6755915827397034,
"grad_norm": 7.456858413136058,
"learning_rate": 1.4025221615095873e-07,
"loss": 0.1657,
"step": 6395
},
{
"epoch": 1.6769016621632686,
"grad_norm": 7.260932680002808,
"learning_rate": 1.3914770349107495e-07,
"loss": 0.1346,
"step": 6400
},
{
"epoch": 1.6769016621632686,
"eval_accuracy": 0.7632,
"eval_loss": 1.100335717201233,
"eval_runtime": 141.6273,
"eval_samples_per_second": 8.826,
"eval_steps_per_second": 2.21,
"step": 6400
},
{
"epoch": 1.6782117415868338,
"grad_norm": 3.6061256334408203,
"learning_rate": 1.3804723186248313e-07,
"loss": 0.1343,
"step": 6405
},
{
"epoch": 1.6795218210103986,
"grad_norm": 8.270495540507543,
"learning_rate": 1.369508064310404e-07,
"loss": 0.1182,
"step": 6410
},
{
"epoch": 1.6808319004339638,
"grad_norm": 5.629731179384749,
"learning_rate": 1.3585843234361049e-07,
"loss": 0.1568,
"step": 6415
},
{
"epoch": 1.682141979857529,
"grad_norm": 2.867569435579598,
"learning_rate": 1.347701147280391e-07,
"loss": 0.1729,
"step": 6420
},
{
"epoch": 1.6834520592810938,
"grad_norm": 6.83897468616409,
"learning_rate": 1.3368585869313065e-07,
"loss": 0.1874,
"step": 6425
},
{
"epoch": 1.684762138704659,
"grad_norm": 4.3014228257373945,
"learning_rate": 1.326056693286226e-07,
"loss": 0.1778,
"step": 6430
},
{
"epoch": 1.6860722181282242,
"grad_norm": 10.457994997688326,
"learning_rate": 1.31529551705163e-07,
"loss": 0.2127,
"step": 6435
},
{
"epoch": 1.687382297551789,
"grad_norm": 5.086460128057394,
"learning_rate": 1.3045751087428648e-07,
"loss": 0.153,
"step": 6440
},
{
"epoch": 1.6886923769753541,
"grad_norm": 5.2909003289546375,
"learning_rate": 1.2938955186838983e-07,
"loss": 0.1303,
"step": 6445
},
{
"epoch": 1.6900024563989193,
"grad_norm": 6.3469654482036555,
"learning_rate": 1.283256797007094e-07,
"loss": 0.1625,
"step": 6450
},
{
"epoch": 1.691312535822484,
"grad_norm": 3.8066454211371687,
"learning_rate": 1.2726589936529654e-07,
"loss": 0.2029,
"step": 6455
},
{
"epoch": 1.6926226152460493,
"grad_norm": 5.242010338144686,
"learning_rate": 1.2621021583699476e-07,
"loss": 0.1424,
"step": 6460
},
{
"epoch": 1.6939326946696145,
"grad_norm": 3.25302928925533,
"learning_rate": 1.2515863407141603e-07,
"loss": 0.1493,
"step": 6465
},
{
"epoch": 1.6952427740931792,
"grad_norm": 5.788528827053253,
"learning_rate": 1.2411115900491865e-07,
"loss": 0.1396,
"step": 6470
},
{
"epoch": 1.6965528535167445,
"grad_norm": 5.311777991254716,
"learning_rate": 1.230677955545819e-07,
"loss": 0.1388,
"step": 6475
},
{
"epoch": 1.6978629329403097,
"grad_norm": 9.193485516456121,
"learning_rate": 1.2202854861818557e-07,
"loss": 0.1502,
"step": 6480
},
{
"epoch": 1.6991730123638744,
"grad_norm": 3.6649259537864043,
"learning_rate": 1.2099342307418392e-07,
"loss": 0.1834,
"step": 6485
},
{
"epoch": 1.7004830917874396,
"grad_norm": 2.368713152141659,
"learning_rate": 1.199624237816862e-07,
"loss": 0.1621,
"step": 6490
},
{
"epoch": 1.7017931712110048,
"grad_norm": 10.317778961476517,
"learning_rate": 1.1893555558043089e-07,
"loss": 0.1625,
"step": 6495
},
{
"epoch": 1.7031032506345696,
"grad_norm": 3.1939718268686623,
"learning_rate": 1.1791282329076523e-07,
"loss": 0.1682,
"step": 6500
},
{
"epoch": 1.7031032506345696,
"eval_accuracy": 0.7608,
"eval_loss": 1.0906686782836914,
"eval_runtime": 141.6589,
"eval_samples_per_second": 8.824,
"eval_steps_per_second": 2.21,
"step": 6500
},
{
"epoch": 1.7044133300581348,
"grad_norm": 7.245253263810927,
"learning_rate": 1.1689423171362079e-07,
"loss": 0.147,
"step": 6505
},
{
"epoch": 1.7057234094817,
"grad_norm": 3.713535860169349,
"learning_rate": 1.1587978563049161e-07,
"loss": 0.1361,
"step": 6510
},
{
"epoch": 1.7070334889052647,
"grad_norm": 3.1243926261206547,
"learning_rate": 1.1486948980341282e-07,
"loss": 0.1104,
"step": 6515
},
{
"epoch": 1.70834356832883,
"grad_norm": 2.6440342898591838,
"learning_rate": 1.1386334897493632e-07,
"loss": 0.1154,
"step": 6520
},
{
"epoch": 1.7096536477523951,
"grad_norm": 2.7119760374007877,
"learning_rate": 1.128613678681104e-07,
"loss": 0.1315,
"step": 6525
},
{
"epoch": 1.71096372717596,
"grad_norm": 6.397748692900095,
"learning_rate": 1.1186355118645552e-07,
"loss": 0.1652,
"step": 6530
},
{
"epoch": 1.712273806599525,
"grad_norm": 4.992532932504358,
"learning_rate": 1.1086990361394477e-07,
"loss": 0.1224,
"step": 6535
},
{
"epoch": 1.7135838860230903,
"grad_norm": 8.728515211435251,
"learning_rate": 1.0988042981497947e-07,
"loss": 0.2042,
"step": 6540
},
{
"epoch": 1.714893965446655,
"grad_norm": 13.21150999052598,
"learning_rate": 1.0889513443436904e-07,
"loss": 0.1576,
"step": 6545
},
{
"epoch": 1.7162040448702203,
"grad_norm": 2.632026076658514,
"learning_rate": 1.0791402209730794e-07,
"loss": 0.0997,
"step": 6550
},
{
"epoch": 1.7175141242937855,
"grad_norm": 5.720341695762074,
"learning_rate": 1.0693709740935463e-07,
"loss": 0.155,
"step": 6555
},
{
"epoch": 1.7188242037173502,
"grad_norm": 5.80222557370497,
"learning_rate": 1.0596436495641025e-07,
"loss": 0.1255,
"step": 6560
},
{
"epoch": 1.7201342831409154,
"grad_norm": 6.100322814196168,
"learning_rate": 1.0499582930469597e-07,
"loss": 0.1629,
"step": 6565
},
{
"epoch": 1.7214443625644806,
"grad_norm": 4.478475471096975,
"learning_rate": 1.0403149500073294e-07,
"loss": 0.1398,
"step": 6570
},
{
"epoch": 1.7227544419880454,
"grad_norm": 4.8025162956814835,
"learning_rate": 1.0307136657131977e-07,
"loss": 0.1035,
"step": 6575
},
{
"epoch": 1.7240645214116106,
"grad_norm": 3.2219463387203233,
"learning_rate": 1.0211544852351183e-07,
"loss": 0.1807,
"step": 6580
},
{
"epoch": 1.7253746008351758,
"grad_norm": 5.573188128425597,
"learning_rate": 1.0116374534459993e-07,
"loss": 0.1532,
"step": 6585
},
{
"epoch": 1.7266846802587406,
"grad_norm": 4.185020111907581,
"learning_rate": 1.0021626150208984e-07,
"loss": 0.1329,
"step": 6590
},
{
"epoch": 1.7279947596823058,
"grad_norm": 3.8823758733463016,
"learning_rate": 9.927300144368045e-08,
"loss": 0.1349,
"step": 6595
},
{
"epoch": 1.7293048391058707,
"grad_norm": 16.750270685634902,
"learning_rate": 9.833396959724306e-08,
"loss": 0.1322,
"step": 6600
},
{
"epoch": 1.7293048391058707,
"eval_accuracy": 0.7608,
"eval_loss": 1.128875970840454,
"eval_runtime": 143.9246,
"eval_samples_per_second": 8.685,
"eval_steps_per_second": 2.175,
"step": 6600
},
{
"epoch": 1.7306149185294357,
"grad_norm": 5.043906483496135,
"learning_rate": 9.739917037080148e-08,
"loss": 0.1572,
"step": 6605
},
{
"epoch": 1.731924997953001,
"grad_norm": 6.199552158675514,
"learning_rate": 9.646860815250979e-08,
"loss": 0.1627,
"step": 6610
},
{
"epoch": 1.733235077376566,
"grad_norm": 2.951863930249291,
"learning_rate": 9.554228731063373e-08,
"loss": 0.154,
"step": 6615
},
{
"epoch": 1.7345451568001309,
"grad_norm": 5.2157616611104975,
"learning_rate": 9.462021219352801e-08,
"loss": 0.1631,
"step": 6620
},
{
"epoch": 1.735855236223696,
"grad_norm": 3.3619242290029963,
"learning_rate": 9.370238712961742e-08,
"loss": 0.2129,
"step": 6625
},
{
"epoch": 1.737165315647261,
"grad_norm": 4.655934383309167,
"learning_rate": 9.27888164273759e-08,
"loss": 0.1738,
"step": 6630
},
{
"epoch": 1.738475395070826,
"grad_norm": 9.09248052048832,
"learning_rate": 9.1879504375307e-08,
"loss": 0.21,
"step": 6635
},
{
"epoch": 1.7397854744943912,
"grad_norm": 9.603961489686213,
"learning_rate": 9.097445524192248e-08,
"loss": 0.1156,
"step": 6640
},
{
"epoch": 1.7410955539179562,
"grad_norm": 6.608421456426681,
"learning_rate": 9.007367327572368e-08,
"loss": 0.1623,
"step": 6645
},
{
"epoch": 1.7424056333415212,
"grad_norm": 5.838309352454389,
"learning_rate": 8.91771627051805e-08,
"loss": 0.1661,
"step": 6650
},
{
"epoch": 1.7437157127650864,
"grad_norm": 5.515377990794858,
"learning_rate": 8.828492773871177e-08,
"loss": 0.1721,
"step": 6655
},
{
"epoch": 1.7450257921886514,
"grad_norm": 5.475478047926709,
"learning_rate": 8.739697256466638e-08,
"loss": 0.1668,
"step": 6660
},
{
"epoch": 1.7463358716122164,
"grad_norm": 6.660668383621966,
"learning_rate": 8.651330135130241e-08,
"loss": 0.1841,
"step": 6665
},
{
"epoch": 1.7476459510357816,
"grad_norm": 4.043989017999333,
"learning_rate": 8.563391824676814e-08,
"loss": 0.1521,
"step": 6670
},
{
"epoch": 1.7489560304593466,
"grad_norm": 3.805298088229625,
"learning_rate": 8.475882737908247e-08,
"loss": 0.129,
"step": 6675
},
{
"epoch": 1.7502661098829115,
"grad_norm": 3.0665024919957022,
"learning_rate": 8.388803285611601e-08,
"loss": 0.1577,
"step": 6680
},
{
"epoch": 1.7515761893064767,
"grad_norm": 14.26899142932033,
"learning_rate": 8.30215387655706e-08,
"loss": 0.1589,
"step": 6685
},
{
"epoch": 1.7528862687300417,
"grad_norm": 6.541801424230482,
"learning_rate": 8.215934917496192e-08,
"loss": 0.153,
"step": 6690
},
{
"epoch": 1.7541963481536067,
"grad_norm": 5.250779392264523,
"learning_rate": 8.130146813159844e-08,
"loss": 0.148,
"step": 6695
},
{
"epoch": 1.755506427577172,
"grad_norm": 3.927112284430514,
"learning_rate": 8.044789966256382e-08,
"loss": 0.1994,
"step": 6700
},
{
"epoch": 1.755506427577172,
"eval_accuracy": 0.76,
"eval_loss": 1.106866478919983,
"eval_runtime": 141.8989,
"eval_samples_per_second": 8.809,
"eval_steps_per_second": 2.206,
"step": 6700
},
{
"epoch": 1.7568165070007369,
"grad_norm": 9.767839811176259,
"learning_rate": 7.959864777469749e-08,
"loss": 0.2056,
"step": 6705
},
{
"epoch": 1.7581265864243019,
"grad_norm": 5.9103243659566,
"learning_rate": 7.875371645457574e-08,
"loss": 0.1468,
"step": 6710
},
{
"epoch": 1.759436665847867,
"grad_norm": 6.055358982690812,
"learning_rate": 7.791310966849362e-08,
"loss": 0.1375,
"step": 6715
},
{
"epoch": 1.760746745271432,
"grad_norm": 7.406585709894513,
"learning_rate": 7.707683136244503e-08,
"loss": 0.1663,
"step": 6720
},
{
"epoch": 1.762056824694997,
"grad_norm": 10.412933831104338,
"learning_rate": 7.624488546210584e-08,
"loss": 0.1703,
"step": 6725
},
{
"epoch": 1.7633669041185622,
"grad_norm": 11.2079157468613,
"learning_rate": 7.5417275872814e-08,
"loss": 0.1649,
"step": 6730
},
{
"epoch": 1.7646769835421272,
"grad_norm": 4.487770161857672,
"learning_rate": 7.459400647955261e-08,
"loss": 0.1109,
"step": 6735
},
{
"epoch": 1.7659870629656922,
"grad_norm": 4.720489203941066,
"learning_rate": 7.377508114693021e-08,
"loss": 0.2277,
"step": 6740
},
{
"epoch": 1.7672971423892574,
"grad_norm": 5.432786720479122,
"learning_rate": 7.296050371916362e-08,
"loss": 0.1617,
"step": 6745
},
{
"epoch": 1.7686072218128224,
"grad_norm": 6.287493094401141,
"learning_rate": 7.21502780200598e-08,
"loss": 0.1686,
"step": 6750
},
{
"epoch": 1.7699173012363874,
"grad_norm": 2.1808008637078897,
"learning_rate": 7.134440785299745e-08,
"loss": 0.1527,
"step": 6755
},
{
"epoch": 1.7712273806599526,
"grad_norm": 3.224585978413297,
"learning_rate": 7.054289700090987e-08,
"loss": 0.1003,
"step": 6760
},
{
"epoch": 1.7725374600835175,
"grad_norm": 4.540878351864405,
"learning_rate": 6.974574922626598e-08,
"loss": 0.146,
"step": 6765
},
{
"epoch": 1.7738475395070825,
"grad_norm": 7.4029328124452345,
"learning_rate": 6.895296827105423e-08,
"loss": 0.1749,
"step": 6770
},
{
"epoch": 1.7751576189306477,
"grad_norm": 4.084693702284536,
"learning_rate": 6.81645578567639e-08,
"loss": 0.1532,
"step": 6775
},
{
"epoch": 1.7764676983542127,
"grad_norm": 5.481752305139202,
"learning_rate": 6.738052168436814e-08,
"loss": 0.1742,
"step": 6780
},
{
"epoch": 1.7777777777777777,
"grad_norm": 6.07861709288181,
"learning_rate": 6.660086343430637e-08,
"loss": 0.1624,
"step": 6785
},
{
"epoch": 1.7790878572013429,
"grad_norm": 4.676302274517847,
"learning_rate": 6.582558676646676e-08,
"loss": 0.1583,
"step": 6790
},
{
"epoch": 1.7803979366249079,
"grad_norm": 3.9543347540784666,
"learning_rate": 6.505469532017005e-08,
"loss": 0.142,
"step": 6795
},
{
"epoch": 1.7817080160484728,
"grad_norm": 7.99552033660062,
"learning_rate": 6.428819271415098e-08,
"loss": 0.159,
"step": 6800
},
{
"epoch": 1.7817080160484728,
"eval_accuracy": 0.7608,
"eval_loss": 1.0944597721099854,
"eval_runtime": 142.6449,
"eval_samples_per_second": 8.763,
"eval_steps_per_second": 2.194,
"step": 6800
},
{
"epoch": 1.783018095472038,
"grad_norm": 5.928421818999669,
"learning_rate": 6.35260825465429e-08,
"loss": 0.1406,
"step": 6805
},
{
"epoch": 1.784328174895603,
"grad_norm": 5.358450738294621,
"learning_rate": 6.276836839485944e-08,
"loss": 0.1684,
"step": 6810
},
{
"epoch": 1.785638254319168,
"grad_norm": 7.4098511156954885,
"learning_rate": 6.201505381597872e-08,
"loss": 0.1258,
"step": 6815
},
{
"epoch": 1.7869483337427332,
"grad_norm": 10.028496011778648,
"learning_rate": 6.126614234612593e-08,
"loss": 0.1363,
"step": 6820
},
{
"epoch": 1.7882584131662982,
"grad_norm": 8.42691690098271,
"learning_rate": 6.05216375008576e-08,
"loss": 0.1629,
"step": 6825
},
{
"epoch": 1.7895684925898632,
"grad_norm": 7.8437977453123695,
"learning_rate": 5.978154277504432e-08,
"loss": 0.1488,
"step": 6830
},
{
"epoch": 1.7908785720134284,
"grad_norm": 9.079568079332288,
"learning_rate": 5.904586164285441e-08,
"loss": 0.1451,
"step": 6835
},
{
"epoch": 1.7921886514369934,
"grad_norm": 5.503155338633177,
"learning_rate": 5.831459755773815e-08,
"loss": 0.1478,
"step": 6840
},
{
"epoch": 1.7934987308605583,
"grad_norm": 6.773303376259157,
"learning_rate": 5.7587753952411e-08,
"loss": 0.1445,
"step": 6845
},
{
"epoch": 1.7948088102841235,
"grad_norm": 4.765236453726558,
"learning_rate": 5.686533423883788e-08,
"loss": 0.1617,
"step": 6850
},
{
"epoch": 1.7961188897076885,
"grad_norm": 6.230089077098877,
"learning_rate": 5.6147341808216894e-08,
"loss": 0.1509,
"step": 6855
},
{
"epoch": 1.7974289691312535,
"grad_norm": 4.574445807215422,
"learning_rate": 5.543378003096344e-08,
"loss": 0.1722,
"step": 6860
},
{
"epoch": 1.7987390485548187,
"grad_norm": 4.348732547956968,
"learning_rate": 5.4724652256694205e-08,
"loss": 0.1443,
"step": 6865
},
{
"epoch": 1.8000491279783837,
"grad_norm": 7.066725464839033,
"learning_rate": 5.401996181421253e-08,
"loss": 0.1485,
"step": 6870
},
{
"epoch": 1.8013592074019487,
"grad_norm": 5.328359673700302,
"learning_rate": 5.331971201149088e-08,
"loss": 0.1419,
"step": 6875
},
{
"epoch": 1.8026692868255139,
"grad_norm": 5.644656887108177,
"learning_rate": 5.262390613565737e-08,
"loss": 0.1424,
"step": 6880
},
{
"epoch": 1.8039793662490788,
"grad_norm": 5.671327408847072,
"learning_rate": 5.193254745297848e-08,
"loss": 0.198,
"step": 6885
},
{
"epoch": 1.8052894456726438,
"grad_norm": 4.877568064660579,
"learning_rate": 5.124563920884495e-08,
"loss": 0.1428,
"step": 6890
},
{
"epoch": 1.806599525096209,
"grad_norm": 6.817395500823747,
"learning_rate": 5.056318462775644e-08,
"loss": 0.1432,
"step": 6895
},
{
"epoch": 1.807909604519774,
"grad_norm": 5.927934052499454,
"learning_rate": 4.988518691330579e-08,
"loss": 0.1137,
"step": 6900
},
{
"epoch": 1.807909604519774,
"eval_accuracy": 0.7632,
"eval_loss": 1.1021169424057007,
"eval_runtime": 141.7931,
"eval_samples_per_second": 8.816,
"eval_steps_per_second": 2.207,
"step": 6900
},
{
"epoch": 1.809219683943339,
"grad_norm": 4.973694598901904,
"learning_rate": 4.9211649248164125e-08,
"loss": 0.1506,
"step": 6905
},
{
"epoch": 1.8105297633669042,
"grad_norm": 3.883659080774288,
"learning_rate": 4.854257479406654e-08,
"loss": 0.141,
"step": 6910
},
{
"epoch": 1.8118398427904692,
"grad_norm": 6.41491315709119,
"learning_rate": 4.787796669179689e-08,
"loss": 0.158,
"step": 6915
},
{
"epoch": 1.8131499222140341,
"grad_norm": 3.9302834820572814,
"learning_rate": 4.721782806117236e-08,
"loss": 0.1322,
"step": 6920
},
{
"epoch": 1.8144600016375994,
"grad_norm": 4.289211965868008,
"learning_rate": 4.656216200103036e-08,
"loss": 0.1337,
"step": 6925
},
{
"epoch": 1.8157700810611643,
"grad_norm": 11.248831743934637,
"learning_rate": 4.591097158921198e-08,
"loss": 0.1829,
"step": 6930
},
{
"epoch": 1.8170801604847293,
"grad_norm": 9.878450217229018,
"learning_rate": 4.526425988254967e-08,
"loss": 0.1566,
"step": 6935
},
{
"epoch": 1.8183902399082945,
"grad_norm": 7.865233707641088,
"learning_rate": 4.4622029916850935e-08,
"loss": 0.1189,
"step": 6940
},
{
"epoch": 1.8197003193318595,
"grad_norm": 9.744045573658111,
"learning_rate": 4.3984284706885976e-08,
"loss": 0.1497,
"step": 6945
},
{
"epoch": 1.8210103987554245,
"grad_norm": 7.22547788135765,
"learning_rate": 4.335102724637163e-08,
"loss": 0.2296,
"step": 6950
},
{
"epoch": 1.8223204781789897,
"grad_norm": 11.079040650914395,
"learning_rate": 4.2722260507958684e-08,
"loss": 0.1922,
"step": 6955
},
{
"epoch": 1.8236305576025547,
"grad_norm": 8.165182518676898,
"learning_rate": 4.2097987443217577e-08,
"loss": 0.1381,
"step": 6960
},
{
"epoch": 1.8249406370261196,
"grad_norm": 3.739467681211686,
"learning_rate": 4.147821098262405e-08,
"loss": 0.1294,
"step": 6965
},
{
"epoch": 1.8262507164496848,
"grad_norm": 5.402892377251783,
"learning_rate": 4.086293403554641e-08,
"loss": 0.1786,
"step": 6970
},
{
"epoch": 1.8275607958732498,
"grad_norm": 4.468753443729968,
"learning_rate": 4.0252159490230645e-08,
"loss": 0.1654,
"step": 6975
},
{
"epoch": 1.8288708752968148,
"grad_norm": 5.203594506261989,
"learning_rate": 3.964589021378772e-08,
"loss": 0.1367,
"step": 6980
},
{
"epoch": 1.83018095472038,
"grad_norm": 3.380052180374155,
"learning_rate": 3.90441290521798e-08,
"loss": 0.1106,
"step": 6985
},
{
"epoch": 1.831491034143945,
"grad_norm": 4.633407767619745,
"learning_rate": 3.8446878830207254e-08,
"loss": 0.1679,
"step": 6990
},
{
"epoch": 1.83280111356751,
"grad_norm": 5.329985366173506,
"learning_rate": 3.785414235149465e-08,
"loss": 0.1565,
"step": 6995
},
{
"epoch": 1.8341111929910752,
"grad_norm": 6.556659404147076,
"learning_rate": 3.726592239847826e-08,
"loss": 0.2095,
"step": 7000
},
{
"epoch": 1.8341111929910752,
"eval_accuracy": 0.7624,
"eval_loss": 1.1032047271728516,
"eval_runtime": 141.6558,
"eval_samples_per_second": 8.824,
"eval_steps_per_second": 2.21,
"step": 7000
},
{
"epoch": 1.8354212724146401,
"grad_norm": 4.684757438072978,
"learning_rate": 3.668222173239288e-08,
"loss": 0.1576,
"step": 7005
},
{
"epoch": 1.8367313518382051,
"grad_norm": 7.039438206694064,
"learning_rate": 3.6103043093258625e-08,
"loss": 0.1128,
"step": 7010
},
{
"epoch": 1.8380414312617703,
"grad_norm": 12.172322751037852,
"learning_rate": 3.552838919986845e-08,
"loss": 0.1683,
"step": 7015
},
{
"epoch": 1.8393515106853353,
"grad_norm": 7.127343846862284,
"learning_rate": 3.495826274977487e-08,
"loss": 0.0943,
"step": 7020
},
{
"epoch": 1.8406615901089003,
"grad_norm": 3.994985416023033,
"learning_rate": 3.439266641927752e-08,
"loss": 0.1505,
"step": 7025
},
{
"epoch": 1.8419716695324655,
"grad_norm": 10.627564884995905,
"learning_rate": 3.383160286341091e-08,
"loss": 0.1712,
"step": 7030
},
{
"epoch": 1.8432817489560305,
"grad_norm": 5.604749535399941,
"learning_rate": 3.327507471593172e-08,
"loss": 0.1205,
"step": 7035
},
{
"epoch": 1.8445918283795955,
"grad_norm": 4.8421269912484215,
"learning_rate": 3.272308458930606e-08,
"loss": 0.1152,
"step": 7040
},
{
"epoch": 1.8459019078031607,
"grad_norm": 9.074100019849872,
"learning_rate": 3.2175635074698005e-08,
"loss": 0.2357,
"step": 7045
},
{
"epoch": 1.8472119872267256,
"grad_norm": 7.154307006903744,
"learning_rate": 3.1632728741956884e-08,
"loss": 0.1552,
"step": 7050
},
{
"epoch": 1.8485220666502906,
"grad_norm": 2.6781924530315306,
"learning_rate": 3.1094368139604865e-08,
"loss": 0.106,
"step": 7055
},
{
"epoch": 1.8498321460738558,
"grad_norm": 7.370147947886868,
"learning_rate": 3.0560555794826196e-08,
"loss": 0.1413,
"step": 7060
},
{
"epoch": 1.8511422254974208,
"grad_norm": 3.802825821569026,
"learning_rate": 3.003129421345407e-08,
"loss": 0.1453,
"step": 7065
},
{
"epoch": 1.8524523049209858,
"grad_norm": 6.1792330095504395,
"learning_rate": 2.9506585879959577e-08,
"loss": 0.1564,
"step": 7070
},
{
"epoch": 1.853762384344551,
"grad_norm": 5.355565750702989,
"learning_rate": 2.8986433257439658e-08,
"loss": 0.1967,
"step": 7075
},
{
"epoch": 1.855072463768116,
"grad_norm": 6.360151114951984,
"learning_rate": 2.8470838787606034e-08,
"loss": 0.0963,
"step": 7080
},
{
"epoch": 1.856382543191681,
"grad_norm": 6.69819610174965,
"learning_rate": 2.795980489077332e-08,
"loss": 0.1303,
"step": 7085
},
{
"epoch": 1.8576926226152461,
"grad_norm": 4.435876647603045,
"learning_rate": 2.7453333965847815e-08,
"loss": 0.1269,
"step": 7090
},
{
"epoch": 1.8590027020388111,
"grad_norm": 7.698895216569222,
"learning_rate": 2.6951428390316165e-08,
"loss": 0.1347,
"step": 7095
},
{
"epoch": 1.860312781462376,
"grad_norm": 3.790916594995889,
"learning_rate": 2.6454090520234063e-08,
"loss": 0.2099,
"step": 7100
},
{
"epoch": 1.860312781462376,
"eval_accuracy": 0.7632,
"eval_loss": 1.1122453212738037,
"eval_runtime": 142.3069,
"eval_samples_per_second": 8.784,
"eval_steps_per_second": 2.199,
"step": 7100
},
{
"epoch": 1.8616228608859413,
"grad_norm": 3.9663209864483724,
"learning_rate": 2.596132269021589e-08,
"loss": 0.1212,
"step": 7105
},
{
"epoch": 1.8629329403095063,
"grad_norm": 5.657131945956216,
"learning_rate": 2.5473127213422762e-08,
"loss": 0.1551,
"step": 7110
},
{
"epoch": 1.8642430197330713,
"grad_norm": 5.092795916916417,
"learning_rate": 2.4989506381552617e-08,
"loss": 0.1736,
"step": 7115
},
{
"epoch": 1.8655530991566365,
"grad_norm": 5.798872866253983,
"learning_rate": 2.4510462464828352e-08,
"loss": 0.1684,
"step": 7120
},
{
"epoch": 1.8668631785802015,
"grad_norm": 3.20953589455004,
"learning_rate": 2.4035997711988387e-08,
"loss": 0.1094,
"step": 7125
},
{
"epoch": 1.8681732580037664,
"grad_norm": 3.3680896195026477,
"learning_rate": 2.3566114350275223e-08,
"loss": 0.1694,
"step": 7130
},
{
"epoch": 1.8694833374273316,
"grad_norm": 5.729982989667835,
"learning_rate": 2.3100814585425564e-08,
"loss": 0.1564,
"step": 7135
},
{
"epoch": 1.8707934168508966,
"grad_norm": 10.389717669058681,
"learning_rate": 2.264010060165944e-08,
"loss": 0.1514,
"step": 7140
},
{
"epoch": 1.8721034962744616,
"grad_norm": 5.075854563803782,
"learning_rate": 2.2183974561670205e-08,
"loss": 0.2024,
"step": 7145
},
{
"epoch": 1.8734135756980268,
"grad_norm": 4.242493204391383,
"learning_rate": 2.1732438606614665e-08,
"loss": 0.1311,
"step": 7150
},
{
"epoch": 1.8747236551215918,
"grad_norm": 5.4042102288406095,
"learning_rate": 2.1285494856102315e-08,
"loss": 0.1726,
"step": 7155
},
{
"epoch": 1.8760337345451568,
"grad_norm": 6.071406295353831,
"learning_rate": 2.0843145408186547e-08,
"loss": 0.1006,
"step": 7160
},
{
"epoch": 1.877343813968722,
"grad_norm": 6.149832756857255,
"learning_rate": 2.0405392339353234e-08,
"loss": 0.1713,
"step": 7165
},
{
"epoch": 1.878653893392287,
"grad_norm": 9.357766475713992,
"learning_rate": 1.9972237704512283e-08,
"loss": 0.1644,
"step": 7170
},
{
"epoch": 1.879963972815852,
"grad_norm": 5.253827651758333,
"learning_rate": 1.9543683536987434e-08,
"loss": 0.111,
"step": 7175
},
{
"epoch": 1.8812740522394171,
"grad_norm": 11.566026718756346,
"learning_rate": 1.9119731848506902e-08,
"loss": 0.1984,
"step": 7180
},
{
"epoch": 1.882584131662982,
"grad_norm": 8.248572977961965,
"learning_rate": 1.8700384629193876e-08,
"loss": 0.1202,
"step": 7185
},
{
"epoch": 1.883894211086547,
"grad_norm": 4.350069192366543,
"learning_rate": 1.828564384755682e-08,
"loss": 0.1471,
"step": 7190
},
{
"epoch": 1.8852042905101123,
"grad_norm": 5.389482671182921,
"learning_rate": 1.787551145048094e-08,
"loss": 0.1356,
"step": 7195
},
{
"epoch": 1.8865143699336773,
"grad_norm": 9.568845913678533,
"learning_rate": 1.7469989363218528e-08,
"loss": 0.209,
"step": 7200
},
{
"epoch": 1.8865143699336773,
"eval_accuracy": 0.7616,
"eval_loss": 1.1072343587875366,
"eval_runtime": 141.2973,
"eval_samples_per_second": 8.847,
"eval_steps_per_second": 2.215,
"step": 7200
},
{
"epoch": 1.8878244493572423,
"grad_norm": 8.437455929535107,
"learning_rate": 1.706907948938008e-08,
"loss": 0.1703,
"step": 7205
},
{
"epoch": 1.8891345287808075,
"grad_norm": 9.671218815145942,
"learning_rate": 1.6672783710925288e-08,
"loss": 0.18,
"step": 7210
},
{
"epoch": 1.8904446082043724,
"grad_norm": 5.889420942486911,
"learning_rate": 1.628110388815429e-08,
"loss": 0.1196,
"step": 7215
},
{
"epoch": 1.8917546876279374,
"grad_norm": 4.068164701286381,
"learning_rate": 1.5894041859698783e-08,
"loss": 0.1432,
"step": 7220
},
{
"epoch": 1.8930647670515026,
"grad_norm": 4.827161273788743,
"learning_rate": 1.5511599442513677e-08,
"loss": 0.1612,
"step": 7225
},
{
"epoch": 1.8943748464750676,
"grad_norm": 2.5102178836850495,
"learning_rate": 1.5133778431868583e-08,
"loss": 0.1626,
"step": 7230
},
{
"epoch": 1.8956849258986326,
"grad_norm": 7.7934519895192285,
"learning_rate": 1.4760580601338669e-08,
"loss": 0.2144,
"step": 7235
},
{
"epoch": 1.8969950053221978,
"grad_norm": 8.667381276168085,
"learning_rate": 1.439200770279736e-08,
"loss": 0.2232,
"step": 7240
},
{
"epoch": 1.8983050847457628,
"grad_norm": 7.482118460513503,
"learning_rate": 1.4028061466407449e-08,
"loss": 0.1269,
"step": 7245
},
{
"epoch": 1.8996151641693277,
"grad_norm": 5.507929252171665,
"learning_rate": 1.3668743600613097e-08,
"loss": 0.1869,
"step": 7250
},
{
"epoch": 1.900925243592893,
"grad_norm": 3.497582694746413,
"learning_rate": 1.3314055792131961e-08,
"loss": 0.1518,
"step": 7255
},
{
"epoch": 1.902235323016458,
"grad_norm": 6.083554156233226,
"learning_rate": 1.2963999705947193e-08,
"loss": 0.158,
"step": 7260
},
{
"epoch": 1.903545402440023,
"grad_norm": 11.09067780911257,
"learning_rate": 1.2618576985299334e-08,
"loss": 0.1666,
"step": 7265
},
{
"epoch": 1.904855481863588,
"grad_norm": 4.6729278171579605,
"learning_rate": 1.227778925167955e-08,
"loss": 0.135,
"step": 7270
},
{
"epoch": 1.906165561287153,
"grad_norm": 5.01367722348655,
"learning_rate": 1.1941638104820517e-08,
"loss": 0.1376,
"step": 7275
},
{
"epoch": 1.907475640710718,
"grad_norm": 8.20404432458644,
"learning_rate": 1.1610125122690328e-08,
"loss": 0.2188,
"step": 7280
},
{
"epoch": 1.9087857201342833,
"grad_norm": 9.608276812286391,
"learning_rate": 1.1283251861484378e-08,
"loss": 0.199,
"step": 7285
},
{
"epoch": 1.910095799557848,
"grad_norm": 4.3189093484564145,
"learning_rate": 1.0961019855618037e-08,
"loss": 0.1662,
"step": 7290
},
{
"epoch": 1.9114058789814132,
"grad_norm": 6.758336690737442,
"learning_rate": 1.0643430617719663e-08,
"loss": 0.1357,
"step": 7295
},
{
"epoch": 1.9127159584049784,
"grad_norm": 6.780661902438455,
"learning_rate": 1.0330485638623488e-08,
"loss": 0.178,
"step": 7300
},
{
"epoch": 1.9127159584049784,
"eval_accuracy": 0.7656,
"eval_loss": 1.1024446487426758,
"eval_runtime": 141.415,
"eval_samples_per_second": 8.839,
"eval_steps_per_second": 2.213,
"step": 7300
},
{
"epoch": 1.9140260378285432,
"grad_norm": 6.026966107553702,
"learning_rate": 1.0022186387362742e-08,
"loss": 0.1445,
"step": 7305
},
{
"epoch": 1.9153361172521084,
"grad_norm": 6.277429872371872,
"learning_rate": 9.718534311161985e-09,
"loss": 0.1679,
"step": 7310
},
{
"epoch": 1.9166461966756736,
"grad_norm": 4.119214806989816,
"learning_rate": 9.419530835431676e-09,
"loss": 0.1928,
"step": 7315
},
{
"epoch": 1.9179562760992384,
"grad_norm": 5.6153080280283385,
"learning_rate": 9.125177363759951e-09,
"loss": 0.1118,
"step": 7320
},
{
"epoch": 1.9192663555228036,
"grad_norm": 4.7666055487436525,
"learning_rate": 8.835475277907622e-09,
"loss": 0.1643,
"step": 7325
},
{
"epoch": 1.9205764349463688,
"grad_norm": 8.192595195938912,
"learning_rate": 8.550425937800088e-09,
"loss": 0.1507,
"step": 7330
},
{
"epoch": 1.9218865143699335,
"grad_norm": 6.765512918112135,
"learning_rate": 8.270030681522099e-09,
"loss": 0.1295,
"step": 7335
},
{
"epoch": 1.9231965937934987,
"grad_norm": 3.56638369560944,
"learning_rate": 7.994290825311333e-09,
"loss": 0.1031,
"step": 7340
},
{
"epoch": 1.924506673217064,
"grad_norm": 7.699305575968428,
"learning_rate": 7.72320766355139e-09,
"loss": 0.1526,
"step": 7345
},
{
"epoch": 1.9258167526406287,
"grad_norm": 4.8819269994799654,
"learning_rate": 7.45678246876702e-09,
"loss": 0.1646,
"step": 7350
},
{
"epoch": 1.9271268320641939,
"grad_norm": 3.9964247606777357,
"learning_rate": 7.19501649161669e-09,
"loss": 0.1028,
"step": 7355
},
{
"epoch": 1.928436911487759,
"grad_norm": 7.878684128224621,
"learning_rate": 6.937910960888138e-09,
"loss": 0.1542,
"step": 7360
},
{
"epoch": 1.9297469909113238,
"grad_norm": 9.167984693047696,
"learning_rate": 6.685467083491492e-09,
"loss": 0.1468,
"step": 7365
},
{
"epoch": 1.931057070334889,
"grad_norm": 2.1053337480076344,
"learning_rate": 6.437686044454382e-09,
"loss": 0.153,
"step": 7370
},
{
"epoch": 1.9323671497584543,
"grad_norm": 6.447874086726777,
"learning_rate": 6.194569006915729e-09,
"loss": 0.1358,
"step": 7375
},
{
"epoch": 1.933677229182019,
"grad_norm": 10.70751883403138,
"learning_rate": 5.95611711212074e-09,
"loss": 0.2028,
"step": 7380
},
{
"epoch": 1.9349873086055842,
"grad_norm": 10.40080019721987,
"learning_rate": 5.722331479415476e-09,
"loss": 0.1971,
"step": 7385
},
{
"epoch": 1.9362973880291494,
"grad_norm": 6.470624613494294,
"learning_rate": 5.4932132062414095e-09,
"loss": 0.148,
"step": 7390
},
{
"epoch": 1.9376074674527142,
"grad_norm": 6.406990396350547,
"learning_rate": 5.268763368130425e-09,
"loss": 0.1788,
"step": 7395
},
{
"epoch": 1.9389175468762794,
"grad_norm": 2.770021538790025,
"learning_rate": 5.048983018699826e-09,
"loss": 0.1198,
"step": 7400
},
{
"epoch": 1.9389175468762794,
"eval_accuracy": 0.7632,
"eval_loss": 1.106671929359436,
"eval_runtime": 143.3333,
"eval_samples_per_second": 8.721,
"eval_steps_per_second": 2.184,
"step": 7400
},
{
"epoch": 1.9402276262998446,
"grad_norm": 7.932600638186708,
"learning_rate": 4.8338731896472305e-09,
"loss": 0.0954,
"step": 7405
},
{
"epoch": 1.9415377057234093,
"grad_norm": 4.158505987091941,
"learning_rate": 4.623434890745792e-09,
"loss": 0.1482,
"step": 7410
},
{
"epoch": 1.9428477851469745,
"grad_norm": 4.125686999210389,
"learning_rate": 4.417669109839539e-09,
"loss": 0.1672,
"step": 7415
},
{
"epoch": 1.9441578645705397,
"grad_norm": 9.401687509625868,
"learning_rate": 4.2165768128384905e-09,
"loss": 0.2056,
"step": 7420
},
{
"epoch": 1.9454679439941045,
"grad_norm": 6.656706387472742,
"learning_rate": 4.020158943714436e-09,
"loss": 0.1292,
"step": 7425
},
{
"epoch": 1.9467780234176697,
"grad_norm": 5.612244185690268,
"learning_rate": 3.828416424496383e-09,
"loss": 0.1141,
"step": 7430
},
{
"epoch": 1.948088102841235,
"grad_norm": 8.832257438742863,
"learning_rate": 3.641350155266232e-09,
"loss": 0.2152,
"step": 7435
},
{
"epoch": 1.9493981822647997,
"grad_norm": 9.016570066506297,
"learning_rate": 3.458961014154327e-09,
"loss": 0.1548,
"step": 7440
},
{
"epoch": 1.9507082616883649,
"grad_norm": 5.852774450303297,
"learning_rate": 3.2812498573359104e-09,
"loss": 0.1769,
"step": 7445
},
{
"epoch": 1.95201834111193,
"grad_norm": 3.8609337077050783,
"learning_rate": 3.108217519026235e-09,
"loss": 0.1429,
"step": 7450
},
{
"epoch": 1.9533284205354948,
"grad_norm": 3.881198457400227,
"learning_rate": 2.9398648114775658e-09,
"loss": 0.1024,
"step": 7455
},
{
"epoch": 1.95463849995906,
"grad_norm": 4.8164655468300115,
"learning_rate": 2.776192524974741e-09,
"loss": 0.138,
"step": 7460
},
{
"epoch": 1.9559485793826252,
"grad_norm": 6.545137926620753,
"learning_rate": 2.617201427831728e-09,
"loss": 0.1693,
"step": 7465
},
{
"epoch": 1.95725865880619,
"grad_norm": 4.068942827553475,
"learning_rate": 2.4628922663879615e-09,
"loss": 0.1181,
"step": 7470
},
{
"epoch": 1.9585687382297552,
"grad_norm": 3.941067732119495,
"learning_rate": 2.3132657650047905e-09,
"loss": 0.1674,
"step": 7475
},
{
"epoch": 1.9598788176533204,
"grad_norm": 4.838837050335417,
"learning_rate": 2.168322626062147e-09,
"loss": 0.1547,
"step": 7480
},
{
"epoch": 1.9611888970768852,
"grad_norm": 7.190077481755126,
"learning_rate": 2.0280635299551043e-09,
"loss": 0.1601,
"step": 7485
},
{
"epoch": 1.9624989765004504,
"grad_norm": 6.231947930455576,
"learning_rate": 1.8924891350911023e-09,
"loss": 0.1571,
"step": 7490
},
{
"epoch": 1.9638090559240156,
"grad_norm": 3.2883780328164156,
"learning_rate": 1.7616000778863938e-09,
"loss": 0.1346,
"step": 7495
},
{
"epoch": 1.9651191353475803,
"grad_norm": 2.251817701553141,
"learning_rate": 1.6353969727629368e-09,
"loss": 0.1483,
"step": 7500
},
{
"epoch": 1.9651191353475803,
"eval_accuracy": 0.764,
"eval_loss": 1.1052285432815552,
"eval_runtime": 140.9937,
"eval_samples_per_second": 8.866,
"eval_steps_per_second": 2.22,
"step": 7500
},
{
"epoch": 1.9664292147711455,
"grad_norm": 6.416248872950269,
"learning_rate": 1.5138804121462844e-09,
"loss": 0.1381,
"step": 7505
},
{
"epoch": 1.9677392941947107,
"grad_norm": 6.998254537894883,
"learning_rate": 1.3970509664620323e-09,
"loss": 0.143,
"step": 7510
},
{
"epoch": 1.9690493736182755,
"grad_norm": 8.413132217200232,
"learning_rate": 1.284909184133487e-09,
"loss": 0.1645,
"step": 7515
},
{
"epoch": 1.9703594530418407,
"grad_norm": 5.34533852883374,
"learning_rate": 1.1774555915787799e-09,
"loss": 0.1896,
"step": 7520
},
{
"epoch": 1.9716695324654057,
"grad_norm": 3.1253164448556814,
"learning_rate": 1.0746906932092016e-09,
"loss": 0.17,
"step": 7525
},
{
"epoch": 1.9729796118889706,
"grad_norm": 6.051727298230588,
"learning_rate": 9.7661497142576e-10,
"loss": 0.1539,
"step": 7530
},
{
"epoch": 1.9742896913125358,
"grad_norm": 9.970322292501223,
"learning_rate": 8.832288866175152e-10,
"loss": 0.1438,
"step": 7535
},
{
"epoch": 1.9755997707361008,
"grad_norm": 4.902999720057873,
"learning_rate": 7.945328771596926e-10,
"loss": 0.1661,
"step": 7540
},
{
"epoch": 1.9769098501596658,
"grad_norm": 6.8168524634333325,
"learning_rate": 7.105273594107953e-10,
"loss": 0.1571,
"step": 7545
},
{
"epoch": 1.978219929583231,
"grad_norm": 11.517594529415646,
"learning_rate": 6.312127277113833e-10,
"loss": 0.2014,
"step": 7550
},
{
"epoch": 1.979530009006796,
"grad_norm": 5.3054234456984535,
"learning_rate": 5.565893543818534e-10,
"loss": 0.1121,
"step": 7555
},
{
"epoch": 1.980840088430361,
"grad_norm": 6.920016441538864,
"learning_rate": 4.866575897208846e-10,
"loss": 0.1289,
"step": 7560
},
{
"epoch": 1.9821501678539262,
"grad_norm": 5.089129025398095,
"learning_rate": 4.2141776200366184e-10,
"loss": 0.1832,
"step": 7565
},
{
"epoch": 1.9834602472774912,
"grad_norm": 13.206489418737272,
"learning_rate": 3.6087017748043235e-10,
"loss": 0.1799,
"step": 7570
},
{
"epoch": 1.9847703267010561,
"grad_norm": 7.895102786478579,
"learning_rate": 3.050151203749518e-10,
"loss": 0.1761,
"step": 7575
},
{
"epoch": 1.9860804061246213,
"grad_norm": 7.13408976553629,
"learning_rate": 2.538528528831518e-10,
"loss": 0.1308,
"step": 7580
},
{
"epoch": 1.9873904855481863,
"grad_norm": 5.735776530531999,
"learning_rate": 2.0738361517214087e-10,
"loss": 0.1536,
"step": 7585
},
{
"epoch": 1.9887005649717513,
"grad_norm": 6.373237190830313,
"learning_rate": 1.656076253786498e-10,
"loss": 0.1635,
"step": 7590
},
{
"epoch": 1.9900106443953165,
"grad_norm": 6.685342113001509,
"learning_rate": 1.2852507960858793e-10,
"loss": 0.124,
"step": 7595
},
{
"epoch": 1.9913207238188815,
"grad_norm": 5.437567848160449,
"learning_rate": 9.613615193548863e-11,
"loss": 0.1298,
"step": 7600
},
{
"epoch": 1.9913207238188815,
"eval_accuracy": 0.76,
"eval_loss": 1.105454921722412,
"eval_runtime": 140.9417,
"eval_samples_per_second": 8.869,
"eval_steps_per_second": 2.221,
"step": 7600
},
{
"epoch": 1.9926308032424465,
"grad_norm": 6.920328620350111,
"learning_rate": 6.84409944003983e-11,
"loss": 0.1196,
"step": 7605
},
{
"epoch": 1.9939408826660117,
"grad_norm": 8.1447085166869,
"learning_rate": 4.543973701021109e-11,
"loss": 0.1285,
"step": 7610
},
{
"epoch": 1.9952509620895766,
"grad_norm": 3.617450574006755,
"learning_rate": 2.7132487738223964e-11,
"loss": 0.1182,
"step": 7615
},
{
"epoch": 1.9965610415131416,
"grad_norm": 5.8602211913393445,
"learning_rate": 1.3519332522471393e-11,
"loss": 0.1203,
"step": 7620
},
{
"epoch": 1.9978711209367068,
"grad_norm": 7.209657643570803,
"learning_rate": 4.6003352661694304e-12,
"loss": 0.1478,
"step": 7625
},
{
"epoch": 1.9991812003602718,
"grad_norm": 4.426426765312953,
"learning_rate": 3.7553783716059993e-13,
"loss": 0.1669,
"step": 7630
}
],
"logging_steps": 5,
"max_steps": 7632,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}