exceptions / exp3 /high_0_1208 /trainer_state.json
craa's picture
Upload folder using huggingface_hub
72d73da verified
{
"best_global_step": 78000,
"best_metric": 3.2760121822357178,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/high_0_1208/checkpoint-70000",
"epoch": 29.602121016365203,
"eval_steps": 1000,
"global_step": 110000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013458225667527994,
"grad_norm": 1.9294723272323608,
"learning_rate": 0.000294,
"loss": 8.5675,
"step": 50
},
{
"epoch": 0.026916451335055987,
"grad_norm": 4.372699737548828,
"learning_rate": 0.0005939999999999999,
"loss": 6.7649,
"step": 100
},
{
"epoch": 0.04037467700258398,
"grad_norm": 0.6613487601280212,
"learning_rate": 0.0005998020735155513,
"loss": 6.3115,
"step": 150
},
{
"epoch": 0.053832902670111975,
"grad_norm": 1.9673986434936523,
"learning_rate": 0.0005996001077150935,
"loss": 6.0152,
"step": 200
},
{
"epoch": 0.06729112833763996,
"grad_norm": 2.4221529960632324,
"learning_rate": 0.0005993981419146358,
"loss": 5.8697,
"step": 250
},
{
"epoch": 0.08074935400516796,
"grad_norm": 1.3177257776260376,
"learning_rate": 0.0005991961761141779,
"loss": 5.7306,
"step": 300
},
{
"epoch": 0.09420757967269595,
"grad_norm": 1.0655848979949951,
"learning_rate": 0.0005989942103137202,
"loss": 5.6299,
"step": 350
},
{
"epoch": 0.10766580534022395,
"grad_norm": 1.2664433717727661,
"learning_rate": 0.0005987922445132624,
"loss": 5.5363,
"step": 400
},
{
"epoch": 0.12112403100775193,
"grad_norm": 1.265374779701233,
"learning_rate": 0.0005985902787128047,
"loss": 5.3807,
"step": 450
},
{
"epoch": 0.13458225667527993,
"grad_norm": 1.2221217155456543,
"learning_rate": 0.0005983883129123468,
"loss": 5.2511,
"step": 500
},
{
"epoch": 0.14804048234280792,
"grad_norm": 1.1258769035339355,
"learning_rate": 0.000598186347111889,
"loss": 5.1464,
"step": 550
},
{
"epoch": 0.16149870801033592,
"grad_norm": 1.3253281116485596,
"learning_rate": 0.0005979843813114312,
"loss": 5.0664,
"step": 600
},
{
"epoch": 0.1749569336778639,
"grad_norm": 1.076049566268921,
"learning_rate": 0.0005977824155109734,
"loss": 5.013,
"step": 650
},
{
"epoch": 0.1884151593453919,
"grad_norm": 1.1340619325637817,
"learning_rate": 0.0005975804497105157,
"loss": 4.9208,
"step": 700
},
{
"epoch": 0.2018733850129199,
"grad_norm": 0.8552718758583069,
"learning_rate": 0.0005973784839100578,
"loss": 4.8547,
"step": 750
},
{
"epoch": 0.2153316106804479,
"grad_norm": 1.0275166034698486,
"learning_rate": 0.0005971765181096001,
"loss": 4.7937,
"step": 800
},
{
"epoch": 0.2287898363479759,
"grad_norm": 0.8931620121002197,
"learning_rate": 0.0005969745523091422,
"loss": 4.7444,
"step": 850
},
{
"epoch": 0.24224806201550386,
"grad_norm": 0.7615459561347961,
"learning_rate": 0.0005967725865086845,
"loss": 4.6897,
"step": 900
},
{
"epoch": 0.2557062876830319,
"grad_norm": 0.860148549079895,
"learning_rate": 0.0005965706207082267,
"loss": 4.669,
"step": 950
},
{
"epoch": 0.26916451335055985,
"grad_norm": 1.0182037353515625,
"learning_rate": 0.0005963686549077689,
"loss": 4.6123,
"step": 1000
},
{
"epoch": 0.26916451335055985,
"eval_accuracy": 0.26530030208171723,
"eval_loss": 4.529453277587891,
"eval_runtime": 54.5835,
"eval_samples_per_second": 329.99,
"eval_steps_per_second": 20.629,
"step": 1000
},
{
"epoch": 0.2826227390180879,
"grad_norm": 0.9250805974006653,
"learning_rate": 0.0005961666891073111,
"loss": 4.5473,
"step": 1050
},
{
"epoch": 0.29608096468561584,
"grad_norm": 0.8115113973617554,
"learning_rate": 0.0005959647233068533,
"loss": 4.5016,
"step": 1100
},
{
"epoch": 0.30953919035314387,
"grad_norm": 0.8632411956787109,
"learning_rate": 0.0005957627575063955,
"loss": 4.4742,
"step": 1150
},
{
"epoch": 0.32299741602067183,
"grad_norm": 0.7869690656661987,
"learning_rate": 0.0005955607917059378,
"loss": 4.4344,
"step": 1200
},
{
"epoch": 0.3364556416881998,
"grad_norm": 0.6230737566947937,
"learning_rate": 0.00059535882590548,
"loss": 4.4066,
"step": 1250
},
{
"epoch": 0.3499138673557278,
"grad_norm": 0.935434103012085,
"learning_rate": 0.0005951568601050221,
"loss": 4.3607,
"step": 1300
},
{
"epoch": 0.3633720930232558,
"grad_norm": 0.8798397779464722,
"learning_rate": 0.0005949548943045644,
"loss": 4.3366,
"step": 1350
},
{
"epoch": 0.3768303186907838,
"grad_norm": 0.7297024726867676,
"learning_rate": 0.0005947529285041066,
"loss": 4.3083,
"step": 1400
},
{
"epoch": 0.3902885443583118,
"grad_norm": 0.7763248682022095,
"learning_rate": 0.0005945509627036488,
"loss": 4.2737,
"step": 1450
},
{
"epoch": 0.4037467700258398,
"grad_norm": 0.5607454776763916,
"learning_rate": 0.000594348996903191,
"loss": 4.2506,
"step": 1500
},
{
"epoch": 0.4172049956933678,
"grad_norm": 0.7158567309379578,
"learning_rate": 0.0005941470311027332,
"loss": 4.2292,
"step": 1550
},
{
"epoch": 0.4306632213608958,
"grad_norm": 0.7625264525413513,
"learning_rate": 0.0005939450653022754,
"loss": 4.2232,
"step": 1600
},
{
"epoch": 0.44412144702842377,
"grad_norm": 0.6740456819534302,
"learning_rate": 0.0005937430995018177,
"loss": 4.1809,
"step": 1650
},
{
"epoch": 0.4575796726959518,
"grad_norm": 0.7420955300331116,
"learning_rate": 0.0005935411337013598,
"loss": 4.1803,
"step": 1700
},
{
"epoch": 0.47103789836347976,
"grad_norm": 0.8082626461982727,
"learning_rate": 0.0005933391679009021,
"loss": 4.1628,
"step": 1750
},
{
"epoch": 0.4844961240310077,
"grad_norm": 0.6156010627746582,
"learning_rate": 0.0005931372021004442,
"loss": 4.1499,
"step": 1800
},
{
"epoch": 0.49795434969853575,
"grad_norm": 0.6687771677970886,
"learning_rate": 0.0005929352362999865,
"loss": 4.1337,
"step": 1850
},
{
"epoch": 0.5114125753660638,
"grad_norm": 0.67042076587677,
"learning_rate": 0.0005927332704995287,
"loss": 4.1201,
"step": 1900
},
{
"epoch": 0.5248708010335917,
"grad_norm": 0.6833565831184387,
"learning_rate": 0.0005925313046990709,
"loss": 4.0887,
"step": 1950
},
{
"epoch": 0.5383290267011197,
"grad_norm": 0.6024614572525024,
"learning_rate": 0.0005923293388986131,
"loss": 4.0841,
"step": 2000
},
{
"epoch": 0.5383290267011197,
"eval_accuracy": 0.31777640737877294,
"eval_loss": 4.024723052978516,
"eval_runtime": 53.7356,
"eval_samples_per_second": 335.197,
"eval_steps_per_second": 20.954,
"step": 2000
},
{
"epoch": 0.5517872523686477,
"grad_norm": 0.5663601160049438,
"learning_rate": 0.0005921273730981553,
"loss": 4.0555,
"step": 2050
},
{
"epoch": 0.5652454780361758,
"grad_norm": 0.6550332307815552,
"learning_rate": 0.0005919254072976975,
"loss": 4.0597,
"step": 2100
},
{
"epoch": 0.5787037037037037,
"grad_norm": 0.6065599322319031,
"learning_rate": 0.0005917234414972398,
"loss": 4.048,
"step": 2150
},
{
"epoch": 0.5921619293712317,
"grad_norm": 0.6482925415039062,
"learning_rate": 0.000591521475696782,
"loss": 4.0347,
"step": 2200
},
{
"epoch": 0.6056201550387597,
"grad_norm": 0.5995512008666992,
"learning_rate": 0.0005913195098963241,
"loss": 4.0295,
"step": 2250
},
{
"epoch": 0.6190783807062877,
"grad_norm": 0.6398453712463379,
"learning_rate": 0.0005911175440958664,
"loss": 4.0111,
"step": 2300
},
{
"epoch": 0.6325366063738157,
"grad_norm": 0.6352100968360901,
"learning_rate": 0.0005909155782954086,
"loss": 4.0079,
"step": 2350
},
{
"epoch": 0.6459948320413437,
"grad_norm": 0.4817008674144745,
"learning_rate": 0.0005907136124949508,
"loss": 4.0007,
"step": 2400
},
{
"epoch": 0.6594530577088716,
"grad_norm": 0.6795246005058289,
"learning_rate": 0.000590511646694493,
"loss": 3.9933,
"step": 2450
},
{
"epoch": 0.6729112833763996,
"grad_norm": 0.5192553997039795,
"learning_rate": 0.0005903096808940352,
"loss": 3.9658,
"step": 2500
},
{
"epoch": 0.6863695090439277,
"grad_norm": 0.5410998463630676,
"learning_rate": 0.0005901077150935774,
"loss": 3.9671,
"step": 2550
},
{
"epoch": 0.6998277347114557,
"grad_norm": 0.48503005504608154,
"learning_rate": 0.0005899057492931197,
"loss": 3.9541,
"step": 2600
},
{
"epoch": 0.7132859603789836,
"grad_norm": 0.474247545003891,
"learning_rate": 0.0005897037834926618,
"loss": 3.9415,
"step": 2650
},
{
"epoch": 0.7267441860465116,
"grad_norm": 0.5398220419883728,
"learning_rate": 0.0005895018176922041,
"loss": 3.9447,
"step": 2700
},
{
"epoch": 0.7402024117140397,
"grad_norm": 0.5455359816551208,
"learning_rate": 0.0005892998518917462,
"loss": 3.9324,
"step": 2750
},
{
"epoch": 0.7536606373815676,
"grad_norm": 0.556117832660675,
"learning_rate": 0.0005890978860912885,
"loss": 3.9207,
"step": 2800
},
{
"epoch": 0.7671188630490956,
"grad_norm": 0.550010621547699,
"learning_rate": 0.0005888959202908307,
"loss": 3.9291,
"step": 2850
},
{
"epoch": 0.7805770887166236,
"grad_norm": 0.6481958627700806,
"learning_rate": 0.0005886939544903729,
"loss": 3.9165,
"step": 2900
},
{
"epoch": 0.7940353143841516,
"grad_norm": 0.4899815320968628,
"learning_rate": 0.0005884919886899151,
"loss": 3.8995,
"step": 2950
},
{
"epoch": 0.8074935400516796,
"grad_norm": 0.5277990698814392,
"learning_rate": 0.0005882900228894573,
"loss": 3.9046,
"step": 3000
},
{
"epoch": 0.8074935400516796,
"eval_accuracy": 0.33397511030915245,
"eval_loss": 3.841794729232788,
"eval_runtime": 53.7996,
"eval_samples_per_second": 334.798,
"eval_steps_per_second": 20.93,
"step": 3000
},
{
"epoch": 0.8209517657192076,
"grad_norm": 0.477497935295105,
"learning_rate": 0.0005880880570889996,
"loss": 3.8943,
"step": 3050
},
{
"epoch": 0.8344099913867356,
"grad_norm": 0.4592600464820862,
"learning_rate": 0.0005878860912885418,
"loss": 3.8894,
"step": 3100
},
{
"epoch": 0.8478682170542635,
"grad_norm": 0.5376394987106323,
"learning_rate": 0.000587684125488084,
"loss": 3.8786,
"step": 3150
},
{
"epoch": 0.8613264427217916,
"grad_norm": 0.47509804368019104,
"learning_rate": 0.0005874821596876261,
"loss": 3.8695,
"step": 3200
},
{
"epoch": 0.8747846683893196,
"grad_norm": 0.590036928653717,
"learning_rate": 0.0005872801938871684,
"loss": 3.87,
"step": 3250
},
{
"epoch": 0.8882428940568475,
"grad_norm": 0.5190223455429077,
"learning_rate": 0.0005870782280867106,
"loss": 3.8557,
"step": 3300
},
{
"epoch": 0.9017011197243755,
"grad_norm": 0.5153225064277649,
"learning_rate": 0.0005868762622862528,
"loss": 3.852,
"step": 3350
},
{
"epoch": 0.9151593453919036,
"grad_norm": 0.42253756523132324,
"learning_rate": 0.000586674296485795,
"loss": 3.867,
"step": 3400
},
{
"epoch": 0.9286175710594315,
"grad_norm": 0.48400136828422546,
"learning_rate": 0.0005864723306853373,
"loss": 3.842,
"step": 3450
},
{
"epoch": 0.9420757967269595,
"grad_norm": 0.4630362093448639,
"learning_rate": 0.0005862703648848794,
"loss": 3.8455,
"step": 3500
},
{
"epoch": 0.9555340223944875,
"grad_norm": 0.526874840259552,
"learning_rate": 0.0005860683990844217,
"loss": 3.8223,
"step": 3550
},
{
"epoch": 0.9689922480620154,
"grad_norm": 0.5283749103546143,
"learning_rate": 0.0005858664332839638,
"loss": 3.8237,
"step": 3600
},
{
"epoch": 0.9824504737295435,
"grad_norm": 0.5012550950050354,
"learning_rate": 0.0005856644674835061,
"loss": 3.825,
"step": 3650
},
{
"epoch": 0.9959086993970715,
"grad_norm": 0.4376530051231384,
"learning_rate": 0.0005854625016830483,
"loss": 3.805,
"step": 3700
},
{
"epoch": 1.009151593453919,
"grad_norm": 0.4493418037891388,
"learning_rate": 0.0005852605358825905,
"loss": 3.7713,
"step": 3750
},
{
"epoch": 1.022609819121447,
"grad_norm": 0.44258996844291687,
"learning_rate": 0.0005850585700821327,
"loss": 3.7372,
"step": 3800
},
{
"epoch": 1.036068044788975,
"grad_norm": 0.5155314803123474,
"learning_rate": 0.0005848566042816749,
"loss": 3.7485,
"step": 3850
},
{
"epoch": 1.049526270456503,
"grad_norm": 0.4668378233909607,
"learning_rate": 0.0005846546384812171,
"loss": 3.739,
"step": 3900
},
{
"epoch": 1.062984496124031,
"grad_norm": 0.5066882371902466,
"learning_rate": 0.0005844526726807594,
"loss": 3.7466,
"step": 3950
},
{
"epoch": 1.076442721791559,
"grad_norm": 0.46061182022094727,
"learning_rate": 0.0005842507068803016,
"loss": 3.7493,
"step": 4000
},
{
"epoch": 1.076442721791559,
"eval_accuracy": 0.34418400344540906,
"eval_loss": 3.738100051879883,
"eval_runtime": 53.6959,
"eval_samples_per_second": 335.444,
"eval_steps_per_second": 20.97,
"step": 4000
},
{
"epoch": 1.089900947459087,
"grad_norm": 0.5034427642822266,
"learning_rate": 0.0005840487410798437,
"loss": 3.7378,
"step": 4050
},
{
"epoch": 1.103359173126615,
"grad_norm": 0.4937703609466553,
"learning_rate": 0.000583846775279386,
"loss": 3.7272,
"step": 4100
},
{
"epoch": 1.116817398794143,
"grad_norm": 0.43894490599632263,
"learning_rate": 0.0005836448094789282,
"loss": 3.7287,
"step": 4150
},
{
"epoch": 1.130275624461671,
"grad_norm": 0.48713067173957825,
"learning_rate": 0.0005834428436784704,
"loss": 3.7364,
"step": 4200
},
{
"epoch": 1.143733850129199,
"grad_norm": 0.4943729341030121,
"learning_rate": 0.0005832408778780126,
"loss": 3.739,
"step": 4250
},
{
"epoch": 1.157192075796727,
"grad_norm": 0.46905580163002014,
"learning_rate": 0.0005830389120775548,
"loss": 3.7298,
"step": 4300
},
{
"epoch": 1.1706503014642549,
"grad_norm": 0.4387616217136383,
"learning_rate": 0.000582836946277097,
"loss": 3.7308,
"step": 4350
},
{
"epoch": 1.1841085271317828,
"grad_norm": 0.455136239528656,
"learning_rate": 0.0005826349804766393,
"loss": 3.7206,
"step": 4400
},
{
"epoch": 1.197566752799311,
"grad_norm": 0.44222962856292725,
"learning_rate": 0.0005824330146761814,
"loss": 3.7197,
"step": 4450
},
{
"epoch": 1.211024978466839,
"grad_norm": 0.4487605392932892,
"learning_rate": 0.0005822310488757237,
"loss": 3.718,
"step": 4500
},
{
"epoch": 1.224483204134367,
"grad_norm": 0.4749026894569397,
"learning_rate": 0.0005820290830752658,
"loss": 3.7054,
"step": 4550
},
{
"epoch": 1.237941429801895,
"grad_norm": 0.44181352853775024,
"learning_rate": 0.0005818271172748081,
"loss": 3.701,
"step": 4600
},
{
"epoch": 1.251399655469423,
"grad_norm": 0.4515010714530945,
"learning_rate": 0.0005816251514743503,
"loss": 3.7134,
"step": 4650
},
{
"epoch": 1.2648578811369509,
"grad_norm": 0.4210042357444763,
"learning_rate": 0.0005814231856738925,
"loss": 3.6976,
"step": 4700
},
{
"epoch": 1.2783161068044788,
"grad_norm": 0.509992241859436,
"learning_rate": 0.0005812212198734347,
"loss": 3.6931,
"step": 4750
},
{
"epoch": 1.291774332472007,
"grad_norm": 0.42137229442596436,
"learning_rate": 0.0005810192540729769,
"loss": 3.7055,
"step": 4800
},
{
"epoch": 1.3052325581395348,
"grad_norm": 0.39782291650772095,
"learning_rate": 0.0005808172882725192,
"loss": 3.6862,
"step": 4850
},
{
"epoch": 1.318690783807063,
"grad_norm": 0.4171382188796997,
"learning_rate": 0.0005806153224720614,
"loss": 3.6812,
"step": 4900
},
{
"epoch": 1.332149009474591,
"grad_norm": 0.4413432478904724,
"learning_rate": 0.0005804133566716036,
"loss": 3.6806,
"step": 4950
},
{
"epoch": 1.3456072351421189,
"grad_norm": 0.42678385972976685,
"learning_rate": 0.0005802113908711457,
"loss": 3.6752,
"step": 5000
},
{
"epoch": 1.3456072351421189,
"eval_accuracy": 0.35101408802046735,
"eval_loss": 3.6677448749542236,
"eval_runtime": 53.6849,
"eval_samples_per_second": 335.514,
"eval_steps_per_second": 20.974,
"step": 5000
},
{
"epoch": 1.3590654608096469,
"grad_norm": 0.3610088527202606,
"learning_rate": 0.000580009425070688,
"loss": 3.6851,
"step": 5050
},
{
"epoch": 1.3725236864771748,
"grad_norm": 0.3927769064903259,
"learning_rate": 0.0005798074592702302,
"loss": 3.688,
"step": 5100
},
{
"epoch": 1.3859819121447028,
"grad_norm": 0.4166286289691925,
"learning_rate": 0.0005796054934697724,
"loss": 3.6718,
"step": 5150
},
{
"epoch": 1.3994401378122308,
"grad_norm": 0.39592215418815613,
"learning_rate": 0.0005794035276693146,
"loss": 3.6768,
"step": 5200
},
{
"epoch": 1.412898363479759,
"grad_norm": 0.4000367820262909,
"learning_rate": 0.0005792015618688568,
"loss": 3.6685,
"step": 5250
},
{
"epoch": 1.4263565891472867,
"grad_norm": 0.38646310567855835,
"learning_rate": 0.000578999596068399,
"loss": 3.674,
"step": 5300
},
{
"epoch": 1.4398148148148149,
"grad_norm": 0.37530556321144104,
"learning_rate": 0.0005787976302679413,
"loss": 3.6658,
"step": 5350
},
{
"epoch": 1.4532730404823428,
"grad_norm": 0.37342968583106995,
"learning_rate": 0.0005785956644674834,
"loss": 3.6638,
"step": 5400
},
{
"epoch": 1.4667312661498708,
"grad_norm": 0.48188892006874084,
"learning_rate": 0.0005783936986670257,
"loss": 3.65,
"step": 5450
},
{
"epoch": 1.4801894918173988,
"grad_norm": 0.4143880605697632,
"learning_rate": 0.0005781917328665678,
"loss": 3.6685,
"step": 5500
},
{
"epoch": 1.4936477174849268,
"grad_norm": 0.3935947120189667,
"learning_rate": 0.0005779897670661101,
"loss": 3.6693,
"step": 5550
},
{
"epoch": 1.507105943152455,
"grad_norm": 0.3887571692466736,
"learning_rate": 0.0005777878012656523,
"loss": 3.6481,
"step": 5600
},
{
"epoch": 1.5205641688199827,
"grad_norm": 0.38384705781936646,
"learning_rate": 0.0005775858354651945,
"loss": 3.6564,
"step": 5650
},
{
"epoch": 1.5340223944875109,
"grad_norm": 0.36646515130996704,
"learning_rate": 0.0005773838696647367,
"loss": 3.645,
"step": 5700
},
{
"epoch": 1.5474806201550386,
"grad_norm": 0.41495048999786377,
"learning_rate": 0.000577181903864279,
"loss": 3.6521,
"step": 5750
},
{
"epoch": 1.5609388458225668,
"grad_norm": 0.4048604369163513,
"learning_rate": 0.0005769799380638212,
"loss": 3.6421,
"step": 5800
},
{
"epoch": 1.5743970714900948,
"grad_norm": 0.4305686950683594,
"learning_rate": 0.0005767779722633633,
"loss": 3.6493,
"step": 5850
},
{
"epoch": 1.5878552971576227,
"grad_norm": 0.414792001247406,
"learning_rate": 0.0005765760064629056,
"loss": 3.6338,
"step": 5900
},
{
"epoch": 1.6013135228251507,
"grad_norm": 0.38997161388397217,
"learning_rate": 0.0005763740406624477,
"loss": 3.6387,
"step": 5950
},
{
"epoch": 1.6147717484926787,
"grad_norm": 0.37915465235710144,
"learning_rate": 0.00057617207486199,
"loss": 3.6324,
"step": 6000
},
{
"epoch": 1.6147717484926787,
"eval_accuracy": 0.3565350866328297,
"eval_loss": 3.6097068786621094,
"eval_runtime": 53.6997,
"eval_samples_per_second": 335.421,
"eval_steps_per_second": 20.968,
"step": 6000
},
{
"epoch": 1.6282299741602069,
"grad_norm": 0.4142931401729584,
"learning_rate": 0.0005759701090615322,
"loss": 3.6351,
"step": 6050
},
{
"epoch": 1.6416881998277346,
"grad_norm": 0.4205895960330963,
"learning_rate": 0.0005757681432610744,
"loss": 3.6229,
"step": 6100
},
{
"epoch": 1.6551464254952628,
"grad_norm": 0.42943212389945984,
"learning_rate": 0.0005755661774606166,
"loss": 3.6268,
"step": 6150
},
{
"epoch": 1.6686046511627906,
"grad_norm": 0.4431730806827545,
"learning_rate": 0.0005753642116601588,
"loss": 3.6232,
"step": 6200
},
{
"epoch": 1.6820628768303187,
"grad_norm": 0.4297159016132355,
"learning_rate": 0.000575162245859701,
"loss": 3.6256,
"step": 6250
},
{
"epoch": 1.6955211024978467,
"grad_norm": 0.3931505084037781,
"learning_rate": 0.0005749602800592433,
"loss": 3.6251,
"step": 6300
},
{
"epoch": 1.7089793281653747,
"grad_norm": 0.41490694880485535,
"learning_rate": 0.0005747583142587854,
"loss": 3.6127,
"step": 6350
},
{
"epoch": 1.7224375538329026,
"grad_norm": 0.3898763656616211,
"learning_rate": 0.0005745563484583277,
"loss": 3.6275,
"step": 6400
},
{
"epoch": 1.7358957795004306,
"grad_norm": 0.3727113902568817,
"learning_rate": 0.0005743543826578698,
"loss": 3.6163,
"step": 6450
},
{
"epoch": 1.7493540051679588,
"grad_norm": 0.3718154728412628,
"learning_rate": 0.0005741524168574121,
"loss": 3.6241,
"step": 6500
},
{
"epoch": 1.7628122308354865,
"grad_norm": 0.38011330366134644,
"learning_rate": 0.0005739504510569543,
"loss": 3.603,
"step": 6550
},
{
"epoch": 1.7762704565030147,
"grad_norm": 0.3921981155872345,
"learning_rate": 0.0005737484852564966,
"loss": 3.6211,
"step": 6600
},
{
"epoch": 1.7897286821705425,
"grad_norm": 0.3750942349433899,
"learning_rate": 0.0005735465194560387,
"loss": 3.6174,
"step": 6650
},
{
"epoch": 1.8031869078380707,
"grad_norm": 0.341286838054657,
"learning_rate": 0.000573344553655581,
"loss": 3.6087,
"step": 6700
},
{
"epoch": 1.8166451335055986,
"grad_norm": 0.4016365706920624,
"learning_rate": 0.0005731425878551232,
"loss": 3.6186,
"step": 6750
},
{
"epoch": 1.8301033591731266,
"grad_norm": 0.37889373302459717,
"learning_rate": 0.0005729406220546653,
"loss": 3.5999,
"step": 6800
},
{
"epoch": 1.8435615848406546,
"grad_norm": 0.392206609249115,
"learning_rate": 0.0005727386562542076,
"loss": 3.5989,
"step": 6850
},
{
"epoch": 1.8570198105081825,
"grad_norm": 0.39812007546424866,
"learning_rate": 0.0005725366904537497,
"loss": 3.587,
"step": 6900
},
{
"epoch": 1.8704780361757107,
"grad_norm": 0.37985455989837646,
"learning_rate": 0.000572334724653292,
"loss": 3.609,
"step": 6950
},
{
"epoch": 1.8839362618432385,
"grad_norm": 0.41674208641052246,
"learning_rate": 0.0005721327588528342,
"loss": 3.5971,
"step": 7000
},
{
"epoch": 1.8839362618432385,
"eval_accuracy": 0.3611523607006071,
"eval_loss": 3.5647761821746826,
"eval_runtime": 53.6742,
"eval_samples_per_second": 335.58,
"eval_steps_per_second": 20.978,
"step": 7000
},
{
"epoch": 1.8973944875107667,
"grad_norm": 0.4201742112636566,
"learning_rate": 0.0005719307930523764,
"loss": 3.6045,
"step": 7050
},
{
"epoch": 1.9108527131782944,
"grad_norm": 0.35860392451286316,
"learning_rate": 0.0005717288272519186,
"loss": 3.5862,
"step": 7100
},
{
"epoch": 1.9243109388458226,
"grad_norm": 0.42538923025131226,
"learning_rate": 0.0005715268614514608,
"loss": 3.5803,
"step": 7150
},
{
"epoch": 1.9377691645133506,
"grad_norm": 0.3780660331249237,
"learning_rate": 0.000571324895651003,
"loss": 3.5995,
"step": 7200
},
{
"epoch": 1.9512273901808785,
"grad_norm": 0.37989741563796997,
"learning_rate": 0.0005711229298505453,
"loss": 3.5968,
"step": 7250
},
{
"epoch": 1.9646856158484065,
"grad_norm": 0.43703949451446533,
"learning_rate": 0.0005709209640500874,
"loss": 3.5722,
"step": 7300
},
{
"epoch": 1.9781438415159345,
"grad_norm": 0.4558578431606293,
"learning_rate": 0.0005707189982496297,
"loss": 3.5875,
"step": 7350
},
{
"epoch": 1.9916020671834627,
"grad_norm": 0.35950618982315063,
"learning_rate": 0.0005705170324491719,
"loss": 3.5881,
"step": 7400
},
{
"epoch": 2.00484496124031,
"grad_norm": 0.39500686526298523,
"learning_rate": 0.0005703150666487141,
"loss": 3.5579,
"step": 7450
},
{
"epoch": 2.018303186907838,
"grad_norm": 0.341329425573349,
"learning_rate": 0.0005701131008482563,
"loss": 3.4853,
"step": 7500
},
{
"epoch": 2.0317614125753662,
"grad_norm": 0.35838985443115234,
"learning_rate": 0.0005699111350477986,
"loss": 3.4963,
"step": 7550
},
{
"epoch": 2.045219638242894,
"grad_norm": 0.3362147808074951,
"learning_rate": 0.0005697091692473408,
"loss": 3.4946,
"step": 7600
},
{
"epoch": 2.058677863910422,
"grad_norm": 0.40855300426483154,
"learning_rate": 0.000569507203446883,
"loss": 3.5035,
"step": 7650
},
{
"epoch": 2.07213608957795,
"grad_norm": 0.38907238841056824,
"learning_rate": 0.0005693052376464252,
"loss": 3.5011,
"step": 7700
},
{
"epoch": 2.085594315245478,
"grad_norm": 0.39034557342529297,
"learning_rate": 0.0005691032718459673,
"loss": 3.4969,
"step": 7750
},
{
"epoch": 2.099052540913006,
"grad_norm": 0.41712623834609985,
"learning_rate": 0.0005689013060455096,
"loss": 3.5032,
"step": 7800
},
{
"epoch": 2.112510766580534,
"grad_norm": 0.39129939675331116,
"learning_rate": 0.0005686993402450518,
"loss": 3.4977,
"step": 7850
},
{
"epoch": 2.125968992248062,
"grad_norm": 0.36673828959465027,
"learning_rate": 0.000568497374444594,
"loss": 3.5007,
"step": 7900
},
{
"epoch": 2.13942721791559,
"grad_norm": 0.37993109226226807,
"learning_rate": 0.0005682954086441362,
"loss": 3.5074,
"step": 7950
},
{
"epoch": 2.152885443583118,
"grad_norm": 0.37266653776168823,
"learning_rate": 0.0005680934428436784,
"loss": 3.5112,
"step": 8000
},
{
"epoch": 2.152885443583118,
"eval_accuracy": 0.3640361741878539,
"eval_loss": 3.537073850631714,
"eval_runtime": 53.612,
"eval_samples_per_second": 335.97,
"eval_steps_per_second": 21.003,
"step": 8000
},
{
"epoch": 2.166343669250646,
"grad_norm": 0.369495153427124,
"learning_rate": 0.0005678914770432206,
"loss": 3.504,
"step": 8050
},
{
"epoch": 2.179801894918174,
"grad_norm": 0.3906343877315521,
"learning_rate": 0.0005676895112427629,
"loss": 3.4954,
"step": 8100
},
{
"epoch": 2.193260120585702,
"grad_norm": 0.4126552641391754,
"learning_rate": 0.000567487545442305,
"loss": 3.5038,
"step": 8150
},
{
"epoch": 2.20671834625323,
"grad_norm": 0.3587755560874939,
"learning_rate": 0.0005672855796418473,
"loss": 3.5045,
"step": 8200
},
{
"epoch": 2.220176571920758,
"grad_norm": 0.3550557792186737,
"learning_rate": 0.0005670836138413894,
"loss": 3.5078,
"step": 8250
},
{
"epoch": 2.233634797588286,
"grad_norm": 0.38642674684524536,
"learning_rate": 0.0005668816480409317,
"loss": 3.506,
"step": 8300
},
{
"epoch": 2.2470930232558137,
"grad_norm": 0.3594475984573364,
"learning_rate": 0.0005666796822404739,
"loss": 3.4996,
"step": 8350
},
{
"epoch": 2.260551248923342,
"grad_norm": 0.3822736144065857,
"learning_rate": 0.0005664777164400162,
"loss": 3.5008,
"step": 8400
},
{
"epoch": 2.27400947459087,
"grad_norm": 0.3965492844581604,
"learning_rate": 0.0005662757506395583,
"loss": 3.5014,
"step": 8450
},
{
"epoch": 2.287467700258398,
"grad_norm": 0.3902672231197357,
"learning_rate": 0.0005660737848391006,
"loss": 3.5048,
"step": 8500
},
{
"epoch": 2.300925925925926,
"grad_norm": 0.38660308718681335,
"learning_rate": 0.0005658718190386428,
"loss": 3.5106,
"step": 8550
},
{
"epoch": 2.314384151593454,
"grad_norm": 0.38404619693756104,
"learning_rate": 0.000565669853238185,
"loss": 3.5119,
"step": 8600
},
{
"epoch": 2.327842377260982,
"grad_norm": 0.385078102350235,
"learning_rate": 0.0005654678874377272,
"loss": 3.5057,
"step": 8650
},
{
"epoch": 2.3413006029285097,
"grad_norm": 0.3999466300010681,
"learning_rate": 0.0005652659216372693,
"loss": 3.5044,
"step": 8700
},
{
"epoch": 2.354758828596038,
"grad_norm": 0.36590397357940674,
"learning_rate": 0.0005650639558368116,
"loss": 3.4961,
"step": 8750
},
{
"epoch": 2.3682170542635657,
"grad_norm": 0.38425323367118835,
"learning_rate": 0.0005648619900363538,
"loss": 3.5168,
"step": 8800
},
{
"epoch": 2.381675279931094,
"grad_norm": 0.3574206531047821,
"learning_rate": 0.000564660024235896,
"loss": 3.498,
"step": 8850
},
{
"epoch": 2.395133505598622,
"grad_norm": 0.3823363482952118,
"learning_rate": 0.0005644580584354382,
"loss": 3.4875,
"step": 8900
},
{
"epoch": 2.40859173126615,
"grad_norm": 0.3611487150192261,
"learning_rate": 0.0005642560926349804,
"loss": 3.4983,
"step": 8950
},
{
"epoch": 2.422049956933678,
"grad_norm": 0.38465991616249084,
"learning_rate": 0.0005640541268345226,
"loss": 3.4948,
"step": 9000
},
{
"epoch": 2.422049956933678,
"eval_accuracy": 0.36659046176217375,
"eval_loss": 3.5099785327911377,
"eval_runtime": 53.8716,
"eval_samples_per_second": 334.351,
"eval_steps_per_second": 20.902,
"step": 9000
},
{
"epoch": 2.4355081826012057,
"grad_norm": 0.38401660323143005,
"learning_rate": 0.0005638521610340649,
"loss": 3.5016,
"step": 9050
},
{
"epoch": 2.448966408268734,
"grad_norm": 0.36064672470092773,
"learning_rate": 0.000563650195233607,
"loss": 3.4943,
"step": 9100
},
{
"epoch": 2.4624246339362617,
"grad_norm": 0.39322274923324585,
"learning_rate": 0.0005634482294331493,
"loss": 3.4883,
"step": 9150
},
{
"epoch": 2.47588285960379,
"grad_norm": 0.36240771412849426,
"learning_rate": 0.0005632462636326914,
"loss": 3.4901,
"step": 9200
},
{
"epoch": 2.4893410852713176,
"grad_norm": 0.37943360209465027,
"learning_rate": 0.0005630442978322337,
"loss": 3.494,
"step": 9250
},
{
"epoch": 2.502799310938846,
"grad_norm": 0.36113718152046204,
"learning_rate": 0.0005628423320317759,
"loss": 3.4915,
"step": 9300
},
{
"epoch": 2.516257536606374,
"grad_norm": 0.3588191568851471,
"learning_rate": 0.0005626403662313182,
"loss": 3.4769,
"step": 9350
},
{
"epoch": 2.5297157622739017,
"grad_norm": 0.3674715459346771,
"learning_rate": 0.0005624384004308603,
"loss": 3.4824,
"step": 9400
},
{
"epoch": 2.54317398794143,
"grad_norm": 0.3795354962348938,
"learning_rate": 0.0005622364346304026,
"loss": 3.4977,
"step": 9450
},
{
"epoch": 2.5566322136089576,
"grad_norm": 0.3495054244995117,
"learning_rate": 0.0005620344688299448,
"loss": 3.4829,
"step": 9500
},
{
"epoch": 2.570090439276486,
"grad_norm": 0.3756536543369293,
"learning_rate": 0.0005618325030294869,
"loss": 3.4937,
"step": 9550
},
{
"epoch": 2.583548664944014,
"grad_norm": 0.33491694927215576,
"learning_rate": 0.0005616305372290292,
"loss": 3.4838,
"step": 9600
},
{
"epoch": 2.5970068906115418,
"grad_norm": 0.3515341877937317,
"learning_rate": 0.0005614285714285713,
"loss": 3.4769,
"step": 9650
},
{
"epoch": 2.6104651162790695,
"grad_norm": 0.35633155703544617,
"learning_rate": 0.0005612266056281136,
"loss": 3.482,
"step": 9700
},
{
"epoch": 2.6239233419465977,
"grad_norm": 0.4096840023994446,
"learning_rate": 0.0005610246398276558,
"loss": 3.4919,
"step": 9750
},
{
"epoch": 2.637381567614126,
"grad_norm": 0.38962793350219727,
"learning_rate": 0.000560822674027198,
"loss": 3.4777,
"step": 9800
},
{
"epoch": 2.6508397932816536,
"grad_norm": 0.35541465878486633,
"learning_rate": 0.0005606207082267402,
"loss": 3.4946,
"step": 9850
},
{
"epoch": 2.664298018949182,
"grad_norm": 0.36673033237457275,
"learning_rate": 0.0005604187424262824,
"loss": 3.478,
"step": 9900
},
{
"epoch": 2.6777562446167096,
"grad_norm": 0.38117632269859314,
"learning_rate": 0.0005602167766258246,
"loss": 3.4992,
"step": 9950
},
{
"epoch": 2.6912144702842378,
"grad_norm": 0.3540132939815521,
"learning_rate": 0.0005600148108253669,
"loss": 3.4936,
"step": 10000
},
{
"epoch": 2.6912144702842378,
"eval_accuracy": 0.36949480950512226,
"eval_loss": 3.4849045276641846,
"eval_runtime": 53.8648,
"eval_samples_per_second": 334.393,
"eval_steps_per_second": 20.904,
"step": 10000
},
{
"epoch": 2.704672695951766,
"grad_norm": 0.3549436330795288,
"learning_rate": 0.000559812845024909,
"loss": 3.482,
"step": 10050
},
{
"epoch": 2.7181309216192937,
"grad_norm": 0.3870752155780792,
"learning_rate": 0.0005596108792244513,
"loss": 3.4803,
"step": 10100
},
{
"epoch": 2.7315891472868215,
"grad_norm": 0.3666519522666931,
"learning_rate": 0.0005594089134239935,
"loss": 3.4813,
"step": 10150
},
{
"epoch": 2.7450473729543496,
"grad_norm": 0.3458859622478485,
"learning_rate": 0.0005592069476235358,
"loss": 3.4725,
"step": 10200
},
{
"epoch": 2.758505598621878,
"grad_norm": 0.3320566415786743,
"learning_rate": 0.0005590049818230779,
"loss": 3.4812,
"step": 10250
},
{
"epoch": 2.7719638242894056,
"grad_norm": 0.354028582572937,
"learning_rate": 0.0005588030160226202,
"loss": 3.4935,
"step": 10300
},
{
"epoch": 2.7854220499569338,
"grad_norm": 0.38568511605262756,
"learning_rate": 0.0005586010502221623,
"loss": 3.4781,
"step": 10350
},
{
"epoch": 2.7988802756244615,
"grad_norm": 0.4278452694416046,
"learning_rate": 0.0005583990844217045,
"loss": 3.4732,
"step": 10400
},
{
"epoch": 2.8123385012919897,
"grad_norm": 0.33488065004348755,
"learning_rate": 0.0005581971186212468,
"loss": 3.4761,
"step": 10450
},
{
"epoch": 2.825796726959518,
"grad_norm": 0.37788429856300354,
"learning_rate": 0.0005579951528207889,
"loss": 3.4835,
"step": 10500
},
{
"epoch": 2.8392549526270456,
"grad_norm": 0.3877081573009491,
"learning_rate": 0.0005577931870203312,
"loss": 3.4722,
"step": 10550
},
{
"epoch": 2.8527131782945734,
"grad_norm": 0.37572547793388367,
"learning_rate": 0.0005575912212198733,
"loss": 3.4701,
"step": 10600
},
{
"epoch": 2.8661714039621016,
"grad_norm": 0.38266733288764954,
"learning_rate": 0.0005573892554194156,
"loss": 3.4855,
"step": 10650
},
{
"epoch": 2.8796296296296298,
"grad_norm": 0.3513830304145813,
"learning_rate": 0.0005571872896189578,
"loss": 3.4572,
"step": 10700
},
{
"epoch": 2.8930878552971575,
"grad_norm": 0.3514174818992615,
"learning_rate": 0.0005569853238185,
"loss": 3.4694,
"step": 10750
},
{
"epoch": 2.9065460809646857,
"grad_norm": 0.3365142345428467,
"learning_rate": 0.0005567833580180422,
"loss": 3.4767,
"step": 10800
},
{
"epoch": 2.9200043066322134,
"grad_norm": 0.3384622037410736,
"learning_rate": 0.0005565813922175844,
"loss": 3.4679,
"step": 10850
},
{
"epoch": 2.9334625322997416,
"grad_norm": 0.3428475260734558,
"learning_rate": 0.0005563794264171266,
"loss": 3.4704,
"step": 10900
},
{
"epoch": 2.94692075796727,
"grad_norm": 0.3406570553779602,
"learning_rate": 0.0005561774606166689,
"loss": 3.4632,
"step": 10950
},
{
"epoch": 2.9603789836347976,
"grad_norm": 0.36909055709838867,
"learning_rate": 0.000555975494816211,
"loss": 3.4683,
"step": 11000
},
{
"epoch": 2.9603789836347976,
"eval_accuracy": 0.37134626057079584,
"eval_loss": 3.4629483222961426,
"eval_runtime": 53.8353,
"eval_samples_per_second": 334.576,
"eval_steps_per_second": 20.916,
"step": 11000
},
{
"epoch": 2.9738372093023253,
"grad_norm": 0.36576735973358154,
"learning_rate": 0.0005557735290157534,
"loss": 3.4682,
"step": 11050
},
{
"epoch": 2.9872954349698535,
"grad_norm": 0.3771063983440399,
"learning_rate": 0.0005555715632152955,
"loss": 3.4609,
"step": 11100
},
{
"epoch": 3.000538329026701,
"grad_norm": 0.3679291009902954,
"learning_rate": 0.0005553695974148378,
"loss": 3.4581,
"step": 11150
},
{
"epoch": 3.013996554694229,
"grad_norm": 0.4077318012714386,
"learning_rate": 0.0005551676316143799,
"loss": 3.3805,
"step": 11200
},
{
"epoch": 3.027454780361757,
"grad_norm": 0.3519296646118164,
"learning_rate": 0.0005549656658139222,
"loss": 3.3762,
"step": 11250
},
{
"epoch": 3.0409130060292853,
"grad_norm": 0.3560413718223572,
"learning_rate": 0.0005547637000134644,
"loss": 3.3698,
"step": 11300
},
{
"epoch": 3.054371231696813,
"grad_norm": 0.4313643276691437,
"learning_rate": 0.0005545617342130065,
"loss": 3.3794,
"step": 11350
},
{
"epoch": 3.067829457364341,
"grad_norm": 0.3495795726776123,
"learning_rate": 0.0005543597684125488,
"loss": 3.3729,
"step": 11400
},
{
"epoch": 3.081287683031869,
"grad_norm": 0.37050846219062805,
"learning_rate": 0.0005541578026120909,
"loss": 3.3754,
"step": 11450
},
{
"epoch": 3.094745908699397,
"grad_norm": 0.364422470331192,
"learning_rate": 0.0005539558368116332,
"loss": 3.3805,
"step": 11500
},
{
"epoch": 3.108204134366925,
"grad_norm": 0.39504632353782654,
"learning_rate": 0.0005537538710111754,
"loss": 3.39,
"step": 11550
},
{
"epoch": 3.121662360034453,
"grad_norm": 0.3495160937309265,
"learning_rate": 0.0005535519052107176,
"loss": 3.3926,
"step": 11600
},
{
"epoch": 3.135120585701981,
"grad_norm": 0.3640858232975006,
"learning_rate": 0.0005533499394102598,
"loss": 3.3842,
"step": 11650
},
{
"epoch": 3.148578811369509,
"grad_norm": 0.35223206877708435,
"learning_rate": 0.000553147973609802,
"loss": 3.3832,
"step": 11700
},
{
"epoch": 3.162037037037037,
"grad_norm": 0.3473789691925049,
"learning_rate": 0.0005529460078093442,
"loss": 3.3901,
"step": 11750
},
{
"epoch": 3.175495262704565,
"grad_norm": 0.3820473253726959,
"learning_rate": 0.0005527440420088865,
"loss": 3.4031,
"step": 11800
},
{
"epoch": 3.188953488372093,
"grad_norm": 0.36691343784332275,
"learning_rate": 0.0005525420762084286,
"loss": 3.3887,
"step": 11850
},
{
"epoch": 3.202411714039621,
"grad_norm": 0.3371462821960449,
"learning_rate": 0.0005523401104079709,
"loss": 3.3819,
"step": 11900
},
{
"epoch": 3.215869939707149,
"grad_norm": 0.34302136301994324,
"learning_rate": 0.0005521381446075131,
"loss": 3.4062,
"step": 11950
},
{
"epoch": 3.229328165374677,
"grad_norm": 0.3512645363807678,
"learning_rate": 0.0005519361788070554,
"loss": 3.3893,
"step": 12000
},
{
"epoch": 3.229328165374677,
"eval_accuracy": 0.373407834655131,
"eval_loss": 3.449765205383301,
"eval_runtime": 53.7784,
"eval_samples_per_second": 334.93,
"eval_steps_per_second": 20.938,
"step": 12000
},
{
"epoch": 3.242786391042205,
"grad_norm": 0.3671615719795227,
"learning_rate": 0.0005517342130065975,
"loss": 3.4015,
"step": 12050
},
{
"epoch": 3.2562446167097328,
"grad_norm": 0.33293598890304565,
"learning_rate": 0.0005515322472061398,
"loss": 3.3848,
"step": 12100
},
{
"epoch": 3.269702842377261,
"grad_norm": 0.3794623017311096,
"learning_rate": 0.0005513302814056819,
"loss": 3.3976,
"step": 12150
},
{
"epoch": 3.283161068044789,
"grad_norm": 0.3835780918598175,
"learning_rate": 0.0005511283156052242,
"loss": 3.3949,
"step": 12200
},
{
"epoch": 3.296619293712317,
"grad_norm": 0.35747066140174866,
"learning_rate": 0.0005509263498047664,
"loss": 3.3953,
"step": 12250
},
{
"epoch": 3.310077519379845,
"grad_norm": 0.34848782420158386,
"learning_rate": 0.0005507243840043085,
"loss": 3.3953,
"step": 12300
},
{
"epoch": 3.323535745047373,
"grad_norm": 0.34142157435417175,
"learning_rate": 0.0005505224182038508,
"loss": 3.4,
"step": 12350
},
{
"epoch": 3.336993970714901,
"grad_norm": 0.3376274108886719,
"learning_rate": 0.0005503204524033929,
"loss": 3.39,
"step": 12400
},
{
"epoch": 3.3504521963824287,
"grad_norm": 0.3726096749305725,
"learning_rate": 0.0005501184866029352,
"loss": 3.3992,
"step": 12450
},
{
"epoch": 3.363910422049957,
"grad_norm": 0.37750956416130066,
"learning_rate": 0.0005499165208024774,
"loss": 3.3888,
"step": 12500
},
{
"epoch": 3.3773686477174847,
"grad_norm": 0.35686615109443665,
"learning_rate": 0.0005497145550020196,
"loss": 3.3925,
"step": 12550
},
{
"epoch": 3.390826873385013,
"grad_norm": 0.3921195864677429,
"learning_rate": 0.0005495125892015618,
"loss": 3.3896,
"step": 12600
},
{
"epoch": 3.404285099052541,
"grad_norm": 0.3748328387737274,
"learning_rate": 0.000549310623401104,
"loss": 3.3959,
"step": 12650
},
{
"epoch": 3.417743324720069,
"grad_norm": 0.35698792338371277,
"learning_rate": 0.0005491086576006462,
"loss": 3.4172,
"step": 12700
},
{
"epoch": 3.431201550387597,
"grad_norm": 0.34931182861328125,
"learning_rate": 0.0005489066918001885,
"loss": 3.4007,
"step": 12750
},
{
"epoch": 3.4446597760551247,
"grad_norm": 0.36840617656707764,
"learning_rate": 0.0005487047259997306,
"loss": 3.3995,
"step": 12800
},
{
"epoch": 3.458118001722653,
"grad_norm": 0.37831541895866394,
"learning_rate": 0.0005485027601992729,
"loss": 3.4005,
"step": 12850
},
{
"epoch": 3.471576227390181,
"grad_norm": 0.3618316054344177,
"learning_rate": 0.0005483007943988151,
"loss": 3.4051,
"step": 12900
},
{
"epoch": 3.485034453057709,
"grad_norm": 0.3680399954319,
"learning_rate": 0.0005480988285983574,
"loss": 3.3998,
"step": 12950
},
{
"epoch": 3.4984926787252366,
"grad_norm": 0.36591285467147827,
"learning_rate": 0.0005478968627978995,
"loss": 3.3915,
"step": 13000
},
{
"epoch": 3.4984926787252366,
"eval_accuracy": 0.37471398715272664,
"eval_loss": 3.436342239379883,
"eval_runtime": 53.7514,
"eval_samples_per_second": 335.098,
"eval_steps_per_second": 20.948,
"step": 13000
},
{
"epoch": 3.511950904392765,
"grad_norm": 0.34238138794898987,
"learning_rate": 0.0005476948969974418,
"loss": 3.4124,
"step": 13050
},
{
"epoch": 3.525409130060293,
"grad_norm": 0.36642181873321533,
"learning_rate": 0.0005474929311969839,
"loss": 3.3979,
"step": 13100
},
{
"epoch": 3.5388673557278207,
"grad_norm": 0.378031462430954,
"learning_rate": 0.0005472909653965261,
"loss": 3.3842,
"step": 13150
},
{
"epoch": 3.552325581395349,
"grad_norm": 0.34340566396713257,
"learning_rate": 0.0005470889995960684,
"loss": 3.3945,
"step": 13200
},
{
"epoch": 3.5657838070628767,
"grad_norm": 0.37123072147369385,
"learning_rate": 0.0005468870337956105,
"loss": 3.4008,
"step": 13250
},
{
"epoch": 3.579242032730405,
"grad_norm": 0.38739728927612305,
"learning_rate": 0.0005466850679951528,
"loss": 3.4049,
"step": 13300
},
{
"epoch": 3.592700258397933,
"grad_norm": 0.36094415187835693,
"learning_rate": 0.0005464831021946949,
"loss": 3.408,
"step": 13350
},
{
"epoch": 3.606158484065461,
"grad_norm": 0.35295525193214417,
"learning_rate": 0.0005462811363942372,
"loss": 3.3991,
"step": 13400
},
{
"epoch": 3.6196167097329885,
"grad_norm": 0.35480549931526184,
"learning_rate": 0.0005460791705937794,
"loss": 3.4071,
"step": 13450
},
{
"epoch": 3.6330749354005167,
"grad_norm": 0.35453832149505615,
"learning_rate": 0.0005458772047933216,
"loss": 3.3911,
"step": 13500
},
{
"epoch": 3.646533161068045,
"grad_norm": 0.37023717164993286,
"learning_rate": 0.0005456752389928638,
"loss": 3.3983,
"step": 13550
},
{
"epoch": 3.6599913867355727,
"grad_norm": 0.35451245307922363,
"learning_rate": 0.000545473273192406,
"loss": 3.3998,
"step": 13600
},
{
"epoch": 3.673449612403101,
"grad_norm": 0.35649922490119934,
"learning_rate": 0.0005452713073919482,
"loss": 3.3961,
"step": 13650
},
{
"epoch": 3.6869078380706286,
"grad_norm": 0.36602070927619934,
"learning_rate": 0.0005450693415914905,
"loss": 3.3947,
"step": 13700
},
{
"epoch": 3.700366063738157,
"grad_norm": 0.37440329790115356,
"learning_rate": 0.0005448673757910327,
"loss": 3.3986,
"step": 13750
},
{
"epoch": 3.713824289405685,
"grad_norm": 0.35884660482406616,
"learning_rate": 0.0005446654099905749,
"loss": 3.3951,
"step": 13800
},
{
"epoch": 3.7272825150732127,
"grad_norm": 0.3605027496814728,
"learning_rate": 0.0005444634441901171,
"loss": 3.3956,
"step": 13850
},
{
"epoch": 3.7407407407407405,
"grad_norm": 0.3374119997024536,
"learning_rate": 0.0005442614783896594,
"loss": 3.3976,
"step": 13900
},
{
"epoch": 3.7541989664082687,
"grad_norm": 0.34979817271232605,
"learning_rate": 0.0005440595125892015,
"loss": 3.3934,
"step": 13950
},
{
"epoch": 3.767657192075797,
"grad_norm": 0.38936689496040344,
"learning_rate": 0.0005438575467887438,
"loss": 3.3858,
"step": 14000
},
{
"epoch": 3.767657192075797,
"eval_accuracy": 0.37646504852385865,
"eval_loss": 3.4231066703796387,
"eval_runtime": 53.7446,
"eval_samples_per_second": 335.141,
"eval_steps_per_second": 20.951,
"step": 14000
},
{
"epoch": 3.7811154177433246,
"grad_norm": 0.3927522301673889,
"learning_rate": 0.0005436555809882859,
"loss": 3.3889,
"step": 14050
},
{
"epoch": 3.794573643410853,
"grad_norm": 0.35339125990867615,
"learning_rate": 0.0005434536151878281,
"loss": 3.4018,
"step": 14100
},
{
"epoch": 3.8080318690783805,
"grad_norm": 0.3210137188434601,
"learning_rate": 0.0005432516493873704,
"loss": 3.3906,
"step": 14150
},
{
"epoch": 3.8214900947459087,
"grad_norm": 0.35777273774147034,
"learning_rate": 0.0005430496835869125,
"loss": 3.3963,
"step": 14200
},
{
"epoch": 3.834948320413437,
"grad_norm": 0.358101487159729,
"learning_rate": 0.0005428477177864548,
"loss": 3.4034,
"step": 14250
},
{
"epoch": 3.8484065460809647,
"grad_norm": 0.35471367835998535,
"learning_rate": 0.0005426457519859969,
"loss": 3.4062,
"step": 14300
},
{
"epoch": 3.8618647717484924,
"grad_norm": 0.3749210834503174,
"learning_rate": 0.0005424437861855392,
"loss": 3.3965,
"step": 14350
},
{
"epoch": 3.8753229974160206,
"grad_norm": 0.3748970627784729,
"learning_rate": 0.0005422418203850814,
"loss": 3.3907,
"step": 14400
},
{
"epoch": 3.888781223083549,
"grad_norm": 0.3373413681983948,
"learning_rate": 0.0005420398545846236,
"loss": 3.4029,
"step": 14450
},
{
"epoch": 3.9022394487510765,
"grad_norm": 0.3378717005252838,
"learning_rate": 0.0005418378887841658,
"loss": 3.3926,
"step": 14500
},
{
"epoch": 3.9156976744186047,
"grad_norm": 0.33396196365356445,
"learning_rate": 0.000541635922983708,
"loss": 3.4127,
"step": 14550
},
{
"epoch": 3.9291559000861325,
"grad_norm": 0.3631775379180908,
"learning_rate": 0.0005414339571832503,
"loss": 3.3993,
"step": 14600
},
{
"epoch": 3.9426141257536607,
"grad_norm": 0.3275887966156006,
"learning_rate": 0.0005412319913827925,
"loss": 3.3836,
"step": 14650
},
{
"epoch": 3.956072351421189,
"grad_norm": 0.336851567029953,
"learning_rate": 0.0005410300255823347,
"loss": 3.3843,
"step": 14700
},
{
"epoch": 3.9695305770887166,
"grad_norm": 0.3692421019077301,
"learning_rate": 0.0005408280597818769,
"loss": 3.389,
"step": 14750
},
{
"epoch": 3.9829888027562443,
"grad_norm": 0.35839107632637024,
"learning_rate": 0.0005406260939814191,
"loss": 3.3787,
"step": 14800
},
{
"epoch": 3.9964470284237725,
"grad_norm": 0.34140458703041077,
"learning_rate": 0.0005404241281809614,
"loss": 3.385,
"step": 14850
},
{
"epoch": 4.00968992248062,
"grad_norm": 0.37650322914123535,
"learning_rate": 0.0005402221623805035,
"loss": 3.3252,
"step": 14900
},
{
"epoch": 4.023148148148148,
"grad_norm": 0.3383863866329193,
"learning_rate": 0.0005400201965800457,
"loss": 3.2848,
"step": 14950
},
{
"epoch": 4.036606373815676,
"grad_norm": 0.35266733169555664,
"learning_rate": 0.000539818230779588,
"loss": 3.3038,
"step": 15000
},
{
"epoch": 4.036606373815676,
"eval_accuracy": 0.3777076426109491,
"eval_loss": 3.412649154663086,
"eval_runtime": 53.7186,
"eval_samples_per_second": 335.303,
"eval_steps_per_second": 20.961,
"step": 15000
},
{
"epoch": 4.050064599483204,
"grad_norm": 0.3664638102054596,
"learning_rate": 0.0005396162649791301,
"loss": 3.3013,
"step": 15050
},
{
"epoch": 4.0635228251507325,
"grad_norm": 0.3443576991558075,
"learning_rate": 0.0005394142991786724,
"loss": 3.306,
"step": 15100
},
{
"epoch": 4.07698105081826,
"grad_norm": 0.3623943030834198,
"learning_rate": 0.0005392123333782145,
"loss": 3.3083,
"step": 15150
},
{
"epoch": 4.090439276485788,
"grad_norm": 0.345225989818573,
"learning_rate": 0.0005390103675777568,
"loss": 3.3135,
"step": 15200
},
{
"epoch": 4.103897502153316,
"grad_norm": 0.3902343809604645,
"learning_rate": 0.000538808401777299,
"loss": 3.3166,
"step": 15250
},
{
"epoch": 4.117355727820844,
"grad_norm": 0.3565817177295685,
"learning_rate": 0.0005386064359768412,
"loss": 3.3177,
"step": 15300
},
{
"epoch": 4.1308139534883725,
"grad_norm": 0.33238816261291504,
"learning_rate": 0.0005384044701763834,
"loss": 3.3145,
"step": 15350
},
{
"epoch": 4.1442721791559,
"grad_norm": 0.36902859807014465,
"learning_rate": 0.0005382025043759256,
"loss": 3.3058,
"step": 15400
},
{
"epoch": 4.157730404823428,
"grad_norm": 0.349520742893219,
"learning_rate": 0.0005380005385754678,
"loss": 3.3161,
"step": 15450
},
{
"epoch": 4.171188630490956,
"grad_norm": 0.39129889011383057,
"learning_rate": 0.0005377985727750101,
"loss": 3.3209,
"step": 15500
},
{
"epoch": 4.184646856158484,
"grad_norm": 0.36261945962905884,
"learning_rate": 0.0005375966069745523,
"loss": 3.3263,
"step": 15550
},
{
"epoch": 4.198105081826012,
"grad_norm": 0.3307056725025177,
"learning_rate": 0.0005373946411740945,
"loss": 3.3133,
"step": 15600
},
{
"epoch": 4.21156330749354,
"grad_norm": 0.3555365800857544,
"learning_rate": 0.0005371926753736367,
"loss": 3.3177,
"step": 15650
},
{
"epoch": 4.225021533161068,
"grad_norm": 0.3683795630931854,
"learning_rate": 0.000536990709573179,
"loss": 3.3199,
"step": 15700
},
{
"epoch": 4.238479758828596,
"grad_norm": 0.3561367988586426,
"learning_rate": 0.0005367887437727211,
"loss": 3.3146,
"step": 15750
},
{
"epoch": 4.251937984496124,
"grad_norm": 0.3561237156391144,
"learning_rate": 0.0005365867779722634,
"loss": 3.3396,
"step": 15800
},
{
"epoch": 4.265396210163652,
"grad_norm": 0.3543408215045929,
"learning_rate": 0.0005363848121718055,
"loss": 3.3308,
"step": 15850
},
{
"epoch": 4.27885443583118,
"grad_norm": 0.36262819170951843,
"learning_rate": 0.0005361828463713477,
"loss": 3.3253,
"step": 15900
},
{
"epoch": 4.292312661498708,
"grad_norm": 0.36303988099098206,
"learning_rate": 0.00053598088057089,
"loss": 3.3338,
"step": 15950
},
{
"epoch": 4.305770887166236,
"grad_norm": 0.34338095784187317,
"learning_rate": 0.0005357789147704321,
"loss": 3.3297,
"step": 16000
},
{
"epoch": 4.305770887166236,
"eval_accuracy": 0.3787833551278926,
"eval_loss": 3.404193162918091,
"eval_runtime": 53.6662,
"eval_samples_per_second": 335.63,
"eval_steps_per_second": 20.982,
"step": 16000
},
{
"epoch": 4.319229112833764,
"grad_norm": 0.3415316939353943,
"learning_rate": 0.0005355769489699744,
"loss": 3.3426,
"step": 16050
},
{
"epoch": 4.332687338501292,
"grad_norm": 0.34920433163642883,
"learning_rate": 0.0005353749831695165,
"loss": 3.3309,
"step": 16100
},
{
"epoch": 4.34614556416882,
"grad_norm": 0.3775346875190735,
"learning_rate": 0.0005351730173690588,
"loss": 3.329,
"step": 16150
},
{
"epoch": 4.359603789836348,
"grad_norm": 0.34089094400405884,
"learning_rate": 0.000534971051568601,
"loss": 3.311,
"step": 16200
},
{
"epoch": 4.373062015503876,
"grad_norm": 0.3913591206073761,
"learning_rate": 0.0005347690857681432,
"loss": 3.3375,
"step": 16250
},
{
"epoch": 4.386520241171404,
"grad_norm": 0.3345584273338318,
"learning_rate": 0.0005345671199676854,
"loss": 3.3189,
"step": 16300
},
{
"epoch": 4.399978466838932,
"grad_norm": 0.3587518632411957,
"learning_rate": 0.0005343651541672276,
"loss": 3.3273,
"step": 16350
},
{
"epoch": 4.41343669250646,
"grad_norm": 0.35501107573509216,
"learning_rate": 0.0005341631883667699,
"loss": 3.3295,
"step": 16400
},
{
"epoch": 4.426894918173988,
"grad_norm": 0.36069580912590027,
"learning_rate": 0.0005339612225663121,
"loss": 3.3267,
"step": 16450
},
{
"epoch": 4.440353143841516,
"grad_norm": 0.36240604519844055,
"learning_rate": 0.0005337592567658543,
"loss": 3.3333,
"step": 16500
},
{
"epoch": 4.453811369509044,
"grad_norm": 0.35791751742362976,
"learning_rate": 0.0005335572909653965,
"loss": 3.3298,
"step": 16550
},
{
"epoch": 4.467269595176572,
"grad_norm": 0.3455749750137329,
"learning_rate": 0.0005333553251649387,
"loss": 3.3251,
"step": 16600
},
{
"epoch": 4.4807278208441,
"grad_norm": 0.3759973347187042,
"learning_rate": 0.000533153359364481,
"loss": 3.3341,
"step": 16650
},
{
"epoch": 4.4941860465116275,
"grad_norm": 0.3809243440628052,
"learning_rate": 0.0005329513935640231,
"loss": 3.3308,
"step": 16700
},
{
"epoch": 4.507644272179156,
"grad_norm": 0.3631037771701813,
"learning_rate": 0.0005327494277635654,
"loss": 3.3494,
"step": 16750
},
{
"epoch": 4.521102497846684,
"grad_norm": 0.3409591317176819,
"learning_rate": 0.0005325474619631075,
"loss": 3.3306,
"step": 16800
},
{
"epoch": 4.534560723514212,
"grad_norm": 0.3606366813182831,
"learning_rate": 0.0005323454961626497,
"loss": 3.3328,
"step": 16850
},
{
"epoch": 4.54801894918174,
"grad_norm": 0.33501338958740234,
"learning_rate": 0.000532143530362192,
"loss": 3.3409,
"step": 16900
},
{
"epoch": 4.5614771748492675,
"grad_norm": 0.3617742657661438,
"learning_rate": 0.0005319415645617341,
"loss": 3.3329,
"step": 16950
},
{
"epoch": 4.574935400516796,
"grad_norm": 0.8550599813461304,
"learning_rate": 0.0005317395987612764,
"loss": 3.3282,
"step": 17000
},
{
"epoch": 4.574935400516796,
"eval_accuracy": 0.3796109182267269,
"eval_loss": 3.39310884475708,
"eval_runtime": 53.8374,
"eval_samples_per_second": 334.563,
"eval_steps_per_second": 20.915,
"step": 17000
},
{
"epoch": 4.588393626184324,
"grad_norm": 0.35077497363090515,
"learning_rate": 0.0005315376329608185,
"loss": 3.3273,
"step": 17050
},
{
"epoch": 4.601851851851852,
"grad_norm": 0.3806801438331604,
"learning_rate": 0.0005313356671603608,
"loss": 3.3338,
"step": 17100
},
{
"epoch": 4.61531007751938,
"grad_norm": 0.3507062792778015,
"learning_rate": 0.000531133701359903,
"loss": 3.3406,
"step": 17150
},
{
"epoch": 4.628768303186908,
"grad_norm": 0.35898759961128235,
"learning_rate": 0.0005309317355594452,
"loss": 3.3315,
"step": 17200
},
{
"epoch": 4.642226528854436,
"grad_norm": 0.3401118814945221,
"learning_rate": 0.0005307297697589874,
"loss": 3.329,
"step": 17250
},
{
"epoch": 4.655684754521964,
"grad_norm": 0.38454142212867737,
"learning_rate": 0.0005305278039585297,
"loss": 3.3373,
"step": 17300
},
{
"epoch": 4.669142980189492,
"grad_norm": 0.3205120265483856,
"learning_rate": 0.0005303258381580719,
"loss": 3.331,
"step": 17350
},
{
"epoch": 4.682601205857019,
"grad_norm": 0.37140244245529175,
"learning_rate": 0.0005301238723576141,
"loss": 3.3317,
"step": 17400
},
{
"epoch": 4.696059431524548,
"grad_norm": 0.34272000193595886,
"learning_rate": 0.0005299219065571563,
"loss": 3.3464,
"step": 17450
},
{
"epoch": 4.709517657192076,
"grad_norm": 0.3573205769062042,
"learning_rate": 0.0005297199407566985,
"loss": 3.3392,
"step": 17500
},
{
"epoch": 4.722975882859604,
"grad_norm": 0.3326584994792938,
"learning_rate": 0.0005295179749562407,
"loss": 3.3355,
"step": 17550
},
{
"epoch": 4.736434108527131,
"grad_norm": 0.33970969915390015,
"learning_rate": 0.000529316009155783,
"loss": 3.3304,
"step": 17600
},
{
"epoch": 4.7498923341946595,
"grad_norm": 0.34760308265686035,
"learning_rate": 0.0005291140433553251,
"loss": 3.3451,
"step": 17650
},
{
"epoch": 4.763350559862188,
"grad_norm": 0.34035566449165344,
"learning_rate": 0.0005289120775548673,
"loss": 3.3336,
"step": 17700
},
{
"epoch": 4.776808785529716,
"grad_norm": 0.36723145842552185,
"learning_rate": 0.0005287101117544095,
"loss": 3.3358,
"step": 17750
},
{
"epoch": 4.790267011197244,
"grad_norm": 0.3488785922527313,
"learning_rate": 0.0005285081459539517,
"loss": 3.3446,
"step": 17800
},
{
"epoch": 4.803725236864771,
"grad_norm": 0.33435118198394775,
"learning_rate": 0.000528306180153494,
"loss": 3.3367,
"step": 17850
},
{
"epoch": 4.8171834625323,
"grad_norm": 0.35024383664131165,
"learning_rate": 0.0005281042143530361,
"loss": 3.3474,
"step": 17900
},
{
"epoch": 4.830641688199828,
"grad_norm": 0.32620319724082947,
"learning_rate": 0.0005279022485525784,
"loss": 3.3375,
"step": 17950
},
{
"epoch": 4.844099913867356,
"grad_norm": 0.34913358092308044,
"learning_rate": 0.0005277002827521205,
"loss": 3.335,
"step": 18000
},
{
"epoch": 4.844099913867356,
"eval_accuracy": 0.3806844578065591,
"eval_loss": 3.3818206787109375,
"eval_runtime": 54.2008,
"eval_samples_per_second": 332.32,
"eval_steps_per_second": 20.775,
"step": 18000
},
{
"epoch": 4.857558139534884,
"grad_norm": 0.3346174955368042,
"learning_rate": 0.0005274983169516628,
"loss": 3.334,
"step": 18050
},
{
"epoch": 4.871016365202411,
"grad_norm": 0.3737140893936157,
"learning_rate": 0.000527296351151205,
"loss": 3.3378,
"step": 18100
},
{
"epoch": 4.88447459086994,
"grad_norm": 0.3536180853843689,
"learning_rate": 0.0005270943853507472,
"loss": 3.3514,
"step": 18150
},
{
"epoch": 4.897932816537468,
"grad_norm": 0.3579419255256653,
"learning_rate": 0.0005268924195502894,
"loss": 3.3251,
"step": 18200
},
{
"epoch": 4.911391042204996,
"grad_norm": 0.3723459541797638,
"learning_rate": 0.0005266904537498317,
"loss": 3.3348,
"step": 18250
},
{
"epoch": 4.924849267872523,
"grad_norm": 0.3395371735095978,
"learning_rate": 0.0005264884879493739,
"loss": 3.3409,
"step": 18300
},
{
"epoch": 4.9383074935400515,
"grad_norm": 0.3212871253490448,
"learning_rate": 0.0005262865221489161,
"loss": 3.3269,
"step": 18350
},
{
"epoch": 4.95176571920758,
"grad_norm": 0.33720704913139343,
"learning_rate": 0.0005260845563484583,
"loss": 3.3345,
"step": 18400
},
{
"epoch": 4.965223944875108,
"grad_norm": 0.38691216707229614,
"learning_rate": 0.0005258825905480005,
"loss": 3.3425,
"step": 18450
},
{
"epoch": 4.978682170542635,
"grad_norm": 0.3235993981361389,
"learning_rate": 0.0005256806247475427,
"loss": 3.3389,
"step": 18500
},
{
"epoch": 4.992140396210163,
"grad_norm": 0.33104822039604187,
"learning_rate": 0.000525478658947085,
"loss": 3.3442,
"step": 18550
},
{
"epoch": 5.0053832902670115,
"grad_norm": 0.3393186032772064,
"learning_rate": 0.0005252766931466271,
"loss": 3.2944,
"step": 18600
},
{
"epoch": 5.018841515934539,
"grad_norm": 0.37101826071739197,
"learning_rate": 0.0005250747273461693,
"loss": 3.2355,
"step": 18650
},
{
"epoch": 5.032299741602067,
"grad_norm": 0.3399945795536041,
"learning_rate": 0.0005248727615457115,
"loss": 3.2344,
"step": 18700
},
{
"epoch": 5.045757967269595,
"grad_norm": 0.35850459337234497,
"learning_rate": 0.0005246707957452537,
"loss": 3.2436,
"step": 18750
},
{
"epoch": 5.059216192937123,
"grad_norm": 0.35731905698776245,
"learning_rate": 0.000524468829944796,
"loss": 3.252,
"step": 18800
},
{
"epoch": 5.0726744186046515,
"grad_norm": 0.35466647148132324,
"learning_rate": 0.0005242668641443381,
"loss": 3.2462,
"step": 18850
},
{
"epoch": 5.086132644272179,
"grad_norm": 0.3400084376335144,
"learning_rate": 0.0005240648983438804,
"loss": 3.253,
"step": 18900
},
{
"epoch": 5.099590869939707,
"grad_norm": 0.3490995466709137,
"learning_rate": 0.0005238629325434225,
"loss": 3.2463,
"step": 18950
},
{
"epoch": 5.113049095607235,
"grad_norm": 0.3619450032711029,
"learning_rate": 0.0005236609667429648,
"loss": 3.2534,
"step": 19000
},
{
"epoch": 5.113049095607235,
"eval_accuracy": 0.38128494897726367,
"eval_loss": 3.382563591003418,
"eval_runtime": 53.8344,
"eval_samples_per_second": 334.582,
"eval_steps_per_second": 20.916,
"step": 19000
},
{
"epoch": 5.126507321274763,
"grad_norm": 0.36200740933418274,
"learning_rate": 0.000523459000942507,
"loss": 3.2613,
"step": 19050
},
{
"epoch": 5.139965546942291,
"grad_norm": 0.3502586781978607,
"learning_rate": 0.0005232570351420493,
"loss": 3.2516,
"step": 19100
},
{
"epoch": 5.153423772609819,
"grad_norm": 0.3829094171524048,
"learning_rate": 0.0005230550693415915,
"loss": 3.2667,
"step": 19150
},
{
"epoch": 5.166881998277347,
"grad_norm": 0.38154736161231995,
"learning_rate": 0.0005228531035411337,
"loss": 3.257,
"step": 19200
},
{
"epoch": 5.180340223944875,
"grad_norm": 0.3990512192249298,
"learning_rate": 0.0005226511377406759,
"loss": 3.2685,
"step": 19250
},
{
"epoch": 5.1937984496124034,
"grad_norm": 0.35598447918891907,
"learning_rate": 0.0005224491719402181,
"loss": 3.2561,
"step": 19300
},
{
"epoch": 5.207256675279931,
"grad_norm": 0.3600831925868988,
"learning_rate": 0.0005222472061397603,
"loss": 3.2696,
"step": 19350
},
{
"epoch": 5.220714900947459,
"grad_norm": 0.340609610080719,
"learning_rate": 0.0005220452403393026,
"loss": 3.2744,
"step": 19400
},
{
"epoch": 5.234173126614987,
"grad_norm": 0.32513388991355896,
"learning_rate": 0.0005218432745388447,
"loss": 3.2763,
"step": 19450
},
{
"epoch": 5.247631352282515,
"grad_norm": 0.33820098638534546,
"learning_rate": 0.000521641308738387,
"loss": 3.2638,
"step": 19500
},
{
"epoch": 5.2610895779500435,
"grad_norm": 0.3647639751434326,
"learning_rate": 0.0005214393429379291,
"loss": 3.2744,
"step": 19550
},
{
"epoch": 5.274547803617571,
"grad_norm": 0.36164987087249756,
"learning_rate": 0.0005212373771374713,
"loss": 3.2734,
"step": 19600
},
{
"epoch": 5.288006029285099,
"grad_norm": 0.3829108774662018,
"learning_rate": 0.0005210354113370136,
"loss": 3.2782,
"step": 19650
},
{
"epoch": 5.301464254952627,
"grad_norm": 0.3650548458099365,
"learning_rate": 0.0005208334455365557,
"loss": 3.2766,
"step": 19700
},
{
"epoch": 5.314922480620155,
"grad_norm": 0.37036004662513733,
"learning_rate": 0.000520631479736098,
"loss": 3.2846,
"step": 19750
},
{
"epoch": 5.328380706287683,
"grad_norm": 0.3291724920272827,
"learning_rate": 0.0005204295139356401,
"loss": 3.2703,
"step": 19800
},
{
"epoch": 5.341838931955211,
"grad_norm": 0.3196430206298828,
"learning_rate": 0.0005202275481351824,
"loss": 3.2797,
"step": 19850
},
{
"epoch": 5.355297157622739,
"grad_norm": 0.3584645092487335,
"learning_rate": 0.0005200255823347246,
"loss": 3.2801,
"step": 19900
},
{
"epoch": 5.368755383290267,
"grad_norm": 0.34693044424057007,
"learning_rate": 0.0005198236165342669,
"loss": 3.2844,
"step": 19950
},
{
"epoch": 5.3822136089577945,
"grad_norm": 0.3746398091316223,
"learning_rate": 0.000519621650733809,
"loss": 3.2736,
"step": 20000
},
{
"epoch": 5.3822136089577945,
"eval_accuracy": 0.38213033016041054,
"eval_loss": 3.3756988048553467,
"eval_runtime": 53.7995,
"eval_samples_per_second": 334.799,
"eval_steps_per_second": 20.93,
"step": 20000
},
{
"epoch": 5.395671834625323,
"grad_norm": 0.348362535238266,
"learning_rate": 0.0005194196849333513,
"loss": 3.2986,
"step": 20050
},
{
"epoch": 5.409130060292851,
"grad_norm": 0.37827175855636597,
"learning_rate": 0.0005192177191328935,
"loss": 3.2875,
"step": 20100
},
{
"epoch": 5.422588285960379,
"grad_norm": 0.3439246714115143,
"learning_rate": 0.0005190157533324357,
"loss": 3.2836,
"step": 20150
},
{
"epoch": 5.436046511627907,
"grad_norm": 0.37693148851394653,
"learning_rate": 0.0005188137875319779,
"loss": 3.2799,
"step": 20200
},
{
"epoch": 5.449504737295435,
"grad_norm": 0.3539809584617615,
"learning_rate": 0.00051861182173152,
"loss": 3.2826,
"step": 20250
},
{
"epoch": 5.462962962962963,
"grad_norm": 0.37011033296585083,
"learning_rate": 0.0005184098559310623,
"loss": 3.2864,
"step": 20300
},
{
"epoch": 5.476421188630491,
"grad_norm": 0.371745765209198,
"learning_rate": 0.0005182078901306046,
"loss": 3.2917,
"step": 20350
},
{
"epoch": 5.489879414298019,
"grad_norm": 0.3410128951072693,
"learning_rate": 0.0005180059243301467,
"loss": 3.2862,
"step": 20400
},
{
"epoch": 5.503337639965547,
"grad_norm": 0.33623674511909485,
"learning_rate": 0.000517803958529689,
"loss": 3.2809,
"step": 20450
},
{
"epoch": 5.516795865633075,
"grad_norm": 0.37878212332725525,
"learning_rate": 0.0005176019927292311,
"loss": 3.2855,
"step": 20500
},
{
"epoch": 5.530254091300603,
"grad_norm": 0.35254615545272827,
"learning_rate": 0.0005174000269287733,
"loss": 3.2833,
"step": 20550
},
{
"epoch": 5.543712316968131,
"grad_norm": 0.340909481048584,
"learning_rate": 0.0005171980611283156,
"loss": 3.2962,
"step": 20600
},
{
"epoch": 5.557170542635659,
"grad_norm": 0.3362419307231903,
"learning_rate": 0.0005169960953278577,
"loss": 3.2961,
"step": 20650
},
{
"epoch": 5.5706287683031865,
"grad_norm": 0.3751087486743927,
"learning_rate": 0.0005167941295274,
"loss": 3.2949,
"step": 20700
},
{
"epoch": 5.584086993970715,
"grad_norm": 0.3268432021141052,
"learning_rate": 0.0005165921637269421,
"loss": 3.2864,
"step": 20750
},
{
"epoch": 5.597545219638243,
"grad_norm": 0.33362266421318054,
"learning_rate": 0.0005163901979264844,
"loss": 3.2922,
"step": 20800
},
{
"epoch": 5.611003445305771,
"grad_norm": 0.3502647876739502,
"learning_rate": 0.0005161882321260266,
"loss": 3.2945,
"step": 20850
},
{
"epoch": 5.624461670973298,
"grad_norm": 0.3486431837081909,
"learning_rate": 0.0005159862663255689,
"loss": 3.2938,
"step": 20900
},
{
"epoch": 5.637919896640827,
"grad_norm": 0.38896527886390686,
"learning_rate": 0.000515784300525111,
"loss": 3.2806,
"step": 20950
},
{
"epoch": 5.651378122308355,
"grad_norm": 0.35072192549705505,
"learning_rate": 0.0005155823347246533,
"loss": 3.2934,
"step": 21000
},
{
"epoch": 5.651378122308355,
"eval_accuracy": 0.38315747753291673,
"eval_loss": 3.3653366565704346,
"eval_runtime": 53.8377,
"eval_samples_per_second": 334.561,
"eval_steps_per_second": 20.915,
"step": 21000
},
{
"epoch": 5.664836347975883,
"grad_norm": 0.33917105197906494,
"learning_rate": 0.0005153803689241955,
"loss": 3.2948,
"step": 21050
},
{
"epoch": 5.678294573643411,
"grad_norm": 0.39053285121917725,
"learning_rate": 0.0005151784031237377,
"loss": 3.2933,
"step": 21100
},
{
"epoch": 5.6917527993109385,
"grad_norm": 0.34076425433158875,
"learning_rate": 0.0005149764373232799,
"loss": 3.284,
"step": 21150
},
{
"epoch": 5.705211024978467,
"grad_norm": 0.3462291657924652,
"learning_rate": 0.000514774471522822,
"loss": 3.2927,
"step": 21200
},
{
"epoch": 5.718669250645995,
"grad_norm": 0.34847456216812134,
"learning_rate": 0.0005145725057223643,
"loss": 3.2929,
"step": 21250
},
{
"epoch": 5.732127476313523,
"grad_norm": 0.35302457213401794,
"learning_rate": 0.0005143705399219066,
"loss": 3.2986,
"step": 21300
},
{
"epoch": 5.745585701981051,
"grad_norm": 0.3497825562953949,
"learning_rate": 0.0005141685741214487,
"loss": 3.2843,
"step": 21350
},
{
"epoch": 5.7590439276485785,
"grad_norm": 0.34568512439727783,
"learning_rate": 0.0005139666083209909,
"loss": 3.286,
"step": 21400
},
{
"epoch": 5.772502153316107,
"grad_norm": 0.3420438766479492,
"learning_rate": 0.0005137646425205331,
"loss": 3.2816,
"step": 21450
},
{
"epoch": 5.785960378983635,
"grad_norm": 0.372644305229187,
"learning_rate": 0.0005135626767200753,
"loss": 3.2862,
"step": 21500
},
{
"epoch": 5.799418604651163,
"grad_norm": 0.3587517738342285,
"learning_rate": 0.0005133607109196176,
"loss": 3.2871,
"step": 21550
},
{
"epoch": 5.81287683031869,
"grad_norm": 0.353718638420105,
"learning_rate": 0.0005131587451191597,
"loss": 3.2912,
"step": 21600
},
{
"epoch": 5.826335055986219,
"grad_norm": 0.33927828073501587,
"learning_rate": 0.000512956779318702,
"loss": 3.2915,
"step": 21650
},
{
"epoch": 5.839793281653747,
"grad_norm": 0.34568026661872864,
"learning_rate": 0.0005127548135182441,
"loss": 3.3025,
"step": 21700
},
{
"epoch": 5.853251507321275,
"grad_norm": 0.3527640402317047,
"learning_rate": 0.0005125528477177865,
"loss": 3.2974,
"step": 21750
},
{
"epoch": 5.866709732988802,
"grad_norm": 0.3370378911495209,
"learning_rate": 0.0005123508819173286,
"loss": 3.2858,
"step": 21800
},
{
"epoch": 5.8801679586563305,
"grad_norm": 0.3530554175376892,
"learning_rate": 0.0005121489161168709,
"loss": 3.2923,
"step": 21850
},
{
"epoch": 5.893626184323859,
"grad_norm": 0.3588743507862091,
"learning_rate": 0.000511946950316413,
"loss": 3.2968,
"step": 21900
},
{
"epoch": 5.907084409991387,
"grad_norm": 0.3334029018878937,
"learning_rate": 0.0005117449845159553,
"loss": 3.292,
"step": 21950
},
{
"epoch": 5.920542635658915,
"grad_norm": 0.3445538878440857,
"learning_rate": 0.0005115430187154975,
"loss": 3.2915,
"step": 22000
},
{
"epoch": 5.920542635658915,
"eval_accuracy": 0.3842970744009321,
"eval_loss": 3.3547863960266113,
"eval_runtime": 53.6807,
"eval_samples_per_second": 335.54,
"eval_steps_per_second": 20.976,
"step": 22000
},
{
"epoch": 5.934000861326442,
"grad_norm": 0.35701045393943787,
"learning_rate": 0.0005113410529150397,
"loss": 3.2871,
"step": 22050
},
{
"epoch": 5.9474590869939705,
"grad_norm": 0.3606407046318054,
"learning_rate": 0.0005111390871145819,
"loss": 3.2977,
"step": 22100
},
{
"epoch": 5.960917312661499,
"grad_norm": 0.3338751494884491,
"learning_rate": 0.000510937121314124,
"loss": 3.2957,
"step": 22150
},
{
"epoch": 5.974375538329027,
"grad_norm": 0.3611808717250824,
"learning_rate": 0.0005107351555136663,
"loss": 3.286,
"step": 22200
},
{
"epoch": 5.987833763996555,
"grad_norm": 0.34626203775405884,
"learning_rate": 0.0005105331897132085,
"loss": 3.2976,
"step": 22250
},
{
"epoch": 6.001076658053402,
"grad_norm": 0.37035489082336426,
"learning_rate": 0.0005103312239127507,
"loss": 3.2967,
"step": 22300
},
{
"epoch": 6.0145348837209305,
"grad_norm": 0.36504310369491577,
"learning_rate": 0.0005101292581122929,
"loss": 3.19,
"step": 22350
},
{
"epoch": 6.027993109388458,
"grad_norm": 0.37124723196029663,
"learning_rate": 0.0005099272923118351,
"loss": 3.2022,
"step": 22400
},
{
"epoch": 6.041451335055986,
"grad_norm": 0.3739500343799591,
"learning_rate": 0.0005097253265113773,
"loss": 3.1889,
"step": 22450
},
{
"epoch": 6.054909560723514,
"grad_norm": 0.3512820303440094,
"learning_rate": 0.0005095233607109196,
"loss": 3.1982,
"step": 22500
},
{
"epoch": 6.068367786391042,
"grad_norm": 0.36028966307640076,
"learning_rate": 0.0005093213949104617,
"loss": 3.2053,
"step": 22550
},
{
"epoch": 6.0818260120585705,
"grad_norm": 0.3446792662143707,
"learning_rate": 0.000509119429110004,
"loss": 3.211,
"step": 22600
},
{
"epoch": 6.095284237726098,
"grad_norm": 0.33103981614112854,
"learning_rate": 0.0005089174633095462,
"loss": 3.2067,
"step": 22650
},
{
"epoch": 6.108742463393626,
"grad_norm": 0.3572562038898468,
"learning_rate": 0.0005087154975090885,
"loss": 3.2146,
"step": 22700
},
{
"epoch": 6.122200689061154,
"grad_norm": 0.340128093957901,
"learning_rate": 0.0005085135317086306,
"loss": 3.2187,
"step": 22750
},
{
"epoch": 6.135658914728682,
"grad_norm": 0.3321945071220398,
"learning_rate": 0.0005083115659081729,
"loss": 3.2173,
"step": 22800
},
{
"epoch": 6.149117140396211,
"grad_norm": 0.34041503071784973,
"learning_rate": 0.0005081096001077151,
"loss": 3.2212,
"step": 22850
},
{
"epoch": 6.162575366063738,
"grad_norm": 0.33608099818229675,
"learning_rate": 0.0005079076343072573,
"loss": 3.2335,
"step": 22900
},
{
"epoch": 6.176033591731266,
"grad_norm": 0.3682544529438019,
"learning_rate": 0.0005077056685067995,
"loss": 3.2197,
"step": 22950
},
{
"epoch": 6.189491817398794,
"grad_norm": 0.3501201570034027,
"learning_rate": 0.0005075037027063417,
"loss": 3.2243,
"step": 23000
},
{
"epoch": 6.189491817398794,
"eval_accuracy": 0.3844461378867665,
"eval_loss": 3.3621091842651367,
"eval_runtime": 53.6366,
"eval_samples_per_second": 335.815,
"eval_steps_per_second": 20.993,
"step": 23000
},
{
"epoch": 6.2029500430663225,
"grad_norm": 0.3694973289966583,
"learning_rate": 0.0005073017369058839,
"loss": 3.2362,
"step": 23050
},
{
"epoch": 6.21640826873385,
"grad_norm": 0.35488101840019226,
"learning_rate": 0.0005070997711054262,
"loss": 3.2353,
"step": 23100
},
{
"epoch": 6.229866494401378,
"grad_norm": 0.3596543073654175,
"learning_rate": 0.0005068978053049683,
"loss": 3.226,
"step": 23150
},
{
"epoch": 6.243324720068906,
"grad_norm": 0.36401212215423584,
"learning_rate": 0.0005066958395045105,
"loss": 3.2394,
"step": 23200
},
{
"epoch": 6.256782945736434,
"grad_norm": 0.3978697657585144,
"learning_rate": 0.0005064938737040527,
"loss": 3.2418,
"step": 23250
},
{
"epoch": 6.270241171403962,
"grad_norm": 0.33834999799728394,
"learning_rate": 0.0005062919079035949,
"loss": 3.2326,
"step": 23300
},
{
"epoch": 6.28369939707149,
"grad_norm": 0.3549429178237915,
"learning_rate": 0.0005060899421031372,
"loss": 3.2363,
"step": 23350
},
{
"epoch": 6.297157622739018,
"grad_norm": 0.3379305899143219,
"learning_rate": 0.0005058879763026793,
"loss": 3.2259,
"step": 23400
},
{
"epoch": 6.310615848406546,
"grad_norm": 0.3503647446632385,
"learning_rate": 0.0005056860105022216,
"loss": 3.2491,
"step": 23450
},
{
"epoch": 6.324074074074074,
"grad_norm": 0.3466089963912964,
"learning_rate": 0.0005054840447017637,
"loss": 3.2312,
"step": 23500
},
{
"epoch": 6.337532299741602,
"grad_norm": 0.3370702862739563,
"learning_rate": 0.0005052820789013061,
"loss": 3.2449,
"step": 23550
},
{
"epoch": 6.35099052540913,
"grad_norm": 0.3398071825504303,
"learning_rate": 0.0005050801131008482,
"loss": 3.2457,
"step": 23600
},
{
"epoch": 6.364448751076658,
"grad_norm": 0.3322971761226654,
"learning_rate": 0.0005048781473003905,
"loss": 3.2388,
"step": 23650
},
{
"epoch": 6.377906976744186,
"grad_norm": 0.33924156427383423,
"learning_rate": 0.0005046761814999326,
"loss": 3.2471,
"step": 23700
},
{
"epoch": 6.3913652024117145,
"grad_norm": 0.3498065173625946,
"learning_rate": 0.0005044742156994749,
"loss": 3.2248,
"step": 23750
},
{
"epoch": 6.404823428079242,
"grad_norm": 0.3730420768260956,
"learning_rate": 0.0005042722498990171,
"loss": 3.2432,
"step": 23800
},
{
"epoch": 6.41828165374677,
"grad_norm": 0.35002046823501587,
"learning_rate": 0.0005040702840985593,
"loss": 3.2248,
"step": 23850
},
{
"epoch": 6.431739879414298,
"grad_norm": 0.37649956345558167,
"learning_rate": 0.0005038683182981015,
"loss": 3.24,
"step": 23900
},
{
"epoch": 6.445198105081826,
"grad_norm": 0.3526284992694855,
"learning_rate": 0.0005036663524976436,
"loss": 3.2304,
"step": 23950
},
{
"epoch": 6.458656330749354,
"grad_norm": 0.3461792767047882,
"learning_rate": 0.0005034643866971859,
"loss": 3.2413,
"step": 24000
},
{
"epoch": 6.458656330749354,
"eval_accuracy": 0.3850296801480031,
"eval_loss": 3.3537731170654297,
"eval_runtime": 53.6425,
"eval_samples_per_second": 335.778,
"eval_steps_per_second": 20.991,
"step": 24000
},
{
"epoch": 6.472114556416882,
"grad_norm": 0.36732858419418335,
"learning_rate": 0.0005032624208967281,
"loss": 3.2483,
"step": 24050
},
{
"epoch": 6.48557278208441,
"grad_norm": 0.3334197998046875,
"learning_rate": 0.0005030604550962703,
"loss": 3.243,
"step": 24100
},
{
"epoch": 6.499031007751938,
"grad_norm": 0.3647319972515106,
"learning_rate": 0.0005028584892958125,
"loss": 3.2491,
"step": 24150
},
{
"epoch": 6.5124892334194655,
"grad_norm": 0.3465515077114105,
"learning_rate": 0.0005026565234953547,
"loss": 3.2504,
"step": 24200
},
{
"epoch": 6.525947459086994,
"grad_norm": 0.36729180812835693,
"learning_rate": 0.0005024545576948969,
"loss": 3.2485,
"step": 24250
},
{
"epoch": 6.539405684754522,
"grad_norm": 0.3396286964416504,
"learning_rate": 0.0005022525918944392,
"loss": 3.2503,
"step": 24300
},
{
"epoch": 6.55286391042205,
"grad_norm": 0.4158382713794708,
"learning_rate": 0.0005020506260939813,
"loss": 3.2527,
"step": 24350
},
{
"epoch": 6.566322136089578,
"grad_norm": 0.35420548915863037,
"learning_rate": 0.0005018486602935236,
"loss": 3.2511,
"step": 24400
},
{
"epoch": 6.579780361757106,
"grad_norm": 0.3404446840286255,
"learning_rate": 0.0005016466944930658,
"loss": 3.2485,
"step": 24450
},
{
"epoch": 6.593238587424634,
"grad_norm": 0.3330139219760895,
"learning_rate": 0.0005014447286926081,
"loss": 3.2608,
"step": 24500
},
{
"epoch": 6.606696813092162,
"grad_norm": 0.32995307445526123,
"learning_rate": 0.0005012427628921502,
"loss": 3.2426,
"step": 24550
},
{
"epoch": 6.62015503875969,
"grad_norm": 0.33813372254371643,
"learning_rate": 0.0005010407970916925,
"loss": 3.256,
"step": 24600
},
{
"epoch": 6.633613264427218,
"grad_norm": 0.3447318971157074,
"learning_rate": 0.0005008388312912346,
"loss": 3.2448,
"step": 24650
},
{
"epoch": 6.647071490094746,
"grad_norm": 0.34425389766693115,
"learning_rate": 0.0005006368654907769,
"loss": 3.253,
"step": 24700
},
{
"epoch": 6.660529715762274,
"grad_norm": 0.34749671816825867,
"learning_rate": 0.0005004348996903191,
"loss": 3.2517,
"step": 24750
},
{
"epoch": 6.673987941429802,
"grad_norm": 0.33765843510627747,
"learning_rate": 0.0005002329338898613,
"loss": 3.2594,
"step": 24800
},
{
"epoch": 6.68744616709733,
"grad_norm": 0.34231141209602356,
"learning_rate": 0.0005000309680894035,
"loss": 3.2659,
"step": 24850
},
{
"epoch": 6.7009043927648575,
"grad_norm": 0.3280162811279297,
"learning_rate": 0.0004998290022889456,
"loss": 3.2453,
"step": 24900
},
{
"epoch": 6.714362618432386,
"grad_norm": 0.34672361612319946,
"learning_rate": 0.0004996270364884879,
"loss": 3.2519,
"step": 24950
},
{
"epoch": 6.727820844099914,
"grad_norm": 0.36213499307632446,
"learning_rate": 0.0004994250706880301,
"loss": 3.2643,
"step": 25000
},
{
"epoch": 6.727820844099914,
"eval_accuracy": 0.38583725222541354,
"eval_loss": 3.3438923358917236,
"eval_runtime": 53.6863,
"eval_samples_per_second": 335.505,
"eval_steps_per_second": 20.974,
"step": 25000
},
{
"epoch": 6.741279069767442,
"grad_norm": 0.34953513741493225,
"learning_rate": 0.0004992231048875723,
"loss": 3.2652,
"step": 25050
},
{
"epoch": 6.754737295434969,
"grad_norm": 0.3408108949661255,
"learning_rate": 0.0004990211390871145,
"loss": 3.2599,
"step": 25100
},
{
"epoch": 6.768195521102498,
"grad_norm": 0.3221488893032074,
"learning_rate": 0.0004988191732866567,
"loss": 3.2672,
"step": 25150
},
{
"epoch": 6.781653746770026,
"grad_norm": 0.36635658144950867,
"learning_rate": 0.0004986172074861989,
"loss": 3.2519,
"step": 25200
},
{
"epoch": 6.795111972437554,
"grad_norm": 0.395259827375412,
"learning_rate": 0.0004984152416857412,
"loss": 3.2548,
"step": 25250
},
{
"epoch": 6.808570198105082,
"grad_norm": 0.3348065912723541,
"learning_rate": 0.0004982132758852834,
"loss": 3.258,
"step": 25300
},
{
"epoch": 6.822028423772609,
"grad_norm": 0.37395408749580383,
"learning_rate": 0.0004980113100848256,
"loss": 3.2477,
"step": 25350
},
{
"epoch": 6.835486649440138,
"grad_norm": 0.32347390055656433,
"learning_rate": 0.0004978093442843678,
"loss": 3.2514,
"step": 25400
},
{
"epoch": 6.848944875107666,
"grad_norm": 0.34834301471710205,
"learning_rate": 0.0004976073784839101,
"loss": 3.2541,
"step": 25450
},
{
"epoch": 6.862403100775194,
"grad_norm": 0.3680538535118103,
"learning_rate": 0.0004974054126834522,
"loss": 3.2515,
"step": 25500
},
{
"epoch": 6.875861326442722,
"grad_norm": 0.3374391496181488,
"learning_rate": 0.0004972034468829945,
"loss": 3.2601,
"step": 25550
},
{
"epoch": 6.8893195521102495,
"grad_norm": 0.37110382318496704,
"learning_rate": 0.0004970014810825366,
"loss": 3.2651,
"step": 25600
},
{
"epoch": 6.902777777777778,
"grad_norm": 0.33359917998313904,
"learning_rate": 0.0004967995152820789,
"loss": 3.2554,
"step": 25650
},
{
"epoch": 6.916236003445306,
"grad_norm": 0.3703191578388214,
"learning_rate": 0.0004965975494816211,
"loss": 3.262,
"step": 25700
},
{
"epoch": 6.929694229112834,
"grad_norm": 0.3683694303035736,
"learning_rate": 0.0004963955836811633,
"loss": 3.2602,
"step": 25750
},
{
"epoch": 6.943152454780362,
"grad_norm": 0.34347638487815857,
"learning_rate": 0.0004961936178807055,
"loss": 3.2569,
"step": 25800
},
{
"epoch": 6.9566106804478895,
"grad_norm": 0.3468749225139618,
"learning_rate": 0.0004959916520802476,
"loss": 3.2657,
"step": 25850
},
{
"epoch": 6.970068906115418,
"grad_norm": 0.31962108612060547,
"learning_rate": 0.0004957896862797899,
"loss": 3.2497,
"step": 25900
},
{
"epoch": 6.983527131782946,
"grad_norm": 0.3544672727584839,
"learning_rate": 0.0004955877204793321,
"loss": 3.2562,
"step": 25950
},
{
"epoch": 6.996985357450473,
"grad_norm": 0.35126814246177673,
"learning_rate": 0.0004953857546788743,
"loss": 3.2581,
"step": 26000
},
{
"epoch": 6.996985357450473,
"eval_accuracy": 0.3859895751169149,
"eval_loss": 3.33788800239563,
"eval_runtime": 53.8045,
"eval_samples_per_second": 334.767,
"eval_steps_per_second": 20.928,
"step": 26000
},
{
"epoch": 7.010228251507321,
"grad_norm": 0.3702124059200287,
"learning_rate": 0.0004951837888784165,
"loss": 3.1869,
"step": 26050
},
{
"epoch": 7.0236864771748495,
"grad_norm": 0.36202797293663025,
"learning_rate": 0.0004949818230779587,
"loss": 3.1512,
"step": 26100
},
{
"epoch": 7.037144702842378,
"grad_norm": 0.3474515676498413,
"learning_rate": 0.0004947798572775009,
"loss": 3.1569,
"step": 26150
},
{
"epoch": 7.050602928509905,
"grad_norm": 0.3655959963798523,
"learning_rate": 0.0004945778914770432,
"loss": 3.1773,
"step": 26200
},
{
"epoch": 7.064061154177433,
"grad_norm": 0.34737685322761536,
"learning_rate": 0.0004943759256765854,
"loss": 3.171,
"step": 26250
},
{
"epoch": 7.077519379844961,
"grad_norm": 0.377200186252594,
"learning_rate": 0.0004941739598761276,
"loss": 3.1669,
"step": 26300
},
{
"epoch": 7.09097760551249,
"grad_norm": 0.3763810694217682,
"learning_rate": 0.0004939719940756698,
"loss": 3.1819,
"step": 26350
},
{
"epoch": 7.104435831180017,
"grad_norm": 0.3821322023868561,
"learning_rate": 0.0004937700282752121,
"loss": 3.1822,
"step": 26400
},
{
"epoch": 7.117894056847545,
"grad_norm": 0.36671724915504456,
"learning_rate": 0.0004935680624747542,
"loss": 3.1802,
"step": 26450
},
{
"epoch": 7.131352282515073,
"grad_norm": 0.3423325717449188,
"learning_rate": 0.0004933660966742965,
"loss": 3.1887,
"step": 26500
},
{
"epoch": 7.144810508182601,
"grad_norm": 0.34335920214653015,
"learning_rate": 0.0004931641308738386,
"loss": 3.1763,
"step": 26550
},
{
"epoch": 7.15826873385013,
"grad_norm": 0.32979974150657654,
"learning_rate": 0.0004929621650733809,
"loss": 3.1907,
"step": 26600
},
{
"epoch": 7.171726959517657,
"grad_norm": 0.3438250720500946,
"learning_rate": 0.0004927601992729231,
"loss": 3.1968,
"step": 26650
},
{
"epoch": 7.185185185185185,
"grad_norm": 0.3741178512573242,
"learning_rate": 0.0004925582334724652,
"loss": 3.1963,
"step": 26700
},
{
"epoch": 7.198643410852713,
"grad_norm": 0.34840378165245056,
"learning_rate": 0.0004923562676720075,
"loss": 3.1908,
"step": 26750
},
{
"epoch": 7.2121016365202415,
"grad_norm": 0.3589928150177002,
"learning_rate": 0.0004921543018715497,
"loss": 3.1877,
"step": 26800
},
{
"epoch": 7.225559862187769,
"grad_norm": 0.3323129415512085,
"learning_rate": 0.0004919523360710919,
"loss": 3.1912,
"step": 26850
},
{
"epoch": 7.239018087855297,
"grad_norm": 0.3843107223510742,
"learning_rate": 0.0004917503702706341,
"loss": 3.1922,
"step": 26900
},
{
"epoch": 7.252476313522825,
"grad_norm": 0.32951635122299194,
"learning_rate": 0.0004915484044701763,
"loss": 3.1885,
"step": 26950
},
{
"epoch": 7.265934539190353,
"grad_norm": 0.35862505435943604,
"learning_rate": 0.0004913464386697185,
"loss": 3.198,
"step": 27000
},
{
"epoch": 7.265934539190353,
"eval_accuracy": 0.38612983820744856,
"eval_loss": 3.345175266265869,
"eval_runtime": 53.7729,
"eval_samples_per_second": 334.964,
"eval_steps_per_second": 20.94,
"step": 27000
},
{
"epoch": 7.279392764857882,
"grad_norm": 0.3674672842025757,
"learning_rate": 0.0004911444728692608,
"loss": 3.1976,
"step": 27050
},
{
"epoch": 7.292850990525409,
"grad_norm": 0.35447216033935547,
"learning_rate": 0.000490942507068803,
"loss": 3.2064,
"step": 27100
},
{
"epoch": 7.306309216192937,
"grad_norm": 0.36305779218673706,
"learning_rate": 0.0004907405412683452,
"loss": 3.2001,
"step": 27150
},
{
"epoch": 7.319767441860465,
"grad_norm": 0.3749120831489563,
"learning_rate": 0.0004905385754678874,
"loss": 3.2083,
"step": 27200
},
{
"epoch": 7.333225667527993,
"grad_norm": 0.3669654428958893,
"learning_rate": 0.0004903366096674297,
"loss": 3.2036,
"step": 27250
},
{
"epoch": 7.346683893195521,
"grad_norm": 0.3420581817626953,
"learning_rate": 0.0004901346438669718,
"loss": 3.2078,
"step": 27300
},
{
"epoch": 7.360142118863049,
"grad_norm": 0.4257405400276184,
"learning_rate": 0.0004899326780665141,
"loss": 3.2083,
"step": 27350
},
{
"epoch": 7.373600344530577,
"grad_norm": 0.3865572512149811,
"learning_rate": 0.0004897307122660562,
"loss": 3.2109,
"step": 27400
},
{
"epoch": 7.387058570198105,
"grad_norm": 0.3718532621860504,
"learning_rate": 0.0004895287464655985,
"loss": 3.207,
"step": 27450
},
{
"epoch": 7.4005167958656335,
"grad_norm": 0.33398640155792236,
"learning_rate": 0.0004893267806651407,
"loss": 3.2055,
"step": 27500
},
{
"epoch": 7.413975021533161,
"grad_norm": 0.3843555152416229,
"learning_rate": 0.0004891248148646829,
"loss": 3.2098,
"step": 27550
},
{
"epoch": 7.427433247200689,
"grad_norm": 0.3726537227630615,
"learning_rate": 0.0004889228490642251,
"loss": 3.2024,
"step": 27600
},
{
"epoch": 7.440891472868217,
"grad_norm": 0.37347468733787537,
"learning_rate": 0.0004887208832637672,
"loss": 3.2197,
"step": 27650
},
{
"epoch": 7.454349698535745,
"grad_norm": 0.3620690107345581,
"learning_rate": 0.0004885189174633095,
"loss": 3.1987,
"step": 27700
},
{
"epoch": 7.467807924203273,
"grad_norm": 0.35882100462913513,
"learning_rate": 0.0004883169516628517,
"loss": 3.2174,
"step": 27750
},
{
"epoch": 7.481266149870801,
"grad_norm": 0.36509430408477783,
"learning_rate": 0.0004881149858623939,
"loss": 3.2147,
"step": 27800
},
{
"epoch": 7.494724375538329,
"grad_norm": 0.35377126932144165,
"learning_rate": 0.00048791302006193614,
"loss": 3.2112,
"step": 27850
},
{
"epoch": 7.508182601205857,
"grad_norm": 0.35412663221359253,
"learning_rate": 0.00048771105426147833,
"loss": 3.2221,
"step": 27900
},
{
"epoch": 7.521640826873385,
"grad_norm": 0.3604266941547394,
"learning_rate": 0.00048750908846102053,
"loss": 3.2176,
"step": 27950
},
{
"epoch": 7.535099052540913,
"grad_norm": 0.36388853192329407,
"learning_rate": 0.0004873071226605628,
"loss": 3.2262,
"step": 28000
},
{
"epoch": 7.535099052540913,
"eval_accuracy": 0.3867142496435297,
"eval_loss": 3.3380982875823975,
"eval_runtime": 53.7774,
"eval_samples_per_second": 334.936,
"eval_steps_per_second": 20.938,
"step": 28000
},
{
"epoch": 7.548557278208441,
"grad_norm": 0.350801557302475,
"learning_rate": 0.00048710515686010503,
"loss": 3.2109,
"step": 28050
},
{
"epoch": 7.562015503875969,
"grad_norm": 0.34448304772377014,
"learning_rate": 0.0004869031910596472,
"loss": 3.2345,
"step": 28100
},
{
"epoch": 7.575473729543497,
"grad_norm": 0.3337467908859253,
"learning_rate": 0.0004867012252591894,
"loss": 3.2291,
"step": 28150
},
{
"epoch": 7.588931955211025,
"grad_norm": 0.37222522497177124,
"learning_rate": 0.0004864992594587316,
"loss": 3.222,
"step": 28200
},
{
"epoch": 7.602390180878553,
"grad_norm": 0.34009498357772827,
"learning_rate": 0.0004862972936582738,
"loss": 3.2206,
"step": 28250
},
{
"epoch": 7.615848406546081,
"grad_norm": 0.3641204535961151,
"learning_rate": 0.00048609532785781606,
"loss": 3.2147,
"step": 28300
},
{
"epoch": 7.629306632213609,
"grad_norm": 0.3570398986339569,
"learning_rate": 0.00048589336205735826,
"loss": 3.2101,
"step": 28350
},
{
"epoch": 7.6427648578811365,
"grad_norm": 0.3372342586517334,
"learning_rate": 0.00048569139625690046,
"loss": 3.226,
"step": 28400
},
{
"epoch": 7.656223083548665,
"grad_norm": 0.34732359647750854,
"learning_rate": 0.00048548943045644265,
"loss": 3.2353,
"step": 28450
},
{
"epoch": 7.669681309216193,
"grad_norm": 0.38042765855789185,
"learning_rate": 0.00048528746465598485,
"loss": 3.2211,
"step": 28500
},
{
"epoch": 7.683139534883721,
"grad_norm": 0.35742899775505066,
"learning_rate": 0.0004850854988555271,
"loss": 3.2208,
"step": 28550
},
{
"epoch": 7.696597760551249,
"grad_norm": 0.37352654337882996,
"learning_rate": 0.0004848835330550693,
"loss": 3.2207,
"step": 28600
},
{
"epoch": 7.7100559862187765,
"grad_norm": 0.35837510228157043,
"learning_rate": 0.0004846815672546115,
"loss": 3.2348,
"step": 28650
},
{
"epoch": 7.723514211886305,
"grad_norm": 0.32850074768066406,
"learning_rate": 0.0004844796014541537,
"loss": 3.2138,
"step": 28700
},
{
"epoch": 7.736972437553833,
"grad_norm": 0.373390257358551,
"learning_rate": 0.0004842776356536959,
"loss": 3.2324,
"step": 28750
},
{
"epoch": 7.750430663221361,
"grad_norm": 0.3398002088069916,
"learning_rate": 0.00048407566985323813,
"loss": 3.2306,
"step": 28800
},
{
"epoch": 7.763888888888889,
"grad_norm": 0.4056737422943115,
"learning_rate": 0.0004838737040527804,
"loss": 3.217,
"step": 28850
},
{
"epoch": 7.777347114556417,
"grad_norm": 0.3630368113517761,
"learning_rate": 0.0004836717382523226,
"loss": 3.2092,
"step": 28900
},
{
"epoch": 7.790805340223945,
"grad_norm": 0.36135610938072205,
"learning_rate": 0.0004834697724518648,
"loss": 3.219,
"step": 28950
},
{
"epoch": 7.804263565891473,
"grad_norm": 0.33417677879333496,
"learning_rate": 0.000483267806651407,
"loss": 3.2175,
"step": 29000
},
{
"epoch": 7.804263565891473,
"eval_accuracy": 0.38752779729799613,
"eval_loss": 3.331707000732422,
"eval_runtime": 53.6927,
"eval_samples_per_second": 335.465,
"eval_steps_per_second": 20.971,
"step": 29000
},
{
"epoch": 7.817721791559001,
"grad_norm": 0.3444252014160156,
"learning_rate": 0.0004830658408509492,
"loss": 3.2306,
"step": 29050
},
{
"epoch": 7.831180017226529,
"grad_norm": 0.34740421175956726,
"learning_rate": 0.0004828638750504914,
"loss": 3.2341,
"step": 29100
},
{
"epoch": 7.844638242894057,
"grad_norm": 0.37734100222587585,
"learning_rate": 0.0004826619092500336,
"loss": 3.2191,
"step": 29150
},
{
"epoch": 7.858096468561585,
"grad_norm": 0.3513396680355072,
"learning_rate": 0.0004824599434495758,
"loss": 3.2282,
"step": 29200
},
{
"epoch": 7.871554694229113,
"grad_norm": 0.3746366798877716,
"learning_rate": 0.00048225797764911806,
"loss": 3.2241,
"step": 29250
},
{
"epoch": 7.885012919896641,
"grad_norm": 0.3567333519458771,
"learning_rate": 0.00048205601184866026,
"loss": 3.2393,
"step": 29300
},
{
"epoch": 7.8984711455641685,
"grad_norm": 0.33819180727005005,
"learning_rate": 0.00048185404604820245,
"loss": 3.2353,
"step": 29350
},
{
"epoch": 7.911929371231697,
"grad_norm": 0.38296690583229065,
"learning_rate": 0.00048165208024774465,
"loss": 3.2292,
"step": 29400
},
{
"epoch": 7.925387596899225,
"grad_norm": 0.3398057222366333,
"learning_rate": 0.00048145011444728685,
"loss": 3.2312,
"step": 29450
},
{
"epoch": 7.938845822566753,
"grad_norm": 0.35328567028045654,
"learning_rate": 0.0004812481486468291,
"loss": 3.231,
"step": 29500
},
{
"epoch": 7.95230404823428,
"grad_norm": 0.3437725603580475,
"learning_rate": 0.0004810461828463713,
"loss": 3.2242,
"step": 29550
},
{
"epoch": 7.965762273901809,
"grad_norm": 0.342734158039093,
"learning_rate": 0.0004808442170459135,
"loss": 3.2265,
"step": 29600
},
{
"epoch": 7.979220499569337,
"grad_norm": 0.33320966362953186,
"learning_rate": 0.0004806422512454557,
"loss": 3.2315,
"step": 29650
},
{
"epoch": 7.992678725236865,
"grad_norm": 0.3411356508731842,
"learning_rate": 0.00048044028544499793,
"loss": 3.232,
"step": 29700
},
{
"epoch": 8.005921619293712,
"grad_norm": 0.40168818831443787,
"learning_rate": 0.0004802383196445402,
"loss": 3.1846,
"step": 29750
},
{
"epoch": 8.01937984496124,
"grad_norm": 0.35845109820365906,
"learning_rate": 0.0004800363538440824,
"loss": 3.1328,
"step": 29800
},
{
"epoch": 8.032838070628769,
"grad_norm": 0.34396156668663025,
"learning_rate": 0.0004798343880436246,
"loss": 3.126,
"step": 29850
},
{
"epoch": 8.046296296296296,
"grad_norm": 0.3609023094177246,
"learning_rate": 0.00047963242224316683,
"loss": 3.1423,
"step": 29900
},
{
"epoch": 8.059754521963825,
"grad_norm": 0.34926462173461914,
"learning_rate": 0.000479430456442709,
"loss": 3.1416,
"step": 29950
},
{
"epoch": 8.073212747631352,
"grad_norm": 0.3574993312358856,
"learning_rate": 0.0004792284906422512,
"loss": 3.146,
"step": 30000
},
{
"epoch": 8.073212747631352,
"eval_accuracy": 0.3879771606926107,
"eval_loss": 3.3365345001220703,
"eval_runtime": 53.7752,
"eval_samples_per_second": 334.95,
"eval_steps_per_second": 20.939,
"step": 30000
},
{
"epoch": 8.08667097329888,
"grad_norm": 0.3560766875743866,
"learning_rate": 0.0004790265248417934,
"loss": 3.1485,
"step": 30050
},
{
"epoch": 8.100129198966409,
"grad_norm": 0.3525884747505188,
"learning_rate": 0.0004788245590413356,
"loss": 3.1476,
"step": 30100
},
{
"epoch": 8.113587424633936,
"grad_norm": 0.3512996435165405,
"learning_rate": 0.00047862259324087786,
"loss": 3.1545,
"step": 30150
},
{
"epoch": 8.127045650301465,
"grad_norm": 0.3729488253593445,
"learning_rate": 0.00047842062744042006,
"loss": 3.1522,
"step": 30200
},
{
"epoch": 8.140503875968992,
"grad_norm": 0.3845618963241577,
"learning_rate": 0.00047821866163996225,
"loss": 3.1521,
"step": 30250
},
{
"epoch": 8.15396210163652,
"grad_norm": 0.36919400095939636,
"learning_rate": 0.00047801669583950445,
"loss": 3.1622,
"step": 30300
},
{
"epoch": 8.167420327304049,
"grad_norm": 0.35437729954719543,
"learning_rate": 0.00047781473003904665,
"loss": 3.1565,
"step": 30350
},
{
"epoch": 8.180878552971576,
"grad_norm": 0.37341639399528503,
"learning_rate": 0.0004776127642385889,
"loss": 3.169,
"step": 30400
},
{
"epoch": 8.194336778639105,
"grad_norm": 0.3353579342365265,
"learning_rate": 0.0004774107984381311,
"loss": 3.1711,
"step": 30450
},
{
"epoch": 8.207795004306632,
"grad_norm": 0.36150503158569336,
"learning_rate": 0.0004772088326376733,
"loss": 3.1728,
"step": 30500
},
{
"epoch": 8.22125322997416,
"grad_norm": 0.36009085178375244,
"learning_rate": 0.0004770068668372155,
"loss": 3.1609,
"step": 30550
},
{
"epoch": 8.234711455641689,
"grad_norm": 0.33860263228416443,
"learning_rate": 0.0004768049010367577,
"loss": 3.168,
"step": 30600
},
{
"epoch": 8.248169681309216,
"grad_norm": 0.392787367105484,
"learning_rate": 0.0004766029352363,
"loss": 3.1719,
"step": 30650
},
{
"epoch": 8.261627906976745,
"grad_norm": 0.3563990592956543,
"learning_rate": 0.0004764009694358422,
"loss": 3.1695,
"step": 30700
},
{
"epoch": 8.275086132644272,
"grad_norm": 0.3538586497306824,
"learning_rate": 0.0004761990036353844,
"loss": 3.1694,
"step": 30750
},
{
"epoch": 8.2885443583118,
"grad_norm": 0.37058621644973755,
"learning_rate": 0.0004759970378349266,
"loss": 3.1726,
"step": 30800
},
{
"epoch": 8.302002583979329,
"grad_norm": 0.34298816323280334,
"learning_rate": 0.0004757950720344688,
"loss": 3.1585,
"step": 30850
},
{
"epoch": 8.315460809646856,
"grad_norm": 0.35922834277153015,
"learning_rate": 0.000475593106234011,
"loss": 3.1808,
"step": 30900
},
{
"epoch": 8.328919035314383,
"grad_norm": 0.3559612035751343,
"learning_rate": 0.0004753911404335532,
"loss": 3.1752,
"step": 30950
},
{
"epoch": 8.342377260981912,
"grad_norm": 0.38853004574775696,
"learning_rate": 0.0004751891746330954,
"loss": 3.1766,
"step": 31000
},
{
"epoch": 8.342377260981912,
"eval_accuracy": 0.38834275736158497,
"eval_loss": 3.3302693367004395,
"eval_runtime": 53.7698,
"eval_samples_per_second": 334.984,
"eval_steps_per_second": 20.941,
"step": 31000
},
{
"epoch": 8.35583548664944,
"grad_norm": 0.3335705101490021,
"learning_rate": 0.0004749872088326376,
"loss": 3.1689,
"step": 31050
},
{
"epoch": 8.369293712316969,
"grad_norm": 0.34073248505592346,
"learning_rate": 0.00047478524303217986,
"loss": 3.1727,
"step": 31100
},
{
"epoch": 8.382751937984496,
"grad_norm": 0.3616897165775299,
"learning_rate": 0.00047458327723172206,
"loss": 3.1768,
"step": 31150
},
{
"epoch": 8.396210163652023,
"grad_norm": 0.3575231432914734,
"learning_rate": 0.00047438131143126425,
"loss": 3.1921,
"step": 31200
},
{
"epoch": 8.409668389319553,
"grad_norm": 0.3506259620189667,
"learning_rate": 0.00047417934563080645,
"loss": 3.1823,
"step": 31250
},
{
"epoch": 8.42312661498708,
"grad_norm": 0.36722439527511597,
"learning_rate": 0.00047397737983034864,
"loss": 3.1934,
"step": 31300
},
{
"epoch": 8.436584840654609,
"grad_norm": 0.38640642166137695,
"learning_rate": 0.0004737754140298909,
"loss": 3.1819,
"step": 31350
},
{
"epoch": 8.450043066322136,
"grad_norm": 0.3896099328994751,
"learning_rate": 0.0004735734482294331,
"loss": 3.1933,
"step": 31400
},
{
"epoch": 8.463501291989663,
"grad_norm": 0.39949628710746765,
"learning_rate": 0.0004733714824289753,
"loss": 3.1912,
"step": 31450
},
{
"epoch": 8.476959517657193,
"grad_norm": 0.3628818690776825,
"learning_rate": 0.0004731695166285175,
"loss": 3.1906,
"step": 31500
},
{
"epoch": 8.49041774332472,
"grad_norm": 0.3500027060508728,
"learning_rate": 0.0004729675508280598,
"loss": 3.1806,
"step": 31550
},
{
"epoch": 8.503875968992247,
"grad_norm": 0.37505653500556946,
"learning_rate": 0.000472765585027602,
"loss": 3.1896,
"step": 31600
},
{
"epoch": 8.517334194659776,
"grad_norm": 0.34353429079055786,
"learning_rate": 0.0004725636192271442,
"loss": 3.1806,
"step": 31650
},
{
"epoch": 8.530792420327304,
"grad_norm": 0.3562263250350952,
"learning_rate": 0.0004723616534266864,
"loss": 3.1983,
"step": 31700
},
{
"epoch": 8.544250645994833,
"grad_norm": 0.36144253611564636,
"learning_rate": 0.0004721596876262286,
"loss": 3.1921,
"step": 31750
},
{
"epoch": 8.55770887166236,
"grad_norm": 0.3487912118434906,
"learning_rate": 0.0004719577218257708,
"loss": 3.1899,
"step": 31800
},
{
"epoch": 8.571167097329887,
"grad_norm": 0.3909497559070587,
"learning_rate": 0.000471755756025313,
"loss": 3.1832,
"step": 31850
},
{
"epoch": 8.584625322997416,
"grad_norm": 0.358192503452301,
"learning_rate": 0.0004715537902248552,
"loss": 3.202,
"step": 31900
},
{
"epoch": 8.598083548664944,
"grad_norm": 0.37098532915115356,
"learning_rate": 0.0004713518244243974,
"loss": 3.1867,
"step": 31950
},
{
"epoch": 8.611541774332473,
"grad_norm": 0.36137476563453674,
"learning_rate": 0.00047114985862393966,
"loss": 3.2014,
"step": 32000
},
{
"epoch": 8.611541774332473,
"eval_accuracy": 0.3883745908902654,
"eval_loss": 3.3255698680877686,
"eval_runtime": 53.8982,
"eval_samples_per_second": 334.185,
"eval_steps_per_second": 20.891,
"step": 32000
},
{
"epoch": 8.625,
"grad_norm": 0.35992470383644104,
"learning_rate": 0.00047094789282348186,
"loss": 3.1849,
"step": 32050
},
{
"epoch": 8.638458225667527,
"grad_norm": 0.33300545811653137,
"learning_rate": 0.00047074592702302405,
"loss": 3.1975,
"step": 32100
},
{
"epoch": 8.651916451335056,
"grad_norm": 0.34040096402168274,
"learning_rate": 0.00047054396122256625,
"loss": 3.1932,
"step": 32150
},
{
"epoch": 8.665374677002584,
"grad_norm": 0.36708134412765503,
"learning_rate": 0.00047034199542210844,
"loss": 3.1929,
"step": 32200
},
{
"epoch": 8.678832902670113,
"grad_norm": 0.37494978308677673,
"learning_rate": 0.0004701400296216507,
"loss": 3.1904,
"step": 32250
},
{
"epoch": 8.69229112833764,
"grad_norm": 0.3530576825141907,
"learning_rate": 0.0004699380638211929,
"loss": 3.2015,
"step": 32300
},
{
"epoch": 8.705749354005167,
"grad_norm": 0.3551010489463806,
"learning_rate": 0.0004697360980207351,
"loss": 3.2013,
"step": 32350
},
{
"epoch": 8.719207579672696,
"grad_norm": 0.3860146105289459,
"learning_rate": 0.0004695341322202773,
"loss": 3.1963,
"step": 32400
},
{
"epoch": 8.732665805340224,
"grad_norm": 0.3952493965625763,
"learning_rate": 0.0004693321664198196,
"loss": 3.2054,
"step": 32450
},
{
"epoch": 8.746124031007753,
"grad_norm": 0.35887420177459717,
"learning_rate": 0.0004691302006193618,
"loss": 3.2014,
"step": 32500
},
{
"epoch": 8.75958225667528,
"grad_norm": 0.3608771860599518,
"learning_rate": 0.000468928234818904,
"loss": 3.2075,
"step": 32550
},
{
"epoch": 8.773040482342807,
"grad_norm": 0.3411955237388611,
"learning_rate": 0.0004687262690184462,
"loss": 3.1987,
"step": 32600
},
{
"epoch": 8.786498708010337,
"grad_norm": 0.3674717843532562,
"learning_rate": 0.00046852430321798837,
"loss": 3.2043,
"step": 32650
},
{
"epoch": 8.799956933677864,
"grad_norm": 0.34659647941589355,
"learning_rate": 0.0004683223374175306,
"loss": 3.1958,
"step": 32700
},
{
"epoch": 8.813415159345391,
"grad_norm": 0.3701222538948059,
"learning_rate": 0.0004681203716170728,
"loss": 3.183,
"step": 32750
},
{
"epoch": 8.82687338501292,
"grad_norm": 0.355498731136322,
"learning_rate": 0.000467918405816615,
"loss": 3.1937,
"step": 32800
},
{
"epoch": 8.840331610680447,
"grad_norm": 0.3362954556941986,
"learning_rate": 0.0004677164400161572,
"loss": 3.1952,
"step": 32850
},
{
"epoch": 8.853789836347977,
"grad_norm": 0.3454212248325348,
"learning_rate": 0.0004675144742156994,
"loss": 3.1975,
"step": 32900
},
{
"epoch": 8.867248062015504,
"grad_norm": 0.3511376976966858,
"learning_rate": 0.00046731250841524166,
"loss": 3.1951,
"step": 32950
},
{
"epoch": 8.880706287683031,
"grad_norm": 0.3271363377571106,
"learning_rate": 0.00046711054261478385,
"loss": 3.1996,
"step": 33000
},
{
"epoch": 8.880706287683031,
"eval_accuracy": 0.38930091398080774,
"eval_loss": 3.3175978660583496,
"eval_runtime": 53.6786,
"eval_samples_per_second": 335.553,
"eval_steps_per_second": 20.977,
"step": 33000
},
{
"epoch": 8.89416451335056,
"grad_norm": 0.3616025149822235,
"learning_rate": 0.00046690857681432605,
"loss": 3.1985,
"step": 33050
},
{
"epoch": 8.907622739018088,
"grad_norm": 0.33482256531715393,
"learning_rate": 0.00046670661101386825,
"loss": 3.2106,
"step": 33100
},
{
"epoch": 8.921080964685617,
"grad_norm": 0.35267388820648193,
"learning_rate": 0.00046650464521341044,
"loss": 3.1885,
"step": 33150
},
{
"epoch": 8.934539190353144,
"grad_norm": 0.387184202671051,
"learning_rate": 0.0004663026794129527,
"loss": 3.2031,
"step": 33200
},
{
"epoch": 8.947997416020671,
"grad_norm": 0.34149935841560364,
"learning_rate": 0.0004661007136124949,
"loss": 3.2134,
"step": 33250
},
{
"epoch": 8.9614556416882,
"grad_norm": 0.34807565808296204,
"learning_rate": 0.00046589874781203714,
"loss": 3.206,
"step": 33300
},
{
"epoch": 8.974913867355728,
"grad_norm": 0.3618689775466919,
"learning_rate": 0.0004656967820115794,
"loss": 3.2077,
"step": 33350
},
{
"epoch": 8.988372093023255,
"grad_norm": 0.35687363147735596,
"learning_rate": 0.0004654948162111216,
"loss": 3.2034,
"step": 33400
},
{
"epoch": 9.001614987080103,
"grad_norm": 0.3490736484527588,
"learning_rate": 0.0004652928504106638,
"loss": 3.1925,
"step": 33450
},
{
"epoch": 9.015073212747632,
"grad_norm": 0.3292122185230255,
"learning_rate": 0.000465090884610206,
"loss": 3.0937,
"step": 33500
},
{
"epoch": 9.02853143841516,
"grad_norm": 0.38094958662986755,
"learning_rate": 0.00046488891880974817,
"loss": 3.1038,
"step": 33550
},
{
"epoch": 9.041989664082687,
"grad_norm": 0.3638545274734497,
"learning_rate": 0.0004646869530092904,
"loss": 3.1066,
"step": 33600
},
{
"epoch": 9.055447889750216,
"grad_norm": 0.3722701668739319,
"learning_rate": 0.0004644849872088326,
"loss": 3.1154,
"step": 33650
},
{
"epoch": 9.068906115417743,
"grad_norm": 0.39622655510902405,
"learning_rate": 0.0004642830214083748,
"loss": 3.1048,
"step": 33700
},
{
"epoch": 9.082364341085272,
"grad_norm": 0.3642023801803589,
"learning_rate": 0.000464081055607917,
"loss": 3.1062,
"step": 33750
},
{
"epoch": 9.0958225667528,
"grad_norm": 0.36213210225105286,
"learning_rate": 0.0004638790898074592,
"loss": 3.1246,
"step": 33800
},
{
"epoch": 9.109280792420327,
"grad_norm": 0.3247149884700775,
"learning_rate": 0.00046367712400700146,
"loss": 3.118,
"step": 33850
},
{
"epoch": 9.122739018087856,
"grad_norm": 0.37343013286590576,
"learning_rate": 0.00046347515820654365,
"loss": 3.1159,
"step": 33900
},
{
"epoch": 9.136197243755383,
"grad_norm": 0.36302250623703003,
"learning_rate": 0.00046327319240608585,
"loss": 3.1286,
"step": 33950
},
{
"epoch": 9.14965546942291,
"grad_norm": 0.38092878460884094,
"learning_rate": 0.00046307122660562805,
"loss": 3.137,
"step": 34000
},
{
"epoch": 9.14965546942291,
"eval_accuracy": 0.38890011573063055,
"eval_loss": 3.3283283710479736,
"eval_runtime": 53.7606,
"eval_samples_per_second": 335.041,
"eval_steps_per_second": 20.945,
"step": 34000
},
{
"epoch": 9.16311369509044,
"grad_norm": 0.35080328583717346,
"learning_rate": 0.00046286926080517024,
"loss": 3.1347,
"step": 34050
},
{
"epoch": 9.176571920757967,
"grad_norm": 0.34843164682388306,
"learning_rate": 0.0004626672950047125,
"loss": 3.1405,
"step": 34100
},
{
"epoch": 9.190030146425496,
"grad_norm": 0.4157335162162781,
"learning_rate": 0.0004624653292042547,
"loss": 3.136,
"step": 34150
},
{
"epoch": 9.203488372093023,
"grad_norm": 0.3641476333141327,
"learning_rate": 0.00046226336340379694,
"loss": 3.1512,
"step": 34200
},
{
"epoch": 9.21694659776055,
"grad_norm": 0.341251015663147,
"learning_rate": 0.00046206139760333913,
"loss": 3.1524,
"step": 34250
},
{
"epoch": 9.23040482342808,
"grad_norm": 0.35386136174201965,
"learning_rate": 0.0004618594318028814,
"loss": 3.1352,
"step": 34300
},
{
"epoch": 9.243863049095607,
"grad_norm": 0.3733835816383362,
"learning_rate": 0.0004616574660024236,
"loss": 3.1424,
"step": 34350
},
{
"epoch": 9.257321274763136,
"grad_norm": 0.33062437176704407,
"learning_rate": 0.0004614555002019658,
"loss": 3.153,
"step": 34400
},
{
"epoch": 9.270779500430663,
"grad_norm": 0.34706413745880127,
"learning_rate": 0.000461253534401508,
"loss": 3.1459,
"step": 34450
},
{
"epoch": 9.28423772609819,
"grad_norm": 0.35080230236053467,
"learning_rate": 0.00046105156860105017,
"loss": 3.1433,
"step": 34500
},
{
"epoch": 9.29769595176572,
"grad_norm": 0.37283027172088623,
"learning_rate": 0.0004608496028005924,
"loss": 3.145,
"step": 34550
},
{
"epoch": 9.311154177433247,
"grad_norm": 0.37630441784858704,
"learning_rate": 0.0004606476370001346,
"loss": 3.1535,
"step": 34600
},
{
"epoch": 9.324612403100776,
"grad_norm": 0.34792250394821167,
"learning_rate": 0.0004604456711996768,
"loss": 3.1609,
"step": 34650
},
{
"epoch": 9.338070628768303,
"grad_norm": 0.35801804065704346,
"learning_rate": 0.000460243705399219,
"loss": 3.1578,
"step": 34700
},
{
"epoch": 9.35152885443583,
"grad_norm": 0.40342605113983154,
"learning_rate": 0.0004600417395987612,
"loss": 3.1523,
"step": 34750
},
{
"epoch": 9.36498708010336,
"grad_norm": 0.3323036730289459,
"learning_rate": 0.00045983977379830345,
"loss": 3.1518,
"step": 34800
},
{
"epoch": 9.378445305770887,
"grad_norm": 0.3701625466346741,
"learning_rate": 0.00045963780799784565,
"loss": 3.1529,
"step": 34850
},
{
"epoch": 9.391903531438416,
"grad_norm": 0.34849992394447327,
"learning_rate": 0.00045943584219738785,
"loss": 3.159,
"step": 34900
},
{
"epoch": 9.405361757105943,
"grad_norm": 0.3489476144313812,
"learning_rate": 0.00045923387639693004,
"loss": 3.1557,
"step": 34950
},
{
"epoch": 9.41881998277347,
"grad_norm": 0.3345106542110443,
"learning_rate": 0.00045903191059647224,
"loss": 3.1566,
"step": 35000
},
{
"epoch": 9.41881998277347,
"eval_accuracy": 0.38953950247562724,
"eval_loss": 3.321983575820923,
"eval_runtime": 53.7047,
"eval_samples_per_second": 335.389,
"eval_steps_per_second": 20.966,
"step": 35000
},
{
"epoch": 9.432278208441,
"grad_norm": 0.3547540605068207,
"learning_rate": 0.0004588299447960145,
"loss": 3.1598,
"step": 35050
},
{
"epoch": 9.445736434108527,
"grad_norm": 0.35543152689933777,
"learning_rate": 0.00045862797899555674,
"loss": 3.1773,
"step": 35100
},
{
"epoch": 9.459194659776054,
"grad_norm": 0.3571447432041168,
"learning_rate": 0.00045842601319509894,
"loss": 3.1631,
"step": 35150
},
{
"epoch": 9.472652885443583,
"grad_norm": 0.35717859864234924,
"learning_rate": 0.00045822404739464113,
"loss": 3.1635,
"step": 35200
},
{
"epoch": 9.48611111111111,
"grad_norm": 0.36354541778564453,
"learning_rate": 0.0004580220815941834,
"loss": 3.1556,
"step": 35250
},
{
"epoch": 9.49956933677864,
"grad_norm": 0.36084792017936707,
"learning_rate": 0.0004578201157937256,
"loss": 3.1599,
"step": 35300
},
{
"epoch": 9.513027562446167,
"grad_norm": 0.3501897156238556,
"learning_rate": 0.0004576181499932678,
"loss": 3.1769,
"step": 35350
},
{
"epoch": 9.526485788113694,
"grad_norm": 0.3838970959186554,
"learning_rate": 0.00045741618419280997,
"loss": 3.1579,
"step": 35400
},
{
"epoch": 9.539944013781223,
"grad_norm": 0.38200482726097107,
"learning_rate": 0.0004572142183923522,
"loss": 3.1472,
"step": 35450
},
{
"epoch": 9.55340223944875,
"grad_norm": 0.3594954311847687,
"learning_rate": 0.0004570122525918944,
"loss": 3.1716,
"step": 35500
},
{
"epoch": 9.56686046511628,
"grad_norm": 0.3559810519218445,
"learning_rate": 0.0004568102867914366,
"loss": 3.1719,
"step": 35550
},
{
"epoch": 9.580318690783807,
"grad_norm": 0.356738418340683,
"learning_rate": 0.0004566083209909788,
"loss": 3.1838,
"step": 35600
},
{
"epoch": 9.593776916451334,
"grad_norm": 0.3445727229118347,
"learning_rate": 0.000456406355190521,
"loss": 3.1601,
"step": 35650
},
{
"epoch": 9.607235142118864,
"grad_norm": 0.35387566685676575,
"learning_rate": 0.00045620438939006326,
"loss": 3.1663,
"step": 35700
},
{
"epoch": 9.62069336778639,
"grad_norm": 0.3698170483112335,
"learning_rate": 0.00045600242358960545,
"loss": 3.173,
"step": 35750
},
{
"epoch": 9.634151593453918,
"grad_norm": 0.35212114453315735,
"learning_rate": 0.00045580045778914765,
"loss": 3.1671,
"step": 35800
},
{
"epoch": 9.647609819121447,
"grad_norm": 0.39634934067726135,
"learning_rate": 0.00045559849198868984,
"loss": 3.1782,
"step": 35850
},
{
"epoch": 9.661068044788975,
"grad_norm": 0.3627624809741974,
"learning_rate": 0.00045539652618823204,
"loss": 3.1689,
"step": 35900
},
{
"epoch": 9.674526270456504,
"grad_norm": 0.3928319215774536,
"learning_rate": 0.0004551945603877743,
"loss": 3.1757,
"step": 35950
},
{
"epoch": 9.687984496124031,
"grad_norm": 0.37066513299942017,
"learning_rate": 0.00045499259458731654,
"loss": 3.1774,
"step": 36000
},
{
"epoch": 9.687984496124031,
"eval_accuracy": 0.39003808289581243,
"eval_loss": 3.3119335174560547,
"eval_runtime": 53.867,
"eval_samples_per_second": 334.379,
"eval_steps_per_second": 20.903,
"step": 36000
},
{
"epoch": 9.701442721791558,
"grad_norm": 0.36574748158454895,
"learning_rate": 0.00045479062878685874,
"loss": 3.169,
"step": 36050
},
{
"epoch": 9.714900947459087,
"grad_norm": 0.3367103040218353,
"learning_rate": 0.00045458866298640093,
"loss": 3.1758,
"step": 36100
},
{
"epoch": 9.728359173126615,
"grad_norm": 0.3776065707206726,
"learning_rate": 0.0004543866971859432,
"loss": 3.1861,
"step": 36150
},
{
"epoch": 9.741817398794144,
"grad_norm": 0.3644520044326782,
"learning_rate": 0.0004541847313854854,
"loss": 3.1765,
"step": 36200
},
{
"epoch": 9.755275624461671,
"grad_norm": 0.3726387917995453,
"learning_rate": 0.0004539827655850276,
"loss": 3.1772,
"step": 36250
},
{
"epoch": 9.768733850129198,
"grad_norm": 0.3458203375339508,
"learning_rate": 0.00045378079978456977,
"loss": 3.1767,
"step": 36300
},
{
"epoch": 9.782192075796727,
"grad_norm": 0.3517729640007019,
"learning_rate": 0.00045357883398411197,
"loss": 3.1872,
"step": 36350
},
{
"epoch": 9.795650301464255,
"grad_norm": 0.37035584449768066,
"learning_rate": 0.0004533768681836542,
"loss": 3.176,
"step": 36400
},
{
"epoch": 9.809108527131784,
"grad_norm": 0.331911563873291,
"learning_rate": 0.0004531749023831964,
"loss": 3.1869,
"step": 36450
},
{
"epoch": 9.822566752799311,
"grad_norm": 0.378213107585907,
"learning_rate": 0.0004529729365827386,
"loss": 3.1803,
"step": 36500
},
{
"epoch": 9.836024978466838,
"grad_norm": 0.36638858914375305,
"learning_rate": 0.0004527709707822808,
"loss": 3.1897,
"step": 36550
},
{
"epoch": 9.849483204134367,
"grad_norm": 0.35065630078315735,
"learning_rate": 0.000452569004981823,
"loss": 3.1761,
"step": 36600
},
{
"epoch": 9.862941429801895,
"grad_norm": 0.3395127058029175,
"learning_rate": 0.00045236703918136525,
"loss": 3.1697,
"step": 36650
},
{
"epoch": 9.876399655469424,
"grad_norm": 0.3822707235813141,
"learning_rate": 0.00045216507338090745,
"loss": 3.1762,
"step": 36700
},
{
"epoch": 9.889857881136951,
"grad_norm": 0.363520085811615,
"learning_rate": 0.00045196310758044964,
"loss": 3.1838,
"step": 36750
},
{
"epoch": 9.903316106804478,
"grad_norm": 0.3515053391456604,
"learning_rate": 0.00045176114177999184,
"loss": 3.1785,
"step": 36800
},
{
"epoch": 9.916774332472007,
"grad_norm": 0.349691778421402,
"learning_rate": 0.00045155917597953404,
"loss": 3.1721,
"step": 36850
},
{
"epoch": 9.930232558139535,
"grad_norm": 0.35931867361068726,
"learning_rate": 0.00045135721017907634,
"loss": 3.1714,
"step": 36900
},
{
"epoch": 9.943690783807062,
"grad_norm": 0.34536248445510864,
"learning_rate": 0.00045115524437861854,
"loss": 3.1884,
"step": 36950
},
{
"epoch": 9.957149009474591,
"grad_norm": 0.36185145378112793,
"learning_rate": 0.00045095327857816073,
"loss": 3.1915,
"step": 37000
},
{
"epoch": 9.957149009474591,
"eval_accuracy": 0.3906682346580862,
"eval_loss": 3.307512044906616,
"eval_runtime": 53.9755,
"eval_samples_per_second": 333.707,
"eval_steps_per_second": 20.861,
"step": 37000
},
{
"epoch": 9.970607235142118,
"grad_norm": 0.37505844235420227,
"learning_rate": 0.00045075131277770293,
"loss": 3.1809,
"step": 37050
},
{
"epoch": 9.984065460809648,
"grad_norm": 0.38229846954345703,
"learning_rate": 0.0004505493469772452,
"loss": 3.1617,
"step": 37100
},
{
"epoch": 9.997523686477175,
"grad_norm": 0.3569982051849365,
"learning_rate": 0.0004503473811767874,
"loss": 3.181,
"step": 37150
},
{
"epoch": 10.010766580534023,
"grad_norm": 0.3492891490459442,
"learning_rate": 0.00045014541537632957,
"loss": 3.0966,
"step": 37200
},
{
"epoch": 10.02422480620155,
"grad_norm": 0.3511972427368164,
"learning_rate": 0.00044994344957587177,
"loss": 3.0796,
"step": 37250
},
{
"epoch": 10.037683031869078,
"grad_norm": 0.37117940187454224,
"learning_rate": 0.00044974148377541396,
"loss": 3.0834,
"step": 37300
},
{
"epoch": 10.051141257536607,
"grad_norm": 0.35840341448783875,
"learning_rate": 0.0004495395179749562,
"loss": 3.0828,
"step": 37350
},
{
"epoch": 10.064599483204134,
"grad_norm": 0.34897172451019287,
"learning_rate": 0.0004493375521744984,
"loss": 3.0958,
"step": 37400
},
{
"epoch": 10.078057708871663,
"grad_norm": 0.39569246768951416,
"learning_rate": 0.0004491355863740406,
"loss": 3.089,
"step": 37450
},
{
"epoch": 10.09151593453919,
"grad_norm": 0.3689843416213989,
"learning_rate": 0.0004489336205735828,
"loss": 3.0977,
"step": 37500
},
{
"epoch": 10.104974160206718,
"grad_norm": 0.34813550114631653,
"learning_rate": 0.00044873165477312505,
"loss": 3.1025,
"step": 37550
},
{
"epoch": 10.118432385874247,
"grad_norm": 0.3824092447757721,
"learning_rate": 0.00044852968897266725,
"loss": 3.0957,
"step": 37600
},
{
"epoch": 10.131890611541774,
"grad_norm": 0.38753488659858704,
"learning_rate": 0.00044832772317220945,
"loss": 3.0925,
"step": 37650
},
{
"epoch": 10.145348837209303,
"grad_norm": 0.3686502277851105,
"learning_rate": 0.00044812575737175164,
"loss": 3.0996,
"step": 37700
},
{
"epoch": 10.15880706287683,
"grad_norm": 0.37750762701034546,
"learning_rate": 0.00044792379157129384,
"loss": 3.1156,
"step": 37750
},
{
"epoch": 10.172265288544358,
"grad_norm": 0.3830547332763672,
"learning_rate": 0.00044772182577083614,
"loss": 3.1072,
"step": 37800
},
{
"epoch": 10.185723514211887,
"grad_norm": 0.40359804034233093,
"learning_rate": 0.00044751985997037834,
"loss": 3.1251,
"step": 37850
},
{
"epoch": 10.199181739879414,
"grad_norm": 0.35740941762924194,
"learning_rate": 0.00044731789416992053,
"loss": 3.1123,
"step": 37900
},
{
"epoch": 10.212639965546943,
"grad_norm": 0.3627743124961853,
"learning_rate": 0.00044711592836946273,
"loss": 3.1226,
"step": 37950
},
{
"epoch": 10.22609819121447,
"grad_norm": 0.4064336121082306,
"learning_rate": 0.000446913962569005,
"loss": 3.1193,
"step": 38000
},
{
"epoch": 10.22609819121447,
"eval_accuracy": 0.39022843218676134,
"eval_loss": 3.3206238746643066,
"eval_runtime": 53.6924,
"eval_samples_per_second": 335.467,
"eval_steps_per_second": 20.971,
"step": 38000
},
{
"epoch": 10.239556416881998,
"grad_norm": 0.40242040157318115,
"learning_rate": 0.0004467119967685472,
"loss": 3.112,
"step": 38050
},
{
"epoch": 10.253014642549527,
"grad_norm": 0.3557272255420685,
"learning_rate": 0.00044651003096808937,
"loss": 3.1364,
"step": 38100
},
{
"epoch": 10.266472868217054,
"grad_norm": 0.3683841824531555,
"learning_rate": 0.00044630806516763157,
"loss": 3.1146,
"step": 38150
},
{
"epoch": 10.279931093884581,
"grad_norm": 0.33976754546165466,
"learning_rate": 0.00044610609936717376,
"loss": 3.1277,
"step": 38200
},
{
"epoch": 10.29338931955211,
"grad_norm": 0.3842891752719879,
"learning_rate": 0.000445904133566716,
"loss": 3.128,
"step": 38250
},
{
"epoch": 10.306847545219638,
"grad_norm": 0.3514265716075897,
"learning_rate": 0.0004457021677662582,
"loss": 3.1333,
"step": 38300
},
{
"epoch": 10.320305770887167,
"grad_norm": 0.34577810764312744,
"learning_rate": 0.0004455002019658004,
"loss": 3.1148,
"step": 38350
},
{
"epoch": 10.333763996554694,
"grad_norm": 0.37134677171707153,
"learning_rate": 0.0004452982361653426,
"loss": 3.1305,
"step": 38400
},
{
"epoch": 10.347222222222221,
"grad_norm": 0.39002493023872375,
"learning_rate": 0.0004450962703648848,
"loss": 3.1265,
"step": 38450
},
{
"epoch": 10.36068044788975,
"grad_norm": 0.377231240272522,
"learning_rate": 0.00044489430456442705,
"loss": 3.1283,
"step": 38500
},
{
"epoch": 10.374138673557278,
"grad_norm": 0.36426547169685364,
"learning_rate": 0.00044469233876396925,
"loss": 3.1356,
"step": 38550
},
{
"epoch": 10.387596899224807,
"grad_norm": 0.3630325198173523,
"learning_rate": 0.00044449037296351144,
"loss": 3.1403,
"step": 38600
},
{
"epoch": 10.401055124892334,
"grad_norm": 0.3570690453052521,
"learning_rate": 0.0004442884071630537,
"loss": 3.1327,
"step": 38650
},
{
"epoch": 10.414513350559861,
"grad_norm": 0.3494502604007721,
"learning_rate": 0.00044408644136259594,
"loss": 3.1401,
"step": 38700
},
{
"epoch": 10.42797157622739,
"grad_norm": 0.3937465250492096,
"learning_rate": 0.00044388447556213814,
"loss": 3.1323,
"step": 38750
},
{
"epoch": 10.441429801894918,
"grad_norm": 0.3534800410270691,
"learning_rate": 0.00044368250976168033,
"loss": 3.1363,
"step": 38800
},
{
"epoch": 10.454888027562447,
"grad_norm": 0.35517048835754395,
"learning_rate": 0.00044348054396122253,
"loss": 3.1398,
"step": 38850
},
{
"epoch": 10.468346253229974,
"grad_norm": 0.3748292028903961,
"learning_rate": 0.0004432785781607647,
"loss": 3.1397,
"step": 38900
},
{
"epoch": 10.481804478897502,
"grad_norm": 0.3736768066883087,
"learning_rate": 0.000443076612360307,
"loss": 3.1524,
"step": 38950
},
{
"epoch": 10.49526270456503,
"grad_norm": 0.3461846709251404,
"learning_rate": 0.0004428746465598492,
"loss": 3.1495,
"step": 39000
},
{
"epoch": 10.49526270456503,
"eval_accuracy": 0.3906297736712164,
"eval_loss": 3.3105146884918213,
"eval_runtime": 53.6571,
"eval_samples_per_second": 335.687,
"eval_steps_per_second": 20.985,
"step": 39000
},
{
"epoch": 10.508720930232558,
"grad_norm": 0.3518693149089813,
"learning_rate": 0.00044267268075939137,
"loss": 3.1377,
"step": 39050
},
{
"epoch": 10.522179155900087,
"grad_norm": 0.35536989569664,
"learning_rate": 0.00044247071495893357,
"loss": 3.1556,
"step": 39100
},
{
"epoch": 10.535637381567614,
"grad_norm": 0.3642809987068176,
"learning_rate": 0.00044226874915847576,
"loss": 3.151,
"step": 39150
},
{
"epoch": 10.549095607235142,
"grad_norm": 0.3890918791294098,
"learning_rate": 0.000442066783358018,
"loss": 3.1532,
"step": 39200
},
{
"epoch": 10.56255383290267,
"grad_norm": 0.37581443786621094,
"learning_rate": 0.0004418648175575602,
"loss": 3.1472,
"step": 39250
},
{
"epoch": 10.576012058570198,
"grad_norm": 0.3536636233329773,
"learning_rate": 0.0004416628517571024,
"loss": 3.1446,
"step": 39300
},
{
"epoch": 10.589470284237725,
"grad_norm": 0.3826201856136322,
"learning_rate": 0.0004414608859566446,
"loss": 3.1564,
"step": 39350
},
{
"epoch": 10.602928509905254,
"grad_norm": 0.40004265308380127,
"learning_rate": 0.00044125892015618685,
"loss": 3.1413,
"step": 39400
},
{
"epoch": 10.616386735572782,
"grad_norm": 0.3891655504703522,
"learning_rate": 0.00044105695435572905,
"loss": 3.1628,
"step": 39450
},
{
"epoch": 10.62984496124031,
"grad_norm": 0.3744836747646332,
"learning_rate": 0.00044085498855527124,
"loss": 3.1475,
"step": 39500
},
{
"epoch": 10.643303186907838,
"grad_norm": 0.3514010012149811,
"learning_rate": 0.0004406530227548135,
"loss": 3.161,
"step": 39550
},
{
"epoch": 10.656761412575365,
"grad_norm": 0.35587388277053833,
"learning_rate": 0.00044045105695435574,
"loss": 3.1597,
"step": 39600
},
{
"epoch": 10.670219638242894,
"grad_norm": 0.36579716205596924,
"learning_rate": 0.00044024909115389794,
"loss": 3.154,
"step": 39650
},
{
"epoch": 10.683677863910422,
"grad_norm": 0.3504290282726288,
"learning_rate": 0.00044004712535344014,
"loss": 3.1476,
"step": 39700
},
{
"epoch": 10.69713608957795,
"grad_norm": 0.4184782803058624,
"learning_rate": 0.00043984515955298233,
"loss": 3.1578,
"step": 39750
},
{
"epoch": 10.710594315245478,
"grad_norm": 0.40172553062438965,
"learning_rate": 0.00043964319375252453,
"loss": 3.1515,
"step": 39800
},
{
"epoch": 10.724052540913005,
"grad_norm": 0.35225343704223633,
"learning_rate": 0.0004394412279520668,
"loss": 3.1479,
"step": 39850
},
{
"epoch": 10.737510766580534,
"grad_norm": 0.37683871388435364,
"learning_rate": 0.000439239262151609,
"loss": 3.1586,
"step": 39900
},
{
"epoch": 10.750968992248062,
"grad_norm": 0.36777010560035706,
"learning_rate": 0.00043903729635115117,
"loss": 3.1583,
"step": 39950
},
{
"epoch": 10.764427217915589,
"grad_norm": 0.36609140038490295,
"learning_rate": 0.00043883533055069337,
"loss": 3.1636,
"step": 40000
},
{
"epoch": 10.764427217915589,
"eval_accuracy": 0.3912790472800694,
"eval_loss": 3.3053524494171143,
"eval_runtime": 53.8099,
"eval_samples_per_second": 334.734,
"eval_steps_per_second": 20.926,
"step": 40000
},
{
"epoch": 10.777885443583118,
"grad_norm": 0.3523204028606415,
"learning_rate": 0.00043863336475023556,
"loss": 3.1587,
"step": 40050
},
{
"epoch": 10.791343669250645,
"grad_norm": 0.3931622803211212,
"learning_rate": 0.0004384313989497778,
"loss": 3.157,
"step": 40100
},
{
"epoch": 10.804801894918175,
"grad_norm": 0.3558056354522705,
"learning_rate": 0.00043822943314932,
"loss": 3.1674,
"step": 40150
},
{
"epoch": 10.818260120585702,
"grad_norm": 0.3576938807964325,
"learning_rate": 0.0004380274673488622,
"loss": 3.156,
"step": 40200
},
{
"epoch": 10.83171834625323,
"grad_norm": 0.3403548002243042,
"learning_rate": 0.0004378255015484044,
"loss": 3.1594,
"step": 40250
},
{
"epoch": 10.845176571920758,
"grad_norm": 0.3665982186794281,
"learning_rate": 0.0004376235357479466,
"loss": 3.1629,
"step": 40300
},
{
"epoch": 10.858634797588286,
"grad_norm": 0.3475422263145447,
"learning_rate": 0.00043742156994748885,
"loss": 3.1553,
"step": 40350
},
{
"epoch": 10.872093023255815,
"grad_norm": 0.33809924125671387,
"learning_rate": 0.00043721960414703104,
"loss": 3.1654,
"step": 40400
},
{
"epoch": 10.885551248923342,
"grad_norm": 0.3743918836116791,
"learning_rate": 0.0004370176383465733,
"loss": 3.1581,
"step": 40450
},
{
"epoch": 10.89900947459087,
"grad_norm": 0.36682623624801636,
"learning_rate": 0.0004368156725461155,
"loss": 3.1621,
"step": 40500
},
{
"epoch": 10.912467700258398,
"grad_norm": 0.3797398507595062,
"learning_rate": 0.00043661370674565774,
"loss": 3.1628,
"step": 40550
},
{
"epoch": 10.925925925925926,
"grad_norm": 0.3800196051597595,
"learning_rate": 0.00043641174094519994,
"loss": 3.1664,
"step": 40600
},
{
"epoch": 10.939384151593455,
"grad_norm": 0.3516307473182678,
"learning_rate": 0.00043620977514474213,
"loss": 3.1608,
"step": 40650
},
{
"epoch": 10.952842377260982,
"grad_norm": 0.37099623680114746,
"learning_rate": 0.00043600780934428433,
"loss": 3.1647,
"step": 40700
},
{
"epoch": 10.96630060292851,
"grad_norm": 0.38898375630378723,
"learning_rate": 0.0004358058435438265,
"loss": 3.1516,
"step": 40750
},
{
"epoch": 10.979758828596038,
"grad_norm": 0.3787337839603424,
"learning_rate": 0.0004356038777433688,
"loss": 3.1438,
"step": 40800
},
{
"epoch": 10.993217054263566,
"grad_norm": 0.3420003354549408,
"learning_rate": 0.00043540191194291097,
"loss": 3.1638,
"step": 40850
},
{
"epoch": 11.006459948320414,
"grad_norm": 0.373717337846756,
"learning_rate": 0.00043519994614245317,
"loss": 3.0953,
"step": 40900
},
{
"epoch": 11.019918173987941,
"grad_norm": 0.36699631810188293,
"learning_rate": 0.00043499798034199536,
"loss": 3.0586,
"step": 40950
},
{
"epoch": 11.03337639965547,
"grad_norm": 0.351595401763916,
"learning_rate": 0.00043479601454153756,
"loss": 3.0765,
"step": 41000
},
{
"epoch": 11.03337639965547,
"eval_accuracy": 0.3908649941135134,
"eval_loss": 3.3149876594543457,
"eval_runtime": 53.8322,
"eval_samples_per_second": 334.596,
"eval_steps_per_second": 20.917,
"step": 41000
},
{
"epoch": 11.046834625322997,
"grad_norm": 0.36418431997299194,
"learning_rate": 0.0004345940487410798,
"loss": 3.0751,
"step": 41050
},
{
"epoch": 11.060292850990525,
"grad_norm": 0.3695267140865326,
"learning_rate": 0.000434392082940622,
"loss": 3.0719,
"step": 41100
},
{
"epoch": 11.073751076658054,
"grad_norm": 0.3487664461135864,
"learning_rate": 0.0004341901171401642,
"loss": 3.0753,
"step": 41150
},
{
"epoch": 11.087209302325581,
"grad_norm": 0.3637808859348297,
"learning_rate": 0.0004339881513397064,
"loss": 3.0803,
"step": 41200
},
{
"epoch": 11.10066752799311,
"grad_norm": 0.3343772888183594,
"learning_rate": 0.0004337861855392486,
"loss": 3.0836,
"step": 41250
},
{
"epoch": 11.114125753660637,
"grad_norm": 0.3723909854888916,
"learning_rate": 0.00043358421973879084,
"loss": 3.0935,
"step": 41300
},
{
"epoch": 11.127583979328165,
"grad_norm": 0.34839126467704773,
"learning_rate": 0.0004333822539383331,
"loss": 3.0799,
"step": 41350
},
{
"epoch": 11.141042204995694,
"grad_norm": 0.40357962250709534,
"learning_rate": 0.0004331802881378753,
"loss": 3.0947,
"step": 41400
},
{
"epoch": 11.154500430663221,
"grad_norm": 0.3761771619319916,
"learning_rate": 0.00043297832233741754,
"loss": 3.0852,
"step": 41450
},
{
"epoch": 11.16795865633075,
"grad_norm": 0.3732476532459259,
"learning_rate": 0.00043277635653695974,
"loss": 3.1048,
"step": 41500
},
{
"epoch": 11.181416881998278,
"grad_norm": 0.3786817789077759,
"learning_rate": 0.00043257439073650193,
"loss": 3.098,
"step": 41550
},
{
"epoch": 11.194875107665805,
"grad_norm": 0.359264999628067,
"learning_rate": 0.00043237242493604413,
"loss": 3.0977,
"step": 41600
},
{
"epoch": 11.208333333333334,
"grad_norm": 0.38861802220344543,
"learning_rate": 0.0004321704591355863,
"loss": 3.1024,
"step": 41650
},
{
"epoch": 11.221791559000861,
"grad_norm": 0.3696124255657196,
"learning_rate": 0.0004319684933351286,
"loss": 3.1105,
"step": 41700
},
{
"epoch": 11.235249784668389,
"grad_norm": 0.3674962520599365,
"learning_rate": 0.00043176652753467077,
"loss": 3.1026,
"step": 41750
},
{
"epoch": 11.248708010335918,
"grad_norm": 0.3563523292541504,
"learning_rate": 0.00043156456173421297,
"loss": 3.0967,
"step": 41800
},
{
"epoch": 11.262166236003445,
"grad_norm": 0.36378583312034607,
"learning_rate": 0.00043136259593375516,
"loss": 3.1061,
"step": 41850
},
{
"epoch": 11.275624461670974,
"grad_norm": 0.3854440152645111,
"learning_rate": 0.00043116063013329736,
"loss": 3.0973,
"step": 41900
},
{
"epoch": 11.289082687338501,
"grad_norm": 0.37868574261665344,
"learning_rate": 0.0004309586643328396,
"loss": 3.1045,
"step": 41950
},
{
"epoch": 11.302540913006029,
"grad_norm": 0.37912723422050476,
"learning_rate": 0.0004307566985323818,
"loss": 3.1059,
"step": 42000
},
{
"epoch": 11.302540913006029,
"eval_accuracy": 0.3909425679683864,
"eval_loss": 3.3138086795806885,
"eval_runtime": 54.0416,
"eval_samples_per_second": 333.299,
"eval_steps_per_second": 20.836,
"step": 42000
},
{
"epoch": 11.315999138673558,
"grad_norm": 0.37423864006996155,
"learning_rate": 0.000430554732731924,
"loss": 3.1092,
"step": 42050
},
{
"epoch": 11.329457364341085,
"grad_norm": 0.3635264039039612,
"learning_rate": 0.0004303527669314662,
"loss": 3.1067,
"step": 42100
},
{
"epoch": 11.342915590008614,
"grad_norm": 0.37760043144226074,
"learning_rate": 0.0004301508011310084,
"loss": 3.1138,
"step": 42150
},
{
"epoch": 11.356373815676141,
"grad_norm": 0.35156896710395813,
"learning_rate": 0.00042994883533055065,
"loss": 3.1106,
"step": 42200
},
{
"epoch": 11.369832041343669,
"grad_norm": 0.366276353597641,
"learning_rate": 0.0004297468695300929,
"loss": 3.1133,
"step": 42250
},
{
"epoch": 11.383290267011198,
"grad_norm": 0.35794273018836975,
"learning_rate": 0.0004295449037296351,
"loss": 3.122,
"step": 42300
},
{
"epoch": 11.396748492678725,
"grad_norm": 0.3940405547618866,
"learning_rate": 0.0004293429379291773,
"loss": 3.1278,
"step": 42350
},
{
"epoch": 11.410206718346252,
"grad_norm": 0.369793176651001,
"learning_rate": 0.00042914097212871954,
"loss": 3.111,
"step": 42400
},
{
"epoch": 11.423664944013781,
"grad_norm": 0.40873241424560547,
"learning_rate": 0.00042893900632826173,
"loss": 3.1187,
"step": 42450
},
{
"epoch": 11.437123169681309,
"grad_norm": 0.3555212616920471,
"learning_rate": 0.00042873704052780393,
"loss": 3.1219,
"step": 42500
},
{
"epoch": 11.450581395348838,
"grad_norm": 0.3871249556541443,
"learning_rate": 0.0004285350747273461,
"loss": 3.1233,
"step": 42550
},
{
"epoch": 11.464039621016365,
"grad_norm": 0.36317795515060425,
"learning_rate": 0.0004283331089268883,
"loss": 3.1194,
"step": 42600
},
{
"epoch": 11.477497846683892,
"grad_norm": 0.36784833669662476,
"learning_rate": 0.00042813114312643057,
"loss": 3.1254,
"step": 42650
},
{
"epoch": 11.490956072351421,
"grad_norm": 0.38175830245018005,
"learning_rate": 0.00042792917732597277,
"loss": 3.1371,
"step": 42700
},
{
"epoch": 11.504414298018949,
"grad_norm": 0.3648242652416229,
"learning_rate": 0.00042772721152551496,
"loss": 3.1285,
"step": 42750
},
{
"epoch": 11.517872523686478,
"grad_norm": 0.37506625056266785,
"learning_rate": 0.00042752524572505716,
"loss": 3.1295,
"step": 42800
},
{
"epoch": 11.531330749354005,
"grad_norm": 0.37387633323669434,
"learning_rate": 0.00042732327992459936,
"loss": 3.1248,
"step": 42850
},
{
"epoch": 11.544788975021532,
"grad_norm": 0.3871805667877197,
"learning_rate": 0.0004271213141241416,
"loss": 3.1234,
"step": 42900
},
{
"epoch": 11.558247200689062,
"grad_norm": 0.35133877396583557,
"learning_rate": 0.0004269193483236838,
"loss": 3.1269,
"step": 42950
},
{
"epoch": 11.571705426356589,
"grad_norm": 0.36315521597862244,
"learning_rate": 0.000426717382523226,
"loss": 3.1231,
"step": 43000
},
{
"epoch": 11.571705426356589,
"eval_accuracy": 0.39158260659451644,
"eval_loss": 3.3041725158691406,
"eval_runtime": 53.8797,
"eval_samples_per_second": 334.3,
"eval_steps_per_second": 20.898,
"step": 43000
},
{
"epoch": 11.585163652024118,
"grad_norm": 0.367910772562027,
"learning_rate": 0.0004265154167227682,
"loss": 3.123,
"step": 43050
},
{
"epoch": 11.598621877691645,
"grad_norm": 0.38121262192726135,
"learning_rate": 0.0004263134509223105,
"loss": 3.1253,
"step": 43100
},
{
"epoch": 11.612080103359173,
"grad_norm": 0.7838655114173889,
"learning_rate": 0.0004261114851218527,
"loss": 3.1172,
"step": 43150
},
{
"epoch": 11.625538329026702,
"grad_norm": 0.3997848629951477,
"learning_rate": 0.0004259095193213949,
"loss": 3.1365,
"step": 43200
},
{
"epoch": 11.638996554694229,
"grad_norm": 0.34914714097976685,
"learning_rate": 0.0004257075535209371,
"loss": 3.1371,
"step": 43250
},
{
"epoch": 11.652454780361758,
"grad_norm": 0.3584500849246979,
"learning_rate": 0.00042550558772047934,
"loss": 3.1419,
"step": 43300
},
{
"epoch": 11.665913006029285,
"grad_norm": 0.3795340359210968,
"learning_rate": 0.00042530362192002153,
"loss": 3.1181,
"step": 43350
},
{
"epoch": 11.679371231696813,
"grad_norm": 0.3768688142299652,
"learning_rate": 0.00042510165611956373,
"loss": 3.1343,
"step": 43400
},
{
"epoch": 11.692829457364342,
"grad_norm": 0.37140101194381714,
"learning_rate": 0.0004248996903191059,
"loss": 3.1356,
"step": 43450
},
{
"epoch": 11.706287683031869,
"grad_norm": 0.34071606397628784,
"learning_rate": 0.0004246977245186481,
"loss": 3.1279,
"step": 43500
},
{
"epoch": 11.719745908699396,
"grad_norm": 0.39517703652381897,
"learning_rate": 0.0004244957587181904,
"loss": 3.1408,
"step": 43550
},
{
"epoch": 11.733204134366925,
"grad_norm": 0.3629872500896454,
"learning_rate": 0.00042429379291773257,
"loss": 3.1351,
"step": 43600
},
{
"epoch": 11.746662360034453,
"grad_norm": 0.4008491635322571,
"learning_rate": 0.00042409182711727477,
"loss": 3.137,
"step": 43650
},
{
"epoch": 11.760120585701982,
"grad_norm": 0.3763497471809387,
"learning_rate": 0.00042388986131681696,
"loss": 3.1371,
"step": 43700
},
{
"epoch": 11.773578811369509,
"grad_norm": 0.37213990092277527,
"learning_rate": 0.00042368789551635916,
"loss": 3.1363,
"step": 43750
},
{
"epoch": 11.787037037037036,
"grad_norm": 0.3561500906944275,
"learning_rate": 0.0004234859297159014,
"loss": 3.141,
"step": 43800
},
{
"epoch": 11.800495262704565,
"grad_norm": 0.3931141793727875,
"learning_rate": 0.0004232839639154436,
"loss": 3.1416,
"step": 43850
},
{
"epoch": 11.813953488372093,
"grad_norm": 0.35057225823402405,
"learning_rate": 0.0004230819981149858,
"loss": 3.1379,
"step": 43900
},
{
"epoch": 11.827411714039622,
"grad_norm": 0.35676315426826477,
"learning_rate": 0.000422880032314528,
"loss": 3.1338,
"step": 43950
},
{
"epoch": 11.840869939707149,
"grad_norm": 0.35738396644592285,
"learning_rate": 0.0004226780665140703,
"loss": 3.1332,
"step": 44000
},
{
"epoch": 11.840869939707149,
"eval_accuracy": 0.39190430993384273,
"eval_loss": 3.3014116287231445,
"eval_runtime": 53.7607,
"eval_samples_per_second": 335.041,
"eval_steps_per_second": 20.945,
"step": 44000
},
{
"epoch": 11.854328165374676,
"grad_norm": 0.3747558891773224,
"learning_rate": 0.0004224761007136125,
"loss": 3.1428,
"step": 44050
},
{
"epoch": 11.867786391042205,
"grad_norm": 0.3758200407028198,
"learning_rate": 0.0004222741349131547,
"loss": 3.1328,
"step": 44100
},
{
"epoch": 11.881244616709733,
"grad_norm": 0.3885456621646881,
"learning_rate": 0.0004220721691126969,
"loss": 3.1333,
"step": 44150
},
{
"epoch": 11.89470284237726,
"grad_norm": 0.35970941185951233,
"learning_rate": 0.0004218702033122391,
"loss": 3.1387,
"step": 44200
},
{
"epoch": 11.90816106804479,
"grad_norm": 0.3670229911804199,
"learning_rate": 0.00042166823751178134,
"loss": 3.139,
"step": 44250
},
{
"epoch": 11.921619293712316,
"grad_norm": 0.3680804967880249,
"learning_rate": 0.00042146627171132353,
"loss": 3.1381,
"step": 44300
},
{
"epoch": 11.935077519379846,
"grad_norm": 0.367384135723114,
"learning_rate": 0.00042126430591086573,
"loss": 3.1405,
"step": 44350
},
{
"epoch": 11.948535745047373,
"grad_norm": 0.3529140055179596,
"learning_rate": 0.0004210623401104079,
"loss": 3.1475,
"step": 44400
},
{
"epoch": 11.9619939707149,
"grad_norm": 0.34686964750289917,
"learning_rate": 0.0004208603743099501,
"loss": 3.1463,
"step": 44450
},
{
"epoch": 11.97545219638243,
"grad_norm": 0.3600054085254669,
"learning_rate": 0.00042065840850949237,
"loss": 3.149,
"step": 44500
},
{
"epoch": 11.988910422049956,
"grad_norm": 0.37668830156326294,
"learning_rate": 0.00042045644270903457,
"loss": 3.1494,
"step": 44550
},
{
"epoch": 12.002153316106805,
"grad_norm": 0.39227724075317383,
"learning_rate": 0.00042025447690857676,
"loss": 3.12,
"step": 44600
},
{
"epoch": 12.015611541774332,
"grad_norm": 0.3704946041107178,
"learning_rate": 0.00042005251110811896,
"loss": 3.0452,
"step": 44650
},
{
"epoch": 12.029069767441861,
"grad_norm": 0.3810844421386719,
"learning_rate": 0.00041985054530766115,
"loss": 3.0469,
"step": 44700
},
{
"epoch": 12.042527993109388,
"grad_norm": 0.4110226035118103,
"learning_rate": 0.0004196485795072034,
"loss": 3.053,
"step": 44750
},
{
"epoch": 12.055986218776916,
"grad_norm": 0.38302749395370483,
"learning_rate": 0.0004194466137067456,
"loss": 3.0359,
"step": 44800
},
{
"epoch": 12.069444444444445,
"grad_norm": 0.38793662190437317,
"learning_rate": 0.0004192446479062878,
"loss": 3.0611,
"step": 44850
},
{
"epoch": 12.082902670111972,
"grad_norm": 0.3869202136993408,
"learning_rate": 0.00041904268210583005,
"loss": 3.0617,
"step": 44900
},
{
"epoch": 12.096360895779501,
"grad_norm": 0.40806224942207336,
"learning_rate": 0.0004188407163053723,
"loss": 3.0621,
"step": 44950
},
{
"epoch": 12.109819121447028,
"grad_norm": 0.3624691367149353,
"learning_rate": 0.0004186387505049145,
"loss": 3.0655,
"step": 45000
},
{
"epoch": 12.109819121447028,
"eval_accuracy": 0.3916841914044692,
"eval_loss": 3.3107516765594482,
"eval_runtime": 53.6121,
"eval_samples_per_second": 335.969,
"eval_steps_per_second": 21.003,
"step": 45000
},
{
"epoch": 12.123277347114556,
"grad_norm": 0.38657304644584656,
"learning_rate": 0.0004184367847044567,
"loss": 3.0653,
"step": 45050
},
{
"epoch": 12.136735572782085,
"grad_norm": 0.42197513580322266,
"learning_rate": 0.0004182348189039989,
"loss": 3.0755,
"step": 45100
},
{
"epoch": 12.150193798449612,
"grad_norm": 0.41983577609062195,
"learning_rate": 0.00041803285310354114,
"loss": 3.0759,
"step": 45150
},
{
"epoch": 12.163652024117141,
"grad_norm": 0.352384090423584,
"learning_rate": 0.00041783088730308333,
"loss": 3.0761,
"step": 45200
},
{
"epoch": 12.177110249784668,
"grad_norm": 0.42857182025909424,
"learning_rate": 0.00041762892150262553,
"loss": 3.0656,
"step": 45250
},
{
"epoch": 12.190568475452196,
"grad_norm": 0.35385847091674805,
"learning_rate": 0.0004174269557021677,
"loss": 3.0833,
"step": 45300
},
{
"epoch": 12.204026701119725,
"grad_norm": 0.40004876255989075,
"learning_rate": 0.0004172249899017099,
"loss": 3.0804,
"step": 45350
},
{
"epoch": 12.217484926787252,
"grad_norm": 0.36738428473472595,
"learning_rate": 0.00041702302410125217,
"loss": 3.0868,
"step": 45400
},
{
"epoch": 12.230943152454781,
"grad_norm": 0.3519749939441681,
"learning_rate": 0.00041682105830079437,
"loss": 3.0793,
"step": 45450
},
{
"epoch": 12.244401378122308,
"grad_norm": 0.3788878917694092,
"learning_rate": 0.00041661909250033656,
"loss": 3.0942,
"step": 45500
},
{
"epoch": 12.257859603789836,
"grad_norm": 0.3716530501842499,
"learning_rate": 0.00041641712669987876,
"loss": 3.0917,
"step": 45550
},
{
"epoch": 12.271317829457365,
"grad_norm": 0.3835557699203491,
"learning_rate": 0.00041621516089942096,
"loss": 3.0832,
"step": 45600
},
{
"epoch": 12.284776055124892,
"grad_norm": 0.42075875401496887,
"learning_rate": 0.0004160131950989632,
"loss": 3.0948,
"step": 45650
},
{
"epoch": 12.298234280792421,
"grad_norm": 0.372883677482605,
"learning_rate": 0.0004158112292985054,
"loss": 3.0969,
"step": 45700
},
{
"epoch": 12.311692506459949,
"grad_norm": 0.35647454857826233,
"learning_rate": 0.0004156092634980476,
"loss": 3.0887,
"step": 45750
},
{
"epoch": 12.325150732127476,
"grad_norm": 0.3993690013885498,
"learning_rate": 0.00041540729769758985,
"loss": 3.0938,
"step": 45800
},
{
"epoch": 12.338608957795005,
"grad_norm": 0.36539211869239807,
"learning_rate": 0.0004152053318971321,
"loss": 3.1063,
"step": 45850
},
{
"epoch": 12.352067183462532,
"grad_norm": 0.38160133361816406,
"learning_rate": 0.0004150033660966743,
"loss": 3.0854,
"step": 45900
},
{
"epoch": 12.36552540913006,
"grad_norm": 0.34852227568626404,
"learning_rate": 0.0004148014002962165,
"loss": 3.0945,
"step": 45950
},
{
"epoch": 12.378983634797589,
"grad_norm": 0.38215118646621704,
"learning_rate": 0.0004145994344957587,
"loss": 3.0919,
"step": 46000
},
{
"epoch": 12.378983634797589,
"eval_accuracy": 0.3920929208751026,
"eval_loss": 3.30391001701355,
"eval_runtime": 53.9181,
"eval_samples_per_second": 334.063,
"eval_steps_per_second": 20.884,
"step": 46000
},
{
"epoch": 12.392441860465116,
"grad_norm": 0.40114298462867737,
"learning_rate": 0.0004143974686953009,
"loss": 3.1014,
"step": 46050
},
{
"epoch": 12.405900086132645,
"grad_norm": 0.40130308270454407,
"learning_rate": 0.00041419550289484313,
"loss": 3.1058,
"step": 46100
},
{
"epoch": 12.419358311800172,
"grad_norm": 0.37803414463996887,
"learning_rate": 0.00041399353709438533,
"loss": 3.0961,
"step": 46150
},
{
"epoch": 12.4328165374677,
"grad_norm": 0.4074687957763672,
"learning_rate": 0.0004137915712939275,
"loss": 3.1013,
"step": 46200
},
{
"epoch": 12.446274763135229,
"grad_norm": 0.35416853427886963,
"learning_rate": 0.0004135896054934697,
"loss": 3.107,
"step": 46250
},
{
"epoch": 12.459732988802756,
"grad_norm": 0.394949734210968,
"learning_rate": 0.0004133876396930119,
"loss": 3.1127,
"step": 46300
},
{
"epoch": 12.473191214470285,
"grad_norm": 0.3806135654449463,
"learning_rate": 0.00041318567389255417,
"loss": 3.1096,
"step": 46350
},
{
"epoch": 12.486649440137812,
"grad_norm": 0.3582363724708557,
"learning_rate": 0.00041298370809209636,
"loss": 3.0983,
"step": 46400
},
{
"epoch": 12.50010766580534,
"grad_norm": 0.36571231484413147,
"learning_rate": 0.00041278174229163856,
"loss": 3.0931,
"step": 46450
},
{
"epoch": 12.513565891472869,
"grad_norm": 0.37472379207611084,
"learning_rate": 0.00041257977649118076,
"loss": 3.1149,
"step": 46500
},
{
"epoch": 12.527024117140396,
"grad_norm": 0.3692905306816101,
"learning_rate": 0.00041237781069072295,
"loss": 3.1073,
"step": 46550
},
{
"epoch": 12.540482342807923,
"grad_norm": 0.4097956418991089,
"learning_rate": 0.0004121758448902652,
"loss": 3.1159,
"step": 46600
},
{
"epoch": 12.553940568475452,
"grad_norm": 0.38633161783218384,
"learning_rate": 0.0004119738790898074,
"loss": 3.1091,
"step": 46650
},
{
"epoch": 12.56739879414298,
"grad_norm": 0.3908534049987793,
"learning_rate": 0.00041177191328934965,
"loss": 3.1229,
"step": 46700
},
{
"epoch": 12.580857019810509,
"grad_norm": 0.4074409306049347,
"learning_rate": 0.00041156994748889185,
"loss": 3.1045,
"step": 46750
},
{
"epoch": 12.594315245478036,
"grad_norm": 0.3800044655799866,
"learning_rate": 0.0004113679816884341,
"loss": 3.1112,
"step": 46800
},
{
"epoch": 12.607773471145563,
"grad_norm": 0.34563085436820984,
"learning_rate": 0.0004111660158879763,
"loss": 3.1177,
"step": 46850
},
{
"epoch": 12.621231696813092,
"grad_norm": 0.36784085631370544,
"learning_rate": 0.0004109640500875185,
"loss": 3.1148,
"step": 46900
},
{
"epoch": 12.63468992248062,
"grad_norm": 0.39189931750297546,
"learning_rate": 0.0004107620842870607,
"loss": 3.1105,
"step": 46950
},
{
"epoch": 12.648148148148149,
"grad_norm": 0.36800920963287354,
"learning_rate": 0.0004105601184866029,
"loss": 3.1198,
"step": 47000
},
{
"epoch": 12.648148148148149,
"eval_accuracy": 0.3922400287175369,
"eval_loss": 3.3033642768859863,
"eval_runtime": 53.9166,
"eval_samples_per_second": 334.071,
"eval_steps_per_second": 20.884,
"step": 47000
},
{
"epoch": 12.661606373815676,
"grad_norm": 0.3976839780807495,
"learning_rate": 0.00041035815268614513,
"loss": 3.1156,
"step": 47050
},
{
"epoch": 12.675064599483203,
"grad_norm": 0.39268913865089417,
"learning_rate": 0.0004101561868856873,
"loss": 3.1203,
"step": 47100
},
{
"epoch": 12.688522825150732,
"grad_norm": 0.36905089020729065,
"learning_rate": 0.0004099542210852295,
"loss": 3.1219,
"step": 47150
},
{
"epoch": 12.70198105081826,
"grad_norm": 0.3765939772129059,
"learning_rate": 0.0004097522552847717,
"loss": 3.1171,
"step": 47200
},
{
"epoch": 12.715439276485789,
"grad_norm": 0.37911146879196167,
"learning_rate": 0.00040955028948431397,
"loss": 3.1116,
"step": 47250
},
{
"epoch": 12.728897502153316,
"grad_norm": 0.39323848485946655,
"learning_rate": 0.00040934832368385616,
"loss": 3.1152,
"step": 47300
},
{
"epoch": 12.742355727820843,
"grad_norm": 0.3719523251056671,
"learning_rate": 0.00040914635788339836,
"loss": 3.107,
"step": 47350
},
{
"epoch": 12.755813953488373,
"grad_norm": 0.358672559261322,
"learning_rate": 0.00040894439208294056,
"loss": 3.1216,
"step": 47400
},
{
"epoch": 12.7692721791559,
"grad_norm": 0.3891298174858093,
"learning_rate": 0.00040874242628248275,
"loss": 3.1341,
"step": 47450
},
{
"epoch": 12.782730404823429,
"grad_norm": 0.37830036878585815,
"learning_rate": 0.000408540460482025,
"loss": 3.1233,
"step": 47500
},
{
"epoch": 12.796188630490956,
"grad_norm": 0.3913838565349579,
"learning_rate": 0.00040833849468156725,
"loss": 3.1199,
"step": 47550
},
{
"epoch": 12.809646856158484,
"grad_norm": 0.3707485795021057,
"learning_rate": 0.00040813652888110945,
"loss": 3.118,
"step": 47600
},
{
"epoch": 12.823105081826013,
"grad_norm": 0.42476046085357666,
"learning_rate": 0.00040793456308065165,
"loss": 3.1103,
"step": 47650
},
{
"epoch": 12.83656330749354,
"grad_norm": 0.41528019309043884,
"learning_rate": 0.0004077325972801939,
"loss": 3.1213,
"step": 47700
},
{
"epoch": 12.850021533161069,
"grad_norm": 0.3821558654308319,
"learning_rate": 0.0004075306314797361,
"loss": 3.1253,
"step": 47750
},
{
"epoch": 12.863479758828596,
"grad_norm": 0.3485643267631531,
"learning_rate": 0.0004073286656792783,
"loss": 3.1242,
"step": 47800
},
{
"epoch": 12.876937984496124,
"grad_norm": 0.3739717900753021,
"learning_rate": 0.0004071266998788205,
"loss": 3.1311,
"step": 47850
},
{
"epoch": 12.890396210163653,
"grad_norm": 0.36342811584472656,
"learning_rate": 0.0004069247340783627,
"loss": 3.1201,
"step": 47900
},
{
"epoch": 12.90385443583118,
"grad_norm": 0.3751324713230133,
"learning_rate": 0.00040672276827790493,
"loss": 3.127,
"step": 47950
},
{
"epoch": 12.917312661498707,
"grad_norm": 0.3644934594631195,
"learning_rate": 0.0004065208024774471,
"loss": 3.1283,
"step": 48000
},
{
"epoch": 12.917312661498707,
"eval_accuracy": 0.39249176348187964,
"eval_loss": 3.295912504196167,
"eval_runtime": 54.0321,
"eval_samples_per_second": 333.358,
"eval_steps_per_second": 20.839,
"step": 48000
},
{
"epoch": 12.930770887166236,
"grad_norm": 0.3778104782104492,
"learning_rate": 0.0004063188366769893,
"loss": 3.1223,
"step": 48050
},
{
"epoch": 12.944229112833764,
"grad_norm": 0.3311636447906494,
"learning_rate": 0.0004061168708765315,
"loss": 3.1324,
"step": 48100
},
{
"epoch": 12.957687338501293,
"grad_norm": 0.36411207914352417,
"learning_rate": 0.0004059149050760737,
"loss": 3.1244,
"step": 48150
},
{
"epoch": 12.97114556416882,
"grad_norm": 0.3728819191455841,
"learning_rate": 0.00040571293927561597,
"loss": 3.13,
"step": 48200
},
{
"epoch": 12.984603789836347,
"grad_norm": 0.3733992278575897,
"learning_rate": 0.00040551097347515816,
"loss": 3.1203,
"step": 48250
},
{
"epoch": 12.998062015503876,
"grad_norm": 0.34325742721557617,
"learning_rate": 0.00040530900767470036,
"loss": 3.1313,
"step": 48300
},
{
"epoch": 13.011304909560723,
"grad_norm": 0.36356621980667114,
"learning_rate": 0.00040510704187424255,
"loss": 3.043,
"step": 48350
},
{
"epoch": 13.024763135228252,
"grad_norm": 0.37825655937194824,
"learning_rate": 0.00040490507607378475,
"loss": 3.0414,
"step": 48400
},
{
"epoch": 13.038221360895779,
"grad_norm": 0.4046306908130646,
"learning_rate": 0.00040470311027332705,
"loss": 3.0372,
"step": 48450
},
{
"epoch": 13.051679586563308,
"grad_norm": 0.36423417925834656,
"learning_rate": 0.00040450114447286925,
"loss": 3.0349,
"step": 48500
},
{
"epoch": 13.065137812230835,
"grad_norm": 0.3580648899078369,
"learning_rate": 0.00040429917867241145,
"loss": 3.036,
"step": 48550
},
{
"epoch": 13.078596037898363,
"grad_norm": 0.3821873664855957,
"learning_rate": 0.00040409721287195364,
"loss": 3.0446,
"step": 48600
},
{
"epoch": 13.092054263565892,
"grad_norm": 0.38778361678123474,
"learning_rate": 0.0004038952470714959,
"loss": 3.0454,
"step": 48650
},
{
"epoch": 13.10551248923342,
"grad_norm": 0.3847378194332123,
"learning_rate": 0.0004036932812710381,
"loss": 3.0419,
"step": 48700
},
{
"epoch": 13.118970714900948,
"grad_norm": 0.36374303698539734,
"learning_rate": 0.0004034913154705803,
"loss": 3.0542,
"step": 48750
},
{
"epoch": 13.132428940568476,
"grad_norm": 0.394603967666626,
"learning_rate": 0.0004032893496701225,
"loss": 3.047,
"step": 48800
},
{
"epoch": 13.145887166236003,
"grad_norm": 0.4271048903465271,
"learning_rate": 0.0004030873838696647,
"loss": 3.0521,
"step": 48850
},
{
"epoch": 13.159345391903532,
"grad_norm": 0.38402384519577026,
"learning_rate": 0.00040288541806920693,
"loss": 3.057,
"step": 48900
},
{
"epoch": 13.17280361757106,
"grad_norm": 0.38647517561912537,
"learning_rate": 0.0004026834522687491,
"loss": 3.0643,
"step": 48950
},
{
"epoch": 13.186261843238588,
"grad_norm": 0.372152715921402,
"learning_rate": 0.0004024814864682913,
"loss": 3.0591,
"step": 49000
},
{
"epoch": 13.186261843238588,
"eval_accuracy": 0.39232064468436567,
"eval_loss": 3.306389570236206,
"eval_runtime": 53.7783,
"eval_samples_per_second": 334.93,
"eval_steps_per_second": 20.938,
"step": 49000
},
{
"epoch": 13.199720068906116,
"grad_norm": 0.3545859456062317,
"learning_rate": 0.0004022795206678335,
"loss": 3.0649,
"step": 49050
},
{
"epoch": 13.213178294573643,
"grad_norm": 0.3903788924217224,
"learning_rate": 0.00040207755486737577,
"loss": 3.063,
"step": 49100
},
{
"epoch": 13.226636520241172,
"grad_norm": 0.3961975574493408,
"learning_rate": 0.00040187558906691796,
"loss": 3.0666,
"step": 49150
},
{
"epoch": 13.2400947459087,
"grad_norm": 0.3676101863384247,
"learning_rate": 0.00040167362326646016,
"loss": 3.0687,
"step": 49200
},
{
"epoch": 13.253552971576227,
"grad_norm": 0.3698784112930298,
"learning_rate": 0.00040147165746600235,
"loss": 3.0722,
"step": 49250
},
{
"epoch": 13.267011197243756,
"grad_norm": 0.3777436316013336,
"learning_rate": 0.00040126969166554455,
"loss": 3.0763,
"step": 49300
},
{
"epoch": 13.280469422911283,
"grad_norm": 0.3777233958244324,
"learning_rate": 0.00040106772586508686,
"loss": 3.0669,
"step": 49350
},
{
"epoch": 13.293927648578812,
"grad_norm": 0.3873639404773712,
"learning_rate": 0.00040086576006462905,
"loss": 3.0709,
"step": 49400
},
{
"epoch": 13.30738587424634,
"grad_norm": 0.36705899238586426,
"learning_rate": 0.00040066379426417125,
"loss": 3.0626,
"step": 49450
},
{
"epoch": 13.320844099913867,
"grad_norm": 0.35446983575820923,
"learning_rate": 0.00040046182846371344,
"loss": 3.076,
"step": 49500
},
{
"epoch": 13.334302325581396,
"grad_norm": 0.393531858921051,
"learning_rate": 0.0004002598626632557,
"loss": 3.0867,
"step": 49550
},
{
"epoch": 13.347760551248923,
"grad_norm": 0.4465448260307312,
"learning_rate": 0.0004000578968627979,
"loss": 3.0735,
"step": 49600
},
{
"epoch": 13.361218776916452,
"grad_norm": 0.3967099189758301,
"learning_rate": 0.0003998559310623401,
"loss": 3.0843,
"step": 49650
},
{
"epoch": 13.37467700258398,
"grad_norm": 0.3714440166950226,
"learning_rate": 0.0003996539652618823,
"loss": 3.0741,
"step": 49700
},
{
"epoch": 13.388135228251507,
"grad_norm": 0.4029998481273651,
"learning_rate": 0.0003994519994614245,
"loss": 3.0777,
"step": 49750
},
{
"epoch": 13.401593453919036,
"grad_norm": 0.39566588401794434,
"learning_rate": 0.00039925003366096673,
"loss": 3.0849,
"step": 49800
},
{
"epoch": 13.415051679586563,
"grad_norm": 0.3758895993232727,
"learning_rate": 0.0003990480678605089,
"loss": 3.0984,
"step": 49850
},
{
"epoch": 13.428509905254092,
"grad_norm": 0.3822093605995178,
"learning_rate": 0.0003988461020600511,
"loss": 3.0859,
"step": 49900
},
{
"epoch": 13.44196813092162,
"grad_norm": 0.35837680101394653,
"learning_rate": 0.0003986441362595933,
"loss": 3.0958,
"step": 49950
},
{
"epoch": 13.455426356589147,
"grad_norm": 0.42340973019599915,
"learning_rate": 0.0003984421704591355,
"loss": 3.0828,
"step": 50000
},
{
"epoch": 13.455426356589147,
"eval_accuracy": 0.39285638232915393,
"eval_loss": 3.3012328147888184,
"eval_runtime": 53.8418,
"eval_samples_per_second": 334.536,
"eval_steps_per_second": 20.913,
"step": 50000
},
{
"epoch": 13.468884582256676,
"grad_norm": 0.4042765200138092,
"learning_rate": 0.00039824020465867776,
"loss": 3.0829,
"step": 50050
},
{
"epoch": 13.482342807924203,
"grad_norm": 0.42345064878463745,
"learning_rate": 0.00039803823885821996,
"loss": 3.0972,
"step": 50100
},
{
"epoch": 13.49580103359173,
"grad_norm": 0.4129413366317749,
"learning_rate": 0.00039783627305776216,
"loss": 3.0934,
"step": 50150
},
{
"epoch": 13.50925925925926,
"grad_norm": 0.4112797677516937,
"learning_rate": 0.00039763430725730435,
"loss": 3.0947,
"step": 50200
},
{
"epoch": 13.522717484926787,
"grad_norm": 0.36507686972618103,
"learning_rate": 0.00039743234145684666,
"loss": 3.0975,
"step": 50250
},
{
"epoch": 13.536175710594316,
"grad_norm": 0.3800624907016754,
"learning_rate": 0.00039723037565638885,
"loss": 3.1025,
"step": 50300
},
{
"epoch": 13.549633936261843,
"grad_norm": 0.40299904346466064,
"learning_rate": 0.00039702840985593105,
"loss": 3.0832,
"step": 50350
},
{
"epoch": 13.56309216192937,
"grad_norm": 0.38481107354164124,
"learning_rate": 0.00039682644405547324,
"loss": 3.0907,
"step": 50400
},
{
"epoch": 13.5765503875969,
"grad_norm": 0.3586687445640564,
"learning_rate": 0.00039662447825501544,
"loss": 3.1058,
"step": 50450
},
{
"epoch": 13.590008613264427,
"grad_norm": 0.37668395042419434,
"learning_rate": 0.0003964225124545577,
"loss": 3.0936,
"step": 50500
},
{
"epoch": 13.603466838931956,
"grad_norm": 0.4070712625980377,
"learning_rate": 0.0003962205466540999,
"loss": 3.1099,
"step": 50550
},
{
"epoch": 13.616925064599483,
"grad_norm": 0.3600846230983734,
"learning_rate": 0.0003960185808536421,
"loss": 3.0942,
"step": 50600
},
{
"epoch": 13.63038329026701,
"grad_norm": 0.3763042390346527,
"learning_rate": 0.0003958166150531843,
"loss": 3.1044,
"step": 50650
},
{
"epoch": 13.64384151593454,
"grad_norm": 0.3834032714366913,
"learning_rate": 0.0003956146492527265,
"loss": 3.1118,
"step": 50700
},
{
"epoch": 13.657299741602067,
"grad_norm": 0.3641510009765625,
"learning_rate": 0.0003954126834522687,
"loss": 3.0999,
"step": 50750
},
{
"epoch": 13.670757967269594,
"grad_norm": 0.3956216275691986,
"learning_rate": 0.0003952107176518109,
"loss": 3.0896,
"step": 50800
},
{
"epoch": 13.684216192937123,
"grad_norm": 0.39530178904533386,
"learning_rate": 0.0003950087518513531,
"loss": 3.1057,
"step": 50850
},
{
"epoch": 13.69767441860465,
"grad_norm": 0.38161706924438477,
"learning_rate": 0.0003948067860508953,
"loss": 3.1006,
"step": 50900
},
{
"epoch": 13.71113264427218,
"grad_norm": 0.3645673990249634,
"learning_rate": 0.0003946048202504375,
"loss": 3.1121,
"step": 50950
},
{
"epoch": 13.724590869939707,
"grad_norm": 0.37178608775138855,
"learning_rate": 0.00039440285444997976,
"loss": 3.1093,
"step": 51000
},
{
"epoch": 13.724590869939707,
"eval_accuracy": 0.3933404040706935,
"eval_loss": 3.2926268577575684,
"eval_runtime": 53.8199,
"eval_samples_per_second": 334.672,
"eval_steps_per_second": 20.922,
"step": 51000
},
{
"epoch": 13.738049095607234,
"grad_norm": 0.3872733414173126,
"learning_rate": 0.00039420088864952196,
"loss": 3.0982,
"step": 51050
},
{
"epoch": 13.751507321274763,
"grad_norm": 0.3768724203109741,
"learning_rate": 0.00039399892284906415,
"loss": 3.1018,
"step": 51100
},
{
"epoch": 13.76496554694229,
"grad_norm": 0.3499247431755066,
"learning_rate": 0.00039379695704860646,
"loss": 3.1137,
"step": 51150
},
{
"epoch": 13.77842377260982,
"grad_norm": 0.3912452757358551,
"learning_rate": 0.00039359499124814865,
"loss": 3.1003,
"step": 51200
},
{
"epoch": 13.791881998277347,
"grad_norm": 0.40462765097618103,
"learning_rate": 0.00039339302544769085,
"loss": 3.109,
"step": 51250
},
{
"epoch": 13.805340223944874,
"grad_norm": 0.38249385356903076,
"learning_rate": 0.00039319105964723305,
"loss": 3.0964,
"step": 51300
},
{
"epoch": 13.818798449612403,
"grad_norm": 0.38811343908309937,
"learning_rate": 0.00039298909384677524,
"loss": 3.1095,
"step": 51350
},
{
"epoch": 13.83225667527993,
"grad_norm": 0.3731904625892639,
"learning_rate": 0.0003927871280463175,
"loss": 3.1067,
"step": 51400
},
{
"epoch": 13.84571490094746,
"grad_norm": 0.35964900255203247,
"learning_rate": 0.0003925851622458597,
"loss": 3.1094,
"step": 51450
},
{
"epoch": 13.859173126614987,
"grad_norm": 0.3533209562301636,
"learning_rate": 0.0003923831964454019,
"loss": 3.1167,
"step": 51500
},
{
"epoch": 13.872631352282514,
"grad_norm": 0.368437796831131,
"learning_rate": 0.0003921812306449441,
"loss": 3.1096,
"step": 51550
},
{
"epoch": 13.886089577950044,
"grad_norm": 0.39540213346481323,
"learning_rate": 0.0003919792648444863,
"loss": 3.1123,
"step": 51600
},
{
"epoch": 13.89954780361757,
"grad_norm": 0.38563069701194763,
"learning_rate": 0.0003917772990440285,
"loss": 3.1218,
"step": 51650
},
{
"epoch": 13.9130060292851,
"grad_norm": 0.3552553057670593,
"learning_rate": 0.0003915753332435707,
"loss": 3.1082,
"step": 51700
},
{
"epoch": 13.926464254952627,
"grad_norm": 0.3842296600341797,
"learning_rate": 0.0003913733674431129,
"loss": 3.097,
"step": 51750
},
{
"epoch": 13.939922480620154,
"grad_norm": 0.3653877377510071,
"learning_rate": 0.0003911714016426551,
"loss": 3.1161,
"step": 51800
},
{
"epoch": 13.953380706287684,
"grad_norm": 0.3652689754962921,
"learning_rate": 0.0003909694358421973,
"loss": 3.096,
"step": 51850
},
{
"epoch": 13.96683893195521,
"grad_norm": 0.3673049211502075,
"learning_rate": 0.00039076747004173956,
"loss": 3.1018,
"step": 51900
},
{
"epoch": 13.98029715762274,
"grad_norm": 0.3873627185821533,
"learning_rate": 0.00039056550424128176,
"loss": 3.1102,
"step": 51950
},
{
"epoch": 13.993755383290267,
"grad_norm": 0.3623334467411041,
"learning_rate": 0.00039036353844082395,
"loss": 3.1115,
"step": 52000
},
{
"epoch": 13.993755383290267,
"eval_accuracy": 0.39365352430843015,
"eval_loss": 3.287444591522217,
"eval_runtime": 54.0713,
"eval_samples_per_second": 333.115,
"eval_steps_per_second": 20.824,
"step": 52000
},
{
"epoch": 14.006998277347115,
"grad_norm": 0.39216455817222595,
"learning_rate": 0.0003901615726403662,
"loss": 3.0531,
"step": 52050
},
{
"epoch": 14.020456503014643,
"grad_norm": 0.39898359775543213,
"learning_rate": 0.00038995960683990845,
"loss": 3.0271,
"step": 52100
},
{
"epoch": 14.03391472868217,
"grad_norm": 0.3908878564834595,
"learning_rate": 0.00038975764103945065,
"loss": 3.0249,
"step": 52150
},
{
"epoch": 14.047372954349699,
"grad_norm": 0.3882172703742981,
"learning_rate": 0.00038955567523899285,
"loss": 3.0202,
"step": 52200
},
{
"epoch": 14.060831180017226,
"grad_norm": 0.3726351261138916,
"learning_rate": 0.00038935370943853504,
"loss": 3.0342,
"step": 52250
},
{
"epoch": 14.074289405684755,
"grad_norm": 0.3756456971168518,
"learning_rate": 0.00038915174363807724,
"loss": 3.0319,
"step": 52300
},
{
"epoch": 14.087747631352283,
"grad_norm": 0.39908862113952637,
"learning_rate": 0.0003889497778376195,
"loss": 3.0268,
"step": 52350
},
{
"epoch": 14.10120585701981,
"grad_norm": 0.3887988030910492,
"learning_rate": 0.0003887478120371617,
"loss": 3.0341,
"step": 52400
},
{
"epoch": 14.114664082687339,
"grad_norm": 0.38865944743156433,
"learning_rate": 0.0003885458462367039,
"loss": 3.038,
"step": 52450
},
{
"epoch": 14.128122308354866,
"grad_norm": 0.37640053033828735,
"learning_rate": 0.0003883438804362461,
"loss": 3.0369,
"step": 52500
},
{
"epoch": 14.141580534022394,
"grad_norm": 0.3813260793685913,
"learning_rate": 0.0003881419146357883,
"loss": 3.045,
"step": 52550
},
{
"epoch": 14.155038759689923,
"grad_norm": 0.3812572658061981,
"learning_rate": 0.0003879399488353305,
"loss": 3.06,
"step": 52600
},
{
"epoch": 14.16849698535745,
"grad_norm": 0.3978724777698517,
"learning_rate": 0.0003877379830348727,
"loss": 3.0308,
"step": 52650
},
{
"epoch": 14.18195521102498,
"grad_norm": 0.40393364429473877,
"learning_rate": 0.0003875360172344149,
"loss": 3.0453,
"step": 52700
},
{
"epoch": 14.195413436692506,
"grad_norm": 0.3880857229232788,
"learning_rate": 0.0003873340514339571,
"loss": 3.0465,
"step": 52750
},
{
"epoch": 14.208871662360034,
"grad_norm": 0.39600270986557007,
"learning_rate": 0.0003871320856334993,
"loss": 3.0441,
"step": 52800
},
{
"epoch": 14.222329888027563,
"grad_norm": 0.42481228709220886,
"learning_rate": 0.00038693011983304156,
"loss": 3.0545,
"step": 52850
},
{
"epoch": 14.23578811369509,
"grad_norm": 0.39608439803123474,
"learning_rate": 0.0003867281540325838,
"loss": 3.0596,
"step": 52900
},
{
"epoch": 14.24924633936262,
"grad_norm": 0.35995277762413025,
"learning_rate": 0.000386526188232126,
"loss": 3.0558,
"step": 52950
},
{
"epoch": 14.262704565030146,
"grad_norm": 0.3757532835006714,
"learning_rate": 0.00038632422243166825,
"loss": 3.0686,
"step": 53000
},
{
"epoch": 14.262704565030146,
"eval_accuracy": 0.3930424943927358,
"eval_loss": 3.301429271697998,
"eval_runtime": 53.8948,
"eval_samples_per_second": 334.207,
"eval_steps_per_second": 20.893,
"step": 53000
},
{
"epoch": 14.276162790697674,
"grad_norm": 0.3709987998008728,
"learning_rate": 0.00038612225663121045,
"loss": 3.0512,
"step": 53050
},
{
"epoch": 14.289621016365203,
"grad_norm": 0.40017008781433105,
"learning_rate": 0.00038592029083075265,
"loss": 3.0583,
"step": 53100
},
{
"epoch": 14.30307924203273,
"grad_norm": 0.37422966957092285,
"learning_rate": 0.00038571832503029484,
"loss": 3.0723,
"step": 53150
},
{
"epoch": 14.31653746770026,
"grad_norm": 0.3892733156681061,
"learning_rate": 0.00038551635922983704,
"loss": 3.0606,
"step": 53200
},
{
"epoch": 14.329995693367787,
"grad_norm": 0.39466214179992676,
"learning_rate": 0.0003853143934293793,
"loss": 3.0575,
"step": 53250
},
{
"epoch": 14.343453919035314,
"grad_norm": 0.3730536103248596,
"learning_rate": 0.0003851124276289215,
"loss": 3.0646,
"step": 53300
},
{
"epoch": 14.356912144702843,
"grad_norm": 0.37154144048690796,
"learning_rate": 0.0003849104618284637,
"loss": 3.0496,
"step": 53350
},
{
"epoch": 14.37037037037037,
"grad_norm": 0.3818061351776123,
"learning_rate": 0.0003847084960280059,
"loss": 3.0638,
"step": 53400
},
{
"epoch": 14.383828596037898,
"grad_norm": 0.38460275530815125,
"learning_rate": 0.0003845065302275481,
"loss": 3.0647,
"step": 53450
},
{
"epoch": 14.397286821705427,
"grad_norm": 0.37454620003700256,
"learning_rate": 0.0003843045644270903,
"loss": 3.0702,
"step": 53500
},
{
"epoch": 14.410745047372954,
"grad_norm": 0.3800989091396332,
"learning_rate": 0.0003841025986266325,
"loss": 3.0693,
"step": 53550
},
{
"epoch": 14.424203273040483,
"grad_norm": 0.40141618251800537,
"learning_rate": 0.0003839006328261747,
"loss": 3.0764,
"step": 53600
},
{
"epoch": 14.43766149870801,
"grad_norm": 0.40033701062202454,
"learning_rate": 0.0003836986670257169,
"loss": 3.0703,
"step": 53650
},
{
"epoch": 14.451119724375538,
"grad_norm": 0.37985333800315857,
"learning_rate": 0.0003834967012252591,
"loss": 3.0714,
"step": 53700
},
{
"epoch": 14.464577950043067,
"grad_norm": 0.368487685918808,
"learning_rate": 0.00038329473542480136,
"loss": 3.0718,
"step": 53750
},
{
"epoch": 14.478036175710594,
"grad_norm": 0.3814280331134796,
"learning_rate": 0.0003830927696243436,
"loss": 3.0695,
"step": 53800
},
{
"epoch": 14.491494401378123,
"grad_norm": 0.3550291061401367,
"learning_rate": 0.0003828908038238858,
"loss": 3.0714,
"step": 53850
},
{
"epoch": 14.50495262704565,
"grad_norm": 0.38511765003204346,
"learning_rate": 0.000382688838023428,
"loss": 3.0839,
"step": 53900
},
{
"epoch": 14.518410852713178,
"grad_norm": 0.40868499875068665,
"learning_rate": 0.00038248687222297025,
"loss": 3.0847,
"step": 53950
},
{
"epoch": 14.531869078380707,
"grad_norm": 0.41092634201049805,
"learning_rate": 0.00038228490642251245,
"loss": 3.0762,
"step": 54000
},
{
"epoch": 14.531869078380707,
"eval_accuracy": 0.393471975412782,
"eval_loss": 3.2951552867889404,
"eval_runtime": 53.9287,
"eval_samples_per_second": 333.997,
"eval_steps_per_second": 20.879,
"step": 54000
},
{
"epoch": 14.545327304048234,
"grad_norm": 0.37341392040252686,
"learning_rate": 0.00038208294062205464,
"loss": 3.0923,
"step": 54050
},
{
"epoch": 14.558785529715763,
"grad_norm": 0.40408065915107727,
"learning_rate": 0.00038188097482159684,
"loss": 3.0862,
"step": 54100
},
{
"epoch": 14.57224375538329,
"grad_norm": 0.3749300241470337,
"learning_rate": 0.00038167900902113904,
"loss": 3.0847,
"step": 54150
},
{
"epoch": 14.585701981050818,
"grad_norm": 0.3992476463317871,
"learning_rate": 0.0003814770432206813,
"loss": 3.0899,
"step": 54200
},
{
"epoch": 14.599160206718347,
"grad_norm": 0.41293609142303467,
"learning_rate": 0.0003812750774202235,
"loss": 3.0751,
"step": 54250
},
{
"epoch": 14.612618432385874,
"grad_norm": 0.3896488547325134,
"learning_rate": 0.0003810731116197657,
"loss": 3.0844,
"step": 54300
},
{
"epoch": 14.626076658053403,
"grad_norm": 0.3871552348136902,
"learning_rate": 0.0003808711458193079,
"loss": 3.0875,
"step": 54350
},
{
"epoch": 14.63953488372093,
"grad_norm": 0.3741537034511566,
"learning_rate": 0.00038066918001885007,
"loss": 3.0883,
"step": 54400
},
{
"epoch": 14.652993109388458,
"grad_norm": 0.40278348326683044,
"learning_rate": 0.0003804672142183923,
"loss": 3.0921,
"step": 54450
},
{
"epoch": 14.666451335055987,
"grad_norm": 0.40198415517807007,
"learning_rate": 0.0003802652484179345,
"loss": 3.094,
"step": 54500
},
{
"epoch": 14.679909560723514,
"grad_norm": 0.36237287521362305,
"learning_rate": 0.0003800632826174767,
"loss": 3.0858,
"step": 54550
},
{
"epoch": 14.693367786391041,
"grad_norm": 0.38690119981765747,
"learning_rate": 0.0003798613168170189,
"loss": 3.0753,
"step": 54600
},
{
"epoch": 14.70682601205857,
"grad_norm": 0.39163732528686523,
"learning_rate": 0.0003796593510165611,
"loss": 3.0878,
"step": 54650
},
{
"epoch": 14.720284237726098,
"grad_norm": 0.38994866609573364,
"learning_rate": 0.0003794573852161034,
"loss": 3.0708,
"step": 54700
},
{
"epoch": 14.733742463393627,
"grad_norm": 0.3824878931045532,
"learning_rate": 0.0003792554194156456,
"loss": 3.0862,
"step": 54750
},
{
"epoch": 14.747200689061154,
"grad_norm": 0.3619995415210724,
"learning_rate": 0.0003790534536151878,
"loss": 3.0844,
"step": 54800
},
{
"epoch": 14.760658914728682,
"grad_norm": 0.40019071102142334,
"learning_rate": 0.00037885148781473005,
"loss": 3.0864,
"step": 54850
},
{
"epoch": 14.77411714039621,
"grad_norm": 0.36875399947166443,
"learning_rate": 0.00037864952201427225,
"loss": 3.0859,
"step": 54900
},
{
"epoch": 14.787575366063738,
"grad_norm": 0.3560780882835388,
"learning_rate": 0.00037844755621381444,
"loss": 3.0997,
"step": 54950
},
{
"epoch": 14.801033591731267,
"grad_norm": 0.42874544858932495,
"learning_rate": 0.00037824559041335664,
"loss": 3.0797,
"step": 55000
},
{
"epoch": 14.801033591731267,
"eval_accuracy": 0.3938084547244651,
"eval_loss": 3.2878448963165283,
"eval_runtime": 53.7813,
"eval_samples_per_second": 334.912,
"eval_steps_per_second": 20.937,
"step": 55000
},
{
"epoch": 14.814491817398794,
"grad_norm": 0.3775944709777832,
"learning_rate": 0.00037804362461289884,
"loss": 3.0984,
"step": 55050
},
{
"epoch": 14.827950043066322,
"grad_norm": 0.3809267282485962,
"learning_rate": 0.0003778416588124411,
"loss": 3.0919,
"step": 55100
},
{
"epoch": 14.84140826873385,
"grad_norm": 0.3942723274230957,
"learning_rate": 0.0003776396930119833,
"loss": 3.1,
"step": 55150
},
{
"epoch": 14.854866494401378,
"grad_norm": 0.3672431707382202,
"learning_rate": 0.0003774377272115255,
"loss": 3.0864,
"step": 55200
},
{
"epoch": 14.868324720068905,
"grad_norm": 0.38476642966270447,
"learning_rate": 0.0003772357614110677,
"loss": 3.0911,
"step": 55250
},
{
"epoch": 14.881782945736434,
"grad_norm": 0.3824423849582672,
"learning_rate": 0.00037703379561060987,
"loss": 3.0828,
"step": 55300
},
{
"epoch": 14.895241171403962,
"grad_norm": 0.352585107088089,
"learning_rate": 0.0003768318298101521,
"loss": 3.0994,
"step": 55350
},
{
"epoch": 14.90869939707149,
"grad_norm": 0.4190617501735687,
"learning_rate": 0.0003766298640096943,
"loss": 3.0963,
"step": 55400
},
{
"epoch": 14.922157622739018,
"grad_norm": 0.3812563419342041,
"learning_rate": 0.0003764278982092365,
"loss": 3.0994,
"step": 55450
},
{
"epoch": 14.935615848406545,
"grad_norm": 0.37324008345603943,
"learning_rate": 0.0003762259324087787,
"loss": 3.0875,
"step": 55500
},
{
"epoch": 14.949074074074074,
"grad_norm": 0.3767263889312744,
"learning_rate": 0.0003760239666083209,
"loss": 3.0994,
"step": 55550
},
{
"epoch": 14.962532299741602,
"grad_norm": 0.39034217596054077,
"learning_rate": 0.0003758220008078632,
"loss": 3.103,
"step": 55600
},
{
"epoch": 14.97599052540913,
"grad_norm": 0.4098099172115326,
"learning_rate": 0.0003756200350074054,
"loss": 3.0952,
"step": 55650
},
{
"epoch": 14.989448751076658,
"grad_norm": 0.36337199807167053,
"learning_rate": 0.0003754180692069476,
"loss": 3.0987,
"step": 55700
},
{
"epoch": 15.002691645133506,
"grad_norm": 0.41866105794906616,
"learning_rate": 0.0003752161034064898,
"loss": 3.0784,
"step": 55750
},
{
"epoch": 15.016149870801033,
"grad_norm": 0.3725742995738983,
"learning_rate": 0.00037501413760603205,
"loss": 3.0073,
"step": 55800
},
{
"epoch": 15.02960809646856,
"grad_norm": 0.37522321939468384,
"learning_rate": 0.00037481217180557425,
"loss": 3.0118,
"step": 55850
},
{
"epoch": 15.04306632213609,
"grad_norm": 0.39416176080703735,
"learning_rate": 0.00037461020600511644,
"loss": 3.0055,
"step": 55900
},
{
"epoch": 15.056524547803617,
"grad_norm": 0.3889634311199188,
"learning_rate": 0.00037440824020465864,
"loss": 3.015,
"step": 55950
},
{
"epoch": 15.069982773471146,
"grad_norm": 0.4061291217803955,
"learning_rate": 0.00037420627440420083,
"loss": 3.0089,
"step": 56000
},
{
"epoch": 15.069982773471146,
"eval_accuracy": 0.39340830835542123,
"eval_loss": 3.298887252807617,
"eval_runtime": 53.7996,
"eval_samples_per_second": 334.798,
"eval_steps_per_second": 20.93,
"step": 56000
},
{
"epoch": 15.083440999138674,
"grad_norm": 0.3685813546180725,
"learning_rate": 0.0003740043086037431,
"loss": 3.0137,
"step": 56050
},
{
"epoch": 15.0968992248062,
"grad_norm": 0.3749956786632538,
"learning_rate": 0.0003738023428032853,
"loss": 3.0182,
"step": 56100
},
{
"epoch": 15.11035745047373,
"grad_norm": 0.3896404802799225,
"learning_rate": 0.0003736003770028275,
"loss": 3.0167,
"step": 56150
},
{
"epoch": 15.123815676141257,
"grad_norm": 0.40831461548805237,
"learning_rate": 0.00037339841120236967,
"loss": 3.0115,
"step": 56200
},
{
"epoch": 15.137273901808786,
"grad_norm": 0.36938607692718506,
"learning_rate": 0.00037319644540191187,
"loss": 3.0303,
"step": 56250
},
{
"epoch": 15.150732127476314,
"grad_norm": 0.4110319912433624,
"learning_rate": 0.0003729944796014541,
"loss": 3.0322,
"step": 56300
},
{
"epoch": 15.164190353143841,
"grad_norm": 0.383331835269928,
"learning_rate": 0.0003727925138009963,
"loss": 3.0447,
"step": 56350
},
{
"epoch": 15.17764857881137,
"grad_norm": 0.371985524892807,
"learning_rate": 0.0003725905480005385,
"loss": 3.0273,
"step": 56400
},
{
"epoch": 15.191106804478897,
"grad_norm": 0.3929747939109802,
"learning_rate": 0.0003723885822000807,
"loss": 3.0346,
"step": 56450
},
{
"epoch": 15.204565030146426,
"grad_norm": 0.3700959086418152,
"learning_rate": 0.000372186616399623,
"loss": 3.0414,
"step": 56500
},
{
"epoch": 15.218023255813954,
"grad_norm": 0.3915750980377197,
"learning_rate": 0.0003719846505991652,
"loss": 3.0328,
"step": 56550
},
{
"epoch": 15.231481481481481,
"grad_norm": 0.39838945865631104,
"learning_rate": 0.0003717826847987074,
"loss": 3.0331,
"step": 56600
},
{
"epoch": 15.24493970714901,
"grad_norm": 0.3869384825229645,
"learning_rate": 0.0003715807189982496,
"loss": 3.0534,
"step": 56650
},
{
"epoch": 15.258397932816537,
"grad_norm": 0.394001305103302,
"learning_rate": 0.00037137875319779185,
"loss": 3.0312,
"step": 56700
},
{
"epoch": 15.271856158484065,
"grad_norm": 0.4037325084209442,
"learning_rate": 0.00037117678739733405,
"loss": 3.051,
"step": 56750
},
{
"epoch": 15.285314384151594,
"grad_norm": 0.4047858715057373,
"learning_rate": 0.00037097482159687624,
"loss": 3.0406,
"step": 56800
},
{
"epoch": 15.298772609819121,
"grad_norm": 0.39127621054649353,
"learning_rate": 0.00037077285579641844,
"loss": 3.0564,
"step": 56850
},
{
"epoch": 15.31223083548665,
"grad_norm": 0.4089362323284149,
"learning_rate": 0.00037057088999596063,
"loss": 3.0495,
"step": 56900
},
{
"epoch": 15.325689061154177,
"grad_norm": 0.4318329095840454,
"learning_rate": 0.0003703689241955029,
"loss": 3.0614,
"step": 56950
},
{
"epoch": 15.339147286821705,
"grad_norm": 0.3942766487598419,
"learning_rate": 0.0003701669583950451,
"loss": 3.0535,
"step": 57000
},
{
"epoch": 15.339147286821705,
"eval_accuracy": 0.39346882465397065,
"eval_loss": 3.296555280685425,
"eval_runtime": 53.8696,
"eval_samples_per_second": 334.363,
"eval_steps_per_second": 20.902,
"step": 57000
},
{
"epoch": 15.352605512489234,
"grad_norm": 0.45541346073150635,
"learning_rate": 0.0003699649925945873,
"loss": 3.0515,
"step": 57050
},
{
"epoch": 15.366063738156761,
"grad_norm": 0.40245404839515686,
"learning_rate": 0.0003697630267941295,
"loss": 3.0566,
"step": 57100
},
{
"epoch": 15.37952196382429,
"grad_norm": 0.3851865231990814,
"learning_rate": 0.00036956106099367167,
"loss": 3.0555,
"step": 57150
},
{
"epoch": 15.392980189491817,
"grad_norm": 0.3949066996574402,
"learning_rate": 0.0003693590951932139,
"loss": 3.0416,
"step": 57200
},
{
"epoch": 15.406438415159345,
"grad_norm": 0.38792628049850464,
"learning_rate": 0.0003691571293927561,
"loss": 3.0546,
"step": 57250
},
{
"epoch": 15.419896640826874,
"grad_norm": 0.41789019107818604,
"learning_rate": 0.0003689551635922983,
"loss": 3.0661,
"step": 57300
},
{
"epoch": 15.433354866494401,
"grad_norm": 0.38184309005737305,
"learning_rate": 0.00036875319779184056,
"loss": 3.0535,
"step": 57350
},
{
"epoch": 15.44681309216193,
"grad_norm": 0.4123072326183319,
"learning_rate": 0.0003685512319913828,
"loss": 3.0622,
"step": 57400
},
{
"epoch": 15.460271317829458,
"grad_norm": 0.38529086112976074,
"learning_rate": 0.000368349266190925,
"loss": 3.0553,
"step": 57450
},
{
"epoch": 15.473729543496985,
"grad_norm": 0.37527531385421753,
"learning_rate": 0.0003681473003904672,
"loss": 3.0544,
"step": 57500
},
{
"epoch": 15.487187769164514,
"grad_norm": 0.38482967019081116,
"learning_rate": 0.0003679453345900094,
"loss": 3.063,
"step": 57550
},
{
"epoch": 15.500645994832041,
"grad_norm": 0.3870854377746582,
"learning_rate": 0.0003677433687895516,
"loss": 3.0678,
"step": 57600
},
{
"epoch": 15.514104220499568,
"grad_norm": 0.4207375645637512,
"learning_rate": 0.00036754140298909385,
"loss": 3.0578,
"step": 57650
},
{
"epoch": 15.527562446167098,
"grad_norm": 0.38036173582077026,
"learning_rate": 0.00036733943718863604,
"loss": 3.0655,
"step": 57700
},
{
"epoch": 15.541020671834625,
"grad_norm": 0.37135618925094604,
"learning_rate": 0.00036713747138817824,
"loss": 3.0561,
"step": 57750
},
{
"epoch": 15.554478897502154,
"grad_norm": 0.4144805073738098,
"learning_rate": 0.00036693550558772044,
"loss": 3.0701,
"step": 57800
},
{
"epoch": 15.567937123169681,
"grad_norm": 0.3989149332046509,
"learning_rate": 0.00036673353978726263,
"loss": 3.0672,
"step": 57850
},
{
"epoch": 15.581395348837209,
"grad_norm": 0.4064973294734955,
"learning_rate": 0.0003665315739868049,
"loss": 3.0692,
"step": 57900
},
{
"epoch": 15.594853574504738,
"grad_norm": 0.3864794373512268,
"learning_rate": 0.0003663296081863471,
"loss": 3.0669,
"step": 57950
},
{
"epoch": 15.608311800172265,
"grad_norm": 0.3883097767829895,
"learning_rate": 0.0003661276423858893,
"loss": 3.0641,
"step": 58000
},
{
"epoch": 15.608311800172265,
"eval_accuracy": 0.393839636372012,
"eval_loss": 3.2892065048217773,
"eval_runtime": 53.801,
"eval_samples_per_second": 334.789,
"eval_steps_per_second": 20.929,
"step": 58000
},
{
"epoch": 15.621770025839794,
"grad_norm": 0.4003264605998993,
"learning_rate": 0.00036592567658543147,
"loss": 3.0776,
"step": 58050
},
{
"epoch": 15.635228251507321,
"grad_norm": 0.40963396430015564,
"learning_rate": 0.00036572371078497367,
"loss": 3.0615,
"step": 58100
},
{
"epoch": 15.648686477174849,
"grad_norm": 0.3997986912727356,
"learning_rate": 0.0003655217449845159,
"loss": 3.0714,
"step": 58150
},
{
"epoch": 15.662144702842378,
"grad_norm": 0.40300729870796204,
"learning_rate": 0.0003653197791840581,
"loss": 3.0714,
"step": 58200
},
{
"epoch": 15.675602928509905,
"grad_norm": 0.4134303033351898,
"learning_rate": 0.00036511781338360036,
"loss": 3.0696,
"step": 58250
},
{
"epoch": 15.689061154177434,
"grad_norm": 0.36091458797454834,
"learning_rate": 0.00036491584758314256,
"loss": 3.0863,
"step": 58300
},
{
"epoch": 15.702519379844961,
"grad_norm": 0.380769819021225,
"learning_rate": 0.0003647138817826848,
"loss": 3.0681,
"step": 58350
},
{
"epoch": 15.715977605512489,
"grad_norm": 0.3838481605052948,
"learning_rate": 0.000364511915982227,
"loss": 3.0671,
"step": 58400
},
{
"epoch": 15.729435831180018,
"grad_norm": 0.4186742901802063,
"learning_rate": 0.0003643099501817692,
"loss": 3.0736,
"step": 58450
},
{
"epoch": 15.742894056847545,
"grad_norm": 0.4187677800655365,
"learning_rate": 0.0003641079843813114,
"loss": 3.0783,
"step": 58500
},
{
"epoch": 15.756352282515074,
"grad_norm": 0.3893994987010956,
"learning_rate": 0.0003639060185808536,
"loss": 3.074,
"step": 58550
},
{
"epoch": 15.769810508182601,
"grad_norm": 0.41808056831359863,
"learning_rate": 0.00036370405278039584,
"loss": 3.0731,
"step": 58600
},
{
"epoch": 15.783268733850129,
"grad_norm": 0.38791757822036743,
"learning_rate": 0.00036350208697993804,
"loss": 3.0766,
"step": 58650
},
{
"epoch": 15.796726959517658,
"grad_norm": 0.3836047649383545,
"learning_rate": 0.00036330012117948024,
"loss": 3.0732,
"step": 58700
},
{
"epoch": 15.810185185185185,
"grad_norm": 0.3814838230609894,
"learning_rate": 0.00036309815537902243,
"loss": 3.086,
"step": 58750
},
{
"epoch": 15.823643410852712,
"grad_norm": 0.3762393891811371,
"learning_rate": 0.0003628961895785647,
"loss": 3.0793,
"step": 58800
},
{
"epoch": 15.837101636520241,
"grad_norm": 0.4067472517490387,
"learning_rate": 0.0003626942237781069,
"loss": 3.084,
"step": 58850
},
{
"epoch": 15.850559862187769,
"grad_norm": 0.3889180123806,
"learning_rate": 0.0003624922579776491,
"loss": 3.0767,
"step": 58900
},
{
"epoch": 15.864018087855298,
"grad_norm": 0.36162832379341125,
"learning_rate": 0.00036229029217719127,
"loss": 3.0917,
"step": 58950
},
{
"epoch": 15.877476313522825,
"grad_norm": 0.41509488224983215,
"learning_rate": 0.00036208832637673347,
"loss": 3.082,
"step": 59000
},
{
"epoch": 15.877476313522825,
"eval_accuracy": 0.39411407832916784,
"eval_loss": 3.2861170768737793,
"eval_runtime": 53.7782,
"eval_samples_per_second": 334.931,
"eval_steps_per_second": 20.938,
"step": 59000
},
{
"epoch": 15.890934539190352,
"grad_norm": 0.38733652234077454,
"learning_rate": 0.0003618863605762757,
"loss": 3.0888,
"step": 59050
},
{
"epoch": 15.904392764857882,
"grad_norm": 0.3730623126029968,
"learning_rate": 0.0003616843947758179,
"loss": 3.0815,
"step": 59100
},
{
"epoch": 15.917850990525409,
"grad_norm": 0.4350191056728363,
"learning_rate": 0.00036148242897536016,
"loss": 3.0842,
"step": 59150
},
{
"epoch": 15.931309216192938,
"grad_norm": 0.3823108971118927,
"learning_rate": 0.00036128046317490236,
"loss": 3.0824,
"step": 59200
},
{
"epoch": 15.944767441860465,
"grad_norm": 0.3937095105648041,
"learning_rate": 0.0003610784973744446,
"loss": 3.0881,
"step": 59250
},
{
"epoch": 15.958225667527993,
"grad_norm": 0.43180474638938904,
"learning_rate": 0.0003608765315739868,
"loss": 3.0817,
"step": 59300
},
{
"epoch": 15.971683893195522,
"grad_norm": 0.37747758626937866,
"learning_rate": 0.000360674565773529,
"loss": 3.0744,
"step": 59350
},
{
"epoch": 15.985142118863049,
"grad_norm": 0.38189202547073364,
"learning_rate": 0.0003604725999730712,
"loss": 3.0809,
"step": 59400
},
{
"epoch": 15.998600344530576,
"grad_norm": 0.38622957468032837,
"learning_rate": 0.0003602706341726134,
"loss": 3.0887,
"step": 59450
},
{
"epoch": 16.011843238587424,
"grad_norm": 0.39193543791770935,
"learning_rate": 0.00036006866837215564,
"loss": 3.0044,
"step": 59500
},
{
"epoch": 16.02530146425495,
"grad_norm": 0.36499685049057007,
"learning_rate": 0.00035986670257169784,
"loss": 2.9916,
"step": 59550
},
{
"epoch": 16.03875968992248,
"grad_norm": 0.4056093096733093,
"learning_rate": 0.00035966473677124004,
"loss": 2.9866,
"step": 59600
},
{
"epoch": 16.05221791559001,
"grad_norm": 0.3871045708656311,
"learning_rate": 0.00035946277097078223,
"loss": 2.999,
"step": 59650
},
{
"epoch": 16.065676141257537,
"grad_norm": 0.40806224942207336,
"learning_rate": 0.00035926080517032443,
"loss": 3.0065,
"step": 59700
},
{
"epoch": 16.079134366925064,
"grad_norm": 0.3846936523914337,
"learning_rate": 0.0003590588393698667,
"loss": 3.0043,
"step": 59750
},
{
"epoch": 16.09259259259259,
"grad_norm": 0.41798853874206543,
"learning_rate": 0.0003588568735694089,
"loss": 3.005,
"step": 59800
},
{
"epoch": 16.10605081826012,
"grad_norm": 0.38085705041885376,
"learning_rate": 0.00035865490776895107,
"loss": 2.99,
"step": 59850
},
{
"epoch": 16.11950904392765,
"grad_norm": 0.37874123454093933,
"learning_rate": 0.00035845294196849327,
"loss": 3.0201,
"step": 59900
},
{
"epoch": 16.132967269595177,
"grad_norm": 0.38884493708610535,
"learning_rate": 0.00035825097616803546,
"loss": 2.9971,
"step": 59950
},
{
"epoch": 16.146425495262704,
"grad_norm": 0.39228013157844543,
"learning_rate": 0.0003580490103675777,
"loss": 3.0178,
"step": 60000
},
{
"epoch": 16.146425495262704,
"eval_accuracy": 0.3935985489995146,
"eval_loss": 3.300384283065796,
"eval_runtime": 53.764,
"eval_samples_per_second": 335.02,
"eval_steps_per_second": 20.943,
"step": 60000
},
{
"epoch": 16.15988372093023,
"grad_norm": 0.37571796774864197,
"learning_rate": 0.00035784704456711996,
"loss": 3.0206,
"step": 60050
},
{
"epoch": 16.17334194659776,
"grad_norm": 0.3760213255882263,
"learning_rate": 0.00035764507876666216,
"loss": 3.024,
"step": 60100
},
{
"epoch": 16.18680017226529,
"grad_norm": 0.4139486849308014,
"learning_rate": 0.00035744311296620436,
"loss": 3.0225,
"step": 60150
},
{
"epoch": 16.200258397932817,
"grad_norm": 0.41361162066459656,
"learning_rate": 0.0003572411471657466,
"loss": 3.0177,
"step": 60200
},
{
"epoch": 16.213716623600344,
"grad_norm": 0.4155104160308838,
"learning_rate": 0.0003570391813652888,
"loss": 3.0305,
"step": 60250
},
{
"epoch": 16.227174849267872,
"grad_norm": 0.4072454869747162,
"learning_rate": 0.000356837215564831,
"loss": 3.0319,
"step": 60300
},
{
"epoch": 16.2406330749354,
"grad_norm": 0.3964226543903351,
"learning_rate": 0.0003566352497643732,
"loss": 3.0271,
"step": 60350
},
{
"epoch": 16.25409130060293,
"grad_norm": 0.40195783972740173,
"learning_rate": 0.0003564332839639154,
"loss": 3.0424,
"step": 60400
},
{
"epoch": 16.267549526270457,
"grad_norm": 0.39406704902648926,
"learning_rate": 0.00035623131816345764,
"loss": 3.0325,
"step": 60450
},
{
"epoch": 16.281007751937985,
"grad_norm": 0.3998791575431824,
"learning_rate": 0.00035602935236299984,
"loss": 3.0188,
"step": 60500
},
{
"epoch": 16.294465977605512,
"grad_norm": 0.4291344881057739,
"learning_rate": 0.00035582738656254203,
"loss": 3.0384,
"step": 60550
},
{
"epoch": 16.30792420327304,
"grad_norm": 0.42071303725242615,
"learning_rate": 0.00035562542076208423,
"loss": 3.04,
"step": 60600
},
{
"epoch": 16.32138242894057,
"grad_norm": 0.40110665559768677,
"learning_rate": 0.0003554234549616264,
"loss": 3.0441,
"step": 60650
},
{
"epoch": 16.334840654608097,
"grad_norm": 0.4020436704158783,
"learning_rate": 0.0003552214891611687,
"loss": 3.035,
"step": 60700
},
{
"epoch": 16.348298880275625,
"grad_norm": 0.39630720019340515,
"learning_rate": 0.00035501952336071087,
"loss": 3.0418,
"step": 60750
},
{
"epoch": 16.361757105943152,
"grad_norm": 0.39753347635269165,
"learning_rate": 0.00035481755756025307,
"loss": 3.0448,
"step": 60800
},
{
"epoch": 16.37521533161068,
"grad_norm": 0.39796334505081177,
"learning_rate": 0.00035461559175979526,
"loss": 3.0384,
"step": 60850
},
{
"epoch": 16.38867355727821,
"grad_norm": 0.4023888111114502,
"learning_rate": 0.0003544136259593375,
"loss": 3.0512,
"step": 60900
},
{
"epoch": 16.402131782945737,
"grad_norm": 0.3885258436203003,
"learning_rate": 0.00035421166015887977,
"loss": 3.0415,
"step": 60950
},
{
"epoch": 16.415590008613265,
"grad_norm": 0.39868536591529846,
"learning_rate": 0.00035400969435842196,
"loss": 3.0405,
"step": 61000
},
{
"epoch": 16.415590008613265,
"eval_accuracy": 0.39402955107553866,
"eval_loss": 3.2945430278778076,
"eval_runtime": 53.9312,
"eval_samples_per_second": 333.981,
"eval_steps_per_second": 20.878,
"step": 61000
},
{
"epoch": 16.429048234280792,
"grad_norm": 0.4093899130821228,
"learning_rate": 0.00035380772855796416,
"loss": 3.0436,
"step": 61050
},
{
"epoch": 16.44250645994832,
"grad_norm": 0.44660383462905884,
"learning_rate": 0.0003536057627575064,
"loss": 3.041,
"step": 61100
},
{
"epoch": 16.45596468561585,
"grad_norm": 0.39885213971138,
"learning_rate": 0.0003534037969570486,
"loss": 3.05,
"step": 61150
},
{
"epoch": 16.469422911283377,
"grad_norm": 0.3952399492263794,
"learning_rate": 0.0003532018311565908,
"loss": 3.0525,
"step": 61200
},
{
"epoch": 16.482881136950905,
"grad_norm": 0.3856141269207001,
"learning_rate": 0.000352999865356133,
"loss": 3.0495,
"step": 61250
},
{
"epoch": 16.496339362618432,
"grad_norm": 0.4019649028778076,
"learning_rate": 0.0003527978995556752,
"loss": 3.0524,
"step": 61300
},
{
"epoch": 16.50979758828596,
"grad_norm": 0.40786245465278625,
"learning_rate": 0.00035259593375521744,
"loss": 3.0536,
"step": 61350
},
{
"epoch": 16.52325581395349,
"grad_norm": 0.39933323860168457,
"learning_rate": 0.00035239396795475964,
"loss": 3.0513,
"step": 61400
},
{
"epoch": 16.536714039621017,
"grad_norm": 0.4114389717578888,
"learning_rate": 0.00035219200215430183,
"loss": 3.0541,
"step": 61450
},
{
"epoch": 16.550172265288545,
"grad_norm": 0.378556489944458,
"learning_rate": 0.00035199003635384403,
"loss": 3.0464,
"step": 61500
},
{
"epoch": 16.563630490956072,
"grad_norm": 0.3831023573875427,
"learning_rate": 0.0003517880705533862,
"loss": 3.0496,
"step": 61550
},
{
"epoch": 16.5770887166236,
"grad_norm": 0.41064491868019104,
"learning_rate": 0.0003515861047529285,
"loss": 3.0466,
"step": 61600
},
{
"epoch": 16.590546942291127,
"grad_norm": 0.3654628098011017,
"learning_rate": 0.0003513841389524707,
"loss": 3.0648,
"step": 61650
},
{
"epoch": 16.604005167958658,
"grad_norm": 0.4042568504810333,
"learning_rate": 0.00035118217315201287,
"loss": 3.046,
"step": 61700
},
{
"epoch": 16.617463393626185,
"grad_norm": 0.3990447223186493,
"learning_rate": 0.00035098020735155507,
"loss": 3.0588,
"step": 61750
},
{
"epoch": 16.630921619293712,
"grad_norm": 0.3792515695095062,
"learning_rate": 0.00035077824155109737,
"loss": 3.0543,
"step": 61800
},
{
"epoch": 16.64437984496124,
"grad_norm": 0.40316513180732727,
"learning_rate": 0.00035057627575063957,
"loss": 3.0559,
"step": 61850
},
{
"epoch": 16.657838070628767,
"grad_norm": 0.4269405007362366,
"learning_rate": 0.00035037430995018176,
"loss": 3.0573,
"step": 61900
},
{
"epoch": 16.671296296296298,
"grad_norm": 0.3970986008644104,
"learning_rate": 0.00035017234414972396,
"loss": 3.0569,
"step": 61950
},
{
"epoch": 16.684754521963825,
"grad_norm": 0.40799281001091003,
"learning_rate": 0.00034997037834926615,
"loss": 3.0625,
"step": 62000
},
{
"epoch": 16.684754521963825,
"eval_accuracy": 0.3946576385475567,
"eval_loss": 3.286635637283325,
"eval_runtime": 53.7597,
"eval_samples_per_second": 335.047,
"eval_steps_per_second": 20.945,
"step": 62000
},
{
"epoch": 16.698212747631352,
"grad_norm": 0.43438488245010376,
"learning_rate": 0.0003497684125488084,
"loss": 3.0667,
"step": 62050
},
{
"epoch": 16.71167097329888,
"grad_norm": 0.385447233915329,
"learning_rate": 0.0003495664467483506,
"loss": 3.0618,
"step": 62100
},
{
"epoch": 16.725129198966407,
"grad_norm": 0.4281361699104309,
"learning_rate": 0.0003493644809478928,
"loss": 3.0627,
"step": 62150
},
{
"epoch": 16.738587424633938,
"grad_norm": 0.41003167629241943,
"learning_rate": 0.000349162515147435,
"loss": 3.0556,
"step": 62200
},
{
"epoch": 16.752045650301465,
"grad_norm": 0.40472331643104553,
"learning_rate": 0.0003489605493469772,
"loss": 3.0607,
"step": 62250
},
{
"epoch": 16.765503875968992,
"grad_norm": 0.39419615268707275,
"learning_rate": 0.00034875858354651944,
"loss": 3.0669,
"step": 62300
},
{
"epoch": 16.77896210163652,
"grad_norm": 0.40481939911842346,
"learning_rate": 0.00034855661774606164,
"loss": 3.0657,
"step": 62350
},
{
"epoch": 16.792420327304047,
"grad_norm": 0.4163655936717987,
"learning_rate": 0.00034835465194560383,
"loss": 3.0697,
"step": 62400
},
{
"epoch": 16.805878552971578,
"grad_norm": 0.3779263496398926,
"learning_rate": 0.00034815268614514603,
"loss": 3.0607,
"step": 62450
},
{
"epoch": 16.819336778639105,
"grad_norm": 0.3963411748409271,
"learning_rate": 0.0003479507203446882,
"loss": 3.0616,
"step": 62500
},
{
"epoch": 16.832795004306632,
"grad_norm": 0.3897510766983032,
"learning_rate": 0.0003477487545442305,
"loss": 3.0729,
"step": 62550
},
{
"epoch": 16.84625322997416,
"grad_norm": 0.3864540159702301,
"learning_rate": 0.00034754678874377267,
"loss": 3.0724,
"step": 62600
},
{
"epoch": 16.859711455641687,
"grad_norm": 0.42142173647880554,
"learning_rate": 0.00034734482294331487,
"loss": 3.0703,
"step": 62650
},
{
"epoch": 16.873169681309218,
"grad_norm": 0.3723422884941101,
"learning_rate": 0.00034714285714285717,
"loss": 3.0714,
"step": 62700
},
{
"epoch": 16.886627906976745,
"grad_norm": 0.39334335923194885,
"learning_rate": 0.00034694089134239937,
"loss": 3.0651,
"step": 62750
},
{
"epoch": 16.900086132644272,
"grad_norm": 0.408385306596756,
"learning_rate": 0.00034673892554194156,
"loss": 3.0698,
"step": 62800
},
{
"epoch": 16.9135443583118,
"grad_norm": 0.40279293060302734,
"learning_rate": 0.00034653695974148376,
"loss": 3.0715,
"step": 62850
},
{
"epoch": 16.927002583979327,
"grad_norm": 0.39347708225250244,
"learning_rate": 0.00034633499394102595,
"loss": 3.0641,
"step": 62900
},
{
"epoch": 16.940460809646858,
"grad_norm": 0.3958011269569397,
"learning_rate": 0.0003461330281405682,
"loss": 3.0754,
"step": 62950
},
{
"epoch": 16.953919035314385,
"grad_norm": 0.3987782895565033,
"learning_rate": 0.0003459310623401104,
"loss": 3.0662,
"step": 63000
},
{
"epoch": 16.953919035314385,
"eval_accuracy": 0.39508483798363603,
"eval_loss": 3.2805721759796143,
"eval_runtime": 53.7065,
"eval_samples_per_second": 335.378,
"eval_steps_per_second": 20.966,
"step": 63000
},
{
"epoch": 16.967377260981912,
"grad_norm": 0.4031330645084381,
"learning_rate": 0.0003457290965396526,
"loss": 3.0679,
"step": 63050
},
{
"epoch": 16.98083548664944,
"grad_norm": 0.410900741815567,
"learning_rate": 0.0003455271307391948,
"loss": 3.072,
"step": 63100
},
{
"epoch": 16.994293712316967,
"grad_norm": 0.3965557813644409,
"learning_rate": 0.000345325164938737,
"loss": 3.0664,
"step": 63150
},
{
"epoch": 17.007536606373815,
"grad_norm": 0.41293439269065857,
"learning_rate": 0.00034512319913827924,
"loss": 3.013,
"step": 63200
},
{
"epoch": 17.020994832041342,
"grad_norm": 0.414392352104187,
"learning_rate": 0.00034492123333782144,
"loss": 2.9733,
"step": 63250
},
{
"epoch": 17.034453057708873,
"grad_norm": 0.4076540470123291,
"learning_rate": 0.00034471926753736363,
"loss": 2.9819,
"step": 63300
},
{
"epoch": 17.0479112833764,
"grad_norm": 0.4018961489200592,
"learning_rate": 0.00034451730173690583,
"loss": 2.9865,
"step": 63350
},
{
"epoch": 17.061369509043928,
"grad_norm": 0.4264177978038788,
"learning_rate": 0.000344315335936448,
"loss": 2.9894,
"step": 63400
},
{
"epoch": 17.074827734711455,
"grad_norm": 0.43880993127822876,
"learning_rate": 0.0003441133701359903,
"loss": 2.997,
"step": 63450
},
{
"epoch": 17.088285960378983,
"grad_norm": 0.4118068218231201,
"learning_rate": 0.00034391140433553247,
"loss": 2.9963,
"step": 63500
},
{
"epoch": 17.101744186046513,
"grad_norm": 0.37996986508369446,
"learning_rate": 0.00034370943853507467,
"loss": 2.9818,
"step": 63550
},
{
"epoch": 17.11520241171404,
"grad_norm": 0.41833794116973877,
"learning_rate": 0.0003435074727346169,
"loss": 3.0017,
"step": 63600
},
{
"epoch": 17.128660637381568,
"grad_norm": 0.39799895882606506,
"learning_rate": 0.00034330550693415917,
"loss": 2.9962,
"step": 63650
},
{
"epoch": 17.142118863049095,
"grad_norm": 0.41376993060112,
"learning_rate": 0.00034310354113370136,
"loss": 3.0022,
"step": 63700
},
{
"epoch": 17.155577088716623,
"grad_norm": 0.37583568692207336,
"learning_rate": 0.00034290157533324356,
"loss": 3.0003,
"step": 63750
},
{
"epoch": 17.16903531438415,
"grad_norm": 0.4121802747249603,
"learning_rate": 0.00034269960953278576,
"loss": 3.0103,
"step": 63800
},
{
"epoch": 17.18249354005168,
"grad_norm": 0.41035139560699463,
"learning_rate": 0.00034249764373232795,
"loss": 3.0181,
"step": 63850
},
{
"epoch": 17.195951765719208,
"grad_norm": 0.4261062741279602,
"learning_rate": 0.0003422956779318702,
"loss": 3.0084,
"step": 63900
},
{
"epoch": 17.209409991386735,
"grad_norm": 0.40924733877182007,
"learning_rate": 0.0003420937121314124,
"loss": 2.9905,
"step": 63950
},
{
"epoch": 17.222868217054263,
"grad_norm": 0.40608179569244385,
"learning_rate": 0.0003418917463309546,
"loss": 3.0215,
"step": 64000
},
{
"epoch": 17.222868217054263,
"eval_accuracy": 0.39425086472032345,
"eval_loss": 3.298827886581421,
"eval_runtime": 53.7996,
"eval_samples_per_second": 334.798,
"eval_steps_per_second": 20.93,
"step": 64000
},
{
"epoch": 17.23632644272179,
"grad_norm": 0.39848390221595764,
"learning_rate": 0.0003416897805304968,
"loss": 3.0149,
"step": 64050
},
{
"epoch": 17.24978466838932,
"grad_norm": 0.40141576528549194,
"learning_rate": 0.000341487814730039,
"loss": 3.016,
"step": 64100
},
{
"epoch": 17.263242894056848,
"grad_norm": 0.40395596623420715,
"learning_rate": 0.00034128584892958124,
"loss": 3.0128,
"step": 64150
},
{
"epoch": 17.276701119724375,
"grad_norm": 0.3964282274246216,
"learning_rate": 0.00034108388312912343,
"loss": 3.0192,
"step": 64200
},
{
"epoch": 17.290159345391903,
"grad_norm": 0.3771671950817108,
"learning_rate": 0.00034088191732866563,
"loss": 3.026,
"step": 64250
},
{
"epoch": 17.30361757105943,
"grad_norm": 0.4377134442329407,
"learning_rate": 0.0003406799515282078,
"loss": 3.0216,
"step": 64300
},
{
"epoch": 17.31707579672696,
"grad_norm": 0.4398791790008545,
"learning_rate": 0.00034047798572775,
"loss": 3.024,
"step": 64350
},
{
"epoch": 17.330534022394488,
"grad_norm": 0.4335786998271942,
"learning_rate": 0.00034027601992729227,
"loss": 3.0369,
"step": 64400
},
{
"epoch": 17.343992248062015,
"grad_norm": 0.43530410528182983,
"learning_rate": 0.00034007405412683447,
"loss": 3.0229,
"step": 64450
},
{
"epoch": 17.357450473729543,
"grad_norm": 0.38939180970191956,
"learning_rate": 0.0003398720883263767,
"loss": 3.0386,
"step": 64500
},
{
"epoch": 17.37090869939707,
"grad_norm": 0.4043862521648407,
"learning_rate": 0.00033967012252591897,
"loss": 3.0247,
"step": 64550
},
{
"epoch": 17.3843669250646,
"grad_norm": 0.40165212750434875,
"learning_rate": 0.00033946815672546116,
"loss": 3.0206,
"step": 64600
},
{
"epoch": 17.397825150732128,
"grad_norm": 0.40205591917037964,
"learning_rate": 0.00033926619092500336,
"loss": 3.031,
"step": 64650
},
{
"epoch": 17.411283376399656,
"grad_norm": 0.40525221824645996,
"learning_rate": 0.00033906422512454556,
"loss": 3.0279,
"step": 64700
},
{
"epoch": 17.424741602067183,
"grad_norm": 0.3944297134876251,
"learning_rate": 0.00033886225932408775,
"loss": 3.045,
"step": 64750
},
{
"epoch": 17.43819982773471,
"grad_norm": 0.4010119140148163,
"learning_rate": 0.00033866029352363,
"loss": 3.0263,
"step": 64800
},
{
"epoch": 17.45165805340224,
"grad_norm": 0.41308003664016724,
"learning_rate": 0.0003384583277231722,
"loss": 3.0358,
"step": 64850
},
{
"epoch": 17.46511627906977,
"grad_norm": 0.39949336647987366,
"learning_rate": 0.0003382563619227144,
"loss": 3.0366,
"step": 64900
},
{
"epoch": 17.478574504737296,
"grad_norm": 0.39073440432548523,
"learning_rate": 0.0003380543961222566,
"loss": 3.0358,
"step": 64950
},
{
"epoch": 17.492032730404823,
"grad_norm": 0.4053451716899872,
"learning_rate": 0.0003378524303217988,
"loss": 3.0386,
"step": 65000
},
{
"epoch": 17.492032730404823,
"eval_accuracy": 0.39411951067194606,
"eval_loss": 3.292527675628662,
"eval_runtime": 53.6555,
"eval_samples_per_second": 335.697,
"eval_steps_per_second": 20.986,
"step": 65000
},
{
"epoch": 17.50549095607235,
"grad_norm": 0.38078823685646057,
"learning_rate": 0.00033765046452134104,
"loss": 3.0278,
"step": 65050
},
{
"epoch": 17.51894918173988,
"grad_norm": 0.4012027084827423,
"learning_rate": 0.00033744849872088323,
"loss": 3.0415,
"step": 65100
},
{
"epoch": 17.53240740740741,
"grad_norm": 0.4206222891807556,
"learning_rate": 0.00033724653292042543,
"loss": 3.0358,
"step": 65150
},
{
"epoch": 17.545865633074936,
"grad_norm": 0.4432675838470459,
"learning_rate": 0.0003370445671199676,
"loss": 3.0415,
"step": 65200
},
{
"epoch": 17.559323858742463,
"grad_norm": 0.4019043445587158,
"learning_rate": 0.0003368426013195098,
"loss": 3.0508,
"step": 65250
},
{
"epoch": 17.57278208440999,
"grad_norm": 0.4079228639602661,
"learning_rate": 0.00033664063551905207,
"loss": 3.038,
"step": 65300
},
{
"epoch": 17.58624031007752,
"grad_norm": 0.404167115688324,
"learning_rate": 0.00033643866971859427,
"loss": 3.0394,
"step": 65350
},
{
"epoch": 17.59969853574505,
"grad_norm": 0.37649834156036377,
"learning_rate": 0.0003362367039181365,
"loss": 3.0455,
"step": 65400
},
{
"epoch": 17.613156761412576,
"grad_norm": 0.406495600938797,
"learning_rate": 0.0003360347381176787,
"loss": 3.0486,
"step": 65450
},
{
"epoch": 17.626614987080103,
"grad_norm": 0.3711218237876892,
"learning_rate": 0.00033583277231722097,
"loss": 3.0505,
"step": 65500
},
{
"epoch": 17.64007321274763,
"grad_norm": 0.4153537154197693,
"learning_rate": 0.00033563080651676316,
"loss": 3.0454,
"step": 65550
},
{
"epoch": 17.653531438415158,
"grad_norm": 0.4080434739589691,
"learning_rate": 0.00033542884071630536,
"loss": 3.0557,
"step": 65600
},
{
"epoch": 17.66698966408269,
"grad_norm": 0.403916597366333,
"learning_rate": 0.00033522687491584755,
"loss": 3.0523,
"step": 65650
},
{
"epoch": 17.680447889750216,
"grad_norm": 0.40088391304016113,
"learning_rate": 0.00033502490911538975,
"loss": 3.0431,
"step": 65700
},
{
"epoch": 17.693906115417743,
"grad_norm": 0.4177820086479187,
"learning_rate": 0.000334822943314932,
"loss": 3.0592,
"step": 65750
},
{
"epoch": 17.70736434108527,
"grad_norm": 0.4082454741001129,
"learning_rate": 0.0003346209775144742,
"loss": 3.0556,
"step": 65800
},
{
"epoch": 17.720822566752798,
"grad_norm": 0.4115118086338043,
"learning_rate": 0.0003344190117140164,
"loss": 3.0538,
"step": 65850
},
{
"epoch": 17.73428079242033,
"grad_norm": 0.4031737744808197,
"learning_rate": 0.0003342170459135586,
"loss": 3.047,
"step": 65900
},
{
"epoch": 17.747739018087856,
"grad_norm": 0.3955917954444885,
"learning_rate": 0.0003340150801131008,
"loss": 3.0408,
"step": 65950
},
{
"epoch": 17.761197243755383,
"grad_norm": 0.4012543559074402,
"learning_rate": 0.00033381311431264303,
"loss": 3.0558,
"step": 66000
},
{
"epoch": 17.761197243755383,
"eval_accuracy": 0.3950584367977339,
"eval_loss": 3.285689115524292,
"eval_runtime": 53.7002,
"eval_samples_per_second": 335.418,
"eval_steps_per_second": 20.968,
"step": 66000
},
{
"epoch": 17.77465546942291,
"grad_norm": 0.4043067693710327,
"learning_rate": 0.00033361114851218523,
"loss": 3.0489,
"step": 66050
},
{
"epoch": 17.788113695090438,
"grad_norm": 0.39902979135513306,
"learning_rate": 0.0003334091827117274,
"loss": 3.0574,
"step": 66100
},
{
"epoch": 17.80157192075797,
"grad_norm": 0.38922590017318726,
"learning_rate": 0.0003332072169112696,
"loss": 3.0436,
"step": 66150
},
{
"epoch": 17.815030146425496,
"grad_norm": 0.40953120589256287,
"learning_rate": 0.0003330052511108118,
"loss": 3.0573,
"step": 66200
},
{
"epoch": 17.828488372093023,
"grad_norm": 0.3957615792751312,
"learning_rate": 0.0003328032853103541,
"loss": 3.0469,
"step": 66250
},
{
"epoch": 17.84194659776055,
"grad_norm": 0.3910478949546814,
"learning_rate": 0.0003326013195098963,
"loss": 3.0572,
"step": 66300
},
{
"epoch": 17.855404823428078,
"grad_norm": 0.3920937776565552,
"learning_rate": 0.0003323993537094385,
"loss": 3.0609,
"step": 66350
},
{
"epoch": 17.86886304909561,
"grad_norm": 0.4142955243587494,
"learning_rate": 0.00033219738790898077,
"loss": 3.0574,
"step": 66400
},
{
"epoch": 17.882321274763136,
"grad_norm": 0.405781090259552,
"learning_rate": 0.00033199542210852296,
"loss": 3.0626,
"step": 66450
},
{
"epoch": 17.895779500430663,
"grad_norm": 0.4257899522781372,
"learning_rate": 0.00033179345630806516,
"loss": 3.0595,
"step": 66500
},
{
"epoch": 17.90923772609819,
"grad_norm": 0.39462465047836304,
"learning_rate": 0.00033159149050760735,
"loss": 3.0525,
"step": 66550
},
{
"epoch": 17.922695951765718,
"grad_norm": 0.4202898442745209,
"learning_rate": 0.00033138952470714955,
"loss": 3.0605,
"step": 66600
},
{
"epoch": 17.93615417743325,
"grad_norm": 0.40152961015701294,
"learning_rate": 0.0003311875589066918,
"loss": 3.0562,
"step": 66650
},
{
"epoch": 17.949612403100776,
"grad_norm": 0.40167486667633057,
"learning_rate": 0.000330985593106234,
"loss": 3.0591,
"step": 66700
},
{
"epoch": 17.963070628768303,
"grad_norm": 0.4044479727745056,
"learning_rate": 0.0003307836273057762,
"loss": 3.0511,
"step": 66750
},
{
"epoch": 17.97652885443583,
"grad_norm": 0.4120575785636902,
"learning_rate": 0.0003305816615053184,
"loss": 3.0666,
"step": 66800
},
{
"epoch": 17.989987080103358,
"grad_norm": 0.4056987762451172,
"learning_rate": 0.0003303796957048606,
"loss": 3.0567,
"step": 66850
},
{
"epoch": 18.003229974160206,
"grad_norm": 0.3993924558162689,
"learning_rate": 0.00033017772990440284,
"loss": 3.025,
"step": 66900
},
{
"epoch": 18.016688199827733,
"grad_norm": 0.4064941704273224,
"learning_rate": 0.00032997576410394503,
"loss": 2.9688,
"step": 66950
},
{
"epoch": 18.030146425495264,
"grad_norm": 0.41245749592781067,
"learning_rate": 0.00032977379830348723,
"loss": 2.9668,
"step": 67000
},
{
"epoch": 18.030146425495264,
"eval_accuracy": 0.39444577717920604,
"eval_loss": 3.294330358505249,
"eval_runtime": 53.7092,
"eval_samples_per_second": 335.361,
"eval_steps_per_second": 20.965,
"step": 67000
},
{
"epoch": 18.04360465116279,
"grad_norm": 0.40845730900764465,
"learning_rate": 0.0003295718325030294,
"loss": 2.9719,
"step": 67050
},
{
"epoch": 18.05706287683032,
"grad_norm": 0.3882702887058258,
"learning_rate": 0.0003293698667025716,
"loss": 2.9772,
"step": 67100
},
{
"epoch": 18.070521102497846,
"grad_norm": 0.41161108016967773,
"learning_rate": 0.0003291679009021139,
"loss": 2.9702,
"step": 67150
},
{
"epoch": 18.083979328165373,
"grad_norm": 0.40619149804115295,
"learning_rate": 0.0003289659351016561,
"loss": 2.9882,
"step": 67200
},
{
"epoch": 18.097437553832904,
"grad_norm": 0.4130079448223114,
"learning_rate": 0.0003287639693011983,
"loss": 2.9782,
"step": 67250
},
{
"epoch": 18.11089577950043,
"grad_norm": 0.4122353196144104,
"learning_rate": 0.0003285620035007405,
"loss": 2.9827,
"step": 67300
},
{
"epoch": 18.12435400516796,
"grad_norm": 0.42838728427886963,
"learning_rate": 0.00032836003770028276,
"loss": 2.9885,
"step": 67350
},
{
"epoch": 18.137812230835486,
"grad_norm": 0.4504906237125397,
"learning_rate": 0.00032815807189982496,
"loss": 3.0084,
"step": 67400
},
{
"epoch": 18.151270456503013,
"grad_norm": 0.4178699254989624,
"learning_rate": 0.00032795610609936715,
"loss": 2.9822,
"step": 67450
},
{
"epoch": 18.164728682170544,
"grad_norm": 0.4099842309951782,
"learning_rate": 0.00032775414029890935,
"loss": 3.001,
"step": 67500
},
{
"epoch": 18.17818690783807,
"grad_norm": 0.4077083170413971,
"learning_rate": 0.00032755217449845155,
"loss": 2.9985,
"step": 67550
},
{
"epoch": 18.1916451335056,
"grad_norm": 0.40748822689056396,
"learning_rate": 0.0003273502086979938,
"loss": 2.9892,
"step": 67600
},
{
"epoch": 18.205103359173126,
"grad_norm": 0.3962273597717285,
"learning_rate": 0.000327148242897536,
"loss": 2.9994,
"step": 67650
},
{
"epoch": 18.218561584840653,
"grad_norm": 0.4352484345436096,
"learning_rate": 0.0003269462770970782,
"loss": 3.0084,
"step": 67700
},
{
"epoch": 18.232019810508184,
"grad_norm": 0.393781453371048,
"learning_rate": 0.0003267443112966204,
"loss": 3.0002,
"step": 67750
},
{
"epoch": 18.24547803617571,
"grad_norm": 0.4139980673789978,
"learning_rate": 0.0003265423454961626,
"loss": 3.0153,
"step": 67800
},
{
"epoch": 18.25893626184324,
"grad_norm": 0.5772366523742676,
"learning_rate": 0.00032634037969570483,
"loss": 3.0038,
"step": 67850
},
{
"epoch": 18.272394487510766,
"grad_norm": 0.40956127643585205,
"learning_rate": 0.00032613841389524703,
"loss": 3.0068,
"step": 67900
},
{
"epoch": 18.285852713178294,
"grad_norm": 0.417221337556839,
"learning_rate": 0.0003259364480947892,
"loss": 3.0055,
"step": 67950
},
{
"epoch": 18.29931093884582,
"grad_norm": 0.4156797230243683,
"learning_rate": 0.0003257344822943314,
"loss": 3.006,
"step": 68000
},
{
"epoch": 18.29931093884582,
"eval_accuracy": 0.394656552079001,
"eval_loss": 3.295771360397339,
"eval_runtime": 53.9336,
"eval_samples_per_second": 333.966,
"eval_steps_per_second": 20.878,
"step": 68000
},
{
"epoch": 18.31276916451335,
"grad_norm": 0.3963870108127594,
"learning_rate": 0.0003255325164938737,
"loss": 3.0138,
"step": 68050
},
{
"epoch": 18.32622739018088,
"grad_norm": 0.39834779500961304,
"learning_rate": 0.0003253305506934159,
"loss": 3.0103,
"step": 68100
},
{
"epoch": 18.339685615848406,
"grad_norm": 0.43453970551490784,
"learning_rate": 0.0003251285848929581,
"loss": 3.0127,
"step": 68150
},
{
"epoch": 18.353143841515934,
"grad_norm": 0.40500518679618835,
"learning_rate": 0.0003249266190925003,
"loss": 3.0052,
"step": 68200
},
{
"epoch": 18.36660206718346,
"grad_norm": 0.4253470003604889,
"learning_rate": 0.0003247246532920425,
"loss": 3.0107,
"step": 68250
},
{
"epoch": 18.38006029285099,
"grad_norm": 0.39556410908699036,
"learning_rate": 0.00032452268749158476,
"loss": 3.0187,
"step": 68300
},
{
"epoch": 18.39351851851852,
"grad_norm": 0.41645529866218567,
"learning_rate": 0.00032432072169112696,
"loss": 3.0237,
"step": 68350
},
{
"epoch": 18.406976744186046,
"grad_norm": 0.4263641834259033,
"learning_rate": 0.00032411875589066915,
"loss": 3.0154,
"step": 68400
},
{
"epoch": 18.420434969853574,
"grad_norm": 0.4121094048023224,
"learning_rate": 0.00032391679009021135,
"loss": 3.0223,
"step": 68450
},
{
"epoch": 18.4338931955211,
"grad_norm": 0.41028955578804016,
"learning_rate": 0.0003237148242897536,
"loss": 3.0178,
"step": 68500
},
{
"epoch": 18.447351421188632,
"grad_norm": 0.4091287851333618,
"learning_rate": 0.0003235128584892958,
"loss": 3.0271,
"step": 68550
},
{
"epoch": 18.46080964685616,
"grad_norm": 0.4232977032661438,
"learning_rate": 0.000323310892688838,
"loss": 3.0182,
"step": 68600
},
{
"epoch": 18.474267872523686,
"grad_norm": 0.42817702889442444,
"learning_rate": 0.0003231089268883802,
"loss": 3.0154,
"step": 68650
},
{
"epoch": 18.487726098191214,
"grad_norm": 0.40232494473457336,
"learning_rate": 0.0003229069610879224,
"loss": 3.0274,
"step": 68700
},
{
"epoch": 18.50118432385874,
"grad_norm": 0.44148021936416626,
"learning_rate": 0.00032270499528746463,
"loss": 3.0238,
"step": 68750
},
{
"epoch": 18.514642549526272,
"grad_norm": 0.4260505437850952,
"learning_rate": 0.00032250302948700683,
"loss": 3.0144,
"step": 68800
},
{
"epoch": 18.5281007751938,
"grad_norm": 0.4293977916240692,
"learning_rate": 0.000322301063686549,
"loss": 3.0332,
"step": 68850
},
{
"epoch": 18.541559000861326,
"grad_norm": 0.40571919083595276,
"learning_rate": 0.0003220990978860912,
"loss": 3.0276,
"step": 68900
},
{
"epoch": 18.555017226528854,
"grad_norm": 0.42529913783073425,
"learning_rate": 0.0003218971320856335,
"loss": 3.0315,
"step": 68950
},
{
"epoch": 18.56847545219638,
"grad_norm": 0.41675060987472534,
"learning_rate": 0.0003216951662851757,
"loss": 3.033,
"step": 69000
},
{
"epoch": 18.56847545219638,
"eval_accuracy": 0.3946870818454146,
"eval_loss": 3.29016375541687,
"eval_runtime": 53.8549,
"eval_samples_per_second": 334.455,
"eval_steps_per_second": 20.908,
"step": 69000
},
{
"epoch": 18.581933677863912,
"grad_norm": 0.4186652898788452,
"learning_rate": 0.0003214932004847179,
"loss": 3.0313,
"step": 69050
},
{
"epoch": 18.59539190353144,
"grad_norm": 0.38911905884742737,
"learning_rate": 0.0003212912346842601,
"loss": 3.0319,
"step": 69100
},
{
"epoch": 18.608850129198967,
"grad_norm": 0.39345499873161316,
"learning_rate": 0.0003210892688838023,
"loss": 3.0337,
"step": 69150
},
{
"epoch": 18.622308354866494,
"grad_norm": 0.40627118945121765,
"learning_rate": 0.00032088730308334456,
"loss": 3.0292,
"step": 69200
},
{
"epoch": 18.63576658053402,
"grad_norm": 0.4483228623867035,
"learning_rate": 0.00032068533728288676,
"loss": 3.0318,
"step": 69250
},
{
"epoch": 18.649224806201552,
"grad_norm": 0.46649301052093506,
"learning_rate": 0.00032048337148242895,
"loss": 3.033,
"step": 69300
},
{
"epoch": 18.66268303186908,
"grad_norm": 0.4463086426258087,
"learning_rate": 0.00032028140568197115,
"loss": 3.0324,
"step": 69350
},
{
"epoch": 18.676141257536607,
"grad_norm": 0.38854748010635376,
"learning_rate": 0.00032007943988151334,
"loss": 3.0428,
"step": 69400
},
{
"epoch": 18.689599483204134,
"grad_norm": 0.39255139231681824,
"learning_rate": 0.0003198774740810556,
"loss": 3.0289,
"step": 69450
},
{
"epoch": 18.70305770887166,
"grad_norm": 0.3995414972305298,
"learning_rate": 0.0003196755082805978,
"loss": 3.0391,
"step": 69500
},
{
"epoch": 18.716515934539192,
"grad_norm": 0.391249418258667,
"learning_rate": 0.00031947354248014,
"loss": 3.047,
"step": 69550
},
{
"epoch": 18.72997416020672,
"grad_norm": 0.4146568179130554,
"learning_rate": 0.0003192715766796822,
"loss": 3.0407,
"step": 69600
},
{
"epoch": 18.743432385874247,
"grad_norm": 0.40213024616241455,
"learning_rate": 0.0003190696108792244,
"loss": 3.0424,
"step": 69650
},
{
"epoch": 18.756890611541774,
"grad_norm": 0.4518072009086609,
"learning_rate": 0.00031886764507876663,
"loss": 3.0393,
"step": 69700
},
{
"epoch": 18.7703488372093,
"grad_norm": 0.4141794741153717,
"learning_rate": 0.0003186656792783088,
"loss": 3.0423,
"step": 69750
},
{
"epoch": 18.783807062876832,
"grad_norm": 0.40530627965927124,
"learning_rate": 0.000318463713477851,
"loss": 3.0372,
"step": 69800
},
{
"epoch": 18.79726528854436,
"grad_norm": 0.4177812933921814,
"learning_rate": 0.00031826174767739327,
"loss": 3.0382,
"step": 69850
},
{
"epoch": 18.810723514211887,
"grad_norm": 0.41861647367477417,
"learning_rate": 0.0003180597818769355,
"loss": 3.0386,
"step": 69900
},
{
"epoch": 18.824181739879414,
"grad_norm": 0.41492319107055664,
"learning_rate": 0.0003178578160764777,
"loss": 3.0423,
"step": 69950
},
{
"epoch": 18.83763996554694,
"grad_norm": 0.41332483291625977,
"learning_rate": 0.0003176558502760199,
"loss": 3.0561,
"step": 70000
},
{
"epoch": 18.83763996554694,
"eval_accuracy": 0.3955084520734818,
"eval_loss": 3.280143976211548,
"eval_runtime": 55.4316,
"eval_samples_per_second": 324.941,
"eval_steps_per_second": 20.313,
"step": 70000
},
{
"epoch": 18.85109819121447,
"grad_norm": 0.40138715505599976,
"learning_rate": 0.0003174538844755621,
"loss": 3.0547,
"step": 70050
},
{
"epoch": 18.864556416882,
"grad_norm": 0.451656311750412,
"learning_rate": 0.0003172519186751043,
"loss": 3.0475,
"step": 70100
},
{
"epoch": 18.878014642549527,
"grad_norm": 0.4022556245326996,
"learning_rate": 0.00031704995287464656,
"loss": 3.0491,
"step": 70150
},
{
"epoch": 18.891472868217054,
"grad_norm": 0.43887948989868164,
"learning_rate": 0.00031684798707418875,
"loss": 3.055,
"step": 70200
},
{
"epoch": 18.90493109388458,
"grad_norm": 0.4141213595867157,
"learning_rate": 0.00031664602127373095,
"loss": 3.0429,
"step": 70250
},
{
"epoch": 18.91838931955211,
"grad_norm": 0.43518728017807007,
"learning_rate": 0.00031644405547327315,
"loss": 3.0606,
"step": 70300
},
{
"epoch": 18.93184754521964,
"grad_norm": 0.45341476798057556,
"learning_rate": 0.00031624208967281534,
"loss": 3.0438,
"step": 70350
},
{
"epoch": 18.945305770887167,
"grad_norm": 0.3871983587741852,
"learning_rate": 0.0003160401238723576,
"loss": 3.0371,
"step": 70400
},
{
"epoch": 18.958763996554694,
"grad_norm": 0.38054120540618896,
"learning_rate": 0.0003158381580718998,
"loss": 3.0451,
"step": 70450
},
{
"epoch": 18.97222222222222,
"grad_norm": 0.4231681227684021,
"learning_rate": 0.000315636192271442,
"loss": 3.0478,
"step": 70500
},
{
"epoch": 18.98568044788975,
"grad_norm": 0.4074268043041229,
"learning_rate": 0.0003154342264709842,
"loss": 3.0456,
"step": 70550
},
{
"epoch": 18.99913867355728,
"grad_norm": 0.4081641137599945,
"learning_rate": 0.00031523226067052643,
"loss": 3.0445,
"step": 70600
},
{
"epoch": 19.012381567614124,
"grad_norm": 0.40542617440223694,
"learning_rate": 0.0003150302948700686,
"loss": 2.9738,
"step": 70650
},
{
"epoch": 19.025839793281655,
"grad_norm": 0.4279390275478363,
"learning_rate": 0.0003148283290696108,
"loss": 2.9535,
"step": 70700
},
{
"epoch": 19.039298018949182,
"grad_norm": 0.4095859229564667,
"learning_rate": 0.0003146263632691531,
"loss": 2.9532,
"step": 70750
},
{
"epoch": 19.05275624461671,
"grad_norm": 0.39089035987854004,
"learning_rate": 0.0003144243974686953,
"loss": 2.9654,
"step": 70800
},
{
"epoch": 19.066214470284237,
"grad_norm": 0.42628535628318787,
"learning_rate": 0.0003142224316682375,
"loss": 2.9724,
"step": 70850
},
{
"epoch": 19.079672695951764,
"grad_norm": 0.42668628692626953,
"learning_rate": 0.0003140204658677797,
"loss": 2.9648,
"step": 70900
},
{
"epoch": 19.093130921619295,
"grad_norm": 0.44374603033065796,
"learning_rate": 0.0003138185000673219,
"loss": 2.9687,
"step": 70950
},
{
"epoch": 19.106589147286822,
"grad_norm": 0.4030226767063141,
"learning_rate": 0.0003136165342668641,
"loss": 2.9788,
"step": 71000
},
{
"epoch": 19.106589147286822,
"eval_accuracy": 0.3948139813727139,
"eval_loss": 3.2952170372009277,
"eval_runtime": 55.0491,
"eval_samples_per_second": 327.199,
"eval_steps_per_second": 20.454,
"step": 71000
},
{
"epoch": 19.12004737295435,
"grad_norm": 0.44227829575538635,
"learning_rate": 0.00031341456846640636,
"loss": 2.9831,
"step": 71050
},
{
"epoch": 19.133505598621877,
"grad_norm": 0.42472559213638306,
"learning_rate": 0.00031321260266594855,
"loss": 2.9756,
"step": 71100
},
{
"epoch": 19.146963824289404,
"grad_norm": 0.4363155961036682,
"learning_rate": 0.00031301063686549075,
"loss": 2.9772,
"step": 71150
},
{
"epoch": 19.160422049956935,
"grad_norm": 0.387218177318573,
"learning_rate": 0.00031280867106503295,
"loss": 2.9856,
"step": 71200
},
{
"epoch": 19.173880275624462,
"grad_norm": 0.43504536151885986,
"learning_rate": 0.00031260670526457514,
"loss": 2.9884,
"step": 71250
},
{
"epoch": 19.18733850129199,
"grad_norm": 0.4170171618461609,
"learning_rate": 0.0003124047394641174,
"loss": 2.9937,
"step": 71300
},
{
"epoch": 19.200796726959517,
"grad_norm": 0.4231520891189575,
"learning_rate": 0.0003122027736636596,
"loss": 2.988,
"step": 71350
},
{
"epoch": 19.214254952627044,
"grad_norm": 0.4176693260669708,
"learning_rate": 0.0003120008078632018,
"loss": 2.9804,
"step": 71400
},
{
"epoch": 19.227713178294575,
"grad_norm": 0.41988492012023926,
"learning_rate": 0.000311798842062744,
"loss": 2.9922,
"step": 71450
},
{
"epoch": 19.241171403962102,
"grad_norm": 0.4519420862197876,
"learning_rate": 0.0003115968762622862,
"loss": 2.9901,
"step": 71500
},
{
"epoch": 19.25462962962963,
"grad_norm": 0.4187524914741516,
"learning_rate": 0.00031139491046182843,
"loss": 2.993,
"step": 71550
},
{
"epoch": 19.268087855297157,
"grad_norm": 0.41358649730682373,
"learning_rate": 0.0003111929446613707,
"loss": 2.9979,
"step": 71600
},
{
"epoch": 19.281546080964684,
"grad_norm": 0.44196566939353943,
"learning_rate": 0.0003109909788609129,
"loss": 3.0035,
"step": 71650
},
{
"epoch": 19.295004306632215,
"grad_norm": 0.44156354665756226,
"learning_rate": 0.00031078901306045507,
"loss": 2.9925,
"step": 71700
},
{
"epoch": 19.308462532299743,
"grad_norm": 0.41277235746383667,
"learning_rate": 0.0003105870472599973,
"loss": 2.9963,
"step": 71750
},
{
"epoch": 19.32192075796727,
"grad_norm": 0.4259941577911377,
"learning_rate": 0.0003103850814595395,
"loss": 3.0017,
"step": 71800
},
{
"epoch": 19.335378983634797,
"grad_norm": 0.4394524097442627,
"learning_rate": 0.0003101831156590817,
"loss": 3.0053,
"step": 71850
},
{
"epoch": 19.348837209302324,
"grad_norm": 0.40097951889038086,
"learning_rate": 0.0003099811498586239,
"loss": 3.0032,
"step": 71900
},
{
"epoch": 19.362295434969855,
"grad_norm": 0.4460085928440094,
"learning_rate": 0.0003097791840581661,
"loss": 3.005,
"step": 71950
},
{
"epoch": 19.375753660637383,
"grad_norm": 0.4067431092262268,
"learning_rate": 0.00030957721825770836,
"loss": 3.0036,
"step": 72000
},
{
"epoch": 19.375753660637383,
"eval_accuracy": 0.39494533542109134,
"eval_loss": 3.2903969287872314,
"eval_runtime": 55.4344,
"eval_samples_per_second": 324.925,
"eval_steps_per_second": 20.312,
"step": 72000
},
{
"epoch": 19.38921188630491,
"grad_norm": 0.42211583256721497,
"learning_rate": 0.00030937525245725055,
"loss": 3.0002,
"step": 72050
},
{
"epoch": 19.402670111972437,
"grad_norm": 0.4175995886325836,
"learning_rate": 0.00030917328665679275,
"loss": 3.0021,
"step": 72100
},
{
"epoch": 19.416128337639964,
"grad_norm": 0.4194367229938507,
"learning_rate": 0.00030897132085633494,
"loss": 3.0103,
"step": 72150
},
{
"epoch": 19.429586563307495,
"grad_norm": 0.42266571521759033,
"learning_rate": 0.00030876935505587714,
"loss": 3.0068,
"step": 72200
},
{
"epoch": 19.443044788975023,
"grad_norm": 0.41117116808891296,
"learning_rate": 0.0003085673892554194,
"loss": 3.0042,
"step": 72250
},
{
"epoch": 19.45650301464255,
"grad_norm": 0.43733495473861694,
"learning_rate": 0.0003083654234549616,
"loss": 3.0077,
"step": 72300
},
{
"epoch": 19.469961240310077,
"grad_norm": 0.4256683588027954,
"learning_rate": 0.0003081634576545038,
"loss": 3.0079,
"step": 72350
},
{
"epoch": 19.483419465977605,
"grad_norm": 0.4018738865852356,
"learning_rate": 0.000307961491854046,
"loss": 3.0079,
"step": 72400
},
{
"epoch": 19.496877691645132,
"grad_norm": 0.41685667634010315,
"learning_rate": 0.00030775952605358823,
"loss": 3.0111,
"step": 72450
},
{
"epoch": 19.510335917312663,
"grad_norm": 0.3987107574939728,
"learning_rate": 0.0003075575602531305,
"loss": 3.0036,
"step": 72500
},
{
"epoch": 19.52379414298019,
"grad_norm": 0.45025312900543213,
"learning_rate": 0.0003073555944526727,
"loss": 3.0131,
"step": 72550
},
{
"epoch": 19.537252368647717,
"grad_norm": 0.43616893887519836,
"learning_rate": 0.00030715362865221487,
"loss": 3.014,
"step": 72600
},
{
"epoch": 19.550710594315245,
"grad_norm": 0.39793211221694946,
"learning_rate": 0.0003069516628517571,
"loss": 3.0105,
"step": 72650
},
{
"epoch": 19.564168819982772,
"grad_norm": 0.438885360956192,
"learning_rate": 0.0003067496970512993,
"loss": 3.017,
"step": 72700
},
{
"epoch": 19.577627045650303,
"grad_norm": 0.4122118055820465,
"learning_rate": 0.0003065477312508415,
"loss": 3.0191,
"step": 72750
},
{
"epoch": 19.59108527131783,
"grad_norm": 0.42040184140205383,
"learning_rate": 0.0003063457654503837,
"loss": 3.0237,
"step": 72800
},
{
"epoch": 19.604543496985357,
"grad_norm": 0.4153655767440796,
"learning_rate": 0.0003061437996499259,
"loss": 3.0278,
"step": 72850
},
{
"epoch": 19.618001722652885,
"grad_norm": 0.4188932478427887,
"learning_rate": 0.00030594183384946816,
"loss": 3.0302,
"step": 72900
},
{
"epoch": 19.631459948320412,
"grad_norm": 0.41482555866241455,
"learning_rate": 0.00030573986804901035,
"loss": 3.0148,
"step": 72950
},
{
"epoch": 19.644918173987943,
"grad_norm": 0.3990839421749115,
"learning_rate": 0.00030553790224855255,
"loss": 3.0251,
"step": 73000
},
{
"epoch": 19.644918173987943,
"eval_accuracy": 0.3955169265282158,
"eval_loss": 3.2822790145874023,
"eval_runtime": 55.3891,
"eval_samples_per_second": 325.19,
"eval_steps_per_second": 20.329,
"step": 73000
},
{
"epoch": 19.65837639965547,
"grad_norm": 0.4105415940284729,
"learning_rate": 0.00030533593644809474,
"loss": 3.035,
"step": 73050
},
{
"epoch": 19.671834625322997,
"grad_norm": 0.40522801876068115,
"learning_rate": 0.00030513397064763694,
"loss": 3.0141,
"step": 73100
},
{
"epoch": 19.685292850990525,
"grad_norm": 0.43990591168403625,
"learning_rate": 0.0003049320048471792,
"loss": 3.0327,
"step": 73150
},
{
"epoch": 19.698751076658052,
"grad_norm": 0.41591787338256836,
"learning_rate": 0.0003047300390467214,
"loss": 3.0286,
"step": 73200
},
{
"epoch": 19.712209302325583,
"grad_norm": 0.4017585217952728,
"learning_rate": 0.0003045280732462636,
"loss": 3.0219,
"step": 73250
},
{
"epoch": 19.72566752799311,
"grad_norm": 0.42853009700775146,
"learning_rate": 0.0003043261074458058,
"loss": 3.0272,
"step": 73300
},
{
"epoch": 19.739125753660637,
"grad_norm": 0.43009036779403687,
"learning_rate": 0.000304124141645348,
"loss": 3.0274,
"step": 73350
},
{
"epoch": 19.752583979328165,
"grad_norm": 0.383428692817688,
"learning_rate": 0.0003039221758448903,
"loss": 3.0356,
"step": 73400
},
{
"epoch": 19.766042204995692,
"grad_norm": 0.42961567640304565,
"learning_rate": 0.0003037202100444325,
"loss": 3.0221,
"step": 73450
},
{
"epoch": 19.779500430663223,
"grad_norm": 0.4073396623134613,
"learning_rate": 0.00030351824424397467,
"loss": 3.0323,
"step": 73500
},
{
"epoch": 19.79295865633075,
"grad_norm": 0.4351046681404114,
"learning_rate": 0.00030331627844351687,
"loss": 3.0328,
"step": 73550
},
{
"epoch": 19.806416881998278,
"grad_norm": 0.42110610008239746,
"learning_rate": 0.0003031143126430591,
"loss": 3.0431,
"step": 73600
},
{
"epoch": 19.819875107665805,
"grad_norm": 0.39867541193962097,
"learning_rate": 0.0003029123468426013,
"loss": 3.0301,
"step": 73650
},
{
"epoch": 19.833333333333332,
"grad_norm": 0.42126715183258057,
"learning_rate": 0.0003027103810421435,
"loss": 3.0318,
"step": 73700
},
{
"epoch": 19.846791559000863,
"grad_norm": 0.4099518954753876,
"learning_rate": 0.0003025084152416857,
"loss": 3.0313,
"step": 73750
},
{
"epoch": 19.86024978466839,
"grad_norm": 0.43412184715270996,
"learning_rate": 0.0003023064494412279,
"loss": 3.0272,
"step": 73800
},
{
"epoch": 19.873708010335918,
"grad_norm": 0.3960249125957489,
"learning_rate": 0.00030210448364077015,
"loss": 3.0411,
"step": 73850
},
{
"epoch": 19.887166236003445,
"grad_norm": 0.4127246141433716,
"learning_rate": 0.00030190251784031235,
"loss": 3.0356,
"step": 73900
},
{
"epoch": 19.900624461670972,
"grad_norm": 0.4184736907482147,
"learning_rate": 0.00030170055203985454,
"loss": 3.0384,
"step": 73950
},
{
"epoch": 19.9140826873385,
"grad_norm": 0.4029821753501892,
"learning_rate": 0.00030149858623939674,
"loss": 3.0421,
"step": 74000
},
{
"epoch": 19.9140826873385,
"eval_accuracy": 0.3958698115150891,
"eval_loss": 3.27805757522583,
"eval_runtime": 55.2649,
"eval_samples_per_second": 325.921,
"eval_steps_per_second": 20.375,
"step": 74000
},
{
"epoch": 19.92754091300603,
"grad_norm": 0.41179102659225464,
"learning_rate": 0.00030129662043893894,
"loss": 3.0358,
"step": 74050
},
{
"epoch": 19.940999138673558,
"grad_norm": 0.4140405058860779,
"learning_rate": 0.0003010946546384812,
"loss": 3.0384,
"step": 74100
},
{
"epoch": 19.954457364341085,
"grad_norm": 0.4397096335887909,
"learning_rate": 0.0003008926888380234,
"loss": 3.0453,
"step": 74150
},
{
"epoch": 19.967915590008612,
"grad_norm": 0.410493940114975,
"learning_rate": 0.0003006907230375656,
"loss": 3.0347,
"step": 74200
},
{
"epoch": 19.98137381567614,
"grad_norm": 0.4063168466091156,
"learning_rate": 0.0003004887572371078,
"loss": 3.0357,
"step": 74250
},
{
"epoch": 19.99483204134367,
"grad_norm": 0.4114035964012146,
"learning_rate": 0.0003002867914366501,
"loss": 3.0458,
"step": 74300
},
{
"epoch": 20.00807493540052,
"grad_norm": 0.4396079480648041,
"learning_rate": 0.0003000848256361923,
"loss": 2.9857,
"step": 74350
},
{
"epoch": 20.021533161068046,
"grad_norm": 0.41727790236473083,
"learning_rate": 0.00029988285983573447,
"loss": 2.9466,
"step": 74400
},
{
"epoch": 20.034991386735573,
"grad_norm": 0.41952675580978394,
"learning_rate": 0.00029968089403527667,
"loss": 2.9494,
"step": 74450
},
{
"epoch": 20.0484496124031,
"grad_norm": 0.48774582147598267,
"learning_rate": 0.0002994789282348189,
"loss": 2.9569,
"step": 74500
},
{
"epoch": 20.061907838070628,
"grad_norm": 0.41048768162727356,
"learning_rate": 0.0002992769624343611,
"loss": 2.9548,
"step": 74550
},
{
"epoch": 20.075366063738155,
"grad_norm": 0.4365278482437134,
"learning_rate": 0.0002990749966339033,
"loss": 2.9549,
"step": 74600
},
{
"epoch": 20.088824289405686,
"grad_norm": 0.4222956597805023,
"learning_rate": 0.0002988730308334455,
"loss": 2.9585,
"step": 74650
},
{
"epoch": 20.102282515073213,
"grad_norm": 0.40463319420814514,
"learning_rate": 0.0002986710650329877,
"loss": 2.9616,
"step": 74700
},
{
"epoch": 20.11574074074074,
"grad_norm": 0.4185885488986969,
"learning_rate": 0.00029846909923252995,
"loss": 2.9653,
"step": 74750
},
{
"epoch": 20.129198966408268,
"grad_norm": 0.4026871621608734,
"learning_rate": 0.00029826713343207215,
"loss": 2.9658,
"step": 74800
},
{
"epoch": 20.142657192075795,
"grad_norm": 0.43527159094810486,
"learning_rate": 0.00029806516763161435,
"loss": 2.9718,
"step": 74850
},
{
"epoch": 20.156115417743326,
"grad_norm": 0.4299376308917999,
"learning_rate": 0.00029786320183115654,
"loss": 2.9793,
"step": 74900
},
{
"epoch": 20.169573643410853,
"grad_norm": 0.4304753541946411,
"learning_rate": 0.0002976612360306988,
"loss": 2.978,
"step": 74950
},
{
"epoch": 20.18303186907838,
"grad_norm": 0.4182604253292084,
"learning_rate": 0.000297459270230241,
"loss": 2.9677,
"step": 75000
},
{
"epoch": 20.18303186907838,
"eval_accuracy": 0.39509135679496993,
"eval_loss": 3.2936933040618896,
"eval_runtime": 55.0997,
"eval_samples_per_second": 326.898,
"eval_steps_per_second": 20.436,
"step": 75000
},
{
"epoch": 20.196490094745908,
"grad_norm": 0.4104526937007904,
"learning_rate": 0.0002972573044297832,
"loss": 2.9908,
"step": 75050
},
{
"epoch": 20.209948320413435,
"grad_norm": 0.4046146273612976,
"learning_rate": 0.00029705533862932543,
"loss": 2.9804,
"step": 75100
},
{
"epoch": 20.223406546080966,
"grad_norm": 0.4415130615234375,
"learning_rate": 0.00029685337282886763,
"loss": 2.9857,
"step": 75150
},
{
"epoch": 20.236864771748493,
"grad_norm": 0.39041709899902344,
"learning_rate": 0.0002966514070284098,
"loss": 2.975,
"step": 75200
},
{
"epoch": 20.25032299741602,
"grad_norm": 0.4639005959033966,
"learning_rate": 0.000296449441227952,
"loss": 2.9851,
"step": 75250
},
{
"epoch": 20.263781223083548,
"grad_norm": 0.39461809396743774,
"learning_rate": 0.0002962474754274942,
"loss": 2.9904,
"step": 75300
},
{
"epoch": 20.277239448751075,
"grad_norm": 0.43576109409332275,
"learning_rate": 0.00029604550962703647,
"loss": 2.9891,
"step": 75350
},
{
"epoch": 20.290697674418606,
"grad_norm": 0.4303674101829529,
"learning_rate": 0.00029584354382657867,
"loss": 2.9851,
"step": 75400
},
{
"epoch": 20.304155900086133,
"grad_norm": 0.49273186922073364,
"learning_rate": 0.0002956415780261209,
"loss": 2.9925,
"step": 75450
},
{
"epoch": 20.31761412575366,
"grad_norm": 0.420663982629776,
"learning_rate": 0.0002954396122256631,
"loss": 2.9861,
"step": 75500
},
{
"epoch": 20.331072351421188,
"grad_norm": 0.436514675617218,
"learning_rate": 0.0002952376464252053,
"loss": 2.9873,
"step": 75550
},
{
"epoch": 20.344530577088715,
"grad_norm": 0.43746113777160645,
"learning_rate": 0.0002950356806247475,
"loss": 3.0009,
"step": 75600
},
{
"epoch": 20.357988802756246,
"grad_norm": 0.4135793447494507,
"learning_rate": 0.0002948337148242897,
"loss": 2.9983,
"step": 75650
},
{
"epoch": 20.371447028423773,
"grad_norm": 0.4526557922363281,
"learning_rate": 0.00029463174902383195,
"loss": 2.9872,
"step": 75700
},
{
"epoch": 20.3849052540913,
"grad_norm": 0.4632878303527832,
"learning_rate": 0.00029442978322337415,
"loss": 2.9979,
"step": 75750
},
{
"epoch": 20.398363479758828,
"grad_norm": 0.40115952491760254,
"learning_rate": 0.00029422781742291634,
"loss": 2.9991,
"step": 75800
},
{
"epoch": 20.411821705426355,
"grad_norm": 0.4160281717777252,
"learning_rate": 0.0002940258516224586,
"loss": 2.9976,
"step": 75850
},
{
"epoch": 20.425279931093886,
"grad_norm": 0.40379562973976135,
"learning_rate": 0.0002938238858220008,
"loss": 3.006,
"step": 75900
},
{
"epoch": 20.438738156761413,
"grad_norm": 0.45283401012420654,
"learning_rate": 0.000293621920021543,
"loss": 3.0042,
"step": 75950
},
{
"epoch": 20.45219638242894,
"grad_norm": 0.421700119972229,
"learning_rate": 0.0002934199542210852,
"loss": 2.9962,
"step": 76000
},
{
"epoch": 20.45219638242894,
"eval_accuracy": 0.3956393715344369,
"eval_loss": 3.287869691848755,
"eval_runtime": 55.497,
"eval_samples_per_second": 324.558,
"eval_steps_per_second": 20.289,
"step": 76000
},
{
"epoch": 20.465654608096468,
"grad_norm": 0.44577735662460327,
"learning_rate": 0.00029321798842062743,
"loss": 3.008,
"step": 76050
},
{
"epoch": 20.479112833763995,
"grad_norm": 0.4258463680744171,
"learning_rate": 0.00029301602262016963,
"loss": 3.0077,
"step": 76100
},
{
"epoch": 20.492571059431526,
"grad_norm": 0.4366303086280823,
"learning_rate": 0.0002928140568197118,
"loss": 3.0012,
"step": 76150
},
{
"epoch": 20.506029285099054,
"grad_norm": 0.4077489674091339,
"learning_rate": 0.000292612091019254,
"loss": 3.001,
"step": 76200
},
{
"epoch": 20.51948751076658,
"grad_norm": 0.4265643358230591,
"learning_rate": 0.0002924101252187962,
"loss": 3.0123,
"step": 76250
},
{
"epoch": 20.532945736434108,
"grad_norm": 0.42429250478744507,
"learning_rate": 0.00029220815941833847,
"loss": 3.0062,
"step": 76300
},
{
"epoch": 20.546403962101635,
"grad_norm": 0.41154927015304565,
"learning_rate": 0.0002920061936178807,
"loss": 3.0039,
"step": 76350
},
{
"epoch": 20.559862187769163,
"grad_norm": 0.41237393021583557,
"learning_rate": 0.0002918042278174229,
"loss": 3.0127,
"step": 76400
},
{
"epoch": 20.573320413436694,
"grad_norm": 0.42659783363342285,
"learning_rate": 0.0002916022620169651,
"loss": 3.0032,
"step": 76450
},
{
"epoch": 20.58677863910422,
"grad_norm": 0.4047008454799652,
"learning_rate": 0.0002914002962165073,
"loss": 3.0177,
"step": 76500
},
{
"epoch": 20.600236864771748,
"grad_norm": 0.39722734689712524,
"learning_rate": 0.0002911983304160495,
"loss": 3.005,
"step": 76550
},
{
"epoch": 20.613695090439276,
"grad_norm": 0.4279921054840088,
"learning_rate": 0.00029099636461559175,
"loss": 2.9999,
"step": 76600
},
{
"epoch": 20.627153316106803,
"grad_norm": 0.42196568846702576,
"learning_rate": 0.00029079439881513395,
"loss": 3.0085,
"step": 76650
},
{
"epoch": 20.640611541774334,
"grad_norm": 0.42284271121025085,
"learning_rate": 0.00029059243301467614,
"loss": 3.0077,
"step": 76700
},
{
"epoch": 20.65406976744186,
"grad_norm": 0.4308563768863678,
"learning_rate": 0.0002903904672142184,
"loss": 3.0001,
"step": 76750
},
{
"epoch": 20.66752799310939,
"grad_norm": 0.399814248085022,
"learning_rate": 0.0002901885014137606,
"loss": 3.0182,
"step": 76800
},
{
"epoch": 20.680986218776916,
"grad_norm": 0.4300035834312439,
"learning_rate": 0.0002899865356133028,
"loss": 3.0036,
"step": 76850
},
{
"epoch": 20.694444444444443,
"grad_norm": 0.4693319499492645,
"learning_rate": 0.000289784569812845,
"loss": 3.0158,
"step": 76900
},
{
"epoch": 20.707902670111974,
"grad_norm": 0.43014174699783325,
"learning_rate": 0.00028958260401238723,
"loss": 3.0057,
"step": 76950
},
{
"epoch": 20.7213608957795,
"grad_norm": 0.43209654092788696,
"learning_rate": 0.00028938063821192943,
"loss": 3.028,
"step": 77000
},
{
"epoch": 20.7213608957795,
"eval_accuracy": 0.3957726812262145,
"eval_loss": 3.284038543701172,
"eval_runtime": 55.0721,
"eval_samples_per_second": 327.062,
"eval_steps_per_second": 20.446,
"step": 77000
},
{
"epoch": 20.73481912144703,
"grad_norm": 0.4174690842628479,
"learning_rate": 0.0002891786724114716,
"loss": 3.0144,
"step": 77050
},
{
"epoch": 20.748277347114556,
"grad_norm": 0.42950111627578735,
"learning_rate": 0.0002889767066110138,
"loss": 3.01,
"step": 77100
},
{
"epoch": 20.761735572782083,
"grad_norm": 0.4183488190174103,
"learning_rate": 0.000288774740810556,
"loss": 3.0134,
"step": 77150
},
{
"epoch": 20.775193798449614,
"grad_norm": 0.43711057305336,
"learning_rate": 0.00028857277501009827,
"loss": 3.0213,
"step": 77200
},
{
"epoch": 20.78865202411714,
"grad_norm": 0.46745753288269043,
"learning_rate": 0.00028837080920964046,
"loss": 3.0255,
"step": 77250
},
{
"epoch": 20.80211024978467,
"grad_norm": 0.4243837594985962,
"learning_rate": 0.0002881688434091827,
"loss": 3.0196,
"step": 77300
},
{
"epoch": 20.815568475452196,
"grad_norm": 0.4431048631668091,
"learning_rate": 0.0002879668776087249,
"loss": 3.0138,
"step": 77350
},
{
"epoch": 20.829026701119723,
"grad_norm": 0.42100629210472107,
"learning_rate": 0.0002877649118082671,
"loss": 3.0169,
"step": 77400
},
{
"epoch": 20.842484926787254,
"grad_norm": 0.40253376960754395,
"learning_rate": 0.0002875629460078093,
"loss": 3.0119,
"step": 77450
},
{
"epoch": 20.85594315245478,
"grad_norm": 0.4260178804397583,
"learning_rate": 0.0002873609802073515,
"loss": 3.0266,
"step": 77500
},
{
"epoch": 20.86940137812231,
"grad_norm": 0.4209248423576355,
"learning_rate": 0.00028715901440689375,
"loss": 3.0247,
"step": 77550
},
{
"epoch": 20.882859603789836,
"grad_norm": 0.4190898537635803,
"learning_rate": 0.00028695704860643594,
"loss": 3.0384,
"step": 77600
},
{
"epoch": 20.896317829457363,
"grad_norm": 0.42939960956573486,
"learning_rate": 0.0002867550828059782,
"loss": 3.0209,
"step": 77650
},
{
"epoch": 20.909776055124894,
"grad_norm": 0.4232606589794159,
"learning_rate": 0.0002865531170055204,
"loss": 3.0296,
"step": 77700
},
{
"epoch": 20.92323428079242,
"grad_norm": 0.4421567916870117,
"learning_rate": 0.0002863511512050626,
"loss": 3.0192,
"step": 77750
},
{
"epoch": 20.93669250645995,
"grad_norm": 0.4117579162120819,
"learning_rate": 0.0002861491854046048,
"loss": 3.0199,
"step": 77800
},
{
"epoch": 20.950150732127476,
"grad_norm": 0.43439605832099915,
"learning_rate": 0.000285947219604147,
"loss": 3.0265,
"step": 77850
},
{
"epoch": 20.963608957795003,
"grad_norm": 0.44588276743888855,
"learning_rate": 0.00028574525380368923,
"loss": 3.0231,
"step": 77900
},
{
"epoch": 20.977067183462534,
"grad_norm": 0.43251749873161316,
"learning_rate": 0.0002855432880032314,
"loss": 3.0243,
"step": 77950
},
{
"epoch": 20.99052540913006,
"grad_norm": 0.4446578323841095,
"learning_rate": 0.0002853413222027736,
"loss": 3.015,
"step": 78000
},
{
"epoch": 20.99052540913006,
"eval_accuracy": 0.39639381529947637,
"eval_loss": 3.2760121822357178,
"eval_runtime": 55.1938,
"eval_samples_per_second": 326.341,
"eval_steps_per_second": 20.401,
"step": 78000
},
{
"epoch": 21.00376830318691,
"grad_norm": 0.4257495403289795,
"learning_rate": 0.00028513935640231587,
"loss": 3.0018,
"step": 78050
},
{
"epoch": 21.017226528854437,
"grad_norm": 0.4258408844470978,
"learning_rate": 0.00028493739060185807,
"loss": 2.935,
"step": 78100
},
{
"epoch": 21.030684754521964,
"grad_norm": 0.4279120862483978,
"learning_rate": 0.00028473542480140026,
"loss": 2.9377,
"step": 78150
},
{
"epoch": 21.04414298018949,
"grad_norm": 0.40601927042007446,
"learning_rate": 0.0002845334590009425,
"loss": 2.9336,
"step": 78200
},
{
"epoch": 21.05760120585702,
"grad_norm": 0.46881160140037537,
"learning_rate": 0.0002843314932004847,
"loss": 2.948,
"step": 78250
},
{
"epoch": 21.07105943152455,
"grad_norm": 0.4635114371776581,
"learning_rate": 0.0002841295274000269,
"loss": 2.9619,
"step": 78300
},
{
"epoch": 21.084517657192077,
"grad_norm": 0.42312192916870117,
"learning_rate": 0.0002839275615995691,
"loss": 2.9504,
"step": 78350
},
{
"epoch": 21.097975882859604,
"grad_norm": 0.43240320682525635,
"learning_rate": 0.0002837255957991113,
"loss": 2.9561,
"step": 78400
},
{
"epoch": 21.11143410852713,
"grad_norm": 0.43588295578956604,
"learning_rate": 0.00028352362999865355,
"loss": 2.9655,
"step": 78450
},
{
"epoch": 21.12489233419466,
"grad_norm": 0.4394192397594452,
"learning_rate": 0.00028332166419819574,
"loss": 2.961,
"step": 78500
},
{
"epoch": 21.13835055986219,
"grad_norm": 0.4436136484146118,
"learning_rate": 0.000283119698397738,
"loss": 2.9606,
"step": 78550
},
{
"epoch": 21.151808785529717,
"grad_norm": 0.4162307679653168,
"learning_rate": 0.0002829177325972802,
"loss": 2.9571,
"step": 78600
},
{
"epoch": 21.165267011197244,
"grad_norm": 0.44429853558540344,
"learning_rate": 0.0002827157667968224,
"loss": 2.9653,
"step": 78650
},
{
"epoch": 21.17872523686477,
"grad_norm": 0.4423364996910095,
"learning_rate": 0.0002825138009963646,
"loss": 2.9711,
"step": 78700
},
{
"epoch": 21.1921834625323,
"grad_norm": 0.45903778076171875,
"learning_rate": 0.0002823118351959068,
"loss": 2.9725,
"step": 78750
},
{
"epoch": 21.205641688199826,
"grad_norm": 0.4161292016506195,
"learning_rate": 0.00028210986939544903,
"loss": 2.9722,
"step": 78800
},
{
"epoch": 21.219099913867357,
"grad_norm": 0.41161108016967773,
"learning_rate": 0.0002819079035949912,
"loss": 2.9769,
"step": 78850
},
{
"epoch": 21.232558139534884,
"grad_norm": 0.4620738625526428,
"learning_rate": 0.0002817059377945334,
"loss": 2.9643,
"step": 78900
},
{
"epoch": 21.24601636520241,
"grad_norm": 0.42669835686683655,
"learning_rate": 0.00028150397199407567,
"loss": 2.9732,
"step": 78950
},
{
"epoch": 21.25947459086994,
"grad_norm": 0.4427119195461273,
"learning_rate": 0.00028130200619361787,
"loss": 2.9675,
"step": 79000
},
{
"epoch": 21.25947459086994,
"eval_accuracy": 0.39546597115295606,
"eval_loss": 3.294128894805908,
"eval_runtime": 54.6202,
"eval_samples_per_second": 329.768,
"eval_steps_per_second": 20.615,
"step": 79000
},
{
"epoch": 21.272932816537466,
"grad_norm": 0.4521730840206146,
"learning_rate": 0.00028110004039316006,
"loss": 2.9685,
"step": 79050
},
{
"epoch": 21.286391042204997,
"grad_norm": 0.4480832517147064,
"learning_rate": 0.00028089807459270226,
"loss": 2.9697,
"step": 79100
},
{
"epoch": 21.299849267872524,
"grad_norm": 0.42608147859573364,
"learning_rate": 0.0002806961087922445,
"loss": 2.9664,
"step": 79150
},
{
"epoch": 21.31330749354005,
"grad_norm": 0.42942923307418823,
"learning_rate": 0.0002804941429917867,
"loss": 2.9785,
"step": 79200
},
{
"epoch": 21.32676571920758,
"grad_norm": 0.42538803815841675,
"learning_rate": 0.0002802921771913289,
"loss": 2.9762,
"step": 79250
},
{
"epoch": 21.340223944875106,
"grad_norm": 0.42696043848991394,
"learning_rate": 0.0002800902113908711,
"loss": 2.9846,
"step": 79300
},
{
"epoch": 21.353682170542637,
"grad_norm": 0.4404904544353485,
"learning_rate": 0.0002798882455904133,
"loss": 2.9768,
"step": 79350
},
{
"epoch": 21.367140396210164,
"grad_norm": 0.46387866139411926,
"learning_rate": 0.00027968627978995555,
"loss": 2.9766,
"step": 79400
},
{
"epoch": 21.38059862187769,
"grad_norm": 0.427202433347702,
"learning_rate": 0.00027948431398949774,
"loss": 2.9877,
"step": 79450
},
{
"epoch": 21.39405684754522,
"grad_norm": 0.41624459624290466,
"learning_rate": 0.00027928234818904,
"loss": 2.9882,
"step": 79500
},
{
"epoch": 21.407515073212746,
"grad_norm": 0.45006537437438965,
"learning_rate": 0.0002790803823885822,
"loss": 2.988,
"step": 79550
},
{
"epoch": 21.420973298880277,
"grad_norm": 0.42271533608436584,
"learning_rate": 0.0002788784165881244,
"loss": 2.9872,
"step": 79600
},
{
"epoch": 21.434431524547804,
"grad_norm": 0.4559018015861511,
"learning_rate": 0.0002786764507876666,
"loss": 2.9797,
"step": 79650
},
{
"epoch": 21.44788975021533,
"grad_norm": 0.42914602160453796,
"learning_rate": 0.0002784744849872088,
"loss": 2.9898,
"step": 79700
},
{
"epoch": 21.46134797588286,
"grad_norm": 0.40934568643569946,
"learning_rate": 0.000278272519186751,
"loss": 2.9896,
"step": 79750
},
{
"epoch": 21.474806201550386,
"grad_norm": 0.4386095106601715,
"learning_rate": 0.0002780705533862932,
"loss": 2.9904,
"step": 79800
},
{
"epoch": 21.488264427217917,
"grad_norm": 0.42801961302757263,
"learning_rate": 0.0002778685875858355,
"loss": 2.981,
"step": 79850
},
{
"epoch": 21.501722652885444,
"grad_norm": 0.42655736207962036,
"learning_rate": 0.00027766662178537767,
"loss": 2.9964,
"step": 79900
},
{
"epoch": 21.51518087855297,
"grad_norm": 0.43257346749305725,
"learning_rate": 0.00027746465598491987,
"loss": 2.9868,
"step": 79950
},
{
"epoch": 21.5286391042205,
"grad_norm": 0.4436582922935486,
"learning_rate": 0.00027726269018446206,
"loss": 2.9881,
"step": 80000
},
{
"epoch": 21.5286391042205,
"eval_accuracy": 0.3959727000873086,
"eval_loss": 3.2854392528533936,
"eval_runtime": 55.4249,
"eval_samples_per_second": 324.98,
"eval_steps_per_second": 20.316,
"step": 80000
},
{
"epoch": 21.542097329888026,
"grad_norm": 0.4455031752586365,
"learning_rate": 0.00027706072438400426,
"loss": 2.988,
"step": 80050
},
{
"epoch": 21.555555555555557,
"grad_norm": 0.43415355682373047,
"learning_rate": 0.0002768587585835465,
"loss": 3.0084,
"step": 80100
},
{
"epoch": 21.569013781223084,
"grad_norm": 0.47144296765327454,
"learning_rate": 0.0002766567927830887,
"loss": 2.9981,
"step": 80150
},
{
"epoch": 21.58247200689061,
"grad_norm": 0.46530064940452576,
"learning_rate": 0.0002764548269826309,
"loss": 3.0019,
"step": 80200
},
{
"epoch": 21.59593023255814,
"grad_norm": 0.4477296471595764,
"learning_rate": 0.0002762528611821731,
"loss": 2.9965,
"step": 80250
},
{
"epoch": 21.609388458225666,
"grad_norm": 0.4592190384864807,
"learning_rate": 0.00027605089538171535,
"loss": 3.009,
"step": 80300
},
{
"epoch": 21.622846683893197,
"grad_norm": 0.4123205542564392,
"learning_rate": 0.00027584892958125754,
"loss": 3.0118,
"step": 80350
},
{
"epoch": 21.636304909560724,
"grad_norm": 0.41009268164634705,
"learning_rate": 0.0002756469637807998,
"loss": 2.9985,
"step": 80400
},
{
"epoch": 21.649763135228252,
"grad_norm": 0.41830089688301086,
"learning_rate": 0.000275444997980342,
"loss": 2.9939,
"step": 80450
},
{
"epoch": 21.66322136089578,
"grad_norm": 0.4384111166000366,
"learning_rate": 0.0002752430321798842,
"loss": 3.0026,
"step": 80500
},
{
"epoch": 21.676679586563306,
"grad_norm": 0.41752365231513977,
"learning_rate": 0.0002750410663794264,
"loss": 3.0026,
"step": 80550
},
{
"epoch": 21.690137812230837,
"grad_norm": 0.4098077714443207,
"learning_rate": 0.0002748391005789686,
"loss": 3.0001,
"step": 80600
},
{
"epoch": 21.703596037898365,
"grad_norm": 0.4417395293712616,
"learning_rate": 0.00027463713477851083,
"loss": 3.0034,
"step": 80650
},
{
"epoch": 21.717054263565892,
"grad_norm": 0.42584118247032166,
"learning_rate": 0.000274435168978053,
"loss": 2.9965,
"step": 80700
},
{
"epoch": 21.73051248923342,
"grad_norm": 0.42295321822166443,
"learning_rate": 0.0002742332031775953,
"loss": 3.0049,
"step": 80750
},
{
"epoch": 21.743970714900946,
"grad_norm": 0.41715312004089355,
"learning_rate": 0.00027403123737713747,
"loss": 3.0042,
"step": 80800
},
{
"epoch": 21.757428940568474,
"grad_norm": 0.41474443674087524,
"learning_rate": 0.00027382927157667967,
"loss": 3.009,
"step": 80850
},
{
"epoch": 21.770887166236005,
"grad_norm": 0.45613738894462585,
"learning_rate": 0.00027362730577622186,
"loss": 3.0105,
"step": 80900
},
{
"epoch": 21.784345391903532,
"grad_norm": 0.4140360951423645,
"learning_rate": 0.00027342533997576406,
"loss": 3.012,
"step": 80950
},
{
"epoch": 21.79780361757106,
"grad_norm": 0.47605544328689575,
"learning_rate": 0.0002732233741753063,
"loss": 3.0037,
"step": 81000
},
{
"epoch": 21.79780361757106,
"eval_accuracy": 0.3962476852787422,
"eval_loss": 3.280226707458496,
"eval_runtime": 55.7877,
"eval_samples_per_second": 322.867,
"eval_steps_per_second": 20.184,
"step": 81000
},
{
"epoch": 21.811261843238587,
"grad_norm": 0.4371415674686432,
"learning_rate": 0.0002730214083748485,
"loss": 3.0165,
"step": 81050
},
{
"epoch": 21.824720068906114,
"grad_norm": 0.42411020398139954,
"learning_rate": 0.0002728194425743907,
"loss": 3.0095,
"step": 81100
},
{
"epoch": 21.838178294573645,
"grad_norm": 0.4219321310520172,
"learning_rate": 0.0002726174767739329,
"loss": 3.0101,
"step": 81150
},
{
"epoch": 21.851636520241172,
"grad_norm": 0.41034814715385437,
"learning_rate": 0.00027241551097347515,
"loss": 3.0089,
"step": 81200
},
{
"epoch": 21.8650947459087,
"grad_norm": 0.4187079668045044,
"learning_rate": 0.00027221354517301734,
"loss": 3.0148,
"step": 81250
},
{
"epoch": 21.878552971576227,
"grad_norm": 0.41652804613113403,
"learning_rate": 0.00027201157937255954,
"loss": 3.0148,
"step": 81300
},
{
"epoch": 21.892011197243754,
"grad_norm": 0.4195777475833893,
"learning_rate": 0.0002718096135721018,
"loss": 3.0215,
"step": 81350
},
{
"epoch": 21.905469422911285,
"grad_norm": 0.44653409719467163,
"learning_rate": 0.000271607647771644,
"loss": 3.0189,
"step": 81400
},
{
"epoch": 21.918927648578812,
"grad_norm": 0.422077476978302,
"learning_rate": 0.0002714056819711862,
"loss": 3.0073,
"step": 81450
},
{
"epoch": 21.93238587424634,
"grad_norm": 0.40733182430267334,
"learning_rate": 0.0002712037161707284,
"loss": 3.0223,
"step": 81500
},
{
"epoch": 21.945844099913867,
"grad_norm": 0.42711979150772095,
"learning_rate": 0.0002710017503702706,
"loss": 3.0135,
"step": 81550
},
{
"epoch": 21.959302325581394,
"grad_norm": 0.42944803833961487,
"learning_rate": 0.0002707997845698128,
"loss": 3.0209,
"step": 81600
},
{
"epoch": 21.972760551248925,
"grad_norm": 0.39599812030792236,
"learning_rate": 0.000270597818769355,
"loss": 3.0113,
"step": 81650
},
{
"epoch": 21.986218776916452,
"grad_norm": 0.4108112156391144,
"learning_rate": 0.00027039585296889727,
"loss": 3.0103,
"step": 81700
},
{
"epoch": 21.99967700258398,
"grad_norm": 0.4447796642780304,
"learning_rate": 0.00027019388716843947,
"loss": 3.0168,
"step": 81750
},
{
"epoch": 22.012919896640827,
"grad_norm": 0.43677303194999695,
"learning_rate": 0.00026999192136798166,
"loss": 2.9338,
"step": 81800
},
{
"epoch": 22.026378122308355,
"grad_norm": 0.4421975314617157,
"learning_rate": 0.00026978995556752386,
"loss": 2.932,
"step": 81850
},
{
"epoch": 22.039836347975882,
"grad_norm": 0.4097566604614258,
"learning_rate": 0.00026958798976706606,
"loss": 2.9407,
"step": 81900
},
{
"epoch": 22.05329457364341,
"grad_norm": 0.4221478998661041,
"learning_rate": 0.0002693860239666083,
"loss": 2.9391,
"step": 81950
},
{
"epoch": 22.06675279931094,
"grad_norm": 0.4292081296443939,
"learning_rate": 0.0002691840581661505,
"loss": 2.9562,
"step": 82000
},
{
"epoch": 22.06675279931094,
"eval_accuracy": 0.3957283533091442,
"eval_loss": 3.292602300643921,
"eval_runtime": 55.3547,
"eval_samples_per_second": 325.392,
"eval_steps_per_second": 20.342,
"step": 82000
},
{
"epoch": 22.080211024978468,
"grad_norm": 0.4228661060333252,
"learning_rate": 0.0002689820923656927,
"loss": 2.9327,
"step": 82050
},
{
"epoch": 22.093669250645995,
"grad_norm": 0.4465382993221283,
"learning_rate": 0.00026878012656523495,
"loss": 2.9365,
"step": 82100
},
{
"epoch": 22.107127476313522,
"grad_norm": 0.43030622601509094,
"learning_rate": 0.00026857816076477714,
"loss": 2.9365,
"step": 82150
},
{
"epoch": 22.12058570198105,
"grad_norm": 0.4425069987773895,
"learning_rate": 0.00026837619496431934,
"loss": 2.9483,
"step": 82200
},
{
"epoch": 22.13404392764858,
"grad_norm": 0.4402919113636017,
"learning_rate": 0.0002681742291638616,
"loss": 2.948,
"step": 82250
},
{
"epoch": 22.147502153316108,
"grad_norm": 0.44754478335380554,
"learning_rate": 0.0002679722633634038,
"loss": 2.9469,
"step": 82300
},
{
"epoch": 22.160960378983635,
"grad_norm": 0.4323514401912689,
"learning_rate": 0.000267770297562946,
"loss": 2.9433,
"step": 82350
},
{
"epoch": 22.174418604651162,
"grad_norm": 0.43964049220085144,
"learning_rate": 0.0002675683317624882,
"loss": 2.9485,
"step": 82400
},
{
"epoch": 22.18787683031869,
"grad_norm": 0.4523833990097046,
"learning_rate": 0.0002673663659620304,
"loss": 2.9487,
"step": 82450
},
{
"epoch": 22.20133505598622,
"grad_norm": 0.4370718002319336,
"learning_rate": 0.0002671644001615726,
"loss": 2.961,
"step": 82500
},
{
"epoch": 22.214793281653748,
"grad_norm": 0.44688382744789124,
"learning_rate": 0.0002669624343611148,
"loss": 2.9504,
"step": 82550
},
{
"epoch": 22.228251507321275,
"grad_norm": 0.44331830739974976,
"learning_rate": 0.00026676046856065707,
"loss": 2.9555,
"step": 82600
},
{
"epoch": 22.241709732988802,
"grad_norm": 0.42020565271377563,
"learning_rate": 0.00026655850276019927,
"loss": 2.9615,
"step": 82650
},
{
"epoch": 22.25516795865633,
"grad_norm": 0.4653908908367157,
"learning_rate": 0.00026635653695974146,
"loss": 2.9603,
"step": 82700
},
{
"epoch": 22.26862618432386,
"grad_norm": 0.41683560609817505,
"learning_rate": 0.00026615457115928366,
"loss": 2.966,
"step": 82750
},
{
"epoch": 22.282084409991388,
"grad_norm": 0.46766164898872375,
"learning_rate": 0.00026595260535882586,
"loss": 2.9694,
"step": 82800
},
{
"epoch": 22.295542635658915,
"grad_norm": 0.46885979175567627,
"learning_rate": 0.0002657506395583681,
"loss": 2.9826,
"step": 82850
},
{
"epoch": 22.309000861326442,
"grad_norm": 0.4445846676826477,
"learning_rate": 0.0002655486737579103,
"loss": 2.9655,
"step": 82900
},
{
"epoch": 22.32245908699397,
"grad_norm": 0.4270409047603607,
"learning_rate": 0.00026534670795745255,
"loss": 2.9681,
"step": 82950
},
{
"epoch": 22.3359173126615,
"grad_norm": 0.4194854497909546,
"learning_rate": 0.00026514474215699475,
"loss": 2.9683,
"step": 83000
},
{
"epoch": 22.3359173126615,
"eval_accuracy": 0.3955815714072766,
"eval_loss": 3.292879819869995,
"eval_runtime": 55.3417,
"eval_samples_per_second": 325.469,
"eval_steps_per_second": 20.346,
"step": 83000
},
{
"epoch": 22.349375538329028,
"grad_norm": 0.429385781288147,
"learning_rate": 0.00026494277635653694,
"loss": 2.9716,
"step": 83050
},
{
"epoch": 22.362833763996555,
"grad_norm": 0.4517979621887207,
"learning_rate": 0.00026474081055607914,
"loss": 2.9669,
"step": 83100
},
{
"epoch": 22.376291989664082,
"grad_norm": 0.44728177785873413,
"learning_rate": 0.00026453884475562134,
"loss": 2.9587,
"step": 83150
},
{
"epoch": 22.38975021533161,
"grad_norm": 0.4721812605857849,
"learning_rate": 0.0002643368789551636,
"loss": 2.986,
"step": 83200
},
{
"epoch": 22.403208440999137,
"grad_norm": 0.44930049777030945,
"learning_rate": 0.0002641349131547058,
"loss": 2.9718,
"step": 83250
},
{
"epoch": 22.416666666666668,
"grad_norm": 0.47097310423851013,
"learning_rate": 0.000263932947354248,
"loss": 2.9863,
"step": 83300
},
{
"epoch": 22.430124892334195,
"grad_norm": 0.46559232473373413,
"learning_rate": 0.0002637309815537902,
"loss": 2.971,
"step": 83350
},
{
"epoch": 22.443583118001722,
"grad_norm": 0.42774224281311035,
"learning_rate": 0.0002635290157533324,
"loss": 2.984,
"step": 83400
},
{
"epoch": 22.45704134366925,
"grad_norm": 0.4455351233482361,
"learning_rate": 0.0002633270499528746,
"loss": 2.9784,
"step": 83450
},
{
"epoch": 22.470499569336777,
"grad_norm": 0.42454469203948975,
"learning_rate": 0.0002631250841524168,
"loss": 2.9708,
"step": 83500
},
{
"epoch": 22.483957795004308,
"grad_norm": 0.4294556975364685,
"learning_rate": 0.00026292311835195907,
"loss": 2.9963,
"step": 83550
},
{
"epoch": 22.497416020671835,
"grad_norm": 0.4321839213371277,
"learning_rate": 0.00026272115255150126,
"loss": 2.995,
"step": 83600
},
{
"epoch": 22.510874246339363,
"grad_norm": 0.4495703876018524,
"learning_rate": 0.00026251918675104346,
"loss": 2.988,
"step": 83650
},
{
"epoch": 22.52433247200689,
"grad_norm": 0.4380459189414978,
"learning_rate": 0.00026231722095058566,
"loss": 2.9866,
"step": 83700
},
{
"epoch": 22.537790697674417,
"grad_norm": 0.4377855062484741,
"learning_rate": 0.00026211525515012785,
"loss": 2.9762,
"step": 83750
},
{
"epoch": 22.551248923341948,
"grad_norm": 0.4443385899066925,
"learning_rate": 0.0002619132893496701,
"loss": 2.9879,
"step": 83800
},
{
"epoch": 22.564707149009475,
"grad_norm": 0.4317345917224884,
"learning_rate": 0.0002617113235492123,
"loss": 2.9873,
"step": 83850
},
{
"epoch": 22.578165374677003,
"grad_norm": 0.41431862115859985,
"learning_rate": 0.00026150935774875455,
"loss": 2.9937,
"step": 83900
},
{
"epoch": 22.59162360034453,
"grad_norm": 0.4557620882987976,
"learning_rate": 0.00026130739194829675,
"loss": 2.9766,
"step": 83950
},
{
"epoch": 22.605081826012057,
"grad_norm": 0.45642057061195374,
"learning_rate": 0.00026110542614783894,
"loss": 2.9852,
"step": 84000
},
{
"epoch": 22.605081826012057,
"eval_accuracy": 0.3964245623596011,
"eval_loss": 3.2839629650115967,
"eval_runtime": 55.1163,
"eval_samples_per_second": 326.8,
"eval_steps_per_second": 20.43,
"step": 84000
},
{
"epoch": 22.618540051679588,
"grad_norm": 0.48557594418525696,
"learning_rate": 0.00026090346034738114,
"loss": 2.9942,
"step": 84050
},
{
"epoch": 22.631998277347115,
"grad_norm": 0.44271036982536316,
"learning_rate": 0.0002607014945469234,
"loss": 2.9952,
"step": 84100
},
{
"epoch": 22.645456503014643,
"grad_norm": 0.46364423632621765,
"learning_rate": 0.0002604995287464656,
"loss": 2.9961,
"step": 84150
},
{
"epoch": 22.65891472868217,
"grad_norm": 0.4304630160331726,
"learning_rate": 0.0002602975629460078,
"loss": 2.9918,
"step": 84200
},
{
"epoch": 22.672372954349697,
"grad_norm": 0.4598659873008728,
"learning_rate": 0.00026009559714555,
"loss": 2.9955,
"step": 84250
},
{
"epoch": 22.685831180017228,
"grad_norm": 0.43804359436035156,
"learning_rate": 0.0002598936313450922,
"loss": 3.002,
"step": 84300
},
{
"epoch": 22.699289405684755,
"grad_norm": 0.44611334800720215,
"learning_rate": 0.0002596916655446344,
"loss": 2.9896,
"step": 84350
},
{
"epoch": 22.712747631352283,
"grad_norm": 0.4401785433292389,
"learning_rate": 0.0002594896997441766,
"loss": 2.9971,
"step": 84400
},
{
"epoch": 22.72620585701981,
"grad_norm": 0.45873647928237915,
"learning_rate": 0.00025928773394371887,
"loss": 2.9911,
"step": 84450
},
{
"epoch": 22.739664082687337,
"grad_norm": 0.42473602294921875,
"learning_rate": 0.00025908576814326107,
"loss": 2.9927,
"step": 84500
},
{
"epoch": 22.753122308354868,
"grad_norm": 0.4496315121650696,
"learning_rate": 0.00025888380234280326,
"loss": 2.9939,
"step": 84550
},
{
"epoch": 22.766580534022395,
"grad_norm": 0.44188305735588074,
"learning_rate": 0.00025868183654234546,
"loss": 2.9925,
"step": 84600
},
{
"epoch": 22.780038759689923,
"grad_norm": 0.47845959663391113,
"learning_rate": 0.00025847987074188765,
"loss": 2.9961,
"step": 84650
},
{
"epoch": 22.79349698535745,
"grad_norm": 0.4484929144382477,
"learning_rate": 0.0002582779049414299,
"loss": 3.0014,
"step": 84700
},
{
"epoch": 22.806955211024977,
"grad_norm": 0.41167107224464417,
"learning_rate": 0.0002580759391409721,
"loss": 2.9951,
"step": 84750
},
{
"epoch": 22.820413436692505,
"grad_norm": 0.4409504532814026,
"learning_rate": 0.00025787397334051435,
"loss": 2.9982,
"step": 84800
},
{
"epoch": 22.833871662360036,
"grad_norm": 0.44134747982025146,
"learning_rate": 0.00025767200754005655,
"loss": 2.9994,
"step": 84850
},
{
"epoch": 22.847329888027563,
"grad_norm": 0.475885808467865,
"learning_rate": 0.00025747004173959874,
"loss": 3.0012,
"step": 84900
},
{
"epoch": 22.86078811369509,
"grad_norm": 0.4444062411785126,
"learning_rate": 0.00025726807593914094,
"loss": 2.9944,
"step": 84950
},
{
"epoch": 22.874246339362617,
"grad_norm": 0.434906929731369,
"learning_rate": 0.00025706611013868313,
"loss": 3.0085,
"step": 85000
},
{
"epoch": 22.874246339362617,
"eval_accuracy": 0.396510393375497,
"eval_loss": 3.278679132461548,
"eval_runtime": 55.0664,
"eval_samples_per_second": 327.096,
"eval_steps_per_second": 20.448,
"step": 85000
},
{
"epoch": 22.887704565030145,
"grad_norm": 0.4553506076335907,
"learning_rate": 0.0002568641443382254,
"loss": 3.0023,
"step": 85050
},
{
"epoch": 22.901162790697676,
"grad_norm": 0.4525589942932129,
"learning_rate": 0.0002566621785377676,
"loss": 3.0052,
"step": 85100
},
{
"epoch": 22.914621016365203,
"grad_norm": 0.44847944378852844,
"learning_rate": 0.0002564602127373098,
"loss": 3.0062,
"step": 85150
},
{
"epoch": 22.92807924203273,
"grad_norm": 0.45734959840774536,
"learning_rate": 0.00025625824693685203,
"loss": 3.0016,
"step": 85200
},
{
"epoch": 22.941537467700257,
"grad_norm": 0.43638479709625244,
"learning_rate": 0.0002560562811363942,
"loss": 3.0056,
"step": 85250
},
{
"epoch": 22.954995693367785,
"grad_norm": 0.4501156210899353,
"learning_rate": 0.0002558543153359364,
"loss": 3.0053,
"step": 85300
},
{
"epoch": 22.968453919035316,
"grad_norm": 0.45217031240463257,
"learning_rate": 0.0002556523495354786,
"loss": 3.003,
"step": 85350
},
{
"epoch": 22.981912144702843,
"grad_norm": 0.44591525197029114,
"learning_rate": 0.00025545038373502087,
"loss": 2.9979,
"step": 85400
},
{
"epoch": 22.99537037037037,
"grad_norm": 0.4603706896305084,
"learning_rate": 0.00025524841793456306,
"loss": 3.0025,
"step": 85450
},
{
"epoch": 23.00861326442722,
"grad_norm": 0.43974849581718445,
"learning_rate": 0.00025504645213410526,
"loss": 2.9376,
"step": 85500
},
{
"epoch": 23.022071490094746,
"grad_norm": 0.4222824275493622,
"learning_rate": 0.00025484448633364745,
"loss": 2.9166,
"step": 85550
},
{
"epoch": 23.035529715762273,
"grad_norm": 0.44281378388404846,
"learning_rate": 0.00025464252053318965,
"loss": 2.9168,
"step": 85600
},
{
"epoch": 23.0489879414298,
"grad_norm": 0.478463351726532,
"learning_rate": 0.0002544405547327319,
"loss": 2.9187,
"step": 85650
},
{
"epoch": 23.06244616709733,
"grad_norm": 0.43511971831321716,
"learning_rate": 0.0002542385889322741,
"loss": 2.9263,
"step": 85700
},
{
"epoch": 23.07590439276486,
"grad_norm": 0.492725670337677,
"learning_rate": 0.00025403662313181635,
"loss": 2.9329,
"step": 85750
},
{
"epoch": 23.089362618432386,
"grad_norm": 0.4485880732536316,
"learning_rate": 0.00025383465733135854,
"loss": 2.9346,
"step": 85800
},
{
"epoch": 23.102820844099913,
"grad_norm": 0.4755620062351227,
"learning_rate": 0.00025363269153090074,
"loss": 2.9237,
"step": 85850
},
{
"epoch": 23.11627906976744,
"grad_norm": 0.4183093011379242,
"learning_rate": 0.00025343072573044294,
"loss": 2.9378,
"step": 85900
},
{
"epoch": 23.12973729543497,
"grad_norm": 0.4465462565422058,
"learning_rate": 0.00025322875992998513,
"loss": 2.9452,
"step": 85950
},
{
"epoch": 23.1431955211025,
"grad_norm": 0.42842474579811096,
"learning_rate": 0.0002530267941295274,
"loss": 2.9407,
"step": 86000
},
{
"epoch": 23.1431955211025,
"eval_accuracy": 0.3958654656408665,
"eval_loss": 3.2916109561920166,
"eval_runtime": 54.925,
"eval_samples_per_second": 327.938,
"eval_steps_per_second": 20.501,
"step": 86000
},
{
"epoch": 23.156653746770026,
"grad_norm": 0.4357658624649048,
"learning_rate": 0.0002528248283290696,
"loss": 2.9473,
"step": 86050
},
{
"epoch": 23.170111972437553,
"grad_norm": 0.4642813503742218,
"learning_rate": 0.00025262286252861183,
"loss": 2.9555,
"step": 86100
},
{
"epoch": 23.18357019810508,
"grad_norm": 0.45447543263435364,
"learning_rate": 0.000252420896728154,
"loss": 2.9458,
"step": 86150
},
{
"epoch": 23.19702842377261,
"grad_norm": 0.44857949018478394,
"learning_rate": 0.0002522189309276962,
"loss": 2.9494,
"step": 86200
},
{
"epoch": 23.21048664944014,
"grad_norm": 0.4471588730812073,
"learning_rate": 0.0002520169651272384,
"loss": 2.946,
"step": 86250
},
{
"epoch": 23.223944875107666,
"grad_norm": 0.4542173147201538,
"learning_rate": 0.00025181499932678067,
"loss": 2.9522,
"step": 86300
},
{
"epoch": 23.237403100775193,
"grad_norm": 0.44934558868408203,
"learning_rate": 0.00025161303352632286,
"loss": 2.9551,
"step": 86350
},
{
"epoch": 23.25086132644272,
"grad_norm": 0.45528241991996765,
"learning_rate": 0.00025141106772586506,
"loss": 2.9546,
"step": 86400
},
{
"epoch": 23.26431955211025,
"grad_norm": 0.4349673092365265,
"learning_rate": 0.00025120910192540726,
"loss": 2.9571,
"step": 86450
},
{
"epoch": 23.27777777777778,
"grad_norm": 0.44581711292266846,
"learning_rate": 0.00025100713612494945,
"loss": 2.9511,
"step": 86500
},
{
"epoch": 23.291236003445306,
"grad_norm": 0.4143180251121521,
"learning_rate": 0.0002508051703244917,
"loss": 2.9606,
"step": 86550
},
{
"epoch": 23.304694229112833,
"grad_norm": 0.44750291109085083,
"learning_rate": 0.0002506032045240339,
"loss": 2.9643,
"step": 86600
},
{
"epoch": 23.31815245478036,
"grad_norm": 0.4560827612876892,
"learning_rate": 0.00025040123872357615,
"loss": 2.961,
"step": 86650
},
{
"epoch": 23.33161068044789,
"grad_norm": 0.45675498247146606,
"learning_rate": 0.00025019927292311834,
"loss": 2.967,
"step": 86700
},
{
"epoch": 23.34506890611542,
"grad_norm": 0.4347269833087921,
"learning_rate": 0.00024999730712266054,
"loss": 2.965,
"step": 86750
},
{
"epoch": 23.358527131782946,
"grad_norm": 0.4316846430301666,
"learning_rate": 0.00024979534132220274,
"loss": 2.9634,
"step": 86800
},
{
"epoch": 23.371985357450473,
"grad_norm": 0.47420305013656616,
"learning_rate": 0.00024959337552174493,
"loss": 2.9675,
"step": 86850
},
{
"epoch": 23.385443583118,
"grad_norm": 0.44756773114204407,
"learning_rate": 0.0002493914097212872,
"loss": 2.9659,
"step": 86900
},
{
"epoch": 23.39890180878553,
"grad_norm": 0.5035362243652344,
"learning_rate": 0.0002491894439208294,
"loss": 2.9752,
"step": 86950
},
{
"epoch": 23.41236003445306,
"grad_norm": 0.46639010310173035,
"learning_rate": 0.00024898747812037163,
"loss": 2.9756,
"step": 87000
},
{
"epoch": 23.41236003445306,
"eval_accuracy": 0.3962778891045891,
"eval_loss": 3.2877182960510254,
"eval_runtime": 54.721,
"eval_samples_per_second": 329.161,
"eval_steps_per_second": 20.577,
"step": 87000
},
{
"epoch": 23.425818260120586,
"grad_norm": 0.44842982292175293,
"learning_rate": 0.0002487855123199138,
"loss": 2.9664,
"step": 87050
},
{
"epoch": 23.439276485788113,
"grad_norm": 0.45062634348869324,
"learning_rate": 0.000248583546519456,
"loss": 2.9603,
"step": 87100
},
{
"epoch": 23.45273471145564,
"grad_norm": 0.4483391046524048,
"learning_rate": 0.0002483815807189982,
"loss": 2.976,
"step": 87150
},
{
"epoch": 23.466192937123168,
"grad_norm": 0.45215165615081787,
"learning_rate": 0.0002481796149185404,
"loss": 2.9683,
"step": 87200
},
{
"epoch": 23.4796511627907,
"grad_norm": 0.48873844742774963,
"learning_rate": 0.00024797764911808266,
"loss": 2.9729,
"step": 87250
},
{
"epoch": 23.493109388458226,
"grad_norm": 0.4677174985408783,
"learning_rate": 0.00024777568331762486,
"loss": 2.9737,
"step": 87300
},
{
"epoch": 23.506567614125753,
"grad_norm": 0.43766000866889954,
"learning_rate": 0.00024757371751716706,
"loss": 2.9754,
"step": 87350
},
{
"epoch": 23.52002583979328,
"grad_norm": 0.42998966574668884,
"learning_rate": 0.0002473717517167093,
"loss": 2.9692,
"step": 87400
},
{
"epoch": 23.533484065460808,
"grad_norm": 0.45019766688346863,
"learning_rate": 0.0002471697859162515,
"loss": 2.9847,
"step": 87450
},
{
"epoch": 23.54694229112834,
"grad_norm": 0.43693816661834717,
"learning_rate": 0.0002469678201157937,
"loss": 2.9708,
"step": 87500
},
{
"epoch": 23.560400516795866,
"grad_norm": 0.4665409326553345,
"learning_rate": 0.0002467658543153359,
"loss": 2.9764,
"step": 87550
},
{
"epoch": 23.573858742463393,
"grad_norm": 0.46596479415893555,
"learning_rate": 0.00024656388851487815,
"loss": 2.9756,
"step": 87600
},
{
"epoch": 23.58731696813092,
"grad_norm": 0.4340677857398987,
"learning_rate": 0.00024636192271442034,
"loss": 2.9808,
"step": 87650
},
{
"epoch": 23.600775193798448,
"grad_norm": 0.47798460721969604,
"learning_rate": 0.00024615995691396254,
"loss": 2.9849,
"step": 87700
},
{
"epoch": 23.61423341946598,
"grad_norm": 0.45297908782958984,
"learning_rate": 0.00024595799111350473,
"loss": 2.9802,
"step": 87750
},
{
"epoch": 23.627691645133506,
"grad_norm": 0.473803848028183,
"learning_rate": 0.00024575602531304693,
"loss": 2.9863,
"step": 87800
},
{
"epoch": 23.641149870801033,
"grad_norm": 0.4625190794467926,
"learning_rate": 0.0002455540595125892,
"loss": 2.9927,
"step": 87850
},
{
"epoch": 23.65460809646856,
"grad_norm": 0.4412365257740021,
"learning_rate": 0.00024535209371213143,
"loss": 2.9739,
"step": 87900
},
{
"epoch": 23.668066322136088,
"grad_norm": 0.4445769488811493,
"learning_rate": 0.0002451501279116736,
"loss": 2.9827,
"step": 87950
},
{
"epoch": 23.68152454780362,
"grad_norm": 0.4315958619117737,
"learning_rate": 0.0002449481621112158,
"loss": 2.981,
"step": 88000
},
{
"epoch": 23.68152454780362,
"eval_accuracy": 0.3964807327839279,
"eval_loss": 3.2832388877868652,
"eval_runtime": 55.0121,
"eval_samples_per_second": 327.419,
"eval_steps_per_second": 20.468,
"step": 88000
},
{
"epoch": 23.694982773471146,
"grad_norm": 0.4323495328426361,
"learning_rate": 0.000244746196310758,
"loss": 2.9931,
"step": 88050
},
{
"epoch": 23.708440999138674,
"grad_norm": 0.46069687604904175,
"learning_rate": 0.0002445442305103002,
"loss": 2.9773,
"step": 88100
},
{
"epoch": 23.7218992248062,
"grad_norm": 0.4385862946510315,
"learning_rate": 0.00024434226470984246,
"loss": 2.9801,
"step": 88150
},
{
"epoch": 23.735357450473728,
"grad_norm": 0.45180192589759827,
"learning_rate": 0.00024414029890938463,
"loss": 2.988,
"step": 88200
},
{
"epoch": 23.74881567614126,
"grad_norm": 0.45826536417007446,
"learning_rate": 0.00024393833310892686,
"loss": 2.9816,
"step": 88250
},
{
"epoch": 23.762273901808786,
"grad_norm": 0.45120131969451904,
"learning_rate": 0.0002437363673084691,
"loss": 2.9752,
"step": 88300
},
{
"epoch": 23.775732127476314,
"grad_norm": 0.4657435417175293,
"learning_rate": 0.0002435344015080113,
"loss": 2.9837,
"step": 88350
},
{
"epoch": 23.78919035314384,
"grad_norm": 0.4517814815044403,
"learning_rate": 0.0002433324357075535,
"loss": 2.9858,
"step": 88400
},
{
"epoch": 23.802648578811368,
"grad_norm": 0.4310310184955597,
"learning_rate": 0.00024313046990709572,
"loss": 2.986,
"step": 88450
},
{
"epoch": 23.8161068044789,
"grad_norm": 0.4929543733596802,
"learning_rate": 0.00024292850410663792,
"loss": 2.9891,
"step": 88500
},
{
"epoch": 23.829565030146426,
"grad_norm": 0.471609890460968,
"learning_rate": 0.00024272653830618014,
"loss": 2.9759,
"step": 88550
},
{
"epoch": 23.843023255813954,
"grad_norm": 0.48696190118789673,
"learning_rate": 0.00024252457250572234,
"loss": 2.9843,
"step": 88600
},
{
"epoch": 23.85648148148148,
"grad_norm": 0.4469008147716522,
"learning_rate": 0.00024232260670526453,
"loss": 2.99,
"step": 88650
},
{
"epoch": 23.86993970714901,
"grad_norm": 0.45953747630119324,
"learning_rate": 0.00024212064090480676,
"loss": 2.9742,
"step": 88700
},
{
"epoch": 23.88339793281654,
"grad_norm": 0.48521921038627625,
"learning_rate": 0.00024191867510434898,
"loss": 2.9907,
"step": 88750
},
{
"epoch": 23.896856158484066,
"grad_norm": 0.4295513927936554,
"learning_rate": 0.0002417167093038912,
"loss": 2.9931,
"step": 88800
},
{
"epoch": 23.910314384151594,
"grad_norm": 0.4355540871620178,
"learning_rate": 0.0002415147435034334,
"loss": 2.9859,
"step": 88850
},
{
"epoch": 23.92377260981912,
"grad_norm": 0.43405336141586304,
"learning_rate": 0.00024131277770297562,
"loss": 3.0063,
"step": 88900
},
{
"epoch": 23.93723083548665,
"grad_norm": 0.4761360287666321,
"learning_rate": 0.00024111081190251782,
"loss": 2.9799,
"step": 88950
},
{
"epoch": 23.95068906115418,
"grad_norm": 0.4812622666358948,
"learning_rate": 0.00024090884610206002,
"loss": 2.9913,
"step": 89000
},
{
"epoch": 23.95068906115418,
"eval_accuracy": 0.3971195762946468,
"eval_loss": 3.276291847229004,
"eval_runtime": 54.8659,
"eval_samples_per_second": 328.291,
"eval_steps_per_second": 20.523,
"step": 89000
},
{
"epoch": 23.964147286821706,
"grad_norm": 0.4610179662704468,
"learning_rate": 0.00024070688030160224,
"loss": 2.9939,
"step": 89050
},
{
"epoch": 23.977605512489234,
"grad_norm": 0.4359179735183716,
"learning_rate": 0.00024050491450114443,
"loss": 2.9952,
"step": 89100
},
{
"epoch": 23.99106373815676,
"grad_norm": 0.45922085642814636,
"learning_rate": 0.00024030294870068666,
"loss": 2.9857,
"step": 89150
},
{
"epoch": 24.00430663221361,
"grad_norm": 0.46470963954925537,
"learning_rate": 0.00024010098290022888,
"loss": 2.9692,
"step": 89200
},
{
"epoch": 24.017764857881136,
"grad_norm": 0.42660030722618103,
"learning_rate": 0.0002398990170997711,
"loss": 2.8936,
"step": 89250
},
{
"epoch": 24.031223083548664,
"grad_norm": 0.433124840259552,
"learning_rate": 0.0002396970512993133,
"loss": 2.9173,
"step": 89300
},
{
"epoch": 24.044681309216195,
"grad_norm": 0.4298465847969055,
"learning_rate": 0.00023949508549885552,
"loss": 2.9137,
"step": 89350
},
{
"epoch": 24.058139534883722,
"grad_norm": 0.4245285987854004,
"learning_rate": 0.00023929311969839772,
"loss": 2.9107,
"step": 89400
},
{
"epoch": 24.07159776055125,
"grad_norm": 0.465323269367218,
"learning_rate": 0.00023909115389793992,
"loss": 2.92,
"step": 89450
},
{
"epoch": 24.085055986218777,
"grad_norm": 0.47525930404663086,
"learning_rate": 0.00023888918809748214,
"loss": 2.9168,
"step": 89500
},
{
"epoch": 24.098514211886304,
"grad_norm": 0.46605080366134644,
"learning_rate": 0.00023868722229702433,
"loss": 2.9265,
"step": 89550
},
{
"epoch": 24.11197243755383,
"grad_norm": 0.4591079354286194,
"learning_rate": 0.00023848525649656656,
"loss": 2.9212,
"step": 89600
},
{
"epoch": 24.125430663221362,
"grad_norm": 0.4741298258304596,
"learning_rate": 0.00023828329069610878,
"loss": 2.9165,
"step": 89650
},
{
"epoch": 24.13888888888889,
"grad_norm": 0.46697476506233215,
"learning_rate": 0.000238081324895651,
"loss": 2.9309,
"step": 89700
},
{
"epoch": 24.152347114556417,
"grad_norm": 0.47774580121040344,
"learning_rate": 0.0002378793590951932,
"loss": 2.9415,
"step": 89750
},
{
"epoch": 24.165805340223944,
"grad_norm": 0.4766087532043457,
"learning_rate": 0.0002376773932947354,
"loss": 2.9361,
"step": 89800
},
{
"epoch": 24.17926356589147,
"grad_norm": 0.464669793844223,
"learning_rate": 0.00023747542749427762,
"loss": 2.9333,
"step": 89850
},
{
"epoch": 24.192721791559002,
"grad_norm": 0.43852221965789795,
"learning_rate": 0.00023727346169381982,
"loss": 2.9363,
"step": 89900
},
{
"epoch": 24.20618001722653,
"grad_norm": 0.4529156982898712,
"learning_rate": 0.00023707149589336204,
"loss": 2.9422,
"step": 89950
},
{
"epoch": 24.219638242894057,
"grad_norm": 0.5049745440483093,
"learning_rate": 0.00023686953009290424,
"loss": 2.958,
"step": 90000
},
{
"epoch": 24.219638242894057,
"eval_accuracy": 0.3960319126235912,
"eval_loss": 3.292781352996826,
"eval_runtime": 54.9708,
"eval_samples_per_second": 327.665,
"eval_steps_per_second": 20.484,
"step": 90000
},
{
"epoch": 24.233096468561584,
"grad_norm": 0.4704437255859375,
"learning_rate": 0.00023666756429244643,
"loss": 2.9132,
"step": 90050
},
{
"epoch": 24.24655469422911,
"grad_norm": 0.4529268145561218,
"learning_rate": 0.00023646559849198868,
"loss": 2.9176,
"step": 90100
},
{
"epoch": 24.260012919896642,
"grad_norm": 0.43529608845710754,
"learning_rate": 0.00023626363269153088,
"loss": 2.9243,
"step": 90150
},
{
"epoch": 24.27347114556417,
"grad_norm": 0.48170673847198486,
"learning_rate": 0.0002360616668910731,
"loss": 2.9182,
"step": 90200
},
{
"epoch": 24.286929371231697,
"grad_norm": 0.46854284405708313,
"learning_rate": 0.0002358597010906153,
"loss": 2.9155,
"step": 90250
},
{
"epoch": 24.300387596899224,
"grad_norm": 0.45262134075164795,
"learning_rate": 0.00023565773529015752,
"loss": 2.926,
"step": 90300
},
{
"epoch": 24.31384582256675,
"grad_norm": 0.4530410170555115,
"learning_rate": 0.00023545576948969972,
"loss": 2.929,
"step": 90350
},
{
"epoch": 24.327304048234282,
"grad_norm": 0.483509361743927,
"learning_rate": 0.00023525380368924194,
"loss": 2.9243,
"step": 90400
},
{
"epoch": 24.34076227390181,
"grad_norm": 0.4750220477581024,
"learning_rate": 0.00023505183788878414,
"loss": 2.9258,
"step": 90450
},
{
"epoch": 24.354220499569337,
"grad_norm": 0.4815353751182556,
"learning_rate": 0.00023484987208832633,
"loss": 2.9405,
"step": 90500
},
{
"epoch": 24.367678725236864,
"grad_norm": 0.49263864755630493,
"learning_rate": 0.00023464790628786858,
"loss": 2.9295,
"step": 90550
},
{
"epoch": 24.38113695090439,
"grad_norm": 0.4376738667488098,
"learning_rate": 0.00023444594048741078,
"loss": 2.9399,
"step": 90600
},
{
"epoch": 24.394595176571922,
"grad_norm": 0.47231411933898926,
"learning_rate": 0.000234243974686953,
"loss": 2.9306,
"step": 90650
},
{
"epoch": 24.40805340223945,
"grad_norm": 0.4553051292896271,
"learning_rate": 0.0002340420088864952,
"loss": 2.9463,
"step": 90700
},
{
"epoch": 24.421511627906977,
"grad_norm": 0.47880223393440247,
"learning_rate": 0.00023384004308603742,
"loss": 2.9336,
"step": 90750
},
{
"epoch": 24.434969853574504,
"grad_norm": 0.422132670879364,
"learning_rate": 0.00023363807728557962,
"loss": 2.9377,
"step": 90800
},
{
"epoch": 24.44842807924203,
"grad_norm": 0.475422739982605,
"learning_rate": 0.0002334361114851218,
"loss": 2.9358,
"step": 90850
},
{
"epoch": 24.461886304909562,
"grad_norm": 0.4530656933784485,
"learning_rate": 0.00023323414568466404,
"loss": 2.9447,
"step": 90900
},
{
"epoch": 24.47534453057709,
"grad_norm": 0.4522700607776642,
"learning_rate": 0.00023303217988420623,
"loss": 2.9453,
"step": 90950
},
{
"epoch": 24.488802756244617,
"grad_norm": 0.4927493929862976,
"learning_rate": 0.00023283021408374848,
"loss": 2.951,
"step": 91000
},
{
"epoch": 24.488802756244617,
"eval_accuracy": 0.3960585311032045,
"eval_loss": 3.29355788230896,
"eval_runtime": 147.0497,
"eval_samples_per_second": 122.489,
"eval_steps_per_second": 7.657,
"step": 91000
},
{
"epoch": 24.502260981912144,
"grad_norm": 0.447524756193161,
"learning_rate": 0.00023262824828329068,
"loss": 2.9552,
"step": 91050
},
{
"epoch": 24.51571920757967,
"grad_norm": 0.45930957794189453,
"learning_rate": 0.0002324262824828329,
"loss": 2.9467,
"step": 91100
},
{
"epoch": 24.529177433247202,
"grad_norm": 0.473864883184433,
"learning_rate": 0.0002322243166823751,
"loss": 2.9415,
"step": 91150
},
{
"epoch": 24.54263565891473,
"grad_norm": 0.4445241689682007,
"learning_rate": 0.0002320223508819173,
"loss": 2.94,
"step": 91200
},
{
"epoch": 24.556093884582257,
"grad_norm": 0.4805726110935211,
"learning_rate": 0.00023182038508145952,
"loss": 2.9394,
"step": 91250
},
{
"epoch": 24.569552110249784,
"grad_norm": 0.4495256543159485,
"learning_rate": 0.0002316184192810017,
"loss": 2.9555,
"step": 91300
},
{
"epoch": 24.58301033591731,
"grad_norm": 0.4489133954048157,
"learning_rate": 0.00023141645348054394,
"loss": 2.9541,
"step": 91350
},
{
"epoch": 24.596468561584842,
"grad_norm": 0.48831552267074585,
"learning_rate": 0.00023121448768008613,
"loss": 2.9554,
"step": 91400
},
{
"epoch": 24.60992678725237,
"grad_norm": 0.4598943591117859,
"learning_rate": 0.00023101252187962838,
"loss": 2.9543,
"step": 91450
},
{
"epoch": 24.623385012919897,
"grad_norm": 0.4772765040397644,
"learning_rate": 0.00023081055607917058,
"loss": 2.9584,
"step": 91500
},
{
"epoch": 24.636843238587424,
"grad_norm": 0.47404298186302185,
"learning_rate": 0.0002306085902787128,
"loss": 2.9563,
"step": 91550
},
{
"epoch": 24.65030146425495,
"grad_norm": 0.5125542283058167,
"learning_rate": 0.000230406624478255,
"loss": 2.964,
"step": 91600
},
{
"epoch": 24.66375968992248,
"grad_norm": 0.48980486392974854,
"learning_rate": 0.0002302046586777972,
"loss": 2.9626,
"step": 91650
},
{
"epoch": 24.67721791559001,
"grad_norm": 0.48496899008750916,
"learning_rate": 0.00023000269287733942,
"loss": 2.9436,
"step": 91700
},
{
"epoch": 24.690676141257537,
"grad_norm": 0.49066996574401855,
"learning_rate": 0.00022980072707688161,
"loss": 2.9684,
"step": 91750
},
{
"epoch": 24.704134366925064,
"grad_norm": 0.4482797086238861,
"learning_rate": 0.00022959876127642384,
"loss": 2.9512,
"step": 91800
},
{
"epoch": 24.71759259259259,
"grad_norm": 0.49523958563804626,
"learning_rate": 0.00022939679547596603,
"loss": 2.9561,
"step": 91850
},
{
"epoch": 24.73105081826012,
"grad_norm": 0.44394782185554504,
"learning_rate": 0.00022919482967550828,
"loss": 2.9688,
"step": 91900
},
{
"epoch": 24.74450904392765,
"grad_norm": 0.49176523089408875,
"learning_rate": 0.00022899286387505048,
"loss": 2.9611,
"step": 91950
},
{
"epoch": 24.757967269595177,
"grad_norm": 0.4634227156639099,
"learning_rate": 0.00022879089807459268,
"loss": 2.9644,
"step": 92000
},
{
"epoch": 24.757967269595177,
"eval_accuracy": 0.396619148877917,
"eval_loss": 3.2853758335113525,
"eval_runtime": 147.3435,
"eval_samples_per_second": 122.245,
"eval_steps_per_second": 7.642,
"step": 92000
},
{
"epoch": 24.771425495262704,
"grad_norm": 0.4450036287307739,
"learning_rate": 0.0002285889322741349,
"loss": 2.9646,
"step": 92050
},
{
"epoch": 24.78488372093023,
"grad_norm": 0.46700024604797363,
"learning_rate": 0.0002283869664736771,
"loss": 2.9706,
"step": 92100
},
{
"epoch": 24.79834194659776,
"grad_norm": 0.48077601194381714,
"learning_rate": 0.00022818500067321932,
"loss": 2.9661,
"step": 92150
},
{
"epoch": 24.81180017226529,
"grad_norm": 0.46260204911231995,
"learning_rate": 0.00022798303487276151,
"loss": 2.9705,
"step": 92200
},
{
"epoch": 24.825258397932817,
"grad_norm": 0.4556381106376648,
"learning_rate": 0.0002277810690723037,
"loss": 2.9745,
"step": 92250
},
{
"epoch": 24.838716623600344,
"grad_norm": 0.4934409558773041,
"learning_rate": 0.00022757910327184596,
"loss": 2.9711,
"step": 92300
},
{
"epoch": 24.852174849267872,
"grad_norm": 0.46511897444725037,
"learning_rate": 0.00022737713747138818,
"loss": 2.9626,
"step": 92350
},
{
"epoch": 24.8656330749354,
"grad_norm": 0.43899616599082947,
"learning_rate": 0.00022717517167093038,
"loss": 2.9746,
"step": 92400
},
{
"epoch": 24.87909130060293,
"grad_norm": 0.46751806139945984,
"learning_rate": 0.00022697320587047258,
"loss": 2.9637,
"step": 92450
},
{
"epoch": 24.892549526270457,
"grad_norm": 0.465297132730484,
"learning_rate": 0.0002267712400700148,
"loss": 2.9674,
"step": 92500
},
{
"epoch": 24.906007751937985,
"grad_norm": 0.44672736525535583,
"learning_rate": 0.000226569274269557,
"loss": 2.9694,
"step": 92550
},
{
"epoch": 24.919465977605512,
"grad_norm": 0.4716099798679352,
"learning_rate": 0.00022636730846909922,
"loss": 2.9788,
"step": 92600
},
{
"epoch": 24.93292420327304,
"grad_norm": 0.47384023666381836,
"learning_rate": 0.00022616534266864141,
"loss": 2.9707,
"step": 92650
},
{
"epoch": 24.94638242894057,
"grad_norm": 0.5042713284492493,
"learning_rate": 0.0002259633768681836,
"loss": 2.983,
"step": 92700
},
{
"epoch": 24.959840654608097,
"grad_norm": 0.4943690896034241,
"learning_rate": 0.00022576141106772586,
"loss": 2.9636,
"step": 92750
},
{
"epoch": 24.973298880275625,
"grad_norm": 0.4922303557395935,
"learning_rate": 0.00022555944526726806,
"loss": 2.9757,
"step": 92800
},
{
"epoch": 24.986757105943152,
"grad_norm": 0.46571773290634155,
"learning_rate": 0.00022535747946681028,
"loss": 2.9737,
"step": 92850
},
{
"epoch": 25.00026916451335,
"grad_norm": 1.1899535655975342,
"learning_rate": 0.00022515551366635248,
"loss": 3.0341,
"step": 92900
},
{
"epoch": 25.01372739018088,
"grad_norm": 0.4870604872703552,
"learning_rate": 0.0002249535478658947,
"loss": 2.9042,
"step": 92950
},
{
"epoch": 25.027185615848406,
"grad_norm": 0.469473272562027,
"learning_rate": 0.0002247515820654369,
"loss": 2.9012,
"step": 93000
},
{
"epoch": 25.027185615848406,
"eval_accuracy": 0.39639979087653243,
"eval_loss": 3.292593240737915,
"eval_runtime": 147.4879,
"eval_samples_per_second": 122.125,
"eval_steps_per_second": 7.635,
"step": 93000
},
{
"epoch": 25.040643841515934,
"grad_norm": 0.4822351336479187,
"learning_rate": 0.0002245496162649791,
"loss": 2.9127,
"step": 93050
},
{
"epoch": 25.05410206718346,
"grad_norm": 0.46038123965263367,
"learning_rate": 0.00022434765046452131,
"loss": 2.9251,
"step": 93100
},
{
"epoch": 25.06756029285099,
"grad_norm": 0.44845762848854065,
"learning_rate": 0.0002241456846640635,
"loss": 2.9082,
"step": 93150
},
{
"epoch": 25.08101851851852,
"grad_norm": 0.5279198288917542,
"learning_rate": 0.00022394371886360576,
"loss": 2.9228,
"step": 93200
},
{
"epoch": 25.094476744186046,
"grad_norm": 0.45746752619743347,
"learning_rate": 0.00022374175306314796,
"loss": 2.9189,
"step": 93250
},
{
"epoch": 25.107934969853574,
"grad_norm": 0.4499928653240204,
"learning_rate": 0.00022353978726269018,
"loss": 2.919,
"step": 93300
},
{
"epoch": 25.1213931955211,
"grad_norm": 0.4959993064403534,
"learning_rate": 0.00022333782146223238,
"loss": 2.9152,
"step": 93350
},
{
"epoch": 25.134851421188632,
"grad_norm": 0.47189101576805115,
"learning_rate": 0.0002231358556617746,
"loss": 2.9283,
"step": 93400
},
{
"epoch": 25.14830964685616,
"grad_norm": 0.47774845361709595,
"learning_rate": 0.0002229338898613168,
"loss": 2.9224,
"step": 93450
},
{
"epoch": 25.161767872523686,
"grad_norm": 0.45308101177215576,
"learning_rate": 0.000222731924060859,
"loss": 2.9252,
"step": 93500
},
{
"epoch": 25.175226098191214,
"grad_norm": 0.4624495208263397,
"learning_rate": 0.00022252995826040122,
"loss": 2.9312,
"step": 93550
},
{
"epoch": 25.18868432385874,
"grad_norm": 0.4419146180152893,
"learning_rate": 0.0002223279924599434,
"loss": 2.9285,
"step": 93600
},
{
"epoch": 25.202142549526272,
"grad_norm": 0.4739558696746826,
"learning_rate": 0.00022212602665948566,
"loss": 2.9303,
"step": 93650
},
{
"epoch": 25.2156007751938,
"grad_norm": 0.4595142900943756,
"learning_rate": 0.00022192406085902786,
"loss": 2.9369,
"step": 93700
},
{
"epoch": 25.229059000861326,
"grad_norm": 0.47502854466438293,
"learning_rate": 0.00022172209505857008,
"loss": 2.9372,
"step": 93750
},
{
"epoch": 25.242517226528854,
"grad_norm": 0.4656815826892853,
"learning_rate": 0.00022152012925811228,
"loss": 2.9324,
"step": 93800
},
{
"epoch": 25.25597545219638,
"grad_norm": 0.4996993839740753,
"learning_rate": 0.00022131816345765447,
"loss": 2.9346,
"step": 93850
},
{
"epoch": 25.269433677863912,
"grad_norm": 0.43119895458221436,
"learning_rate": 0.0002211161976571967,
"loss": 2.9464,
"step": 93900
},
{
"epoch": 25.28289190353144,
"grad_norm": 0.48845374584198,
"learning_rate": 0.0002209142318567389,
"loss": 2.9391,
"step": 93950
},
{
"epoch": 25.296350129198967,
"grad_norm": 0.44671630859375,
"learning_rate": 0.00022071226605628112,
"loss": 2.9246,
"step": 94000
},
{
"epoch": 25.296350129198967,
"eval_accuracy": 0.3961111161812977,
"eval_loss": 3.2946267127990723,
"eval_runtime": 146.1973,
"eval_samples_per_second": 123.203,
"eval_steps_per_second": 7.702,
"step": 94000
},
{
"epoch": 25.309808354866494,
"grad_norm": 0.48212531208992004,
"learning_rate": 0.0002205103002558233,
"loss": 2.942,
"step": 94050
},
{
"epoch": 25.32326658053402,
"grad_norm": 0.540875256061554,
"learning_rate": 0.00022030833445536556,
"loss": 2.9456,
"step": 94100
},
{
"epoch": 25.336724806201552,
"grad_norm": 0.4727246165275574,
"learning_rate": 0.00022010636865490776,
"loss": 2.9372,
"step": 94150
},
{
"epoch": 25.35018303186908,
"grad_norm": 0.48135116696357727,
"learning_rate": 0.00021990440285444998,
"loss": 2.9495,
"step": 94200
},
{
"epoch": 25.363641257536607,
"grad_norm": 0.48746976256370544,
"learning_rate": 0.00021970243705399218,
"loss": 2.944,
"step": 94250
},
{
"epoch": 25.377099483204134,
"grad_norm": 0.43600234389305115,
"learning_rate": 0.00021950047125353437,
"loss": 2.9406,
"step": 94300
},
{
"epoch": 25.39055770887166,
"grad_norm": 0.4867391288280487,
"learning_rate": 0.0002192985054530766,
"loss": 2.9455,
"step": 94350
},
{
"epoch": 25.404015934539192,
"grad_norm": 0.4582846760749817,
"learning_rate": 0.0002190965396526188,
"loss": 2.9339,
"step": 94400
},
{
"epoch": 25.41747416020672,
"grad_norm": 0.49392837285995483,
"learning_rate": 0.00021889457385216102,
"loss": 2.9495,
"step": 94450
},
{
"epoch": 25.430932385874247,
"grad_norm": 0.46900445222854614,
"learning_rate": 0.0002186926080517032,
"loss": 2.9567,
"step": 94500
},
{
"epoch": 25.444390611541774,
"grad_norm": 0.504092812538147,
"learning_rate": 0.00021849064225124546,
"loss": 2.9483,
"step": 94550
},
{
"epoch": 25.4578488372093,
"grad_norm": 0.45981499552726746,
"learning_rate": 0.00021828867645078766,
"loss": 2.9443,
"step": 94600
},
{
"epoch": 25.471307062876832,
"grad_norm": 0.47376635670661926,
"learning_rate": 0.00021808671065032985,
"loss": 2.9667,
"step": 94650
},
{
"epoch": 25.48476528854436,
"grad_norm": 0.4677162766456604,
"learning_rate": 0.00021788474484987208,
"loss": 2.9544,
"step": 94700
},
{
"epoch": 25.498223514211887,
"grad_norm": 0.4654765725135803,
"learning_rate": 0.00021768277904941427,
"loss": 2.9487,
"step": 94750
},
{
"epoch": 25.511681739879414,
"grad_norm": 0.49871331453323364,
"learning_rate": 0.0002174808132489565,
"loss": 2.9439,
"step": 94800
},
{
"epoch": 25.52513996554694,
"grad_norm": 0.511438250541687,
"learning_rate": 0.0002172788474484987,
"loss": 2.9495,
"step": 94850
},
{
"epoch": 25.53859819121447,
"grad_norm": 0.48844873905181885,
"learning_rate": 0.0002170768816480409,
"loss": 2.9612,
"step": 94900
},
{
"epoch": 25.552056416882,
"grad_norm": 0.4564681649208069,
"learning_rate": 0.0002168749158475831,
"loss": 2.9637,
"step": 94950
},
{
"epoch": 25.565514642549527,
"grad_norm": 0.4665428698062897,
"learning_rate": 0.00021667295004712534,
"loss": 2.9605,
"step": 95000
},
{
"epoch": 25.565514642549527,
"eval_accuracy": 0.3967212769221476,
"eval_loss": 3.2849857807159424,
"eval_runtime": 146.1173,
"eval_samples_per_second": 123.271,
"eval_steps_per_second": 7.706,
"step": 95000
},
{
"epoch": 25.578972868217054,
"grad_norm": 0.45568856596946716,
"learning_rate": 0.00021647098424666756,
"loss": 2.9594,
"step": 95050
},
{
"epoch": 25.59243109388458,
"grad_norm": 0.47148966789245605,
"learning_rate": 0.00021626901844620976,
"loss": 2.9654,
"step": 95100
},
{
"epoch": 25.60588931955211,
"grad_norm": 0.4540193974971771,
"learning_rate": 0.00021606705264575198,
"loss": 2.9655,
"step": 95150
},
{
"epoch": 25.61934754521964,
"grad_norm": 0.491960734128952,
"learning_rate": 0.00021586508684529417,
"loss": 2.9575,
"step": 95200
},
{
"epoch": 25.632805770887167,
"grad_norm": 0.4663715958595276,
"learning_rate": 0.0002156631210448364,
"loss": 2.9559,
"step": 95250
},
{
"epoch": 25.646263996554694,
"grad_norm": 0.45852020382881165,
"learning_rate": 0.0002154611552443786,
"loss": 2.9595,
"step": 95300
},
{
"epoch": 25.65972222222222,
"grad_norm": 0.48818692564964294,
"learning_rate": 0.0002152591894439208,
"loss": 2.9513,
"step": 95350
},
{
"epoch": 25.67318044788975,
"grad_norm": 0.4618067145347595,
"learning_rate": 0.000215057223643463,
"loss": 2.9658,
"step": 95400
},
{
"epoch": 25.68663867355728,
"grad_norm": 0.4699975848197937,
"learning_rate": 0.00021485525784300524,
"loss": 2.9694,
"step": 95450
},
{
"epoch": 25.700096899224807,
"grad_norm": 0.49483734369277954,
"learning_rate": 0.00021465329204254746,
"loss": 2.9666,
"step": 95500
},
{
"epoch": 25.713555124892334,
"grad_norm": 0.479103684425354,
"learning_rate": 0.00021445132624208966,
"loss": 2.9711,
"step": 95550
},
{
"epoch": 25.72701335055986,
"grad_norm": 0.5061235427856445,
"learning_rate": 0.00021424936044163188,
"loss": 2.9746,
"step": 95600
},
{
"epoch": 25.74047157622739,
"grad_norm": 0.5018367171287537,
"learning_rate": 0.00021404739464117407,
"loss": 2.9706,
"step": 95650
},
{
"epoch": 25.75392980189492,
"grad_norm": 0.47308751940727234,
"learning_rate": 0.00021384542884071627,
"loss": 2.961,
"step": 95700
},
{
"epoch": 25.767388027562447,
"grad_norm": 0.47990474104881287,
"learning_rate": 0.0002136434630402585,
"loss": 2.9735,
"step": 95750
},
{
"epoch": 25.780846253229974,
"grad_norm": 0.5104753375053406,
"learning_rate": 0.0002134414972398007,
"loss": 2.9615,
"step": 95800
},
{
"epoch": 25.7943044788975,
"grad_norm": 0.4795853793621063,
"learning_rate": 0.0002132395314393429,
"loss": 2.9692,
"step": 95850
},
{
"epoch": 25.80776270456503,
"grad_norm": 0.47165408730506897,
"learning_rate": 0.00021303756563888514,
"loss": 2.9595,
"step": 95900
},
{
"epoch": 25.82122093023256,
"grad_norm": 0.4764661192893982,
"learning_rate": 0.00021283559983842736,
"loss": 2.9581,
"step": 95950
},
{
"epoch": 25.834679155900087,
"grad_norm": 0.4767214357852936,
"learning_rate": 0.00021263363403796956,
"loss": 2.9767,
"step": 96000
},
{
"epoch": 25.834679155900087,
"eval_accuracy": 0.3970028895717706,
"eval_loss": 3.2806341648101807,
"eval_runtime": 146.2565,
"eval_samples_per_second": 123.154,
"eval_steps_per_second": 7.699,
"step": 96000
},
{
"epoch": 25.848137381567614,
"grad_norm": 0.48081299662590027,
"learning_rate": 0.00021243166823751175,
"loss": 2.9614,
"step": 96050
},
{
"epoch": 25.86159560723514,
"grad_norm": 0.4761458933353424,
"learning_rate": 0.00021222970243705398,
"loss": 2.9817,
"step": 96100
},
{
"epoch": 25.87505383290267,
"grad_norm": 0.4514337182044983,
"learning_rate": 0.00021202773663659617,
"loss": 2.9691,
"step": 96150
},
{
"epoch": 25.8885120585702,
"grad_norm": 0.47705453634262085,
"learning_rate": 0.0002118257708361384,
"loss": 2.972,
"step": 96200
},
{
"epoch": 25.901970284237727,
"grad_norm": 0.46351706981658936,
"learning_rate": 0.0002116238050356806,
"loss": 2.964,
"step": 96250
},
{
"epoch": 25.915428509905254,
"grad_norm": 0.4799213409423828,
"learning_rate": 0.00021142183923522281,
"loss": 2.9753,
"step": 96300
},
{
"epoch": 25.92888673557278,
"grad_norm": 0.46214932203292847,
"learning_rate": 0.00021121987343476504,
"loss": 2.9806,
"step": 96350
},
{
"epoch": 25.94234496124031,
"grad_norm": 0.4498823583126068,
"learning_rate": 0.00021101790763430726,
"loss": 2.978,
"step": 96400
},
{
"epoch": 25.955803186907836,
"grad_norm": 0.4700547456741333,
"learning_rate": 0.00021081594183384946,
"loss": 2.9633,
"step": 96450
},
{
"epoch": 25.969261412575367,
"grad_norm": 0.49166616797447205,
"learning_rate": 0.00021061397603339165,
"loss": 2.9794,
"step": 96500
},
{
"epoch": 25.982719638242894,
"grad_norm": 0.4853648841381073,
"learning_rate": 0.00021041201023293388,
"loss": 2.9736,
"step": 96550
},
{
"epoch": 25.99617786391042,
"grad_norm": 0.4653874933719635,
"learning_rate": 0.00021021004443247607,
"loss": 2.9679,
"step": 96600
},
{
"epoch": 26.00942075796727,
"grad_norm": 0.48257073760032654,
"learning_rate": 0.0002100080786320183,
"loss": 2.9124,
"step": 96650
},
{
"epoch": 26.022878983634797,
"grad_norm": 0.4681517779827118,
"learning_rate": 0.0002098061128315605,
"loss": 2.8909,
"step": 96700
},
{
"epoch": 26.036337209302324,
"grad_norm": 0.5120118260383606,
"learning_rate": 0.00020960414703110274,
"loss": 2.9027,
"step": 96750
},
{
"epoch": 26.049795434969855,
"grad_norm": 0.5014836192131042,
"learning_rate": 0.00020940218123064494,
"loss": 2.9032,
"step": 96800
},
{
"epoch": 26.063253660637383,
"grad_norm": 0.48905983567237854,
"learning_rate": 0.00020920021543018713,
"loss": 2.9024,
"step": 96850
},
{
"epoch": 26.07671188630491,
"grad_norm": 0.4677479565143585,
"learning_rate": 0.00020899824962972936,
"loss": 2.9186,
"step": 96900
},
{
"epoch": 26.090170111972437,
"grad_norm": 0.4844556450843811,
"learning_rate": 0.00020879628382927155,
"loss": 2.9093,
"step": 96950
},
{
"epoch": 26.103628337639964,
"grad_norm": 0.48902034759521484,
"learning_rate": 0.00020859431802881378,
"loss": 2.9106,
"step": 97000
},
{
"epoch": 26.103628337639964,
"eval_accuracy": 0.3965241915261537,
"eval_loss": 3.293030261993408,
"eval_runtime": 146.2368,
"eval_samples_per_second": 123.17,
"eval_steps_per_second": 7.7,
"step": 97000
},
{
"epoch": 26.117086563307492,
"grad_norm": 0.4999445080757141,
"learning_rate": 0.00020839235222835597,
"loss": 2.9076,
"step": 97050
},
{
"epoch": 26.130544788975023,
"grad_norm": 0.4936251938343048,
"learning_rate": 0.00020819038642789817,
"loss": 2.9138,
"step": 97100
},
{
"epoch": 26.14400301464255,
"grad_norm": 0.5119921565055847,
"learning_rate": 0.0002079884206274404,
"loss": 2.9163,
"step": 97150
},
{
"epoch": 26.157461240310077,
"grad_norm": 0.4628806412220001,
"learning_rate": 0.00020778645482698264,
"loss": 2.9275,
"step": 97200
},
{
"epoch": 26.170919465977605,
"grad_norm": 0.4793238341808319,
"learning_rate": 0.00020758448902652484,
"loss": 2.9143,
"step": 97250
},
{
"epoch": 26.184377691645132,
"grad_norm": 0.5028554201126099,
"learning_rate": 0.00020738252322606703,
"loss": 2.9154,
"step": 97300
},
{
"epoch": 26.197835917312663,
"grad_norm": 0.4699283838272095,
"learning_rate": 0.00020718055742560926,
"loss": 2.92,
"step": 97350
},
{
"epoch": 26.21129414298019,
"grad_norm": 0.4700480103492737,
"learning_rate": 0.00020697859162515145,
"loss": 2.9203,
"step": 97400
},
{
"epoch": 26.224752368647717,
"grad_norm": 0.47370514273643494,
"learning_rate": 0.00020677662582469368,
"loss": 2.9186,
"step": 97450
},
{
"epoch": 26.238210594315245,
"grad_norm": 0.4865691363811493,
"learning_rate": 0.00020657466002423587,
"loss": 2.9284,
"step": 97500
},
{
"epoch": 26.251668819982772,
"grad_norm": 0.5234982371330261,
"learning_rate": 0.00020637269422377807,
"loss": 2.9388,
"step": 97550
},
{
"epoch": 26.265127045650303,
"grad_norm": 0.4665259122848511,
"learning_rate": 0.0002061707284233203,
"loss": 2.9165,
"step": 97600
},
{
"epoch": 26.27858527131783,
"grad_norm": 0.4814673960208893,
"learning_rate": 0.00020596876262286251,
"loss": 2.9279,
"step": 97650
},
{
"epoch": 26.292043496985357,
"grad_norm": 0.49803024530410767,
"learning_rate": 0.00020576679682240474,
"loss": 2.9278,
"step": 97700
},
{
"epoch": 26.305501722652885,
"grad_norm": 0.5198856592178345,
"learning_rate": 0.00020556483102194693,
"loss": 2.9315,
"step": 97750
},
{
"epoch": 26.318959948320412,
"grad_norm": 0.454545259475708,
"learning_rate": 0.00020536286522148916,
"loss": 2.9344,
"step": 97800
},
{
"epoch": 26.332418173987943,
"grad_norm": 0.5368754267692566,
"learning_rate": 0.00020516089942103135,
"loss": 2.9265,
"step": 97850
},
{
"epoch": 26.34587639965547,
"grad_norm": 0.46915602684020996,
"learning_rate": 0.00020495893362057355,
"loss": 2.9348,
"step": 97900
},
{
"epoch": 26.359334625322997,
"grad_norm": 0.495172381401062,
"learning_rate": 0.00020475696782011577,
"loss": 2.9374,
"step": 97950
},
{
"epoch": 26.372792850990525,
"grad_norm": 0.4795806109905243,
"learning_rate": 0.00020455500201965797,
"loss": 2.9378,
"step": 98000
},
{
"epoch": 26.372792850990525,
"eval_accuracy": 0.39687186146395986,
"eval_loss": 3.2883505821228027,
"eval_runtime": 146.3636,
"eval_samples_per_second": 123.063,
"eval_steps_per_second": 7.693,
"step": 98000
},
{
"epoch": 26.386251076658052,
"grad_norm": 0.4649851322174072,
"learning_rate": 0.0002043530362192002,
"loss": 2.934,
"step": 98050
},
{
"epoch": 26.399709302325583,
"grad_norm": 0.46076539158821106,
"learning_rate": 0.00020415107041874242,
"loss": 2.9432,
"step": 98100
},
{
"epoch": 26.41316752799311,
"grad_norm": 0.4639580249786377,
"learning_rate": 0.00020394910461828464,
"loss": 2.938,
"step": 98150
},
{
"epoch": 26.426625753660637,
"grad_norm": 0.48218265175819397,
"learning_rate": 0.00020374713881782683,
"loss": 2.9363,
"step": 98200
},
{
"epoch": 26.440083979328165,
"grad_norm": 0.4805491268634796,
"learning_rate": 0.00020354517301736906,
"loss": 2.936,
"step": 98250
},
{
"epoch": 26.453542204995692,
"grad_norm": 0.4810453951358795,
"learning_rate": 0.00020334320721691125,
"loss": 2.9441,
"step": 98300
},
{
"epoch": 26.467000430663223,
"grad_norm": 0.4797106981277466,
"learning_rate": 0.00020314124141645345,
"loss": 2.9435,
"step": 98350
},
{
"epoch": 26.48045865633075,
"grad_norm": 0.48143908381462097,
"learning_rate": 0.00020293927561599567,
"loss": 2.9428,
"step": 98400
},
{
"epoch": 26.493916881998278,
"grad_norm": 0.4961640536785126,
"learning_rate": 0.00020273730981553787,
"loss": 2.9369,
"step": 98450
},
{
"epoch": 26.507375107665805,
"grad_norm": 0.49791309237480164,
"learning_rate": 0.0002025353440150801,
"loss": 2.9537,
"step": 98500
},
{
"epoch": 26.520833333333332,
"grad_norm": 0.52032071352005,
"learning_rate": 0.00020233337821462232,
"loss": 2.9455,
"step": 98550
},
{
"epoch": 26.534291559000863,
"grad_norm": 0.4943895637989044,
"learning_rate": 0.00020213141241416454,
"loss": 2.9452,
"step": 98600
},
{
"epoch": 26.54774978466839,
"grad_norm": 0.4840410649776459,
"learning_rate": 0.00020192944661370674,
"loss": 2.944,
"step": 98650
},
{
"epoch": 26.561208010335918,
"grad_norm": 0.488031268119812,
"learning_rate": 0.00020172748081324893,
"loss": 2.954,
"step": 98700
},
{
"epoch": 26.574666236003445,
"grad_norm": 0.49708092212677,
"learning_rate": 0.00020152551501279115,
"loss": 2.9446,
"step": 98750
},
{
"epoch": 26.588124461670972,
"grad_norm": 0.4940889775753021,
"learning_rate": 0.00020132354921233335,
"loss": 2.9518,
"step": 98800
},
{
"epoch": 26.6015826873385,
"grad_norm": 0.4973823130130768,
"learning_rate": 0.00020112158341187557,
"loss": 2.9486,
"step": 98850
},
{
"epoch": 26.61504091300603,
"grad_norm": 0.4541454315185547,
"learning_rate": 0.00020091961761141777,
"loss": 2.9475,
"step": 98900
},
{
"epoch": 26.628499138673558,
"grad_norm": 0.47265252470970154,
"learning_rate": 0.00020071765181095997,
"loss": 2.9577,
"step": 98950
},
{
"epoch": 26.641957364341085,
"grad_norm": 0.48501822352409363,
"learning_rate": 0.00020051568601050222,
"loss": 2.9573,
"step": 99000
},
{
"epoch": 26.641957364341085,
"eval_accuracy": 0.396756587150206,
"eval_loss": 3.2868714332580566,
"eval_runtime": 146.3487,
"eval_samples_per_second": 123.076,
"eval_steps_per_second": 7.694,
"step": 99000
},
{
"epoch": 26.655415590008612,
"grad_norm": 0.4774377644062042,
"learning_rate": 0.00020031372021004444,
"loss": 2.9507,
"step": 99050
},
{
"epoch": 26.66887381567614,
"grad_norm": 0.4970323443412781,
"learning_rate": 0.00020011175440958664,
"loss": 2.9539,
"step": 99100
},
{
"epoch": 26.68233204134367,
"grad_norm": 0.47971388697624207,
"learning_rate": 0.00019990978860912883,
"loss": 2.9507,
"step": 99150
},
{
"epoch": 26.695790267011198,
"grad_norm": 0.4856283664703369,
"learning_rate": 0.00019970782280867105,
"loss": 2.9576,
"step": 99200
},
{
"epoch": 26.709248492678725,
"grad_norm": 0.5186119675636292,
"learning_rate": 0.00019950585700821325,
"loss": 2.9318,
"step": 99250
},
{
"epoch": 26.722706718346252,
"grad_norm": 0.472085177898407,
"learning_rate": 0.00019930389120775547,
"loss": 2.9396,
"step": 99300
},
{
"epoch": 26.73616494401378,
"grad_norm": 0.4699694514274597,
"learning_rate": 0.00019910192540729767,
"loss": 2.9471,
"step": 99350
},
{
"epoch": 26.74962316968131,
"grad_norm": 0.4968441128730774,
"learning_rate": 0.00019889995960683987,
"loss": 2.9501,
"step": 99400
},
{
"epoch": 26.763081395348838,
"grad_norm": 0.4901743233203888,
"learning_rate": 0.00019869799380638212,
"loss": 2.9584,
"step": 99450
},
{
"epoch": 26.776539621016365,
"grad_norm": 0.4878545105457306,
"learning_rate": 0.0001984960280059243,
"loss": 2.9686,
"step": 99500
},
{
"epoch": 26.789997846683892,
"grad_norm": 0.48013490438461304,
"learning_rate": 0.00019829406220546654,
"loss": 2.9544,
"step": 99550
},
{
"epoch": 26.80345607235142,
"grad_norm": 0.5074095726013184,
"learning_rate": 0.00019809209640500873,
"loss": 2.9607,
"step": 99600
},
{
"epoch": 26.81691429801895,
"grad_norm": 0.4604112505912781,
"learning_rate": 0.00019789013060455096,
"loss": 2.9542,
"step": 99650
},
{
"epoch": 26.830372523686478,
"grad_norm": 0.45256295800209045,
"learning_rate": 0.00019768816480409315,
"loss": 2.9589,
"step": 99700
},
{
"epoch": 26.843830749354005,
"grad_norm": 0.479516863822937,
"learning_rate": 0.00019748619900363535,
"loss": 2.9495,
"step": 99750
},
{
"epoch": 26.857288975021532,
"grad_norm": 0.48909792304039,
"learning_rate": 0.00019728423320317757,
"loss": 2.9572,
"step": 99800
},
{
"epoch": 26.87074720068906,
"grad_norm": 0.49186989665031433,
"learning_rate": 0.00019708226740271977,
"loss": 2.9611,
"step": 99850
},
{
"epoch": 26.88420542635659,
"grad_norm": 0.45272672176361084,
"learning_rate": 0.00019688030160226202,
"loss": 2.9534,
"step": 99900
},
{
"epoch": 26.897663652024118,
"grad_norm": 0.48817697167396545,
"learning_rate": 0.0001966783358018042,
"loss": 2.9614,
"step": 99950
},
{
"epoch": 26.911121877691645,
"grad_norm": 0.5166176557540894,
"learning_rate": 0.00019647637000134644,
"loss": 2.9592,
"step": 100000
},
{
"epoch": 26.911121877691645,
"eval_accuracy": 0.39753297758006945,
"eval_loss": 3.279358386993408,
"eval_runtime": 146.0791,
"eval_samples_per_second": 123.303,
"eval_steps_per_second": 7.708,
"step": 100000
},
{
"epoch": 26.924580103359173,
"grad_norm": 0.4788917601108551,
"learning_rate": 0.00019627440420088863,
"loss": 2.9581,
"step": 100050
},
{
"epoch": 26.9380383290267,
"grad_norm": 0.4836042821407318,
"learning_rate": 0.00019607243840043086,
"loss": 2.9502,
"step": 100100
},
{
"epoch": 26.95149655469423,
"grad_norm": 0.47917941212654114,
"learning_rate": 0.00019587047259997305,
"loss": 2.9627,
"step": 100150
},
{
"epoch": 26.964954780361758,
"grad_norm": 0.4649355113506317,
"learning_rate": 0.00019566850679951525,
"loss": 2.9603,
"step": 100200
},
{
"epoch": 26.978413006029285,
"grad_norm": 0.5115824341773987,
"learning_rate": 0.00019546654099905747,
"loss": 2.956,
"step": 100250
},
{
"epoch": 26.991871231696813,
"grad_norm": 0.511033833026886,
"learning_rate": 0.00019526457519859967,
"loss": 2.9732,
"step": 100300
},
{
"epoch": 27.00511412575366,
"grad_norm": 0.4819963276386261,
"learning_rate": 0.00019506260939814192,
"loss": 2.9322,
"step": 100350
},
{
"epoch": 27.018572351421188,
"grad_norm": 0.5109541416168213,
"learning_rate": 0.0001948606435976841,
"loss": 2.8825,
"step": 100400
},
{
"epoch": 27.032030577088715,
"grad_norm": 0.5299190878868103,
"learning_rate": 0.00019465867779722634,
"loss": 2.9011,
"step": 100450
},
{
"epoch": 27.045488802756246,
"grad_norm": 0.4947880804538727,
"learning_rate": 0.00019445671199676853,
"loss": 2.8925,
"step": 100500
},
{
"epoch": 27.058947028423773,
"grad_norm": 0.4906946122646332,
"learning_rate": 0.00019425474619631073,
"loss": 2.9027,
"step": 100550
},
{
"epoch": 27.0724052540913,
"grad_norm": 0.5048971176147461,
"learning_rate": 0.00019405278039585295,
"loss": 2.9011,
"step": 100600
},
{
"epoch": 27.085863479758828,
"grad_norm": 0.47121816873550415,
"learning_rate": 0.00019385081459539515,
"loss": 2.8926,
"step": 100650
},
{
"epoch": 27.099321705426355,
"grad_norm": 0.5003146529197693,
"learning_rate": 0.00019364884879493737,
"loss": 2.9034,
"step": 100700
},
{
"epoch": 27.112779931093886,
"grad_norm": 0.491941899061203,
"learning_rate": 0.00019344688299447957,
"loss": 2.9028,
"step": 100750
},
{
"epoch": 27.126238156761413,
"grad_norm": 0.4929443895816803,
"learning_rate": 0.00019324491719402182,
"loss": 2.8981,
"step": 100800
},
{
"epoch": 27.13969638242894,
"grad_norm": 0.48886409401893616,
"learning_rate": 0.00019304295139356401,
"loss": 2.9095,
"step": 100850
},
{
"epoch": 27.153154608096468,
"grad_norm": 0.4845869541168213,
"learning_rate": 0.0001928409855931062,
"loss": 2.8995,
"step": 100900
},
{
"epoch": 27.166612833763995,
"grad_norm": 0.5020434260368347,
"learning_rate": 0.00019263901979264843,
"loss": 2.9163,
"step": 100950
},
{
"epoch": 27.180071059431526,
"grad_norm": 0.48844480514526367,
"learning_rate": 0.00019243705399219063,
"loss": 2.9014,
"step": 101000
},
{
"epoch": 27.180071059431526,
"eval_accuracy": 0.39657873224764706,
"eval_loss": 3.2880594730377197,
"eval_runtime": 146.9823,
"eval_samples_per_second": 122.545,
"eval_steps_per_second": 7.661,
"step": 101000
},
{
"epoch": 27.193529285099054,
"grad_norm": 0.4601687490940094,
"learning_rate": 0.00019223508819173285,
"loss": 2.911,
"step": 101050
},
{
"epoch": 27.20698751076658,
"grad_norm": 0.5111984610557556,
"learning_rate": 0.00019203312239127505,
"loss": 2.9103,
"step": 101100
},
{
"epoch": 27.220445736434108,
"grad_norm": 0.48042234778404236,
"learning_rate": 0.00019183115659081727,
"loss": 2.918,
"step": 101150
},
{
"epoch": 27.233903962101635,
"grad_norm": 0.48300600051879883,
"learning_rate": 0.00019162919079035947,
"loss": 2.9079,
"step": 101200
},
{
"epoch": 27.247362187769163,
"grad_norm": 0.48452457785606384,
"learning_rate": 0.00019142722498990172,
"loss": 2.9142,
"step": 101250
},
{
"epoch": 27.260820413436694,
"grad_norm": 0.49995142221450806,
"learning_rate": 0.00019122525918944391,
"loss": 2.9201,
"step": 101300
},
{
"epoch": 27.27427863910422,
"grad_norm": 0.5176795721054077,
"learning_rate": 0.0001910232933889861,
"loss": 2.9096,
"step": 101350
},
{
"epoch": 27.287736864771748,
"grad_norm": 0.5069646239280701,
"learning_rate": 0.00019082132758852833,
"loss": 2.8984,
"step": 101400
},
{
"epoch": 27.301195090439276,
"grad_norm": 0.46328434348106384,
"learning_rate": 0.00019061936178807053,
"loss": 2.9239,
"step": 101450
},
{
"epoch": 27.314653316106803,
"grad_norm": 0.521058201789856,
"learning_rate": 0.00019041739598761275,
"loss": 2.9206,
"step": 101500
},
{
"epoch": 27.328111541774334,
"grad_norm": 0.5169256925582886,
"learning_rate": 0.00019021543018715495,
"loss": 2.9243,
"step": 101550
},
{
"epoch": 27.34156976744186,
"grad_norm": 0.47673463821411133,
"learning_rate": 0.00019001346438669714,
"loss": 2.9226,
"step": 101600
},
{
"epoch": 27.35502799310939,
"grad_norm": 0.4859578013420105,
"learning_rate": 0.0001898114985862394,
"loss": 2.9334,
"step": 101650
},
{
"epoch": 27.368486218776916,
"grad_norm": 0.5149036049842834,
"learning_rate": 0.0001896095327857816,
"loss": 2.9226,
"step": 101700
},
{
"epoch": 27.381944444444443,
"grad_norm": 0.485603392124176,
"learning_rate": 0.00018940756698532381,
"loss": 2.9129,
"step": 101750
},
{
"epoch": 27.395402670111974,
"grad_norm": 0.4967415928840637,
"learning_rate": 0.000189205601184866,
"loss": 2.9217,
"step": 101800
},
{
"epoch": 27.4088608957795,
"grad_norm": 0.48943030834198,
"learning_rate": 0.00018900363538440823,
"loss": 2.9191,
"step": 101850
},
{
"epoch": 27.42231912144703,
"grad_norm": 0.5117086172103882,
"learning_rate": 0.00018880166958395043,
"loss": 2.9233,
"step": 101900
},
{
"epoch": 27.435777347114556,
"grad_norm": 0.4888044595718384,
"learning_rate": 0.00018859970378349263,
"loss": 2.9305,
"step": 101950
},
{
"epoch": 27.449235572782083,
"grad_norm": 0.48958805203437805,
"learning_rate": 0.00018839773798303485,
"loss": 2.9383,
"step": 102000
},
{
"epoch": 27.449235572782083,
"eval_accuracy": 0.397087090884833,
"eval_loss": 3.2870469093322754,
"eval_runtime": 146.0558,
"eval_samples_per_second": 123.323,
"eval_steps_per_second": 7.709,
"step": 102000
},
{
"epoch": 27.462693798449614,
"grad_norm": 0.49132028222084045,
"learning_rate": 0.00018819577218257705,
"loss": 2.934,
"step": 102050
},
{
"epoch": 27.47615202411714,
"grad_norm": 0.5282468795776367,
"learning_rate": 0.0001879938063821193,
"loss": 2.9369,
"step": 102100
},
{
"epoch": 27.48961024978467,
"grad_norm": 0.5024771094322205,
"learning_rate": 0.0001877918405816615,
"loss": 2.9185,
"step": 102150
},
{
"epoch": 27.503068475452196,
"grad_norm": 0.48891547322273254,
"learning_rate": 0.00018758987478120371,
"loss": 2.9419,
"step": 102200
},
{
"epoch": 27.516526701119723,
"grad_norm": 0.5016714334487915,
"learning_rate": 0.0001873879089807459,
"loss": 2.9221,
"step": 102250
},
{
"epoch": 27.529984926787254,
"grad_norm": 0.5051071047782898,
"learning_rate": 0.00018718594318028813,
"loss": 2.9334,
"step": 102300
},
{
"epoch": 27.54344315245478,
"grad_norm": 0.4839610457420349,
"learning_rate": 0.00018698397737983033,
"loss": 2.9334,
"step": 102350
},
{
"epoch": 27.55690137812231,
"grad_norm": 0.49538129568099976,
"learning_rate": 0.00018678201157937253,
"loss": 2.9433,
"step": 102400
},
{
"epoch": 27.570359603789836,
"grad_norm": 0.5163812637329102,
"learning_rate": 0.00018658004577891475,
"loss": 2.9399,
"step": 102450
},
{
"epoch": 27.583817829457363,
"grad_norm": 0.48955875635147095,
"learning_rate": 0.00018637807997845695,
"loss": 2.938,
"step": 102500
},
{
"epoch": 27.597276055124894,
"grad_norm": 0.48446550965309143,
"learning_rate": 0.0001861761141779992,
"loss": 2.9378,
"step": 102550
},
{
"epoch": 27.61073428079242,
"grad_norm": 0.48129525780677795,
"learning_rate": 0.0001859741483775414,
"loss": 2.9373,
"step": 102600
},
{
"epoch": 27.62419250645995,
"grad_norm": 0.4978967308998108,
"learning_rate": 0.00018577218257708362,
"loss": 2.9394,
"step": 102650
},
{
"epoch": 27.637650732127476,
"grad_norm": 0.5358911156654358,
"learning_rate": 0.0001855702167766258,
"loss": 2.9379,
"step": 102700
},
{
"epoch": 27.651108957795003,
"grad_norm": 0.48122960329055786,
"learning_rate": 0.000185368250976168,
"loss": 2.9336,
"step": 102750
},
{
"epoch": 27.664567183462534,
"grad_norm": 0.4696429967880249,
"learning_rate": 0.00018516628517571023,
"loss": 2.9367,
"step": 102800
},
{
"epoch": 27.67802540913006,
"grad_norm": 0.49047210812568665,
"learning_rate": 0.00018496431937525243,
"loss": 2.9348,
"step": 102850
},
{
"epoch": 27.69148363479759,
"grad_norm": 0.4979493319988251,
"learning_rate": 0.00018476235357479465,
"loss": 2.9339,
"step": 102900
},
{
"epoch": 27.704941860465116,
"grad_norm": 0.46877792477607727,
"learning_rate": 0.00018456038777433685,
"loss": 2.9504,
"step": 102950
},
{
"epoch": 27.718400086132643,
"grad_norm": 0.5015468597412109,
"learning_rate": 0.0001843584219738791,
"loss": 2.9516,
"step": 103000
},
{
"epoch": 27.718400086132643,
"eval_accuracy": 0.39719171780674156,
"eval_loss": 3.282999277114868,
"eval_runtime": 146.236,
"eval_samples_per_second": 123.171,
"eval_steps_per_second": 7.7,
"step": 103000
},
{
"epoch": 27.731858311800174,
"grad_norm": 0.4905368685722351,
"learning_rate": 0.0001841564561734213,
"loss": 2.9381,
"step": 103050
},
{
"epoch": 27.7453165374677,
"grad_norm": 0.4895152747631073,
"learning_rate": 0.00018395449037296352,
"loss": 2.9425,
"step": 103100
},
{
"epoch": 27.75877476313523,
"grad_norm": 0.4837099313735962,
"learning_rate": 0.0001837525245725057,
"loss": 2.9406,
"step": 103150
},
{
"epoch": 27.772232988802756,
"grad_norm": 0.48197370767593384,
"learning_rate": 0.0001835505587720479,
"loss": 2.9444,
"step": 103200
},
{
"epoch": 27.785691214470283,
"grad_norm": 0.5033994913101196,
"learning_rate": 0.00018334859297159013,
"loss": 2.9319,
"step": 103250
},
{
"epoch": 27.79914944013781,
"grad_norm": 0.5263758301734924,
"learning_rate": 0.00018314662717113233,
"loss": 2.9547,
"step": 103300
},
{
"epoch": 27.81260766580534,
"grad_norm": 0.47729551792144775,
"learning_rate": 0.00018294466137067455,
"loss": 2.9482,
"step": 103350
},
{
"epoch": 27.82606589147287,
"grad_norm": 0.545293927192688,
"learning_rate": 0.00018274269557021675,
"loss": 2.9654,
"step": 103400
},
{
"epoch": 27.839524117140396,
"grad_norm": 0.49454832077026367,
"learning_rate": 0.000182540729769759,
"loss": 2.9512,
"step": 103450
},
{
"epoch": 27.852982342807923,
"grad_norm": 0.4755057394504547,
"learning_rate": 0.0001823387639693012,
"loss": 2.9534,
"step": 103500
},
{
"epoch": 27.86644056847545,
"grad_norm": 0.49803298711776733,
"learning_rate": 0.0001821367981688434,
"loss": 2.9539,
"step": 103550
},
{
"epoch": 27.87989879414298,
"grad_norm": 0.5047042369842529,
"learning_rate": 0.0001819348323683856,
"loss": 2.9482,
"step": 103600
},
{
"epoch": 27.89335701981051,
"grad_norm": 0.4869590997695923,
"learning_rate": 0.0001817328665679278,
"loss": 2.9538,
"step": 103650
},
{
"epoch": 27.906815245478036,
"grad_norm": 0.498722642660141,
"learning_rate": 0.00018153090076747003,
"loss": 2.9507,
"step": 103700
},
{
"epoch": 27.920273471145563,
"grad_norm": 0.5139634013175964,
"learning_rate": 0.00018132893496701223,
"loss": 2.9621,
"step": 103750
},
{
"epoch": 27.93373169681309,
"grad_norm": 0.5020641088485718,
"learning_rate": 0.00018112696916655442,
"loss": 2.9514,
"step": 103800
},
{
"epoch": 27.94718992248062,
"grad_norm": 0.48636969923973083,
"learning_rate": 0.00018092500336609665,
"loss": 2.9596,
"step": 103850
},
{
"epoch": 27.96064814814815,
"grad_norm": 0.48579445481300354,
"learning_rate": 0.0001807230375656389,
"loss": 2.948,
"step": 103900
},
{
"epoch": 27.974106373815676,
"grad_norm": 0.4894184470176697,
"learning_rate": 0.0001805210717651811,
"loss": 2.9485,
"step": 103950
},
{
"epoch": 27.987564599483203,
"grad_norm": 0.4811153709888458,
"learning_rate": 0.0001803191059647233,
"loss": 2.9534,
"step": 104000
},
{
"epoch": 27.987564599483203,
"eval_accuracy": 0.39759164688207427,
"eval_loss": 3.276357889175415,
"eval_runtime": 147.4517,
"eval_samples_per_second": 122.155,
"eval_steps_per_second": 7.636,
"step": 104000
},
{
"epoch": 28.00080749354005,
"grad_norm": 0.5266789793968201,
"learning_rate": 0.0001801171401642655,
"loss": 2.9475,
"step": 104050
},
{
"epoch": 28.01426571920758,
"grad_norm": 0.49525949358940125,
"learning_rate": 0.0001799151743638077,
"loss": 2.87,
"step": 104100
},
{
"epoch": 28.027723944875106,
"grad_norm": 0.5010724663734436,
"learning_rate": 0.00017971320856334993,
"loss": 2.8858,
"step": 104150
},
{
"epoch": 28.041182170542637,
"grad_norm": 0.48668617010116577,
"learning_rate": 0.00017951124276289213,
"loss": 2.8767,
"step": 104200
},
{
"epoch": 28.054640396210164,
"grad_norm": 0.48478174209594727,
"learning_rate": 0.00017930927696243432,
"loss": 2.8777,
"step": 104250
},
{
"epoch": 28.06809862187769,
"grad_norm": 0.4890100359916687,
"learning_rate": 0.00017910731116197655,
"loss": 2.8855,
"step": 104300
},
{
"epoch": 28.08155684754522,
"grad_norm": 0.48516470193862915,
"learning_rate": 0.00017890534536151877,
"loss": 2.8959,
"step": 104350
},
{
"epoch": 28.095015073212746,
"grad_norm": 0.5097112655639648,
"learning_rate": 0.000178703379561061,
"loss": 2.8993,
"step": 104400
},
{
"epoch": 28.108473298880277,
"grad_norm": 0.49391499161720276,
"learning_rate": 0.0001785014137606032,
"loss": 2.8922,
"step": 104450
},
{
"epoch": 28.121931524547804,
"grad_norm": 0.5124300122261047,
"learning_rate": 0.0001782994479601454,
"loss": 2.9033,
"step": 104500
},
{
"epoch": 28.13538975021533,
"grad_norm": 0.5130902528762817,
"learning_rate": 0.0001780974821596876,
"loss": 2.895,
"step": 104550
},
{
"epoch": 28.14884797588286,
"grad_norm": 0.4990008473396301,
"learning_rate": 0.0001778955163592298,
"loss": 2.8929,
"step": 104600
},
{
"epoch": 28.162306201550386,
"grad_norm": 0.530022144317627,
"learning_rate": 0.00017769355055877203,
"loss": 2.8976,
"step": 104650
},
{
"epoch": 28.175764427217917,
"grad_norm": 0.45357492566108704,
"learning_rate": 0.00017749158475831422,
"loss": 2.8898,
"step": 104700
},
{
"epoch": 28.189222652885444,
"grad_norm": 0.5224539637565613,
"learning_rate": 0.00017728961895785645,
"loss": 2.8979,
"step": 104750
},
{
"epoch": 28.20268087855297,
"grad_norm": 0.5053698420524597,
"learning_rate": 0.00017708765315739867,
"loss": 2.9074,
"step": 104800
},
{
"epoch": 28.2161391042205,
"grad_norm": 0.49665287137031555,
"learning_rate": 0.0001768856873569409,
"loss": 2.8893,
"step": 104850
},
{
"epoch": 28.229597329888026,
"grad_norm": 0.5174131989479065,
"learning_rate": 0.0001766837215564831,
"loss": 2.9079,
"step": 104900
},
{
"epoch": 28.243055555555557,
"grad_norm": 0.482637882232666,
"learning_rate": 0.0001764817557560253,
"loss": 2.9022,
"step": 104950
},
{
"epoch": 28.256513781223084,
"grad_norm": 0.5195315480232239,
"learning_rate": 0.0001762797899555675,
"loss": 2.9109,
"step": 105000
},
{
"epoch": 28.256513781223084,
"eval_accuracy": 0.3970951307521448,
"eval_loss": 3.2897088527679443,
"eval_runtime": 147.4221,
"eval_samples_per_second": 122.18,
"eval_steps_per_second": 7.638,
"step": 105000
},
{
"epoch": 28.26997200689061,
"grad_norm": 0.5179916024208069,
"learning_rate": 0.0001760778241551097,
"loss": 2.9069,
"step": 105050
},
{
"epoch": 28.28343023255814,
"grad_norm": 0.4780939519405365,
"learning_rate": 0.00017587585835465193,
"loss": 2.9143,
"step": 105100
},
{
"epoch": 28.296888458225666,
"grad_norm": 0.5064705610275269,
"learning_rate": 0.00017567389255419412,
"loss": 2.8949,
"step": 105150
},
{
"epoch": 28.310346683893197,
"grad_norm": 0.493682861328125,
"learning_rate": 0.00017547192675373635,
"loss": 2.8993,
"step": 105200
},
{
"epoch": 28.323804909560724,
"grad_norm": 0.5155807137489319,
"learning_rate": 0.00017526996095327857,
"loss": 2.921,
"step": 105250
},
{
"epoch": 28.337263135228252,
"grad_norm": 0.5077420473098755,
"learning_rate": 0.0001750679951528208,
"loss": 2.9144,
"step": 105300
},
{
"epoch": 28.35072136089578,
"grad_norm": 0.4884844422340393,
"learning_rate": 0.000174866029352363,
"loss": 2.9225,
"step": 105350
},
{
"epoch": 28.364179586563306,
"grad_norm": 0.49359777569770813,
"learning_rate": 0.0001746640635519052,
"loss": 2.9095,
"step": 105400
},
{
"epoch": 28.377637812230837,
"grad_norm": 0.5174322724342346,
"learning_rate": 0.0001744620977514474,
"loss": 2.9265,
"step": 105450
},
{
"epoch": 28.391096037898365,
"grad_norm": 0.49454420804977417,
"learning_rate": 0.0001742601319509896,
"loss": 2.9277,
"step": 105500
},
{
"epoch": 28.404554263565892,
"grad_norm": 0.5095421075820923,
"learning_rate": 0.00017405816615053183,
"loss": 2.9204,
"step": 105550
},
{
"epoch": 28.41801248923342,
"grad_norm": 0.527397632598877,
"learning_rate": 0.00017385620035007403,
"loss": 2.9252,
"step": 105600
},
{
"epoch": 28.431470714900946,
"grad_norm": 0.4911178648471832,
"learning_rate": 0.00017365423454961622,
"loss": 2.9129,
"step": 105650
},
{
"epoch": 28.444928940568474,
"grad_norm": 0.5081452131271362,
"learning_rate": 0.00017345226874915847,
"loss": 2.922,
"step": 105700
},
{
"epoch": 28.458387166236005,
"grad_norm": 0.5088328123092651,
"learning_rate": 0.00017325030294870067,
"loss": 2.9274,
"step": 105750
},
{
"epoch": 28.471845391903532,
"grad_norm": 0.4922160804271698,
"learning_rate": 0.0001730483371482429,
"loss": 2.9178,
"step": 105800
},
{
"epoch": 28.48530361757106,
"grad_norm": 0.5186408162117004,
"learning_rate": 0.0001728463713477851,
"loss": 2.9271,
"step": 105850
},
{
"epoch": 28.498761843238587,
"grad_norm": 0.5223609209060669,
"learning_rate": 0.0001726444055473273,
"loss": 2.9204,
"step": 105900
},
{
"epoch": 28.512220068906114,
"grad_norm": 0.5327643752098083,
"learning_rate": 0.0001724424397468695,
"loss": 2.9196,
"step": 105950
},
{
"epoch": 28.525678294573645,
"grad_norm": 0.5054581761360168,
"learning_rate": 0.00017224047394641173,
"loss": 2.9167,
"step": 106000
},
{
"epoch": 28.525678294573645,
"eval_accuracy": 0.39750092675767795,
"eval_loss": 3.2850351333618164,
"eval_runtime": 146.736,
"eval_samples_per_second": 122.751,
"eval_steps_per_second": 7.674,
"step": 106000
},
{
"epoch": 28.539136520241172,
"grad_norm": 0.5137544870376587,
"learning_rate": 0.00017203850814595393,
"loss": 2.9188,
"step": 106050
},
{
"epoch": 28.5525947459087,
"grad_norm": 0.5124856233596802,
"learning_rate": 0.00017183654234549612,
"loss": 2.9283,
"step": 106100
},
{
"epoch": 28.566052971576227,
"grad_norm": 0.5131354928016663,
"learning_rate": 0.00017163457654503837,
"loss": 2.9297,
"step": 106150
},
{
"epoch": 28.579511197243754,
"grad_norm": 0.5123969912528992,
"learning_rate": 0.00017143261074458057,
"loss": 2.9228,
"step": 106200
},
{
"epoch": 28.592969422911285,
"grad_norm": 0.5282868146896362,
"learning_rate": 0.0001712306449441228,
"loss": 2.9372,
"step": 106250
},
{
"epoch": 28.606427648578812,
"grad_norm": 0.5144837498664856,
"learning_rate": 0.000171028679143665,
"loss": 2.9326,
"step": 106300
},
{
"epoch": 28.61988587424634,
"grad_norm": 0.5485707521438599,
"learning_rate": 0.0001708267133432072,
"loss": 2.9339,
"step": 106350
},
{
"epoch": 28.633344099913867,
"grad_norm": 0.5258968472480774,
"learning_rate": 0.0001706247475427494,
"loss": 2.9307,
"step": 106400
},
{
"epoch": 28.646802325581394,
"grad_norm": 0.49330660700798035,
"learning_rate": 0.0001704227817422916,
"loss": 2.9332,
"step": 106450
},
{
"epoch": 28.660260551248925,
"grad_norm": 0.4907183051109314,
"learning_rate": 0.00017022081594183383,
"loss": 2.9261,
"step": 106500
},
{
"epoch": 28.673718776916452,
"grad_norm": 0.496756374835968,
"learning_rate": 0.00017001885014137605,
"loss": 2.9219,
"step": 106550
},
{
"epoch": 28.68717700258398,
"grad_norm": 0.5075603723526001,
"learning_rate": 0.00016981688434091827,
"loss": 2.928,
"step": 106600
},
{
"epoch": 28.700635228251507,
"grad_norm": 0.4845956861972809,
"learning_rate": 0.00016961491854046047,
"loss": 2.9293,
"step": 106650
},
{
"epoch": 28.714093453919034,
"grad_norm": 0.5165157914161682,
"learning_rate": 0.0001694129527400027,
"loss": 2.9381,
"step": 106700
},
{
"epoch": 28.727551679586565,
"grad_norm": 0.539211094379425,
"learning_rate": 0.0001692109869395449,
"loss": 2.9374,
"step": 106750
},
{
"epoch": 28.741009905254092,
"grad_norm": 0.5208998322486877,
"learning_rate": 0.00016900902113908708,
"loss": 2.9233,
"step": 106800
},
{
"epoch": 28.75446813092162,
"grad_norm": 0.5117523670196533,
"learning_rate": 0.0001688070553386293,
"loss": 2.9272,
"step": 106850
},
{
"epoch": 28.767926356589147,
"grad_norm": 0.48024994134902954,
"learning_rate": 0.0001686050895381715,
"loss": 2.9297,
"step": 106900
},
{
"epoch": 28.781384582256674,
"grad_norm": 0.5294599533081055,
"learning_rate": 0.00016840312373771373,
"loss": 2.9464,
"step": 106950
},
{
"epoch": 28.794842807924205,
"grad_norm": 0.49581241607666016,
"learning_rate": 0.00016820115793725595,
"loss": 2.9433,
"step": 107000
},
{
"epoch": 28.794842807924205,
"eval_accuracy": 0.39789781372105487,
"eval_loss": 3.2776732444763184,
"eval_runtime": 146.8371,
"eval_samples_per_second": 122.667,
"eval_steps_per_second": 7.668,
"step": 107000
},
{
"epoch": 28.808301033591732,
"grad_norm": 0.4745948612689972,
"learning_rate": 0.00016799919213679817,
"loss": 2.9495,
"step": 107050
},
{
"epoch": 28.82175925925926,
"grad_norm": 0.5273579955101013,
"learning_rate": 0.00016779722633634037,
"loss": 2.9345,
"step": 107100
},
{
"epoch": 28.835217484926787,
"grad_norm": 0.5143641829490662,
"learning_rate": 0.0001675952605358826,
"loss": 2.9346,
"step": 107150
},
{
"epoch": 28.848675710594314,
"grad_norm": 0.4993511438369751,
"learning_rate": 0.0001673932947354248,
"loss": 2.9314,
"step": 107200
},
{
"epoch": 28.86213393626184,
"grad_norm": 0.5145397782325745,
"learning_rate": 0.00016719132893496698,
"loss": 2.943,
"step": 107250
},
{
"epoch": 28.875592161929372,
"grad_norm": 0.5847262740135193,
"learning_rate": 0.0001669893631345092,
"loss": 2.939,
"step": 107300
},
{
"epoch": 28.8890503875969,
"grad_norm": 0.5064164400100708,
"learning_rate": 0.0001667873973340514,
"loss": 2.9254,
"step": 107350
},
{
"epoch": 28.902508613264427,
"grad_norm": 0.5213847160339355,
"learning_rate": 0.00016658543153359363,
"loss": 2.9416,
"step": 107400
},
{
"epoch": 28.915966838931954,
"grad_norm": 0.5266076922416687,
"learning_rate": 0.00016638346573313585,
"loss": 2.9345,
"step": 107450
},
{
"epoch": 28.92942506459948,
"grad_norm": 0.5118159055709839,
"learning_rate": 0.00016618149993267807,
"loss": 2.9467,
"step": 107500
},
{
"epoch": 28.942883290267012,
"grad_norm": 0.5127436518669128,
"learning_rate": 0.00016597953413222027,
"loss": 2.9452,
"step": 107550
},
{
"epoch": 28.95634151593454,
"grad_norm": 0.5206359028816223,
"learning_rate": 0.00016577756833176247,
"loss": 2.9348,
"step": 107600
},
{
"epoch": 28.969799741602067,
"grad_norm": 0.5183207392692566,
"learning_rate": 0.0001655756025313047,
"loss": 2.9361,
"step": 107650
},
{
"epoch": 28.983257967269594,
"grad_norm": 0.489900141954422,
"learning_rate": 0.00016537363673084688,
"loss": 2.9478,
"step": 107700
},
{
"epoch": 28.99671619293712,
"grad_norm": 0.5312089323997498,
"learning_rate": 0.0001651716709303891,
"loss": 2.953,
"step": 107750
},
{
"epoch": 29.00995908699397,
"grad_norm": 0.48560231924057007,
"learning_rate": 0.0001649697051299313,
"loss": 2.8764,
"step": 107800
},
{
"epoch": 29.023417312661497,
"grad_norm": 0.5327926874160767,
"learning_rate": 0.0001647677393294735,
"loss": 2.8725,
"step": 107850
},
{
"epoch": 29.036875538329028,
"grad_norm": 0.48088765144348145,
"learning_rate": 0.00016456577352901575,
"loss": 2.8643,
"step": 107900
},
{
"epoch": 29.050333763996555,
"grad_norm": 0.5463573932647705,
"learning_rate": 0.00016436380772855797,
"loss": 2.8722,
"step": 107950
},
{
"epoch": 29.063791989664082,
"grad_norm": 0.5013686418533325,
"learning_rate": 0.00016416184192810017,
"loss": 2.8714,
"step": 108000
},
{
"epoch": 29.063791989664082,
"eval_accuracy": 0.39687685921931587,
"eval_loss": 3.2901217937469482,
"eval_runtime": 146.7973,
"eval_samples_per_second": 122.7,
"eval_steps_per_second": 7.67,
"step": 108000
},
{
"epoch": 29.07725021533161,
"grad_norm": 0.5061647891998291,
"learning_rate": 0.00016395987612764237,
"loss": 2.8791,
"step": 108050
},
{
"epoch": 29.090708440999137,
"grad_norm": 0.5251488089561462,
"learning_rate": 0.0001637579103271846,
"loss": 2.8695,
"step": 108100
},
{
"epoch": 29.104166666666668,
"grad_norm": 0.5041511058807373,
"learning_rate": 0.00016355594452672679,
"loss": 2.8829,
"step": 108150
},
{
"epoch": 29.117624892334195,
"grad_norm": 0.49148404598236084,
"learning_rate": 0.000163353978726269,
"loss": 2.8866,
"step": 108200
},
{
"epoch": 29.131083118001722,
"grad_norm": 0.5406279563903809,
"learning_rate": 0.0001631520129258112,
"loss": 2.8911,
"step": 108250
},
{
"epoch": 29.14454134366925,
"grad_norm": 0.5385282039642334,
"learning_rate": 0.0001629500471253534,
"loss": 2.8916,
"step": 108300
},
{
"epoch": 29.157999569336777,
"grad_norm": 0.5390298366546631,
"learning_rate": 0.00016274808132489565,
"loss": 2.8827,
"step": 108350
},
{
"epoch": 29.171457795004308,
"grad_norm": 0.5405099391937256,
"learning_rate": 0.00016254611552443785,
"loss": 2.8937,
"step": 108400
},
{
"epoch": 29.184916020671835,
"grad_norm": 0.5607936382293701,
"learning_rate": 0.00016234414972398007,
"loss": 2.8967,
"step": 108450
},
{
"epoch": 29.198374246339363,
"grad_norm": 0.5277544856071472,
"learning_rate": 0.00016214218392352227,
"loss": 2.913,
"step": 108500
},
{
"epoch": 29.21183247200689,
"grad_norm": 0.5103113055229187,
"learning_rate": 0.0001619402181230645,
"loss": 2.904,
"step": 108550
},
{
"epoch": 29.225290697674417,
"grad_norm": 0.5134518146514893,
"learning_rate": 0.00016173825232260669,
"loss": 2.8923,
"step": 108600
},
{
"epoch": 29.238748923341948,
"grad_norm": 0.5226130485534668,
"learning_rate": 0.00016153628652214888,
"loss": 2.8982,
"step": 108650
},
{
"epoch": 29.252207149009475,
"grad_norm": 0.5704584121704102,
"learning_rate": 0.0001613343207216911,
"loss": 2.8925,
"step": 108700
},
{
"epoch": 29.265665374677003,
"grad_norm": 0.4942656457424164,
"learning_rate": 0.0001611323549212333,
"loss": 2.8977,
"step": 108750
},
{
"epoch": 29.27912360034453,
"grad_norm": 0.5402054190635681,
"learning_rate": 0.00016093038912077555,
"loss": 2.9063,
"step": 108800
},
{
"epoch": 29.292581826012057,
"grad_norm": 0.5127511620521545,
"learning_rate": 0.00016072842332031775,
"loss": 2.9033,
"step": 108850
},
{
"epoch": 29.306040051679588,
"grad_norm": 0.529906690120697,
"learning_rate": 0.00016052645751985997,
"loss": 2.8883,
"step": 108900
},
{
"epoch": 29.319498277347115,
"grad_norm": 0.5000776648521423,
"learning_rate": 0.00016032449171940217,
"loss": 2.896,
"step": 108950
},
{
"epoch": 29.332956503014643,
"grad_norm": 0.5496556162834167,
"learning_rate": 0.0001601225259189444,
"loss": 2.897,
"step": 109000
},
{
"epoch": 29.332956503014643,
"eval_accuracy": 0.3971406537846263,
"eval_loss": 3.2881126403808594,
"eval_runtime": 146.7304,
"eval_samples_per_second": 122.756,
"eval_steps_per_second": 7.674,
"step": 109000
},
{
"epoch": 29.34641472868217,
"grad_norm": 0.4822516441345215,
"learning_rate": 0.00015992056011848659,
"loss": 2.916,
"step": 109050
},
{
"epoch": 29.359872954349697,
"grad_norm": 0.5103018283843994,
"learning_rate": 0.00015971859431802878,
"loss": 2.9114,
"step": 109100
},
{
"epoch": 29.373331180017228,
"grad_norm": 0.5238035917282104,
"learning_rate": 0.000159516628517571,
"loss": 2.906,
"step": 109150
},
{
"epoch": 29.386789405684755,
"grad_norm": 1.0078877210617065,
"learning_rate": 0.0001593146627171132,
"loss": 2.9091,
"step": 109200
},
{
"epoch": 29.400247631352283,
"grad_norm": 0.5114656090736389,
"learning_rate": 0.00015911269691665545,
"loss": 2.908,
"step": 109250
},
{
"epoch": 29.41370585701981,
"grad_norm": 0.4911574423313141,
"learning_rate": 0.00015891073111619765,
"loss": 2.9068,
"step": 109300
},
{
"epoch": 29.427164082687337,
"grad_norm": 0.5115836262702942,
"learning_rate": 0.00015870876531573987,
"loss": 2.9027,
"step": 109350
},
{
"epoch": 29.440622308354868,
"grad_norm": 0.538198709487915,
"learning_rate": 0.00015850679951528207,
"loss": 2.9203,
"step": 109400
},
{
"epoch": 29.454080534022395,
"grad_norm": 0.5085831880569458,
"learning_rate": 0.00015830483371482426,
"loss": 2.9104,
"step": 109450
},
{
"epoch": 29.467538759689923,
"grad_norm": 0.503837525844574,
"learning_rate": 0.00015810286791436649,
"loss": 2.9144,
"step": 109500
},
{
"epoch": 29.48099698535745,
"grad_norm": 0.5280073285102844,
"learning_rate": 0.00015790090211390868,
"loss": 2.9088,
"step": 109550
},
{
"epoch": 29.494455211024977,
"grad_norm": 0.5246075391769409,
"learning_rate": 0.0001576989363134509,
"loss": 2.9121,
"step": 109600
},
{
"epoch": 29.507913436692505,
"grad_norm": 0.5014354586601257,
"learning_rate": 0.0001574969705129931,
"loss": 2.911,
"step": 109650
},
{
"epoch": 29.521371662360036,
"grad_norm": 0.5218913555145264,
"learning_rate": 0.00015729500471253535,
"loss": 2.9129,
"step": 109700
},
{
"epoch": 29.534829888027563,
"grad_norm": 0.5031024217605591,
"learning_rate": 0.00015709303891207755,
"loss": 2.9214,
"step": 109750
},
{
"epoch": 29.54828811369509,
"grad_norm": 0.5340117812156677,
"learning_rate": 0.00015689107311161977,
"loss": 2.9132,
"step": 109800
},
{
"epoch": 29.561746339362617,
"grad_norm": 0.5272226929664612,
"learning_rate": 0.00015668910731116197,
"loss": 2.9138,
"step": 109850
},
{
"epoch": 29.575204565030145,
"grad_norm": 0.522051215171814,
"learning_rate": 0.00015648714151070416,
"loss": 2.9115,
"step": 109900
},
{
"epoch": 29.588662790697676,
"grad_norm": 0.5091174244880676,
"learning_rate": 0.0001562851757102464,
"loss": 2.909,
"step": 109950
},
{
"epoch": 29.602121016365203,
"grad_norm": 0.5172711610794067,
"learning_rate": 0.00015608320990978858,
"loss": 2.9142,
"step": 110000
},
{
"epoch": 29.602121016365203,
"eval_accuracy": 0.39755785770999374,
"eval_loss": 3.281320571899414,
"eval_runtime": 146.6744,
"eval_samples_per_second": 122.803,
"eval_steps_per_second": 7.677,
"step": 110000
},
{
"epoch": 29.602121016365203,
"step": 110000,
"total_flos": 2.29889824948224e+18,
"train_loss": 0.5332863204956054,
"train_runtime": 28956.9492,
"train_samples_per_second": 410.561,
"train_steps_per_second": 5.133
}
],
"logging_steps": 50,
"max_steps": 148640,
"num_input_tokens_seen": 0,
"num_train_epochs": 40,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 20
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.29889824948224e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}