exceptions / cost_to_hit_frequency_1001 / trainer_state.json
craa's picture
Upload folder using huggingface_hub
453f8fc verified
Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity, "... is not valid JSON
{
"best_global_step": 65000,
"best_metric": 3.518871307373047,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_hit_frequency_1001/checkpoint-30000",
"epoch": 20.0,
"eval_steps": 1000,
"global_step": 68660,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01456536937776742,
"grad_norm": 1.0411887168884277,
"learning_rate": 0.000294,
"loss": 8.4313,
"step": 50
},
{
"epoch": 0.02913073875553484,
"grad_norm": 0.9934611916542053,
"learning_rate": 0.0005939999999999999,
"loss": 6.7347,
"step": 100
},
{
"epoch": 0.04369610813330226,
"grad_norm": 0.6710580587387085,
"learning_rate": 0.0005995711785297549,
"loss": 6.3491,
"step": 150
},
{
"epoch": 0.05826147751106968,
"grad_norm": 0.46758192777633667,
"learning_rate": 0.0005991336056009335,
"loss": 6.1272,
"step": 200
},
{
"epoch": 0.0728268468888371,
"grad_norm": 0.5224136710166931,
"learning_rate": 0.000598696032672112,
"loss": 5.9887,
"step": 250
},
{
"epoch": 0.08739221626660452,
"grad_norm": 0.5374292135238647,
"learning_rate": 0.0005982584597432905,
"loss": 5.8537,
"step": 300
},
{
"epoch": 0.10195758564437195,
"grad_norm": 0.42533859610557556,
"learning_rate": 0.0005978208868144691,
"loss": 5.7305,
"step": 350
},
{
"epoch": 0.11652295502213936,
"grad_norm": 0.5247730016708374,
"learning_rate": 0.0005973833138856476,
"loss": 5.6139,
"step": 400
},
{
"epoch": 0.13108832439990678,
"grad_norm": 0.5447224974632263,
"learning_rate": 0.000596945740956826,
"loss": 5.4885,
"step": 450
},
{
"epoch": 0.1456536937776742,
"grad_norm": 0.5301450490951538,
"learning_rate": 0.0005965081680280046,
"loss": 5.4119,
"step": 500
},
{
"epoch": 0.16021906315544163,
"grad_norm": 0.5106812119483948,
"learning_rate": 0.0005960705950991831,
"loss": 5.3254,
"step": 550
},
{
"epoch": 0.17478443253320905,
"grad_norm": 0.42297908663749695,
"learning_rate": 0.0005956330221703616,
"loss": 5.2446,
"step": 600
},
{
"epoch": 0.18934980191097647,
"grad_norm": 0.47052502632141113,
"learning_rate": 0.0005951954492415402,
"loss": 5.1939,
"step": 650
},
{
"epoch": 0.2039151712887439,
"grad_norm": 0.4460456371307373,
"learning_rate": 0.0005947578763127188,
"loss": 5.1229,
"step": 700
},
{
"epoch": 0.2184805406665113,
"grad_norm": 0.46692177653312683,
"learning_rate": 0.0005943203033838973,
"loss": 5.0837,
"step": 750
},
{
"epoch": 0.23304591004427871,
"grad_norm": 0.4475383758544922,
"learning_rate": 0.0005938827304550758,
"loss": 5.0189,
"step": 800
},
{
"epoch": 0.24761127942204614,
"grad_norm": 0.4715788960456848,
"learning_rate": 0.0005934451575262544,
"loss": 4.9712,
"step": 850
},
{
"epoch": 0.26217664879981356,
"grad_norm": 0.5530597567558289,
"learning_rate": 0.0005930075845974328,
"loss": 4.9247,
"step": 900
},
{
"epoch": 0.276742018177581,
"grad_norm": 0.5195161700248718,
"learning_rate": 0.0005925700116686113,
"loss": 4.8776,
"step": 950
},
{
"epoch": 0.2913073875553484,
"grad_norm": 0.46087169647216797,
"learning_rate": 0.0005921324387397899,
"loss": 4.8133,
"step": 1000
},
{
"epoch": 0.2913073875553484,
"eval_accuracy": 0.2545722064591014,
"eval_loss": 4.754235744476318,
"eval_runtime": 180.1197,
"eval_samples_per_second": 92.405,
"eval_steps_per_second": 5.779,
"step": 1000
},
{
"epoch": 0.30587275693311583,
"grad_norm": 0.44081172347068787,
"learning_rate": 0.0005916948658109684,
"loss": 4.7792,
"step": 1050
},
{
"epoch": 0.32043812631088325,
"grad_norm": 0.48162946105003357,
"learning_rate": 0.000591257292882147,
"loss": 4.7359,
"step": 1100
},
{
"epoch": 0.3350034956886507,
"grad_norm": 0.4235544204711914,
"learning_rate": 0.0005908197199533255,
"loss": 4.6992,
"step": 1150
},
{
"epoch": 0.3495688650664181,
"grad_norm": 0.4740869402885437,
"learning_rate": 0.0005903821470245041,
"loss": 4.6654,
"step": 1200
},
{
"epoch": 0.3641342344441855,
"grad_norm": 0.4276205897331238,
"learning_rate": 0.0005899445740956826,
"loss": 4.6244,
"step": 1250
},
{
"epoch": 0.37869960382195295,
"grad_norm": 0.40895992517471313,
"learning_rate": 0.0005895070011668611,
"loss": 4.6068,
"step": 1300
},
{
"epoch": 0.39326497319972037,
"grad_norm": 0.4188133478164673,
"learning_rate": 0.0005890694282380397,
"loss": 4.5584,
"step": 1350
},
{
"epoch": 0.4078303425774878,
"grad_norm": 0.4317689538002014,
"learning_rate": 0.0005886318553092181,
"loss": 4.5422,
"step": 1400
},
{
"epoch": 0.42239571195525516,
"grad_norm": 0.40392470359802246,
"learning_rate": 0.0005881942823803966,
"loss": 4.5333,
"step": 1450
},
{
"epoch": 0.4369610813330226,
"grad_norm": 0.4244018793106079,
"learning_rate": 0.0005877567094515752,
"loss": 4.4986,
"step": 1500
},
{
"epoch": 0.45152645071079,
"grad_norm": 0.44470831751823425,
"learning_rate": 0.0005873191365227537,
"loss": 4.4928,
"step": 1550
},
{
"epoch": 0.46609182008855743,
"grad_norm": 0.4386588931083679,
"learning_rate": 0.0005868815635939323,
"loss": 4.4553,
"step": 1600
},
{
"epoch": 0.48065718946632485,
"grad_norm": 0.42980971932411194,
"learning_rate": 0.0005864439906651108,
"loss": 4.4385,
"step": 1650
},
{
"epoch": 0.4952225588440923,
"grad_norm": 0.3935016691684723,
"learning_rate": 0.0005860064177362894,
"loss": 4.4327,
"step": 1700
},
{
"epoch": 0.5097879282218597,
"grad_norm": 0.4373241662979126,
"learning_rate": 0.0005855688448074679,
"loss": 4.4099,
"step": 1750
},
{
"epoch": 0.5243532975996271,
"grad_norm": 0.4172551929950714,
"learning_rate": 0.0005851312718786464,
"loss": 4.3901,
"step": 1800
},
{
"epoch": 0.5389186669773945,
"grad_norm": 0.40378788113594055,
"learning_rate": 0.0005846936989498249,
"loss": 4.383,
"step": 1850
},
{
"epoch": 0.553484036355162,
"grad_norm": 0.38236093521118164,
"learning_rate": 0.0005842561260210034,
"loss": 4.3598,
"step": 1900
},
{
"epoch": 0.5680494057329294,
"grad_norm": 0.381078839302063,
"learning_rate": 0.000583818553092182,
"loss": 4.3638,
"step": 1950
},
{
"epoch": 0.5826147751106968,
"grad_norm": 0.4327857196331024,
"learning_rate": 0.0005833809801633605,
"loss": 4.3432,
"step": 2000
},
{
"epoch": 0.5826147751106968,
"eval_accuracy": 0.29888864119390235,
"eval_loss": 4.287432670593262,
"eval_runtime": 180.4505,
"eval_samples_per_second": 92.236,
"eval_steps_per_second": 5.769,
"step": 2000
},
{
"epoch": 0.5971801444884642,
"grad_norm": 0.4143087863922119,
"learning_rate": 0.000582943407234539,
"loss": 4.329,
"step": 2050
},
{
"epoch": 0.6117455138662317,
"grad_norm": 0.3753448724746704,
"learning_rate": 0.0005825058343057176,
"loss": 4.3033,
"step": 2100
},
{
"epoch": 0.6263108832439991,
"grad_norm": 0.40621188282966614,
"learning_rate": 0.0005820682613768961,
"loss": 4.3041,
"step": 2150
},
{
"epoch": 0.6408762526217665,
"grad_norm": 0.40833911299705505,
"learning_rate": 0.0005816306884480747,
"loss": 4.2834,
"step": 2200
},
{
"epoch": 0.6554416219995339,
"grad_norm": 0.4088577628135681,
"learning_rate": 0.0005811931155192532,
"loss": 4.2804,
"step": 2250
},
{
"epoch": 0.6700069913773014,
"grad_norm": 0.3746855556964874,
"learning_rate": 0.0005807555425904316,
"loss": 4.2762,
"step": 2300
},
{
"epoch": 0.6845723607550688,
"grad_norm": 0.3618931770324707,
"learning_rate": 0.0005803179696616102,
"loss": 4.2598,
"step": 2350
},
{
"epoch": 0.6991377301328362,
"grad_norm": 0.3690814971923828,
"learning_rate": 0.0005798803967327887,
"loss": 4.2413,
"step": 2400
},
{
"epoch": 0.7137030995106036,
"grad_norm": 0.40264639258384705,
"learning_rate": 0.0005794428238039673,
"loss": 4.2375,
"step": 2450
},
{
"epoch": 0.728268468888371,
"grad_norm": 0.4249323606491089,
"learning_rate": 0.0005790052508751458,
"loss": 4.233,
"step": 2500
},
{
"epoch": 0.7428338382661385,
"grad_norm": 0.39969372749328613,
"learning_rate": 0.0005785676779463243,
"loss": 4.2202,
"step": 2550
},
{
"epoch": 0.7573992076439059,
"grad_norm": 0.3819160759449005,
"learning_rate": 0.0005781301050175029,
"loss": 4.2199,
"step": 2600
},
{
"epoch": 0.7719645770216733,
"grad_norm": 0.361541211605072,
"learning_rate": 0.0005776925320886814,
"loss": 4.204,
"step": 2650
},
{
"epoch": 0.7865299463994407,
"grad_norm": 0.3613761365413666,
"learning_rate": 0.00057725495915986,
"loss": 4.1961,
"step": 2700
},
{
"epoch": 0.8010953157772082,
"grad_norm": 0.4024335741996765,
"learning_rate": 0.0005768173862310384,
"loss": 4.1899,
"step": 2750
},
{
"epoch": 0.8156606851549756,
"grad_norm": 0.34226447343826294,
"learning_rate": 0.0005763798133022169,
"loss": 4.1648,
"step": 2800
},
{
"epoch": 0.8302260545327429,
"grad_norm": 0.3609713315963745,
"learning_rate": 0.0005759422403733955,
"loss": 4.1756,
"step": 2850
},
{
"epoch": 0.8447914239105103,
"grad_norm": 0.37800899147987366,
"learning_rate": 0.000575504667444574,
"loss": 4.1618,
"step": 2900
},
{
"epoch": 0.8593567932882777,
"grad_norm": 0.35399892926216125,
"learning_rate": 0.0005750670945157526,
"loss": 4.151,
"step": 2950
},
{
"epoch": 0.8739221626660452,
"grad_norm": 0.35685768723487854,
"learning_rate": 0.0005746295215869311,
"loss": 4.1316,
"step": 3000
},
{
"epoch": 0.8739221626660452,
"eval_accuracy": 0.3165242106956263,
"eval_loss": 4.094448566436768,
"eval_runtime": 180.2904,
"eval_samples_per_second": 92.318,
"eval_steps_per_second": 5.774,
"step": 3000
},
{
"epoch": 0.8884875320438126,
"grad_norm": 0.3725755214691162,
"learning_rate": 0.0005741919486581096,
"loss": 4.1422,
"step": 3050
},
{
"epoch": 0.90305290142158,
"grad_norm": 0.3675600290298462,
"learning_rate": 0.0005737543757292882,
"loss": 4.1361,
"step": 3100
},
{
"epoch": 0.9176182707993474,
"grad_norm": 0.3458426892757416,
"learning_rate": 0.0005733168028004667,
"loss": 4.1292,
"step": 3150
},
{
"epoch": 0.9321836401771149,
"grad_norm": 0.35508471727371216,
"learning_rate": 0.0005728792298716453,
"loss": 4.1173,
"step": 3200
},
{
"epoch": 0.9467490095548823,
"grad_norm": 0.3473420739173889,
"learning_rate": 0.0005724416569428237,
"loss": 4.1051,
"step": 3250
},
{
"epoch": 0.9613143789326497,
"grad_norm": 0.38041558861732483,
"learning_rate": 0.0005720040840140023,
"loss": 4.1141,
"step": 3300
},
{
"epoch": 0.9758797483104171,
"grad_norm": 0.36010000109672546,
"learning_rate": 0.0005715665110851808,
"loss": 4.1036,
"step": 3350
},
{
"epoch": 0.9904451176881846,
"grad_norm": 0.35879573225975037,
"learning_rate": 0.0005711289381563593,
"loss": 4.099,
"step": 3400
},
{
"epoch": 1.0049522255884409,
"grad_norm": 0.36708125472068787,
"learning_rate": 0.0005706913652275379,
"loss": 4.0613,
"step": 3450
},
{
"epoch": 1.0195175949662083,
"grad_norm": 0.36130914092063904,
"learning_rate": 0.0005702537922987164,
"loss": 4.023,
"step": 3500
},
{
"epoch": 1.0340829643439757,
"grad_norm": 0.3636951446533203,
"learning_rate": 0.0005698162193698949,
"loss": 4.0197,
"step": 3550
},
{
"epoch": 1.0486483337217432,
"grad_norm": 0.3318287432193756,
"learning_rate": 0.0005693786464410735,
"loss": 4.009,
"step": 3600
},
{
"epoch": 1.0632137030995106,
"grad_norm": 0.3470671474933624,
"learning_rate": 0.000568941073512252,
"loss": 3.9984,
"step": 3650
},
{
"epoch": 1.077779072477278,
"grad_norm": 0.3891682028770447,
"learning_rate": 0.0005685035005834305,
"loss": 4.0067,
"step": 3700
},
{
"epoch": 1.0923444418550454,
"grad_norm": 0.3590176999568939,
"learning_rate": 0.000568065927654609,
"loss": 4.011,
"step": 3750
},
{
"epoch": 1.1069098112328128,
"grad_norm": 0.3605038821697235,
"learning_rate": 0.0005676283547257876,
"loss": 4.0014,
"step": 3800
},
{
"epoch": 1.1214751806105803,
"grad_norm": 0.35803160071372986,
"learning_rate": 0.0005671907817969661,
"loss": 3.9972,
"step": 3850
},
{
"epoch": 1.1360405499883477,
"grad_norm": 0.3601053059101105,
"learning_rate": 0.0005667532088681446,
"loss": 3.9894,
"step": 3900
},
{
"epoch": 1.1506059193661151,
"grad_norm": 0.3761005699634552,
"learning_rate": 0.0005663156359393232,
"loss": 3.9892,
"step": 3950
},
{
"epoch": 1.1651712887438825,
"grad_norm": 0.34091663360595703,
"learning_rate": 0.0005658780630105017,
"loss": 3.9856,
"step": 4000
},
{
"epoch": 1.1651712887438825,
"eval_accuracy": 0.3262635618883952,
"eval_loss": 3.9839839935302734,
"eval_runtime": 180.3348,
"eval_samples_per_second": 92.295,
"eval_steps_per_second": 5.773,
"step": 4000
},
{
"epoch": 1.17973665812165,
"grad_norm": 0.3509597182273865,
"learning_rate": 0.0005654404900816802,
"loss": 3.9952,
"step": 4050
},
{
"epoch": 1.1943020274994174,
"grad_norm": 0.35156598687171936,
"learning_rate": 0.0005650029171528588,
"loss": 3.9769,
"step": 4100
},
{
"epoch": 1.2088673968771848,
"grad_norm": 0.34221357107162476,
"learning_rate": 0.0005645653442240373,
"loss": 3.9839,
"step": 4150
},
{
"epoch": 1.2234327662549522,
"grad_norm": 0.3706187307834625,
"learning_rate": 0.0005641277712952158,
"loss": 3.9845,
"step": 4200
},
{
"epoch": 1.2379981356327197,
"grad_norm": 0.3384045660495758,
"learning_rate": 0.0005636901983663943,
"loss": 3.9635,
"step": 4250
},
{
"epoch": 1.252563505010487,
"grad_norm": 0.36682382225990295,
"learning_rate": 0.0005632526254375729,
"loss": 3.9741,
"step": 4300
},
{
"epoch": 1.2671288743882545,
"grad_norm": 0.3488970398902893,
"learning_rate": 0.0005628150525087514,
"loss": 3.9682,
"step": 4350
},
{
"epoch": 1.281694243766022,
"grad_norm": 0.3281860053539276,
"learning_rate": 0.0005623774795799299,
"loss": 3.9527,
"step": 4400
},
{
"epoch": 1.2962596131437893,
"grad_norm": 0.3465306758880615,
"learning_rate": 0.0005619399066511085,
"loss": 3.9529,
"step": 4450
},
{
"epoch": 1.3108249825215568,
"grad_norm": 0.3299737870693207,
"learning_rate": 0.000561502333722287,
"loss": 3.9559,
"step": 4500
},
{
"epoch": 1.3253903518993242,
"grad_norm": 0.3362690508365631,
"learning_rate": 0.0005610647607934655,
"loss": 3.9554,
"step": 4550
},
{
"epoch": 1.3399557212770916,
"grad_norm": 0.34816160798072815,
"learning_rate": 0.000560627187864644,
"loss": 3.951,
"step": 4600
},
{
"epoch": 1.354521090654859,
"grad_norm": 0.3506231904029846,
"learning_rate": 0.0005601896149358226,
"loss": 3.9438,
"step": 4650
},
{
"epoch": 1.3690864600326265,
"grad_norm": 0.34532177448272705,
"learning_rate": 0.0005597520420070011,
"loss": 3.9489,
"step": 4700
},
{
"epoch": 1.3836518294103939,
"grad_norm": 0.3467022180557251,
"learning_rate": 0.0005593144690781796,
"loss": 3.9438,
"step": 4750
},
{
"epoch": 1.3982171987881613,
"grad_norm": 0.3695443272590637,
"learning_rate": 0.0005588768961493582,
"loss": 3.9331,
"step": 4800
},
{
"epoch": 1.4127825681659287,
"grad_norm": 0.33018758893013,
"learning_rate": 0.0005584393232205367,
"loss": 3.9295,
"step": 4850
},
{
"epoch": 1.4273479375436962,
"grad_norm": 0.3564456105232239,
"learning_rate": 0.0005580017502917152,
"loss": 3.9395,
"step": 4900
},
{
"epoch": 1.4419133069214636,
"grad_norm": 0.3350387215614319,
"learning_rate": 0.0005575641773628938,
"loss": 3.9266,
"step": 4950
},
{
"epoch": 1.456478676299231,
"grad_norm": 0.33804184198379517,
"learning_rate": 0.0005571266044340723,
"loss": 3.9435,
"step": 5000
},
{
"epoch": 1.456478676299231,
"eval_accuracy": 0.33277813599489436,
"eval_loss": 3.9095029830932617,
"eval_runtime": 180.2432,
"eval_samples_per_second": 92.342,
"eval_steps_per_second": 5.776,
"step": 5000
},
{
"epoch": 1.4710440456769984,
"grad_norm": 0.33603259921073914,
"learning_rate": 0.0005566890315052507,
"loss": 3.9152,
"step": 5050
},
{
"epoch": 1.4856094150547658,
"grad_norm": 0.3263108730316162,
"learning_rate": 0.0005562514585764293,
"loss": 3.9295,
"step": 5100
},
{
"epoch": 1.500174784432533,
"grad_norm": 0.3421551287174225,
"learning_rate": 0.0005558138856476079,
"loss": 3.9092,
"step": 5150
},
{
"epoch": 1.5147401538103007,
"grad_norm": 0.3283444344997406,
"learning_rate": 0.0005553763127187864,
"loss": 3.9035,
"step": 5200
},
{
"epoch": 1.529305523188068,
"grad_norm": 0.34648576378822327,
"learning_rate": 0.0005549387397899649,
"loss": 3.9155,
"step": 5250
},
{
"epoch": 1.5438708925658355,
"grad_norm": 0.3276433050632477,
"learning_rate": 0.0005545011668611435,
"loss": 3.9183,
"step": 5300
},
{
"epoch": 1.5584362619436027,
"grad_norm": 0.333248108625412,
"learning_rate": 0.000554063593932322,
"loss": 3.9177,
"step": 5350
},
{
"epoch": 1.5730016313213704,
"grad_norm": 0.337734580039978,
"learning_rate": 0.0005536260210035005,
"loss": 3.9095,
"step": 5400
},
{
"epoch": 1.5875670006991376,
"grad_norm": 0.3482300043106079,
"learning_rate": 0.0005531884480746791,
"loss": 3.8858,
"step": 5450
},
{
"epoch": 1.6021323700769052,
"grad_norm": 0.3184448480606079,
"learning_rate": 0.0005527508751458577,
"loss": 3.8971,
"step": 5500
},
{
"epoch": 1.6166977394546724,
"grad_norm": 0.3283264935016632,
"learning_rate": 0.0005523133022170361,
"loss": 3.877,
"step": 5550
},
{
"epoch": 1.63126310883244,
"grad_norm": 0.34048277139663696,
"learning_rate": 0.0005518757292882146,
"loss": 3.8963,
"step": 5600
},
{
"epoch": 1.6458284782102073,
"grad_norm": 0.32839393615722656,
"learning_rate": 0.0005514381563593932,
"loss": 3.8916,
"step": 5650
},
{
"epoch": 1.660393847587975,
"grad_norm": 0.33673906326293945,
"learning_rate": 0.0005510005834305717,
"loss": 3.8848,
"step": 5700
},
{
"epoch": 1.6749592169657421,
"grad_norm": 0.34184661507606506,
"learning_rate": 0.0005505630105017502,
"loss": 3.8913,
"step": 5750
},
{
"epoch": 1.6895245863435098,
"grad_norm": 0.3328911364078522,
"learning_rate": 0.0005501254375729288,
"loss": 3.8844,
"step": 5800
},
{
"epoch": 1.704089955721277,
"grad_norm": 0.34918013215065,
"learning_rate": 0.0005496878646441073,
"loss": 3.8826,
"step": 5850
},
{
"epoch": 1.7186553250990446,
"grad_norm": 0.3278209865093231,
"learning_rate": 0.0005492502917152858,
"loss": 3.8737,
"step": 5900
},
{
"epoch": 1.7332206944768118,
"grad_norm": 0.32764044404029846,
"learning_rate": 0.0005488127187864644,
"loss": 3.8653,
"step": 5950
},
{
"epoch": 1.7477860638545795,
"grad_norm": 0.320593923330307,
"learning_rate": 0.000548375145857643,
"loss": 3.8665,
"step": 6000
},
{
"epoch": 1.7477860638545795,
"eval_accuracy": 0.33797937798145206,
"eval_loss": 3.8514480590820312,
"eval_runtime": 180.396,
"eval_samples_per_second": 92.264,
"eval_steps_per_second": 5.771,
"step": 6000
},
{
"epoch": 1.7623514332323467,
"grad_norm": 0.32193249464035034,
"learning_rate": 0.0005479375729288214,
"loss": 3.863,
"step": 6050
},
{
"epoch": 1.7769168026101143,
"grad_norm": 0.35405752062797546,
"learning_rate": 0.0005474999999999999,
"loss": 3.8648,
"step": 6100
},
{
"epoch": 1.7914821719878815,
"grad_norm": 0.3290136754512787,
"learning_rate": 0.0005470624270711785,
"loss": 3.8579,
"step": 6150
},
{
"epoch": 1.8060475413656492,
"grad_norm": 0.3399069607257843,
"learning_rate": 0.000546624854142357,
"loss": 3.8711,
"step": 6200
},
{
"epoch": 1.8206129107434164,
"grad_norm": 0.333492249250412,
"learning_rate": 0.0005461872812135355,
"loss": 3.8686,
"step": 6250
},
{
"epoch": 1.835178280121184,
"grad_norm": 0.3360602557659149,
"learning_rate": 0.0005457497082847141,
"loss": 3.8639,
"step": 6300
},
{
"epoch": 1.8497436494989512,
"grad_norm": 0.349657267332077,
"learning_rate": 0.0005453121353558927,
"loss": 3.8582,
"step": 6350
},
{
"epoch": 1.8643090188767188,
"grad_norm": 0.31816044449806213,
"learning_rate": 0.0005448745624270712,
"loss": 3.8605,
"step": 6400
},
{
"epoch": 1.878874388254486,
"grad_norm": 0.3400065004825592,
"learning_rate": 0.0005444369894982496,
"loss": 3.8617,
"step": 6450
},
{
"epoch": 1.8934397576322537,
"grad_norm": 0.3279556632041931,
"learning_rate": 0.0005439994165694282,
"loss": 3.8528,
"step": 6500
},
{
"epoch": 1.908005127010021,
"grad_norm": 0.33743831515312195,
"learning_rate": 0.0005435618436406067,
"loss": 3.8504,
"step": 6550
},
{
"epoch": 1.9225704963877885,
"grad_norm": 0.3401290476322174,
"learning_rate": 0.0005431242707117852,
"loss": 3.8496,
"step": 6600
},
{
"epoch": 1.9371358657655557,
"grad_norm": 0.3282126486301422,
"learning_rate": 0.0005426866977829638,
"loss": 3.8469,
"step": 6650
},
{
"epoch": 1.9517012351433234,
"grad_norm": 0.3605695068836212,
"learning_rate": 0.0005422491248541423,
"loss": 3.858,
"step": 6700
},
{
"epoch": 1.9662666045210906,
"grad_norm": 0.32521483302116394,
"learning_rate": 0.0005418115519253208,
"loss": 3.852,
"step": 6750
},
{
"epoch": 1.9808319738988582,
"grad_norm": 0.33284640312194824,
"learning_rate": 0.0005413739789964994,
"loss": 3.8486,
"step": 6800
},
{
"epoch": 1.9953973432766254,
"grad_norm": 0.3308689296245575,
"learning_rate": 0.000540936406067678,
"loss": 3.8482,
"step": 6850
},
{
"epoch": 2.0099044511768818,
"grad_norm": 0.33800598978996277,
"learning_rate": 0.0005404988331388564,
"loss": 3.77,
"step": 6900
},
{
"epoch": 2.0244698205546494,
"grad_norm": 0.3277951180934906,
"learning_rate": 0.0005400612602100349,
"loss": 3.7368,
"step": 6950
},
{
"epoch": 2.0390351899324166,
"grad_norm": 0.3203679919242859,
"learning_rate": 0.0005396236872812135,
"loss": 3.7491,
"step": 7000
},
{
"epoch": 2.0390351899324166,
"eval_accuracy": 0.34216722609676753,
"eval_loss": 3.8109800815582275,
"eval_runtime": 180.255,
"eval_samples_per_second": 92.336,
"eval_steps_per_second": 5.775,
"step": 7000
},
{
"epoch": 2.0536005593101843,
"grad_norm": 0.35276371240615845,
"learning_rate": 0.000539186114352392,
"loss": 3.7337,
"step": 7050
},
{
"epoch": 2.0681659286879515,
"grad_norm": 0.35237714648246765,
"learning_rate": 0.0005387485414235705,
"loss": 3.7357,
"step": 7100
},
{
"epoch": 2.082731298065719,
"grad_norm": 0.3209347426891327,
"learning_rate": 0.0005383109684947491,
"loss": 3.7583,
"step": 7150
},
{
"epoch": 2.0972966674434863,
"grad_norm": 0.32085931301116943,
"learning_rate": 0.0005378733955659276,
"loss": 3.7539,
"step": 7200
},
{
"epoch": 2.111862036821254,
"grad_norm": 0.31919988989830017,
"learning_rate": 0.0005374358226371061,
"loss": 3.7393,
"step": 7250
},
{
"epoch": 2.126427406199021,
"grad_norm": 0.3325698673725128,
"learning_rate": 0.0005369982497082847,
"loss": 3.751,
"step": 7300
},
{
"epoch": 2.140992775576789,
"grad_norm": 0.32088345289230347,
"learning_rate": 0.0005365606767794633,
"loss": 3.7441,
"step": 7350
},
{
"epoch": 2.155558144954556,
"grad_norm": 0.31885406374931335,
"learning_rate": 0.0005361231038506417,
"loss": 3.7468,
"step": 7400
},
{
"epoch": 2.1701235143323236,
"grad_norm": 0.32321396470069885,
"learning_rate": 0.0005356855309218202,
"loss": 3.7511,
"step": 7450
},
{
"epoch": 2.184688883710091,
"grad_norm": 0.339028924703598,
"learning_rate": 0.0005352479579929988,
"loss": 3.7568,
"step": 7500
},
{
"epoch": 2.1992542530878585,
"grad_norm": 0.3378174901008606,
"learning_rate": 0.0005348103850641773,
"loss": 3.7351,
"step": 7550
},
{
"epoch": 2.2138196224656257,
"grad_norm": 0.32842838764190674,
"learning_rate": 0.0005343728121353558,
"loss": 3.7614,
"step": 7600
},
{
"epoch": 2.2283849918433933,
"grad_norm": 0.3337772488594055,
"learning_rate": 0.0005339352392065344,
"loss": 3.7502,
"step": 7650
},
{
"epoch": 2.2429503612211605,
"grad_norm": 0.31574419140815735,
"learning_rate": 0.000533497666277713,
"loss": 3.7559,
"step": 7700
},
{
"epoch": 2.257515730598928,
"grad_norm": 0.3204760253429413,
"learning_rate": 0.0005330600933488915,
"loss": 3.7382,
"step": 7750
},
{
"epoch": 2.2720810999766954,
"grad_norm": 0.33120566606521606,
"learning_rate": 0.00053262252042007,
"loss": 3.7488,
"step": 7800
},
{
"epoch": 2.286646469354463,
"grad_norm": 0.3328082263469696,
"learning_rate": 0.0005321849474912485,
"loss": 3.7518,
"step": 7850
},
{
"epoch": 2.3012118387322302,
"grad_norm": 0.3446897268295288,
"learning_rate": 0.000531747374562427,
"loss": 3.7421,
"step": 7900
},
{
"epoch": 2.3157772081099974,
"grad_norm": 0.3277474641799927,
"learning_rate": 0.0005313098016336055,
"loss": 3.7376,
"step": 7950
},
{
"epoch": 2.330342577487765,
"grad_norm": 0.32570740580558777,
"learning_rate": 0.0005308722287047841,
"loss": 3.7416,
"step": 8000
},
{
"epoch": 2.330342577487765,
"eval_accuracy": 0.3451675491976329,
"eval_loss": 3.7787580490112305,
"eval_runtime": 180.355,
"eval_samples_per_second": 92.285,
"eval_steps_per_second": 5.772,
"step": 8000
},
{
"epoch": 2.3449079468655327,
"grad_norm": 0.3236296474933624,
"learning_rate": 0.0005304346557759626,
"loss": 3.737,
"step": 8050
},
{
"epoch": 2.3594733162433,
"grad_norm": 0.3300062417984009,
"learning_rate": 0.0005299970828471411,
"loss": 3.74,
"step": 8100
},
{
"epoch": 2.374038685621067,
"grad_norm": 0.3218088746070862,
"learning_rate": 0.0005295595099183197,
"loss": 3.7618,
"step": 8150
},
{
"epoch": 2.3886040549988348,
"grad_norm": 0.32456105947494507,
"learning_rate": 0.0005291219369894983,
"loss": 3.7461,
"step": 8200
},
{
"epoch": 2.4031694243766024,
"grad_norm": 0.3256712257862091,
"learning_rate": 0.0005286843640606768,
"loss": 3.7343,
"step": 8250
},
{
"epoch": 2.4177347937543696,
"grad_norm": 0.3265218734741211,
"learning_rate": 0.0005282467911318552,
"loss": 3.7396,
"step": 8300
},
{
"epoch": 2.432300163132137,
"grad_norm": 0.3039201498031616,
"learning_rate": 0.0005278092182030338,
"loss": 3.7405,
"step": 8350
},
{
"epoch": 2.4468655325099045,
"grad_norm": 0.3367139995098114,
"learning_rate": 0.0005273716452742123,
"loss": 3.7426,
"step": 8400
},
{
"epoch": 2.461430901887672,
"grad_norm": 0.314224511384964,
"learning_rate": 0.0005269340723453908,
"loss": 3.7412,
"step": 8450
},
{
"epoch": 2.4759962712654393,
"grad_norm": 0.3330950140953064,
"learning_rate": 0.0005264964994165694,
"loss": 3.7463,
"step": 8500
},
{
"epoch": 2.4905616406432065,
"grad_norm": 0.3340461552143097,
"learning_rate": 0.000526058926487748,
"loss": 3.7469,
"step": 8550
},
{
"epoch": 2.505127010020974,
"grad_norm": 0.3362635672092438,
"learning_rate": 0.0005256213535589265,
"loss": 3.7338,
"step": 8600
},
{
"epoch": 2.519692379398742,
"grad_norm": 0.3297460377216339,
"learning_rate": 0.000525183780630105,
"loss": 3.7498,
"step": 8650
},
{
"epoch": 2.534257748776509,
"grad_norm": 0.3183857500553131,
"learning_rate": 0.0005247462077012836,
"loss": 3.7391,
"step": 8700
},
{
"epoch": 2.548823118154276,
"grad_norm": 0.33508941531181335,
"learning_rate": 0.000524308634772462,
"loss": 3.7348,
"step": 8750
},
{
"epoch": 2.563388487532044,
"grad_norm": 0.3083733022212982,
"learning_rate": 0.0005238710618436405,
"loss": 3.7285,
"step": 8800
},
{
"epoch": 2.5779538569098115,
"grad_norm": 0.31876590847969055,
"learning_rate": 0.0005234334889148191,
"loss": 3.7443,
"step": 8850
},
{
"epoch": 2.5925192262875787,
"grad_norm": 0.3193049430847168,
"learning_rate": 0.0005229959159859976,
"loss": 3.7422,
"step": 8900
},
{
"epoch": 2.607084595665346,
"grad_norm": 0.32590124011039734,
"learning_rate": 0.0005225583430571761,
"loss": 3.7424,
"step": 8950
},
{
"epoch": 2.6216499650431135,
"grad_norm": 0.3363872170448303,
"learning_rate": 0.0005221207701283547,
"loss": 3.728,
"step": 9000
},
{
"epoch": 2.6216499650431135,
"eval_accuracy": 0.3480742812181514,
"eval_loss": 3.7482504844665527,
"eval_runtime": 180.2179,
"eval_samples_per_second": 92.355,
"eval_steps_per_second": 5.776,
"step": 9000
},
{
"epoch": 2.636215334420881,
"grad_norm": 0.3201189935207367,
"learning_rate": 0.0005216831971995333,
"loss": 3.7477,
"step": 9050
},
{
"epoch": 2.6507807037986484,
"grad_norm": 0.33287513256073,
"learning_rate": 0.0005212456242707118,
"loss": 3.7203,
"step": 9100
},
{
"epoch": 2.6653460731764156,
"grad_norm": 0.3236483335494995,
"learning_rate": 0.0005208080513418903,
"loss": 3.7265,
"step": 9150
},
{
"epoch": 2.6799114425541832,
"grad_norm": 0.3180456757545471,
"learning_rate": 0.0005203704784130689,
"loss": 3.7303,
"step": 9200
},
{
"epoch": 2.6944768119319504,
"grad_norm": 0.3273324966430664,
"learning_rate": 0.0005199329054842473,
"loss": 3.7266,
"step": 9250
},
{
"epoch": 2.709042181309718,
"grad_norm": 0.3243292272090912,
"learning_rate": 0.0005194953325554258,
"loss": 3.7301,
"step": 9300
},
{
"epoch": 2.7236075506874853,
"grad_norm": 0.32646605372428894,
"learning_rate": 0.0005190577596266044,
"loss": 3.7284,
"step": 9350
},
{
"epoch": 2.738172920065253,
"grad_norm": 0.3168424665927887,
"learning_rate": 0.0005186201866977829,
"loss": 3.7384,
"step": 9400
},
{
"epoch": 2.75273828944302,
"grad_norm": 0.3341065049171448,
"learning_rate": 0.0005181826137689614,
"loss": 3.7279,
"step": 9450
},
{
"epoch": 2.7673036588207878,
"grad_norm": 0.3197799623012543,
"learning_rate": 0.00051774504084014,
"loss": 3.7302,
"step": 9500
},
{
"epoch": 2.781869028198555,
"grad_norm": 0.31474462151527405,
"learning_rate": 0.0005173074679113186,
"loss": 3.735,
"step": 9550
},
{
"epoch": 2.7964343975763226,
"grad_norm": 0.3133241832256317,
"learning_rate": 0.0005168698949824971,
"loss": 3.7139,
"step": 9600
},
{
"epoch": 2.81099976695409,
"grad_norm": 0.31363457441329956,
"learning_rate": 0.0005164323220536755,
"loss": 3.7076,
"step": 9650
},
{
"epoch": 2.8255651363318575,
"grad_norm": 0.32894420623779297,
"learning_rate": 0.0005159947491248541,
"loss": 3.717,
"step": 9700
},
{
"epoch": 2.8401305057096247,
"grad_norm": 0.33178263902664185,
"learning_rate": 0.0005155571761960326,
"loss": 3.7305,
"step": 9750
},
{
"epoch": 2.8546958750873923,
"grad_norm": 0.31269919872283936,
"learning_rate": 0.0005151196032672111,
"loss": 3.7172,
"step": 9800
},
{
"epoch": 2.8692612444651595,
"grad_norm": 0.32776308059692383,
"learning_rate": 0.0005146820303383897,
"loss": 3.7171,
"step": 9850
},
{
"epoch": 2.883826613842927,
"grad_norm": 0.3176999092102051,
"learning_rate": 0.0005142444574095682,
"loss": 3.6958,
"step": 9900
},
{
"epoch": 2.8983919832206944,
"grad_norm": 0.3453384339809418,
"learning_rate": 0.0005138068844807468,
"loss": 3.7037,
"step": 9950
},
{
"epoch": 2.912957352598462,
"grad_norm": 0.31886717677116394,
"learning_rate": 0.0005133693115519253,
"loss": 3.6958,
"step": 10000
},
{
"epoch": 2.912957352598462,
"eval_accuracy": 0.3504000665954622,
"eval_loss": 3.722299575805664,
"eval_runtime": 180.4701,
"eval_samples_per_second": 92.226,
"eval_steps_per_second": 5.768,
"step": 10000
},
{
"epoch": 2.927522721976229,
"grad_norm": 0.3301357626914978,
"learning_rate": 0.0005129317386231039,
"loss": 3.7174,
"step": 10050
},
{
"epoch": 2.942088091353997,
"grad_norm": 0.31266558170318604,
"learning_rate": 0.0005124941656942824,
"loss": 3.7087,
"step": 10100
},
{
"epoch": 2.956653460731764,
"grad_norm": 0.2986539602279663,
"learning_rate": 0.0005120565927654608,
"loss": 3.7032,
"step": 10150
},
{
"epoch": 2.9712188301095317,
"grad_norm": 0.3215900659561157,
"learning_rate": 0.0005116190198366394,
"loss": 3.7156,
"step": 10200
},
{
"epoch": 2.985784199487299,
"grad_norm": 0.34506484866142273,
"learning_rate": 0.0005111814469078179,
"loss": 3.7182,
"step": 10250
},
{
"epoch": 3.0002913073875552,
"grad_norm": 0.3209165036678314,
"learning_rate": 0.0005107438739789964,
"loss": 3.6979,
"step": 10300
},
{
"epoch": 3.014856676765323,
"grad_norm": 0.3145550489425659,
"learning_rate": 0.000510306301050175,
"loss": 3.5923,
"step": 10350
},
{
"epoch": 3.02942204614309,
"grad_norm": 0.33601146936416626,
"learning_rate": 0.0005098687281213535,
"loss": 3.6103,
"step": 10400
},
{
"epoch": 3.0439874155208577,
"grad_norm": 0.31010255217552185,
"learning_rate": 0.0005094311551925321,
"loss": 3.6064,
"step": 10450
},
{
"epoch": 3.058552784898625,
"grad_norm": 0.3235945701599121,
"learning_rate": 0.0005089935822637106,
"loss": 3.5974,
"step": 10500
},
{
"epoch": 3.0731181542763926,
"grad_norm": 0.32663047313690186,
"learning_rate": 0.0005085560093348892,
"loss": 3.6164,
"step": 10550
},
{
"epoch": 3.0876835236541598,
"grad_norm": 0.32186612486839294,
"learning_rate": 0.0005081184364060676,
"loss": 3.605,
"step": 10600
},
{
"epoch": 3.1022488930319274,
"grad_norm": 0.3103710114955902,
"learning_rate": 0.0005076808634772461,
"loss": 3.622,
"step": 10650
},
{
"epoch": 3.1168142624096946,
"grad_norm": 0.32506147027015686,
"learning_rate": 0.0005072432905484247,
"loss": 3.6183,
"step": 10700
},
{
"epoch": 3.1313796317874623,
"grad_norm": 0.354626327753067,
"learning_rate": 0.0005068057176196032,
"loss": 3.6236,
"step": 10750
},
{
"epoch": 3.1459450011652295,
"grad_norm": 0.31761565804481506,
"learning_rate": 0.0005063681446907818,
"loss": 3.6218,
"step": 10800
},
{
"epoch": 3.160510370542997,
"grad_norm": 0.3158835172653198,
"learning_rate": 0.0005059305717619603,
"loss": 3.6275,
"step": 10850
},
{
"epoch": 3.1750757399207643,
"grad_norm": 0.3345862925052643,
"learning_rate": 0.0005054929988331388,
"loss": 3.6209,
"step": 10900
},
{
"epoch": 3.189641109298532,
"grad_norm": 0.33414244651794434,
"learning_rate": 0.0005050554259043174,
"loss": 3.6138,
"step": 10950
},
{
"epoch": 3.204206478676299,
"grad_norm": 0.321621835231781,
"learning_rate": 0.0005046178529754959,
"loss": 3.6306,
"step": 11000
},
{
"epoch": 3.204206478676299,
"eval_accuracy": 0.3523780599932934,
"eval_loss": 3.7092323303222656,
"eval_runtime": 181.978,
"eval_samples_per_second": 91.462,
"eval_steps_per_second": 5.72,
"step": 11000
},
{
"epoch": 3.218771848054067,
"grad_norm": 0.3331759572029114,
"learning_rate": 0.0005041802800466744,
"loss": 3.6116,
"step": 11050
},
{
"epoch": 3.233337217431834,
"grad_norm": 0.33480656147003174,
"learning_rate": 0.0005037427071178529,
"loss": 3.6186,
"step": 11100
},
{
"epoch": 3.2479025868096016,
"grad_norm": 0.32737287878990173,
"learning_rate": 0.0005033051341890314,
"loss": 3.6176,
"step": 11150
},
{
"epoch": 3.262467956187369,
"grad_norm": 0.33219143748283386,
"learning_rate": 0.00050286756126021,
"loss": 3.6299,
"step": 11200
},
{
"epoch": 3.2770333255651365,
"grad_norm": 0.3134367763996124,
"learning_rate": 0.0005024299883313885,
"loss": 3.6269,
"step": 11250
},
{
"epoch": 3.2915986949429037,
"grad_norm": 0.3368885815143585,
"learning_rate": 0.0005019924154025671,
"loss": 3.6383,
"step": 11300
},
{
"epoch": 3.3061640643206713,
"grad_norm": 0.30437996983528137,
"learning_rate": 0.0005015548424737456,
"loss": 3.6245,
"step": 11350
},
{
"epoch": 3.3207294336984385,
"grad_norm": 0.33528828620910645,
"learning_rate": 0.0005011172695449241,
"loss": 3.6251,
"step": 11400
},
{
"epoch": 3.335294803076206,
"grad_norm": 0.33781698346138,
"learning_rate": 0.0005006796966161027,
"loss": 3.63,
"step": 11450
},
{
"epoch": 3.3498601724539734,
"grad_norm": 0.329375296831131,
"learning_rate": 0.0005002421236872811,
"loss": 3.6387,
"step": 11500
},
{
"epoch": 3.364425541831741,
"grad_norm": 0.31199130415916443,
"learning_rate": 0.0004998045507584597,
"loss": 3.6199,
"step": 11550
},
{
"epoch": 3.3789909112095082,
"grad_norm": 0.31993409991264343,
"learning_rate": 0.0004993669778296382,
"loss": 3.6383,
"step": 11600
},
{
"epoch": 3.393556280587276,
"grad_norm": 0.33537372946739197,
"learning_rate": 0.0004989294049008167,
"loss": 3.6409,
"step": 11650
},
{
"epoch": 3.408121649965043,
"grad_norm": 0.3288818299770355,
"learning_rate": 0.0004984918319719953,
"loss": 3.6544,
"step": 11700
},
{
"epoch": 3.4226870193428107,
"grad_norm": 0.3143393099308014,
"learning_rate": 0.0004980542590431738,
"loss": 3.632,
"step": 11750
},
{
"epoch": 3.437252388720578,
"grad_norm": 0.3316044211387634,
"learning_rate": 0.0004976166861143524,
"loss": 3.6256,
"step": 11800
},
{
"epoch": 3.4518177580983456,
"grad_norm": 0.3158373534679413,
"learning_rate": 0.0004971791131855309,
"loss": 3.6283,
"step": 11850
},
{
"epoch": 3.4663831274761128,
"grad_norm": 0.3310090899467468,
"learning_rate": 0.0004967415402567094,
"loss": 3.6383,
"step": 11900
},
{
"epoch": 3.4809484968538804,
"grad_norm": 0.3304344415664673,
"learning_rate": 0.000496303967327888,
"loss": 3.6364,
"step": 11950
},
{
"epoch": 3.4955138662316476,
"grad_norm": 0.3196583390235901,
"learning_rate": 0.0004958663943990664,
"loss": 3.6245,
"step": 12000
},
{
"epoch": 3.4955138662316476,
"eval_accuracy": 0.3539525300396798,
"eval_loss": 3.6910691261291504,
"eval_runtime": 180.2749,
"eval_samples_per_second": 92.326,
"eval_steps_per_second": 5.775,
"step": 12000
},
{
"epoch": 3.510079235609415,
"grad_norm": 0.334721177816391,
"learning_rate": 0.000495428821470245,
"loss": 3.6405,
"step": 12050
},
{
"epoch": 3.5246446049871825,
"grad_norm": 0.30898579955101013,
"learning_rate": 0.0004949912485414235,
"loss": 3.633,
"step": 12100
},
{
"epoch": 3.53920997436495,
"grad_norm": 0.3296958804130554,
"learning_rate": 0.0004945536756126021,
"loss": 3.6336,
"step": 12150
},
{
"epoch": 3.5537753437427173,
"grad_norm": 0.3156227469444275,
"learning_rate": 0.0004941161026837806,
"loss": 3.6264,
"step": 12200
},
{
"epoch": 3.5683407131204845,
"grad_norm": 0.32900500297546387,
"learning_rate": 0.0004936785297549591,
"loss": 3.6286,
"step": 12250
},
{
"epoch": 3.582906082498252,
"grad_norm": 0.33001989126205444,
"learning_rate": 0.0004932409568261377,
"loss": 3.6485,
"step": 12300
},
{
"epoch": 3.59747145187602,
"grad_norm": 0.32858744263648987,
"learning_rate": 0.0004928033838973162,
"loss": 3.6323,
"step": 12350
},
{
"epoch": 3.612036821253787,
"grad_norm": 0.35113999247550964,
"learning_rate": 0.0004923658109684946,
"loss": 3.647,
"step": 12400
},
{
"epoch": 3.626602190631554,
"grad_norm": 0.3282478153705597,
"learning_rate": 0.0004919282380396732,
"loss": 3.6335,
"step": 12450
},
{
"epoch": 3.641167560009322,
"grad_norm": 0.31611868739128113,
"learning_rate": 0.0004914906651108517,
"loss": 3.631,
"step": 12500
},
{
"epoch": 3.6557329293870895,
"grad_norm": 0.33487775921821594,
"learning_rate": 0.0004910530921820303,
"loss": 3.6274,
"step": 12550
},
{
"epoch": 3.6702982987648567,
"grad_norm": 0.33004793524742126,
"learning_rate": 0.0004906155192532088,
"loss": 3.618,
"step": 12600
},
{
"epoch": 3.684863668142624,
"grad_norm": 0.30851587653160095,
"learning_rate": 0.0004901779463243874,
"loss": 3.6229,
"step": 12650
},
{
"epoch": 3.6994290375203915,
"grad_norm": 0.325185090303421,
"learning_rate": 0.0004897403733955659,
"loss": 3.6289,
"step": 12700
},
{
"epoch": 3.713994406898159,
"grad_norm": 0.3187962770462036,
"learning_rate": 0.0004893028004667444,
"loss": 3.6355,
"step": 12750
},
{
"epoch": 3.7285597762759264,
"grad_norm": 0.32004639506340027,
"learning_rate": 0.000488865227537923,
"loss": 3.6424,
"step": 12800
},
{
"epoch": 3.7431251456536936,
"grad_norm": 0.331478476524353,
"learning_rate": 0.0004884276546091015,
"loss": 3.624,
"step": 12850
},
{
"epoch": 3.7576905150314612,
"grad_norm": 0.31720319390296936,
"learning_rate": 0.00048799008168028,
"loss": 3.6329,
"step": 12900
},
{
"epoch": 3.772255884409229,
"grad_norm": 0.32388386130332947,
"learning_rate": 0.00048755250875145853,
"loss": 3.6237,
"step": 12950
},
{
"epoch": 3.786821253786996,
"grad_norm": 0.326471209526062,
"learning_rate": 0.0004871149358226371,
"loss": 3.6365,
"step": 13000
},
{
"epoch": 3.786821253786996,
"eval_accuracy": 0.35590465655600817,
"eval_loss": 3.6707494258880615,
"eval_runtime": 180.189,
"eval_samples_per_second": 92.37,
"eval_steps_per_second": 5.777,
"step": 13000
},
{
"epoch": 3.8013866231647633,
"grad_norm": 0.3287231922149658,
"learning_rate": 0.0004866773628938156,
"loss": 3.6351,
"step": 13050
},
{
"epoch": 3.815951992542531,
"grad_norm": 0.3224816620349884,
"learning_rate": 0.0004862397899649941,
"loss": 3.631,
"step": 13100
},
{
"epoch": 3.8305173619202986,
"grad_norm": 0.34565699100494385,
"learning_rate": 0.00048580221703617264,
"loss": 3.6365,
"step": 13150
},
{
"epoch": 3.8450827312980658,
"grad_norm": 0.31353557109832764,
"learning_rate": 0.00048536464410735123,
"loss": 3.6346,
"step": 13200
},
{
"epoch": 3.859648100675833,
"grad_norm": 0.31035754084587097,
"learning_rate": 0.00048492707117852966,
"loss": 3.6353,
"step": 13250
},
{
"epoch": 3.8742134700536006,
"grad_norm": 0.3304181694984436,
"learning_rate": 0.00048448949824970826,
"loss": 3.631,
"step": 13300
},
{
"epoch": 3.888778839431368,
"grad_norm": 0.3305014669895172,
"learning_rate": 0.0004840519253208868,
"loss": 3.6234,
"step": 13350
},
{
"epoch": 3.9033442088091355,
"grad_norm": 0.33002111315727234,
"learning_rate": 0.0004836143523920653,
"loss": 3.6389,
"step": 13400
},
{
"epoch": 3.9179095781869027,
"grad_norm": 0.3106802701950073,
"learning_rate": 0.0004831767794632438,
"loss": 3.6413,
"step": 13450
},
{
"epoch": 3.9324749475646703,
"grad_norm": 0.32683488726615906,
"learning_rate": 0.00048273920653442236,
"loss": 3.6114,
"step": 13500
},
{
"epoch": 3.9470403169424375,
"grad_norm": 0.3140070140361786,
"learning_rate": 0.0004823016336056009,
"loss": 3.6246,
"step": 13550
},
{
"epoch": 3.961605686320205,
"grad_norm": 0.3176632523536682,
"learning_rate": 0.0004818640606767794,
"loss": 3.6215,
"step": 13600
},
{
"epoch": 3.9761710556979724,
"grad_norm": 0.33348730206489563,
"learning_rate": 0.00048142648774795793,
"loss": 3.6198,
"step": 13650
},
{
"epoch": 3.99073642507574,
"grad_norm": 0.3215520679950714,
"learning_rate": 0.0004809889148191365,
"loss": 3.6326,
"step": 13700
},
{
"epoch": 4.005243532975996,
"grad_norm": 0.3232531249523163,
"learning_rate": 0.000480551341890315,
"loss": 3.5772,
"step": 13750
},
{
"epoch": 4.0198089023537635,
"grad_norm": 0.3443015515804291,
"learning_rate": 0.00048011376896149355,
"loss": 3.5136,
"step": 13800
},
{
"epoch": 4.034374271731531,
"grad_norm": 0.3226703405380249,
"learning_rate": 0.0004796761960326721,
"loss": 3.5234,
"step": 13850
},
{
"epoch": 4.048939641109299,
"grad_norm": 0.32913267612457275,
"learning_rate": 0.0004792386231038506,
"loss": 3.5215,
"step": 13900
},
{
"epoch": 4.063505010487066,
"grad_norm": 0.3350991904735565,
"learning_rate": 0.0004788010501750291,
"loss": 3.5211,
"step": 13950
},
{
"epoch": 4.078070379864833,
"grad_norm": 0.3143565058708191,
"learning_rate": 0.00047836347724620766,
"loss": 3.5138,
"step": 14000
},
{
"epoch": 4.078070379864833,
"eval_accuracy": 0.3570977076769612,
"eval_loss": 3.6638998985290527,
"eval_runtime": 180.2734,
"eval_samples_per_second": 92.326,
"eval_steps_per_second": 5.775,
"step": 14000
},
{
"epoch": 4.092635749242601,
"grad_norm": 0.318641722202301,
"learning_rate": 0.0004779259043173862,
"loss": 3.5201,
"step": 14050
},
{
"epoch": 4.1072011186203685,
"grad_norm": 0.31839898228645325,
"learning_rate": 0.0004774883313885647,
"loss": 3.5223,
"step": 14100
},
{
"epoch": 4.121766487998135,
"grad_norm": 0.3353429436683655,
"learning_rate": 0.0004770507584597433,
"loss": 3.5314,
"step": 14150
},
{
"epoch": 4.136331857375903,
"grad_norm": 0.3297600746154785,
"learning_rate": 0.0004766131855309218,
"loss": 3.5386,
"step": 14200
},
{
"epoch": 4.150897226753671,
"grad_norm": 0.35828185081481934,
"learning_rate": 0.0004761756126021003,
"loss": 3.5442,
"step": 14250
},
{
"epoch": 4.165462596131438,
"grad_norm": 0.32543322443962097,
"learning_rate": 0.00047573803967327884,
"loss": 3.5498,
"step": 14300
},
{
"epoch": 4.180027965509205,
"grad_norm": 0.33324652910232544,
"learning_rate": 0.0004753004667444574,
"loss": 3.5393,
"step": 14350
},
{
"epoch": 4.194593334886973,
"grad_norm": 0.3401516079902649,
"learning_rate": 0.00047486289381563587,
"loss": 3.5485,
"step": 14400
},
{
"epoch": 4.20915870426474,
"grad_norm": 0.34022200107574463,
"learning_rate": 0.0004744253208868144,
"loss": 3.5287,
"step": 14450
},
{
"epoch": 4.223724073642508,
"grad_norm": 0.3375685214996338,
"learning_rate": 0.00047398774795799295,
"loss": 3.5567,
"step": 14500
},
{
"epoch": 4.238289443020275,
"grad_norm": 0.32578080892562866,
"learning_rate": 0.00047355017502917154,
"loss": 3.5511,
"step": 14550
},
{
"epoch": 4.252854812398042,
"grad_norm": 0.3124660551548004,
"learning_rate": 0.00047311260210035,
"loss": 3.5519,
"step": 14600
},
{
"epoch": 4.26742018177581,
"grad_norm": 0.317643940448761,
"learning_rate": 0.00047267502917152857,
"loss": 3.5485,
"step": 14650
},
{
"epoch": 4.281985551153578,
"grad_norm": 0.3317655026912689,
"learning_rate": 0.0004722374562427071,
"loss": 3.5541,
"step": 14700
},
{
"epoch": 4.296550920531344,
"grad_norm": 0.32578787207603455,
"learning_rate": 0.0004717998833138856,
"loss": 3.5354,
"step": 14750
},
{
"epoch": 4.311116289909112,
"grad_norm": 0.32401853799819946,
"learning_rate": 0.00047136231038506413,
"loss": 3.5608,
"step": 14800
},
{
"epoch": 4.32568165928688,
"grad_norm": 0.33071812987327576,
"learning_rate": 0.00047092473745624267,
"loss": 3.5453,
"step": 14850
},
{
"epoch": 4.340247028664647,
"grad_norm": 0.3195439577102661,
"learning_rate": 0.00047048716452742116,
"loss": 3.5509,
"step": 14900
},
{
"epoch": 4.354812398042414,
"grad_norm": 0.32133200764656067,
"learning_rate": 0.0004700495915985997,
"loss": 3.5631,
"step": 14950
},
{
"epoch": 4.369377767420182,
"grad_norm": 0.345612108707428,
"learning_rate": 0.0004696120186697783,
"loss": 3.5632,
"step": 15000
},
{
"epoch": 4.369377767420182,
"eval_accuracy": 0.3582166854554288,
"eval_loss": 3.653571128845215,
"eval_runtime": 180.4957,
"eval_samples_per_second": 92.213,
"eval_steps_per_second": 5.767,
"step": 15000
},
{
"epoch": 4.383943136797949,
"grad_norm": 0.33550721406936646,
"learning_rate": 0.00046917444574095683,
"loss": 3.5562,
"step": 15050
},
{
"epoch": 4.398508506175717,
"grad_norm": 0.32593655586242676,
"learning_rate": 0.0004687368728121353,
"loss": 3.5542,
"step": 15100
},
{
"epoch": 4.413073875553484,
"grad_norm": 0.32876867055892944,
"learning_rate": 0.00046829929988331386,
"loss": 3.5537,
"step": 15150
},
{
"epoch": 4.427639244931251,
"grad_norm": 0.31340348720550537,
"learning_rate": 0.0004678617269544924,
"loss": 3.5547,
"step": 15200
},
{
"epoch": 4.442204614309019,
"grad_norm": 0.325003981590271,
"learning_rate": 0.0004674241540256709,
"loss": 3.5638,
"step": 15250
},
{
"epoch": 4.456769983686787,
"grad_norm": 0.31941288709640503,
"learning_rate": 0.0004669865810968494,
"loss": 3.5625,
"step": 15300
},
{
"epoch": 4.471335353064553,
"grad_norm": 0.32604023814201355,
"learning_rate": 0.00046654900816802796,
"loss": 3.5542,
"step": 15350
},
{
"epoch": 4.485900722442321,
"grad_norm": 0.3184167444705963,
"learning_rate": 0.00046611143523920645,
"loss": 3.5597,
"step": 15400
},
{
"epoch": 4.500466091820089,
"grad_norm": 0.32676759362220764,
"learning_rate": 0.00046567386231038504,
"loss": 3.5518,
"step": 15450
},
{
"epoch": 4.515031461197856,
"grad_norm": 0.3253229260444641,
"learning_rate": 0.0004652362893815636,
"loss": 3.5636,
"step": 15500
},
{
"epoch": 4.529596830575623,
"grad_norm": 0.33474475145339966,
"learning_rate": 0.0004647987164527421,
"loss": 3.5638,
"step": 15550
},
{
"epoch": 4.544162199953391,
"grad_norm": 0.34634941816329956,
"learning_rate": 0.0004643611435239206,
"loss": 3.5473,
"step": 15600
},
{
"epoch": 4.558727569331158,
"grad_norm": 0.33891260623931885,
"learning_rate": 0.00046392357059509915,
"loss": 3.5675,
"step": 15650
},
{
"epoch": 4.573292938708926,
"grad_norm": 0.32942262291908264,
"learning_rate": 0.0004634859976662777,
"loss": 3.5603,
"step": 15700
},
{
"epoch": 4.587858308086693,
"grad_norm": 0.3374430239200592,
"learning_rate": 0.0004630484247374562,
"loss": 3.5538,
"step": 15750
},
{
"epoch": 4.6024236774644605,
"grad_norm": 0.3401276767253876,
"learning_rate": 0.0004626108518086347,
"loss": 3.5644,
"step": 15800
},
{
"epoch": 4.616989046842228,
"grad_norm": 0.3286304473876953,
"learning_rate": 0.0004621732788798133,
"loss": 3.5653,
"step": 15850
},
{
"epoch": 4.631554416219995,
"grad_norm": 0.31420665979385376,
"learning_rate": 0.00046173570595099174,
"loss": 3.556,
"step": 15900
},
{
"epoch": 4.6461197855977625,
"grad_norm": 0.3286356031894684,
"learning_rate": 0.00046129813302217033,
"loss": 3.552,
"step": 15950
},
{
"epoch": 4.66068515497553,
"grad_norm": 0.33006393909454346,
"learning_rate": 0.00046086056009334887,
"loss": 3.5684,
"step": 16000
},
{
"epoch": 4.66068515497553,
"eval_accuracy": 0.3596702866191563,
"eval_loss": 3.640338897705078,
"eval_runtime": 180.2507,
"eval_samples_per_second": 92.338,
"eval_steps_per_second": 5.775,
"step": 16000
},
{
"epoch": 4.675250524353298,
"grad_norm": 0.3313292860984802,
"learning_rate": 0.0004604229871645274,
"loss": 3.5606,
"step": 16050
},
{
"epoch": 4.689815893731065,
"grad_norm": 0.31922703981399536,
"learning_rate": 0.0004599854142357059,
"loss": 3.5739,
"step": 16100
},
{
"epoch": 4.704381263108832,
"grad_norm": 0.3161007761955261,
"learning_rate": 0.00045954784130688444,
"loss": 3.5688,
"step": 16150
},
{
"epoch": 4.7189466324866,
"grad_norm": 0.33094581961631775,
"learning_rate": 0.000459110268378063,
"loss": 3.564,
"step": 16200
},
{
"epoch": 4.7335120018643675,
"grad_norm": 0.3282545804977417,
"learning_rate": 0.00045867269544924146,
"loss": 3.5759,
"step": 16250
},
{
"epoch": 4.748077371242134,
"grad_norm": 0.32690319418907166,
"learning_rate": 0.00045823512252042,
"loss": 3.5601,
"step": 16300
},
{
"epoch": 4.762642740619902,
"grad_norm": 0.3375246524810791,
"learning_rate": 0.0004577975495915986,
"loss": 3.5569,
"step": 16350
},
{
"epoch": 4.7772081099976695,
"grad_norm": 0.3194766044616699,
"learning_rate": 0.0004573599766627771,
"loss": 3.5536,
"step": 16400
},
{
"epoch": 4.791773479375437,
"grad_norm": 0.31809139251708984,
"learning_rate": 0.0004569224037339556,
"loss": 3.5626,
"step": 16450
},
{
"epoch": 4.806338848753205,
"grad_norm": 0.3298538327217102,
"learning_rate": 0.00045648483080513416,
"loss": 3.5597,
"step": 16500
},
{
"epoch": 4.820904218130972,
"grad_norm": 0.343118816614151,
"learning_rate": 0.0004560472578763127,
"loss": 3.563,
"step": 16550
},
{
"epoch": 4.835469587508739,
"grad_norm": 0.32174625992774963,
"learning_rate": 0.0004556096849474912,
"loss": 3.5602,
"step": 16600
},
{
"epoch": 4.850034956886507,
"grad_norm": 0.3458464741706848,
"learning_rate": 0.00045517211201866973,
"loss": 3.5485,
"step": 16650
},
{
"epoch": 4.864600326264274,
"grad_norm": 0.3370623290538788,
"learning_rate": 0.00045473453908984827,
"loss": 3.5624,
"step": 16700
},
{
"epoch": 4.879165695642041,
"grad_norm": 0.33553197979927063,
"learning_rate": 0.00045429696616102675,
"loss": 3.5675,
"step": 16750
},
{
"epoch": 4.893731065019809,
"grad_norm": 0.3206152617931366,
"learning_rate": 0.00045385939323220535,
"loss": 3.5618,
"step": 16800
},
{
"epoch": 4.908296434397577,
"grad_norm": 0.3171241581439972,
"learning_rate": 0.0004534218203033839,
"loss": 3.5692,
"step": 16850
},
{
"epoch": 4.922861803775344,
"grad_norm": 0.3172144889831543,
"learning_rate": 0.0004529842473745624,
"loss": 3.5552,
"step": 16900
},
{
"epoch": 4.937427173153111,
"grad_norm": 0.32273098826408386,
"learning_rate": 0.0004525466744457409,
"loss": 3.5712,
"step": 16950
},
{
"epoch": 4.951992542530879,
"grad_norm": 0.3339548707008362,
"learning_rate": 0.00045210910151691945,
"loss": 3.5646,
"step": 17000
},
{
"epoch": 4.951992542530879,
"eval_accuracy": 0.36071871835716146,
"eval_loss": 3.625553846359253,
"eval_runtime": 180.4809,
"eval_samples_per_second": 92.22,
"eval_steps_per_second": 5.768,
"step": 17000
},
{
"epoch": 4.966557911908646,
"grad_norm": 0.3269366919994354,
"learning_rate": 0.000451671528588098,
"loss": 3.5542,
"step": 17050
},
{
"epoch": 4.981123281286413,
"grad_norm": 0.31721675395965576,
"learning_rate": 0.0004512339556592765,
"loss": 3.5729,
"step": 17100
},
{
"epoch": 4.995688650664181,
"grad_norm": 0.3314802050590515,
"learning_rate": 0.000450796382730455,
"loss": 3.5643,
"step": 17150
},
{
"epoch": 5.010195758564437,
"grad_norm": 0.34938499331474304,
"learning_rate": 0.0004503588098016336,
"loss": 3.4822,
"step": 17200
},
{
"epoch": 5.024761127942204,
"grad_norm": 0.3565429449081421,
"learning_rate": 0.0004499212368728121,
"loss": 3.4413,
"step": 17250
},
{
"epoch": 5.039326497319972,
"grad_norm": 0.34626901149749756,
"learning_rate": 0.00044948366394399064,
"loss": 3.4539,
"step": 17300
},
{
"epoch": 5.0538918666977395,
"grad_norm": 0.336347758769989,
"learning_rate": 0.0004490460910151692,
"loss": 3.4579,
"step": 17350
},
{
"epoch": 5.068457236075507,
"grad_norm": 0.3387928605079651,
"learning_rate": 0.00044860851808634767,
"loss": 3.4767,
"step": 17400
},
{
"epoch": 5.083022605453274,
"grad_norm": 0.3393719494342804,
"learning_rate": 0.0004481709451575262,
"loss": 3.4596,
"step": 17450
},
{
"epoch": 5.0975879748310415,
"grad_norm": 0.3251345157623291,
"learning_rate": 0.00044773337222870475,
"loss": 3.4748,
"step": 17500
},
{
"epoch": 5.112153344208809,
"grad_norm": 0.32468897104263306,
"learning_rate": 0.0004472957992998833,
"loss": 3.4805,
"step": 17550
},
{
"epoch": 5.126718713586577,
"grad_norm": 0.3337823450565338,
"learning_rate": 0.00044685822637106177,
"loss": 3.4754,
"step": 17600
},
{
"epoch": 5.141284082964344,
"grad_norm": 0.3582659959793091,
"learning_rate": 0.00044642065344224037,
"loss": 3.4752,
"step": 17650
},
{
"epoch": 5.155849452342111,
"grad_norm": 0.3382004499435425,
"learning_rate": 0.0004459830805134189,
"loss": 3.4633,
"step": 17700
},
{
"epoch": 5.170414821719879,
"grad_norm": 0.33493444323539734,
"learning_rate": 0.0004455455075845974,
"loss": 3.4896,
"step": 17750
},
{
"epoch": 5.1849801910976465,
"grad_norm": 0.33412331342697144,
"learning_rate": 0.00044510793465577593,
"loss": 3.4854,
"step": 17800
},
{
"epoch": 5.199545560475413,
"grad_norm": 0.3649858832359314,
"learning_rate": 0.00044467036172695447,
"loss": 3.4862,
"step": 17850
},
{
"epoch": 5.214110929853181,
"grad_norm": 0.3273285925388336,
"learning_rate": 0.00044423278879813296,
"loss": 3.4898,
"step": 17900
},
{
"epoch": 5.228676299230949,
"grad_norm": 0.36678996682167053,
"learning_rate": 0.0004437952158693115,
"loss": 3.4798,
"step": 17950
},
{
"epoch": 5.243241668608716,
"grad_norm": 0.33765271306037903,
"learning_rate": 0.00044335764294049004,
"loss": 3.4892,
"step": 18000
},
{
"epoch": 5.243241668608716,
"eval_accuracy": 0.3612939037403981,
"eval_loss": 3.6294894218444824,
"eval_runtime": 180.4902,
"eval_samples_per_second": 92.216,
"eval_steps_per_second": 5.768,
"step": 18000
},
{
"epoch": 5.257807037986483,
"grad_norm": 0.35786503553390503,
"learning_rate": 0.00044292007001166863,
"loss": 3.4823,
"step": 18050
},
{
"epoch": 5.272372407364251,
"grad_norm": 0.3416072130203247,
"learning_rate": 0.00044248249708284706,
"loss": 3.4879,
"step": 18100
},
{
"epoch": 5.286937776742018,
"grad_norm": 0.34881216287612915,
"learning_rate": 0.00044204492415402566,
"loss": 3.4922,
"step": 18150
},
{
"epoch": 5.301503146119786,
"grad_norm": 0.34528306126594543,
"learning_rate": 0.0004416073512252042,
"loss": 3.4949,
"step": 18200
},
{
"epoch": 5.316068515497553,
"grad_norm": 0.328722208738327,
"learning_rate": 0.0004411697782963827,
"loss": 3.4898,
"step": 18250
},
{
"epoch": 5.33063388487532,
"grad_norm": 0.3258577585220337,
"learning_rate": 0.0004407322053675612,
"loss": 3.4844,
"step": 18300
},
{
"epoch": 5.345199254253088,
"grad_norm": 0.3687138855457306,
"learning_rate": 0.00044029463243873976,
"loss": 3.4868,
"step": 18350
},
{
"epoch": 5.359764623630856,
"grad_norm": 0.3356603682041168,
"learning_rate": 0.00043985705950991825,
"loss": 3.4854,
"step": 18400
},
{
"epoch": 5.374329993008622,
"grad_norm": 0.3445993661880493,
"learning_rate": 0.0004394194865810968,
"loss": 3.4981,
"step": 18450
},
{
"epoch": 5.38889536238639,
"grad_norm": 0.328427791595459,
"learning_rate": 0.00043898191365227533,
"loss": 3.4887,
"step": 18500
},
{
"epoch": 5.403460731764158,
"grad_norm": 0.3391731083393097,
"learning_rate": 0.0004385443407234539,
"loss": 3.5023,
"step": 18550
},
{
"epoch": 5.418026101141925,
"grad_norm": 0.3405122458934784,
"learning_rate": 0.0004381067677946324,
"loss": 3.5082,
"step": 18600
},
{
"epoch": 5.432591470519692,
"grad_norm": 0.32964596152305603,
"learning_rate": 0.00043766919486581095,
"loss": 3.5064,
"step": 18650
},
{
"epoch": 5.44715683989746,
"grad_norm": 0.32743725180625916,
"learning_rate": 0.0004372316219369895,
"loss": 3.5069,
"step": 18700
},
{
"epoch": 5.461722209275227,
"grad_norm": 0.33889785408973694,
"learning_rate": 0.00043679404900816797,
"loss": 3.4917,
"step": 18750
},
{
"epoch": 5.476287578652995,
"grad_norm": 0.3374757468700409,
"learning_rate": 0.0004363564760793465,
"loss": 3.5129,
"step": 18800
},
{
"epoch": 5.490852948030762,
"grad_norm": 0.32586970925331116,
"learning_rate": 0.00043591890315052505,
"loss": 3.4983,
"step": 18850
},
{
"epoch": 5.505418317408529,
"grad_norm": 0.3159201443195343,
"learning_rate": 0.00043548133022170354,
"loss": 3.5049,
"step": 18900
},
{
"epoch": 5.519983686786297,
"grad_norm": 0.3207235634326935,
"learning_rate": 0.0004350437572928821,
"loss": 3.5027,
"step": 18950
},
{
"epoch": 5.534549056164065,
"grad_norm": 0.32409095764160156,
"learning_rate": 0.00043460618436406067,
"loss": 3.5037,
"step": 19000
},
{
"epoch": 5.534549056164065,
"eval_accuracy": 0.36227261247507964,
"eval_loss": 3.617015838623047,
"eval_runtime": 180.3747,
"eval_samples_per_second": 92.275,
"eval_steps_per_second": 5.771,
"step": 19000
},
{
"epoch": 5.549114425541831,
"grad_norm": 0.33343568444252014,
"learning_rate": 0.0004341686114352392,
"loss": 3.5044,
"step": 19050
},
{
"epoch": 5.563679794919599,
"grad_norm": 0.3471834063529968,
"learning_rate": 0.0004337310385064177,
"loss": 3.4995,
"step": 19100
},
{
"epoch": 5.578245164297367,
"grad_norm": 0.32965055108070374,
"learning_rate": 0.00043329346557759624,
"loss": 3.5091,
"step": 19150
},
{
"epoch": 5.592810533675134,
"grad_norm": 0.32729023694992065,
"learning_rate": 0.0004328558926487748,
"loss": 3.4987,
"step": 19200
},
{
"epoch": 5.607375903052901,
"grad_norm": 0.32407552003860474,
"learning_rate": 0.00043241831971995326,
"loss": 3.5105,
"step": 19250
},
{
"epoch": 5.621941272430669,
"grad_norm": 0.3459337055683136,
"learning_rate": 0.0004319807467911318,
"loss": 3.5139,
"step": 19300
},
{
"epoch": 5.636506641808436,
"grad_norm": 0.34581705927848816,
"learning_rate": 0.00043154317386231034,
"loss": 3.5169,
"step": 19350
},
{
"epoch": 5.651072011186204,
"grad_norm": 0.323258638381958,
"learning_rate": 0.00043110560093348883,
"loss": 3.5006,
"step": 19400
},
{
"epoch": 5.665637380563971,
"grad_norm": 0.3501630127429962,
"learning_rate": 0.0004306680280046674,
"loss": 3.5059,
"step": 19450
},
{
"epoch": 5.6802027499417385,
"grad_norm": 0.3383364975452423,
"learning_rate": 0.00043023045507584596,
"loss": 3.5082,
"step": 19500
},
{
"epoch": 5.694768119319506,
"grad_norm": 0.3391266465187073,
"learning_rate": 0.0004297928821470245,
"loss": 3.5073,
"step": 19550
},
{
"epoch": 5.709333488697274,
"grad_norm": 0.33838364481925964,
"learning_rate": 0.000429355309218203,
"loss": 3.5057,
"step": 19600
},
{
"epoch": 5.7238988580750405,
"grad_norm": 0.3325950801372528,
"learning_rate": 0.00042891773628938153,
"loss": 3.518,
"step": 19650
},
{
"epoch": 5.738464227452808,
"grad_norm": 0.3349588215351105,
"learning_rate": 0.00042848016336056007,
"loss": 3.5155,
"step": 19700
},
{
"epoch": 5.753029596830576,
"grad_norm": 0.33944258093833923,
"learning_rate": 0.00042804259043173855,
"loss": 3.5028,
"step": 19750
},
{
"epoch": 5.7675949662083426,
"grad_norm": 0.3170711398124695,
"learning_rate": 0.0004276050175029171,
"loss": 3.5181,
"step": 19800
},
{
"epoch": 5.78216033558611,
"grad_norm": 0.3340502083301544,
"learning_rate": 0.0004271674445740957,
"loss": 3.4968,
"step": 19850
},
{
"epoch": 5.796725704963878,
"grad_norm": 0.34251272678375244,
"learning_rate": 0.0004267298716452741,
"loss": 3.5079,
"step": 19900
},
{
"epoch": 5.8112910743416455,
"grad_norm": 0.3394465446472168,
"learning_rate": 0.0004262922987164527,
"loss": 3.5056,
"step": 19950
},
{
"epoch": 5.825856443719413,
"grad_norm": 0.32446908950805664,
"learning_rate": 0.00042585472578763125,
"loss": 3.5029,
"step": 20000
},
{
"epoch": 5.825856443719413,
"eval_accuracy": 0.36320252686510796,
"eval_loss": 3.60426664352417,
"eval_runtime": 180.5652,
"eval_samples_per_second": 92.177,
"eval_steps_per_second": 5.765,
"step": 20000
},
{
"epoch": 5.84042181309718,
"grad_norm": 0.3498161733150482,
"learning_rate": 0.0004254171528588098,
"loss": 3.5086,
"step": 20050
},
{
"epoch": 5.8549871824749475,
"grad_norm": 0.33967551589012146,
"learning_rate": 0.0004249795799299883,
"loss": 3.5126,
"step": 20100
},
{
"epoch": 5.869552551852715,
"grad_norm": 0.3366953730583191,
"learning_rate": 0.0004245420070011668,
"loss": 3.5301,
"step": 20150
},
{
"epoch": 5.884117921230482,
"grad_norm": 0.33286792039871216,
"learning_rate": 0.00042410443407234536,
"loss": 3.511,
"step": 20200
},
{
"epoch": 5.89868329060825,
"grad_norm": 0.34662094712257385,
"learning_rate": 0.00042366686114352385,
"loss": 3.512,
"step": 20250
},
{
"epoch": 5.913248659986017,
"grad_norm": 0.3202279508113861,
"learning_rate": 0.0004232292882147024,
"loss": 3.5006,
"step": 20300
},
{
"epoch": 5.927814029363785,
"grad_norm": 0.34777122735977173,
"learning_rate": 0.000422791715285881,
"loss": 3.521,
"step": 20350
},
{
"epoch": 5.9423793987415525,
"grad_norm": 0.34444618225097656,
"learning_rate": 0.00042235414235705947,
"loss": 3.5126,
"step": 20400
},
{
"epoch": 5.956944768119319,
"grad_norm": 0.3303092122077942,
"learning_rate": 0.000421916569428238,
"loss": 3.516,
"step": 20450
},
{
"epoch": 5.971510137497087,
"grad_norm": 0.34319791197776794,
"learning_rate": 0.00042147899649941654,
"loss": 3.5029,
"step": 20500
},
{
"epoch": 5.986075506874855,
"grad_norm": 0.33462879061698914,
"learning_rate": 0.0004210414235705951,
"loss": 3.509,
"step": 20550
},
{
"epoch": 6.0005826147751105,
"grad_norm": 0.332768976688385,
"learning_rate": 0.00042060385064177357,
"loss": 3.5038,
"step": 20600
},
{
"epoch": 6.015147984152878,
"grad_norm": 0.32959234714508057,
"learning_rate": 0.0004201662777129521,
"loss": 3.3948,
"step": 20650
},
{
"epoch": 6.029713353530646,
"grad_norm": 0.3324235677719116,
"learning_rate": 0.00041972870478413065,
"loss": 3.4112,
"step": 20700
},
{
"epoch": 6.044278722908413,
"grad_norm": 0.3403053879737854,
"learning_rate": 0.00041929113185530914,
"loss": 3.4081,
"step": 20750
},
{
"epoch": 6.05884409228618,
"grad_norm": 0.3473146855831146,
"learning_rate": 0.00041885355892648773,
"loss": 3.4049,
"step": 20800
},
{
"epoch": 6.073409461663948,
"grad_norm": 0.34440669417381287,
"learning_rate": 0.00041841598599766627,
"loss": 3.4084,
"step": 20850
},
{
"epoch": 6.087974831041715,
"grad_norm": 0.3304244875907898,
"learning_rate": 0.00041797841306884476,
"loss": 3.4264,
"step": 20900
},
{
"epoch": 6.102540200419483,
"grad_norm": 0.33876416087150574,
"learning_rate": 0.0004175408401400233,
"loss": 3.415,
"step": 20950
},
{
"epoch": 6.11710556979725,
"grad_norm": 0.3384806215763092,
"learning_rate": 0.00041710326721120184,
"loss": 3.4399,
"step": 21000
},
{
"epoch": 6.11710556979725,
"eval_accuracy": 0.3637827680479111,
"eval_loss": 3.6081302165985107,
"eval_runtime": 180.2744,
"eval_samples_per_second": 92.326,
"eval_steps_per_second": 5.775,
"step": 21000
},
{
"epoch": 6.1316709391750175,
"grad_norm": 0.363090455532074,
"learning_rate": 0.0004166656942823804,
"loss": 3.4368,
"step": 21050
},
{
"epoch": 6.146236308552785,
"grad_norm": 0.3336644470691681,
"learning_rate": 0.00041622812135355886,
"loss": 3.4255,
"step": 21100
},
{
"epoch": 6.160801677930552,
"grad_norm": 0.3573184311389923,
"learning_rate": 0.0004157905484247374,
"loss": 3.43,
"step": 21150
},
{
"epoch": 6.1753670473083195,
"grad_norm": 0.3469174802303314,
"learning_rate": 0.000415352975495916,
"loss": 3.4248,
"step": 21200
},
{
"epoch": 6.189932416686087,
"grad_norm": 0.33994483947753906,
"learning_rate": 0.0004149154025670945,
"loss": 3.4331,
"step": 21250
},
{
"epoch": 6.204497786063855,
"grad_norm": 0.34334084391593933,
"learning_rate": 0.000414477829638273,
"loss": 3.4336,
"step": 21300
},
{
"epoch": 6.219063155441622,
"grad_norm": 0.3307756185531616,
"learning_rate": 0.00041404025670945156,
"loss": 3.4457,
"step": 21350
},
{
"epoch": 6.233628524819389,
"grad_norm": 0.3440045118331909,
"learning_rate": 0.00041360268378063005,
"loss": 3.4505,
"step": 21400
},
{
"epoch": 6.248193894197157,
"grad_norm": 0.32408636808395386,
"learning_rate": 0.0004131651108518086,
"loss": 3.4495,
"step": 21450
},
{
"epoch": 6.2627592635749245,
"grad_norm": 0.3418697714805603,
"learning_rate": 0.0004127275379229871,
"loss": 3.4413,
"step": 21500
},
{
"epoch": 6.277324632952691,
"grad_norm": 0.3394606113433838,
"learning_rate": 0.00041228996499416567,
"loss": 3.4419,
"step": 21550
},
{
"epoch": 6.291890002330459,
"grad_norm": 0.3462677299976349,
"learning_rate": 0.00041185239206534415,
"loss": 3.4454,
"step": 21600
},
{
"epoch": 6.306455371708227,
"grad_norm": 0.33543628454208374,
"learning_rate": 0.00041141481913652275,
"loss": 3.4359,
"step": 21650
},
{
"epoch": 6.321020741085994,
"grad_norm": 0.3553283214569092,
"learning_rate": 0.0004109772462077013,
"loss": 3.4364,
"step": 21700
},
{
"epoch": 6.335586110463761,
"grad_norm": 0.3360411822795868,
"learning_rate": 0.00041053967327887977,
"loss": 3.4451,
"step": 21750
},
{
"epoch": 6.350151479841529,
"grad_norm": 0.33588552474975586,
"learning_rate": 0.0004101021003500583,
"loss": 3.439,
"step": 21800
},
{
"epoch": 6.364716849219296,
"grad_norm": 0.3321113884449005,
"learning_rate": 0.00040966452742123685,
"loss": 3.4385,
"step": 21850
},
{
"epoch": 6.379282218597064,
"grad_norm": 0.3304464817047119,
"learning_rate": 0.00040922695449241534,
"loss": 3.4573,
"step": 21900
},
{
"epoch": 6.393847587974831,
"grad_norm": 0.3388485014438629,
"learning_rate": 0.0004087893815635939,
"loss": 3.4549,
"step": 21950
},
{
"epoch": 6.408412957352598,
"grad_norm": 0.36697396636009216,
"learning_rate": 0.0004083518086347724,
"loss": 3.4438,
"step": 22000
},
{
"epoch": 6.408412957352598,
"eval_accuracy": 0.3643471363716102,
"eval_loss": 3.600018262863159,
"eval_runtime": 180.2847,
"eval_samples_per_second": 92.321,
"eval_steps_per_second": 5.774,
"step": 22000
},
{
"epoch": 6.422978326730366,
"grad_norm": 0.3477044999599457,
"learning_rate": 0.000407914235705951,
"loss": 3.4598,
"step": 22050
},
{
"epoch": 6.437543696108134,
"grad_norm": 0.32996484637260437,
"learning_rate": 0.00040747666277712944,
"loss": 3.4483,
"step": 22100
},
{
"epoch": 6.4521090654859,
"grad_norm": 0.33145061135292053,
"learning_rate": 0.00040703908984830804,
"loss": 3.4543,
"step": 22150
},
{
"epoch": 6.466674434863668,
"grad_norm": 0.33102595806121826,
"learning_rate": 0.0004066015169194866,
"loss": 3.437,
"step": 22200
},
{
"epoch": 6.481239804241436,
"grad_norm": 0.34182071685791016,
"learning_rate": 0.00040616394399066506,
"loss": 3.4591,
"step": 22250
},
{
"epoch": 6.495805173619203,
"grad_norm": 0.35360977053642273,
"learning_rate": 0.0004057263710618436,
"loss": 3.4661,
"step": 22300
},
{
"epoch": 6.51037054299697,
"grad_norm": 0.34044864773750305,
"learning_rate": 0.00040528879813302214,
"loss": 3.4658,
"step": 22350
},
{
"epoch": 6.524935912374738,
"grad_norm": 0.3496011793613434,
"learning_rate": 0.00040485122520420063,
"loss": 3.455,
"step": 22400
},
{
"epoch": 6.539501281752505,
"grad_norm": 0.31914111971855164,
"learning_rate": 0.00040441365227537917,
"loss": 3.4606,
"step": 22450
},
{
"epoch": 6.554066651130273,
"grad_norm": 0.32800233364105225,
"learning_rate": 0.0004039760793465577,
"loss": 3.4562,
"step": 22500
},
{
"epoch": 6.56863202050804,
"grad_norm": 0.33165040612220764,
"learning_rate": 0.0004035385064177363,
"loss": 3.455,
"step": 22550
},
{
"epoch": 6.583197389885807,
"grad_norm": 0.3741567134857178,
"learning_rate": 0.0004031009334889148,
"loss": 3.4562,
"step": 22600
},
{
"epoch": 6.597762759263575,
"grad_norm": 0.35394638776779175,
"learning_rate": 0.00040266336056009333,
"loss": 3.4607,
"step": 22650
},
{
"epoch": 6.612328128641343,
"grad_norm": 0.3237501084804535,
"learning_rate": 0.00040222578763127187,
"loss": 3.4658,
"step": 22700
},
{
"epoch": 6.626893498019109,
"grad_norm": 0.34644386172294617,
"learning_rate": 0.00040178821470245035,
"loss": 3.4642,
"step": 22750
},
{
"epoch": 6.641458867396877,
"grad_norm": 0.34503695368766785,
"learning_rate": 0.0004013506417736289,
"loss": 3.4739,
"step": 22800
},
{
"epoch": 6.656024236774645,
"grad_norm": 0.3343126177787781,
"learning_rate": 0.00040091306884480743,
"loss": 3.4505,
"step": 22850
},
{
"epoch": 6.670589606152412,
"grad_norm": 0.33412104845046997,
"learning_rate": 0.0004004754959159859,
"loss": 3.4603,
"step": 22900
},
{
"epoch": 6.685154975530179,
"grad_norm": 0.32703226804733276,
"learning_rate": 0.00040003792298716446,
"loss": 3.4612,
"step": 22950
},
{
"epoch": 6.699720344907947,
"grad_norm": 0.32835039496421814,
"learning_rate": 0.00039960035005834305,
"loss": 3.4622,
"step": 23000
},
{
"epoch": 6.699720344907947,
"eval_accuracy": 0.36494254495311274,
"eval_loss": 3.5912156105041504,
"eval_runtime": 180.4184,
"eval_samples_per_second": 92.252,
"eval_steps_per_second": 5.77,
"step": 23000
},
{
"epoch": 6.714285714285714,
"grad_norm": 0.3277016878128052,
"learning_rate": 0.0003991627771295216,
"loss": 3.4609,
"step": 23050
},
{
"epoch": 6.728851083663482,
"grad_norm": 0.3436872363090515,
"learning_rate": 0.0003987252042007001,
"loss": 3.4605,
"step": 23100
},
{
"epoch": 6.743416453041249,
"grad_norm": 0.32483038306236267,
"learning_rate": 0.0003982876312718786,
"loss": 3.468,
"step": 23150
},
{
"epoch": 6.7579818224190165,
"grad_norm": 0.3559059500694275,
"learning_rate": 0.00039785005834305716,
"loss": 3.4694,
"step": 23200
},
{
"epoch": 6.772547191796784,
"grad_norm": 0.34260398149490356,
"learning_rate": 0.00039741248541423564,
"loss": 3.4727,
"step": 23250
},
{
"epoch": 6.787112561174552,
"grad_norm": 0.32523587346076965,
"learning_rate": 0.0003969749124854142,
"loss": 3.4571,
"step": 23300
},
{
"epoch": 6.8016779305523185,
"grad_norm": 0.3347657322883606,
"learning_rate": 0.0003965373395565927,
"loss": 3.4717,
"step": 23350
},
{
"epoch": 6.816243299930086,
"grad_norm": 0.33626583218574524,
"learning_rate": 0.0003960997666277712,
"loss": 3.4646,
"step": 23400
},
{
"epoch": 6.830808669307854,
"grad_norm": 0.36179831624031067,
"learning_rate": 0.0003956621936989498,
"loss": 3.4717,
"step": 23450
},
{
"epoch": 6.845374038685621,
"grad_norm": 0.34891805052757263,
"learning_rate": 0.00039522462077012834,
"loss": 3.4699,
"step": 23500
},
{
"epoch": 6.859939408063388,
"grad_norm": 0.37656670808792114,
"learning_rate": 0.0003947870478413069,
"loss": 3.4674,
"step": 23550
},
{
"epoch": 6.874504777441156,
"grad_norm": 0.3371601402759552,
"learning_rate": 0.00039434947491248537,
"loss": 3.4684,
"step": 23600
},
{
"epoch": 6.8890701468189235,
"grad_norm": 0.3327315151691437,
"learning_rate": 0.0003939119019836639,
"loss": 3.4778,
"step": 23650
},
{
"epoch": 6.903635516196691,
"grad_norm": 0.33458471298217773,
"learning_rate": 0.00039347432905484245,
"loss": 3.4688,
"step": 23700
},
{
"epoch": 6.918200885574458,
"grad_norm": 0.3311387896537781,
"learning_rate": 0.00039303675612602094,
"loss": 3.4707,
"step": 23750
},
{
"epoch": 6.9327662549522255,
"grad_norm": 0.33576178550720215,
"learning_rate": 0.0003925991831971995,
"loss": 3.4649,
"step": 23800
},
{
"epoch": 6.947331624329993,
"grad_norm": 0.316501259803772,
"learning_rate": 0.00039216161026837807,
"loss": 3.4702,
"step": 23850
},
{
"epoch": 6.961896993707761,
"grad_norm": 0.3234950006008148,
"learning_rate": 0.00039172403733955656,
"loss": 3.4551,
"step": 23900
},
{
"epoch": 6.976462363085528,
"grad_norm": 0.34549012780189514,
"learning_rate": 0.0003912864644107351,
"loss": 3.4708,
"step": 23950
},
{
"epoch": 6.991027732463295,
"grad_norm": 0.33629485964775085,
"learning_rate": 0.00039084889148191364,
"loss": 3.4761,
"step": 24000
},
{
"epoch": 6.991027732463295,
"eval_accuracy": 0.3658422421224764,
"eval_loss": 3.5796563625335693,
"eval_runtime": 180.4423,
"eval_samples_per_second": 92.24,
"eval_steps_per_second": 5.769,
"step": 24000
},
{
"epoch": 7.005534840363552,
"grad_norm": 0.3567199409008026,
"learning_rate": 0.0003904113185530922,
"loss": 3.4224,
"step": 24050
},
{
"epoch": 7.020100209741319,
"grad_norm": 0.33759805560112,
"learning_rate": 0.00038997374562427066,
"loss": 3.3554,
"step": 24100
},
{
"epoch": 7.034665579119086,
"grad_norm": 0.3463039696216583,
"learning_rate": 0.0003895361726954492,
"loss": 3.3629,
"step": 24150
},
{
"epoch": 7.049230948496854,
"grad_norm": 0.34043920040130615,
"learning_rate": 0.00038909859976662774,
"loss": 3.3713,
"step": 24200
},
{
"epoch": 7.063796317874622,
"grad_norm": 0.3372809886932373,
"learning_rate": 0.0003886610268378062,
"loss": 3.3729,
"step": 24250
},
{
"epoch": 7.0783616872523885,
"grad_norm": 0.3626004159450531,
"learning_rate": 0.0003882234539089848,
"loss": 3.3779,
"step": 24300
},
{
"epoch": 7.092927056630156,
"grad_norm": 0.3814680278301239,
"learning_rate": 0.00038778588098016336,
"loss": 3.3831,
"step": 24350
},
{
"epoch": 7.107492426007924,
"grad_norm": 0.3421391248703003,
"learning_rate": 0.00038734830805134185,
"loss": 3.3799,
"step": 24400
},
{
"epoch": 7.122057795385691,
"grad_norm": 0.34770506620407104,
"learning_rate": 0.0003869107351225204,
"loss": 3.3751,
"step": 24450
},
{
"epoch": 7.136623164763458,
"grad_norm": 0.348093181848526,
"learning_rate": 0.0003864731621936989,
"loss": 3.3744,
"step": 24500
},
{
"epoch": 7.151188534141226,
"grad_norm": 0.34899893403053284,
"learning_rate": 0.00038603558926487747,
"loss": 3.378,
"step": 24550
},
{
"epoch": 7.165753903518993,
"grad_norm": 0.32636308670043945,
"learning_rate": 0.00038559801633605595,
"loss": 3.3811,
"step": 24600
},
{
"epoch": 7.180319272896761,
"grad_norm": 0.32693901658058167,
"learning_rate": 0.0003851604434072345,
"loss": 3.3978,
"step": 24650
},
{
"epoch": 7.194884642274528,
"grad_norm": 0.35337990522384644,
"learning_rate": 0.0003847228704784131,
"loss": 3.4016,
"step": 24700
},
{
"epoch": 7.2094500116522955,
"grad_norm": 0.33998918533325195,
"learning_rate": 0.0003842852975495915,
"loss": 3.3913,
"step": 24750
},
{
"epoch": 7.224015381030063,
"grad_norm": 0.34085580706596375,
"learning_rate": 0.0003838477246207701,
"loss": 3.3876,
"step": 24800
},
{
"epoch": 7.238580750407831,
"grad_norm": 0.34505322575569153,
"learning_rate": 0.00038341015169194865,
"loss": 3.4013,
"step": 24850
},
{
"epoch": 7.2531461197855975,
"grad_norm": 0.35665056109428406,
"learning_rate": 0.00038297257876312714,
"loss": 3.3945,
"step": 24900
},
{
"epoch": 7.267711489163365,
"grad_norm": 0.33130306005477905,
"learning_rate": 0.0003825350058343057,
"loss": 3.3973,
"step": 24950
},
{
"epoch": 7.282276858541133,
"grad_norm": 0.33717137575149536,
"learning_rate": 0.0003820974329054842,
"loss": 3.4035,
"step": 25000
},
{
"epoch": 7.282276858541133,
"eval_accuracy": 0.36597134137652254,
"eval_loss": 3.5882999897003174,
"eval_runtime": 180.2331,
"eval_samples_per_second": 92.347,
"eval_steps_per_second": 5.776,
"step": 25000
},
{
"epoch": 7.2968422279189,
"grad_norm": 0.343801349401474,
"learning_rate": 0.00038165985997666276,
"loss": 3.3994,
"step": 25050
},
{
"epoch": 7.311407597296667,
"grad_norm": 0.34225597977638245,
"learning_rate": 0.00038122228704784124,
"loss": 3.4,
"step": 25100
},
{
"epoch": 7.325972966674435,
"grad_norm": 0.3473186492919922,
"learning_rate": 0.0003807847141190198,
"loss": 3.397,
"step": 25150
},
{
"epoch": 7.3405383360522025,
"grad_norm": 0.3287709653377533,
"learning_rate": 0.0003803471411901984,
"loss": 3.4058,
"step": 25200
},
{
"epoch": 7.35510370542997,
"grad_norm": 0.351204514503479,
"learning_rate": 0.00037990956826137686,
"loss": 3.4163,
"step": 25250
},
{
"epoch": 7.369669074807737,
"grad_norm": 0.35390347242355347,
"learning_rate": 0.0003794719953325554,
"loss": 3.4228,
"step": 25300
},
{
"epoch": 7.384234444185505,
"grad_norm": 0.3401016891002655,
"learning_rate": 0.00037903442240373394,
"loss": 3.406,
"step": 25350
},
{
"epoch": 7.398799813563272,
"grad_norm": 0.35391902923583984,
"learning_rate": 0.00037859684947491243,
"loss": 3.4097,
"step": 25400
},
{
"epoch": 7.413365182941039,
"grad_norm": 0.342098206281662,
"learning_rate": 0.00037815927654609097,
"loss": 3.4266,
"step": 25450
},
{
"epoch": 7.427930552318807,
"grad_norm": 0.34706324338912964,
"learning_rate": 0.0003777217036172695,
"loss": 3.4083,
"step": 25500
},
{
"epoch": 7.442495921696574,
"grad_norm": 0.35251185297966003,
"learning_rate": 0.00037728413068844805,
"loss": 3.4125,
"step": 25550
},
{
"epoch": 7.457061291074342,
"grad_norm": 0.3509294390678406,
"learning_rate": 0.00037684655775962653,
"loss": 3.4203,
"step": 25600
},
{
"epoch": 7.471626660452109,
"grad_norm": 0.3560699224472046,
"learning_rate": 0.00037640898483080513,
"loss": 3.4272,
"step": 25650
},
{
"epoch": 7.486192029829876,
"grad_norm": 0.34861257672309875,
"learning_rate": 0.00037597141190198367,
"loss": 3.4182,
"step": 25700
},
{
"epoch": 7.500757399207644,
"grad_norm": 0.33859142661094666,
"learning_rate": 0.00037553383897316215,
"loss": 3.4299,
"step": 25750
},
{
"epoch": 7.515322768585412,
"grad_norm": 0.35380759835243225,
"learning_rate": 0.0003750962660443407,
"loss": 3.4201,
"step": 25800
},
{
"epoch": 7.529888137963178,
"grad_norm": 0.34941068291664124,
"learning_rate": 0.00037465869311551923,
"loss": 3.4167,
"step": 25850
},
{
"epoch": 7.544453507340946,
"grad_norm": 0.35646477341651917,
"learning_rate": 0.0003742211201866977,
"loss": 3.4306,
"step": 25900
},
{
"epoch": 7.559018876718714,
"grad_norm": 0.35378143191337585,
"learning_rate": 0.00037378354725787626,
"loss": 3.4086,
"step": 25950
},
{
"epoch": 7.573584246096481,
"grad_norm": 0.3527311384677887,
"learning_rate": 0.0003733459743290548,
"loss": 3.4207,
"step": 26000
},
{
"epoch": 7.573584246096481,
"eval_accuracy": 0.36644117800600207,
"eval_loss": 3.5814883708953857,
"eval_runtime": 180.4499,
"eval_samples_per_second": 92.236,
"eval_steps_per_second": 5.769,
"step": 26000
},
{
"epoch": 7.588149615474248,
"grad_norm": 0.3543234169483185,
"learning_rate": 0.0003729084014002334,
"loss": 3.4278,
"step": 26050
},
{
"epoch": 7.602714984852016,
"grad_norm": 0.35281285643577576,
"learning_rate": 0.0003724708284714119,
"loss": 3.421,
"step": 26100
},
{
"epoch": 7.617280354229783,
"grad_norm": 0.3394710123538971,
"learning_rate": 0.0003720332555425904,
"loss": 3.4238,
"step": 26150
},
{
"epoch": 7.631845723607551,
"grad_norm": 0.34304413199424744,
"learning_rate": 0.00037159568261376896,
"loss": 3.4145,
"step": 26200
},
{
"epoch": 7.646411092985318,
"grad_norm": 0.3429325520992279,
"learning_rate": 0.00037115810968494744,
"loss": 3.4268,
"step": 26250
},
{
"epoch": 7.660976462363085,
"grad_norm": 0.3383738100528717,
"learning_rate": 0.000370720536756126,
"loss": 3.4228,
"step": 26300
},
{
"epoch": 7.675541831740853,
"grad_norm": 0.3476937413215637,
"learning_rate": 0.0003702829638273045,
"loss": 3.4207,
"step": 26350
},
{
"epoch": 7.690107201118621,
"grad_norm": 0.342341810464859,
"learning_rate": 0.000369845390898483,
"loss": 3.4207,
"step": 26400
},
{
"epoch": 7.704672570496387,
"grad_norm": 0.35490912199020386,
"learning_rate": 0.00036940781796966155,
"loss": 3.4281,
"step": 26450
},
{
"epoch": 7.719237939874155,
"grad_norm": 0.34896060824394226,
"learning_rate": 0.00036897024504084014,
"loss": 3.4361,
"step": 26500
},
{
"epoch": 7.733803309251923,
"grad_norm": 0.3462172746658325,
"learning_rate": 0.0003685326721120187,
"loss": 3.425,
"step": 26550
},
{
"epoch": 7.74836867862969,
"grad_norm": 0.35829275846481323,
"learning_rate": 0.00036809509918319717,
"loss": 3.4346,
"step": 26600
},
{
"epoch": 7.762934048007457,
"grad_norm": 0.3367747664451599,
"learning_rate": 0.0003676575262543757,
"loss": 3.4247,
"step": 26650
},
{
"epoch": 7.777499417385225,
"grad_norm": 0.33087530732154846,
"learning_rate": 0.00036721995332555425,
"loss": 3.432,
"step": 26700
},
{
"epoch": 7.792064786762992,
"grad_norm": 0.3543736934661865,
"learning_rate": 0.00036678238039673274,
"loss": 3.4379,
"step": 26750
},
{
"epoch": 7.80663015614076,
"grad_norm": 0.3304196894168854,
"learning_rate": 0.0003663448074679113,
"loss": 3.4238,
"step": 26800
},
{
"epoch": 7.821195525518527,
"grad_norm": 0.35223904252052307,
"learning_rate": 0.0003659072345390898,
"loss": 3.426,
"step": 26850
},
{
"epoch": 7.8357608948962945,
"grad_norm": 0.34050217270851135,
"learning_rate": 0.0003654696616102683,
"loss": 3.4172,
"step": 26900
},
{
"epoch": 7.850326264274062,
"grad_norm": 0.3450503349304199,
"learning_rate": 0.00036503208868144684,
"loss": 3.4337,
"step": 26950
},
{
"epoch": 7.86489163365183,
"grad_norm": 0.3508300483226776,
"learning_rate": 0.00036459451575262543,
"loss": 3.4351,
"step": 27000
},
{
"epoch": 7.86489163365183,
"eval_accuracy": 0.36722659058981666,
"eval_loss": 3.571290969848633,
"eval_runtime": 180.2666,
"eval_samples_per_second": 92.33,
"eval_steps_per_second": 5.775,
"step": 27000
},
{
"epoch": 7.8794570030295965,
"grad_norm": 0.35574260354042053,
"learning_rate": 0.000364156942823804,
"loss": 3.4341,
"step": 27050
},
{
"epoch": 7.894022372407364,
"grad_norm": 0.34330523014068604,
"learning_rate": 0.00036371936989498246,
"loss": 3.4396,
"step": 27100
},
{
"epoch": 7.908587741785132,
"grad_norm": 0.35326018929481506,
"learning_rate": 0.000363281796966161,
"loss": 3.4192,
"step": 27150
},
{
"epoch": 7.923153111162899,
"grad_norm": 0.356656938791275,
"learning_rate": 0.00036284422403733954,
"loss": 3.4359,
"step": 27200
},
{
"epoch": 7.937718480540666,
"grad_norm": 0.32995936274528503,
"learning_rate": 0.000362406651108518,
"loss": 3.4363,
"step": 27250
},
{
"epoch": 7.952283849918434,
"grad_norm": 0.3421317934989929,
"learning_rate": 0.00036196907817969657,
"loss": 3.4263,
"step": 27300
},
{
"epoch": 7.9668492192962015,
"grad_norm": 0.33741775155067444,
"learning_rate": 0.0003615315052508751,
"loss": 3.4332,
"step": 27350
},
{
"epoch": 7.981414588673969,
"grad_norm": 0.34820324182510376,
"learning_rate": 0.0003610939323220536,
"loss": 3.4311,
"step": 27400
},
{
"epoch": 7.995979958051736,
"grad_norm": 0.36536943912506104,
"learning_rate": 0.0003606563593932322,
"loss": 3.4287,
"step": 27450
},
{
"epoch": 8.010487065951992,
"grad_norm": 0.3467245399951935,
"learning_rate": 0.0003602187864644107,
"loss": 3.3578,
"step": 27500
},
{
"epoch": 8.02505243532976,
"grad_norm": 0.36606982350349426,
"learning_rate": 0.00035978121353558927,
"loss": 3.3254,
"step": 27550
},
{
"epoch": 8.039617804707527,
"grad_norm": 0.370090126991272,
"learning_rate": 0.00035934364060676775,
"loss": 3.3324,
"step": 27600
},
{
"epoch": 8.054183174085296,
"grad_norm": 0.3692021667957306,
"learning_rate": 0.0003589060676779463,
"loss": 3.3385,
"step": 27650
},
{
"epoch": 8.068748543463062,
"grad_norm": 0.35137107968330383,
"learning_rate": 0.00035846849474912483,
"loss": 3.3448,
"step": 27700
},
{
"epoch": 8.08331391284083,
"grad_norm": 0.3459080755710602,
"learning_rate": 0.0003580309218203033,
"loss": 3.3318,
"step": 27750
},
{
"epoch": 8.097879282218598,
"grad_norm": 0.35793742537498474,
"learning_rate": 0.00035759334889148186,
"loss": 3.3345,
"step": 27800
},
{
"epoch": 8.112444651596364,
"grad_norm": 0.35751616954803467,
"learning_rate": 0.00035715577596266045,
"loss": 3.3613,
"step": 27850
},
{
"epoch": 8.127010020974131,
"grad_norm": 0.3466125428676605,
"learning_rate": 0.00035671820303383894,
"loss": 3.3478,
"step": 27900
},
{
"epoch": 8.1415753903519,
"grad_norm": 0.3528430759906769,
"learning_rate": 0.0003562806301050175,
"loss": 3.3564,
"step": 27950
},
{
"epoch": 8.156140759729666,
"grad_norm": 0.36010900139808655,
"learning_rate": 0.000355843057176196,
"loss": 3.3456,
"step": 28000
},
{
"epoch": 8.156140759729666,
"eval_accuracy": 0.36759131361900715,
"eval_loss": 3.580268144607544,
"eval_runtime": 180.2651,
"eval_samples_per_second": 92.331,
"eval_steps_per_second": 5.775,
"step": 28000
},
{
"epoch": 8.170706129107435,
"grad_norm": 0.3616182804107666,
"learning_rate": 0.00035540548424737456,
"loss": 3.3554,
"step": 28050
},
{
"epoch": 8.185271498485202,
"grad_norm": 0.3429206311702728,
"learning_rate": 0.00035496791131855304,
"loss": 3.3689,
"step": 28100
},
{
"epoch": 8.199836867862969,
"grad_norm": 0.3601152300834656,
"learning_rate": 0.0003545303383897316,
"loss": 3.3647,
"step": 28150
},
{
"epoch": 8.214402237240737,
"grad_norm": 0.346986323595047,
"learning_rate": 0.0003540927654609101,
"loss": 3.3639,
"step": 28200
},
{
"epoch": 8.228967606618504,
"grad_norm": 0.3525499105453491,
"learning_rate": 0.0003536551925320886,
"loss": 3.3606,
"step": 28250
},
{
"epoch": 8.24353297599627,
"grad_norm": 0.3487248420715332,
"learning_rate": 0.0003532176196032672,
"loss": 3.3582,
"step": 28300
},
{
"epoch": 8.258098345374039,
"grad_norm": 0.3517068028450012,
"learning_rate": 0.00035278004667444574,
"loss": 3.3674,
"step": 28350
},
{
"epoch": 8.272663714751806,
"grad_norm": 0.3672351837158203,
"learning_rate": 0.00035234247374562423,
"loss": 3.3631,
"step": 28400
},
{
"epoch": 8.287229084129574,
"grad_norm": 0.3655698001384735,
"learning_rate": 0.00035190490081680277,
"loss": 3.3605,
"step": 28450
},
{
"epoch": 8.301794453507341,
"grad_norm": 0.3492221534252167,
"learning_rate": 0.0003514673278879813,
"loss": 3.3721,
"step": 28500
},
{
"epoch": 8.316359822885108,
"grad_norm": 0.33222460746765137,
"learning_rate": 0.00035102975495915985,
"loss": 3.3655,
"step": 28550
},
{
"epoch": 8.330925192262876,
"grad_norm": 0.3473198115825653,
"learning_rate": 0.00035059218203033833,
"loss": 3.3765,
"step": 28600
},
{
"epoch": 8.345490561640643,
"grad_norm": 0.3525267541408539,
"learning_rate": 0.0003501546091015169,
"loss": 3.3692,
"step": 28650
},
{
"epoch": 8.36005593101841,
"grad_norm": 0.3442334234714508,
"learning_rate": 0.00034971703617269547,
"loss": 3.3892,
"step": 28700
},
{
"epoch": 8.374621300396178,
"grad_norm": 0.35674968361854553,
"learning_rate": 0.0003492794632438739,
"loss": 3.378,
"step": 28750
},
{
"epoch": 8.389186669773945,
"grad_norm": 0.34580376744270325,
"learning_rate": 0.0003488418903150525,
"loss": 3.3797,
"step": 28800
},
{
"epoch": 8.403752039151712,
"grad_norm": 0.34604698419570923,
"learning_rate": 0.00034840431738623103,
"loss": 3.3674,
"step": 28850
},
{
"epoch": 8.41831740852948,
"grad_norm": 0.3592158854007721,
"learning_rate": 0.0003479667444574095,
"loss": 3.377,
"step": 28900
},
{
"epoch": 8.432882777907247,
"grad_norm": 0.359283447265625,
"learning_rate": 0.00034752917152858806,
"loss": 3.3884,
"step": 28950
},
{
"epoch": 8.447448147285016,
"grad_norm": 0.34626027941703796,
"learning_rate": 0.0003470915985997666,
"loss": 3.379,
"step": 29000
},
{
"epoch": 8.447448147285016,
"eval_accuracy": 0.3679311103805677,
"eval_loss": 3.57328462600708,
"eval_runtime": 180.157,
"eval_samples_per_second": 92.386,
"eval_steps_per_second": 5.778,
"step": 29000
},
{
"epoch": 8.462013516662783,
"grad_norm": 0.3666757643222809,
"learning_rate": 0.00034665402567094514,
"loss": 3.3796,
"step": 29050
},
{
"epoch": 8.47657888604055,
"grad_norm": 0.3576624393463135,
"learning_rate": 0.0003462164527421236,
"loss": 3.3846,
"step": 29100
},
{
"epoch": 8.491144255418318,
"grad_norm": 0.36986008286476135,
"learning_rate": 0.00034577887981330216,
"loss": 3.3789,
"step": 29150
},
{
"epoch": 8.505709624796085,
"grad_norm": 0.34708988666534424,
"learning_rate": 0.00034534130688448076,
"loss": 3.3752,
"step": 29200
},
{
"epoch": 8.520274994173853,
"grad_norm": 0.36563989520072937,
"learning_rate": 0.00034490373395565924,
"loss": 3.3733,
"step": 29250
},
{
"epoch": 8.53484036355162,
"grad_norm": 0.36509010195732117,
"learning_rate": 0.0003444661610268378,
"loss": 3.3893,
"step": 29300
},
{
"epoch": 8.549405732929387,
"grad_norm": 0.3598864674568176,
"learning_rate": 0.0003440285880980163,
"loss": 3.3822,
"step": 29350
},
{
"epoch": 8.563971102307155,
"grad_norm": 0.3573833405971527,
"learning_rate": 0.0003435910151691948,
"loss": 3.3875,
"step": 29400
},
{
"epoch": 8.578536471684922,
"grad_norm": 0.33812621235847473,
"learning_rate": 0.00034315344224037335,
"loss": 3.3907,
"step": 29450
},
{
"epoch": 8.593101841062689,
"grad_norm": 0.3446572422981262,
"learning_rate": 0.0003427158693115519,
"loss": 3.3985,
"step": 29500
},
{
"epoch": 8.607667210440457,
"grad_norm": 0.34378212690353394,
"learning_rate": 0.00034227829638273043,
"loss": 3.3868,
"step": 29550
},
{
"epoch": 8.622232579818224,
"grad_norm": 0.35360199213027954,
"learning_rate": 0.0003418407234539089,
"loss": 3.4024,
"step": 29600
},
{
"epoch": 8.63679794919599,
"grad_norm": 0.35581034421920776,
"learning_rate": 0.0003414031505250875,
"loss": 3.3898,
"step": 29650
},
{
"epoch": 8.65136331857376,
"grad_norm": 0.35615333914756775,
"learning_rate": 0.00034096557759626605,
"loss": 3.3959,
"step": 29700
},
{
"epoch": 8.665928687951526,
"grad_norm": 0.35061442852020264,
"learning_rate": 0.00034052800466744453,
"loss": 3.39,
"step": 29750
},
{
"epoch": 8.680494057329295,
"grad_norm": 0.3618820905685425,
"learning_rate": 0.0003400904317386231,
"loss": 3.3843,
"step": 29800
},
{
"epoch": 8.695059426707061,
"grad_norm": 0.34694838523864746,
"learning_rate": 0.0003396528588098016,
"loss": 3.3842,
"step": 29850
},
{
"epoch": 8.709624796084828,
"grad_norm": 0.34534159302711487,
"learning_rate": 0.0003392152858809801,
"loss": 3.392,
"step": 29900
},
{
"epoch": 8.724190165462597,
"grad_norm": 0.35484299063682556,
"learning_rate": 0.00033877771295215864,
"loss": 3.3917,
"step": 29950
},
{
"epoch": 8.738755534840363,
"grad_norm": 0.34446921944618225,
"learning_rate": 0.0003383401400233372,
"loss": 3.3926,
"step": 30000
},
{
"epoch": 8.738755534840363,
"eval_accuracy": 0.3686057656808563,
"eval_loss": 3.563647985458374,
"eval_runtime": 180.6198,
"eval_samples_per_second": 92.149,
"eval_steps_per_second": 5.763,
"step": 30000
},
{
"epoch": 8.753320904218132,
"grad_norm": 0.38022834062576294,
"learning_rate": 0.0003379025670945158,
"loss": 3.3971,
"step": 30050
},
{
"epoch": 8.767886273595899,
"grad_norm": 0.3438722491264343,
"learning_rate": 0.00033746499416569426,
"loss": 3.3964,
"step": 30100
},
{
"epoch": 8.782451642973665,
"grad_norm": 0.3680688142776489,
"learning_rate": 0.0003370274212368728,
"loss": 3.4007,
"step": 30150
},
{
"epoch": 8.797017012351434,
"grad_norm": 0.3484303653240204,
"learning_rate": 0.00033658984830805134,
"loss": 3.3908,
"step": 30200
},
{
"epoch": 8.8115823817292,
"grad_norm": 0.34887251257896423,
"learning_rate": 0.0003361522753792298,
"loss": 3.3864,
"step": 30250
},
{
"epoch": 8.826147751106967,
"grad_norm": 0.38353490829467773,
"learning_rate": 0.00033571470245040837,
"loss": 3.3924,
"step": 30300
},
{
"epoch": 8.840713120484736,
"grad_norm": 0.3619522750377655,
"learning_rate": 0.0003352771295215869,
"loss": 3.3943,
"step": 30350
},
{
"epoch": 8.855278489862503,
"grad_norm": 0.34908801317214966,
"learning_rate": 0.0003348395565927654,
"loss": 3.4001,
"step": 30400
},
{
"epoch": 8.86984385924027,
"grad_norm": 0.3426980972290039,
"learning_rate": 0.00033440198366394393,
"loss": 3.3924,
"step": 30450
},
{
"epoch": 8.884409228618038,
"grad_norm": 0.3510425090789795,
"learning_rate": 0.0003339644107351225,
"loss": 3.3992,
"step": 30500
},
{
"epoch": 8.898974597995805,
"grad_norm": 0.3653899133205414,
"learning_rate": 0.00033352683780630107,
"loss": 3.3945,
"step": 30550
},
{
"epoch": 8.913539967373573,
"grad_norm": 0.3542352318763733,
"learning_rate": 0.00033308926487747955,
"loss": 3.4124,
"step": 30600
},
{
"epoch": 8.92810533675134,
"grad_norm": 0.3585004508495331,
"learning_rate": 0.0003326516919486581,
"loss": 3.3896,
"step": 30650
},
{
"epoch": 8.942670706129107,
"grad_norm": 0.3748304843902588,
"learning_rate": 0.00033221411901983663,
"loss": 3.3986,
"step": 30700
},
{
"epoch": 8.957236075506875,
"grad_norm": 0.3452504277229309,
"learning_rate": 0.0003317765460910151,
"loss": 3.3955,
"step": 30750
},
{
"epoch": 8.971801444884642,
"grad_norm": 0.3590230941772461,
"learning_rate": 0.00033133897316219366,
"loss": 3.396,
"step": 30800
},
{
"epoch": 8.986366814262409,
"grad_norm": 0.3469237685203552,
"learning_rate": 0.0003309014002333722,
"loss": 3.3886,
"step": 30850
},
{
"epoch": 9.000873922162667,
"grad_norm": 0.34952783584594727,
"learning_rate": 0.0003304638273045507,
"loss": 3.3905,
"step": 30900
},
{
"epoch": 9.015439291540433,
"grad_norm": 0.3712822198867798,
"learning_rate": 0.0003300262543757292,
"loss": 3.2828,
"step": 30950
},
{
"epoch": 9.0300046609182,
"grad_norm": 0.371115505695343,
"learning_rate": 0.0003295886814469078,
"loss": 3.305,
"step": 31000
},
{
"epoch": 9.0300046609182,
"eval_accuracy": 0.36886643330036484,
"eval_loss": 3.5686051845550537,
"eval_runtime": 181.9038,
"eval_samples_per_second": 91.499,
"eval_steps_per_second": 5.723,
"step": 31000
},
{
"epoch": 9.044570030295969,
"grad_norm": 0.3616848289966583,
"learning_rate": 0.00032915110851808636,
"loss": 3.2979,
"step": 31050
},
{
"epoch": 9.059135399673735,
"grad_norm": 0.3446025550365448,
"learning_rate": 0.00032871353558926484,
"loss": 3.3074,
"step": 31100
},
{
"epoch": 9.073700769051504,
"grad_norm": 0.36741337180137634,
"learning_rate": 0.0003282759626604434,
"loss": 3.2965,
"step": 31150
},
{
"epoch": 9.08826613842927,
"grad_norm": 0.3401558995246887,
"learning_rate": 0.0003278383897316219,
"loss": 3.305,
"step": 31200
},
{
"epoch": 9.102831507807037,
"grad_norm": 0.364580363035202,
"learning_rate": 0.0003274008168028004,
"loss": 3.3159,
"step": 31250
},
{
"epoch": 9.117396877184806,
"grad_norm": 0.3511641025543213,
"learning_rate": 0.00032696324387397895,
"loss": 3.3232,
"step": 31300
},
{
"epoch": 9.131962246562573,
"grad_norm": 0.3950608968734741,
"learning_rate": 0.0003265256709451575,
"loss": 3.3135,
"step": 31350
},
{
"epoch": 9.14652761594034,
"grad_norm": 0.35723525285720825,
"learning_rate": 0.00032608809801633597,
"loss": 3.3117,
"step": 31400
},
{
"epoch": 9.161092985318108,
"grad_norm": 0.38060232996940613,
"learning_rate": 0.00032565052508751457,
"loss": 3.3211,
"step": 31450
},
{
"epoch": 9.175658354695875,
"grad_norm": 0.3600994944572449,
"learning_rate": 0.0003252129521586931,
"loss": 3.3195,
"step": 31500
},
{
"epoch": 9.190223724073643,
"grad_norm": 0.38114050030708313,
"learning_rate": 0.00032477537922987165,
"loss": 3.3272,
"step": 31550
},
{
"epoch": 9.20478909345141,
"grad_norm": 0.35321810841560364,
"learning_rate": 0.00032433780630105013,
"loss": 3.3198,
"step": 31600
},
{
"epoch": 9.219354462829177,
"grad_norm": 0.3709951639175415,
"learning_rate": 0.00032390023337222867,
"loss": 3.3285,
"step": 31650
},
{
"epoch": 9.233919832206945,
"grad_norm": 0.3782629668712616,
"learning_rate": 0.0003234626604434072,
"loss": 3.3391,
"step": 31700
},
{
"epoch": 9.248485201584712,
"grad_norm": 0.3684213161468506,
"learning_rate": 0.0003230250875145857,
"loss": 3.3321,
"step": 31750
},
{
"epoch": 9.263050570962479,
"grad_norm": 0.37817445397377014,
"learning_rate": 0.00032258751458576424,
"loss": 3.3253,
"step": 31800
},
{
"epoch": 9.277615940340247,
"grad_norm": 0.37698403000831604,
"learning_rate": 0.00032214994165694283,
"loss": 3.3263,
"step": 31850
},
{
"epoch": 9.292181309718014,
"grad_norm": 0.35898423194885254,
"learning_rate": 0.0003217123687281213,
"loss": 3.3407,
"step": 31900
},
{
"epoch": 9.306746679095783,
"grad_norm": 0.37121668457984924,
"learning_rate": 0.00032127479579929986,
"loss": 3.3469,
"step": 31950
},
{
"epoch": 9.32131204847355,
"grad_norm": 0.3602243959903717,
"learning_rate": 0.0003208372228704784,
"loss": 3.333,
"step": 32000
},
{
"epoch": 9.32131204847355,
"eval_accuracy": 0.36881940260672325,
"eval_loss": 3.568206787109375,
"eval_runtime": 181.6399,
"eval_samples_per_second": 91.632,
"eval_steps_per_second": 5.731,
"step": 32000
},
{
"epoch": 9.335877417851316,
"grad_norm": 0.3777805268764496,
"learning_rate": 0.00032039964994165694,
"loss": 3.3404,
"step": 32050
},
{
"epoch": 9.350442787229085,
"grad_norm": 0.36622655391693115,
"learning_rate": 0.0003199620770128354,
"loss": 3.3462,
"step": 32100
},
{
"epoch": 9.365008156606851,
"grad_norm": 0.3432258069515228,
"learning_rate": 0.00031952450408401396,
"loss": 3.3499,
"step": 32150
},
{
"epoch": 9.379573525984618,
"grad_norm": 0.3571391999721527,
"learning_rate": 0.0003190869311551925,
"loss": 3.3437,
"step": 32200
},
{
"epoch": 9.394138895362387,
"grad_norm": 0.3796580731868744,
"learning_rate": 0.000318649358226371,
"loss": 3.3445,
"step": 32250
},
{
"epoch": 9.408704264740154,
"grad_norm": 0.3999924659729004,
"learning_rate": 0.0003182117852975496,
"loss": 3.3398,
"step": 32300
},
{
"epoch": 9.423269634117922,
"grad_norm": 0.3521633744239807,
"learning_rate": 0.0003177742123687281,
"loss": 3.3518,
"step": 32350
},
{
"epoch": 9.437835003495689,
"grad_norm": 0.34816059470176697,
"learning_rate": 0.0003173366394399066,
"loss": 3.3498,
"step": 32400
},
{
"epoch": 9.452400372873456,
"grad_norm": 0.3519940674304962,
"learning_rate": 0.00031689906651108515,
"loss": 3.3491,
"step": 32450
},
{
"epoch": 9.466965742251224,
"grad_norm": 0.366641104221344,
"learning_rate": 0.0003164614935822637,
"loss": 3.3575,
"step": 32500
},
{
"epoch": 9.48153111162899,
"grad_norm": 0.3859027028083801,
"learning_rate": 0.00031602392065344223,
"loss": 3.3477,
"step": 32550
},
{
"epoch": 9.496096481006758,
"grad_norm": 0.3514662981033325,
"learning_rate": 0.0003155863477246207,
"loss": 3.3536,
"step": 32600
},
{
"epoch": 9.510661850384526,
"grad_norm": 0.37433597445487976,
"learning_rate": 0.00031514877479579925,
"loss": 3.3606,
"step": 32650
},
{
"epoch": 9.525227219762293,
"grad_norm": 0.3747974932193756,
"learning_rate": 0.00031471120186697785,
"loss": 3.3621,
"step": 32700
},
{
"epoch": 9.53979258914006,
"grad_norm": 0.38271790742874146,
"learning_rate": 0.00031427362893815633,
"loss": 3.3599,
"step": 32750
},
{
"epoch": 9.554357958517828,
"grad_norm": 0.3738161027431488,
"learning_rate": 0.0003138360560093349,
"loss": 3.35,
"step": 32800
},
{
"epoch": 9.568923327895595,
"grad_norm": 0.37082603573799133,
"learning_rate": 0.0003133984830805134,
"loss": 3.3593,
"step": 32850
},
{
"epoch": 9.583488697273363,
"grad_norm": 0.38742882013320923,
"learning_rate": 0.0003129609101516919,
"loss": 3.3513,
"step": 32900
},
{
"epoch": 9.59805406665113,
"grad_norm": 0.36848726868629456,
"learning_rate": 0.00031252333722287044,
"loss": 3.3619,
"step": 32950
},
{
"epoch": 9.612619436028897,
"grad_norm": 0.34450680017471313,
"learning_rate": 0.000312085764294049,
"loss": 3.3523,
"step": 33000
},
{
"epoch": 9.612619436028897,
"eval_accuracy": 0.36961469163620253,
"eval_loss": 3.5594406127929688,
"eval_runtime": 181.5027,
"eval_samples_per_second": 91.701,
"eval_steps_per_second": 5.735,
"step": 33000
},
{
"epoch": 9.627184805406666,
"grad_norm": 0.34740373492240906,
"learning_rate": 0.0003116481913652275,
"loss": 3.3523,
"step": 33050
},
{
"epoch": 9.641750174784432,
"grad_norm": 0.3639390468597412,
"learning_rate": 0.000311210618436406,
"loss": 3.3569,
"step": 33100
},
{
"epoch": 9.6563155441622,
"grad_norm": 0.3668532073497772,
"learning_rate": 0.0003107730455075846,
"loss": 3.3582,
"step": 33150
},
{
"epoch": 9.670880913539968,
"grad_norm": 0.3689277172088623,
"learning_rate": 0.00031033547257876314,
"loss": 3.3615,
"step": 33200
},
{
"epoch": 9.685446282917734,
"grad_norm": 0.3605565130710602,
"learning_rate": 0.0003098978996499416,
"loss": 3.3501,
"step": 33250
},
{
"epoch": 9.700011652295503,
"grad_norm": 0.3678613007068634,
"learning_rate": 0.00030946032672112016,
"loss": 3.3653,
"step": 33300
},
{
"epoch": 9.71457702167327,
"grad_norm": 0.360675185918808,
"learning_rate": 0.0003090227537922987,
"loss": 3.3637,
"step": 33350
},
{
"epoch": 9.729142391051036,
"grad_norm": 0.3719678819179535,
"learning_rate": 0.0003085851808634772,
"loss": 3.357,
"step": 33400
},
{
"epoch": 9.743707760428805,
"grad_norm": 0.3562043607234955,
"learning_rate": 0.00030814760793465573,
"loss": 3.3531,
"step": 33450
},
{
"epoch": 9.758273129806572,
"grad_norm": 0.37112271785736084,
"learning_rate": 0.00030771003500583427,
"loss": 3.3629,
"step": 33500
},
{
"epoch": 9.772838499184338,
"grad_norm": 0.3823767900466919,
"learning_rate": 0.00030727246207701286,
"loss": 3.3711,
"step": 33550
},
{
"epoch": 9.787403868562107,
"grad_norm": 0.3594043552875519,
"learning_rate": 0.0003068348891481913,
"loss": 3.3712,
"step": 33600
},
{
"epoch": 9.801969237939874,
"grad_norm": 0.3566214442253113,
"learning_rate": 0.0003063973162193699,
"loss": 3.3842,
"step": 33650
},
{
"epoch": 9.816534607317642,
"grad_norm": 0.36310461163520813,
"learning_rate": 0.00030595974329054843,
"loss": 3.3596,
"step": 33700
},
{
"epoch": 9.831099976695409,
"grad_norm": 0.36038920283317566,
"learning_rate": 0.0003055221703617269,
"loss": 3.3677,
"step": 33750
},
{
"epoch": 9.845665346073176,
"grad_norm": 0.34875422716140747,
"learning_rate": 0.00030508459743290546,
"loss": 3.3738,
"step": 33800
},
{
"epoch": 9.860230715450944,
"grad_norm": 0.3687998056411743,
"learning_rate": 0.000304647024504084,
"loss": 3.3725,
"step": 33850
},
{
"epoch": 9.874796084828711,
"grad_norm": 0.3492382764816284,
"learning_rate": 0.0003042094515752625,
"loss": 3.3642,
"step": 33900
},
{
"epoch": 9.88936145420648,
"grad_norm": 0.34819406270980835,
"learning_rate": 0.000303771878646441,
"loss": 3.3783,
"step": 33950
},
{
"epoch": 9.903926823584246,
"grad_norm": 0.36848151683807373,
"learning_rate": 0.00030333430571761956,
"loss": 3.3654,
"step": 34000
},
{
"epoch": 9.903926823584246,
"eval_accuracy": 0.3703816446727628,
"eval_loss": 3.5503089427948,
"eval_runtime": 181.4493,
"eval_samples_per_second": 91.728,
"eval_steps_per_second": 5.737,
"step": 34000
},
{
"epoch": 9.918492192962013,
"grad_norm": 0.39530327916145325,
"learning_rate": 0.00030289673278879816,
"loss": 3.3634,
"step": 34050
},
{
"epoch": 9.933057562339782,
"grad_norm": 0.3620380759239197,
"learning_rate": 0.00030245915985997664,
"loss": 3.3729,
"step": 34100
},
{
"epoch": 9.947622931717548,
"grad_norm": 0.356423020362854,
"learning_rate": 0.0003020215869311552,
"loss": 3.3823,
"step": 34150
},
{
"epoch": 9.962188301095315,
"grad_norm": 0.35574576258659363,
"learning_rate": 0.0003015840140023337,
"loss": 3.3698,
"step": 34200
},
{
"epoch": 9.976753670473084,
"grad_norm": 0.3700348734855652,
"learning_rate": 0.0003011464410735122,
"loss": 3.368,
"step": 34250
},
{
"epoch": 9.99131903985085,
"grad_norm": 0.3582363724708557,
"learning_rate": 0.00030070886814469075,
"loss": 3.3747,
"step": 34300
},
{
"epoch": 10.005826147751106,
"grad_norm": 0.3664577901363373,
"learning_rate": 0.0003002712952158693,
"loss": 3.3242,
"step": 34350
},
{
"epoch": 10.020391517128875,
"grad_norm": 0.3791219890117645,
"learning_rate": 0.0002998337222870478,
"loss": 3.2693,
"step": 34400
},
{
"epoch": 10.034956886506642,
"grad_norm": 0.3724304437637329,
"learning_rate": 0.0002993961493582263,
"loss": 3.2642,
"step": 34450
},
{
"epoch": 10.049522255884408,
"grad_norm": 0.36520498991012573,
"learning_rate": 0.0002989585764294049,
"loss": 3.2756,
"step": 34500
},
{
"epoch": 10.064087625262177,
"grad_norm": 0.3840792179107666,
"learning_rate": 0.0002985210035005834,
"loss": 3.28,
"step": 34550
},
{
"epoch": 10.078652994639944,
"grad_norm": 0.3588644564151764,
"learning_rate": 0.00029808343057176193,
"loss": 3.2725,
"step": 34600
},
{
"epoch": 10.093218364017712,
"grad_norm": 0.3608386516571045,
"learning_rate": 0.00029764585764294047,
"loss": 3.2845,
"step": 34650
},
{
"epoch": 10.107783733395479,
"grad_norm": 0.37310636043548584,
"learning_rate": 0.00029720828471411896,
"loss": 3.2934,
"step": 34700
},
{
"epoch": 10.122349102773246,
"grad_norm": 0.3664185404777527,
"learning_rate": 0.00029677071178529755,
"loss": 3.2953,
"step": 34750
},
{
"epoch": 10.136914472151014,
"grad_norm": 0.3596240282058716,
"learning_rate": 0.00029633313885647604,
"loss": 3.304,
"step": 34800
},
{
"epoch": 10.151479841528781,
"grad_norm": 0.3951849341392517,
"learning_rate": 0.0002958955659276546,
"loss": 3.2999,
"step": 34850
},
{
"epoch": 10.166045210906548,
"grad_norm": 0.38322994112968445,
"learning_rate": 0.0002954579929988331,
"loss": 3.304,
"step": 34900
},
{
"epoch": 10.180610580284316,
"grad_norm": 0.36491626501083374,
"learning_rate": 0.00029502042007001166,
"loss": 3.2911,
"step": 34950
},
{
"epoch": 10.195175949662083,
"grad_norm": 0.37527546286582947,
"learning_rate": 0.0002945828471411902,
"loss": 3.2834,
"step": 35000
},
{
"epoch": 10.195175949662083,
"eval_accuracy": 0.3699247414840347,
"eval_loss": 3.5616824626922607,
"eval_runtime": 180.3891,
"eval_samples_per_second": 92.267,
"eval_steps_per_second": 5.771,
"step": 35000
},
{
"epoch": 10.209741319039852,
"grad_norm": 0.3800257742404938,
"learning_rate": 0.0002941452742123687,
"loss": 3.2981,
"step": 35050
},
{
"epoch": 10.224306688417618,
"grad_norm": 0.3631065785884857,
"learning_rate": 0.0002937077012835472,
"loss": 3.3006,
"step": 35100
},
{
"epoch": 10.238872057795385,
"grad_norm": 0.3786700963973999,
"learning_rate": 0.00029327012835472576,
"loss": 3.3138,
"step": 35150
},
{
"epoch": 10.253437427173154,
"grad_norm": 0.35197684168815613,
"learning_rate": 0.0002928325554259043,
"loss": 3.3003,
"step": 35200
},
{
"epoch": 10.26800279655092,
"grad_norm": 0.36957064270973206,
"learning_rate": 0.00029239498249708284,
"loss": 3.3105,
"step": 35250
},
{
"epoch": 10.282568165928687,
"grad_norm": 0.37074217200279236,
"learning_rate": 0.00029195740956826133,
"loss": 3.3123,
"step": 35300
},
{
"epoch": 10.297133535306456,
"grad_norm": 0.3711046278476715,
"learning_rate": 0.0002915198366394399,
"loss": 3.302,
"step": 35350
},
{
"epoch": 10.311698904684222,
"grad_norm": 0.3888838589191437,
"learning_rate": 0.0002910822637106184,
"loss": 3.3077,
"step": 35400
},
{
"epoch": 10.326264274061991,
"grad_norm": 0.3660491704940796,
"learning_rate": 0.00029064469078179695,
"loss": 3.3186,
"step": 35450
},
{
"epoch": 10.340829643439758,
"grad_norm": 0.3750646412372589,
"learning_rate": 0.0002902071178529755,
"loss": 3.3088,
"step": 35500
},
{
"epoch": 10.355395012817525,
"grad_norm": 0.3611460030078888,
"learning_rate": 0.000289769544924154,
"loss": 3.3126,
"step": 35550
},
{
"epoch": 10.369960382195293,
"grad_norm": 0.3784548342227936,
"learning_rate": 0.00028933197199533257,
"loss": 3.3199,
"step": 35600
},
{
"epoch": 10.38452575157306,
"grad_norm": 0.3654816746711731,
"learning_rate": 0.00028889439906651105,
"loss": 3.317,
"step": 35650
},
{
"epoch": 10.399091120950827,
"grad_norm": 0.3819401264190674,
"learning_rate": 0.0002884568261376896,
"loss": 3.3174,
"step": 35700
},
{
"epoch": 10.413656490328595,
"grad_norm": 0.3685275912284851,
"learning_rate": 0.00028801925320886813,
"loss": 3.3172,
"step": 35750
},
{
"epoch": 10.428221859706362,
"grad_norm": 0.3687780201435089,
"learning_rate": 0.0002875816802800466,
"loss": 3.3185,
"step": 35800
},
{
"epoch": 10.44278722908413,
"grad_norm": 0.3637807369232178,
"learning_rate": 0.0002871441073512252,
"loss": 3.3257,
"step": 35850
},
{
"epoch": 10.457352598461897,
"grad_norm": 0.3877573013305664,
"learning_rate": 0.0002867065344224037,
"loss": 3.3316,
"step": 35900
},
{
"epoch": 10.471917967839664,
"grad_norm": 0.37709304690361023,
"learning_rate": 0.00028626896149358224,
"loss": 3.3028,
"step": 35950
},
{
"epoch": 10.486483337217432,
"grad_norm": 0.36883544921875,
"learning_rate": 0.0002858313885647608,
"loss": 3.3313,
"step": 36000
},
{
"epoch": 10.486483337217432,
"eval_accuracy": 0.37047523575310953,
"eval_loss": 3.555171489715576,
"eval_runtime": 180.6124,
"eval_samples_per_second": 92.153,
"eval_steps_per_second": 5.764,
"step": 36000
},
{
"epoch": 10.5010487065952,
"grad_norm": 0.37332433462142944,
"learning_rate": 0.0002853938156359393,
"loss": 3.3147,
"step": 36050
},
{
"epoch": 10.515614075972966,
"grad_norm": 0.3772258758544922,
"learning_rate": 0.00028495624270711786,
"loss": 3.3325,
"step": 36100
},
{
"epoch": 10.530179445350734,
"grad_norm": 0.35568490624427795,
"learning_rate": 0.00028451866977829634,
"loss": 3.3381,
"step": 36150
},
{
"epoch": 10.544744814728501,
"grad_norm": 0.3858466148376465,
"learning_rate": 0.0002840810968494749,
"loss": 3.3242,
"step": 36200
},
{
"epoch": 10.55931018410627,
"grad_norm": 0.3936407268047333,
"learning_rate": 0.0002836435239206534,
"loss": 3.3213,
"step": 36250
},
{
"epoch": 10.573875553484037,
"grad_norm": 0.3783574104309082,
"learning_rate": 0.00028320595099183196,
"loss": 3.3174,
"step": 36300
},
{
"epoch": 10.588440922861803,
"grad_norm": 0.3611924648284912,
"learning_rate": 0.0002827683780630105,
"loss": 3.3281,
"step": 36350
},
{
"epoch": 10.603006292239572,
"grad_norm": 0.36673375964164734,
"learning_rate": 0.000282330805134189,
"loss": 3.3274,
"step": 36400
},
{
"epoch": 10.617571661617339,
"grad_norm": 0.3864386975765228,
"learning_rate": 0.00028189323220536753,
"loss": 3.3263,
"step": 36450
},
{
"epoch": 10.632137030995105,
"grad_norm": 0.37186652421951294,
"learning_rate": 0.00028145565927654607,
"loss": 3.3256,
"step": 36500
},
{
"epoch": 10.646702400372874,
"grad_norm": 0.3645637333393097,
"learning_rate": 0.0002810180863477246,
"loss": 3.3304,
"step": 36550
},
{
"epoch": 10.66126776975064,
"grad_norm": 0.3960283696651459,
"learning_rate": 0.00028058051341890315,
"loss": 3.3293,
"step": 36600
},
{
"epoch": 10.675833139128407,
"grad_norm": 0.3968350291252136,
"learning_rate": 0.00028014294049008164,
"loss": 3.3289,
"step": 36650
},
{
"epoch": 10.690398508506176,
"grad_norm": 0.3649657666683197,
"learning_rate": 0.0002797053675612602,
"loss": 3.326,
"step": 36700
},
{
"epoch": 10.704963877883943,
"grad_norm": 0.366464227437973,
"learning_rate": 0.0002792677946324387,
"loss": 3.3453,
"step": 36750
},
{
"epoch": 10.719529247261711,
"grad_norm": 0.36643803119659424,
"learning_rate": 0.00027883022170361726,
"loss": 3.3272,
"step": 36800
},
{
"epoch": 10.734094616639478,
"grad_norm": 0.35845255851745605,
"learning_rate": 0.0002783926487747958,
"loss": 3.3341,
"step": 36850
},
{
"epoch": 10.748659986017245,
"grad_norm": 0.3823663294315338,
"learning_rate": 0.0002779550758459743,
"loss": 3.3404,
"step": 36900
},
{
"epoch": 10.763225355395013,
"grad_norm": 0.36972370743751526,
"learning_rate": 0.0002775175029171528,
"loss": 3.3372,
"step": 36950
},
{
"epoch": 10.77779072477278,
"grad_norm": 0.3613353967666626,
"learning_rate": 0.00027707992998833136,
"loss": 3.3399,
"step": 37000
},
{
"epoch": 10.77779072477278,
"eval_accuracy": 0.37111332468909186,
"eval_loss": 3.54584002494812,
"eval_runtime": 180.3064,
"eval_samples_per_second": 92.31,
"eval_steps_per_second": 5.774,
"step": 37000
},
{
"epoch": 10.792356094150549,
"grad_norm": 0.37549999356269836,
"learning_rate": 0.0002766423570595099,
"loss": 3.3357,
"step": 37050
},
{
"epoch": 10.806921463528315,
"grad_norm": 0.38684558868408203,
"learning_rate": 0.00027620478413068844,
"loss": 3.3365,
"step": 37100
},
{
"epoch": 10.821486832906082,
"grad_norm": 0.3710017204284668,
"learning_rate": 0.000275767211201867,
"loss": 3.3438,
"step": 37150
},
{
"epoch": 10.83605220228385,
"grad_norm": 0.3841908872127533,
"learning_rate": 0.00027532963827304547,
"loss": 3.3451,
"step": 37200
},
{
"epoch": 10.850617571661617,
"grad_norm": 0.37406450510025024,
"learning_rate": 0.000274892065344224,
"loss": 3.3508,
"step": 37250
},
{
"epoch": 10.865182941039384,
"grad_norm": 0.37421655654907227,
"learning_rate": 0.00027445449241540255,
"loss": 3.3346,
"step": 37300
},
{
"epoch": 10.879748310417153,
"grad_norm": 0.3724828064441681,
"learning_rate": 0.0002740169194865811,
"loss": 3.3251,
"step": 37350
},
{
"epoch": 10.89431367979492,
"grad_norm": 0.37316784262657166,
"learning_rate": 0.0002735793465577596,
"loss": 3.3407,
"step": 37400
},
{
"epoch": 10.908879049172686,
"grad_norm": 0.35748419165611267,
"learning_rate": 0.0002731417736289381,
"loss": 3.3403,
"step": 37450
},
{
"epoch": 10.923444418550455,
"grad_norm": 0.3694682717323303,
"learning_rate": 0.00027270420070011665,
"loss": 3.3413,
"step": 37500
},
{
"epoch": 10.938009787928221,
"grad_norm": 0.37885811924934387,
"learning_rate": 0.0002722666277712952,
"loss": 3.3426,
"step": 37550
},
{
"epoch": 10.95257515730599,
"grad_norm": 0.38499969244003296,
"learning_rate": 0.00027182905484247373,
"loss": 3.3389,
"step": 37600
},
{
"epoch": 10.967140526683757,
"grad_norm": 0.36556389927864075,
"learning_rate": 0.00027139148191365227,
"loss": 3.3459,
"step": 37650
},
{
"epoch": 10.981705896061523,
"grad_norm": 0.3706257939338684,
"learning_rate": 0.00027095390898483076,
"loss": 3.3406,
"step": 37700
},
{
"epoch": 10.996271265439292,
"grad_norm": 0.36823248863220215,
"learning_rate": 0.0002705163360560093,
"loss": 3.3361,
"step": 37750
},
{
"epoch": 11.010778373339548,
"grad_norm": 0.35125017166137695,
"learning_rate": 0.00027007876312718784,
"loss": 3.2779,
"step": 37800
},
{
"epoch": 11.025343742717315,
"grad_norm": 0.3946673274040222,
"learning_rate": 0.0002696411901983664,
"loss": 3.2391,
"step": 37850
},
{
"epoch": 11.039909112095083,
"grad_norm": 0.37063831090927124,
"learning_rate": 0.0002692036172695449,
"loss": 3.2478,
"step": 37900
},
{
"epoch": 11.05447448147285,
"grad_norm": 0.36030757427215576,
"learning_rate": 0.0002687660443407234,
"loss": 3.2521,
"step": 37950
},
{
"epoch": 11.069039850850617,
"grad_norm": 0.3777706027030945,
"learning_rate": 0.00026832847141190194,
"loss": 3.2605,
"step": 38000
},
{
"epoch": 11.069039850850617,
"eval_accuracy": 0.370730142112647,
"eval_loss": 3.558288097381592,
"eval_runtime": 180.3247,
"eval_samples_per_second": 92.3,
"eval_steps_per_second": 5.773,
"step": 38000
},
{
"epoch": 11.083605220228385,
"grad_norm": 0.3810880482196808,
"learning_rate": 0.0002678908984830805,
"loss": 3.2545,
"step": 38050
},
{
"epoch": 11.098170589606152,
"grad_norm": 0.38010433316230774,
"learning_rate": 0.000267453325554259,
"loss": 3.2619,
"step": 38100
},
{
"epoch": 11.11273595898392,
"grad_norm": 0.38559970259666443,
"learning_rate": 0.00026701575262543756,
"loss": 3.26,
"step": 38150
},
{
"epoch": 11.127301328361687,
"grad_norm": 0.37837737798690796,
"learning_rate": 0.00026657817969661605,
"loss": 3.2626,
"step": 38200
},
{
"epoch": 11.141866697739454,
"grad_norm": 0.3784601092338562,
"learning_rate": 0.00026614060676779464,
"loss": 3.258,
"step": 38250
},
{
"epoch": 11.156432067117223,
"grad_norm": 0.35845887660980225,
"learning_rate": 0.00026570303383897313,
"loss": 3.2586,
"step": 38300
},
{
"epoch": 11.17099743649499,
"grad_norm": 0.37323319911956787,
"learning_rate": 0.00026526546091015167,
"loss": 3.2596,
"step": 38350
},
{
"epoch": 11.185562805872756,
"grad_norm": 0.4025420546531677,
"learning_rate": 0.0002648278879813302,
"loss": 3.2722,
"step": 38400
},
{
"epoch": 11.200128175250525,
"grad_norm": 0.3721407651901245,
"learning_rate": 0.0002643903150525087,
"loss": 3.2737,
"step": 38450
},
{
"epoch": 11.214693544628291,
"grad_norm": 0.3787324130535126,
"learning_rate": 0.0002639527421236873,
"loss": 3.267,
"step": 38500
},
{
"epoch": 11.22925891400606,
"grad_norm": 0.385883092880249,
"learning_rate": 0.0002635151691948658,
"loss": 3.2813,
"step": 38550
},
{
"epoch": 11.243824283383827,
"grad_norm": 0.3823045492172241,
"learning_rate": 0.0002630775962660443,
"loss": 3.2806,
"step": 38600
},
{
"epoch": 11.258389652761593,
"grad_norm": 0.3888196349143982,
"learning_rate": 0.00026264002333722285,
"loss": 3.2669,
"step": 38650
},
{
"epoch": 11.272955022139362,
"grad_norm": 0.4065677523612976,
"learning_rate": 0.00026220245040840134,
"loss": 3.2815,
"step": 38700
},
{
"epoch": 11.287520391517129,
"grad_norm": 0.36197319626808167,
"learning_rate": 0.00026176487747957993,
"loss": 3.2846,
"step": 38750
},
{
"epoch": 11.302085760894895,
"grad_norm": 0.379085510969162,
"learning_rate": 0.0002613273045507584,
"loss": 3.2861,
"step": 38800
},
{
"epoch": 11.316651130272664,
"grad_norm": 0.39975711703300476,
"learning_rate": 0.00026088973162193696,
"loss": 3.3018,
"step": 38850
},
{
"epoch": 11.33121649965043,
"grad_norm": 0.390550434589386,
"learning_rate": 0.0002604521586931155,
"loss": 3.2864,
"step": 38900
},
{
"epoch": 11.3457818690282,
"grad_norm": 0.40258410573005676,
"learning_rate": 0.00026001458576429404,
"loss": 3.2845,
"step": 38950
},
{
"epoch": 11.360347238405966,
"grad_norm": 0.36794647574424744,
"learning_rate": 0.0002595770128354726,
"loss": 3.2906,
"step": 39000
},
{
"epoch": 11.360347238405966,
"eval_accuracy": 0.37129404012940964,
"eval_loss": 3.552013397216797,
"eval_runtime": 180.3711,
"eval_samples_per_second": 92.276,
"eval_steps_per_second": 5.771,
"step": 39000
},
{
"epoch": 11.374912607783733,
"grad_norm": 0.3978714048862457,
"learning_rate": 0.00025913943990665106,
"loss": 3.2952,
"step": 39050
},
{
"epoch": 11.389477977161501,
"grad_norm": 0.3712661862373352,
"learning_rate": 0.0002587018669778296,
"loss": 3.2887,
"step": 39100
},
{
"epoch": 11.404043346539268,
"grad_norm": 0.3962024748325348,
"learning_rate": 0.00025826429404900814,
"loss": 3.2818,
"step": 39150
},
{
"epoch": 11.418608715917035,
"grad_norm": 0.3791441321372986,
"learning_rate": 0.0002578267211201867,
"loss": 3.2947,
"step": 39200
},
{
"epoch": 11.433174085294803,
"grad_norm": 0.38361239433288574,
"learning_rate": 0.0002573891481913652,
"loss": 3.3016,
"step": 39250
},
{
"epoch": 11.44773945467257,
"grad_norm": 0.3753799498081207,
"learning_rate": 0.0002569515752625437,
"loss": 3.291,
"step": 39300
},
{
"epoch": 11.462304824050339,
"grad_norm": 0.37291768193244934,
"learning_rate": 0.0002565140023337223,
"loss": 3.2858,
"step": 39350
},
{
"epoch": 11.476870193428105,
"grad_norm": 0.382899671792984,
"learning_rate": 0.0002560764294049008,
"loss": 3.2902,
"step": 39400
},
{
"epoch": 11.491435562805872,
"grad_norm": 0.38682591915130615,
"learning_rate": 0.00025563885647607933,
"loss": 3.3095,
"step": 39450
},
{
"epoch": 11.50600093218364,
"grad_norm": 0.39052340388298035,
"learning_rate": 0.00025520128354725787,
"loss": 3.3024,
"step": 39500
},
{
"epoch": 11.520566301561407,
"grad_norm": 0.38648220896720886,
"learning_rate": 0.00025476371061843636,
"loss": 3.3037,
"step": 39550
},
{
"epoch": 11.535131670939174,
"grad_norm": 0.37323859333992004,
"learning_rate": 0.00025432613768961495,
"loss": 3.2976,
"step": 39600
},
{
"epoch": 11.549697040316943,
"grad_norm": 0.3768618106842041,
"learning_rate": 0.00025388856476079343,
"loss": 3.3111,
"step": 39650
},
{
"epoch": 11.56426240969471,
"grad_norm": 0.3987623155117035,
"learning_rate": 0.000253450991831972,
"loss": 3.301,
"step": 39700
},
{
"epoch": 11.578827779072478,
"grad_norm": 0.42070692777633667,
"learning_rate": 0.0002530134189031505,
"loss": 3.3047,
"step": 39750
},
{
"epoch": 11.593393148450245,
"grad_norm": 0.37431833148002625,
"learning_rate": 0.000252575845974329,
"loss": 3.3051,
"step": 39800
},
{
"epoch": 11.607958517828012,
"grad_norm": 0.3832058310508728,
"learning_rate": 0.0002521382730455076,
"loss": 3.2968,
"step": 39850
},
{
"epoch": 11.62252388720578,
"grad_norm": 0.3984127342700958,
"learning_rate": 0.0002517007001166861,
"loss": 3.3157,
"step": 39900
},
{
"epoch": 11.637089256583547,
"grad_norm": 0.40139371156692505,
"learning_rate": 0.0002512631271878646,
"loss": 3.2996,
"step": 39950
},
{
"epoch": 11.651654625961314,
"grad_norm": 0.3891284465789795,
"learning_rate": 0.00025082555425904316,
"loss": 3.3088,
"step": 40000
},
{
"epoch": 11.651654625961314,
"eval_accuracy": 0.3715696399941494,
"eval_loss": 3.5464000701904297,
"eval_runtime": 180.4742,
"eval_samples_per_second": 92.224,
"eval_steps_per_second": 5.768,
"step": 40000
},
{
"epoch": 11.666219995339082,
"grad_norm": 0.37767043709754944,
"learning_rate": 0.0002503879813302217,
"loss": 3.2951,
"step": 40050
},
{
"epoch": 11.680785364716849,
"grad_norm": 0.40471839904785156,
"learning_rate": 0.00024995040840140024,
"loss": 3.2946,
"step": 40100
},
{
"epoch": 11.695350734094617,
"grad_norm": 0.3795658051967621,
"learning_rate": 0.0002495128354725787,
"loss": 3.3187,
"step": 40150
},
{
"epoch": 11.709916103472384,
"grad_norm": 0.3852717876434326,
"learning_rate": 0.00024907526254375727,
"loss": 3.2955,
"step": 40200
},
{
"epoch": 11.724481472850151,
"grad_norm": 0.37112799286842346,
"learning_rate": 0.0002486376896149358,
"loss": 3.3023,
"step": 40250
},
{
"epoch": 11.73904684222792,
"grad_norm": 0.37619829177856445,
"learning_rate": 0.00024820011668611435,
"loss": 3.3103,
"step": 40300
},
{
"epoch": 11.753612211605686,
"grad_norm": 0.3923087418079376,
"learning_rate": 0.0002477625437572929,
"loss": 3.3131,
"step": 40350
},
{
"epoch": 11.768177580983453,
"grad_norm": 0.3909642696380615,
"learning_rate": 0.00024732497082847137,
"loss": 3.3085,
"step": 40400
},
{
"epoch": 11.782742950361222,
"grad_norm": 0.3891732096672058,
"learning_rate": 0.0002468873978996499,
"loss": 3.3118,
"step": 40450
},
{
"epoch": 11.797308319738988,
"grad_norm": 0.39520296454429626,
"learning_rate": 0.00024644982497082845,
"loss": 3.3134,
"step": 40500
},
{
"epoch": 11.811873689116755,
"grad_norm": 0.3944683074951172,
"learning_rate": 0.000246012252042007,
"loss": 3.3071,
"step": 40550
},
{
"epoch": 11.826439058494524,
"grad_norm": 0.3806307315826416,
"learning_rate": 0.00024557467911318553,
"loss": 3.3058,
"step": 40600
},
{
"epoch": 11.84100442787229,
"grad_norm": 0.38682928681373596,
"learning_rate": 0.000245137106184364,
"loss": 3.3067,
"step": 40650
},
{
"epoch": 11.855569797250059,
"grad_norm": 0.3885536789894104,
"learning_rate": 0.00024469953325554256,
"loss": 3.3188,
"step": 40700
},
{
"epoch": 11.870135166627826,
"grad_norm": 0.39508283138275146,
"learning_rate": 0.0002442619603267211,
"loss": 3.3167,
"step": 40750
},
{
"epoch": 11.884700536005592,
"grad_norm": 0.37365779280662537,
"learning_rate": 0.00024382438739789964,
"loss": 3.2989,
"step": 40800
},
{
"epoch": 11.899265905383361,
"grad_norm": 0.37982553243637085,
"learning_rate": 0.00024338681446907818,
"loss": 3.3162,
"step": 40850
},
{
"epoch": 11.913831274761128,
"grad_norm": 0.3698308765888214,
"learning_rate": 0.0002429492415402567,
"loss": 3.3203,
"step": 40900
},
{
"epoch": 11.928396644138896,
"grad_norm": 0.3770948052406311,
"learning_rate": 0.0002425116686114352,
"loss": 3.315,
"step": 40950
},
{
"epoch": 11.942962013516663,
"grad_norm": 0.3643822968006134,
"learning_rate": 0.00024207409568261377,
"loss": 3.3188,
"step": 41000
},
{
"epoch": 11.942962013516663,
"eval_accuracy": 0.37253788439949564,
"eval_loss": 3.536294460296631,
"eval_runtime": 180.6316,
"eval_samples_per_second": 92.143,
"eval_steps_per_second": 5.763,
"step": 41000
},
{
"epoch": 11.95752738289443,
"grad_norm": 0.37310171127319336,
"learning_rate": 0.00024163652275379228,
"loss": 3.3067,
"step": 41050
},
{
"epoch": 11.972092752272198,
"grad_norm": 0.3831028342247009,
"learning_rate": 0.00024119894982497082,
"loss": 3.3263,
"step": 41100
},
{
"epoch": 11.986658121649965,
"grad_norm": 0.3900957405567169,
"learning_rate": 0.00024076137689614933,
"loss": 3.3106,
"step": 41150
},
{
"epoch": 12.001165229550221,
"grad_norm": 0.3873760998249054,
"learning_rate": 0.00024032380396732785,
"loss": 3.3022,
"step": 41200
},
{
"epoch": 12.01573059892799,
"grad_norm": 0.3788856863975525,
"learning_rate": 0.00023988623103850641,
"loss": 3.2189,
"step": 41250
},
{
"epoch": 12.030295968305756,
"grad_norm": 0.4146612584590912,
"learning_rate": 0.00023944865810968493,
"loss": 3.2262,
"step": 41300
},
{
"epoch": 12.044861337683523,
"grad_norm": 0.3976421356201172,
"learning_rate": 0.00023901108518086347,
"loss": 3.2397,
"step": 41350
},
{
"epoch": 12.059426707061291,
"grad_norm": 0.3815682828426361,
"learning_rate": 0.00023857351225204198,
"loss": 3.2293,
"step": 41400
},
{
"epoch": 12.073992076439058,
"grad_norm": 0.3816235363483429,
"learning_rate": 0.0002381359393232205,
"loss": 3.2318,
"step": 41450
},
{
"epoch": 12.088557445816827,
"grad_norm": 0.3730505108833313,
"learning_rate": 0.00023769836639439906,
"loss": 3.2306,
"step": 41500
},
{
"epoch": 12.103122815194594,
"grad_norm": 0.36907413601875305,
"learning_rate": 0.00023726079346557757,
"loss": 3.2172,
"step": 41550
},
{
"epoch": 12.11768818457236,
"grad_norm": 0.3938505947589874,
"learning_rate": 0.0002368232205367561,
"loss": 3.2349,
"step": 41600
},
{
"epoch": 12.132253553950129,
"grad_norm": 0.39459192752838135,
"learning_rate": 0.00023638564760793463,
"loss": 3.244,
"step": 41650
},
{
"epoch": 12.146818923327896,
"grad_norm": 0.3762718141078949,
"learning_rate": 0.00023594807467911317,
"loss": 3.2336,
"step": 41700
},
{
"epoch": 12.161384292705662,
"grad_norm": 0.38366296887397766,
"learning_rate": 0.0002355105017502917,
"loss": 3.2468,
"step": 41750
},
{
"epoch": 12.17594966208343,
"grad_norm": 0.37330591678619385,
"learning_rate": 0.00023507292882147022,
"loss": 3.2367,
"step": 41800
},
{
"epoch": 12.190515031461198,
"grad_norm": 0.39677342772483826,
"learning_rate": 0.00023463535589264876,
"loss": 3.2597,
"step": 41850
},
{
"epoch": 12.205080400838966,
"grad_norm": 0.3829995393753052,
"learning_rate": 0.0002341977829638273,
"loss": 3.2453,
"step": 41900
},
{
"epoch": 12.219645770216733,
"grad_norm": 0.40625911951065063,
"learning_rate": 0.0002337602100350058,
"loss": 3.2457,
"step": 41950
},
{
"epoch": 12.2342111395945,
"grad_norm": 0.3920283019542694,
"learning_rate": 0.00023332263710618435,
"loss": 3.2678,
"step": 42000
},
{
"epoch": 12.2342111395945,
"eval_accuracy": 0.3720842733593225,
"eval_loss": 3.546815872192383,
"eval_runtime": 180.3312,
"eval_samples_per_second": 92.297,
"eval_steps_per_second": 5.773,
"step": 42000
},
{
"epoch": 12.248776508972268,
"grad_norm": 0.398946613073349,
"learning_rate": 0.00023288506417736286,
"loss": 3.245,
"step": 42050
},
{
"epoch": 12.263341878350035,
"grad_norm": 0.3961947560310364,
"learning_rate": 0.00023244749124854143,
"loss": 3.2517,
"step": 42100
},
{
"epoch": 12.277907247727802,
"grad_norm": 0.3835267722606659,
"learning_rate": 0.00023200991831971994,
"loss": 3.2522,
"step": 42150
},
{
"epoch": 12.29247261710557,
"grad_norm": 0.42905566096305847,
"learning_rate": 0.00023157234539089846,
"loss": 3.2587,
"step": 42200
},
{
"epoch": 12.307037986483337,
"grad_norm": 0.39819803833961487,
"learning_rate": 0.000231134772462077,
"loss": 3.2556,
"step": 42250
},
{
"epoch": 12.321603355861104,
"grad_norm": 0.393216997385025,
"learning_rate": 0.0002306971995332555,
"loss": 3.2738,
"step": 42300
},
{
"epoch": 12.336168725238872,
"grad_norm": 0.3680713176727295,
"learning_rate": 0.00023025962660443408,
"loss": 3.2608,
"step": 42350
},
{
"epoch": 12.350734094616639,
"grad_norm": 0.3907005488872528,
"learning_rate": 0.0002298220536756126,
"loss": 3.2592,
"step": 42400
},
{
"epoch": 12.365299463994408,
"grad_norm": 0.39694586396217346,
"learning_rate": 0.0002293844807467911,
"loss": 3.2655,
"step": 42450
},
{
"epoch": 12.379864833372174,
"grad_norm": 0.3920033276081085,
"learning_rate": 0.00022894690781796964,
"loss": 3.2694,
"step": 42500
},
{
"epoch": 12.394430202749941,
"grad_norm": 0.38897332549095154,
"learning_rate": 0.00022850933488914815,
"loss": 3.2831,
"step": 42550
},
{
"epoch": 12.40899557212771,
"grad_norm": 0.3850444257259369,
"learning_rate": 0.00022807176196032672,
"loss": 3.2698,
"step": 42600
},
{
"epoch": 12.423560941505476,
"grad_norm": 0.39484626054763794,
"learning_rate": 0.00022763418903150523,
"loss": 3.2621,
"step": 42650
},
{
"epoch": 12.438126310883243,
"grad_norm": 0.4130299985408783,
"learning_rate": 0.00022719661610268375,
"loss": 3.2765,
"step": 42700
},
{
"epoch": 12.452691680261012,
"grad_norm": 0.3730163872241974,
"learning_rate": 0.0002267590431738623,
"loss": 3.2681,
"step": 42750
},
{
"epoch": 12.467257049638778,
"grad_norm": 0.3927021026611328,
"learning_rate": 0.00022632147024504083,
"loss": 3.2684,
"step": 42800
},
{
"epoch": 12.481822419016547,
"grad_norm": 0.39260363578796387,
"learning_rate": 0.00022588389731621937,
"loss": 3.2652,
"step": 42850
},
{
"epoch": 12.496387788394314,
"grad_norm": 0.3787255883216858,
"learning_rate": 0.00022544632438739788,
"loss": 3.2619,
"step": 42900
},
{
"epoch": 12.51095315777208,
"grad_norm": 0.38174960017204285,
"learning_rate": 0.0002250087514585764,
"loss": 3.2742,
"step": 42950
},
{
"epoch": 12.525518527149849,
"grad_norm": 0.39155444502830505,
"learning_rate": 0.00022457117852975496,
"loss": 3.2818,
"step": 43000
},
{
"epoch": 12.525518527149849,
"eval_accuracy": 0.37251695574082516,
"eval_loss": 3.541425943374634,
"eval_runtime": 180.4575,
"eval_samples_per_second": 92.232,
"eval_steps_per_second": 5.769,
"step": 43000
},
{
"epoch": 12.540083896527616,
"grad_norm": 0.3748781979084015,
"learning_rate": 0.00022413360560093347,
"loss": 3.28,
"step": 43050
},
{
"epoch": 12.554649265905383,
"grad_norm": 0.3864782452583313,
"learning_rate": 0.000223696032672112,
"loss": 3.2746,
"step": 43100
},
{
"epoch": 12.569214635283151,
"grad_norm": 0.39516115188598633,
"learning_rate": 0.00022325845974329053,
"loss": 3.2664,
"step": 43150
},
{
"epoch": 12.583780004660918,
"grad_norm": 0.3874489367008209,
"learning_rate": 0.00022282088681446904,
"loss": 3.2765,
"step": 43200
},
{
"epoch": 12.598345374038686,
"grad_norm": 0.4148963689804077,
"learning_rate": 0.0002223833138856476,
"loss": 3.2833,
"step": 43250
},
{
"epoch": 12.612910743416453,
"grad_norm": 0.38245537877082825,
"learning_rate": 0.00022194574095682612,
"loss": 3.2826,
"step": 43300
},
{
"epoch": 12.62747611279422,
"grad_norm": 0.3959484100341797,
"learning_rate": 0.00022150816802800466,
"loss": 3.2772,
"step": 43350
},
{
"epoch": 12.642041482171988,
"grad_norm": 0.3956339359283447,
"learning_rate": 0.00022107059509918317,
"loss": 3.2741,
"step": 43400
},
{
"epoch": 12.656606851549755,
"grad_norm": 0.3839803636074066,
"learning_rate": 0.00022063302217036168,
"loss": 3.2662,
"step": 43450
},
{
"epoch": 12.671172220927522,
"grad_norm": 0.40059152245521545,
"learning_rate": 0.00022019544924154025,
"loss": 3.2851,
"step": 43500
},
{
"epoch": 12.68573759030529,
"grad_norm": 0.3880845904350281,
"learning_rate": 0.00021975787631271876,
"loss": 3.2854,
"step": 43550
},
{
"epoch": 12.700302959683057,
"grad_norm": 0.3912261128425598,
"learning_rate": 0.0002193203033838973,
"loss": 3.2838,
"step": 43600
},
{
"epoch": 12.714868329060826,
"grad_norm": 0.41812238097190857,
"learning_rate": 0.00021888273045507582,
"loss": 3.2902,
"step": 43650
},
{
"epoch": 12.729433698438593,
"grad_norm": 0.3847753703594208,
"learning_rate": 0.00021844515752625436,
"loss": 3.2726,
"step": 43700
},
{
"epoch": 12.74399906781636,
"grad_norm": 0.3847730755805969,
"learning_rate": 0.0002180075845974329,
"loss": 3.2761,
"step": 43750
},
{
"epoch": 12.758564437194128,
"grad_norm": 0.3814358413219452,
"learning_rate": 0.0002175700116686114,
"loss": 3.2816,
"step": 43800
},
{
"epoch": 12.773129806571895,
"grad_norm": 0.39806804060935974,
"learning_rate": 0.00021713243873978995,
"loss": 3.2889,
"step": 43850
},
{
"epoch": 12.787695175949661,
"grad_norm": 0.3872688412666321,
"learning_rate": 0.0002166948658109685,
"loss": 3.2889,
"step": 43900
},
{
"epoch": 12.80226054532743,
"grad_norm": 0.3840930461883545,
"learning_rate": 0.000216257292882147,
"loss": 3.2767,
"step": 43950
},
{
"epoch": 12.816825914705197,
"grad_norm": 0.3970656096935272,
"learning_rate": 0.00021581971995332554,
"loss": 3.2772,
"step": 44000
},
{
"epoch": 12.816825914705197,
"eval_accuracy": 0.37332153333229867,
"eval_loss": 3.5360467433929443,
"eval_runtime": 180.2447,
"eval_samples_per_second": 92.341,
"eval_steps_per_second": 5.775,
"step": 44000
},
{
"epoch": 12.831391284082965,
"grad_norm": 0.3944132328033447,
"learning_rate": 0.00021538214702450405,
"loss": 3.2893,
"step": 44050
},
{
"epoch": 12.845956653460732,
"grad_norm": 0.40921568870544434,
"learning_rate": 0.00021494457409568262,
"loss": 3.2811,
"step": 44100
},
{
"epoch": 12.860522022838499,
"grad_norm": 0.37589746713638306,
"learning_rate": 0.00021450700116686113,
"loss": 3.278,
"step": 44150
},
{
"epoch": 12.875087392216267,
"grad_norm": 0.4068247377872467,
"learning_rate": 0.00021406942823803965,
"loss": 3.2848,
"step": 44200
},
{
"epoch": 12.889652761594034,
"grad_norm": 0.41013479232788086,
"learning_rate": 0.0002136318553092182,
"loss": 3.2929,
"step": 44250
},
{
"epoch": 12.9042181309718,
"grad_norm": 0.39379021525382996,
"learning_rate": 0.0002131942823803967,
"loss": 3.2898,
"step": 44300
},
{
"epoch": 12.91878350034957,
"grad_norm": 0.38993388414382935,
"learning_rate": 0.00021275670945157527,
"loss": 3.287,
"step": 44350
},
{
"epoch": 12.933348869727336,
"grad_norm": 0.4032069146633148,
"learning_rate": 0.00021231913652275378,
"loss": 3.2932,
"step": 44400
},
{
"epoch": 12.947914239105105,
"grad_norm": 0.40004608035087585,
"learning_rate": 0.0002118815635939323,
"loss": 3.2954,
"step": 44450
},
{
"epoch": 12.962479608482871,
"grad_norm": 0.39480239152908325,
"learning_rate": 0.00021144399066511083,
"loss": 3.2967,
"step": 44500
},
{
"epoch": 12.977044977860638,
"grad_norm": 0.4099850058555603,
"learning_rate": 0.00021100641773628935,
"loss": 3.2809,
"step": 44550
},
{
"epoch": 12.991610347238407,
"grad_norm": 0.38592153787612915,
"learning_rate": 0.0002105688448074679,
"loss": 3.2948,
"step": 44600
},
{
"epoch": 13.006117455138662,
"grad_norm": 0.3878653049468994,
"learning_rate": 0.00021013127187864643,
"loss": 3.2447,
"step": 44650
},
{
"epoch": 13.02068282451643,
"grad_norm": 0.40472573041915894,
"learning_rate": 0.00020969369894982494,
"loss": 3.1872,
"step": 44700
},
{
"epoch": 13.035248193894198,
"grad_norm": 0.38480067253112793,
"learning_rate": 0.00020925612602100348,
"loss": 3.1966,
"step": 44750
},
{
"epoch": 13.049813563271965,
"grad_norm": 0.4043852388858795,
"learning_rate": 0.00020881855309218202,
"loss": 3.1928,
"step": 44800
},
{
"epoch": 13.064378932649731,
"grad_norm": 0.3920169174671173,
"learning_rate": 0.00020838098016336056,
"loss": 3.2142,
"step": 44850
},
{
"epoch": 13.0789443020275,
"grad_norm": 0.4085189402103424,
"learning_rate": 0.00020794340723453907,
"loss": 3.1996,
"step": 44900
},
{
"epoch": 13.093509671405267,
"grad_norm": 0.39081132411956787,
"learning_rate": 0.00020750583430571758,
"loss": 3.2162,
"step": 44950
},
{
"epoch": 13.108075040783035,
"grad_norm": 0.4104847311973572,
"learning_rate": 0.00020706826137689615,
"loss": 3.215,
"step": 45000
},
{
"epoch": 13.108075040783035,
"eval_accuracy": 0.3724899130919812,
"eval_loss": 3.5466861724853516,
"eval_runtime": 180.2123,
"eval_samples_per_second": 92.358,
"eval_steps_per_second": 5.777,
"step": 45000
},
{
"epoch": 13.122640410160802,
"grad_norm": 0.40783169865608215,
"learning_rate": 0.00020663068844807466,
"loss": 3.217,
"step": 45050
},
{
"epoch": 13.137205779538569,
"grad_norm": 0.3994167149066925,
"learning_rate": 0.0002061931155192532,
"loss": 3.214,
"step": 45100
},
{
"epoch": 13.151771148916337,
"grad_norm": 0.41038912534713745,
"learning_rate": 0.00020575554259043172,
"loss": 3.2265,
"step": 45150
},
{
"epoch": 13.166336518294104,
"grad_norm": 0.3970767557621002,
"learning_rate": 0.00020531796966161023,
"loss": 3.2219,
"step": 45200
},
{
"epoch": 13.18090188767187,
"grad_norm": 0.4076697528362274,
"learning_rate": 0.0002048803967327888,
"loss": 3.2133,
"step": 45250
},
{
"epoch": 13.19546725704964,
"grad_norm": 0.40613362193107605,
"learning_rate": 0.0002044428238039673,
"loss": 3.22,
"step": 45300
},
{
"epoch": 13.210032626427406,
"grad_norm": 0.39395052194595337,
"learning_rate": 0.00020400525087514585,
"loss": 3.2279,
"step": 45350
},
{
"epoch": 13.224597995805174,
"grad_norm": 0.3916940987110138,
"learning_rate": 0.00020356767794632436,
"loss": 3.2346,
"step": 45400
},
{
"epoch": 13.239163365182941,
"grad_norm": 0.41231533885002136,
"learning_rate": 0.00020313010501750287,
"loss": 3.2357,
"step": 45450
},
{
"epoch": 13.253728734560708,
"grad_norm": 0.4182799160480499,
"learning_rate": 0.00020269253208868144,
"loss": 3.2334,
"step": 45500
},
{
"epoch": 13.268294103938477,
"grad_norm": 0.4099382162094116,
"learning_rate": 0.00020225495915985995,
"loss": 3.2341,
"step": 45550
},
{
"epoch": 13.282859473316243,
"grad_norm": 0.4044232666492462,
"learning_rate": 0.0002018173862310385,
"loss": 3.2213,
"step": 45600
},
{
"epoch": 13.29742484269401,
"grad_norm": 0.39154335856437683,
"learning_rate": 0.000201379813302217,
"loss": 3.237,
"step": 45650
},
{
"epoch": 13.311990212071779,
"grad_norm": 0.4079340398311615,
"learning_rate": 0.00020094224037339555,
"loss": 3.2376,
"step": 45700
},
{
"epoch": 13.326555581449545,
"grad_norm": 0.39542028307914734,
"learning_rate": 0.0002005046674445741,
"loss": 3.2315,
"step": 45750
},
{
"epoch": 13.341120950827314,
"grad_norm": 0.39488768577575684,
"learning_rate": 0.0002000670945157526,
"loss": 3.2401,
"step": 45800
},
{
"epoch": 13.35568632020508,
"grad_norm": 0.41860339045524597,
"learning_rate": 0.00019962952158693114,
"loss": 3.2385,
"step": 45850
},
{
"epoch": 13.370251689582847,
"grad_norm": 0.4021410644054413,
"learning_rate": 0.00019919194865810968,
"loss": 3.2472,
"step": 45900
},
{
"epoch": 13.384817058960616,
"grad_norm": 0.3935169279575348,
"learning_rate": 0.0001987543757292882,
"loss": 3.2474,
"step": 45950
},
{
"epoch": 13.399382428338383,
"grad_norm": 0.4164498448371887,
"learning_rate": 0.00019831680280046673,
"loss": 3.2448,
"step": 46000
},
{
"epoch": 13.399382428338383,
"eval_accuracy": 0.37297574015729884,
"eval_loss": 3.5430777072906494,
"eval_runtime": 180.1561,
"eval_samples_per_second": 92.387,
"eval_steps_per_second": 5.778,
"step": 46000
},
{
"epoch": 13.41394779771615,
"grad_norm": 0.4161559045314789,
"learning_rate": 0.00019787922987164524,
"loss": 3.2397,
"step": 46050
},
{
"epoch": 13.428513167093918,
"grad_norm": 0.40776827931404114,
"learning_rate": 0.0001974416569428238,
"loss": 3.2325,
"step": 46100
},
{
"epoch": 13.443078536471685,
"grad_norm": 0.3878330886363983,
"learning_rate": 0.00019700408401400232,
"loss": 3.2609,
"step": 46150
},
{
"epoch": 13.457643905849451,
"grad_norm": 0.40034887194633484,
"learning_rate": 0.00019656651108518084,
"loss": 3.2584,
"step": 46200
},
{
"epoch": 13.47220927522722,
"grad_norm": 0.40647125244140625,
"learning_rate": 0.00019612893815635938,
"loss": 3.2431,
"step": 46250
},
{
"epoch": 13.486774644604987,
"grad_norm": 0.3935099244117737,
"learning_rate": 0.0001956913652275379,
"loss": 3.2455,
"step": 46300
},
{
"epoch": 13.501340013982755,
"grad_norm": 0.3952663540840149,
"learning_rate": 0.00019525379229871646,
"loss": 3.2482,
"step": 46350
},
{
"epoch": 13.515905383360522,
"grad_norm": 0.390480637550354,
"learning_rate": 0.00019481621936989497,
"loss": 3.2544,
"step": 46400
},
{
"epoch": 13.530470752738289,
"grad_norm": 0.40572217106819153,
"learning_rate": 0.00019437864644107348,
"loss": 3.2502,
"step": 46450
},
{
"epoch": 13.545036122116057,
"grad_norm": 0.38214248418807983,
"learning_rate": 0.00019394107351225202,
"loss": 3.2427,
"step": 46500
},
{
"epoch": 13.559601491493824,
"grad_norm": 0.4259106516838074,
"learning_rate": 0.00019350350058343054,
"loss": 3.2479,
"step": 46550
},
{
"epoch": 13.574166860871593,
"grad_norm": 0.3941766917705536,
"learning_rate": 0.0001930659276546091,
"loss": 3.2628,
"step": 46600
},
{
"epoch": 13.58873223024936,
"grad_norm": 0.40022504329681396,
"learning_rate": 0.00019262835472578762,
"loss": 3.2478,
"step": 46650
},
{
"epoch": 13.603297599627126,
"grad_norm": 0.3927033841609955,
"learning_rate": 0.00019219078179696613,
"loss": 3.2597,
"step": 46700
},
{
"epoch": 13.617862969004895,
"grad_norm": 0.4204312562942505,
"learning_rate": 0.00019175320886814467,
"loss": 3.2552,
"step": 46750
},
{
"epoch": 13.632428338382661,
"grad_norm": 0.4014910161495209,
"learning_rate": 0.0001913156359393232,
"loss": 3.2582,
"step": 46800
},
{
"epoch": 13.646993707760428,
"grad_norm": 0.3960302770137787,
"learning_rate": 0.00019087806301050175,
"loss": 3.247,
"step": 46850
},
{
"epoch": 13.661559077138197,
"grad_norm": 0.40421754121780396,
"learning_rate": 0.00019044049008168026,
"loss": 3.2509,
"step": 46900
},
{
"epoch": 13.676124446515963,
"grad_norm": 0.4028851091861725,
"learning_rate": 0.00019000291715285877,
"loss": 3.2603,
"step": 46950
},
{
"epoch": 13.69068981589373,
"grad_norm": 0.4152960181236267,
"learning_rate": 0.00018956534422403734,
"loss": 3.2716,
"step": 47000
},
{
"epoch": 13.69068981589373,
"eval_accuracy": 0.37343640580151827,
"eval_loss": 3.536942481994629,
"eval_runtime": 180.1541,
"eval_samples_per_second": 92.388,
"eval_steps_per_second": 5.778,
"step": 47000
},
{
"epoch": 13.705255185271499,
"grad_norm": 0.40029028058052063,
"learning_rate": 0.00018912777129521585,
"loss": 3.2608,
"step": 47050
},
{
"epoch": 13.719820554649266,
"grad_norm": 0.4005506634712219,
"learning_rate": 0.0001886901983663944,
"loss": 3.2562,
"step": 47100
},
{
"epoch": 13.734385924027034,
"grad_norm": 0.4043956398963928,
"learning_rate": 0.0001882526254375729,
"loss": 3.2553,
"step": 47150
},
{
"epoch": 13.7489512934048,
"grad_norm": 0.393660306930542,
"learning_rate": 0.00018781505250875142,
"loss": 3.2504,
"step": 47200
},
{
"epoch": 13.763516662782568,
"grad_norm": 0.41873812675476074,
"learning_rate": 0.00018737747957992999,
"loss": 3.2641,
"step": 47250
},
{
"epoch": 13.778082032160336,
"grad_norm": 0.39937934279441833,
"learning_rate": 0.0001869399066511085,
"loss": 3.2601,
"step": 47300
},
{
"epoch": 13.792647401538103,
"grad_norm": 0.39644569158554077,
"learning_rate": 0.00018650233372228704,
"loss": 3.2579,
"step": 47350
},
{
"epoch": 13.80721277091587,
"grad_norm": 0.4110250174999237,
"learning_rate": 0.00018606476079346555,
"loss": 3.2545,
"step": 47400
},
{
"epoch": 13.821778140293638,
"grad_norm": 0.39572134613990784,
"learning_rate": 0.00018562718786464406,
"loss": 3.2551,
"step": 47450
},
{
"epoch": 13.836343509671405,
"grad_norm": 0.40120694041252136,
"learning_rate": 0.00018518961493582263,
"loss": 3.2497,
"step": 47500
},
{
"epoch": 13.850908879049173,
"grad_norm": 0.3942031264305115,
"learning_rate": 0.00018475204200700114,
"loss": 3.2592,
"step": 47550
},
{
"epoch": 13.86547424842694,
"grad_norm": 0.4140487611293793,
"learning_rate": 0.00018431446907817968,
"loss": 3.2552,
"step": 47600
},
{
"epoch": 13.880039617804707,
"grad_norm": 0.39110127091407776,
"learning_rate": 0.0001838768961493582,
"loss": 3.261,
"step": 47650
},
{
"epoch": 13.894604987182475,
"grad_norm": 0.4091663360595703,
"learning_rate": 0.00018343932322053674,
"loss": 3.2709,
"step": 47700
},
{
"epoch": 13.909170356560242,
"grad_norm": 0.39773812890052795,
"learning_rate": 0.00018300175029171528,
"loss": 3.2645,
"step": 47750
},
{
"epoch": 13.923735725938009,
"grad_norm": 0.4022299647331238,
"learning_rate": 0.0001825641773628938,
"loss": 3.2597,
"step": 47800
},
{
"epoch": 13.938301095315778,
"grad_norm": 0.3977898061275482,
"learning_rate": 0.00018212660443407233,
"loss": 3.2697,
"step": 47850
},
{
"epoch": 13.952866464693544,
"grad_norm": 0.38834723830223083,
"learning_rate": 0.00018168903150525087,
"loss": 3.2585,
"step": 47900
},
{
"epoch": 13.967431834071313,
"grad_norm": 0.3896270990371704,
"learning_rate": 0.00018125145857642938,
"loss": 3.2654,
"step": 47950
},
{
"epoch": 13.98199720344908,
"grad_norm": 0.41397517919540405,
"learning_rate": 0.00018081388564760792,
"loss": 3.2672,
"step": 48000
},
{
"epoch": 13.98199720344908,
"eval_accuracy": 0.37412834488172014,
"eval_loss": 3.5272507667541504,
"eval_runtime": 180.2142,
"eval_samples_per_second": 92.357,
"eval_steps_per_second": 5.776,
"step": 48000
},
{
"epoch": 13.996562572826846,
"grad_norm": 0.3925948739051819,
"learning_rate": 0.00018037631271878644,
"loss": 3.2785,
"step": 48050
},
{
"epoch": 14.011069680727104,
"grad_norm": 0.39326012134552,
"learning_rate": 0.000179938739789965,
"loss": 3.2021,
"step": 48100
},
{
"epoch": 14.02563505010487,
"grad_norm": 0.40781304240226746,
"learning_rate": 0.00017950116686114352,
"loss": 3.1782,
"step": 48150
},
{
"epoch": 14.040200419482638,
"grad_norm": 0.3889636695384979,
"learning_rate": 0.00017906359393232203,
"loss": 3.1885,
"step": 48200
},
{
"epoch": 14.054765788860406,
"grad_norm": 0.4008404314517975,
"learning_rate": 0.00017862602100350057,
"loss": 3.1821,
"step": 48250
},
{
"epoch": 14.069331158238173,
"grad_norm": 0.4058891832828522,
"learning_rate": 0.00017818844807467908,
"loss": 3.1926,
"step": 48300
},
{
"epoch": 14.08389652761594,
"grad_norm": 0.3980492949485779,
"learning_rate": 0.00017775087514585765,
"loss": 3.1981,
"step": 48350
},
{
"epoch": 14.098461896993708,
"grad_norm": 0.4085221588611603,
"learning_rate": 0.00017731330221703616,
"loss": 3.1935,
"step": 48400
},
{
"epoch": 14.113027266371475,
"grad_norm": 0.41492384672164917,
"learning_rate": 0.00017687572928821467,
"loss": 3.194,
"step": 48450
},
{
"epoch": 14.127592635749243,
"grad_norm": 0.4290497899055481,
"learning_rate": 0.00017643815635939321,
"loss": 3.1947,
"step": 48500
},
{
"epoch": 14.14215800512701,
"grad_norm": 0.42287999391555786,
"learning_rate": 0.00017600058343057173,
"loss": 3.1991,
"step": 48550
},
{
"epoch": 14.156723374504777,
"grad_norm": 0.39472466707229614,
"learning_rate": 0.0001755630105017503,
"loss": 3.2116,
"step": 48600
},
{
"epoch": 14.171288743882545,
"grad_norm": 0.4188964068889618,
"learning_rate": 0.0001751254375729288,
"loss": 3.1942,
"step": 48650
},
{
"epoch": 14.185854113260312,
"grad_norm": 0.4070267975330353,
"learning_rate": 0.00017468786464410732,
"loss": 3.1969,
"step": 48700
},
{
"epoch": 14.200419482638079,
"grad_norm": 0.40462633967399597,
"learning_rate": 0.00017425029171528586,
"loss": 3.2017,
"step": 48750
},
{
"epoch": 14.214984852015847,
"grad_norm": 0.40400370955467224,
"learning_rate": 0.0001738127187864644,
"loss": 3.2093,
"step": 48800
},
{
"epoch": 14.229550221393614,
"grad_norm": 0.3998878002166748,
"learning_rate": 0.00017337514585764294,
"loss": 3.2056,
"step": 48850
},
{
"epoch": 14.244115590771383,
"grad_norm": 0.3977794945240021,
"learning_rate": 0.00017293757292882145,
"loss": 3.2038,
"step": 48900
},
{
"epoch": 14.25868096014915,
"grad_norm": 0.4316108226776123,
"learning_rate": 0.00017249999999999996,
"loss": 3.2037,
"step": 48950
},
{
"epoch": 14.273246329526916,
"grad_norm": 0.41260573267936707,
"learning_rate": 0.00017206242707117853,
"loss": 3.2114,
"step": 49000
},
{
"epoch": 14.273246329526916,
"eval_accuracy": 0.37353940302059335,
"eval_loss": 3.538419723510742,
"eval_runtime": 180.2527,
"eval_samples_per_second": 92.337,
"eval_steps_per_second": 5.775,
"step": 49000
},
{
"epoch": 14.287811698904685,
"grad_norm": 0.4174029231071472,
"learning_rate": 0.00017162485414235704,
"loss": 3.2237,
"step": 49050
},
{
"epoch": 14.302377068282452,
"grad_norm": 0.42132076621055603,
"learning_rate": 0.00017118728121353558,
"loss": 3.2049,
"step": 49100
},
{
"epoch": 14.316942437660218,
"grad_norm": 0.41422000527381897,
"learning_rate": 0.0001707497082847141,
"loss": 3.208,
"step": 49150
},
{
"epoch": 14.331507807037987,
"grad_norm": 0.4296468198299408,
"learning_rate": 0.0001703121353558926,
"loss": 3.2076,
"step": 49200
},
{
"epoch": 14.346073176415754,
"grad_norm": 0.40375787019729614,
"learning_rate": 0.00016987456242707118,
"loss": 3.2195,
"step": 49250
},
{
"epoch": 14.360638545793522,
"grad_norm": 0.4078134298324585,
"learning_rate": 0.0001694369894982497,
"loss": 3.2231,
"step": 49300
},
{
"epoch": 14.375203915171289,
"grad_norm": 0.4103347063064575,
"learning_rate": 0.00016899941656942823,
"loss": 3.224,
"step": 49350
},
{
"epoch": 14.389769284549056,
"grad_norm": 0.4056347906589508,
"learning_rate": 0.00016856184364060674,
"loss": 3.2211,
"step": 49400
},
{
"epoch": 14.404334653926824,
"grad_norm": 0.43045109510421753,
"learning_rate": 0.00016812427071178528,
"loss": 3.2178,
"step": 49450
},
{
"epoch": 14.418900023304591,
"grad_norm": 0.4060124158859253,
"learning_rate": 0.00016768669778296382,
"loss": 3.2132,
"step": 49500
},
{
"epoch": 14.433465392682358,
"grad_norm": 0.40384456515312195,
"learning_rate": 0.00016724912485414234,
"loss": 3.2172,
"step": 49550
},
{
"epoch": 14.448030762060126,
"grad_norm": 0.40116435289382935,
"learning_rate": 0.00016681155192532088,
"loss": 3.2099,
"step": 49600
},
{
"epoch": 14.462596131437893,
"grad_norm": 0.4094943404197693,
"learning_rate": 0.00016637397899649942,
"loss": 3.2127,
"step": 49650
},
{
"epoch": 14.477161500815662,
"grad_norm": 0.40145185589790344,
"learning_rate": 0.00016593640606767793,
"loss": 3.22,
"step": 49700
},
{
"epoch": 14.491726870193428,
"grad_norm": 0.42102572321891785,
"learning_rate": 0.00016549883313885647,
"loss": 3.2253,
"step": 49750
},
{
"epoch": 14.506292239571195,
"grad_norm": 0.41271886229515076,
"learning_rate": 0.00016506126021003498,
"loss": 3.207,
"step": 49800
},
{
"epoch": 14.520857608948964,
"grad_norm": 0.41741323471069336,
"learning_rate": 0.00016462368728121355,
"loss": 3.2322,
"step": 49850
},
{
"epoch": 14.53542297832673,
"grad_norm": 0.40796613693237305,
"learning_rate": 0.00016418611435239206,
"loss": 3.2178,
"step": 49900
},
{
"epoch": 14.549988347704497,
"grad_norm": 0.4142317771911621,
"learning_rate": 0.00016374854142357057,
"loss": 3.2196,
"step": 49950
},
{
"epoch": 14.564553717082266,
"grad_norm": 0.4134737253189087,
"learning_rate": 0.0001633109684947491,
"loss": 3.229,
"step": 50000
},
{
"epoch": 14.564553717082266,
"eval_accuracy": 0.3739324620427029,
"eval_loss": 3.535773515701294,
"eval_runtime": 180.2641,
"eval_samples_per_second": 92.331,
"eval_steps_per_second": 5.775,
"step": 50000
},
{
"epoch": 14.579119086460032,
"grad_norm": 0.42424920201301575,
"learning_rate": 0.00016287339556592763,
"loss": 3.2276,
"step": 50050
},
{
"epoch": 14.5936844558378,
"grad_norm": 0.40107038617134094,
"learning_rate": 0.0001624358226371062,
"loss": 3.232,
"step": 50100
},
{
"epoch": 14.608249825215568,
"grad_norm": 0.38732558488845825,
"learning_rate": 0.0001619982497082847,
"loss": 3.2306,
"step": 50150
},
{
"epoch": 14.622815194593334,
"grad_norm": 0.42881304025650024,
"learning_rate": 0.00016156067677946322,
"loss": 3.2385,
"step": 50200
},
{
"epoch": 14.637380563971103,
"grad_norm": 0.41138574481010437,
"learning_rate": 0.00016112310385064176,
"loss": 3.2379,
"step": 50250
},
{
"epoch": 14.65194593334887,
"grad_norm": 0.41807669401168823,
"learning_rate": 0.00016068553092182027,
"loss": 3.2306,
"step": 50300
},
{
"epoch": 14.666511302726637,
"grad_norm": 0.4081842005252838,
"learning_rate": 0.00016024795799299884,
"loss": 3.238,
"step": 50350
},
{
"epoch": 14.681076672104405,
"grad_norm": 0.39501240849494934,
"learning_rate": 0.00015981038506417735,
"loss": 3.2364,
"step": 50400
},
{
"epoch": 14.695642041482172,
"grad_norm": 0.4069629907608032,
"learning_rate": 0.00015937281213535586,
"loss": 3.2281,
"step": 50450
},
{
"epoch": 14.71020741085994,
"grad_norm": 0.4017820656299591,
"learning_rate": 0.0001589352392065344,
"loss": 3.2359,
"step": 50500
},
{
"epoch": 14.724772780237707,
"grad_norm": 0.430561900138855,
"learning_rate": 0.00015849766627771294,
"loss": 3.2218,
"step": 50550
},
{
"epoch": 14.739338149615474,
"grad_norm": 0.4142705202102661,
"learning_rate": 0.00015806009334889148,
"loss": 3.2357,
"step": 50600
},
{
"epoch": 14.753903518993242,
"grad_norm": 0.42446601390838623,
"learning_rate": 0.00015762252042007,
"loss": 3.2325,
"step": 50650
},
{
"epoch": 14.76846888837101,
"grad_norm": 0.41279682517051697,
"learning_rate": 0.0001571849474912485,
"loss": 3.2272,
"step": 50700
},
{
"epoch": 14.783034257748776,
"grad_norm": 0.4026637375354767,
"learning_rate": 0.00015674737456242708,
"loss": 3.2323,
"step": 50750
},
{
"epoch": 14.797599627126544,
"grad_norm": 0.4120595157146454,
"learning_rate": 0.0001563098016336056,
"loss": 3.241,
"step": 50800
},
{
"epoch": 14.812164996504311,
"grad_norm": 0.4040710926055908,
"learning_rate": 0.00015587222870478413,
"loss": 3.2423,
"step": 50850
},
{
"epoch": 14.826730365882078,
"grad_norm": 0.4115070402622223,
"learning_rate": 0.00015543465577596264,
"loss": 3.2362,
"step": 50900
},
{
"epoch": 14.841295735259846,
"grad_norm": 0.41906213760375977,
"learning_rate": 0.00015499708284714116,
"loss": 3.2435,
"step": 50950
},
{
"epoch": 14.855861104637613,
"grad_norm": 0.42066025733947754,
"learning_rate": 0.00015455950991831972,
"loss": 3.2327,
"step": 51000
},
{
"epoch": 14.855861104637613,
"eval_accuracy": 0.37450694196553497,
"eval_loss": 3.529242515563965,
"eval_runtime": 180.1778,
"eval_samples_per_second": 92.375,
"eval_steps_per_second": 5.778,
"step": 51000
},
{
"epoch": 14.870426474015382,
"grad_norm": 0.4169136881828308,
"learning_rate": 0.00015412193698949824,
"loss": 3.2329,
"step": 51050
},
{
"epoch": 14.884991843393149,
"grad_norm": 0.4026949405670166,
"learning_rate": 0.00015368436406067677,
"loss": 3.2343,
"step": 51100
},
{
"epoch": 14.899557212770915,
"grad_norm": 0.4167788028717041,
"learning_rate": 0.0001532467911318553,
"loss": 3.2343,
"step": 51150
},
{
"epoch": 14.914122582148684,
"grad_norm": 0.41968563199043274,
"learning_rate": 0.0001528092182030338,
"loss": 3.2343,
"step": 51200
},
{
"epoch": 14.92868795152645,
"grad_norm": 0.4078276753425598,
"learning_rate": 0.00015237164527421237,
"loss": 3.2437,
"step": 51250
},
{
"epoch": 14.943253320904217,
"grad_norm": 0.4137614667415619,
"learning_rate": 0.00015193407234539088,
"loss": 3.2419,
"step": 51300
},
{
"epoch": 14.957818690281986,
"grad_norm": 0.4062293469905853,
"learning_rate": 0.00015149649941656942,
"loss": 3.2358,
"step": 51350
},
{
"epoch": 14.972384059659753,
"grad_norm": 0.42490682005882263,
"learning_rate": 0.00015105892648774793,
"loss": 3.2424,
"step": 51400
},
{
"epoch": 14.986949429037521,
"grad_norm": 0.4156704545021057,
"learning_rate": 0.00015062135355892647,
"loss": 3.2426,
"step": 51450
},
{
"epoch": 15.001456536937777,
"grad_norm": 0.42323753237724304,
"learning_rate": 0.000150183780630105,
"loss": 3.2332,
"step": 51500
},
{
"epoch": 15.016021906315544,
"grad_norm": 0.41732102632522583,
"learning_rate": 0.00014974620770128353,
"loss": 3.1514,
"step": 51550
},
{
"epoch": 15.030587275693312,
"grad_norm": 0.407942533493042,
"learning_rate": 0.00014930863477246207,
"loss": 3.16,
"step": 51600
},
{
"epoch": 15.045152645071079,
"grad_norm": 0.4309288561344147,
"learning_rate": 0.0001488710618436406,
"loss": 3.1614,
"step": 51650
},
{
"epoch": 15.059718014448846,
"grad_norm": 0.42493224143981934,
"learning_rate": 0.00014843348891481912,
"loss": 3.1634,
"step": 51700
},
{
"epoch": 15.074283383826614,
"grad_norm": 0.39733976125717163,
"learning_rate": 0.00014799591598599766,
"loss": 3.1639,
"step": 51750
},
{
"epoch": 15.088848753204381,
"grad_norm": 0.42916181683540344,
"learning_rate": 0.00014755834305717617,
"loss": 3.1924,
"step": 51800
},
{
"epoch": 15.103414122582148,
"grad_norm": 0.4103613495826721,
"learning_rate": 0.0001471207701283547,
"loss": 3.1701,
"step": 51850
},
{
"epoch": 15.117979491959916,
"grad_norm": 0.409218430519104,
"learning_rate": 0.00014668319719953325,
"loss": 3.169,
"step": 51900
},
{
"epoch": 15.132544861337683,
"grad_norm": 0.42851245403289795,
"learning_rate": 0.00014624562427071176,
"loss": 3.1857,
"step": 51950
},
{
"epoch": 15.147110230715452,
"grad_norm": 0.41377684473991394,
"learning_rate": 0.0001458080513418903,
"loss": 3.1816,
"step": 52000
},
{
"epoch": 15.147110230715452,
"eval_accuracy": 0.3739413978744948,
"eval_loss": 3.5385658740997314,
"eval_runtime": 180.1964,
"eval_samples_per_second": 92.366,
"eval_steps_per_second": 5.777,
"step": 52000
},
{
"epoch": 15.161675600093218,
"grad_norm": 0.4140619933605194,
"learning_rate": 0.00014537047841306882,
"loss": 3.1848,
"step": 52050
},
{
"epoch": 15.176240969470985,
"grad_norm": 0.4383089244365692,
"learning_rate": 0.00014493290548424736,
"loss": 3.1781,
"step": 52100
},
{
"epoch": 15.190806338848754,
"grad_norm": 0.4208312928676605,
"learning_rate": 0.0001444953325554259,
"loss": 3.1806,
"step": 52150
},
{
"epoch": 15.20537170822652,
"grad_norm": 0.4226909279823303,
"learning_rate": 0.00014405775962660444,
"loss": 3.1649,
"step": 52200
},
{
"epoch": 15.219937077604287,
"grad_norm": Infinity,
"learning_rate": 0.00014362018669778295,
"loss": 3.1823,
"step": 52250
},
{
"epoch": 15.234502446982056,
"grad_norm": 0.40975677967071533,
"learning_rate": 0.00014318261376896146,
"loss": 3.183,
"step": 52300
},
{
"epoch": 15.249067816359823,
"grad_norm": 0.4181591868400574,
"learning_rate": 0.00014274504084014,
"loss": 3.1905,
"step": 52350
},
{
"epoch": 15.263633185737591,
"grad_norm": 0.41215869784355164,
"learning_rate": 0.00014230746791131854,
"loss": 3.1817,
"step": 52400
},
{
"epoch": 15.278198555115358,
"grad_norm": 0.412744402885437,
"learning_rate": 0.00014186989498249708,
"loss": 3.195,
"step": 52450
},
{
"epoch": 15.292763924493125,
"grad_norm": 0.42229944467544556,
"learning_rate": 0.0001414323220536756,
"loss": 3.1861,
"step": 52500
},
{
"epoch": 15.307329293870893,
"grad_norm": 0.4238891899585724,
"learning_rate": 0.00014099474912485413,
"loss": 3.1816,
"step": 52550
},
{
"epoch": 15.32189466324866,
"grad_norm": 0.40468868613243103,
"learning_rate": 0.00014055717619603265,
"loss": 3.2001,
"step": 52600
},
{
"epoch": 15.336460032626427,
"grad_norm": 0.41819027066230774,
"learning_rate": 0.0001401196032672112,
"loss": 3.182,
"step": 52650
},
{
"epoch": 15.351025402004195,
"grad_norm": 0.4215780794620514,
"learning_rate": 0.00013968203033838973,
"loss": 3.1968,
"step": 52700
},
{
"epoch": 15.365590771381962,
"grad_norm": 0.4218878149986267,
"learning_rate": 0.00013924445740956827,
"loss": 3.1884,
"step": 52750
},
{
"epoch": 15.38015614075973,
"grad_norm": 0.4183255434036255,
"learning_rate": 0.00013880688448074678,
"loss": 3.1919,
"step": 52800
},
{
"epoch": 15.394721510137497,
"grad_norm": 0.42416125535964966,
"learning_rate": 0.0001383693115519253,
"loss": 3.1977,
"step": 52850
},
{
"epoch": 15.409286879515264,
"grad_norm": 0.41610389947891235,
"learning_rate": 0.00013793173862310383,
"loss": 3.2003,
"step": 52900
},
{
"epoch": 15.423852248893033,
"grad_norm": 0.4229474365711212,
"learning_rate": 0.00013749416569428237,
"loss": 3.1983,
"step": 52950
},
{
"epoch": 15.4384176182708,
"grad_norm": 0.4127897024154663,
"learning_rate": 0.0001370565927654609,
"loss": 3.1996,
"step": 53000
},
{
"epoch": 15.4384176182708,
"eval_accuracy": 0.3744517984772402,
"eval_loss": 3.5346455574035645,
"eval_runtime": 180.9262,
"eval_samples_per_second": 91.993,
"eval_steps_per_second": 5.754,
"step": 53000
},
{
"epoch": 15.452982987648566,
"grad_norm": 0.4062581956386566,
"learning_rate": 0.00013661901983663943,
"loss": 3.2082,
"step": 53050
},
{
"epoch": 15.467548357026335,
"grad_norm": 0.4083750545978546,
"learning_rate": 0.00013618144690781797,
"loss": 3.2046,
"step": 53100
},
{
"epoch": 15.482113726404101,
"grad_norm": 0.44948476552963257,
"learning_rate": 0.00013574387397899648,
"loss": 3.1963,
"step": 53150
},
{
"epoch": 15.49667909578187,
"grad_norm": 0.4223315715789795,
"learning_rate": 0.00013530630105017502,
"loss": 3.2006,
"step": 53200
},
{
"epoch": 15.511244465159637,
"grad_norm": 0.43807777762413025,
"learning_rate": 0.00013486872812135356,
"loss": 3.19,
"step": 53250
},
{
"epoch": 15.525809834537403,
"grad_norm": 0.4165053367614746,
"learning_rate": 0.00013443115519253207,
"loss": 3.2033,
"step": 53300
},
{
"epoch": 15.540375203915172,
"grad_norm": 0.4165057837963104,
"learning_rate": 0.0001339935822637106,
"loss": 3.2031,
"step": 53350
},
{
"epoch": 15.554940573292939,
"grad_norm": 0.4309650957584381,
"learning_rate": 0.00013355600933488912,
"loss": 3.2105,
"step": 53400
},
{
"epoch": 15.569505942670705,
"grad_norm": 0.41557958722114563,
"learning_rate": 0.00013311843640606766,
"loss": 3.2082,
"step": 53450
},
{
"epoch": 15.584071312048474,
"grad_norm": 0.44082722067832947,
"learning_rate": 0.0001326808634772462,
"loss": 3.2097,
"step": 53500
},
{
"epoch": 15.59863668142624,
"grad_norm": 0.4245944619178772,
"learning_rate": 0.00013224329054842472,
"loss": 3.2059,
"step": 53550
},
{
"epoch": 15.61320205080401,
"grad_norm": 0.41007092595100403,
"learning_rate": 0.00013180571761960326,
"loss": 3.1955,
"step": 53600
},
{
"epoch": 15.627767420181776,
"grad_norm": 0.42977604269981384,
"learning_rate": 0.0001313681446907818,
"loss": 3.2066,
"step": 53650
},
{
"epoch": 15.642332789559543,
"grad_norm": 0.40306100249290466,
"learning_rate": 0.0001309305717619603,
"loss": 3.2192,
"step": 53700
},
{
"epoch": 15.656898158937311,
"grad_norm": 0.4248296916484833,
"learning_rate": 0.00013049299883313885,
"loss": 3.207,
"step": 53750
},
{
"epoch": 15.671463528315078,
"grad_norm": 0.4259008765220642,
"learning_rate": 0.00013005542590431736,
"loss": 3.2088,
"step": 53800
},
{
"epoch": 15.686028897692845,
"grad_norm": 0.43237951397895813,
"learning_rate": 0.0001296178529754959,
"loss": 3.2053,
"step": 53850
},
{
"epoch": 15.700594267070613,
"grad_norm": 0.42358967661857605,
"learning_rate": 0.00012918028004667444,
"loss": 3.2073,
"step": 53900
},
{
"epoch": 15.71515963644838,
"grad_norm": 0.42595556378364563,
"learning_rate": 0.00012874270711785295,
"loss": 3.2038,
"step": 53950
},
{
"epoch": 15.729725005826147,
"grad_norm": 0.4171249568462372,
"learning_rate": 0.0001283051341890315,
"loss": 3.2029,
"step": 54000
},
{
"epoch": 15.729725005826147,
"eval_accuracy": 0.3747556167581649,
"eval_loss": 3.530050754547119,
"eval_runtime": 180.9342,
"eval_samples_per_second": 91.989,
"eval_steps_per_second": 5.753,
"step": 54000
},
{
"epoch": 15.744290375203915,
"grad_norm": 0.42589497566223145,
"learning_rate": 0.00012786756126021,
"loss": 3.2115,
"step": 54050
},
{
"epoch": 15.758855744581682,
"grad_norm": 0.4358471632003784,
"learning_rate": 0.00012742998833138855,
"loss": 3.2096,
"step": 54100
},
{
"epoch": 15.77342111395945,
"grad_norm": 0.4199720323085785,
"learning_rate": 0.0001269924154025671,
"loss": 3.1963,
"step": 54150
},
{
"epoch": 15.787986483337217,
"grad_norm": 0.43083542585372925,
"learning_rate": 0.00012655484247374563,
"loss": 3.2158,
"step": 54200
},
{
"epoch": 15.802551852714984,
"grad_norm": 0.42154741287231445,
"learning_rate": 0.00012611726954492414,
"loss": 3.2087,
"step": 54250
},
{
"epoch": 15.817117222092753,
"grad_norm": 0.441120445728302,
"learning_rate": 0.00012567969661610265,
"loss": 3.2178,
"step": 54300
},
{
"epoch": 15.83168259147052,
"grad_norm": 0.4148479402065277,
"learning_rate": 0.0001252421236872812,
"loss": 3.2268,
"step": 54350
},
{
"epoch": 15.846247960848288,
"grad_norm": 0.419406920671463,
"learning_rate": 0.00012480455075845973,
"loss": 3.2142,
"step": 54400
},
{
"epoch": 15.860813330226055,
"grad_norm": 0.42558178305625916,
"learning_rate": 0.00012436697782963827,
"loss": 3.2133,
"step": 54450
},
{
"epoch": 15.875378699603822,
"grad_norm": 0.4201781153678894,
"learning_rate": 0.00012392940490081679,
"loss": 3.2105,
"step": 54500
},
{
"epoch": 15.88994406898159,
"grad_norm": 0.4343336522579193,
"learning_rate": 0.00012349183197199533,
"loss": 3.2153,
"step": 54550
},
{
"epoch": 15.904509438359357,
"grad_norm": 0.4077042043209076,
"learning_rate": 0.00012305425904317384,
"loss": 3.2177,
"step": 54600
},
{
"epoch": 15.919074807737124,
"grad_norm": 0.4239185154438019,
"learning_rate": 0.00012261668611435238,
"loss": 3.219,
"step": 54650
},
{
"epoch": 15.933640177114892,
"grad_norm": 0.41873300075531006,
"learning_rate": 0.00012217911318553092,
"loss": 3.21,
"step": 54700
},
{
"epoch": 15.948205546492659,
"grad_norm": 0.41313689947128296,
"learning_rate": 0.00012174154025670944,
"loss": 3.204,
"step": 54750
},
{
"epoch": 15.962770915870426,
"grad_norm": 0.41441574692726135,
"learning_rate": 0.00012130396732788796,
"loss": 3.2034,
"step": 54800
},
{
"epoch": 15.977336285248194,
"grad_norm": 0.43360719084739685,
"learning_rate": 0.0001208663943990665,
"loss": 3.2189,
"step": 54850
},
{
"epoch": 15.991901654625961,
"grad_norm": 0.4335940480232239,
"learning_rate": 0.00012042882147024502,
"loss": 3.2178,
"step": 54900
},
{
"epoch": 16.006408762526217,
"grad_norm": 0.4299444258213043,
"learning_rate": 0.00011999124854142356,
"loss": 3.1882,
"step": 54950
},
{
"epoch": 16.020974131903984,
"grad_norm": 0.4137655794620514,
"learning_rate": 0.00011955367561260209,
"loss": 3.151,
"step": 55000
},
{
"epoch": 16.020974131903984,
"eval_accuracy": 0.37487225287839604,
"eval_loss": 3.5326309204101562,
"eval_runtime": 181.1353,
"eval_samples_per_second": 91.887,
"eval_steps_per_second": 5.747,
"step": 55000
},
{
"epoch": 16.035539501281754,
"grad_norm": 0.4282309114933014,
"learning_rate": 0.00011911610268378062,
"loss": 3.1378,
"step": 55050
},
{
"epoch": 16.05010487065952,
"grad_norm": 0.43013036251068115,
"learning_rate": 0.00011867852975495914,
"loss": 3.1477,
"step": 55100
},
{
"epoch": 16.064670240037287,
"grad_norm": 0.4036613404750824,
"learning_rate": 0.00011824095682613768,
"loss": 3.1385,
"step": 55150
},
{
"epoch": 16.079235609415054,
"grad_norm": 0.4135037660598755,
"learning_rate": 0.00011780338389731621,
"loss": 3.1613,
"step": 55200
},
{
"epoch": 16.09380097879282,
"grad_norm": 0.4376969635486603,
"learning_rate": 0.00011736581096849475,
"loss": 3.1427,
"step": 55250
},
{
"epoch": 16.10836634817059,
"grad_norm": 0.449358731508255,
"learning_rate": 0.00011692823803967326,
"loss": 3.1549,
"step": 55300
},
{
"epoch": 16.122931717548358,
"grad_norm": 0.4239208698272705,
"learning_rate": 0.00011649066511085179,
"loss": 3.1624,
"step": 55350
},
{
"epoch": 16.137497086926125,
"grad_norm": 0.42871618270874023,
"learning_rate": 0.00011605309218203033,
"loss": 3.1611,
"step": 55400
},
{
"epoch": 16.15206245630389,
"grad_norm": 0.4187133014202118,
"learning_rate": 0.00011561551925320885,
"loss": 3.16,
"step": 55450
},
{
"epoch": 16.16662782568166,
"grad_norm": 0.43837517499923706,
"learning_rate": 0.0001151779463243874,
"loss": 3.1494,
"step": 55500
},
{
"epoch": 16.181193195059425,
"grad_norm": 0.4250280261039734,
"learning_rate": 0.00011474037339556591,
"loss": 3.1543,
"step": 55550
},
{
"epoch": 16.195758564437195,
"grad_norm": 0.42530959844589233,
"learning_rate": 0.00011430280046674445,
"loss": 3.1667,
"step": 55600
},
{
"epoch": 16.210323933814962,
"grad_norm": 0.4092886447906494,
"learning_rate": 0.00011386522753792297,
"loss": 3.1579,
"step": 55650
},
{
"epoch": 16.22488930319273,
"grad_norm": 0.4281422793865204,
"learning_rate": 0.00011342765460910151,
"loss": 3.1603,
"step": 55700
},
{
"epoch": 16.239454672570496,
"grad_norm": 0.41307032108306885,
"learning_rate": 0.00011299008168028004,
"loss": 3.1687,
"step": 55750
},
{
"epoch": 16.254020041948262,
"grad_norm": 0.44364210963249207,
"learning_rate": 0.00011255250875145855,
"loss": 3.1769,
"step": 55800
},
{
"epoch": 16.268585411326033,
"grad_norm": 0.42538779973983765,
"learning_rate": 0.00011211493582263709,
"loss": 3.1639,
"step": 55850
},
{
"epoch": 16.2831507807038,
"grad_norm": 0.42053961753845215,
"learning_rate": 0.00011167736289381562,
"loss": 3.1692,
"step": 55900
},
{
"epoch": 16.297716150081566,
"grad_norm": 0.41357651352882385,
"learning_rate": 0.00011123978996499416,
"loss": 3.1697,
"step": 55950
},
{
"epoch": 16.312281519459333,
"grad_norm": 0.42656847834587097,
"learning_rate": 0.00011080221703617269,
"loss": 3.1679,
"step": 56000
},
{
"epoch": 16.312281519459333,
"eval_accuracy": 0.374931041245448,
"eval_loss": 3.5328683853149414,
"eval_runtime": 180.861,
"eval_samples_per_second": 92.026,
"eval_steps_per_second": 5.756,
"step": 56000
},
{
"epoch": 16.3268468888371,
"grad_norm": 0.4274371266365051,
"learning_rate": 0.00011036464410735121,
"loss": 3.1626,
"step": 56050
},
{
"epoch": 16.34141225821487,
"grad_norm": 0.41279909014701843,
"learning_rate": 0.00010992707117852974,
"loss": 3.1719,
"step": 56100
},
{
"epoch": 16.355977627592637,
"grad_norm": 0.420175701379776,
"learning_rate": 0.00010948949824970828,
"loss": 3.172,
"step": 56150
},
{
"epoch": 16.370542996970403,
"grad_norm": 0.4424479603767395,
"learning_rate": 0.0001090519253208868,
"loss": 3.1707,
"step": 56200
},
{
"epoch": 16.38510836634817,
"grad_norm": 0.42122504115104675,
"learning_rate": 0.00010861435239206534,
"loss": 3.1643,
"step": 56250
},
{
"epoch": 16.399673735725937,
"grad_norm": 0.4253459572792053,
"learning_rate": 0.00010817677946324386,
"loss": 3.1608,
"step": 56300
},
{
"epoch": 16.414239105103704,
"grad_norm": 0.4293667674064636,
"learning_rate": 0.00010773920653442238,
"loss": 3.181,
"step": 56350
},
{
"epoch": 16.428804474481474,
"grad_norm": 0.42189478874206543,
"learning_rate": 0.00010730163360560092,
"loss": 3.1808,
"step": 56400
},
{
"epoch": 16.44336984385924,
"grad_norm": 0.41094401478767395,
"learning_rate": 0.00010686406067677945,
"loss": 3.1792,
"step": 56450
},
{
"epoch": 16.457935213237008,
"grad_norm": 0.43915942311286926,
"learning_rate": 0.00010642648774795799,
"loss": 3.1726,
"step": 56500
},
{
"epoch": 16.472500582614774,
"grad_norm": 0.4174824357032776,
"learning_rate": 0.0001059889148191365,
"loss": 3.1749,
"step": 56550
},
{
"epoch": 16.48706595199254,
"grad_norm": 0.4097413420677185,
"learning_rate": 0.00010555134189031504,
"loss": 3.1743,
"step": 56600
},
{
"epoch": 16.50163132137031,
"grad_norm": 0.4119010269641876,
"learning_rate": 0.00010511376896149357,
"loss": 3.1766,
"step": 56650
},
{
"epoch": 16.516196690748078,
"grad_norm": 0.4229290187358856,
"learning_rate": 0.00010467619603267211,
"loss": 3.1763,
"step": 56700
},
{
"epoch": 16.530762060125845,
"grad_norm": 0.42102253437042236,
"learning_rate": 0.00010423862310385064,
"loss": 3.1693,
"step": 56750
},
{
"epoch": 16.54532742950361,
"grad_norm": 0.40270158648490906,
"learning_rate": 0.00010380105017502915,
"loss": 3.1758,
"step": 56800
},
{
"epoch": 16.55989279888138,
"grad_norm": 0.4175575375556946,
"learning_rate": 0.00010336347724620769,
"loss": 3.1769,
"step": 56850
},
{
"epoch": 16.57445816825915,
"grad_norm": 0.43574297428131104,
"learning_rate": 0.00010292590431738621,
"loss": 3.1885,
"step": 56900
},
{
"epoch": 16.589023537636916,
"grad_norm": 0.41739705204963684,
"learning_rate": 0.00010248833138856475,
"loss": 3.177,
"step": 56950
},
{
"epoch": 16.603588907014682,
"grad_norm": 0.43442845344543457,
"learning_rate": 0.00010205075845974328,
"loss": 3.183,
"step": 57000
},
{
"epoch": 16.603588907014682,
"eval_accuracy": 0.3752198097044074,
"eval_loss": 3.528458595275879,
"eval_runtime": 180.2565,
"eval_samples_per_second": 92.335,
"eval_steps_per_second": 5.775,
"step": 57000
},
{
"epoch": 16.61815427639245,
"grad_norm": 0.4157189130783081,
"learning_rate": 0.00010161318553092181,
"loss": 3.1774,
"step": 57050
},
{
"epoch": 16.632719645770216,
"grad_norm": 0.4215414822101593,
"learning_rate": 0.00010117561260210033,
"loss": 3.1864,
"step": 57100
},
{
"epoch": 16.647285015147983,
"grad_norm": 0.42653682827949524,
"learning_rate": 0.00010073803967327887,
"loss": 3.185,
"step": 57150
},
{
"epoch": 16.661850384525753,
"grad_norm": 0.43946152925491333,
"learning_rate": 0.0001003004667444574,
"loss": 3.191,
"step": 57200
},
{
"epoch": 16.67641575390352,
"grad_norm": 0.4414174258708954,
"learning_rate": 9.986289381563594e-05,
"loss": 3.1942,
"step": 57250
},
{
"epoch": 16.690981123281286,
"grad_norm": 0.4162786602973938,
"learning_rate": 9.942532088681445e-05,
"loss": 3.1902,
"step": 57300
},
{
"epoch": 16.705546492659053,
"grad_norm": 0.4169985353946686,
"learning_rate": 9.898774795799299e-05,
"loss": 3.1915,
"step": 57350
},
{
"epoch": 16.72011186203682,
"grad_norm": 0.4140052795410156,
"learning_rate": 9.855017502917152e-05,
"loss": 3.1698,
"step": 57400
},
{
"epoch": 16.73467723141459,
"grad_norm": 0.45123833417892456,
"learning_rate": 9.811260210035005e-05,
"loss": 3.1909,
"step": 57450
},
{
"epoch": 16.749242600792357,
"grad_norm": 0.4276806116104126,
"learning_rate": 9.767502917152858e-05,
"loss": 3.188,
"step": 57500
},
{
"epoch": 16.763807970170124,
"grad_norm": 0.431024432182312,
"learning_rate": 9.72374562427071e-05,
"loss": 3.1845,
"step": 57550
},
{
"epoch": 16.77837333954789,
"grad_norm": 0.418350487947464,
"learning_rate": 9.679988331388564e-05,
"loss": 3.1848,
"step": 57600
},
{
"epoch": 16.792938708925657,
"grad_norm": 0.4283568561077118,
"learning_rate": 9.636231038506416e-05,
"loss": 3.1821,
"step": 57650
},
{
"epoch": 16.807504078303424,
"grad_norm": 0.434356153011322,
"learning_rate": 9.59247374562427e-05,
"loss": 3.1919,
"step": 57700
},
{
"epoch": 16.822069447681194,
"grad_norm": 0.43099457025527954,
"learning_rate": 9.548716452742123e-05,
"loss": 3.1865,
"step": 57750
},
{
"epoch": 16.83663481705896,
"grad_norm": 0.4274565577507019,
"learning_rate": 9.504959159859976e-05,
"loss": 3.1867,
"step": 57800
},
{
"epoch": 16.851200186436728,
"grad_norm": 0.42834699153900146,
"learning_rate": 9.461201866977828e-05,
"loss": 3.1868,
"step": 57850
},
{
"epoch": 16.865765555814495,
"grad_norm": 0.4234914481639862,
"learning_rate": 9.417444574095682e-05,
"loss": 3.1787,
"step": 57900
},
{
"epoch": 16.88033092519226,
"grad_norm": 0.42309653759002686,
"learning_rate": 9.373687281213535e-05,
"loss": 3.1881,
"step": 57950
},
{
"epoch": 16.89489629457003,
"grad_norm": 0.4064479470252991,
"learning_rate": 9.329929988331389e-05,
"loss": 3.1985,
"step": 58000
},
{
"epoch": 16.89489629457003,
"eval_accuracy": 0.37572386116351114,
"eval_loss": 3.522873640060425,
"eval_runtime": 180.6669,
"eval_samples_per_second": 92.125,
"eval_steps_per_second": 5.762,
"step": 58000
},
{
"epoch": 16.9094616639478,
"grad_norm": 0.4073335528373718,
"learning_rate": 9.28617269544924e-05,
"loss": 3.193,
"step": 58050
},
{
"epoch": 16.924027033325565,
"grad_norm": 0.42059651017189026,
"learning_rate": 9.242415402567093e-05,
"loss": 3.194,
"step": 58100
},
{
"epoch": 16.938592402703332,
"grad_norm": 0.4243110716342926,
"learning_rate": 9.198658109684947e-05,
"loss": 3.1857,
"step": 58150
},
{
"epoch": 16.9531577720811,
"grad_norm": 0.4175581932067871,
"learning_rate": 9.1549008168028e-05,
"loss": 3.1957,
"step": 58200
},
{
"epoch": 16.96772314145887,
"grad_norm": 0.4244025647640228,
"learning_rate": 9.111143523920653e-05,
"loss": 3.1841,
"step": 58250
},
{
"epoch": 16.982288510836636,
"grad_norm": 0.4220493733882904,
"learning_rate": 9.067386231038505e-05,
"loss": 3.1947,
"step": 58300
},
{
"epoch": 16.996853880214402,
"grad_norm": 0.43514513969421387,
"learning_rate": 9.023628938156359e-05,
"loss": 3.185,
"step": 58350
},
{
"epoch": 17.01136098811466,
"grad_norm": 0.44147107005119324,
"learning_rate": 8.979871645274211e-05,
"loss": 3.1409,
"step": 58400
},
{
"epoch": 17.025926357492427,
"grad_norm": 0.41750553250312805,
"learning_rate": 8.936114352392065e-05,
"loss": 3.1336,
"step": 58450
},
{
"epoch": 17.040491726870194,
"grad_norm": 0.43038827180862427,
"learning_rate": 8.892357059509918e-05,
"loss": 3.136,
"step": 58500
},
{
"epoch": 17.05505709624796,
"grad_norm": 0.43373844027519226,
"learning_rate": 8.848599766627769e-05,
"loss": 3.1353,
"step": 58550
},
{
"epoch": 17.069622465625727,
"grad_norm": 0.4187583923339844,
"learning_rate": 8.804842473745623e-05,
"loss": 3.1367,
"step": 58600
},
{
"epoch": 17.084187835003497,
"grad_norm": 0.43599000573158264,
"learning_rate": 8.761085180863476e-05,
"loss": 3.1356,
"step": 58650
},
{
"epoch": 17.098753204381264,
"grad_norm": 0.4198746085166931,
"learning_rate": 8.71732788798133e-05,
"loss": 3.1435,
"step": 58700
},
{
"epoch": 17.11331857375903,
"grad_norm": 0.4513174891471863,
"learning_rate": 8.673570595099183e-05,
"loss": 3.136,
"step": 58750
},
{
"epoch": 17.127883943136798,
"grad_norm": 0.43155333399772644,
"learning_rate": 8.629813302217035e-05,
"loss": 3.1333,
"step": 58800
},
{
"epoch": 17.142449312514564,
"grad_norm": 0.4096786379814148,
"learning_rate": 8.586056009334888e-05,
"loss": 3.1427,
"step": 58850
},
{
"epoch": 17.15701468189233,
"grad_norm": 0.43581393361091614,
"learning_rate": 8.542298716452742e-05,
"loss": 3.1465,
"step": 58900
},
{
"epoch": 17.1715800512701,
"grad_norm": 0.4341733753681183,
"learning_rate": 8.498541423570594e-05,
"loss": 3.1471,
"step": 58950
},
{
"epoch": 17.18614542064787,
"grad_norm": 0.4542756676673889,
"learning_rate": 8.454784130688448e-05,
"loss": 3.1424,
"step": 59000
},
{
"epoch": 17.18614542064787,
"eval_accuracy": 0.37545496317261534,
"eval_loss": 3.531616687774658,
"eval_runtime": 180.3206,
"eval_samples_per_second": 92.302,
"eval_steps_per_second": 5.773,
"step": 59000
},
{
"epoch": 17.200710790025635,
"grad_norm": 0.4263310134410858,
"learning_rate": 8.4110268378063e-05,
"loss": 3.1367,
"step": 59050
},
{
"epoch": 17.215276159403402,
"grad_norm": 0.4178173840045929,
"learning_rate": 8.367269544924152e-05,
"loss": 3.1371,
"step": 59100
},
{
"epoch": 17.22984152878117,
"grad_norm": 0.4277113676071167,
"learning_rate": 8.323512252042006e-05,
"loss": 3.1427,
"step": 59150
},
{
"epoch": 17.24440689815894,
"grad_norm": 0.42608314752578735,
"learning_rate": 8.279754959159859e-05,
"loss": 3.1416,
"step": 59200
},
{
"epoch": 17.258972267536706,
"grad_norm": 0.43384233117103577,
"learning_rate": 8.235997666277713e-05,
"loss": 3.1399,
"step": 59250
},
{
"epoch": 17.273537636914472,
"grad_norm": 0.4369141459465027,
"learning_rate": 8.192240373395564e-05,
"loss": 3.1502,
"step": 59300
},
{
"epoch": 17.28810300629224,
"grad_norm": 0.42754146456718445,
"learning_rate": 8.148483080513418e-05,
"loss": 3.1351,
"step": 59350
},
{
"epoch": 17.302668375670006,
"grad_norm": 0.42747077345848083,
"learning_rate": 8.104725787631271e-05,
"loss": 3.1469,
"step": 59400
},
{
"epoch": 17.317233745047773,
"grad_norm": 0.4387149512767792,
"learning_rate": 8.060968494749125e-05,
"loss": 3.1374,
"step": 59450
},
{
"epoch": 17.331799114425543,
"grad_norm": 0.4392690658569336,
"learning_rate": 8.017211201866978e-05,
"loss": 3.1515,
"step": 59500
},
{
"epoch": 17.34636448380331,
"grad_norm": 0.4403955042362213,
"learning_rate": 7.973453908984829e-05,
"loss": 3.1615,
"step": 59550
},
{
"epoch": 17.360929853181077,
"grad_norm": 0.43617773056030273,
"learning_rate": 7.929696616102683e-05,
"loss": 3.1528,
"step": 59600
},
{
"epoch": 17.375495222558843,
"grad_norm": 0.4383864998817444,
"learning_rate": 7.885939323220535e-05,
"loss": 3.1529,
"step": 59650
},
{
"epoch": 17.39006059193661,
"grad_norm": 0.43385154008865356,
"learning_rate": 7.84218203033839e-05,
"loss": 3.1476,
"step": 59700
},
{
"epoch": 17.40462596131438,
"grad_norm": 0.4384395182132721,
"learning_rate": 7.798424737456242e-05,
"loss": 3.1535,
"step": 59750
},
{
"epoch": 17.419191330692147,
"grad_norm": 0.431538462638855,
"learning_rate": 7.754667444574096e-05,
"loss": 3.1563,
"step": 59800
},
{
"epoch": 17.433756700069914,
"grad_norm": 0.42602774500846863,
"learning_rate": 7.710910151691947e-05,
"loss": 3.1524,
"step": 59850
},
{
"epoch": 17.44832206944768,
"grad_norm": 0.4278332591056824,
"learning_rate": 7.667152858809801e-05,
"loss": 3.1448,
"step": 59900
},
{
"epoch": 17.462887438825447,
"grad_norm": 0.4381519556045532,
"learning_rate": 7.623395565927654e-05,
"loss": 3.1589,
"step": 59950
},
{
"epoch": 17.477452808203218,
"grad_norm": 0.4320782721042633,
"learning_rate": 7.579638273045508e-05,
"loss": 3.1539,
"step": 60000
},
{
"epoch": 17.477452808203218,
"eval_accuracy": 0.37566424975932045,
"eval_loss": 3.527134418487549,
"eval_runtime": 181.5925,
"eval_samples_per_second": 91.656,
"eval_steps_per_second": 5.733,
"step": 60000
},
{
"epoch": 17.492018177580984,
"grad_norm": 0.4376213848590851,
"learning_rate": 7.53588098016336e-05,
"loss": 3.1551,
"step": 60050
},
{
"epoch": 17.50658354695875,
"grad_norm": 0.43057137727737427,
"learning_rate": 7.492123687281213e-05,
"loss": 3.1608,
"step": 60100
},
{
"epoch": 17.521148916336518,
"grad_norm": 0.42992404103279114,
"learning_rate": 7.448366394399066e-05,
"loss": 3.1541,
"step": 60150
},
{
"epoch": 17.535714285714285,
"grad_norm": 0.4488855302333832,
"learning_rate": 7.404609101516919e-05,
"loss": 3.1475,
"step": 60200
},
{
"epoch": 17.55027965509205,
"grad_norm": 0.4396044909954071,
"learning_rate": 7.360851808634771e-05,
"loss": 3.1491,
"step": 60250
},
{
"epoch": 17.56484502446982,
"grad_norm": 0.43914586305618286,
"learning_rate": 7.317094515752625e-05,
"loss": 3.1591,
"step": 60300
},
{
"epoch": 17.57941039384759,
"grad_norm": 0.42674726247787476,
"learning_rate": 7.273337222870478e-05,
"loss": 3.1557,
"step": 60350
},
{
"epoch": 17.593975763225355,
"grad_norm": 0.4434099495410919,
"learning_rate": 7.22957992998833e-05,
"loss": 3.1607,
"step": 60400
},
{
"epoch": 17.608541132603122,
"grad_norm": 0.4289220869541168,
"learning_rate": 7.185822637106184e-05,
"loss": 3.1573,
"step": 60450
},
{
"epoch": 17.62310650198089,
"grad_norm": 0.4319080412387848,
"learning_rate": 7.142065344224036e-05,
"loss": 3.1555,
"step": 60500
},
{
"epoch": 17.63767187135866,
"grad_norm": 0.4276769459247589,
"learning_rate": 7.09830805134189e-05,
"loss": 3.1646,
"step": 60550
},
{
"epoch": 17.652237240736426,
"grad_norm": 0.42025482654571533,
"learning_rate": 7.054550758459742e-05,
"loss": 3.1637,
"step": 60600
},
{
"epoch": 17.666802610114193,
"grad_norm": 0.44079354405403137,
"learning_rate": 7.010793465577595e-05,
"loss": 3.1607,
"step": 60650
},
{
"epoch": 17.68136797949196,
"grad_norm": 0.4618414342403412,
"learning_rate": 6.967036172695449e-05,
"loss": 3.1516,
"step": 60700
},
{
"epoch": 17.695933348869726,
"grad_norm": 0.43492668867111206,
"learning_rate": 6.923278879813302e-05,
"loss": 3.159,
"step": 60750
},
{
"epoch": 17.710498718247496,
"grad_norm": 0.4092758297920227,
"learning_rate": 6.879521586931154e-05,
"loss": 3.152,
"step": 60800
},
{
"epoch": 17.725064087625263,
"grad_norm": 0.4256676137447357,
"learning_rate": 6.835764294049008e-05,
"loss": 3.1687,
"step": 60850
},
{
"epoch": 17.73962945700303,
"grad_norm": 0.440639466047287,
"learning_rate": 6.792007001166861e-05,
"loss": 3.161,
"step": 60900
},
{
"epoch": 17.754194826380797,
"grad_norm": 0.4303901493549347,
"learning_rate": 6.748249708284714e-05,
"loss": 3.1631,
"step": 60950
},
{
"epoch": 17.768760195758563,
"grad_norm": 0.4405215084552765,
"learning_rate": 6.704492415402566e-05,
"loss": 3.1562,
"step": 61000
},
{
"epoch": 17.768760195758563,
"eval_accuracy": 0.3759568982505052,
"eval_loss": 3.524013042449951,
"eval_runtime": 180.1467,
"eval_samples_per_second": 92.391,
"eval_steps_per_second": 5.779,
"step": 61000
},
{
"epoch": 17.78332556513633,
"grad_norm": 0.4302792251110077,
"learning_rate": 6.660735122520419e-05,
"loss": 3.1568,
"step": 61050
},
{
"epoch": 17.7978909345141,
"grad_norm": 0.4325387179851532,
"learning_rate": 6.616977829638273e-05,
"loss": 3.1577,
"step": 61100
},
{
"epoch": 17.812456303891867,
"grad_norm": 0.43648290634155273,
"learning_rate": 6.573220536756125e-05,
"loss": 3.1602,
"step": 61150
},
{
"epoch": 17.827021673269634,
"grad_norm": 0.4282855987548828,
"learning_rate": 6.529463243873978e-05,
"loss": 3.166,
"step": 61200
},
{
"epoch": 17.8415870426474,
"grad_norm": 0.4502982497215271,
"learning_rate": 6.485705950991831e-05,
"loss": 3.1627,
"step": 61250
},
{
"epoch": 17.856152412025168,
"grad_norm": 0.42750799655914307,
"learning_rate": 6.441948658109685e-05,
"loss": 3.1722,
"step": 61300
},
{
"epoch": 17.870717781402938,
"grad_norm": 0.4294120669364929,
"learning_rate": 6.398191365227537e-05,
"loss": 3.1663,
"step": 61350
},
{
"epoch": 17.885283150780705,
"grad_norm": 0.4335509240627289,
"learning_rate": 6.35443407234539e-05,
"loss": 3.1612,
"step": 61400
},
{
"epoch": 17.89984852015847,
"grad_norm": 0.44340866804122925,
"learning_rate": 6.310676779463244e-05,
"loss": 3.1593,
"step": 61450
},
{
"epoch": 17.914413889536238,
"grad_norm": 0.44225451350212097,
"learning_rate": 6.266919486581095e-05,
"loss": 3.1667,
"step": 61500
},
{
"epoch": 17.928979258914005,
"grad_norm": 0.4261557459831238,
"learning_rate": 6.223162193698949e-05,
"loss": 3.1746,
"step": 61550
},
{
"epoch": 17.943544628291775,
"grad_norm": 0.43242040276527405,
"learning_rate": 6.179404900816802e-05,
"loss": 3.1554,
"step": 61600
},
{
"epoch": 17.958109997669542,
"grad_norm": 0.43706896901130676,
"learning_rate": 6.135647607934655e-05,
"loss": 3.1525,
"step": 61650
},
{
"epoch": 17.97267536704731,
"grad_norm": 0.43134328722953796,
"learning_rate": 6.0918903150525085e-05,
"loss": 3.1647,
"step": 61700
},
{
"epoch": 17.987240736425075,
"grad_norm": 0.4260832667350769,
"learning_rate": 6.048133022170361e-05,
"loss": 3.1699,
"step": 61750
},
{
"epoch": 18.001747844325333,
"grad_norm": 0.4271147549152374,
"learning_rate": 6.0043757292882145e-05,
"loss": 3.1569,
"step": 61800
},
{
"epoch": 18.0163132137031,
"grad_norm": 0.4241034984588623,
"learning_rate": 5.960618436406067e-05,
"loss": 3.1166,
"step": 61850
},
{
"epoch": 18.030878583080867,
"grad_norm": 0.443732887506485,
"learning_rate": 5.91686114352392e-05,
"loss": 3.1261,
"step": 61900
},
{
"epoch": 18.045443952458633,
"grad_norm": 0.4324038624763489,
"learning_rate": 5.873103850641773e-05,
"loss": 3.1091,
"step": 61950
},
{
"epoch": 18.0600093218364,
"grad_norm": 0.43087828159332275,
"learning_rate": 5.829346557759626e-05,
"loss": 3.1148,
"step": 62000
},
{
"epoch": 18.0600093218364,
"eval_accuracy": 0.3759096324033954,
"eval_loss": 3.526399612426758,
"eval_runtime": 180.2992,
"eval_samples_per_second": 92.313,
"eval_steps_per_second": 5.774,
"step": 62000
},
{
"epoch": 18.07457469121417,
"grad_norm": 0.43519923090934753,
"learning_rate": 5.785589264877479e-05,
"loss": 3.1092,
"step": 62050
},
{
"epoch": 18.089140060591937,
"grad_norm": 0.42430856823921204,
"learning_rate": 5.7418319719953323e-05,
"loss": 3.1264,
"step": 62100
},
{
"epoch": 18.103705429969704,
"grad_norm": 0.43190985918045044,
"learning_rate": 5.698074679113185e-05,
"loss": 3.1285,
"step": 62150
},
{
"epoch": 18.11827079934747,
"grad_norm": 0.4296327829360962,
"learning_rate": 5.654317386231038e-05,
"loss": 3.1147,
"step": 62200
},
{
"epoch": 18.132836168725238,
"grad_norm": 0.42043790221214294,
"learning_rate": 5.610560093348891e-05,
"loss": 3.121,
"step": 62250
},
{
"epoch": 18.147401538103008,
"grad_norm": 0.4322208762168884,
"learning_rate": 5.566802800466744e-05,
"loss": 3.1246,
"step": 62300
},
{
"epoch": 18.161966907480775,
"grad_norm": 0.4340595006942749,
"learning_rate": 5.5230455075845976e-05,
"loss": 3.1167,
"step": 62350
},
{
"epoch": 18.17653227685854,
"grad_norm": 0.4333188533782959,
"learning_rate": 5.4792882147024495e-05,
"loss": 3.1339,
"step": 62400
},
{
"epoch": 18.191097646236308,
"grad_norm": 0.43425410985946655,
"learning_rate": 5.435530921820303e-05,
"loss": 3.1255,
"step": 62450
},
{
"epoch": 18.205663015614075,
"grad_norm": 0.4476664960384369,
"learning_rate": 5.3917736289381555e-05,
"loss": 3.1252,
"step": 62500
},
{
"epoch": 18.22022838499184,
"grad_norm": 0.4289979040622711,
"learning_rate": 5.348016336056009e-05,
"loss": 3.1282,
"step": 62550
},
{
"epoch": 18.234793754369612,
"grad_norm": 0.4287092685699463,
"learning_rate": 5.304259043173862e-05,
"loss": 3.1205,
"step": 62600
},
{
"epoch": 18.24935912374738,
"grad_norm": 0.44103768467903137,
"learning_rate": 5.260501750291715e-05,
"loss": 3.1336,
"step": 62650
},
{
"epoch": 18.263924493125145,
"grad_norm": 0.45782148838043213,
"learning_rate": 5.216744457409568e-05,
"loss": 3.1296,
"step": 62700
},
{
"epoch": 18.278489862502912,
"grad_norm": 0.4396607279777527,
"learning_rate": 5.172987164527421e-05,
"loss": 3.1249,
"step": 62750
},
{
"epoch": 18.29305523188068,
"grad_norm": 0.4368782639503479,
"learning_rate": 5.129229871645274e-05,
"loss": 3.1074,
"step": 62800
},
{
"epoch": 18.30762060125845,
"grad_norm": 0.4307306706905365,
"learning_rate": 5.085472578763127e-05,
"loss": 3.1278,
"step": 62850
},
{
"epoch": 18.322185970636216,
"grad_norm": 0.45161545276641846,
"learning_rate": 5.041715285880979e-05,
"loss": 3.1397,
"step": 62900
},
{
"epoch": 18.336751340013983,
"grad_norm": 0.42871034145355225,
"learning_rate": 4.9979579929988326e-05,
"loss": 3.1199,
"step": 62950
},
{
"epoch": 18.35131670939175,
"grad_norm": 0.43537962436676025,
"learning_rate": 4.954200700116685e-05,
"loss": 3.1308,
"step": 63000
},
{
"epoch": 18.35131670939175,
"eval_accuracy": 0.3758205092389446,
"eval_loss": 3.527667284011841,
"eval_runtime": 180.5705,
"eval_samples_per_second": 92.175,
"eval_steps_per_second": 5.765,
"step": 63000
},
{
"epoch": 18.365882078769516,
"grad_norm": 0.43271610140800476,
"learning_rate": 4.9104434072345386e-05,
"loss": 3.1386,
"step": 63050
},
{
"epoch": 18.380447448147287,
"grad_norm": 0.4385630190372467,
"learning_rate": 4.866686114352392e-05,
"loss": 3.1426,
"step": 63100
},
{
"epoch": 18.395012817525053,
"grad_norm": 0.4242112934589386,
"learning_rate": 4.8229288214702445e-05,
"loss": 3.1247,
"step": 63150
},
{
"epoch": 18.40957818690282,
"grad_norm": 0.43731290102005005,
"learning_rate": 4.779171528588098e-05,
"loss": 3.1273,
"step": 63200
},
{
"epoch": 18.424143556280587,
"grad_norm": 0.43987950682640076,
"learning_rate": 4.7354142357059505e-05,
"loss": 3.1375,
"step": 63250
},
{
"epoch": 18.438708925658354,
"grad_norm": 0.4338039755821228,
"learning_rate": 4.691656942823804e-05,
"loss": 3.1448,
"step": 63300
},
{
"epoch": 18.45327429503612,
"grad_norm": 0.43445321917533875,
"learning_rate": 4.647899649941657e-05,
"loss": 3.1226,
"step": 63350
},
{
"epoch": 18.46783966441389,
"grad_norm": 0.4330557882785797,
"learning_rate": 4.604142357059509e-05,
"loss": 3.1239,
"step": 63400
},
{
"epoch": 18.482405033791657,
"grad_norm": 0.4354225993156433,
"learning_rate": 4.5603850641773624e-05,
"loss": 3.1444,
"step": 63450
},
{
"epoch": 18.496970403169424,
"grad_norm": 0.4307340085506439,
"learning_rate": 4.516627771295215e-05,
"loss": 3.1399,
"step": 63500
},
{
"epoch": 18.51153577254719,
"grad_norm": 0.4360226094722748,
"learning_rate": 4.472870478413068e-05,
"loss": 3.1405,
"step": 63550
},
{
"epoch": 18.526101141924958,
"grad_norm": 0.4463973343372345,
"learning_rate": 4.4291131855309216e-05,
"loss": 3.1232,
"step": 63600
},
{
"epoch": 18.540666511302728,
"grad_norm": 0.43620696663856506,
"learning_rate": 4.385355892648774e-05,
"loss": 3.1366,
"step": 63650
},
{
"epoch": 18.555231880680495,
"grad_norm": 0.4371740221977234,
"learning_rate": 4.3415985997666276e-05,
"loss": 3.1259,
"step": 63700
},
{
"epoch": 18.56979725005826,
"grad_norm": 0.43216463923454285,
"learning_rate": 4.29784130688448e-05,
"loss": 3.1226,
"step": 63750
},
{
"epoch": 18.58436261943603,
"grad_norm": 0.4308457374572754,
"learning_rate": 4.2540840140023335e-05,
"loss": 3.1279,
"step": 63800
},
{
"epoch": 18.598927988813795,
"grad_norm": 0.4411686360836029,
"learning_rate": 4.210326721120187e-05,
"loss": 3.1335,
"step": 63850
},
{
"epoch": 18.613493358191565,
"grad_norm": 0.4220650792121887,
"learning_rate": 4.166569428238039e-05,
"loss": 3.1346,
"step": 63900
},
{
"epoch": 18.628058727569332,
"grad_norm": 0.4371688961982727,
"learning_rate": 4.122812135355892e-05,
"loss": 3.1426,
"step": 63950
},
{
"epoch": 18.6426240969471,
"grad_norm": 0.4275096356868744,
"learning_rate": 4.079054842473745e-05,
"loss": 3.1398,
"step": 64000
},
{
"epoch": 18.6426240969471,
"eval_accuracy": 0.37634160932449345,
"eval_loss": 3.5230259895324707,
"eval_runtime": 180.4164,
"eval_samples_per_second": 92.253,
"eval_steps_per_second": 5.77,
"step": 64000
},
{
"epoch": 18.657189466324866,
"grad_norm": 0.4456894099712372,
"learning_rate": 4.035297549591598e-05,
"loss": 3.1328,
"step": 64050
},
{
"epoch": 18.671754835702632,
"grad_norm": 0.4323650896549225,
"learning_rate": 3.9915402567094514e-05,
"loss": 3.1381,
"step": 64100
},
{
"epoch": 18.6863202050804,
"grad_norm": 0.4565901756286621,
"learning_rate": 3.947782963827304e-05,
"loss": 3.1454,
"step": 64150
},
{
"epoch": 18.70088557445817,
"grad_norm": 0.4353121221065521,
"learning_rate": 3.9040256709451574e-05,
"loss": 3.1341,
"step": 64200
},
{
"epoch": 18.715450943835936,
"grad_norm": 0.43359240889549255,
"learning_rate": 3.86026837806301e-05,
"loss": 3.1411,
"step": 64250
},
{
"epoch": 18.730016313213703,
"grad_norm": 0.4561856985092163,
"learning_rate": 3.816511085180863e-05,
"loss": 3.1373,
"step": 64300
},
{
"epoch": 18.74458168259147,
"grad_norm": 0.45646873116493225,
"learning_rate": 3.7727537922987166e-05,
"loss": 3.1408,
"step": 64350
},
{
"epoch": 18.759147051969236,
"grad_norm": 0.4355615973472595,
"learning_rate": 3.728996499416569e-05,
"loss": 3.1335,
"step": 64400
},
{
"epoch": 18.773712421347007,
"grad_norm": 0.4390393793582916,
"learning_rate": 3.685239206534422e-05,
"loss": 3.1276,
"step": 64450
},
{
"epoch": 18.788277790724774,
"grad_norm": 0.4501241147518158,
"learning_rate": 3.641481913652275e-05,
"loss": 3.137,
"step": 64500
},
{
"epoch": 18.80284316010254,
"grad_norm": 0.44744792580604553,
"learning_rate": 3.597724620770128e-05,
"loss": 3.1348,
"step": 64550
},
{
"epoch": 18.817408529480307,
"grad_norm": 0.4351864755153656,
"learning_rate": 3.553967327887981e-05,
"loss": 3.1368,
"step": 64600
},
{
"epoch": 18.831973898858074,
"grad_norm": 0.44498175382614136,
"learning_rate": 3.5102100350058345e-05,
"loss": 3.1375,
"step": 64650
},
{
"epoch": 18.846539268235844,
"grad_norm": 0.4396421015262604,
"learning_rate": 3.466452742123687e-05,
"loss": 3.124,
"step": 64700
},
{
"epoch": 18.86110463761361,
"grad_norm": 0.43623414635658264,
"learning_rate": 3.42269544924154e-05,
"loss": 3.1377,
"step": 64750
},
{
"epoch": 18.875670006991378,
"grad_norm": 0.4486501216888428,
"learning_rate": 3.378938156359393e-05,
"loss": 3.1481,
"step": 64800
},
{
"epoch": 18.890235376369144,
"grad_norm": 0.4401872754096985,
"learning_rate": 3.335180863477246e-05,
"loss": 3.135,
"step": 64850
},
{
"epoch": 18.90480074574691,
"grad_norm": 0.43119847774505615,
"learning_rate": 3.291423570595099e-05,
"loss": 3.1324,
"step": 64900
},
{
"epoch": 18.919366115124678,
"grad_norm": 0.44835567474365234,
"learning_rate": 3.247666277712952e-05,
"loss": 3.1322,
"step": 64950
},
{
"epoch": 18.93393148450245,
"grad_norm": 0.4232603907585144,
"learning_rate": 3.203908984830805e-05,
"loss": 3.1284,
"step": 65000
},
{
"epoch": 18.93393148450245,
"eval_accuracy": 0.3767057444700135,
"eval_loss": 3.518871307373047,
"eval_runtime": 180.463,
"eval_samples_per_second": 92.229,
"eval_steps_per_second": 5.768,
"step": 65000
},
{
"epoch": 18.948496853880215,
"grad_norm": 0.4348292350769043,
"learning_rate": 3.1601516919486576e-05,
"loss": 3.1338,
"step": 65050
},
{
"epoch": 18.96306222325798,
"grad_norm": 0.4335695207118988,
"learning_rate": 3.116394399066511e-05,
"loss": 3.1433,
"step": 65100
},
{
"epoch": 18.97762759263575,
"grad_norm": 0.43353986740112305,
"learning_rate": 3.072637106184364e-05,
"loss": 3.1328,
"step": 65150
},
{
"epoch": 18.992192962013515,
"grad_norm": 0.42962124943733215,
"learning_rate": 3.028879813302217e-05,
"loss": 3.1416,
"step": 65200
},
{
"epoch": 19.006700069913773,
"grad_norm": 0.42514732480049133,
"learning_rate": 2.98512252042007e-05,
"loss": 3.1258,
"step": 65250
},
{
"epoch": 19.02126543929154,
"grad_norm": 0.4310900866985321,
"learning_rate": 2.9413652275379225e-05,
"loss": 3.096,
"step": 65300
},
{
"epoch": 19.035830808669306,
"grad_norm": 0.4301711618900299,
"learning_rate": 2.8976079346557755e-05,
"loss": 3.1224,
"step": 65350
},
{
"epoch": 19.050396178047077,
"grad_norm": 0.45508843660354614,
"learning_rate": 2.8538506417736288e-05,
"loss": 3.105,
"step": 65400
},
{
"epoch": 19.064961547424844,
"grad_norm": 0.4345990717411041,
"learning_rate": 2.8100933488914818e-05,
"loss": 3.0982,
"step": 65450
},
{
"epoch": 19.07952691680261,
"grad_norm": 0.459573894739151,
"learning_rate": 2.7663360560093347e-05,
"loss": 3.1139,
"step": 65500
},
{
"epoch": 19.094092286180377,
"grad_norm": 0.4281415045261383,
"learning_rate": 2.7225787631271874e-05,
"loss": 3.0963,
"step": 65550
},
{
"epoch": 19.108657655558144,
"grad_norm": 0.43718597292900085,
"learning_rate": 2.6788214702450404e-05,
"loss": 3.0962,
"step": 65600
},
{
"epoch": 19.123223024935914,
"grad_norm": 0.44362103939056396,
"learning_rate": 2.6350641773628937e-05,
"loss": 3.1158,
"step": 65650
},
{
"epoch": 19.13778839431368,
"grad_norm": 0.4409373700618744,
"learning_rate": 2.5913068844807467e-05,
"loss": 3.1172,
"step": 65700
},
{
"epoch": 19.152353763691448,
"grad_norm": 0.4337795078754425,
"learning_rate": 2.5475495915985996e-05,
"loss": 3.0972,
"step": 65750
},
{
"epoch": 19.166919133069214,
"grad_norm": 0.41607293486595154,
"learning_rate": 2.5037922987164523e-05,
"loss": 3.0982,
"step": 65800
},
{
"epoch": 19.18148450244698,
"grad_norm": 0.43771815299987793,
"learning_rate": 2.4600350058343052e-05,
"loss": 3.1119,
"step": 65850
},
{
"epoch": 19.196049871824748,
"grad_norm": 0.43401068449020386,
"learning_rate": 2.4162777129521586e-05,
"loss": 3.1157,
"step": 65900
},
{
"epoch": 19.210615241202518,
"grad_norm": 0.4301515817642212,
"learning_rate": 2.3725204200700115e-05,
"loss": 3.106,
"step": 65950
},
{
"epoch": 19.225180610580285,
"grad_norm": 0.43195417523384094,
"learning_rate": 2.3287631271878645e-05,
"loss": 3.1103,
"step": 66000
},
{
"epoch": 19.225180610580285,
"eval_accuracy": 0.37647012069486907,
"eval_loss": 3.523946762084961,
"eval_runtime": 180.3151,
"eval_samples_per_second": 92.305,
"eval_steps_per_second": 5.773,
"step": 66000
},
{
"epoch": 19.23974597995805,
"grad_norm": 0.43875738978385925,
"learning_rate": 2.2850058343057175e-05,
"loss": 3.1139,
"step": 66050
},
{
"epoch": 19.25431134933582,
"grad_norm": 0.4393763542175293,
"learning_rate": 2.24124854142357e-05,
"loss": 3.1166,
"step": 66100
},
{
"epoch": 19.268876718713585,
"grad_norm": 0.4409915804862976,
"learning_rate": 2.1974912485414234e-05,
"loss": 3.1088,
"step": 66150
},
{
"epoch": 19.283442088091356,
"grad_norm": 0.4492509961128235,
"learning_rate": 2.1537339556592764e-05,
"loss": 3.1115,
"step": 66200
},
{
"epoch": 19.298007457469122,
"grad_norm": 0.44549205899238586,
"learning_rate": 2.1099766627771294e-05,
"loss": 3.1064,
"step": 66250
},
{
"epoch": 19.31257282684689,
"grad_norm": 0.4245654046535492,
"learning_rate": 2.0662193698949824e-05,
"loss": 3.1117,
"step": 66300
},
{
"epoch": 19.327138196224656,
"grad_norm": 0.44473904371261597,
"learning_rate": 2.022462077012835e-05,
"loss": 3.1209,
"step": 66350
},
{
"epoch": 19.341703565602423,
"grad_norm": 0.42317965626716614,
"learning_rate": 1.9787047841306883e-05,
"loss": 3.1024,
"step": 66400
},
{
"epoch": 19.356268934980193,
"grad_norm": 0.4334244728088379,
"learning_rate": 1.9349474912485413e-05,
"loss": 3.1172,
"step": 66450
},
{
"epoch": 19.37083430435796,
"grad_norm": 0.44188904762268066,
"learning_rate": 1.8911901983663943e-05,
"loss": 3.1083,
"step": 66500
},
{
"epoch": 19.385399673735726,
"grad_norm": 0.4287189245223999,
"learning_rate": 1.8474329054842473e-05,
"loss": 3.1162,
"step": 66550
},
{
"epoch": 19.399965043113493,
"grad_norm": 0.4341401159763336,
"learning_rate": 1.8036756126021002e-05,
"loss": 3.1143,
"step": 66600
},
{
"epoch": 19.41453041249126,
"grad_norm": 0.4297761619091034,
"learning_rate": 1.7599183197199532e-05,
"loss": 3.1174,
"step": 66650
},
{
"epoch": 19.429095781869027,
"grad_norm": 0.4521428644657135,
"learning_rate": 1.7161610268378062e-05,
"loss": 3.0979,
"step": 66700
},
{
"epoch": 19.443661151246797,
"grad_norm": 0.43041300773620605,
"learning_rate": 1.672403733955659e-05,
"loss": 3.1183,
"step": 66750
},
{
"epoch": 19.458226520624564,
"grad_norm": 0.43775051832199097,
"learning_rate": 1.628646441073512e-05,
"loss": 3.1043,
"step": 66800
},
{
"epoch": 19.47279189000233,
"grad_norm": 0.4432560205459595,
"learning_rate": 1.584889148191365e-05,
"loss": 3.1062,
"step": 66850
},
{
"epoch": 19.487357259380097,
"grad_norm": 0.4381709396839142,
"learning_rate": 1.541131855309218e-05,
"loss": 3.1185,
"step": 66900
},
{
"epoch": 19.501922628757864,
"grad_norm": 0.42818522453308105,
"learning_rate": 1.497374562427071e-05,
"loss": 3.115,
"step": 66950
},
{
"epoch": 19.516487998135634,
"grad_norm": 0.43918377161026,
"learning_rate": 1.453617269544924e-05,
"loss": 3.1006,
"step": 67000
},
{
"epoch": 19.516487998135634,
"eval_accuracy": 0.3768343734171232,
"eval_loss": 3.5216574668884277,
"eval_runtime": 180.3716,
"eval_samples_per_second": 92.276,
"eval_steps_per_second": 5.771,
"step": 67000
},
{
"epoch": 19.5310533675134,
"grad_norm": 0.42537999153137207,
"learning_rate": 1.409859976662777e-05,
"loss": 3.1003,
"step": 67050
},
{
"epoch": 19.545618736891168,
"grad_norm": 0.4393448531627655,
"learning_rate": 1.36610268378063e-05,
"loss": 3.1052,
"step": 67100
},
{
"epoch": 19.560184106268935,
"grad_norm": 0.4335193336009979,
"learning_rate": 1.322345390898483e-05,
"loss": 3.1101,
"step": 67150
},
{
"epoch": 19.5747494756467,
"grad_norm": 0.444814532995224,
"learning_rate": 1.278588098016336e-05,
"loss": 3.1153,
"step": 67200
},
{
"epoch": 19.589314845024468,
"grad_norm": 0.43410614132881165,
"learning_rate": 1.234830805134189e-05,
"loss": 3.1015,
"step": 67250
},
{
"epoch": 19.60388021440224,
"grad_norm": 0.4306865930557251,
"learning_rate": 1.191073512252042e-05,
"loss": 3.109,
"step": 67300
},
{
"epoch": 19.618445583780005,
"grad_norm": 0.4378669857978821,
"learning_rate": 1.1473162193698949e-05,
"loss": 3.1203,
"step": 67350
},
{
"epoch": 19.633010953157772,
"grad_norm": 0.4324400722980499,
"learning_rate": 1.1035589264877479e-05,
"loss": 3.1169,
"step": 67400
},
{
"epoch": 19.64757632253554,
"grad_norm": 0.4444780945777893,
"learning_rate": 1.0598016336056008e-05,
"loss": 3.1076,
"step": 67450
},
{
"epoch": 19.662141691913305,
"grad_norm": 0.45415130257606506,
"learning_rate": 1.0160443407234538e-05,
"loss": 3.1047,
"step": 67500
},
{
"epoch": 19.676707061291076,
"grad_norm": 0.4280719459056854,
"learning_rate": 9.72287047841307e-06,
"loss": 3.1012,
"step": 67550
},
{
"epoch": 19.691272430668842,
"grad_norm": 0.44762077927589417,
"learning_rate": 9.285297549591598e-06,
"loss": 3.1153,
"step": 67600
},
{
"epoch": 19.70583780004661,
"grad_norm": 0.4268578290939331,
"learning_rate": 8.847724620770127e-06,
"loss": 3.1035,
"step": 67650
},
{
"epoch": 19.720403169424376,
"grad_norm": 0.4514009356498718,
"learning_rate": 8.410151691948657e-06,
"loss": 3.1084,
"step": 67700
},
{
"epoch": 19.734968538802143,
"grad_norm": 0.4248492121696472,
"learning_rate": 7.972578763127187e-06,
"loss": 3.1178,
"step": 67750
},
{
"epoch": 19.749533908179913,
"grad_norm": 0.4309752881526947,
"learning_rate": 7.535005834305717e-06,
"loss": 3.1176,
"step": 67800
},
{
"epoch": 19.76409927755768,
"grad_norm": 0.43412092328071594,
"learning_rate": 7.0974329054842465e-06,
"loss": 3.1268,
"step": 67850
},
{
"epoch": 19.778664646935447,
"grad_norm": 0.42582693696022034,
"learning_rate": 6.659859976662777e-06,
"loss": 3.1031,
"step": 67900
},
{
"epoch": 19.793230016313213,
"grad_norm": 0.4391798973083496,
"learning_rate": 6.222287047841307e-06,
"loss": 3.1095,
"step": 67950
},
{
"epoch": 19.80779538569098,
"grad_norm": 0.4283028841018677,
"learning_rate": 5.784714119019836e-06,
"loss": 3.1085,
"step": 68000
},
{
"epoch": 19.80779538569098,
"eval_accuracy": 0.3768690585536839,
"eval_loss": 3.520920753479004,
"eval_runtime": 180.5112,
"eval_samples_per_second": 92.205,
"eval_steps_per_second": 5.767,
"step": 68000
},
{
"epoch": 19.822360755068747,
"grad_norm": 0.43811023235321045,
"learning_rate": 5.3471411901983655e-06,
"loss": 3.1013,
"step": 68050
},
{
"epoch": 19.836926124446517,
"grad_norm": 0.4432801306247711,
"learning_rate": 4.909568261376895e-06,
"loss": 3.1118,
"step": 68100
},
{
"epoch": 19.851491493824284,
"grad_norm": 0.433024525642395,
"learning_rate": 4.471995332555426e-06,
"loss": 3.1143,
"step": 68150
},
{
"epoch": 19.86605686320205,
"grad_norm": 0.42264324426651,
"learning_rate": 4.034422403733955e-06,
"loss": 3.1129,
"step": 68200
},
{
"epoch": 19.880622232579817,
"grad_norm": 0.42544034123420715,
"learning_rate": 3.596849474912485e-06,
"loss": 3.1102,
"step": 68250
},
{
"epoch": 19.895187601957584,
"grad_norm": 0.4385799765586853,
"learning_rate": 3.159276546091015e-06,
"loss": 3.1155,
"step": 68300
},
{
"epoch": 19.909752971335354,
"grad_norm": 0.4359734058380127,
"learning_rate": 2.7217036172695445e-06,
"loss": 3.0982,
"step": 68350
},
{
"epoch": 19.92431834071312,
"grad_norm": 0.43336305022239685,
"learning_rate": 2.2841306884480747e-06,
"loss": 3.1107,
"step": 68400
},
{
"epoch": 19.938883710090888,
"grad_norm": 0.4339163899421692,
"learning_rate": 1.8465577596266043e-06,
"loss": 3.1284,
"step": 68450
},
{
"epoch": 19.953449079468655,
"grad_norm": 0.4268462359905243,
"learning_rate": 1.408984830805134e-06,
"loss": 3.1164,
"step": 68500
},
{
"epoch": 19.96801444884642,
"grad_norm": 0.4449523091316223,
"learning_rate": 9.714119019836638e-07,
"loss": 3.1082,
"step": 68550
},
{
"epoch": 19.982579818224192,
"grad_norm": 0.4338447153568268,
"learning_rate": 5.338389731621937e-07,
"loss": 3.1144,
"step": 68600
},
{
"epoch": 19.99714518760196,
"grad_norm": 0.430207222700119,
"learning_rate": 9.626604434072343e-08,
"loss": 3.11,
"step": 68650
},
{
"epoch": 20.0,
"step": 68660,
"total_flos": 1.43513603407872e+18,
"train_loss": 3.4363333413179364,
"train_runtime": 137192.0021,
"train_samples_per_second": 40.035,
"train_steps_per_second": 0.5
}
],
"logging_steps": 50,
"max_steps": 68660,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 3
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.43513603407872e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}