diff --git "a/cost_to_hit_frequency_1001/checkpoint-50000/trainer_state.json" "b/cost_to_hit_frequency_1001/checkpoint-50000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/cost_to_hit_frequency_1001/checkpoint-50000/trainer_state.json" @@ -0,0 +1,7493 @@ +{ + "best_global_step": 48000, + "best_metric": 3.5272507667541504, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_hit_frequency_1001/checkpoint-30000", + "epoch": 14.564553717082266, + "eval_steps": 1000, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01456536937776742, + "grad_norm": 1.0411887168884277, + "learning_rate": 0.000294, + "loss": 8.4313, + "step": 50 + }, + { + "epoch": 0.02913073875553484, + "grad_norm": 0.9934611916542053, + "learning_rate": 0.0005939999999999999, + "loss": 6.7347, + "step": 100 + }, + { + "epoch": 0.04369610813330226, + "grad_norm": 0.6710580587387085, + "learning_rate": 0.0005995711785297549, + "loss": 6.3491, + "step": 150 + }, + { + "epoch": 0.05826147751106968, + "grad_norm": 0.46758192777633667, + "learning_rate": 0.0005991336056009335, + "loss": 6.1272, + "step": 200 + }, + { + "epoch": 0.0728268468888371, + "grad_norm": 0.5224136710166931, + "learning_rate": 0.000598696032672112, + "loss": 5.9887, + "step": 250 + }, + { + "epoch": 0.08739221626660452, + "grad_norm": 0.5374292135238647, + "learning_rate": 0.0005982584597432905, + "loss": 5.8537, + "step": 300 + }, + { + "epoch": 0.10195758564437195, + "grad_norm": 0.42533859610557556, + "learning_rate": 0.0005978208868144691, + "loss": 5.7305, + "step": 350 + }, + { + "epoch": 0.11652295502213936, + "grad_norm": 0.5247730016708374, + "learning_rate": 0.0005973833138856476, + "loss": 5.6139, + "step": 400 + }, + { + "epoch": 0.13108832439990678, + "grad_norm": 0.5447224974632263, + "learning_rate": 0.000596945740956826, + "loss": 5.4885, + "step": 450 + }, + { + "epoch": 0.1456536937776742, + "grad_norm": 0.5301450490951538, + "learning_rate": 0.0005965081680280046, + "loss": 5.4119, + "step": 500 + }, + { + "epoch": 0.16021906315544163, + "grad_norm": 0.5106812119483948, + "learning_rate": 0.0005960705950991831, + "loss": 5.3254, + "step": 550 + }, + { + "epoch": 0.17478443253320905, + "grad_norm": 0.42297908663749695, + "learning_rate": 0.0005956330221703616, + "loss": 5.2446, + "step": 600 + }, + { + "epoch": 0.18934980191097647, + "grad_norm": 0.47052502632141113, + "learning_rate": 0.0005951954492415402, + "loss": 5.1939, + "step": 650 + }, + { + "epoch": 0.2039151712887439, + "grad_norm": 0.4460456371307373, + "learning_rate": 0.0005947578763127188, + "loss": 5.1229, + "step": 700 + }, + { + "epoch": 0.2184805406665113, + "grad_norm": 0.46692177653312683, + "learning_rate": 0.0005943203033838973, + "loss": 5.0837, + "step": 750 + }, + { + "epoch": 0.23304591004427871, + "grad_norm": 0.4475383758544922, + "learning_rate": 0.0005938827304550758, + "loss": 5.0189, + "step": 800 + }, + { + "epoch": 0.24761127942204614, + "grad_norm": 0.4715788960456848, + "learning_rate": 0.0005934451575262544, + "loss": 4.9712, + "step": 850 + }, + { + "epoch": 0.26217664879981356, + "grad_norm": 0.5530597567558289, + "learning_rate": 0.0005930075845974328, + "loss": 4.9247, + "step": 900 + }, + { + "epoch": 0.276742018177581, + "grad_norm": 0.5195161700248718, + "learning_rate": 0.0005925700116686113, + "loss": 4.8776, + "step": 950 + }, + { + "epoch": 0.2913073875553484, + "grad_norm": 0.46087169647216797, + "learning_rate": 0.0005921324387397899, + "loss": 4.8133, + "step": 1000 + }, + { + "epoch": 0.2913073875553484, + "eval_accuracy": 0.2545722064591014, + "eval_loss": 4.754235744476318, + "eval_runtime": 180.1197, + "eval_samples_per_second": 92.405, + "eval_steps_per_second": 5.779, + "step": 1000 + }, + { + "epoch": 0.30587275693311583, + "grad_norm": 0.44081172347068787, + "learning_rate": 0.0005916948658109684, + "loss": 4.7792, + "step": 1050 + }, + { + "epoch": 0.32043812631088325, + "grad_norm": 0.48162946105003357, + "learning_rate": 0.000591257292882147, + "loss": 4.7359, + "step": 1100 + }, + { + "epoch": 0.3350034956886507, + "grad_norm": 0.4235544204711914, + "learning_rate": 0.0005908197199533255, + "loss": 4.6992, + "step": 1150 + }, + { + "epoch": 0.3495688650664181, + "grad_norm": 0.4740869402885437, + "learning_rate": 0.0005903821470245041, + "loss": 4.6654, + "step": 1200 + }, + { + "epoch": 0.3641342344441855, + "grad_norm": 0.4276205897331238, + "learning_rate": 0.0005899445740956826, + "loss": 4.6244, + "step": 1250 + }, + { + "epoch": 0.37869960382195295, + "grad_norm": 0.40895992517471313, + "learning_rate": 0.0005895070011668611, + "loss": 4.6068, + "step": 1300 + }, + { + "epoch": 0.39326497319972037, + "grad_norm": 0.4188133478164673, + "learning_rate": 0.0005890694282380397, + "loss": 4.5584, + "step": 1350 + }, + { + "epoch": 0.4078303425774878, + "grad_norm": 0.4317689538002014, + "learning_rate": 0.0005886318553092181, + "loss": 4.5422, + "step": 1400 + }, + { + "epoch": 0.42239571195525516, + "grad_norm": 0.40392470359802246, + "learning_rate": 0.0005881942823803966, + "loss": 4.5333, + "step": 1450 + }, + { + "epoch": 0.4369610813330226, + "grad_norm": 0.4244018793106079, + "learning_rate": 0.0005877567094515752, + "loss": 4.4986, + "step": 1500 + }, + { + "epoch": 0.45152645071079, + "grad_norm": 0.44470831751823425, + "learning_rate": 0.0005873191365227537, + "loss": 4.4928, + "step": 1550 + }, + { + "epoch": 0.46609182008855743, + "grad_norm": 0.4386588931083679, + "learning_rate": 0.0005868815635939323, + "loss": 4.4553, + "step": 1600 + }, + { + "epoch": 0.48065718946632485, + "grad_norm": 0.42980971932411194, + "learning_rate": 0.0005864439906651108, + "loss": 4.4385, + "step": 1650 + }, + { + "epoch": 0.4952225588440923, + "grad_norm": 0.3935016691684723, + "learning_rate": 0.0005860064177362894, + "loss": 4.4327, + "step": 1700 + }, + { + "epoch": 0.5097879282218597, + "grad_norm": 0.4373241662979126, + "learning_rate": 0.0005855688448074679, + "loss": 4.4099, + "step": 1750 + }, + { + "epoch": 0.5243532975996271, + "grad_norm": 0.4172551929950714, + "learning_rate": 0.0005851312718786464, + "loss": 4.3901, + "step": 1800 + }, + { + "epoch": 0.5389186669773945, + "grad_norm": 0.40378788113594055, + "learning_rate": 0.0005846936989498249, + "loss": 4.383, + "step": 1850 + }, + { + "epoch": 0.553484036355162, + "grad_norm": 0.38236093521118164, + "learning_rate": 0.0005842561260210034, + "loss": 4.3598, + "step": 1900 + }, + { + "epoch": 0.5680494057329294, + "grad_norm": 0.381078839302063, + "learning_rate": 0.000583818553092182, + "loss": 4.3638, + "step": 1950 + }, + { + "epoch": 0.5826147751106968, + "grad_norm": 0.4327857196331024, + "learning_rate": 0.0005833809801633605, + "loss": 4.3432, + "step": 2000 + }, + { + "epoch": 0.5826147751106968, + "eval_accuracy": 0.29888864119390235, + "eval_loss": 4.287432670593262, + "eval_runtime": 180.4505, + "eval_samples_per_second": 92.236, + "eval_steps_per_second": 5.769, + "step": 2000 + }, + { + "epoch": 0.5971801444884642, + "grad_norm": 0.4143087863922119, + "learning_rate": 0.000582943407234539, + "loss": 4.329, + "step": 2050 + }, + { + "epoch": 0.6117455138662317, + "grad_norm": 0.3753448724746704, + "learning_rate": 0.0005825058343057176, + "loss": 4.3033, + "step": 2100 + }, + { + "epoch": 0.6263108832439991, + "grad_norm": 0.40621188282966614, + "learning_rate": 0.0005820682613768961, + "loss": 4.3041, + "step": 2150 + }, + { + "epoch": 0.6408762526217665, + "grad_norm": 0.40833911299705505, + "learning_rate": 0.0005816306884480747, + "loss": 4.2834, + "step": 2200 + }, + { + "epoch": 0.6554416219995339, + "grad_norm": 0.4088577628135681, + "learning_rate": 0.0005811931155192532, + "loss": 4.2804, + "step": 2250 + }, + { + "epoch": 0.6700069913773014, + "grad_norm": 0.3746855556964874, + "learning_rate": 0.0005807555425904316, + "loss": 4.2762, + "step": 2300 + }, + { + "epoch": 0.6845723607550688, + "grad_norm": 0.3618931770324707, + "learning_rate": 0.0005803179696616102, + "loss": 4.2598, + "step": 2350 + }, + { + "epoch": 0.6991377301328362, + "grad_norm": 0.3690814971923828, + "learning_rate": 0.0005798803967327887, + "loss": 4.2413, + "step": 2400 + }, + { + "epoch": 0.7137030995106036, + "grad_norm": 0.40264639258384705, + "learning_rate": 0.0005794428238039673, + "loss": 4.2375, + "step": 2450 + }, + { + "epoch": 0.728268468888371, + "grad_norm": 0.4249323606491089, + "learning_rate": 0.0005790052508751458, + "loss": 4.233, + "step": 2500 + }, + { + "epoch": 0.7428338382661385, + "grad_norm": 0.39969372749328613, + "learning_rate": 0.0005785676779463243, + "loss": 4.2202, + "step": 2550 + }, + { + "epoch": 0.7573992076439059, + "grad_norm": 0.3819160759449005, + "learning_rate": 0.0005781301050175029, + "loss": 4.2199, + "step": 2600 + }, + { + "epoch": 0.7719645770216733, + "grad_norm": 0.361541211605072, + "learning_rate": 0.0005776925320886814, + "loss": 4.204, + "step": 2650 + }, + { + "epoch": 0.7865299463994407, + "grad_norm": 0.3613761365413666, + "learning_rate": 0.00057725495915986, + "loss": 4.1961, + "step": 2700 + }, + { + "epoch": 0.8010953157772082, + "grad_norm": 0.4024335741996765, + "learning_rate": 0.0005768173862310384, + "loss": 4.1899, + "step": 2750 + }, + { + "epoch": 0.8156606851549756, + "grad_norm": 0.34226447343826294, + "learning_rate": 0.0005763798133022169, + "loss": 4.1648, + "step": 2800 + }, + { + "epoch": 0.8302260545327429, + "grad_norm": 0.3609713315963745, + "learning_rate": 0.0005759422403733955, + "loss": 4.1756, + "step": 2850 + }, + { + "epoch": 0.8447914239105103, + "grad_norm": 0.37800899147987366, + "learning_rate": 0.000575504667444574, + "loss": 4.1618, + "step": 2900 + }, + { + "epoch": 0.8593567932882777, + "grad_norm": 0.35399892926216125, + "learning_rate": 0.0005750670945157526, + "loss": 4.151, + "step": 2950 + }, + { + "epoch": 0.8739221626660452, + "grad_norm": 0.35685768723487854, + "learning_rate": 0.0005746295215869311, + "loss": 4.1316, + "step": 3000 + }, + { + "epoch": 0.8739221626660452, + "eval_accuracy": 0.3165242106956263, + "eval_loss": 4.094448566436768, + "eval_runtime": 180.2904, + "eval_samples_per_second": 92.318, + "eval_steps_per_second": 5.774, + "step": 3000 + }, + { + "epoch": 0.8884875320438126, + "grad_norm": 0.3725755214691162, + "learning_rate": 0.0005741919486581096, + "loss": 4.1422, + "step": 3050 + }, + { + "epoch": 0.90305290142158, + "grad_norm": 0.3675600290298462, + "learning_rate": 0.0005737543757292882, + "loss": 4.1361, + "step": 3100 + }, + { + "epoch": 0.9176182707993474, + "grad_norm": 0.3458426892757416, + "learning_rate": 0.0005733168028004667, + "loss": 4.1292, + "step": 3150 + }, + { + "epoch": 0.9321836401771149, + "grad_norm": 0.35508471727371216, + "learning_rate": 0.0005728792298716453, + "loss": 4.1173, + "step": 3200 + }, + { + "epoch": 0.9467490095548823, + "grad_norm": 0.3473420739173889, + "learning_rate": 0.0005724416569428237, + "loss": 4.1051, + "step": 3250 + }, + { + "epoch": 0.9613143789326497, + "grad_norm": 0.38041558861732483, + "learning_rate": 0.0005720040840140023, + "loss": 4.1141, + "step": 3300 + }, + { + "epoch": 0.9758797483104171, + "grad_norm": 0.36010000109672546, + "learning_rate": 0.0005715665110851808, + "loss": 4.1036, + "step": 3350 + }, + { + "epoch": 0.9904451176881846, + "grad_norm": 0.35879573225975037, + "learning_rate": 0.0005711289381563593, + "loss": 4.099, + "step": 3400 + }, + { + "epoch": 1.0049522255884409, + "grad_norm": 0.36708125472068787, + "learning_rate": 0.0005706913652275379, + "loss": 4.0613, + "step": 3450 + }, + { + "epoch": 1.0195175949662083, + "grad_norm": 0.36130914092063904, + "learning_rate": 0.0005702537922987164, + "loss": 4.023, + "step": 3500 + }, + { + "epoch": 1.0340829643439757, + "grad_norm": 0.3636951446533203, + "learning_rate": 0.0005698162193698949, + "loss": 4.0197, + "step": 3550 + }, + { + "epoch": 1.0486483337217432, + "grad_norm": 0.3318287432193756, + "learning_rate": 0.0005693786464410735, + "loss": 4.009, + "step": 3600 + }, + { + "epoch": 1.0632137030995106, + "grad_norm": 0.3470671474933624, + "learning_rate": 0.000568941073512252, + "loss": 3.9984, + "step": 3650 + }, + { + "epoch": 1.077779072477278, + "grad_norm": 0.3891682028770447, + "learning_rate": 0.0005685035005834305, + "loss": 4.0067, + "step": 3700 + }, + { + "epoch": 1.0923444418550454, + "grad_norm": 0.3590176999568939, + "learning_rate": 0.000568065927654609, + "loss": 4.011, + "step": 3750 + }, + { + "epoch": 1.1069098112328128, + "grad_norm": 0.3605038821697235, + "learning_rate": 0.0005676283547257876, + "loss": 4.0014, + "step": 3800 + }, + { + "epoch": 1.1214751806105803, + "grad_norm": 0.35803160071372986, + "learning_rate": 0.0005671907817969661, + "loss": 3.9972, + "step": 3850 + }, + { + "epoch": 1.1360405499883477, + "grad_norm": 0.3601053059101105, + "learning_rate": 0.0005667532088681446, + "loss": 3.9894, + "step": 3900 + }, + { + "epoch": 1.1506059193661151, + "grad_norm": 0.3761005699634552, + "learning_rate": 0.0005663156359393232, + "loss": 3.9892, + "step": 3950 + }, + { + "epoch": 1.1651712887438825, + "grad_norm": 0.34091663360595703, + "learning_rate": 0.0005658780630105017, + "loss": 3.9856, + "step": 4000 + }, + { + "epoch": 1.1651712887438825, + "eval_accuracy": 0.3262635618883952, + "eval_loss": 3.9839839935302734, + "eval_runtime": 180.3348, + "eval_samples_per_second": 92.295, + "eval_steps_per_second": 5.773, + "step": 4000 + }, + { + "epoch": 1.17973665812165, + "grad_norm": 0.3509597182273865, + "learning_rate": 0.0005654404900816802, + "loss": 3.9952, + "step": 4050 + }, + { + "epoch": 1.1943020274994174, + "grad_norm": 0.35156598687171936, + "learning_rate": 0.0005650029171528588, + "loss": 3.9769, + "step": 4100 + }, + { + "epoch": 1.2088673968771848, + "grad_norm": 0.34221357107162476, + "learning_rate": 0.0005645653442240373, + "loss": 3.9839, + "step": 4150 + }, + { + "epoch": 1.2234327662549522, + "grad_norm": 0.3706187307834625, + "learning_rate": 0.0005641277712952158, + "loss": 3.9845, + "step": 4200 + }, + { + "epoch": 1.2379981356327197, + "grad_norm": 0.3384045660495758, + "learning_rate": 0.0005636901983663943, + "loss": 3.9635, + "step": 4250 + }, + { + "epoch": 1.252563505010487, + "grad_norm": 0.36682382225990295, + "learning_rate": 0.0005632526254375729, + "loss": 3.9741, + "step": 4300 + }, + { + "epoch": 1.2671288743882545, + "grad_norm": 0.3488970398902893, + "learning_rate": 0.0005628150525087514, + "loss": 3.9682, + "step": 4350 + }, + { + "epoch": 1.281694243766022, + "grad_norm": 0.3281860053539276, + "learning_rate": 0.0005623774795799299, + "loss": 3.9527, + "step": 4400 + }, + { + "epoch": 1.2962596131437893, + "grad_norm": 0.3465306758880615, + "learning_rate": 0.0005619399066511085, + "loss": 3.9529, + "step": 4450 + }, + { + "epoch": 1.3108249825215568, + "grad_norm": 0.3299737870693207, + "learning_rate": 0.000561502333722287, + "loss": 3.9559, + "step": 4500 + }, + { + "epoch": 1.3253903518993242, + "grad_norm": 0.3362690508365631, + "learning_rate": 0.0005610647607934655, + "loss": 3.9554, + "step": 4550 + }, + { + "epoch": 1.3399557212770916, + "grad_norm": 0.34816160798072815, + "learning_rate": 0.000560627187864644, + "loss": 3.951, + "step": 4600 + }, + { + "epoch": 1.354521090654859, + "grad_norm": 0.3506231904029846, + "learning_rate": 0.0005601896149358226, + "loss": 3.9438, + "step": 4650 + }, + { + "epoch": 1.3690864600326265, + "grad_norm": 0.34532177448272705, + "learning_rate": 0.0005597520420070011, + "loss": 3.9489, + "step": 4700 + }, + { + "epoch": 1.3836518294103939, + "grad_norm": 0.3467022180557251, + "learning_rate": 0.0005593144690781796, + "loss": 3.9438, + "step": 4750 + }, + { + "epoch": 1.3982171987881613, + "grad_norm": 0.3695443272590637, + "learning_rate": 0.0005588768961493582, + "loss": 3.9331, + "step": 4800 + }, + { + "epoch": 1.4127825681659287, + "grad_norm": 0.33018758893013, + "learning_rate": 0.0005584393232205367, + "loss": 3.9295, + "step": 4850 + }, + { + "epoch": 1.4273479375436962, + "grad_norm": 0.3564456105232239, + "learning_rate": 0.0005580017502917152, + "loss": 3.9395, + "step": 4900 + }, + { + "epoch": 1.4419133069214636, + "grad_norm": 0.3350387215614319, + "learning_rate": 0.0005575641773628938, + "loss": 3.9266, + "step": 4950 + }, + { + "epoch": 1.456478676299231, + "grad_norm": 0.33804184198379517, + "learning_rate": 0.0005571266044340723, + "loss": 3.9435, + "step": 5000 + }, + { + "epoch": 1.456478676299231, + "eval_accuracy": 0.33277813599489436, + "eval_loss": 3.9095029830932617, + "eval_runtime": 180.2432, + "eval_samples_per_second": 92.342, + "eval_steps_per_second": 5.776, + "step": 5000 + }, + { + "epoch": 1.4710440456769984, + "grad_norm": 0.33603259921073914, + "learning_rate": 0.0005566890315052507, + "loss": 3.9152, + "step": 5050 + }, + { + "epoch": 1.4856094150547658, + "grad_norm": 0.3263108730316162, + "learning_rate": 0.0005562514585764293, + "loss": 3.9295, + "step": 5100 + }, + { + "epoch": 1.500174784432533, + "grad_norm": 0.3421551287174225, + "learning_rate": 0.0005558138856476079, + "loss": 3.9092, + "step": 5150 + }, + { + "epoch": 1.5147401538103007, + "grad_norm": 0.3283444344997406, + "learning_rate": 0.0005553763127187864, + "loss": 3.9035, + "step": 5200 + }, + { + "epoch": 1.529305523188068, + "grad_norm": 0.34648576378822327, + "learning_rate": 0.0005549387397899649, + "loss": 3.9155, + "step": 5250 + }, + { + "epoch": 1.5438708925658355, + "grad_norm": 0.3276433050632477, + "learning_rate": 0.0005545011668611435, + "loss": 3.9183, + "step": 5300 + }, + { + "epoch": 1.5584362619436027, + "grad_norm": 0.333248108625412, + "learning_rate": 0.000554063593932322, + "loss": 3.9177, + "step": 5350 + }, + { + "epoch": 1.5730016313213704, + "grad_norm": 0.337734580039978, + "learning_rate": 0.0005536260210035005, + "loss": 3.9095, + "step": 5400 + }, + { + "epoch": 1.5875670006991376, + "grad_norm": 0.3482300043106079, + "learning_rate": 0.0005531884480746791, + "loss": 3.8858, + "step": 5450 + }, + { + "epoch": 1.6021323700769052, + "grad_norm": 0.3184448480606079, + "learning_rate": 0.0005527508751458577, + "loss": 3.8971, + "step": 5500 + }, + { + "epoch": 1.6166977394546724, + "grad_norm": 0.3283264935016632, + "learning_rate": 0.0005523133022170361, + "loss": 3.877, + "step": 5550 + }, + { + "epoch": 1.63126310883244, + "grad_norm": 0.34048277139663696, + "learning_rate": 0.0005518757292882146, + "loss": 3.8963, + "step": 5600 + }, + { + "epoch": 1.6458284782102073, + "grad_norm": 0.32839393615722656, + "learning_rate": 0.0005514381563593932, + "loss": 3.8916, + "step": 5650 + }, + { + "epoch": 1.660393847587975, + "grad_norm": 0.33673906326293945, + "learning_rate": 0.0005510005834305717, + "loss": 3.8848, + "step": 5700 + }, + { + "epoch": 1.6749592169657421, + "grad_norm": 0.34184661507606506, + "learning_rate": 0.0005505630105017502, + "loss": 3.8913, + "step": 5750 + }, + { + "epoch": 1.6895245863435098, + "grad_norm": 0.3328911364078522, + "learning_rate": 0.0005501254375729288, + "loss": 3.8844, + "step": 5800 + }, + { + "epoch": 1.704089955721277, + "grad_norm": 0.34918013215065, + "learning_rate": 0.0005496878646441073, + "loss": 3.8826, + "step": 5850 + }, + { + "epoch": 1.7186553250990446, + "grad_norm": 0.3278209865093231, + "learning_rate": 0.0005492502917152858, + "loss": 3.8737, + "step": 5900 + }, + { + "epoch": 1.7332206944768118, + "grad_norm": 0.32764044404029846, + "learning_rate": 0.0005488127187864644, + "loss": 3.8653, + "step": 5950 + }, + { + "epoch": 1.7477860638545795, + "grad_norm": 0.320593923330307, + "learning_rate": 0.000548375145857643, + "loss": 3.8665, + "step": 6000 + }, + { + "epoch": 1.7477860638545795, + "eval_accuracy": 0.33797937798145206, + "eval_loss": 3.8514480590820312, + "eval_runtime": 180.396, + "eval_samples_per_second": 92.264, + "eval_steps_per_second": 5.771, + "step": 6000 + }, + { + "epoch": 1.7623514332323467, + "grad_norm": 0.32193249464035034, + "learning_rate": 0.0005479375729288214, + "loss": 3.863, + "step": 6050 + }, + { + "epoch": 1.7769168026101143, + "grad_norm": 0.35405752062797546, + "learning_rate": 0.0005474999999999999, + "loss": 3.8648, + "step": 6100 + }, + { + "epoch": 1.7914821719878815, + "grad_norm": 0.3290136754512787, + "learning_rate": 0.0005470624270711785, + "loss": 3.8579, + "step": 6150 + }, + { + "epoch": 1.8060475413656492, + "grad_norm": 0.3399069607257843, + "learning_rate": 0.000546624854142357, + "loss": 3.8711, + "step": 6200 + }, + { + "epoch": 1.8206129107434164, + "grad_norm": 0.333492249250412, + "learning_rate": 0.0005461872812135355, + "loss": 3.8686, + "step": 6250 + }, + { + "epoch": 1.835178280121184, + "grad_norm": 0.3360602557659149, + "learning_rate": 0.0005457497082847141, + "loss": 3.8639, + "step": 6300 + }, + { + "epoch": 1.8497436494989512, + "grad_norm": 0.349657267332077, + "learning_rate": 0.0005453121353558927, + "loss": 3.8582, + "step": 6350 + }, + { + "epoch": 1.8643090188767188, + "grad_norm": 0.31816044449806213, + "learning_rate": 0.0005448745624270712, + "loss": 3.8605, + "step": 6400 + }, + { + "epoch": 1.878874388254486, + "grad_norm": 0.3400065004825592, + "learning_rate": 0.0005444369894982496, + "loss": 3.8617, + "step": 6450 + }, + { + "epoch": 1.8934397576322537, + "grad_norm": 0.3279556632041931, + "learning_rate": 0.0005439994165694282, + "loss": 3.8528, + "step": 6500 + }, + { + "epoch": 1.908005127010021, + "grad_norm": 0.33743831515312195, + "learning_rate": 0.0005435618436406067, + "loss": 3.8504, + "step": 6550 + }, + { + "epoch": 1.9225704963877885, + "grad_norm": 0.3401290476322174, + "learning_rate": 0.0005431242707117852, + "loss": 3.8496, + "step": 6600 + }, + { + "epoch": 1.9371358657655557, + "grad_norm": 0.3282126486301422, + "learning_rate": 0.0005426866977829638, + "loss": 3.8469, + "step": 6650 + }, + { + "epoch": 1.9517012351433234, + "grad_norm": 0.3605695068836212, + "learning_rate": 0.0005422491248541423, + "loss": 3.858, + "step": 6700 + }, + { + "epoch": 1.9662666045210906, + "grad_norm": 0.32521483302116394, + "learning_rate": 0.0005418115519253208, + "loss": 3.852, + "step": 6750 + }, + { + "epoch": 1.9808319738988582, + "grad_norm": 0.33284640312194824, + "learning_rate": 0.0005413739789964994, + "loss": 3.8486, + "step": 6800 + }, + { + "epoch": 1.9953973432766254, + "grad_norm": 0.3308689296245575, + "learning_rate": 0.000540936406067678, + "loss": 3.8482, + "step": 6850 + }, + { + "epoch": 2.0099044511768818, + "grad_norm": 0.33800598978996277, + "learning_rate": 0.0005404988331388564, + "loss": 3.77, + "step": 6900 + }, + { + "epoch": 2.0244698205546494, + "grad_norm": 0.3277951180934906, + "learning_rate": 0.0005400612602100349, + "loss": 3.7368, + "step": 6950 + }, + { + "epoch": 2.0390351899324166, + "grad_norm": 0.3203679919242859, + "learning_rate": 0.0005396236872812135, + "loss": 3.7491, + "step": 7000 + }, + { + "epoch": 2.0390351899324166, + "eval_accuracy": 0.34216722609676753, + "eval_loss": 3.8109800815582275, + "eval_runtime": 180.255, + "eval_samples_per_second": 92.336, + "eval_steps_per_second": 5.775, + "step": 7000 + }, + { + "epoch": 2.0536005593101843, + "grad_norm": 0.35276371240615845, + "learning_rate": 0.000539186114352392, + "loss": 3.7337, + "step": 7050 + }, + { + "epoch": 2.0681659286879515, + "grad_norm": 0.35237714648246765, + "learning_rate": 0.0005387485414235705, + "loss": 3.7357, + "step": 7100 + }, + { + "epoch": 2.082731298065719, + "grad_norm": 0.3209347426891327, + "learning_rate": 0.0005383109684947491, + "loss": 3.7583, + "step": 7150 + }, + { + "epoch": 2.0972966674434863, + "grad_norm": 0.32085931301116943, + "learning_rate": 0.0005378733955659276, + "loss": 3.7539, + "step": 7200 + }, + { + "epoch": 2.111862036821254, + "grad_norm": 0.31919988989830017, + "learning_rate": 0.0005374358226371061, + "loss": 3.7393, + "step": 7250 + }, + { + "epoch": 2.126427406199021, + "grad_norm": 0.3325698673725128, + "learning_rate": 0.0005369982497082847, + "loss": 3.751, + "step": 7300 + }, + { + "epoch": 2.140992775576789, + "grad_norm": 0.32088345289230347, + "learning_rate": 0.0005365606767794633, + "loss": 3.7441, + "step": 7350 + }, + { + "epoch": 2.155558144954556, + "grad_norm": 0.31885406374931335, + "learning_rate": 0.0005361231038506417, + "loss": 3.7468, + "step": 7400 + }, + { + "epoch": 2.1701235143323236, + "grad_norm": 0.32321396470069885, + "learning_rate": 0.0005356855309218202, + "loss": 3.7511, + "step": 7450 + }, + { + "epoch": 2.184688883710091, + "grad_norm": 0.339028924703598, + "learning_rate": 0.0005352479579929988, + "loss": 3.7568, + "step": 7500 + }, + { + "epoch": 2.1992542530878585, + "grad_norm": 0.3378174901008606, + "learning_rate": 0.0005348103850641773, + "loss": 3.7351, + "step": 7550 + }, + { + "epoch": 2.2138196224656257, + "grad_norm": 0.32842838764190674, + "learning_rate": 0.0005343728121353558, + "loss": 3.7614, + "step": 7600 + }, + { + "epoch": 2.2283849918433933, + "grad_norm": 0.3337772488594055, + "learning_rate": 0.0005339352392065344, + "loss": 3.7502, + "step": 7650 + }, + { + "epoch": 2.2429503612211605, + "grad_norm": 0.31574419140815735, + "learning_rate": 0.000533497666277713, + "loss": 3.7559, + "step": 7700 + }, + { + "epoch": 2.257515730598928, + "grad_norm": 0.3204760253429413, + "learning_rate": 0.0005330600933488915, + "loss": 3.7382, + "step": 7750 + }, + { + "epoch": 2.2720810999766954, + "grad_norm": 0.33120566606521606, + "learning_rate": 0.00053262252042007, + "loss": 3.7488, + "step": 7800 + }, + { + "epoch": 2.286646469354463, + "grad_norm": 0.3328082263469696, + "learning_rate": 0.0005321849474912485, + "loss": 3.7518, + "step": 7850 + }, + { + "epoch": 2.3012118387322302, + "grad_norm": 0.3446897268295288, + "learning_rate": 0.000531747374562427, + "loss": 3.7421, + "step": 7900 + }, + { + "epoch": 2.3157772081099974, + "grad_norm": 0.3277474641799927, + "learning_rate": 0.0005313098016336055, + "loss": 3.7376, + "step": 7950 + }, + { + "epoch": 2.330342577487765, + "grad_norm": 0.32570740580558777, + "learning_rate": 0.0005308722287047841, + "loss": 3.7416, + "step": 8000 + }, + { + "epoch": 2.330342577487765, + "eval_accuracy": 0.3451675491976329, + "eval_loss": 3.7787580490112305, + "eval_runtime": 180.355, + "eval_samples_per_second": 92.285, + "eval_steps_per_second": 5.772, + "step": 8000 + }, + { + "epoch": 2.3449079468655327, + "grad_norm": 0.3236296474933624, + "learning_rate": 0.0005304346557759626, + "loss": 3.737, + "step": 8050 + }, + { + "epoch": 2.3594733162433, + "grad_norm": 0.3300062417984009, + "learning_rate": 0.0005299970828471411, + "loss": 3.74, + "step": 8100 + }, + { + "epoch": 2.374038685621067, + "grad_norm": 0.3218088746070862, + "learning_rate": 0.0005295595099183197, + "loss": 3.7618, + "step": 8150 + }, + { + "epoch": 2.3886040549988348, + "grad_norm": 0.32456105947494507, + "learning_rate": 0.0005291219369894983, + "loss": 3.7461, + "step": 8200 + }, + { + "epoch": 2.4031694243766024, + "grad_norm": 0.3256712257862091, + "learning_rate": 0.0005286843640606768, + "loss": 3.7343, + "step": 8250 + }, + { + "epoch": 2.4177347937543696, + "grad_norm": 0.3265218734741211, + "learning_rate": 0.0005282467911318552, + "loss": 3.7396, + "step": 8300 + }, + { + "epoch": 2.432300163132137, + "grad_norm": 0.3039201498031616, + "learning_rate": 0.0005278092182030338, + "loss": 3.7405, + "step": 8350 + }, + { + "epoch": 2.4468655325099045, + "grad_norm": 0.3367139995098114, + "learning_rate": 0.0005273716452742123, + "loss": 3.7426, + "step": 8400 + }, + { + "epoch": 2.461430901887672, + "grad_norm": 0.314224511384964, + "learning_rate": 0.0005269340723453908, + "loss": 3.7412, + "step": 8450 + }, + { + "epoch": 2.4759962712654393, + "grad_norm": 0.3330950140953064, + "learning_rate": 0.0005264964994165694, + "loss": 3.7463, + "step": 8500 + }, + { + "epoch": 2.4905616406432065, + "grad_norm": 0.3340461552143097, + "learning_rate": 0.000526058926487748, + "loss": 3.7469, + "step": 8550 + }, + { + "epoch": 2.505127010020974, + "grad_norm": 0.3362635672092438, + "learning_rate": 0.0005256213535589265, + "loss": 3.7338, + "step": 8600 + }, + { + "epoch": 2.519692379398742, + "grad_norm": 0.3297460377216339, + "learning_rate": 0.000525183780630105, + "loss": 3.7498, + "step": 8650 + }, + { + "epoch": 2.534257748776509, + "grad_norm": 0.3183857500553131, + "learning_rate": 0.0005247462077012836, + "loss": 3.7391, + "step": 8700 + }, + { + "epoch": 2.548823118154276, + "grad_norm": 0.33508941531181335, + "learning_rate": 0.000524308634772462, + "loss": 3.7348, + "step": 8750 + }, + { + "epoch": 2.563388487532044, + "grad_norm": 0.3083733022212982, + "learning_rate": 0.0005238710618436405, + "loss": 3.7285, + "step": 8800 + }, + { + "epoch": 2.5779538569098115, + "grad_norm": 0.31876590847969055, + "learning_rate": 0.0005234334889148191, + "loss": 3.7443, + "step": 8850 + }, + { + "epoch": 2.5925192262875787, + "grad_norm": 0.3193049430847168, + "learning_rate": 0.0005229959159859976, + "loss": 3.7422, + "step": 8900 + }, + { + "epoch": 2.607084595665346, + "grad_norm": 0.32590124011039734, + "learning_rate": 0.0005225583430571761, + "loss": 3.7424, + "step": 8950 + }, + { + "epoch": 2.6216499650431135, + "grad_norm": 0.3363872170448303, + "learning_rate": 0.0005221207701283547, + "loss": 3.728, + "step": 9000 + }, + { + "epoch": 2.6216499650431135, + "eval_accuracy": 0.3480742812181514, + "eval_loss": 3.7482504844665527, + "eval_runtime": 180.2179, + "eval_samples_per_second": 92.355, + "eval_steps_per_second": 5.776, + "step": 9000 + }, + { + "epoch": 2.636215334420881, + "grad_norm": 0.3201189935207367, + "learning_rate": 0.0005216831971995333, + "loss": 3.7477, + "step": 9050 + }, + { + "epoch": 2.6507807037986484, + "grad_norm": 0.33287513256073, + "learning_rate": 0.0005212456242707118, + "loss": 3.7203, + "step": 9100 + }, + { + "epoch": 2.6653460731764156, + "grad_norm": 0.3236483335494995, + "learning_rate": 0.0005208080513418903, + "loss": 3.7265, + "step": 9150 + }, + { + "epoch": 2.6799114425541832, + "grad_norm": 0.3180456757545471, + "learning_rate": 0.0005203704784130689, + "loss": 3.7303, + "step": 9200 + }, + { + "epoch": 2.6944768119319504, + "grad_norm": 0.3273324966430664, + "learning_rate": 0.0005199329054842473, + "loss": 3.7266, + "step": 9250 + }, + { + "epoch": 2.709042181309718, + "grad_norm": 0.3243292272090912, + "learning_rate": 0.0005194953325554258, + "loss": 3.7301, + "step": 9300 + }, + { + "epoch": 2.7236075506874853, + "grad_norm": 0.32646605372428894, + "learning_rate": 0.0005190577596266044, + "loss": 3.7284, + "step": 9350 + }, + { + "epoch": 2.738172920065253, + "grad_norm": 0.3168424665927887, + "learning_rate": 0.0005186201866977829, + "loss": 3.7384, + "step": 9400 + }, + { + "epoch": 2.75273828944302, + "grad_norm": 0.3341065049171448, + "learning_rate": 0.0005181826137689614, + "loss": 3.7279, + "step": 9450 + }, + { + "epoch": 2.7673036588207878, + "grad_norm": 0.3197799623012543, + "learning_rate": 0.00051774504084014, + "loss": 3.7302, + "step": 9500 + }, + { + "epoch": 2.781869028198555, + "grad_norm": 0.31474462151527405, + "learning_rate": 0.0005173074679113186, + "loss": 3.735, + "step": 9550 + }, + { + "epoch": 2.7964343975763226, + "grad_norm": 0.3133241832256317, + "learning_rate": 0.0005168698949824971, + "loss": 3.7139, + "step": 9600 + }, + { + "epoch": 2.81099976695409, + "grad_norm": 0.31363457441329956, + "learning_rate": 0.0005164323220536755, + "loss": 3.7076, + "step": 9650 + }, + { + "epoch": 2.8255651363318575, + "grad_norm": 0.32894420623779297, + "learning_rate": 0.0005159947491248541, + "loss": 3.717, + "step": 9700 + }, + { + "epoch": 2.8401305057096247, + "grad_norm": 0.33178263902664185, + "learning_rate": 0.0005155571761960326, + "loss": 3.7305, + "step": 9750 + }, + { + "epoch": 2.8546958750873923, + "grad_norm": 0.31269919872283936, + "learning_rate": 0.0005151196032672111, + "loss": 3.7172, + "step": 9800 + }, + { + "epoch": 2.8692612444651595, + "grad_norm": 0.32776308059692383, + "learning_rate": 0.0005146820303383897, + "loss": 3.7171, + "step": 9850 + }, + { + "epoch": 2.883826613842927, + "grad_norm": 0.3176999092102051, + "learning_rate": 0.0005142444574095682, + "loss": 3.6958, + "step": 9900 + }, + { + "epoch": 2.8983919832206944, + "grad_norm": 0.3453384339809418, + "learning_rate": 0.0005138068844807468, + "loss": 3.7037, + "step": 9950 + }, + { + "epoch": 2.912957352598462, + "grad_norm": 0.31886717677116394, + "learning_rate": 0.0005133693115519253, + "loss": 3.6958, + "step": 10000 + }, + { + "epoch": 2.912957352598462, + "eval_accuracy": 0.3504000665954622, + "eval_loss": 3.722299575805664, + "eval_runtime": 180.4701, + "eval_samples_per_second": 92.226, + "eval_steps_per_second": 5.768, + "step": 10000 + }, + { + "epoch": 2.927522721976229, + "grad_norm": 0.3301357626914978, + "learning_rate": 0.0005129317386231039, + "loss": 3.7174, + "step": 10050 + }, + { + "epoch": 2.942088091353997, + "grad_norm": 0.31266558170318604, + "learning_rate": 0.0005124941656942824, + "loss": 3.7087, + "step": 10100 + }, + { + "epoch": 2.956653460731764, + "grad_norm": 0.2986539602279663, + "learning_rate": 0.0005120565927654608, + "loss": 3.7032, + "step": 10150 + }, + { + "epoch": 2.9712188301095317, + "grad_norm": 0.3215900659561157, + "learning_rate": 0.0005116190198366394, + "loss": 3.7156, + "step": 10200 + }, + { + "epoch": 2.985784199487299, + "grad_norm": 0.34506484866142273, + "learning_rate": 0.0005111814469078179, + "loss": 3.7182, + "step": 10250 + }, + { + "epoch": 3.0002913073875552, + "grad_norm": 0.3209165036678314, + "learning_rate": 0.0005107438739789964, + "loss": 3.6979, + "step": 10300 + }, + { + "epoch": 3.014856676765323, + "grad_norm": 0.3145550489425659, + "learning_rate": 0.000510306301050175, + "loss": 3.5923, + "step": 10350 + }, + { + "epoch": 3.02942204614309, + "grad_norm": 0.33601146936416626, + "learning_rate": 0.0005098687281213535, + "loss": 3.6103, + "step": 10400 + }, + { + "epoch": 3.0439874155208577, + "grad_norm": 0.31010255217552185, + "learning_rate": 0.0005094311551925321, + "loss": 3.6064, + "step": 10450 + }, + { + "epoch": 3.058552784898625, + "grad_norm": 0.3235945701599121, + "learning_rate": 0.0005089935822637106, + "loss": 3.5974, + "step": 10500 + }, + { + "epoch": 3.0731181542763926, + "grad_norm": 0.32663047313690186, + "learning_rate": 0.0005085560093348892, + "loss": 3.6164, + "step": 10550 + }, + { + "epoch": 3.0876835236541598, + "grad_norm": 0.32186612486839294, + "learning_rate": 0.0005081184364060676, + "loss": 3.605, + "step": 10600 + }, + { + "epoch": 3.1022488930319274, + "grad_norm": 0.3103710114955902, + "learning_rate": 0.0005076808634772461, + "loss": 3.622, + "step": 10650 + }, + { + "epoch": 3.1168142624096946, + "grad_norm": 0.32506147027015686, + "learning_rate": 0.0005072432905484247, + "loss": 3.6183, + "step": 10700 + }, + { + "epoch": 3.1313796317874623, + "grad_norm": 0.354626327753067, + "learning_rate": 0.0005068057176196032, + "loss": 3.6236, + "step": 10750 + }, + { + "epoch": 3.1459450011652295, + "grad_norm": 0.31761565804481506, + "learning_rate": 0.0005063681446907818, + "loss": 3.6218, + "step": 10800 + }, + { + "epoch": 3.160510370542997, + "grad_norm": 0.3158835172653198, + "learning_rate": 0.0005059305717619603, + "loss": 3.6275, + "step": 10850 + }, + { + "epoch": 3.1750757399207643, + "grad_norm": 0.3345862925052643, + "learning_rate": 0.0005054929988331388, + "loss": 3.6209, + "step": 10900 + }, + { + "epoch": 3.189641109298532, + "grad_norm": 0.33414244651794434, + "learning_rate": 0.0005050554259043174, + "loss": 3.6138, + "step": 10950 + }, + { + "epoch": 3.204206478676299, + "grad_norm": 0.321621835231781, + "learning_rate": 0.0005046178529754959, + "loss": 3.6306, + "step": 11000 + }, + { + "epoch": 3.204206478676299, + "eval_accuracy": 0.3523780599932934, + "eval_loss": 3.7092323303222656, + "eval_runtime": 181.978, + "eval_samples_per_second": 91.462, + "eval_steps_per_second": 5.72, + "step": 11000 + }, + { + "epoch": 3.218771848054067, + "grad_norm": 0.3331759572029114, + "learning_rate": 0.0005041802800466744, + "loss": 3.6116, + "step": 11050 + }, + { + "epoch": 3.233337217431834, + "grad_norm": 0.33480656147003174, + "learning_rate": 0.0005037427071178529, + "loss": 3.6186, + "step": 11100 + }, + { + "epoch": 3.2479025868096016, + "grad_norm": 0.32737287878990173, + "learning_rate": 0.0005033051341890314, + "loss": 3.6176, + "step": 11150 + }, + { + "epoch": 3.262467956187369, + "grad_norm": 0.33219143748283386, + "learning_rate": 0.00050286756126021, + "loss": 3.6299, + "step": 11200 + }, + { + "epoch": 3.2770333255651365, + "grad_norm": 0.3134367763996124, + "learning_rate": 0.0005024299883313885, + "loss": 3.6269, + "step": 11250 + }, + { + "epoch": 3.2915986949429037, + "grad_norm": 0.3368885815143585, + "learning_rate": 0.0005019924154025671, + "loss": 3.6383, + "step": 11300 + }, + { + "epoch": 3.3061640643206713, + "grad_norm": 0.30437996983528137, + "learning_rate": 0.0005015548424737456, + "loss": 3.6245, + "step": 11350 + }, + { + "epoch": 3.3207294336984385, + "grad_norm": 0.33528828620910645, + "learning_rate": 0.0005011172695449241, + "loss": 3.6251, + "step": 11400 + }, + { + "epoch": 3.335294803076206, + "grad_norm": 0.33781698346138, + "learning_rate": 0.0005006796966161027, + "loss": 3.63, + "step": 11450 + }, + { + "epoch": 3.3498601724539734, + "grad_norm": 0.329375296831131, + "learning_rate": 0.0005002421236872811, + "loss": 3.6387, + "step": 11500 + }, + { + "epoch": 3.364425541831741, + "grad_norm": 0.31199130415916443, + "learning_rate": 0.0004998045507584597, + "loss": 3.6199, + "step": 11550 + }, + { + "epoch": 3.3789909112095082, + "grad_norm": 0.31993409991264343, + "learning_rate": 0.0004993669778296382, + "loss": 3.6383, + "step": 11600 + }, + { + "epoch": 3.393556280587276, + "grad_norm": 0.33537372946739197, + "learning_rate": 0.0004989294049008167, + "loss": 3.6409, + "step": 11650 + }, + { + "epoch": 3.408121649965043, + "grad_norm": 0.3288818299770355, + "learning_rate": 0.0004984918319719953, + "loss": 3.6544, + "step": 11700 + }, + { + "epoch": 3.4226870193428107, + "grad_norm": 0.3143393099308014, + "learning_rate": 0.0004980542590431738, + "loss": 3.632, + "step": 11750 + }, + { + "epoch": 3.437252388720578, + "grad_norm": 0.3316044211387634, + "learning_rate": 0.0004976166861143524, + "loss": 3.6256, + "step": 11800 + }, + { + "epoch": 3.4518177580983456, + "grad_norm": 0.3158373534679413, + "learning_rate": 0.0004971791131855309, + "loss": 3.6283, + "step": 11850 + }, + { + "epoch": 3.4663831274761128, + "grad_norm": 0.3310090899467468, + "learning_rate": 0.0004967415402567094, + "loss": 3.6383, + "step": 11900 + }, + { + "epoch": 3.4809484968538804, + "grad_norm": 0.3304344415664673, + "learning_rate": 0.000496303967327888, + "loss": 3.6364, + "step": 11950 + }, + { + "epoch": 3.4955138662316476, + "grad_norm": 0.3196583390235901, + "learning_rate": 0.0004958663943990664, + "loss": 3.6245, + "step": 12000 + }, + { + "epoch": 3.4955138662316476, + "eval_accuracy": 0.3539525300396798, + "eval_loss": 3.6910691261291504, + "eval_runtime": 180.2749, + "eval_samples_per_second": 92.326, + "eval_steps_per_second": 5.775, + "step": 12000 + }, + { + "epoch": 3.510079235609415, + "grad_norm": 0.334721177816391, + "learning_rate": 0.000495428821470245, + "loss": 3.6405, + "step": 12050 + }, + { + "epoch": 3.5246446049871825, + "grad_norm": 0.30898579955101013, + "learning_rate": 0.0004949912485414235, + "loss": 3.633, + "step": 12100 + }, + { + "epoch": 3.53920997436495, + "grad_norm": 0.3296958804130554, + "learning_rate": 0.0004945536756126021, + "loss": 3.6336, + "step": 12150 + }, + { + "epoch": 3.5537753437427173, + "grad_norm": 0.3156227469444275, + "learning_rate": 0.0004941161026837806, + "loss": 3.6264, + "step": 12200 + }, + { + "epoch": 3.5683407131204845, + "grad_norm": 0.32900500297546387, + "learning_rate": 0.0004936785297549591, + "loss": 3.6286, + "step": 12250 + }, + { + "epoch": 3.582906082498252, + "grad_norm": 0.33001989126205444, + "learning_rate": 0.0004932409568261377, + "loss": 3.6485, + "step": 12300 + }, + { + "epoch": 3.59747145187602, + "grad_norm": 0.32858744263648987, + "learning_rate": 0.0004928033838973162, + "loss": 3.6323, + "step": 12350 + }, + { + "epoch": 3.612036821253787, + "grad_norm": 0.35113999247550964, + "learning_rate": 0.0004923658109684946, + "loss": 3.647, + "step": 12400 + }, + { + "epoch": 3.626602190631554, + "grad_norm": 0.3282478153705597, + "learning_rate": 0.0004919282380396732, + "loss": 3.6335, + "step": 12450 + }, + { + "epoch": 3.641167560009322, + "grad_norm": 0.31611868739128113, + "learning_rate": 0.0004914906651108517, + "loss": 3.631, + "step": 12500 + }, + { + "epoch": 3.6557329293870895, + "grad_norm": 0.33487775921821594, + "learning_rate": 0.0004910530921820303, + "loss": 3.6274, + "step": 12550 + }, + { + "epoch": 3.6702982987648567, + "grad_norm": 0.33004793524742126, + "learning_rate": 0.0004906155192532088, + "loss": 3.618, + "step": 12600 + }, + { + "epoch": 3.684863668142624, + "grad_norm": 0.30851587653160095, + "learning_rate": 0.0004901779463243874, + "loss": 3.6229, + "step": 12650 + }, + { + "epoch": 3.6994290375203915, + "grad_norm": 0.325185090303421, + "learning_rate": 0.0004897403733955659, + "loss": 3.6289, + "step": 12700 + }, + { + "epoch": 3.713994406898159, + "grad_norm": 0.3187962770462036, + "learning_rate": 0.0004893028004667444, + "loss": 3.6355, + "step": 12750 + }, + { + "epoch": 3.7285597762759264, + "grad_norm": 0.32004639506340027, + "learning_rate": 0.000488865227537923, + "loss": 3.6424, + "step": 12800 + }, + { + "epoch": 3.7431251456536936, + "grad_norm": 0.331478476524353, + "learning_rate": 0.0004884276546091015, + "loss": 3.624, + "step": 12850 + }, + { + "epoch": 3.7576905150314612, + "grad_norm": 0.31720319390296936, + "learning_rate": 0.00048799008168028, + "loss": 3.6329, + "step": 12900 + }, + { + "epoch": 3.772255884409229, + "grad_norm": 0.32388386130332947, + "learning_rate": 0.00048755250875145853, + "loss": 3.6237, + "step": 12950 + }, + { + "epoch": 3.786821253786996, + "grad_norm": 0.326471209526062, + "learning_rate": 0.0004871149358226371, + "loss": 3.6365, + "step": 13000 + }, + { + "epoch": 3.786821253786996, + "eval_accuracy": 0.35590465655600817, + "eval_loss": 3.6707494258880615, + "eval_runtime": 180.189, + "eval_samples_per_second": 92.37, + "eval_steps_per_second": 5.777, + "step": 13000 + }, + { + "epoch": 3.8013866231647633, + "grad_norm": 0.3287231922149658, + "learning_rate": 0.0004866773628938156, + "loss": 3.6351, + "step": 13050 + }, + { + "epoch": 3.815951992542531, + "grad_norm": 0.3224816620349884, + "learning_rate": 0.0004862397899649941, + "loss": 3.631, + "step": 13100 + }, + { + "epoch": 3.8305173619202986, + "grad_norm": 0.34565699100494385, + "learning_rate": 0.00048580221703617264, + "loss": 3.6365, + "step": 13150 + }, + { + "epoch": 3.8450827312980658, + "grad_norm": 0.31353557109832764, + "learning_rate": 0.00048536464410735123, + "loss": 3.6346, + "step": 13200 + }, + { + "epoch": 3.859648100675833, + "grad_norm": 0.31035754084587097, + "learning_rate": 0.00048492707117852966, + "loss": 3.6353, + "step": 13250 + }, + { + "epoch": 3.8742134700536006, + "grad_norm": 0.3304181694984436, + "learning_rate": 0.00048448949824970826, + "loss": 3.631, + "step": 13300 + }, + { + "epoch": 3.888778839431368, + "grad_norm": 0.3305014669895172, + "learning_rate": 0.0004840519253208868, + "loss": 3.6234, + "step": 13350 + }, + { + "epoch": 3.9033442088091355, + "grad_norm": 0.33002111315727234, + "learning_rate": 0.0004836143523920653, + "loss": 3.6389, + "step": 13400 + }, + { + "epoch": 3.9179095781869027, + "grad_norm": 0.3106802701950073, + "learning_rate": 0.0004831767794632438, + "loss": 3.6413, + "step": 13450 + }, + { + "epoch": 3.9324749475646703, + "grad_norm": 0.32683488726615906, + "learning_rate": 0.00048273920653442236, + "loss": 3.6114, + "step": 13500 + }, + { + "epoch": 3.9470403169424375, + "grad_norm": 0.3140070140361786, + "learning_rate": 0.0004823016336056009, + "loss": 3.6246, + "step": 13550 + }, + { + "epoch": 3.961605686320205, + "grad_norm": 0.3176632523536682, + "learning_rate": 0.0004818640606767794, + "loss": 3.6215, + "step": 13600 + }, + { + "epoch": 3.9761710556979724, + "grad_norm": 0.33348730206489563, + "learning_rate": 0.00048142648774795793, + "loss": 3.6198, + "step": 13650 + }, + { + "epoch": 3.99073642507574, + "grad_norm": 0.3215520679950714, + "learning_rate": 0.0004809889148191365, + "loss": 3.6326, + "step": 13700 + }, + { + "epoch": 4.005243532975996, + "grad_norm": 0.3232531249523163, + "learning_rate": 0.000480551341890315, + "loss": 3.5772, + "step": 13750 + }, + { + "epoch": 4.0198089023537635, + "grad_norm": 0.3443015515804291, + "learning_rate": 0.00048011376896149355, + "loss": 3.5136, + "step": 13800 + }, + { + "epoch": 4.034374271731531, + "grad_norm": 0.3226703405380249, + "learning_rate": 0.0004796761960326721, + "loss": 3.5234, + "step": 13850 + }, + { + "epoch": 4.048939641109299, + "grad_norm": 0.32913267612457275, + "learning_rate": 0.0004792386231038506, + "loss": 3.5215, + "step": 13900 + }, + { + "epoch": 4.063505010487066, + "grad_norm": 0.3350991904735565, + "learning_rate": 0.0004788010501750291, + "loss": 3.5211, + "step": 13950 + }, + { + "epoch": 4.078070379864833, + "grad_norm": 0.3143565058708191, + "learning_rate": 0.00047836347724620766, + "loss": 3.5138, + "step": 14000 + }, + { + "epoch": 4.078070379864833, + "eval_accuracy": 0.3570977076769612, + "eval_loss": 3.6638998985290527, + "eval_runtime": 180.2734, + "eval_samples_per_second": 92.326, + "eval_steps_per_second": 5.775, + "step": 14000 + }, + { + "epoch": 4.092635749242601, + "grad_norm": 0.318641722202301, + "learning_rate": 0.0004779259043173862, + "loss": 3.5201, + "step": 14050 + }, + { + "epoch": 4.1072011186203685, + "grad_norm": 0.31839898228645325, + "learning_rate": 0.0004774883313885647, + "loss": 3.5223, + "step": 14100 + }, + { + "epoch": 4.121766487998135, + "grad_norm": 0.3353429436683655, + "learning_rate": 0.0004770507584597433, + "loss": 3.5314, + "step": 14150 + }, + { + "epoch": 4.136331857375903, + "grad_norm": 0.3297600746154785, + "learning_rate": 0.0004766131855309218, + "loss": 3.5386, + "step": 14200 + }, + { + "epoch": 4.150897226753671, + "grad_norm": 0.35828185081481934, + "learning_rate": 0.0004761756126021003, + "loss": 3.5442, + "step": 14250 + }, + { + "epoch": 4.165462596131438, + "grad_norm": 0.32543322443962097, + "learning_rate": 0.00047573803967327884, + "loss": 3.5498, + "step": 14300 + }, + { + "epoch": 4.180027965509205, + "grad_norm": 0.33324652910232544, + "learning_rate": 0.0004753004667444574, + "loss": 3.5393, + "step": 14350 + }, + { + "epoch": 4.194593334886973, + "grad_norm": 0.3401516079902649, + "learning_rate": 0.00047486289381563587, + "loss": 3.5485, + "step": 14400 + }, + { + "epoch": 4.20915870426474, + "grad_norm": 0.34022200107574463, + "learning_rate": 0.0004744253208868144, + "loss": 3.5287, + "step": 14450 + }, + { + "epoch": 4.223724073642508, + "grad_norm": 0.3375685214996338, + "learning_rate": 0.00047398774795799295, + "loss": 3.5567, + "step": 14500 + }, + { + "epoch": 4.238289443020275, + "grad_norm": 0.32578080892562866, + "learning_rate": 0.00047355017502917154, + "loss": 3.5511, + "step": 14550 + }, + { + "epoch": 4.252854812398042, + "grad_norm": 0.3124660551548004, + "learning_rate": 0.00047311260210035, + "loss": 3.5519, + "step": 14600 + }, + { + "epoch": 4.26742018177581, + "grad_norm": 0.317643940448761, + "learning_rate": 0.00047267502917152857, + "loss": 3.5485, + "step": 14650 + }, + { + "epoch": 4.281985551153578, + "grad_norm": 0.3317655026912689, + "learning_rate": 0.0004722374562427071, + "loss": 3.5541, + "step": 14700 + }, + { + "epoch": 4.296550920531344, + "grad_norm": 0.32578787207603455, + "learning_rate": 0.0004717998833138856, + "loss": 3.5354, + "step": 14750 + }, + { + "epoch": 4.311116289909112, + "grad_norm": 0.32401853799819946, + "learning_rate": 0.00047136231038506413, + "loss": 3.5608, + "step": 14800 + }, + { + "epoch": 4.32568165928688, + "grad_norm": 0.33071812987327576, + "learning_rate": 0.00047092473745624267, + "loss": 3.5453, + "step": 14850 + }, + { + "epoch": 4.340247028664647, + "grad_norm": 0.3195439577102661, + "learning_rate": 0.00047048716452742116, + "loss": 3.5509, + "step": 14900 + }, + { + "epoch": 4.354812398042414, + "grad_norm": 0.32133200764656067, + "learning_rate": 0.0004700495915985997, + "loss": 3.5631, + "step": 14950 + }, + { + "epoch": 4.369377767420182, + "grad_norm": 0.345612108707428, + "learning_rate": 0.0004696120186697783, + "loss": 3.5632, + "step": 15000 + }, + { + "epoch": 4.369377767420182, + "eval_accuracy": 0.3582166854554288, + "eval_loss": 3.653571128845215, + "eval_runtime": 180.4957, + "eval_samples_per_second": 92.213, + "eval_steps_per_second": 5.767, + "step": 15000 + }, + { + "epoch": 4.383943136797949, + "grad_norm": 0.33550721406936646, + "learning_rate": 0.00046917444574095683, + "loss": 3.5562, + "step": 15050 + }, + { + "epoch": 4.398508506175717, + "grad_norm": 0.32593655586242676, + "learning_rate": 0.0004687368728121353, + "loss": 3.5542, + "step": 15100 + }, + { + "epoch": 4.413073875553484, + "grad_norm": 0.32876867055892944, + "learning_rate": 0.00046829929988331386, + "loss": 3.5537, + "step": 15150 + }, + { + "epoch": 4.427639244931251, + "grad_norm": 0.31340348720550537, + "learning_rate": 0.0004678617269544924, + "loss": 3.5547, + "step": 15200 + }, + { + "epoch": 4.442204614309019, + "grad_norm": 0.325003981590271, + "learning_rate": 0.0004674241540256709, + "loss": 3.5638, + "step": 15250 + }, + { + "epoch": 4.456769983686787, + "grad_norm": 0.31941288709640503, + "learning_rate": 0.0004669865810968494, + "loss": 3.5625, + "step": 15300 + }, + { + "epoch": 4.471335353064553, + "grad_norm": 0.32604023814201355, + "learning_rate": 0.00046654900816802796, + "loss": 3.5542, + "step": 15350 + }, + { + "epoch": 4.485900722442321, + "grad_norm": 0.3184167444705963, + "learning_rate": 0.00046611143523920645, + "loss": 3.5597, + "step": 15400 + }, + { + "epoch": 4.500466091820089, + "grad_norm": 0.32676759362220764, + "learning_rate": 0.00046567386231038504, + "loss": 3.5518, + "step": 15450 + }, + { + "epoch": 4.515031461197856, + "grad_norm": 0.3253229260444641, + "learning_rate": 0.0004652362893815636, + "loss": 3.5636, + "step": 15500 + }, + { + "epoch": 4.529596830575623, + "grad_norm": 0.33474475145339966, + "learning_rate": 0.0004647987164527421, + "loss": 3.5638, + "step": 15550 + }, + { + "epoch": 4.544162199953391, + "grad_norm": 0.34634941816329956, + "learning_rate": 0.0004643611435239206, + "loss": 3.5473, + "step": 15600 + }, + { + "epoch": 4.558727569331158, + "grad_norm": 0.33891260623931885, + "learning_rate": 0.00046392357059509915, + "loss": 3.5675, + "step": 15650 + }, + { + "epoch": 4.573292938708926, + "grad_norm": 0.32942262291908264, + "learning_rate": 0.0004634859976662777, + "loss": 3.5603, + "step": 15700 + }, + { + "epoch": 4.587858308086693, + "grad_norm": 0.3374430239200592, + "learning_rate": 0.0004630484247374562, + "loss": 3.5538, + "step": 15750 + }, + { + "epoch": 4.6024236774644605, + "grad_norm": 0.3401276767253876, + "learning_rate": 0.0004626108518086347, + "loss": 3.5644, + "step": 15800 + }, + { + "epoch": 4.616989046842228, + "grad_norm": 0.3286304473876953, + "learning_rate": 0.0004621732788798133, + "loss": 3.5653, + "step": 15850 + }, + { + "epoch": 4.631554416219995, + "grad_norm": 0.31420665979385376, + "learning_rate": 0.00046173570595099174, + "loss": 3.556, + "step": 15900 + }, + { + "epoch": 4.6461197855977625, + "grad_norm": 0.3286356031894684, + "learning_rate": 0.00046129813302217033, + "loss": 3.552, + "step": 15950 + }, + { + "epoch": 4.66068515497553, + "grad_norm": 0.33006393909454346, + "learning_rate": 0.00046086056009334887, + "loss": 3.5684, + "step": 16000 + }, + { + "epoch": 4.66068515497553, + "eval_accuracy": 0.3596702866191563, + "eval_loss": 3.640338897705078, + "eval_runtime": 180.2507, + "eval_samples_per_second": 92.338, + "eval_steps_per_second": 5.775, + "step": 16000 + }, + { + "epoch": 4.675250524353298, + "grad_norm": 0.3313292860984802, + "learning_rate": 0.0004604229871645274, + "loss": 3.5606, + "step": 16050 + }, + { + "epoch": 4.689815893731065, + "grad_norm": 0.31922703981399536, + "learning_rate": 0.0004599854142357059, + "loss": 3.5739, + "step": 16100 + }, + { + "epoch": 4.704381263108832, + "grad_norm": 0.3161007761955261, + "learning_rate": 0.00045954784130688444, + "loss": 3.5688, + "step": 16150 + }, + { + "epoch": 4.7189466324866, + "grad_norm": 0.33094581961631775, + "learning_rate": 0.000459110268378063, + "loss": 3.564, + "step": 16200 + }, + { + "epoch": 4.7335120018643675, + "grad_norm": 0.3282545804977417, + "learning_rate": 0.00045867269544924146, + "loss": 3.5759, + "step": 16250 + }, + { + "epoch": 4.748077371242134, + "grad_norm": 0.32690319418907166, + "learning_rate": 0.00045823512252042, + "loss": 3.5601, + "step": 16300 + }, + { + "epoch": 4.762642740619902, + "grad_norm": 0.3375246524810791, + "learning_rate": 0.0004577975495915986, + "loss": 3.5569, + "step": 16350 + }, + { + "epoch": 4.7772081099976695, + "grad_norm": 0.3194766044616699, + "learning_rate": 0.0004573599766627771, + "loss": 3.5536, + "step": 16400 + }, + { + "epoch": 4.791773479375437, + "grad_norm": 0.31809139251708984, + "learning_rate": 0.0004569224037339556, + "loss": 3.5626, + "step": 16450 + }, + { + "epoch": 4.806338848753205, + "grad_norm": 0.3298538327217102, + "learning_rate": 0.00045648483080513416, + "loss": 3.5597, + "step": 16500 + }, + { + "epoch": 4.820904218130972, + "grad_norm": 0.343118816614151, + "learning_rate": 0.0004560472578763127, + "loss": 3.563, + "step": 16550 + }, + { + "epoch": 4.835469587508739, + "grad_norm": 0.32174625992774963, + "learning_rate": 0.0004556096849474912, + "loss": 3.5602, + "step": 16600 + }, + { + "epoch": 4.850034956886507, + "grad_norm": 0.3458464741706848, + "learning_rate": 0.00045517211201866973, + "loss": 3.5485, + "step": 16650 + }, + { + "epoch": 4.864600326264274, + "grad_norm": 0.3370623290538788, + "learning_rate": 0.00045473453908984827, + "loss": 3.5624, + "step": 16700 + }, + { + "epoch": 4.879165695642041, + "grad_norm": 0.33553197979927063, + "learning_rate": 0.00045429696616102675, + "loss": 3.5675, + "step": 16750 + }, + { + "epoch": 4.893731065019809, + "grad_norm": 0.3206152617931366, + "learning_rate": 0.00045385939323220535, + "loss": 3.5618, + "step": 16800 + }, + { + "epoch": 4.908296434397577, + "grad_norm": 0.3171241581439972, + "learning_rate": 0.0004534218203033839, + "loss": 3.5692, + "step": 16850 + }, + { + "epoch": 4.922861803775344, + "grad_norm": 0.3172144889831543, + "learning_rate": 0.0004529842473745624, + "loss": 3.5552, + "step": 16900 + }, + { + "epoch": 4.937427173153111, + "grad_norm": 0.32273098826408386, + "learning_rate": 0.0004525466744457409, + "loss": 3.5712, + "step": 16950 + }, + { + "epoch": 4.951992542530879, + "grad_norm": 0.3339548707008362, + "learning_rate": 0.00045210910151691945, + "loss": 3.5646, + "step": 17000 + }, + { + "epoch": 4.951992542530879, + "eval_accuracy": 0.36071871835716146, + "eval_loss": 3.625553846359253, + "eval_runtime": 180.4809, + "eval_samples_per_second": 92.22, + "eval_steps_per_second": 5.768, + "step": 17000 + }, + { + "epoch": 4.966557911908646, + "grad_norm": 0.3269366919994354, + "learning_rate": 0.000451671528588098, + "loss": 3.5542, + "step": 17050 + }, + { + "epoch": 4.981123281286413, + "grad_norm": 0.31721675395965576, + "learning_rate": 0.0004512339556592765, + "loss": 3.5729, + "step": 17100 + }, + { + "epoch": 4.995688650664181, + "grad_norm": 0.3314802050590515, + "learning_rate": 0.000450796382730455, + "loss": 3.5643, + "step": 17150 + }, + { + "epoch": 5.010195758564437, + "grad_norm": 0.34938499331474304, + "learning_rate": 0.0004503588098016336, + "loss": 3.4822, + "step": 17200 + }, + { + "epoch": 5.024761127942204, + "grad_norm": 0.3565429449081421, + "learning_rate": 0.0004499212368728121, + "loss": 3.4413, + "step": 17250 + }, + { + "epoch": 5.039326497319972, + "grad_norm": 0.34626901149749756, + "learning_rate": 0.00044948366394399064, + "loss": 3.4539, + "step": 17300 + }, + { + "epoch": 5.0538918666977395, + "grad_norm": 0.336347758769989, + "learning_rate": 0.0004490460910151692, + "loss": 3.4579, + "step": 17350 + }, + { + "epoch": 5.068457236075507, + "grad_norm": 0.3387928605079651, + "learning_rate": 0.00044860851808634767, + "loss": 3.4767, + "step": 17400 + }, + { + "epoch": 5.083022605453274, + "grad_norm": 0.3393719494342804, + "learning_rate": 0.0004481709451575262, + "loss": 3.4596, + "step": 17450 + }, + { + "epoch": 5.0975879748310415, + "grad_norm": 0.3251345157623291, + "learning_rate": 0.00044773337222870475, + "loss": 3.4748, + "step": 17500 + }, + { + "epoch": 5.112153344208809, + "grad_norm": 0.32468897104263306, + "learning_rate": 0.0004472957992998833, + "loss": 3.4805, + "step": 17550 + }, + { + "epoch": 5.126718713586577, + "grad_norm": 0.3337823450565338, + "learning_rate": 0.00044685822637106177, + "loss": 3.4754, + "step": 17600 + }, + { + "epoch": 5.141284082964344, + "grad_norm": 0.3582659959793091, + "learning_rate": 0.00044642065344224037, + "loss": 3.4752, + "step": 17650 + }, + { + "epoch": 5.155849452342111, + "grad_norm": 0.3382004499435425, + "learning_rate": 0.0004459830805134189, + "loss": 3.4633, + "step": 17700 + }, + { + "epoch": 5.170414821719879, + "grad_norm": 0.33493444323539734, + "learning_rate": 0.0004455455075845974, + "loss": 3.4896, + "step": 17750 + }, + { + "epoch": 5.1849801910976465, + "grad_norm": 0.33412331342697144, + "learning_rate": 0.00044510793465577593, + "loss": 3.4854, + "step": 17800 + }, + { + "epoch": 5.199545560475413, + "grad_norm": 0.3649858832359314, + "learning_rate": 0.00044467036172695447, + "loss": 3.4862, + "step": 17850 + }, + { + "epoch": 5.214110929853181, + "grad_norm": 0.3273285925388336, + "learning_rate": 0.00044423278879813296, + "loss": 3.4898, + "step": 17900 + }, + { + "epoch": 5.228676299230949, + "grad_norm": 0.36678996682167053, + "learning_rate": 0.0004437952158693115, + "loss": 3.4798, + "step": 17950 + }, + { + "epoch": 5.243241668608716, + "grad_norm": 0.33765271306037903, + "learning_rate": 0.00044335764294049004, + "loss": 3.4892, + "step": 18000 + }, + { + "epoch": 5.243241668608716, + "eval_accuracy": 0.3612939037403981, + "eval_loss": 3.6294894218444824, + "eval_runtime": 180.4902, + "eval_samples_per_second": 92.216, + "eval_steps_per_second": 5.768, + "step": 18000 + }, + { + "epoch": 5.257807037986483, + "grad_norm": 0.35786503553390503, + "learning_rate": 0.00044292007001166863, + "loss": 3.4823, + "step": 18050 + }, + { + "epoch": 5.272372407364251, + "grad_norm": 0.3416072130203247, + "learning_rate": 0.00044248249708284706, + "loss": 3.4879, + "step": 18100 + }, + { + "epoch": 5.286937776742018, + "grad_norm": 0.34881216287612915, + "learning_rate": 0.00044204492415402566, + "loss": 3.4922, + "step": 18150 + }, + { + "epoch": 5.301503146119786, + "grad_norm": 0.34528306126594543, + "learning_rate": 0.0004416073512252042, + "loss": 3.4949, + "step": 18200 + }, + { + "epoch": 5.316068515497553, + "grad_norm": 0.328722208738327, + "learning_rate": 0.0004411697782963827, + "loss": 3.4898, + "step": 18250 + }, + { + "epoch": 5.33063388487532, + "grad_norm": 0.3258577585220337, + "learning_rate": 0.0004407322053675612, + "loss": 3.4844, + "step": 18300 + }, + { + "epoch": 5.345199254253088, + "grad_norm": 0.3687138855457306, + "learning_rate": 0.00044029463243873976, + "loss": 3.4868, + "step": 18350 + }, + { + "epoch": 5.359764623630856, + "grad_norm": 0.3356603682041168, + "learning_rate": 0.00043985705950991825, + "loss": 3.4854, + "step": 18400 + }, + { + "epoch": 5.374329993008622, + "grad_norm": 0.3445993661880493, + "learning_rate": 0.0004394194865810968, + "loss": 3.4981, + "step": 18450 + }, + { + "epoch": 5.38889536238639, + "grad_norm": 0.328427791595459, + "learning_rate": 0.00043898191365227533, + "loss": 3.4887, + "step": 18500 + }, + { + "epoch": 5.403460731764158, + "grad_norm": 0.3391731083393097, + "learning_rate": 0.0004385443407234539, + "loss": 3.5023, + "step": 18550 + }, + { + "epoch": 5.418026101141925, + "grad_norm": 0.3405122458934784, + "learning_rate": 0.0004381067677946324, + "loss": 3.5082, + "step": 18600 + }, + { + "epoch": 5.432591470519692, + "grad_norm": 0.32964596152305603, + "learning_rate": 0.00043766919486581095, + "loss": 3.5064, + "step": 18650 + }, + { + "epoch": 5.44715683989746, + "grad_norm": 0.32743725180625916, + "learning_rate": 0.0004372316219369895, + "loss": 3.5069, + "step": 18700 + }, + { + "epoch": 5.461722209275227, + "grad_norm": 0.33889785408973694, + "learning_rate": 0.00043679404900816797, + "loss": 3.4917, + "step": 18750 + }, + { + "epoch": 5.476287578652995, + "grad_norm": 0.3374757468700409, + "learning_rate": 0.0004363564760793465, + "loss": 3.5129, + "step": 18800 + }, + { + "epoch": 5.490852948030762, + "grad_norm": 0.32586970925331116, + "learning_rate": 0.00043591890315052505, + "loss": 3.4983, + "step": 18850 + }, + { + "epoch": 5.505418317408529, + "grad_norm": 0.3159201443195343, + "learning_rate": 0.00043548133022170354, + "loss": 3.5049, + "step": 18900 + }, + { + "epoch": 5.519983686786297, + "grad_norm": 0.3207235634326935, + "learning_rate": 0.0004350437572928821, + "loss": 3.5027, + "step": 18950 + }, + { + "epoch": 5.534549056164065, + "grad_norm": 0.32409095764160156, + "learning_rate": 0.00043460618436406067, + "loss": 3.5037, + "step": 19000 + }, + { + "epoch": 5.534549056164065, + "eval_accuracy": 0.36227261247507964, + "eval_loss": 3.617015838623047, + "eval_runtime": 180.3747, + "eval_samples_per_second": 92.275, + "eval_steps_per_second": 5.771, + "step": 19000 + }, + { + "epoch": 5.549114425541831, + "grad_norm": 0.33343568444252014, + "learning_rate": 0.0004341686114352392, + "loss": 3.5044, + "step": 19050 + }, + { + "epoch": 5.563679794919599, + "grad_norm": 0.3471834063529968, + "learning_rate": 0.0004337310385064177, + "loss": 3.4995, + "step": 19100 + }, + { + "epoch": 5.578245164297367, + "grad_norm": 0.32965055108070374, + "learning_rate": 0.00043329346557759624, + "loss": 3.5091, + "step": 19150 + }, + { + "epoch": 5.592810533675134, + "grad_norm": 0.32729023694992065, + "learning_rate": 0.0004328558926487748, + "loss": 3.4987, + "step": 19200 + }, + { + "epoch": 5.607375903052901, + "grad_norm": 0.32407552003860474, + "learning_rate": 0.00043241831971995326, + "loss": 3.5105, + "step": 19250 + }, + { + "epoch": 5.621941272430669, + "grad_norm": 0.3459337055683136, + "learning_rate": 0.0004319807467911318, + "loss": 3.5139, + "step": 19300 + }, + { + "epoch": 5.636506641808436, + "grad_norm": 0.34581705927848816, + "learning_rate": 0.00043154317386231034, + "loss": 3.5169, + "step": 19350 + }, + { + "epoch": 5.651072011186204, + "grad_norm": 0.323258638381958, + "learning_rate": 0.00043110560093348883, + "loss": 3.5006, + "step": 19400 + }, + { + "epoch": 5.665637380563971, + "grad_norm": 0.3501630127429962, + "learning_rate": 0.0004306680280046674, + "loss": 3.5059, + "step": 19450 + }, + { + "epoch": 5.6802027499417385, + "grad_norm": 0.3383364975452423, + "learning_rate": 0.00043023045507584596, + "loss": 3.5082, + "step": 19500 + }, + { + "epoch": 5.694768119319506, + "grad_norm": 0.3391266465187073, + "learning_rate": 0.0004297928821470245, + "loss": 3.5073, + "step": 19550 + }, + { + "epoch": 5.709333488697274, + "grad_norm": 0.33838364481925964, + "learning_rate": 0.000429355309218203, + "loss": 3.5057, + "step": 19600 + }, + { + "epoch": 5.7238988580750405, + "grad_norm": 0.3325950801372528, + "learning_rate": 0.00042891773628938153, + "loss": 3.518, + "step": 19650 + }, + { + "epoch": 5.738464227452808, + "grad_norm": 0.3349588215351105, + "learning_rate": 0.00042848016336056007, + "loss": 3.5155, + "step": 19700 + }, + { + "epoch": 5.753029596830576, + "grad_norm": 0.33944258093833923, + "learning_rate": 0.00042804259043173855, + "loss": 3.5028, + "step": 19750 + }, + { + "epoch": 5.7675949662083426, + "grad_norm": 0.3170711398124695, + "learning_rate": 0.0004276050175029171, + "loss": 3.5181, + "step": 19800 + }, + { + "epoch": 5.78216033558611, + "grad_norm": 0.3340502083301544, + "learning_rate": 0.0004271674445740957, + "loss": 3.4968, + "step": 19850 + }, + { + "epoch": 5.796725704963878, + "grad_norm": 0.34251272678375244, + "learning_rate": 0.0004267298716452741, + "loss": 3.5079, + "step": 19900 + }, + { + "epoch": 5.8112910743416455, + "grad_norm": 0.3394465446472168, + "learning_rate": 0.0004262922987164527, + "loss": 3.5056, + "step": 19950 + }, + { + "epoch": 5.825856443719413, + "grad_norm": 0.32446908950805664, + "learning_rate": 0.00042585472578763125, + "loss": 3.5029, + "step": 20000 + }, + { + "epoch": 5.825856443719413, + "eval_accuracy": 0.36320252686510796, + "eval_loss": 3.60426664352417, + "eval_runtime": 180.5652, + "eval_samples_per_second": 92.177, + "eval_steps_per_second": 5.765, + "step": 20000 + }, + { + "epoch": 5.84042181309718, + "grad_norm": 0.3498161733150482, + "learning_rate": 0.0004254171528588098, + "loss": 3.5086, + "step": 20050 + }, + { + "epoch": 5.8549871824749475, + "grad_norm": 0.33967551589012146, + "learning_rate": 0.0004249795799299883, + "loss": 3.5126, + "step": 20100 + }, + { + "epoch": 5.869552551852715, + "grad_norm": 0.3366953730583191, + "learning_rate": 0.0004245420070011668, + "loss": 3.5301, + "step": 20150 + }, + { + "epoch": 5.884117921230482, + "grad_norm": 0.33286792039871216, + "learning_rate": 0.00042410443407234536, + "loss": 3.511, + "step": 20200 + }, + { + "epoch": 5.89868329060825, + "grad_norm": 0.34662094712257385, + "learning_rate": 0.00042366686114352385, + "loss": 3.512, + "step": 20250 + }, + { + "epoch": 5.913248659986017, + "grad_norm": 0.3202279508113861, + "learning_rate": 0.0004232292882147024, + "loss": 3.5006, + "step": 20300 + }, + { + "epoch": 5.927814029363785, + "grad_norm": 0.34777122735977173, + "learning_rate": 0.000422791715285881, + "loss": 3.521, + "step": 20350 + }, + { + "epoch": 5.9423793987415525, + "grad_norm": 0.34444618225097656, + "learning_rate": 0.00042235414235705947, + "loss": 3.5126, + "step": 20400 + }, + { + "epoch": 5.956944768119319, + "grad_norm": 0.3303092122077942, + "learning_rate": 0.000421916569428238, + "loss": 3.516, + "step": 20450 + }, + { + "epoch": 5.971510137497087, + "grad_norm": 0.34319791197776794, + "learning_rate": 0.00042147899649941654, + "loss": 3.5029, + "step": 20500 + }, + { + "epoch": 5.986075506874855, + "grad_norm": 0.33462879061698914, + "learning_rate": 0.0004210414235705951, + "loss": 3.509, + "step": 20550 + }, + { + "epoch": 6.0005826147751105, + "grad_norm": 0.332768976688385, + "learning_rate": 0.00042060385064177357, + "loss": 3.5038, + "step": 20600 + }, + { + "epoch": 6.015147984152878, + "grad_norm": 0.32959234714508057, + "learning_rate": 0.0004201662777129521, + "loss": 3.3948, + "step": 20650 + }, + { + "epoch": 6.029713353530646, + "grad_norm": 0.3324235677719116, + "learning_rate": 0.00041972870478413065, + "loss": 3.4112, + "step": 20700 + }, + { + "epoch": 6.044278722908413, + "grad_norm": 0.3403053879737854, + "learning_rate": 0.00041929113185530914, + "loss": 3.4081, + "step": 20750 + }, + { + "epoch": 6.05884409228618, + "grad_norm": 0.3473146855831146, + "learning_rate": 0.00041885355892648773, + "loss": 3.4049, + "step": 20800 + }, + { + "epoch": 6.073409461663948, + "grad_norm": 0.34440669417381287, + "learning_rate": 0.00041841598599766627, + "loss": 3.4084, + "step": 20850 + }, + { + "epoch": 6.087974831041715, + "grad_norm": 0.3304244875907898, + "learning_rate": 0.00041797841306884476, + "loss": 3.4264, + "step": 20900 + }, + { + "epoch": 6.102540200419483, + "grad_norm": 0.33876416087150574, + "learning_rate": 0.0004175408401400233, + "loss": 3.415, + "step": 20950 + }, + { + "epoch": 6.11710556979725, + "grad_norm": 0.3384806215763092, + "learning_rate": 0.00041710326721120184, + "loss": 3.4399, + "step": 21000 + }, + { + "epoch": 6.11710556979725, + "eval_accuracy": 0.3637827680479111, + "eval_loss": 3.6081302165985107, + "eval_runtime": 180.2744, + "eval_samples_per_second": 92.326, + "eval_steps_per_second": 5.775, + "step": 21000 + }, + { + "epoch": 6.1316709391750175, + "grad_norm": 0.363090455532074, + "learning_rate": 0.0004166656942823804, + "loss": 3.4368, + "step": 21050 + }, + { + "epoch": 6.146236308552785, + "grad_norm": 0.3336644470691681, + "learning_rate": 0.00041622812135355886, + "loss": 3.4255, + "step": 21100 + }, + { + "epoch": 6.160801677930552, + "grad_norm": 0.3573184311389923, + "learning_rate": 0.0004157905484247374, + "loss": 3.43, + "step": 21150 + }, + { + "epoch": 6.1753670473083195, + "grad_norm": 0.3469174802303314, + "learning_rate": 0.000415352975495916, + "loss": 3.4248, + "step": 21200 + }, + { + "epoch": 6.189932416686087, + "grad_norm": 0.33994483947753906, + "learning_rate": 0.0004149154025670945, + "loss": 3.4331, + "step": 21250 + }, + { + "epoch": 6.204497786063855, + "grad_norm": 0.34334084391593933, + "learning_rate": 0.000414477829638273, + "loss": 3.4336, + "step": 21300 + }, + { + "epoch": 6.219063155441622, + "grad_norm": 0.3307756185531616, + "learning_rate": 0.00041404025670945156, + "loss": 3.4457, + "step": 21350 + }, + { + "epoch": 6.233628524819389, + "grad_norm": 0.3440045118331909, + "learning_rate": 0.00041360268378063005, + "loss": 3.4505, + "step": 21400 + }, + { + "epoch": 6.248193894197157, + "grad_norm": 0.32408636808395386, + "learning_rate": 0.0004131651108518086, + "loss": 3.4495, + "step": 21450 + }, + { + "epoch": 6.2627592635749245, + "grad_norm": 0.3418697714805603, + "learning_rate": 0.0004127275379229871, + "loss": 3.4413, + "step": 21500 + }, + { + "epoch": 6.277324632952691, + "grad_norm": 0.3394606113433838, + "learning_rate": 0.00041228996499416567, + "loss": 3.4419, + "step": 21550 + }, + { + "epoch": 6.291890002330459, + "grad_norm": 0.3462677299976349, + "learning_rate": 0.00041185239206534415, + "loss": 3.4454, + "step": 21600 + }, + { + "epoch": 6.306455371708227, + "grad_norm": 0.33543628454208374, + "learning_rate": 0.00041141481913652275, + "loss": 3.4359, + "step": 21650 + }, + { + "epoch": 6.321020741085994, + "grad_norm": 0.3553283214569092, + "learning_rate": 0.0004109772462077013, + "loss": 3.4364, + "step": 21700 + }, + { + "epoch": 6.335586110463761, + "grad_norm": 0.3360411822795868, + "learning_rate": 0.00041053967327887977, + "loss": 3.4451, + "step": 21750 + }, + { + "epoch": 6.350151479841529, + "grad_norm": 0.33588552474975586, + "learning_rate": 0.0004101021003500583, + "loss": 3.439, + "step": 21800 + }, + { + "epoch": 6.364716849219296, + "grad_norm": 0.3321113884449005, + "learning_rate": 0.00040966452742123685, + "loss": 3.4385, + "step": 21850 + }, + { + "epoch": 6.379282218597064, + "grad_norm": 0.3304464817047119, + "learning_rate": 0.00040922695449241534, + "loss": 3.4573, + "step": 21900 + }, + { + "epoch": 6.393847587974831, + "grad_norm": 0.3388485014438629, + "learning_rate": 0.0004087893815635939, + "loss": 3.4549, + "step": 21950 + }, + { + "epoch": 6.408412957352598, + "grad_norm": 0.36697396636009216, + "learning_rate": 0.0004083518086347724, + "loss": 3.4438, + "step": 22000 + }, + { + "epoch": 6.408412957352598, + "eval_accuracy": 0.3643471363716102, + "eval_loss": 3.600018262863159, + "eval_runtime": 180.2847, + "eval_samples_per_second": 92.321, + "eval_steps_per_second": 5.774, + "step": 22000 + }, + { + "epoch": 6.422978326730366, + "grad_norm": 0.3477044999599457, + "learning_rate": 0.000407914235705951, + "loss": 3.4598, + "step": 22050 + }, + { + "epoch": 6.437543696108134, + "grad_norm": 0.32996484637260437, + "learning_rate": 0.00040747666277712944, + "loss": 3.4483, + "step": 22100 + }, + { + "epoch": 6.4521090654859, + "grad_norm": 0.33145061135292053, + "learning_rate": 0.00040703908984830804, + "loss": 3.4543, + "step": 22150 + }, + { + "epoch": 6.466674434863668, + "grad_norm": 0.33102595806121826, + "learning_rate": 0.0004066015169194866, + "loss": 3.437, + "step": 22200 + }, + { + "epoch": 6.481239804241436, + "grad_norm": 0.34182071685791016, + "learning_rate": 0.00040616394399066506, + "loss": 3.4591, + "step": 22250 + }, + { + "epoch": 6.495805173619203, + "grad_norm": 0.35360977053642273, + "learning_rate": 0.0004057263710618436, + "loss": 3.4661, + "step": 22300 + }, + { + "epoch": 6.51037054299697, + "grad_norm": 0.34044864773750305, + "learning_rate": 0.00040528879813302214, + "loss": 3.4658, + "step": 22350 + }, + { + "epoch": 6.524935912374738, + "grad_norm": 0.3496011793613434, + "learning_rate": 0.00040485122520420063, + "loss": 3.455, + "step": 22400 + }, + { + "epoch": 6.539501281752505, + "grad_norm": 0.31914111971855164, + "learning_rate": 0.00040441365227537917, + "loss": 3.4606, + "step": 22450 + }, + { + "epoch": 6.554066651130273, + "grad_norm": 0.32800233364105225, + "learning_rate": 0.0004039760793465577, + "loss": 3.4562, + "step": 22500 + }, + { + "epoch": 6.56863202050804, + "grad_norm": 0.33165040612220764, + "learning_rate": 0.0004035385064177363, + "loss": 3.455, + "step": 22550 + }, + { + "epoch": 6.583197389885807, + "grad_norm": 0.3741567134857178, + "learning_rate": 0.0004031009334889148, + "loss": 3.4562, + "step": 22600 + }, + { + "epoch": 6.597762759263575, + "grad_norm": 0.35394638776779175, + "learning_rate": 0.00040266336056009333, + "loss": 3.4607, + "step": 22650 + }, + { + "epoch": 6.612328128641343, + "grad_norm": 0.3237501084804535, + "learning_rate": 0.00040222578763127187, + "loss": 3.4658, + "step": 22700 + }, + { + "epoch": 6.626893498019109, + "grad_norm": 0.34644386172294617, + "learning_rate": 0.00040178821470245035, + "loss": 3.4642, + "step": 22750 + }, + { + "epoch": 6.641458867396877, + "grad_norm": 0.34503695368766785, + "learning_rate": 0.0004013506417736289, + "loss": 3.4739, + "step": 22800 + }, + { + "epoch": 6.656024236774645, + "grad_norm": 0.3343126177787781, + "learning_rate": 0.00040091306884480743, + "loss": 3.4505, + "step": 22850 + }, + { + "epoch": 6.670589606152412, + "grad_norm": 0.33412104845046997, + "learning_rate": 0.0004004754959159859, + "loss": 3.4603, + "step": 22900 + }, + { + "epoch": 6.685154975530179, + "grad_norm": 0.32703226804733276, + "learning_rate": 0.00040003792298716446, + "loss": 3.4612, + "step": 22950 + }, + { + "epoch": 6.699720344907947, + "grad_norm": 0.32835039496421814, + "learning_rate": 0.00039960035005834305, + "loss": 3.4622, + "step": 23000 + }, + { + "epoch": 6.699720344907947, + "eval_accuracy": 0.36494254495311274, + "eval_loss": 3.5912156105041504, + "eval_runtime": 180.4184, + "eval_samples_per_second": 92.252, + "eval_steps_per_second": 5.77, + "step": 23000 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.3277016878128052, + "learning_rate": 0.0003991627771295216, + "loss": 3.4609, + "step": 23050 + }, + { + "epoch": 6.728851083663482, + "grad_norm": 0.3436872363090515, + "learning_rate": 0.0003987252042007001, + "loss": 3.4605, + "step": 23100 + }, + { + "epoch": 6.743416453041249, + "grad_norm": 0.32483038306236267, + "learning_rate": 0.0003982876312718786, + "loss": 3.468, + "step": 23150 + }, + { + "epoch": 6.7579818224190165, + "grad_norm": 0.3559059500694275, + "learning_rate": 0.00039785005834305716, + "loss": 3.4694, + "step": 23200 + }, + { + "epoch": 6.772547191796784, + "grad_norm": 0.34260398149490356, + "learning_rate": 0.00039741248541423564, + "loss": 3.4727, + "step": 23250 + }, + { + "epoch": 6.787112561174552, + "grad_norm": 0.32523587346076965, + "learning_rate": 0.0003969749124854142, + "loss": 3.4571, + "step": 23300 + }, + { + "epoch": 6.8016779305523185, + "grad_norm": 0.3347657322883606, + "learning_rate": 0.0003965373395565927, + "loss": 3.4717, + "step": 23350 + }, + { + "epoch": 6.816243299930086, + "grad_norm": 0.33626583218574524, + "learning_rate": 0.0003960997666277712, + "loss": 3.4646, + "step": 23400 + }, + { + "epoch": 6.830808669307854, + "grad_norm": 0.36179831624031067, + "learning_rate": 0.0003956621936989498, + "loss": 3.4717, + "step": 23450 + }, + { + "epoch": 6.845374038685621, + "grad_norm": 0.34891805052757263, + "learning_rate": 0.00039522462077012834, + "loss": 3.4699, + "step": 23500 + }, + { + "epoch": 6.859939408063388, + "grad_norm": 0.37656670808792114, + "learning_rate": 0.0003947870478413069, + "loss": 3.4674, + "step": 23550 + }, + { + "epoch": 6.874504777441156, + "grad_norm": 0.3371601402759552, + "learning_rate": 0.00039434947491248537, + "loss": 3.4684, + "step": 23600 + }, + { + "epoch": 6.8890701468189235, + "grad_norm": 0.3327315151691437, + "learning_rate": 0.0003939119019836639, + "loss": 3.4778, + "step": 23650 + }, + { + "epoch": 6.903635516196691, + "grad_norm": 0.33458471298217773, + "learning_rate": 0.00039347432905484245, + "loss": 3.4688, + "step": 23700 + }, + { + "epoch": 6.918200885574458, + "grad_norm": 0.3311387896537781, + "learning_rate": 0.00039303675612602094, + "loss": 3.4707, + "step": 23750 + }, + { + "epoch": 6.9327662549522255, + "grad_norm": 0.33576178550720215, + "learning_rate": 0.0003925991831971995, + "loss": 3.4649, + "step": 23800 + }, + { + "epoch": 6.947331624329993, + "grad_norm": 0.316501259803772, + "learning_rate": 0.00039216161026837807, + "loss": 3.4702, + "step": 23850 + }, + { + "epoch": 6.961896993707761, + "grad_norm": 0.3234950006008148, + "learning_rate": 0.00039172403733955656, + "loss": 3.4551, + "step": 23900 + }, + { + "epoch": 6.976462363085528, + "grad_norm": 0.34549012780189514, + "learning_rate": 0.0003912864644107351, + "loss": 3.4708, + "step": 23950 + }, + { + "epoch": 6.991027732463295, + "grad_norm": 0.33629485964775085, + "learning_rate": 0.00039084889148191364, + "loss": 3.4761, + "step": 24000 + }, + { + "epoch": 6.991027732463295, + "eval_accuracy": 0.3658422421224764, + "eval_loss": 3.5796563625335693, + "eval_runtime": 180.4423, + "eval_samples_per_second": 92.24, + "eval_steps_per_second": 5.769, + "step": 24000 + }, + { + "epoch": 7.005534840363552, + "grad_norm": 0.3567199409008026, + "learning_rate": 0.0003904113185530922, + "loss": 3.4224, + "step": 24050 + }, + { + "epoch": 7.020100209741319, + "grad_norm": 0.33759805560112, + "learning_rate": 0.00038997374562427066, + "loss": 3.3554, + "step": 24100 + }, + { + "epoch": 7.034665579119086, + "grad_norm": 0.3463039696216583, + "learning_rate": 0.0003895361726954492, + "loss": 3.3629, + "step": 24150 + }, + { + "epoch": 7.049230948496854, + "grad_norm": 0.34043920040130615, + "learning_rate": 0.00038909859976662774, + "loss": 3.3713, + "step": 24200 + }, + { + "epoch": 7.063796317874622, + "grad_norm": 0.3372809886932373, + "learning_rate": 0.0003886610268378062, + "loss": 3.3729, + "step": 24250 + }, + { + "epoch": 7.0783616872523885, + "grad_norm": 0.3626004159450531, + "learning_rate": 0.0003882234539089848, + "loss": 3.3779, + "step": 24300 + }, + { + "epoch": 7.092927056630156, + "grad_norm": 0.3814680278301239, + "learning_rate": 0.00038778588098016336, + "loss": 3.3831, + "step": 24350 + }, + { + "epoch": 7.107492426007924, + "grad_norm": 0.3421391248703003, + "learning_rate": 0.00038734830805134185, + "loss": 3.3799, + "step": 24400 + }, + { + "epoch": 7.122057795385691, + "grad_norm": 0.34770506620407104, + "learning_rate": 0.0003869107351225204, + "loss": 3.3751, + "step": 24450 + }, + { + "epoch": 7.136623164763458, + "grad_norm": 0.348093181848526, + "learning_rate": 0.0003864731621936989, + "loss": 3.3744, + "step": 24500 + }, + { + "epoch": 7.151188534141226, + "grad_norm": 0.34899893403053284, + "learning_rate": 0.00038603558926487747, + "loss": 3.378, + "step": 24550 + }, + { + "epoch": 7.165753903518993, + "grad_norm": 0.32636308670043945, + "learning_rate": 0.00038559801633605595, + "loss": 3.3811, + "step": 24600 + }, + { + "epoch": 7.180319272896761, + "grad_norm": 0.32693901658058167, + "learning_rate": 0.0003851604434072345, + "loss": 3.3978, + "step": 24650 + }, + { + "epoch": 7.194884642274528, + "grad_norm": 0.35337990522384644, + "learning_rate": 0.0003847228704784131, + "loss": 3.4016, + "step": 24700 + }, + { + "epoch": 7.2094500116522955, + "grad_norm": 0.33998918533325195, + "learning_rate": 0.0003842852975495915, + "loss": 3.3913, + "step": 24750 + }, + { + "epoch": 7.224015381030063, + "grad_norm": 0.34085580706596375, + "learning_rate": 0.0003838477246207701, + "loss": 3.3876, + "step": 24800 + }, + { + "epoch": 7.238580750407831, + "grad_norm": 0.34505322575569153, + "learning_rate": 0.00038341015169194865, + "loss": 3.4013, + "step": 24850 + }, + { + "epoch": 7.2531461197855975, + "grad_norm": 0.35665056109428406, + "learning_rate": 0.00038297257876312714, + "loss": 3.3945, + "step": 24900 + }, + { + "epoch": 7.267711489163365, + "grad_norm": 0.33130306005477905, + "learning_rate": 0.0003825350058343057, + "loss": 3.3973, + "step": 24950 + }, + { + "epoch": 7.282276858541133, + "grad_norm": 0.33717137575149536, + "learning_rate": 0.0003820974329054842, + "loss": 3.4035, + "step": 25000 + }, + { + "epoch": 7.282276858541133, + "eval_accuracy": 0.36597134137652254, + "eval_loss": 3.5882999897003174, + "eval_runtime": 180.2331, + "eval_samples_per_second": 92.347, + "eval_steps_per_second": 5.776, + "step": 25000 + }, + { + "epoch": 7.2968422279189, + "grad_norm": 0.343801349401474, + "learning_rate": 0.00038165985997666276, + "loss": 3.3994, + "step": 25050 + }, + { + "epoch": 7.311407597296667, + "grad_norm": 0.34225597977638245, + "learning_rate": 0.00038122228704784124, + "loss": 3.4, + "step": 25100 + }, + { + "epoch": 7.325972966674435, + "grad_norm": 0.3473186492919922, + "learning_rate": 0.0003807847141190198, + "loss": 3.397, + "step": 25150 + }, + { + "epoch": 7.3405383360522025, + "grad_norm": 0.3287709653377533, + "learning_rate": 0.0003803471411901984, + "loss": 3.4058, + "step": 25200 + }, + { + "epoch": 7.35510370542997, + "grad_norm": 0.351204514503479, + "learning_rate": 0.00037990956826137686, + "loss": 3.4163, + "step": 25250 + }, + { + "epoch": 7.369669074807737, + "grad_norm": 0.35390347242355347, + "learning_rate": 0.0003794719953325554, + "loss": 3.4228, + "step": 25300 + }, + { + "epoch": 7.384234444185505, + "grad_norm": 0.3401016891002655, + "learning_rate": 0.00037903442240373394, + "loss": 3.406, + "step": 25350 + }, + { + "epoch": 7.398799813563272, + "grad_norm": 0.35391902923583984, + "learning_rate": 0.00037859684947491243, + "loss": 3.4097, + "step": 25400 + }, + { + "epoch": 7.413365182941039, + "grad_norm": 0.342098206281662, + "learning_rate": 0.00037815927654609097, + "loss": 3.4266, + "step": 25450 + }, + { + "epoch": 7.427930552318807, + "grad_norm": 0.34706324338912964, + "learning_rate": 0.0003777217036172695, + "loss": 3.4083, + "step": 25500 + }, + { + "epoch": 7.442495921696574, + "grad_norm": 0.35251185297966003, + "learning_rate": 0.00037728413068844805, + "loss": 3.4125, + "step": 25550 + }, + { + "epoch": 7.457061291074342, + "grad_norm": 0.3509294390678406, + "learning_rate": 0.00037684655775962653, + "loss": 3.4203, + "step": 25600 + }, + { + "epoch": 7.471626660452109, + "grad_norm": 0.3560699224472046, + "learning_rate": 0.00037640898483080513, + "loss": 3.4272, + "step": 25650 + }, + { + "epoch": 7.486192029829876, + "grad_norm": 0.34861257672309875, + "learning_rate": 0.00037597141190198367, + "loss": 3.4182, + "step": 25700 + }, + { + "epoch": 7.500757399207644, + "grad_norm": 0.33859142661094666, + "learning_rate": 0.00037553383897316215, + "loss": 3.4299, + "step": 25750 + }, + { + "epoch": 7.515322768585412, + "grad_norm": 0.35380759835243225, + "learning_rate": 0.0003750962660443407, + "loss": 3.4201, + "step": 25800 + }, + { + "epoch": 7.529888137963178, + "grad_norm": 0.34941068291664124, + "learning_rate": 0.00037465869311551923, + "loss": 3.4167, + "step": 25850 + }, + { + "epoch": 7.544453507340946, + "grad_norm": 0.35646477341651917, + "learning_rate": 0.0003742211201866977, + "loss": 3.4306, + "step": 25900 + }, + { + "epoch": 7.559018876718714, + "grad_norm": 0.35378143191337585, + "learning_rate": 0.00037378354725787626, + "loss": 3.4086, + "step": 25950 + }, + { + "epoch": 7.573584246096481, + "grad_norm": 0.3527311384677887, + "learning_rate": 0.0003733459743290548, + "loss": 3.4207, + "step": 26000 + }, + { + "epoch": 7.573584246096481, + "eval_accuracy": 0.36644117800600207, + "eval_loss": 3.5814883708953857, + "eval_runtime": 180.4499, + "eval_samples_per_second": 92.236, + "eval_steps_per_second": 5.769, + "step": 26000 + }, + { + "epoch": 7.588149615474248, + "grad_norm": 0.3543234169483185, + "learning_rate": 0.0003729084014002334, + "loss": 3.4278, + "step": 26050 + }, + { + "epoch": 7.602714984852016, + "grad_norm": 0.35281285643577576, + "learning_rate": 0.0003724708284714119, + "loss": 3.421, + "step": 26100 + }, + { + "epoch": 7.617280354229783, + "grad_norm": 0.3394710123538971, + "learning_rate": 0.0003720332555425904, + "loss": 3.4238, + "step": 26150 + }, + { + "epoch": 7.631845723607551, + "grad_norm": 0.34304413199424744, + "learning_rate": 0.00037159568261376896, + "loss": 3.4145, + "step": 26200 + }, + { + "epoch": 7.646411092985318, + "grad_norm": 0.3429325520992279, + "learning_rate": 0.00037115810968494744, + "loss": 3.4268, + "step": 26250 + }, + { + "epoch": 7.660976462363085, + "grad_norm": 0.3383738100528717, + "learning_rate": 0.000370720536756126, + "loss": 3.4228, + "step": 26300 + }, + { + "epoch": 7.675541831740853, + "grad_norm": 0.3476937413215637, + "learning_rate": 0.0003702829638273045, + "loss": 3.4207, + "step": 26350 + }, + { + "epoch": 7.690107201118621, + "grad_norm": 0.342341810464859, + "learning_rate": 0.000369845390898483, + "loss": 3.4207, + "step": 26400 + }, + { + "epoch": 7.704672570496387, + "grad_norm": 0.35490912199020386, + "learning_rate": 0.00036940781796966155, + "loss": 3.4281, + "step": 26450 + }, + { + "epoch": 7.719237939874155, + "grad_norm": 0.34896060824394226, + "learning_rate": 0.00036897024504084014, + "loss": 3.4361, + "step": 26500 + }, + { + "epoch": 7.733803309251923, + "grad_norm": 0.3462172746658325, + "learning_rate": 0.0003685326721120187, + "loss": 3.425, + "step": 26550 + }, + { + "epoch": 7.74836867862969, + "grad_norm": 0.35829275846481323, + "learning_rate": 0.00036809509918319717, + "loss": 3.4346, + "step": 26600 + }, + { + "epoch": 7.762934048007457, + "grad_norm": 0.3367747664451599, + "learning_rate": 0.0003676575262543757, + "loss": 3.4247, + "step": 26650 + }, + { + "epoch": 7.777499417385225, + "grad_norm": 0.33087530732154846, + "learning_rate": 0.00036721995332555425, + "loss": 3.432, + "step": 26700 + }, + { + "epoch": 7.792064786762992, + "grad_norm": 0.3543736934661865, + "learning_rate": 0.00036678238039673274, + "loss": 3.4379, + "step": 26750 + }, + { + "epoch": 7.80663015614076, + "grad_norm": 0.3304196894168854, + "learning_rate": 0.0003663448074679113, + "loss": 3.4238, + "step": 26800 + }, + { + "epoch": 7.821195525518527, + "grad_norm": 0.35223904252052307, + "learning_rate": 0.0003659072345390898, + "loss": 3.426, + "step": 26850 + }, + { + "epoch": 7.8357608948962945, + "grad_norm": 0.34050217270851135, + "learning_rate": 0.0003654696616102683, + "loss": 3.4172, + "step": 26900 + }, + { + "epoch": 7.850326264274062, + "grad_norm": 0.3450503349304199, + "learning_rate": 0.00036503208868144684, + "loss": 3.4337, + "step": 26950 + }, + { + "epoch": 7.86489163365183, + "grad_norm": 0.3508300483226776, + "learning_rate": 0.00036459451575262543, + "loss": 3.4351, + "step": 27000 + }, + { + "epoch": 7.86489163365183, + "eval_accuracy": 0.36722659058981666, + "eval_loss": 3.571290969848633, + "eval_runtime": 180.2666, + "eval_samples_per_second": 92.33, + "eval_steps_per_second": 5.775, + "step": 27000 + }, + { + "epoch": 7.8794570030295965, + "grad_norm": 0.35574260354042053, + "learning_rate": 0.000364156942823804, + "loss": 3.4341, + "step": 27050 + }, + { + "epoch": 7.894022372407364, + "grad_norm": 0.34330523014068604, + "learning_rate": 0.00036371936989498246, + "loss": 3.4396, + "step": 27100 + }, + { + "epoch": 7.908587741785132, + "grad_norm": 0.35326018929481506, + "learning_rate": 0.000363281796966161, + "loss": 3.4192, + "step": 27150 + }, + { + "epoch": 7.923153111162899, + "grad_norm": 0.356656938791275, + "learning_rate": 0.00036284422403733954, + "loss": 3.4359, + "step": 27200 + }, + { + "epoch": 7.937718480540666, + "grad_norm": 0.32995936274528503, + "learning_rate": 0.000362406651108518, + "loss": 3.4363, + "step": 27250 + }, + { + "epoch": 7.952283849918434, + "grad_norm": 0.3421317934989929, + "learning_rate": 0.00036196907817969657, + "loss": 3.4263, + "step": 27300 + }, + { + "epoch": 7.9668492192962015, + "grad_norm": 0.33741775155067444, + "learning_rate": 0.0003615315052508751, + "loss": 3.4332, + "step": 27350 + }, + { + "epoch": 7.981414588673969, + "grad_norm": 0.34820324182510376, + "learning_rate": 0.0003610939323220536, + "loss": 3.4311, + "step": 27400 + }, + { + "epoch": 7.995979958051736, + "grad_norm": 0.36536943912506104, + "learning_rate": 0.0003606563593932322, + "loss": 3.4287, + "step": 27450 + }, + { + "epoch": 8.010487065951992, + "grad_norm": 0.3467245399951935, + "learning_rate": 0.0003602187864644107, + "loss": 3.3578, + "step": 27500 + }, + { + "epoch": 8.02505243532976, + "grad_norm": 0.36606982350349426, + "learning_rate": 0.00035978121353558927, + "loss": 3.3254, + "step": 27550 + }, + { + "epoch": 8.039617804707527, + "grad_norm": 0.370090126991272, + "learning_rate": 0.00035934364060676775, + "loss": 3.3324, + "step": 27600 + }, + { + "epoch": 8.054183174085296, + "grad_norm": 0.3692021667957306, + "learning_rate": 0.0003589060676779463, + "loss": 3.3385, + "step": 27650 + }, + { + "epoch": 8.068748543463062, + "grad_norm": 0.35137107968330383, + "learning_rate": 0.00035846849474912483, + "loss": 3.3448, + "step": 27700 + }, + { + "epoch": 8.08331391284083, + "grad_norm": 0.3459080755710602, + "learning_rate": 0.0003580309218203033, + "loss": 3.3318, + "step": 27750 + }, + { + "epoch": 8.097879282218598, + "grad_norm": 0.35793742537498474, + "learning_rate": 0.00035759334889148186, + "loss": 3.3345, + "step": 27800 + }, + { + "epoch": 8.112444651596364, + "grad_norm": 0.35751616954803467, + "learning_rate": 0.00035715577596266045, + "loss": 3.3613, + "step": 27850 + }, + { + "epoch": 8.127010020974131, + "grad_norm": 0.3466125428676605, + "learning_rate": 0.00035671820303383894, + "loss": 3.3478, + "step": 27900 + }, + { + "epoch": 8.1415753903519, + "grad_norm": 0.3528430759906769, + "learning_rate": 0.0003562806301050175, + "loss": 3.3564, + "step": 27950 + }, + { + "epoch": 8.156140759729666, + "grad_norm": 0.36010900139808655, + "learning_rate": 0.000355843057176196, + "loss": 3.3456, + "step": 28000 + }, + { + "epoch": 8.156140759729666, + "eval_accuracy": 0.36759131361900715, + "eval_loss": 3.580268144607544, + "eval_runtime": 180.2651, + "eval_samples_per_second": 92.331, + "eval_steps_per_second": 5.775, + "step": 28000 + }, + { + "epoch": 8.170706129107435, + "grad_norm": 0.3616182804107666, + "learning_rate": 0.00035540548424737456, + "loss": 3.3554, + "step": 28050 + }, + { + "epoch": 8.185271498485202, + "grad_norm": 0.3429206311702728, + "learning_rate": 0.00035496791131855304, + "loss": 3.3689, + "step": 28100 + }, + { + "epoch": 8.199836867862969, + "grad_norm": 0.3601152300834656, + "learning_rate": 0.0003545303383897316, + "loss": 3.3647, + "step": 28150 + }, + { + "epoch": 8.214402237240737, + "grad_norm": 0.346986323595047, + "learning_rate": 0.0003540927654609101, + "loss": 3.3639, + "step": 28200 + }, + { + "epoch": 8.228967606618504, + "grad_norm": 0.3525499105453491, + "learning_rate": 0.0003536551925320886, + "loss": 3.3606, + "step": 28250 + }, + { + "epoch": 8.24353297599627, + "grad_norm": 0.3487248420715332, + "learning_rate": 0.0003532176196032672, + "loss": 3.3582, + "step": 28300 + }, + { + "epoch": 8.258098345374039, + "grad_norm": 0.3517068028450012, + "learning_rate": 0.00035278004667444574, + "loss": 3.3674, + "step": 28350 + }, + { + "epoch": 8.272663714751806, + "grad_norm": 0.3672351837158203, + "learning_rate": 0.00035234247374562423, + "loss": 3.3631, + "step": 28400 + }, + { + "epoch": 8.287229084129574, + "grad_norm": 0.3655698001384735, + "learning_rate": 0.00035190490081680277, + "loss": 3.3605, + "step": 28450 + }, + { + "epoch": 8.301794453507341, + "grad_norm": 0.3492221534252167, + "learning_rate": 0.0003514673278879813, + "loss": 3.3721, + "step": 28500 + }, + { + "epoch": 8.316359822885108, + "grad_norm": 0.33222460746765137, + "learning_rate": 0.00035102975495915985, + "loss": 3.3655, + "step": 28550 + }, + { + "epoch": 8.330925192262876, + "grad_norm": 0.3473198115825653, + "learning_rate": 0.00035059218203033833, + "loss": 3.3765, + "step": 28600 + }, + { + "epoch": 8.345490561640643, + "grad_norm": 0.3525267541408539, + "learning_rate": 0.0003501546091015169, + "loss": 3.3692, + "step": 28650 + }, + { + "epoch": 8.36005593101841, + "grad_norm": 0.3442334234714508, + "learning_rate": 0.00034971703617269547, + "loss": 3.3892, + "step": 28700 + }, + { + "epoch": 8.374621300396178, + "grad_norm": 0.35674968361854553, + "learning_rate": 0.0003492794632438739, + "loss": 3.378, + "step": 28750 + }, + { + "epoch": 8.389186669773945, + "grad_norm": 0.34580376744270325, + "learning_rate": 0.0003488418903150525, + "loss": 3.3797, + "step": 28800 + }, + { + "epoch": 8.403752039151712, + "grad_norm": 0.34604698419570923, + "learning_rate": 0.00034840431738623103, + "loss": 3.3674, + "step": 28850 + }, + { + "epoch": 8.41831740852948, + "grad_norm": 0.3592158854007721, + "learning_rate": 0.0003479667444574095, + "loss": 3.377, + "step": 28900 + }, + { + "epoch": 8.432882777907247, + "grad_norm": 0.359283447265625, + "learning_rate": 0.00034752917152858806, + "loss": 3.3884, + "step": 28950 + }, + { + "epoch": 8.447448147285016, + "grad_norm": 0.34626027941703796, + "learning_rate": 0.0003470915985997666, + "loss": 3.379, + "step": 29000 + }, + { + "epoch": 8.447448147285016, + "eval_accuracy": 0.3679311103805677, + "eval_loss": 3.57328462600708, + "eval_runtime": 180.157, + "eval_samples_per_second": 92.386, + "eval_steps_per_second": 5.778, + "step": 29000 + }, + { + "epoch": 8.462013516662783, + "grad_norm": 0.3666757643222809, + "learning_rate": 0.00034665402567094514, + "loss": 3.3796, + "step": 29050 + }, + { + "epoch": 8.47657888604055, + "grad_norm": 0.3576624393463135, + "learning_rate": 0.0003462164527421236, + "loss": 3.3846, + "step": 29100 + }, + { + "epoch": 8.491144255418318, + "grad_norm": 0.36986008286476135, + "learning_rate": 0.00034577887981330216, + "loss": 3.3789, + "step": 29150 + }, + { + "epoch": 8.505709624796085, + "grad_norm": 0.34708988666534424, + "learning_rate": 0.00034534130688448076, + "loss": 3.3752, + "step": 29200 + }, + { + "epoch": 8.520274994173853, + "grad_norm": 0.36563989520072937, + "learning_rate": 0.00034490373395565924, + "loss": 3.3733, + "step": 29250 + }, + { + "epoch": 8.53484036355162, + "grad_norm": 0.36509010195732117, + "learning_rate": 0.0003444661610268378, + "loss": 3.3893, + "step": 29300 + }, + { + "epoch": 8.549405732929387, + "grad_norm": 0.3598864674568176, + "learning_rate": 0.0003440285880980163, + "loss": 3.3822, + "step": 29350 + }, + { + "epoch": 8.563971102307155, + "grad_norm": 0.3573833405971527, + "learning_rate": 0.0003435910151691948, + "loss": 3.3875, + "step": 29400 + }, + { + "epoch": 8.578536471684922, + "grad_norm": 0.33812621235847473, + "learning_rate": 0.00034315344224037335, + "loss": 3.3907, + "step": 29450 + }, + { + "epoch": 8.593101841062689, + "grad_norm": 0.3446572422981262, + "learning_rate": 0.0003427158693115519, + "loss": 3.3985, + "step": 29500 + }, + { + "epoch": 8.607667210440457, + "grad_norm": 0.34378212690353394, + "learning_rate": 0.00034227829638273043, + "loss": 3.3868, + "step": 29550 + }, + { + "epoch": 8.622232579818224, + "grad_norm": 0.35360199213027954, + "learning_rate": 0.0003418407234539089, + "loss": 3.4024, + "step": 29600 + }, + { + "epoch": 8.63679794919599, + "grad_norm": 0.35581034421920776, + "learning_rate": 0.0003414031505250875, + "loss": 3.3898, + "step": 29650 + }, + { + "epoch": 8.65136331857376, + "grad_norm": 0.35615333914756775, + "learning_rate": 0.00034096557759626605, + "loss": 3.3959, + "step": 29700 + }, + { + "epoch": 8.665928687951526, + "grad_norm": 0.35061442852020264, + "learning_rate": 0.00034052800466744453, + "loss": 3.39, + "step": 29750 + }, + { + "epoch": 8.680494057329295, + "grad_norm": 0.3618820905685425, + "learning_rate": 0.0003400904317386231, + "loss": 3.3843, + "step": 29800 + }, + { + "epoch": 8.695059426707061, + "grad_norm": 0.34694838523864746, + "learning_rate": 0.0003396528588098016, + "loss": 3.3842, + "step": 29850 + }, + { + "epoch": 8.709624796084828, + "grad_norm": 0.34534159302711487, + "learning_rate": 0.0003392152858809801, + "loss": 3.392, + "step": 29900 + }, + { + "epoch": 8.724190165462597, + "grad_norm": 0.35484299063682556, + "learning_rate": 0.00033877771295215864, + "loss": 3.3917, + "step": 29950 + }, + { + "epoch": 8.738755534840363, + "grad_norm": 0.34446921944618225, + "learning_rate": 0.0003383401400233372, + "loss": 3.3926, + "step": 30000 + }, + { + "epoch": 8.738755534840363, + "eval_accuracy": 0.3686057656808563, + "eval_loss": 3.563647985458374, + "eval_runtime": 180.6198, + "eval_samples_per_second": 92.149, + "eval_steps_per_second": 5.763, + "step": 30000 + }, + { + "epoch": 8.753320904218132, + "grad_norm": 0.38022834062576294, + "learning_rate": 0.0003379025670945158, + "loss": 3.3971, + "step": 30050 + }, + { + "epoch": 8.767886273595899, + "grad_norm": 0.3438722491264343, + "learning_rate": 0.00033746499416569426, + "loss": 3.3964, + "step": 30100 + }, + { + "epoch": 8.782451642973665, + "grad_norm": 0.3680688142776489, + "learning_rate": 0.0003370274212368728, + "loss": 3.4007, + "step": 30150 + }, + { + "epoch": 8.797017012351434, + "grad_norm": 0.3484303653240204, + "learning_rate": 0.00033658984830805134, + "loss": 3.3908, + "step": 30200 + }, + { + "epoch": 8.8115823817292, + "grad_norm": 0.34887251257896423, + "learning_rate": 0.0003361522753792298, + "loss": 3.3864, + "step": 30250 + }, + { + "epoch": 8.826147751106967, + "grad_norm": 0.38353490829467773, + "learning_rate": 0.00033571470245040837, + "loss": 3.3924, + "step": 30300 + }, + { + "epoch": 8.840713120484736, + "grad_norm": 0.3619522750377655, + "learning_rate": 0.0003352771295215869, + "loss": 3.3943, + "step": 30350 + }, + { + "epoch": 8.855278489862503, + "grad_norm": 0.34908801317214966, + "learning_rate": 0.0003348395565927654, + "loss": 3.4001, + "step": 30400 + }, + { + "epoch": 8.86984385924027, + "grad_norm": 0.3426980972290039, + "learning_rate": 0.00033440198366394393, + "loss": 3.3924, + "step": 30450 + }, + { + "epoch": 8.884409228618038, + "grad_norm": 0.3510425090789795, + "learning_rate": 0.0003339644107351225, + "loss": 3.3992, + "step": 30500 + }, + { + "epoch": 8.898974597995805, + "grad_norm": 0.3653899133205414, + "learning_rate": 0.00033352683780630107, + "loss": 3.3945, + "step": 30550 + }, + { + "epoch": 8.913539967373573, + "grad_norm": 0.3542352318763733, + "learning_rate": 0.00033308926487747955, + "loss": 3.4124, + "step": 30600 + }, + { + "epoch": 8.92810533675134, + "grad_norm": 0.3585004508495331, + "learning_rate": 0.0003326516919486581, + "loss": 3.3896, + "step": 30650 + }, + { + "epoch": 8.942670706129107, + "grad_norm": 0.3748304843902588, + "learning_rate": 0.00033221411901983663, + "loss": 3.3986, + "step": 30700 + }, + { + "epoch": 8.957236075506875, + "grad_norm": 0.3452504277229309, + "learning_rate": 0.0003317765460910151, + "loss": 3.3955, + "step": 30750 + }, + { + "epoch": 8.971801444884642, + "grad_norm": 0.3590230941772461, + "learning_rate": 0.00033133897316219366, + "loss": 3.396, + "step": 30800 + }, + { + "epoch": 8.986366814262409, + "grad_norm": 0.3469237685203552, + "learning_rate": 0.0003309014002333722, + "loss": 3.3886, + "step": 30850 + }, + { + "epoch": 9.000873922162667, + "grad_norm": 0.34952783584594727, + "learning_rate": 0.0003304638273045507, + "loss": 3.3905, + "step": 30900 + }, + { + "epoch": 9.015439291540433, + "grad_norm": 0.3712822198867798, + "learning_rate": 0.0003300262543757292, + "loss": 3.2828, + "step": 30950 + }, + { + "epoch": 9.0300046609182, + "grad_norm": 0.371115505695343, + "learning_rate": 0.0003295886814469078, + "loss": 3.305, + "step": 31000 + }, + { + "epoch": 9.0300046609182, + "eval_accuracy": 0.36886643330036484, + "eval_loss": 3.5686051845550537, + "eval_runtime": 181.9038, + "eval_samples_per_second": 91.499, + "eval_steps_per_second": 5.723, + "step": 31000 + }, + { + "epoch": 9.044570030295969, + "grad_norm": 0.3616848289966583, + "learning_rate": 0.00032915110851808636, + "loss": 3.2979, + "step": 31050 + }, + { + "epoch": 9.059135399673735, + "grad_norm": 0.3446025550365448, + "learning_rate": 0.00032871353558926484, + "loss": 3.3074, + "step": 31100 + }, + { + "epoch": 9.073700769051504, + "grad_norm": 0.36741337180137634, + "learning_rate": 0.0003282759626604434, + "loss": 3.2965, + "step": 31150 + }, + { + "epoch": 9.08826613842927, + "grad_norm": 0.3401558995246887, + "learning_rate": 0.0003278383897316219, + "loss": 3.305, + "step": 31200 + }, + { + "epoch": 9.102831507807037, + "grad_norm": 0.364580363035202, + "learning_rate": 0.0003274008168028004, + "loss": 3.3159, + "step": 31250 + }, + { + "epoch": 9.117396877184806, + "grad_norm": 0.3511641025543213, + "learning_rate": 0.00032696324387397895, + "loss": 3.3232, + "step": 31300 + }, + { + "epoch": 9.131962246562573, + "grad_norm": 0.3950608968734741, + "learning_rate": 0.0003265256709451575, + "loss": 3.3135, + "step": 31350 + }, + { + "epoch": 9.14652761594034, + "grad_norm": 0.35723525285720825, + "learning_rate": 0.00032608809801633597, + "loss": 3.3117, + "step": 31400 + }, + { + "epoch": 9.161092985318108, + "grad_norm": 0.38060232996940613, + "learning_rate": 0.00032565052508751457, + "loss": 3.3211, + "step": 31450 + }, + { + "epoch": 9.175658354695875, + "grad_norm": 0.3600994944572449, + "learning_rate": 0.0003252129521586931, + "loss": 3.3195, + "step": 31500 + }, + { + "epoch": 9.190223724073643, + "grad_norm": 0.38114050030708313, + "learning_rate": 0.00032477537922987165, + "loss": 3.3272, + "step": 31550 + }, + { + "epoch": 9.20478909345141, + "grad_norm": 0.35321810841560364, + "learning_rate": 0.00032433780630105013, + "loss": 3.3198, + "step": 31600 + }, + { + "epoch": 9.219354462829177, + "grad_norm": 0.3709951639175415, + "learning_rate": 0.00032390023337222867, + "loss": 3.3285, + "step": 31650 + }, + { + "epoch": 9.233919832206945, + "grad_norm": 0.3782629668712616, + "learning_rate": 0.0003234626604434072, + "loss": 3.3391, + "step": 31700 + }, + { + "epoch": 9.248485201584712, + "grad_norm": 0.3684213161468506, + "learning_rate": 0.0003230250875145857, + "loss": 3.3321, + "step": 31750 + }, + { + "epoch": 9.263050570962479, + "grad_norm": 0.37817445397377014, + "learning_rate": 0.00032258751458576424, + "loss": 3.3253, + "step": 31800 + }, + { + "epoch": 9.277615940340247, + "grad_norm": 0.37698403000831604, + "learning_rate": 0.00032214994165694283, + "loss": 3.3263, + "step": 31850 + }, + { + "epoch": 9.292181309718014, + "grad_norm": 0.35898423194885254, + "learning_rate": 0.0003217123687281213, + "loss": 3.3407, + "step": 31900 + }, + { + "epoch": 9.306746679095783, + "grad_norm": 0.37121668457984924, + "learning_rate": 0.00032127479579929986, + "loss": 3.3469, + "step": 31950 + }, + { + "epoch": 9.32131204847355, + "grad_norm": 0.3602243959903717, + "learning_rate": 0.0003208372228704784, + "loss": 3.333, + "step": 32000 + }, + { + "epoch": 9.32131204847355, + "eval_accuracy": 0.36881940260672325, + "eval_loss": 3.568206787109375, + "eval_runtime": 181.6399, + "eval_samples_per_second": 91.632, + "eval_steps_per_second": 5.731, + "step": 32000 + }, + { + "epoch": 9.335877417851316, + "grad_norm": 0.3777805268764496, + "learning_rate": 0.00032039964994165694, + "loss": 3.3404, + "step": 32050 + }, + { + "epoch": 9.350442787229085, + "grad_norm": 0.36622655391693115, + "learning_rate": 0.0003199620770128354, + "loss": 3.3462, + "step": 32100 + }, + { + "epoch": 9.365008156606851, + "grad_norm": 0.3432258069515228, + "learning_rate": 0.00031952450408401396, + "loss": 3.3499, + "step": 32150 + }, + { + "epoch": 9.379573525984618, + "grad_norm": 0.3571391999721527, + "learning_rate": 0.0003190869311551925, + "loss": 3.3437, + "step": 32200 + }, + { + "epoch": 9.394138895362387, + "grad_norm": 0.3796580731868744, + "learning_rate": 0.000318649358226371, + "loss": 3.3445, + "step": 32250 + }, + { + "epoch": 9.408704264740154, + "grad_norm": 0.3999924659729004, + "learning_rate": 0.0003182117852975496, + "loss": 3.3398, + "step": 32300 + }, + { + "epoch": 9.423269634117922, + "grad_norm": 0.3521633744239807, + "learning_rate": 0.0003177742123687281, + "loss": 3.3518, + "step": 32350 + }, + { + "epoch": 9.437835003495689, + "grad_norm": 0.34816059470176697, + "learning_rate": 0.0003173366394399066, + "loss": 3.3498, + "step": 32400 + }, + { + "epoch": 9.452400372873456, + "grad_norm": 0.3519940674304962, + "learning_rate": 0.00031689906651108515, + "loss": 3.3491, + "step": 32450 + }, + { + "epoch": 9.466965742251224, + "grad_norm": 0.366641104221344, + "learning_rate": 0.0003164614935822637, + "loss": 3.3575, + "step": 32500 + }, + { + "epoch": 9.48153111162899, + "grad_norm": 0.3859027028083801, + "learning_rate": 0.00031602392065344223, + "loss": 3.3477, + "step": 32550 + }, + { + "epoch": 9.496096481006758, + "grad_norm": 0.3514662981033325, + "learning_rate": 0.0003155863477246207, + "loss": 3.3536, + "step": 32600 + }, + { + "epoch": 9.510661850384526, + "grad_norm": 0.37433597445487976, + "learning_rate": 0.00031514877479579925, + "loss": 3.3606, + "step": 32650 + }, + { + "epoch": 9.525227219762293, + "grad_norm": 0.3747974932193756, + "learning_rate": 0.00031471120186697785, + "loss": 3.3621, + "step": 32700 + }, + { + "epoch": 9.53979258914006, + "grad_norm": 0.38271790742874146, + "learning_rate": 0.00031427362893815633, + "loss": 3.3599, + "step": 32750 + }, + { + "epoch": 9.554357958517828, + "grad_norm": 0.3738161027431488, + "learning_rate": 0.0003138360560093349, + "loss": 3.35, + "step": 32800 + }, + { + "epoch": 9.568923327895595, + "grad_norm": 0.37082603573799133, + "learning_rate": 0.0003133984830805134, + "loss": 3.3593, + "step": 32850 + }, + { + "epoch": 9.583488697273363, + "grad_norm": 0.38742882013320923, + "learning_rate": 0.0003129609101516919, + "loss": 3.3513, + "step": 32900 + }, + { + "epoch": 9.59805406665113, + "grad_norm": 0.36848726868629456, + "learning_rate": 0.00031252333722287044, + "loss": 3.3619, + "step": 32950 + }, + { + "epoch": 9.612619436028897, + "grad_norm": 0.34450680017471313, + "learning_rate": 0.000312085764294049, + "loss": 3.3523, + "step": 33000 + }, + { + "epoch": 9.612619436028897, + "eval_accuracy": 0.36961469163620253, + "eval_loss": 3.5594406127929688, + "eval_runtime": 181.5027, + "eval_samples_per_second": 91.701, + "eval_steps_per_second": 5.735, + "step": 33000 + }, + { + "epoch": 9.627184805406666, + "grad_norm": 0.34740373492240906, + "learning_rate": 0.0003116481913652275, + "loss": 3.3523, + "step": 33050 + }, + { + "epoch": 9.641750174784432, + "grad_norm": 0.3639390468597412, + "learning_rate": 0.000311210618436406, + "loss": 3.3569, + "step": 33100 + }, + { + "epoch": 9.6563155441622, + "grad_norm": 0.3668532073497772, + "learning_rate": 0.0003107730455075846, + "loss": 3.3582, + "step": 33150 + }, + { + "epoch": 9.670880913539968, + "grad_norm": 0.3689277172088623, + "learning_rate": 0.00031033547257876314, + "loss": 3.3615, + "step": 33200 + }, + { + "epoch": 9.685446282917734, + "grad_norm": 0.3605565130710602, + "learning_rate": 0.0003098978996499416, + "loss": 3.3501, + "step": 33250 + }, + { + "epoch": 9.700011652295503, + "grad_norm": 0.3678613007068634, + "learning_rate": 0.00030946032672112016, + "loss": 3.3653, + "step": 33300 + }, + { + "epoch": 9.71457702167327, + "grad_norm": 0.360675185918808, + "learning_rate": 0.0003090227537922987, + "loss": 3.3637, + "step": 33350 + }, + { + "epoch": 9.729142391051036, + "grad_norm": 0.3719678819179535, + "learning_rate": 0.0003085851808634772, + "loss": 3.357, + "step": 33400 + }, + { + "epoch": 9.743707760428805, + "grad_norm": 0.3562043607234955, + "learning_rate": 0.00030814760793465573, + "loss": 3.3531, + "step": 33450 + }, + { + "epoch": 9.758273129806572, + "grad_norm": 0.37112271785736084, + "learning_rate": 0.00030771003500583427, + "loss": 3.3629, + "step": 33500 + }, + { + "epoch": 9.772838499184338, + "grad_norm": 0.3823767900466919, + "learning_rate": 0.00030727246207701286, + "loss": 3.3711, + "step": 33550 + }, + { + "epoch": 9.787403868562107, + "grad_norm": 0.3594043552875519, + "learning_rate": 0.0003068348891481913, + "loss": 3.3712, + "step": 33600 + }, + { + "epoch": 9.801969237939874, + "grad_norm": 0.3566214442253113, + "learning_rate": 0.0003063973162193699, + "loss": 3.3842, + "step": 33650 + }, + { + "epoch": 9.816534607317642, + "grad_norm": 0.36310461163520813, + "learning_rate": 0.00030595974329054843, + "loss": 3.3596, + "step": 33700 + }, + { + "epoch": 9.831099976695409, + "grad_norm": 0.36038920283317566, + "learning_rate": 0.0003055221703617269, + "loss": 3.3677, + "step": 33750 + }, + { + "epoch": 9.845665346073176, + "grad_norm": 0.34875422716140747, + "learning_rate": 0.00030508459743290546, + "loss": 3.3738, + "step": 33800 + }, + { + "epoch": 9.860230715450944, + "grad_norm": 0.3687998056411743, + "learning_rate": 0.000304647024504084, + "loss": 3.3725, + "step": 33850 + }, + { + "epoch": 9.874796084828711, + "grad_norm": 0.3492382764816284, + "learning_rate": 0.0003042094515752625, + "loss": 3.3642, + "step": 33900 + }, + { + "epoch": 9.88936145420648, + "grad_norm": 0.34819406270980835, + "learning_rate": 0.000303771878646441, + "loss": 3.3783, + "step": 33950 + }, + { + "epoch": 9.903926823584246, + "grad_norm": 0.36848151683807373, + "learning_rate": 0.00030333430571761956, + "loss": 3.3654, + "step": 34000 + }, + { + "epoch": 9.903926823584246, + "eval_accuracy": 0.3703816446727628, + "eval_loss": 3.5503089427948, + "eval_runtime": 181.4493, + "eval_samples_per_second": 91.728, + "eval_steps_per_second": 5.737, + "step": 34000 + }, + { + "epoch": 9.918492192962013, + "grad_norm": 0.39530327916145325, + "learning_rate": 0.00030289673278879816, + "loss": 3.3634, + "step": 34050 + }, + { + "epoch": 9.933057562339782, + "grad_norm": 0.3620380759239197, + "learning_rate": 0.00030245915985997664, + "loss": 3.3729, + "step": 34100 + }, + { + "epoch": 9.947622931717548, + "grad_norm": 0.356423020362854, + "learning_rate": 0.0003020215869311552, + "loss": 3.3823, + "step": 34150 + }, + { + "epoch": 9.962188301095315, + "grad_norm": 0.35574576258659363, + "learning_rate": 0.0003015840140023337, + "loss": 3.3698, + "step": 34200 + }, + { + "epoch": 9.976753670473084, + "grad_norm": 0.3700348734855652, + "learning_rate": 0.0003011464410735122, + "loss": 3.368, + "step": 34250 + }, + { + "epoch": 9.99131903985085, + "grad_norm": 0.3582363724708557, + "learning_rate": 0.00030070886814469075, + "loss": 3.3747, + "step": 34300 + }, + { + "epoch": 10.005826147751106, + "grad_norm": 0.3664577901363373, + "learning_rate": 0.0003002712952158693, + "loss": 3.3242, + "step": 34350 + }, + { + "epoch": 10.020391517128875, + "grad_norm": 0.3791219890117645, + "learning_rate": 0.0002998337222870478, + "loss": 3.2693, + "step": 34400 + }, + { + "epoch": 10.034956886506642, + "grad_norm": 0.3724304437637329, + "learning_rate": 0.0002993961493582263, + "loss": 3.2642, + "step": 34450 + }, + { + "epoch": 10.049522255884408, + "grad_norm": 0.36520498991012573, + "learning_rate": 0.0002989585764294049, + "loss": 3.2756, + "step": 34500 + }, + { + "epoch": 10.064087625262177, + "grad_norm": 0.3840792179107666, + "learning_rate": 0.0002985210035005834, + "loss": 3.28, + "step": 34550 + }, + { + "epoch": 10.078652994639944, + "grad_norm": 0.3588644564151764, + "learning_rate": 0.00029808343057176193, + "loss": 3.2725, + "step": 34600 + }, + { + "epoch": 10.093218364017712, + "grad_norm": 0.3608386516571045, + "learning_rate": 0.00029764585764294047, + "loss": 3.2845, + "step": 34650 + }, + { + "epoch": 10.107783733395479, + "grad_norm": 0.37310636043548584, + "learning_rate": 0.00029720828471411896, + "loss": 3.2934, + "step": 34700 + }, + { + "epoch": 10.122349102773246, + "grad_norm": 0.3664185404777527, + "learning_rate": 0.00029677071178529755, + "loss": 3.2953, + "step": 34750 + }, + { + "epoch": 10.136914472151014, + "grad_norm": 0.3596240282058716, + "learning_rate": 0.00029633313885647604, + "loss": 3.304, + "step": 34800 + }, + { + "epoch": 10.151479841528781, + "grad_norm": 0.3951849341392517, + "learning_rate": 0.0002958955659276546, + "loss": 3.2999, + "step": 34850 + }, + { + "epoch": 10.166045210906548, + "grad_norm": 0.38322994112968445, + "learning_rate": 0.0002954579929988331, + "loss": 3.304, + "step": 34900 + }, + { + "epoch": 10.180610580284316, + "grad_norm": 0.36491626501083374, + "learning_rate": 0.00029502042007001166, + "loss": 3.2911, + "step": 34950 + }, + { + "epoch": 10.195175949662083, + "grad_norm": 0.37527546286582947, + "learning_rate": 0.0002945828471411902, + "loss": 3.2834, + "step": 35000 + }, + { + "epoch": 10.195175949662083, + "eval_accuracy": 0.3699247414840347, + "eval_loss": 3.5616824626922607, + "eval_runtime": 180.3891, + "eval_samples_per_second": 92.267, + "eval_steps_per_second": 5.771, + "step": 35000 + }, + { + "epoch": 10.209741319039852, + "grad_norm": 0.3800257742404938, + "learning_rate": 0.0002941452742123687, + "loss": 3.2981, + "step": 35050 + }, + { + "epoch": 10.224306688417618, + "grad_norm": 0.3631065785884857, + "learning_rate": 0.0002937077012835472, + "loss": 3.3006, + "step": 35100 + }, + { + "epoch": 10.238872057795385, + "grad_norm": 0.3786700963973999, + "learning_rate": 0.00029327012835472576, + "loss": 3.3138, + "step": 35150 + }, + { + "epoch": 10.253437427173154, + "grad_norm": 0.35197684168815613, + "learning_rate": 0.0002928325554259043, + "loss": 3.3003, + "step": 35200 + }, + { + "epoch": 10.26800279655092, + "grad_norm": 0.36957064270973206, + "learning_rate": 0.00029239498249708284, + "loss": 3.3105, + "step": 35250 + }, + { + "epoch": 10.282568165928687, + "grad_norm": 0.37074217200279236, + "learning_rate": 0.00029195740956826133, + "loss": 3.3123, + "step": 35300 + }, + { + "epoch": 10.297133535306456, + "grad_norm": 0.3711046278476715, + "learning_rate": 0.0002915198366394399, + "loss": 3.302, + "step": 35350 + }, + { + "epoch": 10.311698904684222, + "grad_norm": 0.3888838589191437, + "learning_rate": 0.0002910822637106184, + "loss": 3.3077, + "step": 35400 + }, + { + "epoch": 10.326264274061991, + "grad_norm": 0.3660491704940796, + "learning_rate": 0.00029064469078179695, + "loss": 3.3186, + "step": 35450 + }, + { + "epoch": 10.340829643439758, + "grad_norm": 0.3750646412372589, + "learning_rate": 0.0002902071178529755, + "loss": 3.3088, + "step": 35500 + }, + { + "epoch": 10.355395012817525, + "grad_norm": 0.3611460030078888, + "learning_rate": 0.000289769544924154, + "loss": 3.3126, + "step": 35550 + }, + { + "epoch": 10.369960382195293, + "grad_norm": 0.3784548342227936, + "learning_rate": 0.00028933197199533257, + "loss": 3.3199, + "step": 35600 + }, + { + "epoch": 10.38452575157306, + "grad_norm": 0.3654816746711731, + "learning_rate": 0.00028889439906651105, + "loss": 3.317, + "step": 35650 + }, + { + "epoch": 10.399091120950827, + "grad_norm": 0.3819401264190674, + "learning_rate": 0.0002884568261376896, + "loss": 3.3174, + "step": 35700 + }, + { + "epoch": 10.413656490328595, + "grad_norm": 0.3685275912284851, + "learning_rate": 0.00028801925320886813, + "loss": 3.3172, + "step": 35750 + }, + { + "epoch": 10.428221859706362, + "grad_norm": 0.3687780201435089, + "learning_rate": 0.0002875816802800466, + "loss": 3.3185, + "step": 35800 + }, + { + "epoch": 10.44278722908413, + "grad_norm": 0.3637807369232178, + "learning_rate": 0.0002871441073512252, + "loss": 3.3257, + "step": 35850 + }, + { + "epoch": 10.457352598461897, + "grad_norm": 0.3877573013305664, + "learning_rate": 0.0002867065344224037, + "loss": 3.3316, + "step": 35900 + }, + { + "epoch": 10.471917967839664, + "grad_norm": 0.37709304690361023, + "learning_rate": 0.00028626896149358224, + "loss": 3.3028, + "step": 35950 + }, + { + "epoch": 10.486483337217432, + "grad_norm": 0.36883544921875, + "learning_rate": 0.0002858313885647608, + "loss": 3.3313, + "step": 36000 + }, + { + "epoch": 10.486483337217432, + "eval_accuracy": 0.37047523575310953, + "eval_loss": 3.555171489715576, + "eval_runtime": 180.6124, + "eval_samples_per_second": 92.153, + "eval_steps_per_second": 5.764, + "step": 36000 + }, + { + "epoch": 10.5010487065952, + "grad_norm": 0.37332433462142944, + "learning_rate": 0.0002853938156359393, + "loss": 3.3147, + "step": 36050 + }, + { + "epoch": 10.515614075972966, + "grad_norm": 0.3772258758544922, + "learning_rate": 0.00028495624270711786, + "loss": 3.3325, + "step": 36100 + }, + { + "epoch": 10.530179445350734, + "grad_norm": 0.35568490624427795, + "learning_rate": 0.00028451866977829634, + "loss": 3.3381, + "step": 36150 + }, + { + "epoch": 10.544744814728501, + "grad_norm": 0.3858466148376465, + "learning_rate": 0.0002840810968494749, + "loss": 3.3242, + "step": 36200 + }, + { + "epoch": 10.55931018410627, + "grad_norm": 0.3936407268047333, + "learning_rate": 0.0002836435239206534, + "loss": 3.3213, + "step": 36250 + }, + { + "epoch": 10.573875553484037, + "grad_norm": 0.3783574104309082, + "learning_rate": 0.00028320595099183196, + "loss": 3.3174, + "step": 36300 + }, + { + "epoch": 10.588440922861803, + "grad_norm": 0.3611924648284912, + "learning_rate": 0.0002827683780630105, + "loss": 3.3281, + "step": 36350 + }, + { + "epoch": 10.603006292239572, + "grad_norm": 0.36673375964164734, + "learning_rate": 0.000282330805134189, + "loss": 3.3274, + "step": 36400 + }, + { + "epoch": 10.617571661617339, + "grad_norm": 0.3864386975765228, + "learning_rate": 0.00028189323220536753, + "loss": 3.3263, + "step": 36450 + }, + { + "epoch": 10.632137030995105, + "grad_norm": 0.37186652421951294, + "learning_rate": 0.00028145565927654607, + "loss": 3.3256, + "step": 36500 + }, + { + "epoch": 10.646702400372874, + "grad_norm": 0.3645637333393097, + "learning_rate": 0.0002810180863477246, + "loss": 3.3304, + "step": 36550 + }, + { + "epoch": 10.66126776975064, + "grad_norm": 0.3960283696651459, + "learning_rate": 0.00028058051341890315, + "loss": 3.3293, + "step": 36600 + }, + { + "epoch": 10.675833139128407, + "grad_norm": 0.3968350291252136, + "learning_rate": 0.00028014294049008164, + "loss": 3.3289, + "step": 36650 + }, + { + "epoch": 10.690398508506176, + "grad_norm": 0.3649657666683197, + "learning_rate": 0.0002797053675612602, + "loss": 3.326, + "step": 36700 + }, + { + "epoch": 10.704963877883943, + "grad_norm": 0.366464227437973, + "learning_rate": 0.0002792677946324387, + "loss": 3.3453, + "step": 36750 + }, + { + "epoch": 10.719529247261711, + "grad_norm": 0.36643803119659424, + "learning_rate": 0.00027883022170361726, + "loss": 3.3272, + "step": 36800 + }, + { + "epoch": 10.734094616639478, + "grad_norm": 0.35845255851745605, + "learning_rate": 0.0002783926487747958, + "loss": 3.3341, + "step": 36850 + }, + { + "epoch": 10.748659986017245, + "grad_norm": 0.3823663294315338, + "learning_rate": 0.0002779550758459743, + "loss": 3.3404, + "step": 36900 + }, + { + "epoch": 10.763225355395013, + "grad_norm": 0.36972370743751526, + "learning_rate": 0.0002775175029171528, + "loss": 3.3372, + "step": 36950 + }, + { + "epoch": 10.77779072477278, + "grad_norm": 0.3613353967666626, + "learning_rate": 0.00027707992998833136, + "loss": 3.3399, + "step": 37000 + }, + { + "epoch": 10.77779072477278, + "eval_accuracy": 0.37111332468909186, + "eval_loss": 3.54584002494812, + "eval_runtime": 180.3064, + "eval_samples_per_second": 92.31, + "eval_steps_per_second": 5.774, + "step": 37000 + }, + { + "epoch": 10.792356094150549, + "grad_norm": 0.37549999356269836, + "learning_rate": 0.0002766423570595099, + "loss": 3.3357, + "step": 37050 + }, + { + "epoch": 10.806921463528315, + "grad_norm": 0.38684558868408203, + "learning_rate": 0.00027620478413068844, + "loss": 3.3365, + "step": 37100 + }, + { + "epoch": 10.821486832906082, + "grad_norm": 0.3710017204284668, + "learning_rate": 0.000275767211201867, + "loss": 3.3438, + "step": 37150 + }, + { + "epoch": 10.83605220228385, + "grad_norm": 0.3841908872127533, + "learning_rate": 0.00027532963827304547, + "loss": 3.3451, + "step": 37200 + }, + { + "epoch": 10.850617571661617, + "grad_norm": 0.37406450510025024, + "learning_rate": 0.000274892065344224, + "loss": 3.3508, + "step": 37250 + }, + { + "epoch": 10.865182941039384, + "grad_norm": 0.37421655654907227, + "learning_rate": 0.00027445449241540255, + "loss": 3.3346, + "step": 37300 + }, + { + "epoch": 10.879748310417153, + "grad_norm": 0.3724828064441681, + "learning_rate": 0.0002740169194865811, + "loss": 3.3251, + "step": 37350 + }, + { + "epoch": 10.89431367979492, + "grad_norm": 0.37316784262657166, + "learning_rate": 0.0002735793465577596, + "loss": 3.3407, + "step": 37400 + }, + { + "epoch": 10.908879049172686, + "grad_norm": 0.35748419165611267, + "learning_rate": 0.0002731417736289381, + "loss": 3.3403, + "step": 37450 + }, + { + "epoch": 10.923444418550455, + "grad_norm": 0.3694682717323303, + "learning_rate": 0.00027270420070011665, + "loss": 3.3413, + "step": 37500 + }, + { + "epoch": 10.938009787928221, + "grad_norm": 0.37885811924934387, + "learning_rate": 0.0002722666277712952, + "loss": 3.3426, + "step": 37550 + }, + { + "epoch": 10.95257515730599, + "grad_norm": 0.38499969244003296, + "learning_rate": 0.00027182905484247373, + "loss": 3.3389, + "step": 37600 + }, + { + "epoch": 10.967140526683757, + "grad_norm": 0.36556389927864075, + "learning_rate": 0.00027139148191365227, + "loss": 3.3459, + "step": 37650 + }, + { + "epoch": 10.981705896061523, + "grad_norm": 0.3706257939338684, + "learning_rate": 0.00027095390898483076, + "loss": 3.3406, + "step": 37700 + }, + { + "epoch": 10.996271265439292, + "grad_norm": 0.36823248863220215, + "learning_rate": 0.0002705163360560093, + "loss": 3.3361, + "step": 37750 + }, + { + "epoch": 11.010778373339548, + "grad_norm": 0.35125017166137695, + "learning_rate": 0.00027007876312718784, + "loss": 3.2779, + "step": 37800 + }, + { + "epoch": 11.025343742717315, + "grad_norm": 0.3946673274040222, + "learning_rate": 0.0002696411901983664, + "loss": 3.2391, + "step": 37850 + }, + { + "epoch": 11.039909112095083, + "grad_norm": 0.37063831090927124, + "learning_rate": 0.0002692036172695449, + "loss": 3.2478, + "step": 37900 + }, + { + "epoch": 11.05447448147285, + "grad_norm": 0.36030757427215576, + "learning_rate": 0.0002687660443407234, + "loss": 3.2521, + "step": 37950 + }, + { + "epoch": 11.069039850850617, + "grad_norm": 0.3777706027030945, + "learning_rate": 0.00026832847141190194, + "loss": 3.2605, + "step": 38000 + }, + { + "epoch": 11.069039850850617, + "eval_accuracy": 0.370730142112647, + "eval_loss": 3.558288097381592, + "eval_runtime": 180.3247, + "eval_samples_per_second": 92.3, + "eval_steps_per_second": 5.773, + "step": 38000 + }, + { + "epoch": 11.083605220228385, + "grad_norm": 0.3810880482196808, + "learning_rate": 0.0002678908984830805, + "loss": 3.2545, + "step": 38050 + }, + { + "epoch": 11.098170589606152, + "grad_norm": 0.38010433316230774, + "learning_rate": 0.000267453325554259, + "loss": 3.2619, + "step": 38100 + }, + { + "epoch": 11.11273595898392, + "grad_norm": 0.38559970259666443, + "learning_rate": 0.00026701575262543756, + "loss": 3.26, + "step": 38150 + }, + { + "epoch": 11.127301328361687, + "grad_norm": 0.37837737798690796, + "learning_rate": 0.00026657817969661605, + "loss": 3.2626, + "step": 38200 + }, + { + "epoch": 11.141866697739454, + "grad_norm": 0.3784601092338562, + "learning_rate": 0.00026614060676779464, + "loss": 3.258, + "step": 38250 + }, + { + "epoch": 11.156432067117223, + "grad_norm": 0.35845887660980225, + "learning_rate": 0.00026570303383897313, + "loss": 3.2586, + "step": 38300 + }, + { + "epoch": 11.17099743649499, + "grad_norm": 0.37323319911956787, + "learning_rate": 0.00026526546091015167, + "loss": 3.2596, + "step": 38350 + }, + { + "epoch": 11.185562805872756, + "grad_norm": 0.4025420546531677, + "learning_rate": 0.0002648278879813302, + "loss": 3.2722, + "step": 38400 + }, + { + "epoch": 11.200128175250525, + "grad_norm": 0.3721407651901245, + "learning_rate": 0.0002643903150525087, + "loss": 3.2737, + "step": 38450 + }, + { + "epoch": 11.214693544628291, + "grad_norm": 0.3787324130535126, + "learning_rate": 0.0002639527421236873, + "loss": 3.267, + "step": 38500 + }, + { + "epoch": 11.22925891400606, + "grad_norm": 0.385883092880249, + "learning_rate": 0.0002635151691948658, + "loss": 3.2813, + "step": 38550 + }, + { + "epoch": 11.243824283383827, + "grad_norm": 0.3823045492172241, + "learning_rate": 0.0002630775962660443, + "loss": 3.2806, + "step": 38600 + }, + { + "epoch": 11.258389652761593, + "grad_norm": 0.3888196349143982, + "learning_rate": 0.00026264002333722285, + "loss": 3.2669, + "step": 38650 + }, + { + "epoch": 11.272955022139362, + "grad_norm": 0.4065677523612976, + "learning_rate": 0.00026220245040840134, + "loss": 3.2815, + "step": 38700 + }, + { + "epoch": 11.287520391517129, + "grad_norm": 0.36197319626808167, + "learning_rate": 0.00026176487747957993, + "loss": 3.2846, + "step": 38750 + }, + { + "epoch": 11.302085760894895, + "grad_norm": 0.379085510969162, + "learning_rate": 0.0002613273045507584, + "loss": 3.2861, + "step": 38800 + }, + { + "epoch": 11.316651130272664, + "grad_norm": 0.39975711703300476, + "learning_rate": 0.00026088973162193696, + "loss": 3.3018, + "step": 38850 + }, + { + "epoch": 11.33121649965043, + "grad_norm": 0.390550434589386, + "learning_rate": 0.0002604521586931155, + "loss": 3.2864, + "step": 38900 + }, + { + "epoch": 11.3457818690282, + "grad_norm": 0.40258410573005676, + "learning_rate": 0.00026001458576429404, + "loss": 3.2845, + "step": 38950 + }, + { + "epoch": 11.360347238405966, + "grad_norm": 0.36794647574424744, + "learning_rate": 0.0002595770128354726, + "loss": 3.2906, + "step": 39000 + }, + { + "epoch": 11.360347238405966, + "eval_accuracy": 0.37129404012940964, + "eval_loss": 3.552013397216797, + "eval_runtime": 180.3711, + "eval_samples_per_second": 92.276, + "eval_steps_per_second": 5.771, + "step": 39000 + }, + { + "epoch": 11.374912607783733, + "grad_norm": 0.3978714048862457, + "learning_rate": 0.00025913943990665106, + "loss": 3.2952, + "step": 39050 + }, + { + "epoch": 11.389477977161501, + "grad_norm": 0.3712661862373352, + "learning_rate": 0.0002587018669778296, + "loss": 3.2887, + "step": 39100 + }, + { + "epoch": 11.404043346539268, + "grad_norm": 0.3962024748325348, + "learning_rate": 0.00025826429404900814, + "loss": 3.2818, + "step": 39150 + }, + { + "epoch": 11.418608715917035, + "grad_norm": 0.3791441321372986, + "learning_rate": 0.0002578267211201867, + "loss": 3.2947, + "step": 39200 + }, + { + "epoch": 11.433174085294803, + "grad_norm": 0.38361239433288574, + "learning_rate": 0.0002573891481913652, + "loss": 3.3016, + "step": 39250 + }, + { + "epoch": 11.44773945467257, + "grad_norm": 0.3753799498081207, + "learning_rate": 0.0002569515752625437, + "loss": 3.291, + "step": 39300 + }, + { + "epoch": 11.462304824050339, + "grad_norm": 0.37291768193244934, + "learning_rate": 0.0002565140023337223, + "loss": 3.2858, + "step": 39350 + }, + { + "epoch": 11.476870193428105, + "grad_norm": 0.382899671792984, + "learning_rate": 0.0002560764294049008, + "loss": 3.2902, + "step": 39400 + }, + { + "epoch": 11.491435562805872, + "grad_norm": 0.38682591915130615, + "learning_rate": 0.00025563885647607933, + "loss": 3.3095, + "step": 39450 + }, + { + "epoch": 11.50600093218364, + "grad_norm": 0.39052340388298035, + "learning_rate": 0.00025520128354725787, + "loss": 3.3024, + "step": 39500 + }, + { + "epoch": 11.520566301561407, + "grad_norm": 0.38648220896720886, + "learning_rate": 0.00025476371061843636, + "loss": 3.3037, + "step": 39550 + }, + { + "epoch": 11.535131670939174, + "grad_norm": 0.37323859333992004, + "learning_rate": 0.00025432613768961495, + "loss": 3.2976, + "step": 39600 + }, + { + "epoch": 11.549697040316943, + "grad_norm": 0.3768618106842041, + "learning_rate": 0.00025388856476079343, + "loss": 3.3111, + "step": 39650 + }, + { + "epoch": 11.56426240969471, + "grad_norm": 0.3987623155117035, + "learning_rate": 0.000253450991831972, + "loss": 3.301, + "step": 39700 + }, + { + "epoch": 11.578827779072478, + "grad_norm": 0.42070692777633667, + "learning_rate": 0.0002530134189031505, + "loss": 3.3047, + "step": 39750 + }, + { + "epoch": 11.593393148450245, + "grad_norm": 0.37431833148002625, + "learning_rate": 0.000252575845974329, + "loss": 3.3051, + "step": 39800 + }, + { + "epoch": 11.607958517828012, + "grad_norm": 0.3832058310508728, + "learning_rate": 0.0002521382730455076, + "loss": 3.2968, + "step": 39850 + }, + { + "epoch": 11.62252388720578, + "grad_norm": 0.3984127342700958, + "learning_rate": 0.0002517007001166861, + "loss": 3.3157, + "step": 39900 + }, + { + "epoch": 11.637089256583547, + "grad_norm": 0.40139371156692505, + "learning_rate": 0.0002512631271878646, + "loss": 3.2996, + "step": 39950 + }, + { + "epoch": 11.651654625961314, + "grad_norm": 0.3891284465789795, + "learning_rate": 0.00025082555425904316, + "loss": 3.3088, + "step": 40000 + }, + { + "epoch": 11.651654625961314, + "eval_accuracy": 0.3715696399941494, + "eval_loss": 3.5464000701904297, + "eval_runtime": 180.4742, + "eval_samples_per_second": 92.224, + "eval_steps_per_second": 5.768, + "step": 40000 + }, + { + "epoch": 11.666219995339082, + "grad_norm": 0.37767043709754944, + "learning_rate": 0.0002503879813302217, + "loss": 3.2951, + "step": 40050 + }, + { + "epoch": 11.680785364716849, + "grad_norm": 0.40471839904785156, + "learning_rate": 0.00024995040840140024, + "loss": 3.2946, + "step": 40100 + }, + { + "epoch": 11.695350734094617, + "grad_norm": 0.3795658051967621, + "learning_rate": 0.0002495128354725787, + "loss": 3.3187, + "step": 40150 + }, + { + "epoch": 11.709916103472384, + "grad_norm": 0.3852717876434326, + "learning_rate": 0.00024907526254375727, + "loss": 3.2955, + "step": 40200 + }, + { + "epoch": 11.724481472850151, + "grad_norm": 0.37112799286842346, + "learning_rate": 0.0002486376896149358, + "loss": 3.3023, + "step": 40250 + }, + { + "epoch": 11.73904684222792, + "grad_norm": 0.37619829177856445, + "learning_rate": 0.00024820011668611435, + "loss": 3.3103, + "step": 40300 + }, + { + "epoch": 11.753612211605686, + "grad_norm": 0.3923087418079376, + "learning_rate": 0.0002477625437572929, + "loss": 3.3131, + "step": 40350 + }, + { + "epoch": 11.768177580983453, + "grad_norm": 0.3909642696380615, + "learning_rate": 0.00024732497082847137, + "loss": 3.3085, + "step": 40400 + }, + { + "epoch": 11.782742950361222, + "grad_norm": 0.3891732096672058, + "learning_rate": 0.0002468873978996499, + "loss": 3.3118, + "step": 40450 + }, + { + "epoch": 11.797308319738988, + "grad_norm": 0.39520296454429626, + "learning_rate": 0.00024644982497082845, + "loss": 3.3134, + "step": 40500 + }, + { + "epoch": 11.811873689116755, + "grad_norm": 0.3944683074951172, + "learning_rate": 0.000246012252042007, + "loss": 3.3071, + "step": 40550 + }, + { + "epoch": 11.826439058494524, + "grad_norm": 0.3806307315826416, + "learning_rate": 0.00024557467911318553, + "loss": 3.3058, + "step": 40600 + }, + { + "epoch": 11.84100442787229, + "grad_norm": 0.38682928681373596, + "learning_rate": 0.000245137106184364, + "loss": 3.3067, + "step": 40650 + }, + { + "epoch": 11.855569797250059, + "grad_norm": 0.3885536789894104, + "learning_rate": 0.00024469953325554256, + "loss": 3.3188, + "step": 40700 + }, + { + "epoch": 11.870135166627826, + "grad_norm": 0.39508283138275146, + "learning_rate": 0.0002442619603267211, + "loss": 3.3167, + "step": 40750 + }, + { + "epoch": 11.884700536005592, + "grad_norm": 0.37365779280662537, + "learning_rate": 0.00024382438739789964, + "loss": 3.2989, + "step": 40800 + }, + { + "epoch": 11.899265905383361, + "grad_norm": 0.37982553243637085, + "learning_rate": 0.00024338681446907818, + "loss": 3.3162, + "step": 40850 + }, + { + "epoch": 11.913831274761128, + "grad_norm": 0.3698308765888214, + "learning_rate": 0.0002429492415402567, + "loss": 3.3203, + "step": 40900 + }, + { + "epoch": 11.928396644138896, + "grad_norm": 0.3770948052406311, + "learning_rate": 0.0002425116686114352, + "loss": 3.315, + "step": 40950 + }, + { + "epoch": 11.942962013516663, + "grad_norm": 0.3643822968006134, + "learning_rate": 0.00024207409568261377, + "loss": 3.3188, + "step": 41000 + }, + { + "epoch": 11.942962013516663, + "eval_accuracy": 0.37253788439949564, + "eval_loss": 3.536294460296631, + "eval_runtime": 180.6316, + "eval_samples_per_second": 92.143, + "eval_steps_per_second": 5.763, + "step": 41000 + }, + { + "epoch": 11.95752738289443, + "grad_norm": 0.37310171127319336, + "learning_rate": 0.00024163652275379228, + "loss": 3.3067, + "step": 41050 + }, + { + "epoch": 11.972092752272198, + "grad_norm": 0.3831028342247009, + "learning_rate": 0.00024119894982497082, + "loss": 3.3263, + "step": 41100 + }, + { + "epoch": 11.986658121649965, + "grad_norm": 0.3900957405567169, + "learning_rate": 0.00024076137689614933, + "loss": 3.3106, + "step": 41150 + }, + { + "epoch": 12.001165229550221, + "grad_norm": 0.3873760998249054, + "learning_rate": 0.00024032380396732785, + "loss": 3.3022, + "step": 41200 + }, + { + "epoch": 12.01573059892799, + "grad_norm": 0.3788856863975525, + "learning_rate": 0.00023988623103850641, + "loss": 3.2189, + "step": 41250 + }, + { + "epoch": 12.030295968305756, + "grad_norm": 0.4146612584590912, + "learning_rate": 0.00023944865810968493, + "loss": 3.2262, + "step": 41300 + }, + { + "epoch": 12.044861337683523, + "grad_norm": 0.3976421356201172, + "learning_rate": 0.00023901108518086347, + "loss": 3.2397, + "step": 41350 + }, + { + "epoch": 12.059426707061291, + "grad_norm": 0.3815682828426361, + "learning_rate": 0.00023857351225204198, + "loss": 3.2293, + "step": 41400 + }, + { + "epoch": 12.073992076439058, + "grad_norm": 0.3816235363483429, + "learning_rate": 0.0002381359393232205, + "loss": 3.2318, + "step": 41450 + }, + { + "epoch": 12.088557445816827, + "grad_norm": 0.3730505108833313, + "learning_rate": 0.00023769836639439906, + "loss": 3.2306, + "step": 41500 + }, + { + "epoch": 12.103122815194594, + "grad_norm": 0.36907413601875305, + "learning_rate": 0.00023726079346557757, + "loss": 3.2172, + "step": 41550 + }, + { + "epoch": 12.11768818457236, + "grad_norm": 0.3938505947589874, + "learning_rate": 0.0002368232205367561, + "loss": 3.2349, + "step": 41600 + }, + { + "epoch": 12.132253553950129, + "grad_norm": 0.39459192752838135, + "learning_rate": 0.00023638564760793463, + "loss": 3.244, + "step": 41650 + }, + { + "epoch": 12.146818923327896, + "grad_norm": 0.3762718141078949, + "learning_rate": 0.00023594807467911317, + "loss": 3.2336, + "step": 41700 + }, + { + "epoch": 12.161384292705662, + "grad_norm": 0.38366296887397766, + "learning_rate": 0.0002355105017502917, + "loss": 3.2468, + "step": 41750 + }, + { + "epoch": 12.17594966208343, + "grad_norm": 0.37330591678619385, + "learning_rate": 0.00023507292882147022, + "loss": 3.2367, + "step": 41800 + }, + { + "epoch": 12.190515031461198, + "grad_norm": 0.39677342772483826, + "learning_rate": 0.00023463535589264876, + "loss": 3.2597, + "step": 41850 + }, + { + "epoch": 12.205080400838966, + "grad_norm": 0.3829995393753052, + "learning_rate": 0.0002341977829638273, + "loss": 3.2453, + "step": 41900 + }, + { + "epoch": 12.219645770216733, + "grad_norm": 0.40625911951065063, + "learning_rate": 0.0002337602100350058, + "loss": 3.2457, + "step": 41950 + }, + { + "epoch": 12.2342111395945, + "grad_norm": 0.3920283019542694, + "learning_rate": 0.00023332263710618435, + "loss": 3.2678, + "step": 42000 + }, + { + "epoch": 12.2342111395945, + "eval_accuracy": 0.3720842733593225, + "eval_loss": 3.546815872192383, + "eval_runtime": 180.3312, + "eval_samples_per_second": 92.297, + "eval_steps_per_second": 5.773, + "step": 42000 + }, + { + "epoch": 12.248776508972268, + "grad_norm": 0.398946613073349, + "learning_rate": 0.00023288506417736286, + "loss": 3.245, + "step": 42050 + }, + { + "epoch": 12.263341878350035, + "grad_norm": 0.3961947560310364, + "learning_rate": 0.00023244749124854143, + "loss": 3.2517, + "step": 42100 + }, + { + "epoch": 12.277907247727802, + "grad_norm": 0.3835267722606659, + "learning_rate": 0.00023200991831971994, + "loss": 3.2522, + "step": 42150 + }, + { + "epoch": 12.29247261710557, + "grad_norm": 0.42905566096305847, + "learning_rate": 0.00023157234539089846, + "loss": 3.2587, + "step": 42200 + }, + { + "epoch": 12.307037986483337, + "grad_norm": 0.39819803833961487, + "learning_rate": 0.000231134772462077, + "loss": 3.2556, + "step": 42250 + }, + { + "epoch": 12.321603355861104, + "grad_norm": 0.393216997385025, + "learning_rate": 0.0002306971995332555, + "loss": 3.2738, + "step": 42300 + }, + { + "epoch": 12.336168725238872, + "grad_norm": 0.3680713176727295, + "learning_rate": 0.00023025962660443408, + "loss": 3.2608, + "step": 42350 + }, + { + "epoch": 12.350734094616639, + "grad_norm": 0.3907005488872528, + "learning_rate": 0.0002298220536756126, + "loss": 3.2592, + "step": 42400 + }, + { + "epoch": 12.365299463994408, + "grad_norm": 0.39694586396217346, + "learning_rate": 0.0002293844807467911, + "loss": 3.2655, + "step": 42450 + }, + { + "epoch": 12.379864833372174, + "grad_norm": 0.3920033276081085, + "learning_rate": 0.00022894690781796964, + "loss": 3.2694, + "step": 42500 + }, + { + "epoch": 12.394430202749941, + "grad_norm": 0.38897332549095154, + "learning_rate": 0.00022850933488914815, + "loss": 3.2831, + "step": 42550 + }, + { + "epoch": 12.40899557212771, + "grad_norm": 0.3850444257259369, + "learning_rate": 0.00022807176196032672, + "loss": 3.2698, + "step": 42600 + }, + { + "epoch": 12.423560941505476, + "grad_norm": 0.39484626054763794, + "learning_rate": 0.00022763418903150523, + "loss": 3.2621, + "step": 42650 + }, + { + "epoch": 12.438126310883243, + "grad_norm": 0.4130299985408783, + "learning_rate": 0.00022719661610268375, + "loss": 3.2765, + "step": 42700 + }, + { + "epoch": 12.452691680261012, + "grad_norm": 0.3730163872241974, + "learning_rate": 0.0002267590431738623, + "loss": 3.2681, + "step": 42750 + }, + { + "epoch": 12.467257049638778, + "grad_norm": 0.3927021026611328, + "learning_rate": 0.00022632147024504083, + "loss": 3.2684, + "step": 42800 + }, + { + "epoch": 12.481822419016547, + "grad_norm": 0.39260363578796387, + "learning_rate": 0.00022588389731621937, + "loss": 3.2652, + "step": 42850 + }, + { + "epoch": 12.496387788394314, + "grad_norm": 0.3787255883216858, + "learning_rate": 0.00022544632438739788, + "loss": 3.2619, + "step": 42900 + }, + { + "epoch": 12.51095315777208, + "grad_norm": 0.38174960017204285, + "learning_rate": 0.0002250087514585764, + "loss": 3.2742, + "step": 42950 + }, + { + "epoch": 12.525518527149849, + "grad_norm": 0.39155444502830505, + "learning_rate": 0.00022457117852975496, + "loss": 3.2818, + "step": 43000 + }, + { + "epoch": 12.525518527149849, + "eval_accuracy": 0.37251695574082516, + "eval_loss": 3.541425943374634, + "eval_runtime": 180.4575, + "eval_samples_per_second": 92.232, + "eval_steps_per_second": 5.769, + "step": 43000 + }, + { + "epoch": 12.540083896527616, + "grad_norm": 0.3748781979084015, + "learning_rate": 0.00022413360560093347, + "loss": 3.28, + "step": 43050 + }, + { + "epoch": 12.554649265905383, + "grad_norm": 0.3864782452583313, + "learning_rate": 0.000223696032672112, + "loss": 3.2746, + "step": 43100 + }, + { + "epoch": 12.569214635283151, + "grad_norm": 0.39516115188598633, + "learning_rate": 0.00022325845974329053, + "loss": 3.2664, + "step": 43150 + }, + { + "epoch": 12.583780004660918, + "grad_norm": 0.3874489367008209, + "learning_rate": 0.00022282088681446904, + "loss": 3.2765, + "step": 43200 + }, + { + "epoch": 12.598345374038686, + "grad_norm": 0.4148963689804077, + "learning_rate": 0.0002223833138856476, + "loss": 3.2833, + "step": 43250 + }, + { + "epoch": 12.612910743416453, + "grad_norm": 0.38245537877082825, + "learning_rate": 0.00022194574095682612, + "loss": 3.2826, + "step": 43300 + }, + { + "epoch": 12.62747611279422, + "grad_norm": 0.3959484100341797, + "learning_rate": 0.00022150816802800466, + "loss": 3.2772, + "step": 43350 + }, + { + "epoch": 12.642041482171988, + "grad_norm": 0.3956339359283447, + "learning_rate": 0.00022107059509918317, + "loss": 3.2741, + "step": 43400 + }, + { + "epoch": 12.656606851549755, + "grad_norm": 0.3839803636074066, + "learning_rate": 0.00022063302217036168, + "loss": 3.2662, + "step": 43450 + }, + { + "epoch": 12.671172220927522, + "grad_norm": 0.40059152245521545, + "learning_rate": 0.00022019544924154025, + "loss": 3.2851, + "step": 43500 + }, + { + "epoch": 12.68573759030529, + "grad_norm": 0.3880845904350281, + "learning_rate": 0.00021975787631271876, + "loss": 3.2854, + "step": 43550 + }, + { + "epoch": 12.700302959683057, + "grad_norm": 0.3912261128425598, + "learning_rate": 0.0002193203033838973, + "loss": 3.2838, + "step": 43600 + }, + { + "epoch": 12.714868329060826, + "grad_norm": 0.41812238097190857, + "learning_rate": 0.00021888273045507582, + "loss": 3.2902, + "step": 43650 + }, + { + "epoch": 12.729433698438593, + "grad_norm": 0.3847753703594208, + "learning_rate": 0.00021844515752625436, + "loss": 3.2726, + "step": 43700 + }, + { + "epoch": 12.74399906781636, + "grad_norm": 0.3847730755805969, + "learning_rate": 0.0002180075845974329, + "loss": 3.2761, + "step": 43750 + }, + { + "epoch": 12.758564437194128, + "grad_norm": 0.3814358413219452, + "learning_rate": 0.0002175700116686114, + "loss": 3.2816, + "step": 43800 + }, + { + "epoch": 12.773129806571895, + "grad_norm": 0.39806804060935974, + "learning_rate": 0.00021713243873978995, + "loss": 3.2889, + "step": 43850 + }, + { + "epoch": 12.787695175949661, + "grad_norm": 0.3872688412666321, + "learning_rate": 0.0002166948658109685, + "loss": 3.2889, + "step": 43900 + }, + { + "epoch": 12.80226054532743, + "grad_norm": 0.3840930461883545, + "learning_rate": 0.000216257292882147, + "loss": 3.2767, + "step": 43950 + }, + { + "epoch": 12.816825914705197, + "grad_norm": 0.3970656096935272, + "learning_rate": 0.00021581971995332554, + "loss": 3.2772, + "step": 44000 + }, + { + "epoch": 12.816825914705197, + "eval_accuracy": 0.37332153333229867, + "eval_loss": 3.5360467433929443, + "eval_runtime": 180.2447, + "eval_samples_per_second": 92.341, + "eval_steps_per_second": 5.775, + "step": 44000 + }, + { + "epoch": 12.831391284082965, + "grad_norm": 0.3944132328033447, + "learning_rate": 0.00021538214702450405, + "loss": 3.2893, + "step": 44050 + }, + { + "epoch": 12.845956653460732, + "grad_norm": 0.40921568870544434, + "learning_rate": 0.00021494457409568262, + "loss": 3.2811, + "step": 44100 + }, + { + "epoch": 12.860522022838499, + "grad_norm": 0.37589746713638306, + "learning_rate": 0.00021450700116686113, + "loss": 3.278, + "step": 44150 + }, + { + "epoch": 12.875087392216267, + "grad_norm": 0.4068247377872467, + "learning_rate": 0.00021406942823803965, + "loss": 3.2848, + "step": 44200 + }, + { + "epoch": 12.889652761594034, + "grad_norm": 0.41013479232788086, + "learning_rate": 0.0002136318553092182, + "loss": 3.2929, + "step": 44250 + }, + { + "epoch": 12.9042181309718, + "grad_norm": 0.39379021525382996, + "learning_rate": 0.0002131942823803967, + "loss": 3.2898, + "step": 44300 + }, + { + "epoch": 12.91878350034957, + "grad_norm": 0.38993388414382935, + "learning_rate": 0.00021275670945157527, + "loss": 3.287, + "step": 44350 + }, + { + "epoch": 12.933348869727336, + "grad_norm": 0.4032069146633148, + "learning_rate": 0.00021231913652275378, + "loss": 3.2932, + "step": 44400 + }, + { + "epoch": 12.947914239105105, + "grad_norm": 0.40004608035087585, + "learning_rate": 0.0002118815635939323, + "loss": 3.2954, + "step": 44450 + }, + { + "epoch": 12.962479608482871, + "grad_norm": 0.39480239152908325, + "learning_rate": 0.00021144399066511083, + "loss": 3.2967, + "step": 44500 + }, + { + "epoch": 12.977044977860638, + "grad_norm": 0.4099850058555603, + "learning_rate": 0.00021100641773628935, + "loss": 3.2809, + "step": 44550 + }, + { + "epoch": 12.991610347238407, + "grad_norm": 0.38592153787612915, + "learning_rate": 0.0002105688448074679, + "loss": 3.2948, + "step": 44600 + }, + { + "epoch": 13.006117455138662, + "grad_norm": 0.3878653049468994, + "learning_rate": 0.00021013127187864643, + "loss": 3.2447, + "step": 44650 + }, + { + "epoch": 13.02068282451643, + "grad_norm": 0.40472573041915894, + "learning_rate": 0.00020969369894982494, + "loss": 3.1872, + "step": 44700 + }, + { + "epoch": 13.035248193894198, + "grad_norm": 0.38480067253112793, + "learning_rate": 0.00020925612602100348, + "loss": 3.1966, + "step": 44750 + }, + { + "epoch": 13.049813563271965, + "grad_norm": 0.4043852388858795, + "learning_rate": 0.00020881855309218202, + "loss": 3.1928, + "step": 44800 + }, + { + "epoch": 13.064378932649731, + "grad_norm": 0.3920169174671173, + "learning_rate": 0.00020838098016336056, + "loss": 3.2142, + "step": 44850 + }, + { + "epoch": 13.0789443020275, + "grad_norm": 0.4085189402103424, + "learning_rate": 0.00020794340723453907, + "loss": 3.1996, + "step": 44900 + }, + { + "epoch": 13.093509671405267, + "grad_norm": 0.39081132411956787, + "learning_rate": 0.00020750583430571758, + "loss": 3.2162, + "step": 44950 + }, + { + "epoch": 13.108075040783035, + "grad_norm": 0.4104847311973572, + "learning_rate": 0.00020706826137689615, + "loss": 3.215, + "step": 45000 + }, + { + "epoch": 13.108075040783035, + "eval_accuracy": 0.3724899130919812, + "eval_loss": 3.5466861724853516, + "eval_runtime": 180.2123, + "eval_samples_per_second": 92.358, + "eval_steps_per_second": 5.777, + "step": 45000 + }, + { + "epoch": 13.122640410160802, + "grad_norm": 0.40783169865608215, + "learning_rate": 0.00020663068844807466, + "loss": 3.217, + "step": 45050 + }, + { + "epoch": 13.137205779538569, + "grad_norm": 0.3994167149066925, + "learning_rate": 0.0002061931155192532, + "loss": 3.214, + "step": 45100 + }, + { + "epoch": 13.151771148916337, + "grad_norm": 0.41038912534713745, + "learning_rate": 0.00020575554259043172, + "loss": 3.2265, + "step": 45150 + }, + { + "epoch": 13.166336518294104, + "grad_norm": 0.3970767557621002, + "learning_rate": 0.00020531796966161023, + "loss": 3.2219, + "step": 45200 + }, + { + "epoch": 13.18090188767187, + "grad_norm": 0.4076697528362274, + "learning_rate": 0.0002048803967327888, + "loss": 3.2133, + "step": 45250 + }, + { + "epoch": 13.19546725704964, + "grad_norm": 0.40613362193107605, + "learning_rate": 0.0002044428238039673, + "loss": 3.22, + "step": 45300 + }, + { + "epoch": 13.210032626427406, + "grad_norm": 0.39395052194595337, + "learning_rate": 0.00020400525087514585, + "loss": 3.2279, + "step": 45350 + }, + { + "epoch": 13.224597995805174, + "grad_norm": 0.3916940987110138, + "learning_rate": 0.00020356767794632436, + "loss": 3.2346, + "step": 45400 + }, + { + "epoch": 13.239163365182941, + "grad_norm": 0.41231533885002136, + "learning_rate": 0.00020313010501750287, + "loss": 3.2357, + "step": 45450 + }, + { + "epoch": 13.253728734560708, + "grad_norm": 0.4182799160480499, + "learning_rate": 0.00020269253208868144, + "loss": 3.2334, + "step": 45500 + }, + { + "epoch": 13.268294103938477, + "grad_norm": 0.4099382162094116, + "learning_rate": 0.00020225495915985995, + "loss": 3.2341, + "step": 45550 + }, + { + "epoch": 13.282859473316243, + "grad_norm": 0.4044232666492462, + "learning_rate": 0.0002018173862310385, + "loss": 3.2213, + "step": 45600 + }, + { + "epoch": 13.29742484269401, + "grad_norm": 0.39154335856437683, + "learning_rate": 0.000201379813302217, + "loss": 3.237, + "step": 45650 + }, + { + "epoch": 13.311990212071779, + "grad_norm": 0.4079340398311615, + "learning_rate": 0.00020094224037339555, + "loss": 3.2376, + "step": 45700 + }, + { + "epoch": 13.326555581449545, + "grad_norm": 0.39542028307914734, + "learning_rate": 0.0002005046674445741, + "loss": 3.2315, + "step": 45750 + }, + { + "epoch": 13.341120950827314, + "grad_norm": 0.39488768577575684, + "learning_rate": 0.0002000670945157526, + "loss": 3.2401, + "step": 45800 + }, + { + "epoch": 13.35568632020508, + "grad_norm": 0.41860339045524597, + "learning_rate": 0.00019962952158693114, + "loss": 3.2385, + "step": 45850 + }, + { + "epoch": 13.370251689582847, + "grad_norm": 0.4021410644054413, + "learning_rate": 0.00019919194865810968, + "loss": 3.2472, + "step": 45900 + }, + { + "epoch": 13.384817058960616, + "grad_norm": 0.3935169279575348, + "learning_rate": 0.0001987543757292882, + "loss": 3.2474, + "step": 45950 + }, + { + "epoch": 13.399382428338383, + "grad_norm": 0.4164498448371887, + "learning_rate": 0.00019831680280046673, + "loss": 3.2448, + "step": 46000 + }, + { + "epoch": 13.399382428338383, + "eval_accuracy": 0.37297574015729884, + "eval_loss": 3.5430777072906494, + "eval_runtime": 180.1561, + "eval_samples_per_second": 92.387, + "eval_steps_per_second": 5.778, + "step": 46000 + }, + { + "epoch": 13.41394779771615, + "grad_norm": 0.4161559045314789, + "learning_rate": 0.00019787922987164524, + "loss": 3.2397, + "step": 46050 + }, + { + "epoch": 13.428513167093918, + "grad_norm": 0.40776827931404114, + "learning_rate": 0.0001974416569428238, + "loss": 3.2325, + "step": 46100 + }, + { + "epoch": 13.443078536471685, + "grad_norm": 0.3878330886363983, + "learning_rate": 0.00019700408401400232, + "loss": 3.2609, + "step": 46150 + }, + { + "epoch": 13.457643905849451, + "grad_norm": 0.40034887194633484, + "learning_rate": 0.00019656651108518084, + "loss": 3.2584, + "step": 46200 + }, + { + "epoch": 13.47220927522722, + "grad_norm": 0.40647125244140625, + "learning_rate": 0.00019612893815635938, + "loss": 3.2431, + "step": 46250 + }, + { + "epoch": 13.486774644604987, + "grad_norm": 0.3935099244117737, + "learning_rate": 0.0001956913652275379, + "loss": 3.2455, + "step": 46300 + }, + { + "epoch": 13.501340013982755, + "grad_norm": 0.3952663540840149, + "learning_rate": 0.00019525379229871646, + "loss": 3.2482, + "step": 46350 + }, + { + "epoch": 13.515905383360522, + "grad_norm": 0.390480637550354, + "learning_rate": 0.00019481621936989497, + "loss": 3.2544, + "step": 46400 + }, + { + "epoch": 13.530470752738289, + "grad_norm": 0.40572217106819153, + "learning_rate": 0.00019437864644107348, + "loss": 3.2502, + "step": 46450 + }, + { + "epoch": 13.545036122116057, + "grad_norm": 0.38214248418807983, + "learning_rate": 0.00019394107351225202, + "loss": 3.2427, + "step": 46500 + }, + { + "epoch": 13.559601491493824, + "grad_norm": 0.4259106516838074, + "learning_rate": 0.00019350350058343054, + "loss": 3.2479, + "step": 46550 + }, + { + "epoch": 13.574166860871593, + "grad_norm": 0.3941766917705536, + "learning_rate": 0.0001930659276546091, + "loss": 3.2628, + "step": 46600 + }, + { + "epoch": 13.58873223024936, + "grad_norm": 0.40022504329681396, + "learning_rate": 0.00019262835472578762, + "loss": 3.2478, + "step": 46650 + }, + { + "epoch": 13.603297599627126, + "grad_norm": 0.3927033841609955, + "learning_rate": 0.00019219078179696613, + "loss": 3.2597, + "step": 46700 + }, + { + "epoch": 13.617862969004895, + "grad_norm": 0.4204312562942505, + "learning_rate": 0.00019175320886814467, + "loss": 3.2552, + "step": 46750 + }, + { + "epoch": 13.632428338382661, + "grad_norm": 0.4014910161495209, + "learning_rate": 0.0001913156359393232, + "loss": 3.2582, + "step": 46800 + }, + { + "epoch": 13.646993707760428, + "grad_norm": 0.3960302770137787, + "learning_rate": 0.00019087806301050175, + "loss": 3.247, + "step": 46850 + }, + { + "epoch": 13.661559077138197, + "grad_norm": 0.40421754121780396, + "learning_rate": 0.00019044049008168026, + "loss": 3.2509, + "step": 46900 + }, + { + "epoch": 13.676124446515963, + "grad_norm": 0.4028851091861725, + "learning_rate": 0.00019000291715285877, + "loss": 3.2603, + "step": 46950 + }, + { + "epoch": 13.69068981589373, + "grad_norm": 0.4152960181236267, + "learning_rate": 0.00018956534422403734, + "loss": 3.2716, + "step": 47000 + }, + { + "epoch": 13.69068981589373, + "eval_accuracy": 0.37343640580151827, + "eval_loss": 3.536942481994629, + "eval_runtime": 180.1541, + "eval_samples_per_second": 92.388, + "eval_steps_per_second": 5.778, + "step": 47000 + }, + { + "epoch": 13.705255185271499, + "grad_norm": 0.40029028058052063, + "learning_rate": 0.00018912777129521585, + "loss": 3.2608, + "step": 47050 + }, + { + "epoch": 13.719820554649266, + "grad_norm": 0.4005506634712219, + "learning_rate": 0.0001886901983663944, + "loss": 3.2562, + "step": 47100 + }, + { + "epoch": 13.734385924027034, + "grad_norm": 0.4043956398963928, + "learning_rate": 0.0001882526254375729, + "loss": 3.2553, + "step": 47150 + }, + { + "epoch": 13.7489512934048, + "grad_norm": 0.393660306930542, + "learning_rate": 0.00018781505250875142, + "loss": 3.2504, + "step": 47200 + }, + { + "epoch": 13.763516662782568, + "grad_norm": 0.41873812675476074, + "learning_rate": 0.00018737747957992999, + "loss": 3.2641, + "step": 47250 + }, + { + "epoch": 13.778082032160336, + "grad_norm": 0.39937934279441833, + "learning_rate": 0.0001869399066511085, + "loss": 3.2601, + "step": 47300 + }, + { + "epoch": 13.792647401538103, + "grad_norm": 0.39644569158554077, + "learning_rate": 0.00018650233372228704, + "loss": 3.2579, + "step": 47350 + }, + { + "epoch": 13.80721277091587, + "grad_norm": 0.4110250174999237, + "learning_rate": 0.00018606476079346555, + "loss": 3.2545, + "step": 47400 + }, + { + "epoch": 13.821778140293638, + "grad_norm": 0.39572134613990784, + "learning_rate": 0.00018562718786464406, + "loss": 3.2551, + "step": 47450 + }, + { + "epoch": 13.836343509671405, + "grad_norm": 0.40120694041252136, + "learning_rate": 0.00018518961493582263, + "loss": 3.2497, + "step": 47500 + }, + { + "epoch": 13.850908879049173, + "grad_norm": 0.3942031264305115, + "learning_rate": 0.00018475204200700114, + "loss": 3.2592, + "step": 47550 + }, + { + "epoch": 13.86547424842694, + "grad_norm": 0.4140487611293793, + "learning_rate": 0.00018431446907817968, + "loss": 3.2552, + "step": 47600 + }, + { + "epoch": 13.880039617804707, + "grad_norm": 0.39110127091407776, + "learning_rate": 0.0001838768961493582, + "loss": 3.261, + "step": 47650 + }, + { + "epoch": 13.894604987182475, + "grad_norm": 0.4091663360595703, + "learning_rate": 0.00018343932322053674, + "loss": 3.2709, + "step": 47700 + }, + { + "epoch": 13.909170356560242, + "grad_norm": 0.39773812890052795, + "learning_rate": 0.00018300175029171528, + "loss": 3.2645, + "step": 47750 + }, + { + "epoch": 13.923735725938009, + "grad_norm": 0.4022299647331238, + "learning_rate": 0.0001825641773628938, + "loss": 3.2597, + "step": 47800 + }, + { + "epoch": 13.938301095315778, + "grad_norm": 0.3977898061275482, + "learning_rate": 0.00018212660443407233, + "loss": 3.2697, + "step": 47850 + }, + { + "epoch": 13.952866464693544, + "grad_norm": 0.38834723830223083, + "learning_rate": 0.00018168903150525087, + "loss": 3.2585, + "step": 47900 + }, + { + "epoch": 13.967431834071313, + "grad_norm": 0.3896270990371704, + "learning_rate": 0.00018125145857642938, + "loss": 3.2654, + "step": 47950 + }, + { + "epoch": 13.98199720344908, + "grad_norm": 0.41397517919540405, + "learning_rate": 0.00018081388564760792, + "loss": 3.2672, + "step": 48000 + }, + { + "epoch": 13.98199720344908, + "eval_accuracy": 0.37412834488172014, + "eval_loss": 3.5272507667541504, + "eval_runtime": 180.2142, + "eval_samples_per_second": 92.357, + "eval_steps_per_second": 5.776, + "step": 48000 + }, + { + "epoch": 13.996562572826846, + "grad_norm": 0.3925948739051819, + "learning_rate": 0.00018037631271878644, + "loss": 3.2785, + "step": 48050 + }, + { + "epoch": 14.011069680727104, + "grad_norm": 0.39326012134552, + "learning_rate": 0.000179938739789965, + "loss": 3.2021, + "step": 48100 + }, + { + "epoch": 14.02563505010487, + "grad_norm": 0.40781304240226746, + "learning_rate": 0.00017950116686114352, + "loss": 3.1782, + "step": 48150 + }, + { + "epoch": 14.040200419482638, + "grad_norm": 0.3889636695384979, + "learning_rate": 0.00017906359393232203, + "loss": 3.1885, + "step": 48200 + }, + { + "epoch": 14.054765788860406, + "grad_norm": 0.4008404314517975, + "learning_rate": 0.00017862602100350057, + "loss": 3.1821, + "step": 48250 + }, + { + "epoch": 14.069331158238173, + "grad_norm": 0.4058891832828522, + "learning_rate": 0.00017818844807467908, + "loss": 3.1926, + "step": 48300 + }, + { + "epoch": 14.08389652761594, + "grad_norm": 0.3980492949485779, + "learning_rate": 0.00017775087514585765, + "loss": 3.1981, + "step": 48350 + }, + { + "epoch": 14.098461896993708, + "grad_norm": 0.4085221588611603, + "learning_rate": 0.00017731330221703616, + "loss": 3.1935, + "step": 48400 + }, + { + "epoch": 14.113027266371475, + "grad_norm": 0.41492384672164917, + "learning_rate": 0.00017687572928821467, + "loss": 3.194, + "step": 48450 + }, + { + "epoch": 14.127592635749243, + "grad_norm": 0.4290497899055481, + "learning_rate": 0.00017643815635939321, + "loss": 3.1947, + "step": 48500 + }, + { + "epoch": 14.14215800512701, + "grad_norm": 0.42287999391555786, + "learning_rate": 0.00017600058343057173, + "loss": 3.1991, + "step": 48550 + }, + { + "epoch": 14.156723374504777, + "grad_norm": 0.39472466707229614, + "learning_rate": 0.0001755630105017503, + "loss": 3.2116, + "step": 48600 + }, + { + "epoch": 14.171288743882545, + "grad_norm": 0.4188964068889618, + "learning_rate": 0.0001751254375729288, + "loss": 3.1942, + "step": 48650 + }, + { + "epoch": 14.185854113260312, + "grad_norm": 0.4070267975330353, + "learning_rate": 0.00017468786464410732, + "loss": 3.1969, + "step": 48700 + }, + { + "epoch": 14.200419482638079, + "grad_norm": 0.40462633967399597, + "learning_rate": 0.00017425029171528586, + "loss": 3.2017, + "step": 48750 + }, + { + "epoch": 14.214984852015847, + "grad_norm": 0.40400370955467224, + "learning_rate": 0.0001738127187864644, + "loss": 3.2093, + "step": 48800 + }, + { + "epoch": 14.229550221393614, + "grad_norm": 0.3998878002166748, + "learning_rate": 0.00017337514585764294, + "loss": 3.2056, + "step": 48850 + }, + { + "epoch": 14.244115590771383, + "grad_norm": 0.3977794945240021, + "learning_rate": 0.00017293757292882145, + "loss": 3.2038, + "step": 48900 + }, + { + "epoch": 14.25868096014915, + "grad_norm": 0.4316108226776123, + "learning_rate": 0.00017249999999999996, + "loss": 3.2037, + "step": 48950 + }, + { + "epoch": 14.273246329526916, + "grad_norm": 0.41260573267936707, + "learning_rate": 0.00017206242707117853, + "loss": 3.2114, + "step": 49000 + }, + { + "epoch": 14.273246329526916, + "eval_accuracy": 0.37353940302059335, + "eval_loss": 3.538419723510742, + "eval_runtime": 180.2527, + "eval_samples_per_second": 92.337, + "eval_steps_per_second": 5.775, + "step": 49000 + }, + { + "epoch": 14.287811698904685, + "grad_norm": 0.4174029231071472, + "learning_rate": 0.00017162485414235704, + "loss": 3.2237, + "step": 49050 + }, + { + "epoch": 14.302377068282452, + "grad_norm": 0.42132076621055603, + "learning_rate": 0.00017118728121353558, + "loss": 3.2049, + "step": 49100 + }, + { + "epoch": 14.316942437660218, + "grad_norm": 0.41422000527381897, + "learning_rate": 0.0001707497082847141, + "loss": 3.208, + "step": 49150 + }, + { + "epoch": 14.331507807037987, + "grad_norm": 0.4296468198299408, + "learning_rate": 0.0001703121353558926, + "loss": 3.2076, + "step": 49200 + }, + { + "epoch": 14.346073176415754, + "grad_norm": 0.40375787019729614, + "learning_rate": 0.00016987456242707118, + "loss": 3.2195, + "step": 49250 + }, + { + "epoch": 14.360638545793522, + "grad_norm": 0.4078134298324585, + "learning_rate": 0.0001694369894982497, + "loss": 3.2231, + "step": 49300 + }, + { + "epoch": 14.375203915171289, + "grad_norm": 0.4103347063064575, + "learning_rate": 0.00016899941656942823, + "loss": 3.224, + "step": 49350 + }, + { + "epoch": 14.389769284549056, + "grad_norm": 0.4056347906589508, + "learning_rate": 0.00016856184364060674, + "loss": 3.2211, + "step": 49400 + }, + { + "epoch": 14.404334653926824, + "grad_norm": 0.43045109510421753, + "learning_rate": 0.00016812427071178528, + "loss": 3.2178, + "step": 49450 + }, + { + "epoch": 14.418900023304591, + "grad_norm": 0.4060124158859253, + "learning_rate": 0.00016768669778296382, + "loss": 3.2132, + "step": 49500 + }, + { + "epoch": 14.433465392682358, + "grad_norm": 0.40384456515312195, + "learning_rate": 0.00016724912485414234, + "loss": 3.2172, + "step": 49550 + }, + { + "epoch": 14.448030762060126, + "grad_norm": 0.40116435289382935, + "learning_rate": 0.00016681155192532088, + "loss": 3.2099, + "step": 49600 + }, + { + "epoch": 14.462596131437893, + "grad_norm": 0.4094943404197693, + "learning_rate": 0.00016637397899649942, + "loss": 3.2127, + "step": 49650 + }, + { + "epoch": 14.477161500815662, + "grad_norm": 0.40145185589790344, + "learning_rate": 0.00016593640606767793, + "loss": 3.22, + "step": 49700 + }, + { + "epoch": 14.491726870193428, + "grad_norm": 0.42102572321891785, + "learning_rate": 0.00016549883313885647, + "loss": 3.2253, + "step": 49750 + }, + { + "epoch": 14.506292239571195, + "grad_norm": 0.41271886229515076, + "learning_rate": 0.00016506126021003498, + "loss": 3.207, + "step": 49800 + }, + { + "epoch": 14.520857608948964, + "grad_norm": 0.41741323471069336, + "learning_rate": 0.00016462368728121355, + "loss": 3.2322, + "step": 49850 + }, + { + "epoch": 14.53542297832673, + "grad_norm": 0.40796613693237305, + "learning_rate": 0.00016418611435239206, + "loss": 3.2178, + "step": 49900 + }, + { + "epoch": 14.549988347704497, + "grad_norm": 0.4142317771911621, + "learning_rate": 0.00016374854142357057, + "loss": 3.2196, + "step": 49950 + }, + { + "epoch": 14.564553717082266, + "grad_norm": 0.4134737253189087, + "learning_rate": 0.0001633109684947491, + "loss": 3.229, + "step": 50000 + }, + { + "epoch": 14.564553717082266, + "eval_accuracy": 0.3739324620427029, + "eval_loss": 3.535773515701294, + "eval_runtime": 180.2641, + "eval_samples_per_second": 92.331, + "eval_steps_per_second": 5.775, + "step": 50000 + } + ], + "logging_steps": 50, + "max_steps": 68660, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 2 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.045105940496384e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}