diff --git "a/last_to_hit_frequency_3591/checkpoint-100000/trainer_state.json" "b/last_to_hit_frequency_3591/checkpoint-100000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last_to_hit_frequency_3591/checkpoint-100000/trainer_state.json" @@ -0,0 +1,14943 @@ +{ + "best_global_step": 72000, + "best_metric": 3.527845621109009, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_hit_frequency_3591/checkpoint-30000", + "epoch": 29.129340480074575, + "eval_steps": 1000, + "global_step": 100000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01456536937776742, + "grad_norm": 1.7487624883651733, + "learning_rate": 0.000294, + "loss": 8.4085, + "step": 50 + }, + { + "epoch": 0.02913073875553484, + "grad_norm": 0.8792372941970825, + "learning_rate": 0.0005939999999999999, + "loss": 6.7474, + "step": 100 + }, + { + "epoch": 0.04369610813330226, + "grad_norm": 0.49055325984954834, + "learning_rate": 0.0005998286213931798, + "loss": 6.369, + "step": 150 + }, + { + "epoch": 0.05826147751106968, + "grad_norm": 0.47406330704689026, + "learning_rate": 0.0005996537452637714, + "loss": 6.152, + "step": 200 + }, + { + "epoch": 0.0728268468888371, + "grad_norm": 0.5468786358833313, + "learning_rate": 0.0005994788691343632, + "loss": 6.0121, + "step": 250 + }, + { + "epoch": 0.08739221626660452, + "grad_norm": 0.5636491775512695, + "learning_rate": 0.0005993039930049548, + "loss": 5.8836, + "step": 300 + }, + { + "epoch": 0.10195758564437195, + "grad_norm": 0.47112441062927246, + "learning_rate": 0.0005991291168755465, + "loss": 5.7826, + "step": 350 + }, + { + "epoch": 0.11652295502213936, + "grad_norm": 0.614771842956543, + "learning_rate": 0.0005989542407461382, + "loss": 5.6472, + "step": 400 + }, + { + "epoch": 0.13108832439990678, + "grad_norm": 0.49487683176994324, + "learning_rate": 0.0005987793646167297, + "loss": 5.5373, + "step": 450 + }, + { + "epoch": 
0.1456536937776742, + "grad_norm": 0.4133830964565277, + "learning_rate": 0.0005986044884873214, + "loss": 5.4464, + "step": 500 + }, + { + "epoch": 0.16021906315544163, + "grad_norm": 0.43713563680648804, + "learning_rate": 0.0005984296123579131, + "loss": 5.3584, + "step": 550 + }, + { + "epoch": 0.17478443253320905, + "grad_norm": 0.4758622348308563, + "learning_rate": 0.0005982547362285047, + "loss": 5.2639, + "step": 600 + }, + { + "epoch": 0.18934980191097647, + "grad_norm": 0.5036144852638245, + "learning_rate": 0.0005980798600990964, + "loss": 5.2065, + "step": 650 + }, + { + "epoch": 0.2039151712887439, + "grad_norm": 0.4153907597064972, + "learning_rate": 0.0005979049839696881, + "loss": 5.1452, + "step": 700 + }, + { + "epoch": 0.2184805406665113, + "grad_norm": 0.49257349967956543, + "learning_rate": 0.0005977301078402798, + "loss": 5.0858, + "step": 750 + }, + { + "epoch": 0.23304591004427871, + "grad_norm": 0.39646583795547485, + "learning_rate": 0.0005975552317108715, + "loss": 5.0535, + "step": 800 + }, + { + "epoch": 0.24761127942204614, + "grad_norm": 0.4354263246059418, + "learning_rate": 0.0005973803555814631, + "loss": 4.9802, + "step": 850 + }, + { + "epoch": 0.26217664879981356, + "grad_norm": 0.45780736207962036, + "learning_rate": 0.0005972054794520547, + "loss": 4.9416, + "step": 900 + }, + { + "epoch": 0.276742018177581, + "grad_norm": 0.4360142946243286, + "learning_rate": 0.0005970306033226464, + "loss": 4.8913, + "step": 950 + }, + { + "epoch": 0.2913073875553484, + "grad_norm": 0.49000921845436096, + "learning_rate": 0.0005968557271932381, + "loss": 4.8516, + "step": 1000 + }, + { + "epoch": 0.2913073875553484, + "eval_accuracy": 0.25072978433607523, + "eval_loss": 4.782108783721924, + "eval_runtime": 182.688, + "eval_samples_per_second": 91.112, + "eval_steps_per_second": 5.698, + "step": 1000 + }, + { + "epoch": 0.30587275693311583, + "grad_norm": 0.4772098958492279, + "learning_rate": 0.0005966808510638297, + "loss": 4.7909, + 
"step": 1050 + }, + { + "epoch": 0.32043812631088325, + "grad_norm": 0.4113984704017639, + "learning_rate": 0.0005965059749344214, + "loss": 4.7556, + "step": 1100 + }, + { + "epoch": 0.3350034956886507, + "grad_norm": 0.4781721830368042, + "learning_rate": 0.0005963310988050131, + "loss": 4.7197, + "step": 1150 + }, + { + "epoch": 0.3495688650664181, + "grad_norm": 0.5447441935539246, + "learning_rate": 0.0005961562226756047, + "loss": 4.6751, + "step": 1200 + }, + { + "epoch": 0.3641342344441855, + "grad_norm": 0.41747575998306274, + "learning_rate": 0.0005959813465461965, + "loss": 4.6392, + "step": 1250 + }, + { + "epoch": 0.37869960382195295, + "grad_norm": 0.44658181071281433, + "learning_rate": 0.000595806470416788, + "loss": 4.6204, + "step": 1300 + }, + { + "epoch": 0.39326497319972037, + "grad_norm": 0.4609386622905731, + "learning_rate": 0.0005956315942873797, + "loss": 4.5836, + "step": 1350 + }, + { + "epoch": 0.4078303425774878, + "grad_norm": 0.4060616195201874, + "learning_rate": 0.0005954567181579714, + "loss": 4.5615, + "step": 1400 + }, + { + "epoch": 0.42239571195525516, + "grad_norm": 0.4222695827484131, + "learning_rate": 0.000595281842028563, + "loss": 4.5376, + "step": 1450 + }, + { + "epoch": 0.4369610813330226, + "grad_norm": 0.4395248591899872, + "learning_rate": 0.0005951069658991547, + "loss": 4.5107, + "step": 1500 + }, + { + "epoch": 0.45152645071079, + "grad_norm": 0.4157288372516632, + "learning_rate": 0.0005949320897697464, + "loss": 4.5016, + "step": 1550 + }, + { + "epoch": 0.46609182008855743, + "grad_norm": 0.4212968349456787, + "learning_rate": 0.0005947572136403381, + "loss": 4.4616, + "step": 1600 + }, + { + "epoch": 0.48065718946632485, + "grad_norm": 0.49588635563850403, + "learning_rate": 0.0005945823375109297, + "loss": 4.4497, + "step": 1650 + }, + { + "epoch": 0.4952225588440923, + "grad_norm": 0.44670262932777405, + "learning_rate": 0.0005944074613815215, + "loss": 4.4344, + "step": 1700 + }, + { + "epoch": 
0.5097879282218597, + "grad_norm": 0.36793598532676697, + "learning_rate": 0.000594232585252113, + "loss": 4.4265, + "step": 1750 + }, + { + "epoch": 0.5243532975996271, + "grad_norm": 0.4189833700656891, + "learning_rate": 0.0005940577091227047, + "loss": 4.4012, + "step": 1800 + }, + { + "epoch": 0.5389186669773945, + "grad_norm": 0.4342043995857239, + "learning_rate": 0.0005938828329932964, + "loss": 4.3671, + "step": 1850 + }, + { + "epoch": 0.553484036355162, + "grad_norm": 0.4385491609573364, + "learning_rate": 0.000593707956863888, + "loss": 4.3797, + "step": 1900 + }, + { + "epoch": 0.5680494057329294, + "grad_norm": 0.3876365125179291, + "learning_rate": 0.0005935330807344797, + "loss": 4.3617, + "step": 1950 + }, + { + "epoch": 0.5826147751106968, + "grad_norm": 0.41458868980407715, + "learning_rate": 0.0005933582046050714, + "loss": 4.342, + "step": 2000 + }, + { + "epoch": 0.5826147751106968, + "eval_accuracy": 0.2995414195009285, + "eval_loss": 4.284348011016846, + "eval_runtime": 182.6917, + "eval_samples_per_second": 91.11, + "eval_steps_per_second": 5.698, + "step": 2000 + }, + { + "epoch": 0.5971801444884642, + "grad_norm": 0.41804736852645874, + "learning_rate": 0.000593183328475663, + "loss": 4.3264, + "step": 2050 + }, + { + "epoch": 0.6117455138662317, + "grad_norm": 0.36194130778312683, + "learning_rate": 0.0005930084523462546, + "loss": 4.3039, + "step": 2100 + }, + { + "epoch": 0.6263108832439991, + "grad_norm": 0.3918125629425049, + "learning_rate": 0.0005928335762168463, + "loss": 4.2908, + "step": 2150 + }, + { + "epoch": 0.6408762526217665, + "grad_norm": 0.40795278549194336, + "learning_rate": 0.000592658700087438, + "loss": 4.3007, + "step": 2200 + }, + { + "epoch": 0.6554416219995339, + "grad_norm": 0.38976508378982544, + "learning_rate": 0.0005924838239580297, + "loss": 4.2836, + "step": 2250 + }, + { + "epoch": 0.6700069913773014, + "grad_norm": 0.40438467264175415, + "learning_rate": 0.0005923089478286214, + "loss": 4.2674, + 
"step": 2300 + }, + { + "epoch": 0.6845723607550688, + "grad_norm": 0.41360440850257874, + "learning_rate": 0.000592134071699213, + "loss": 4.2654, + "step": 2350 + }, + { + "epoch": 0.6991377301328362, + "grad_norm": 0.377794474363327, + "learning_rate": 0.0005919591955698047, + "loss": 4.247, + "step": 2400 + }, + { + "epoch": 0.7137030995106036, + "grad_norm": 0.391387015581131, + "learning_rate": 0.0005917843194403964, + "loss": 4.2377, + "step": 2450 + }, + { + "epoch": 0.728268468888371, + "grad_norm": 0.355221688747406, + "learning_rate": 0.000591609443310988, + "loss": 4.2396, + "step": 2500 + }, + { + "epoch": 0.7428338382661385, + "grad_norm": 0.4049054682254791, + "learning_rate": 0.0005914345671815796, + "loss": 4.225, + "step": 2550 + }, + { + "epoch": 0.7573992076439059, + "grad_norm": 0.3985639214515686, + "learning_rate": 0.0005912596910521713, + "loss": 4.204, + "step": 2600 + }, + { + "epoch": 0.7719645770216733, + "grad_norm": 0.3753962814807892, + "learning_rate": 0.0005910848149227629, + "loss": 4.207, + "step": 2650 + }, + { + "epoch": 0.7865299463994407, + "grad_norm": 0.3719504773616791, + "learning_rate": 0.0005909099387933547, + "loss": 4.1872, + "step": 2700 + }, + { + "epoch": 0.8010953157772082, + "grad_norm": 0.3820745348930359, + "learning_rate": 0.0005907350626639463, + "loss": 4.1728, + "step": 2750 + }, + { + "epoch": 0.8156606851549756, + "grad_norm": 0.39611145853996277, + "learning_rate": 0.000590560186534538, + "loss": 4.175, + "step": 2800 + }, + { + "epoch": 0.8302260545327429, + "grad_norm": 0.3846186697483063, + "learning_rate": 0.0005903853104051297, + "loss": 4.171, + "step": 2850 + }, + { + "epoch": 0.8447914239105103, + "grad_norm": 0.37356239557266235, + "learning_rate": 0.0005902104342757214, + "loss": 4.1733, + "step": 2900 + }, + { + "epoch": 0.8593567932882777, + "grad_norm": 0.3760608434677124, + "learning_rate": 0.000590035558146313, + "loss": 4.1446, + "step": 2950 + }, + { + "epoch": 0.8739221626660452, + 
"grad_norm": 0.36522382497787476, + "learning_rate": 0.0005898606820169046, + "loss": 4.1349, + "step": 3000 + }, + { + "epoch": 0.8739221626660452, + "eval_accuracy": 0.3161861104367184, + "eval_loss": 4.093681335449219, + "eval_runtime": 182.834, + "eval_samples_per_second": 91.039, + "eval_steps_per_second": 5.694, + "step": 3000 + }, + { + "epoch": 0.8884875320438126, + "grad_norm": 0.3807234764099121, + "learning_rate": 0.0005896858058874963, + "loss": 4.1488, + "step": 3050 + }, + { + "epoch": 0.90305290142158, + "grad_norm": 0.37441378831863403, + "learning_rate": 0.0005895109297580879, + "loss": 4.1366, + "step": 3100 + }, + { + "epoch": 0.9176182707993474, + "grad_norm": 0.3428540825843811, + "learning_rate": 0.0005893360536286797, + "loss": 4.1134, + "step": 3150 + }, + { + "epoch": 0.9321836401771149, + "grad_norm": 0.37319910526275635, + "learning_rate": 0.0005891611774992713, + "loss": 4.1224, + "step": 3200 + }, + { + "epoch": 0.9467490095548823, + "grad_norm": 0.3176666796207428, + "learning_rate": 0.000588986301369863, + "loss": 4.1011, + "step": 3250 + }, + { + "epoch": 0.9613143789326497, + "grad_norm": 0.33169302344322205, + "learning_rate": 0.0005888114252404547, + "loss": 4.109, + "step": 3300 + }, + { + "epoch": 0.9758797483104171, + "grad_norm": 0.37893936038017273, + "learning_rate": 0.0005886365491110463, + "loss": 4.0925, + "step": 3350 + }, + { + "epoch": 0.9904451176881846, + "grad_norm": 0.37214741110801697, + "learning_rate": 0.000588461672981638, + "loss": 4.0892, + "step": 3400 + }, + { + "epoch": 1.0049522255884409, + "grad_norm": 0.36634066700935364, + "learning_rate": 0.0005882867968522296, + "loss": 4.0616, + "step": 3450 + }, + { + "epoch": 1.0195175949662083, + "grad_norm": 0.35482263565063477, + "learning_rate": 0.0005881119207228212, + "loss": 4.0124, + "step": 3500 + }, + { + "epoch": 1.0340829643439757, + "grad_norm": 0.3779667913913727, + "learning_rate": 0.0005879370445934129, + "loss": 4.016, + "step": 3550 + }, + { + 
"epoch": 1.0486483337217432, + "grad_norm": 0.3397766947746277, + "learning_rate": 0.0005877621684640046, + "loss": 4.0033, + "step": 3600 + }, + { + "epoch": 1.0632137030995106, + "grad_norm": 0.3722945749759674, + "learning_rate": 0.0005875872923345963, + "loss": 4.0129, + "step": 3650 + }, + { + "epoch": 1.077779072477278, + "grad_norm": 0.34330493211746216, + "learning_rate": 0.000587412416205188, + "loss": 4.003, + "step": 3700 + }, + { + "epoch": 1.0923444418550454, + "grad_norm": 0.3765234649181366, + "learning_rate": 0.0005872375400757797, + "loss": 4.0107, + "step": 3750 + }, + { + "epoch": 1.1069098112328128, + "grad_norm": 0.35264670848846436, + "learning_rate": 0.0005870626639463713, + "loss": 4.0043, + "step": 3800 + }, + { + "epoch": 1.1214751806105803, + "grad_norm": 0.3567696213722229, + "learning_rate": 0.0005868877878169629, + "loss": 4.0016, + "step": 3850 + }, + { + "epoch": 1.1360405499883477, + "grad_norm": 0.37721458077430725, + "learning_rate": 0.0005867129116875546, + "loss": 4.0014, + "step": 3900 + }, + { + "epoch": 1.1506059193661151, + "grad_norm": 0.34036508202552795, + "learning_rate": 0.0005865380355581462, + "loss": 3.9731, + "step": 3950 + }, + { + "epoch": 1.1651712887438825, + "grad_norm": 0.33936476707458496, + "learning_rate": 0.0005863631594287379, + "loss": 3.9879, + "step": 4000 + }, + { + "epoch": 1.1651712887438825, + "eval_accuracy": 0.3259847194699489, + "eval_loss": 3.9844980239868164, + "eval_runtime": 182.8731, + "eval_samples_per_second": 91.019, + "eval_steps_per_second": 5.692, + "step": 4000 + }, + { + "epoch": 1.17973665812165, + "grad_norm": 0.35038861632347107, + "learning_rate": 0.0005861882832993296, + "loss": 3.9796, + "step": 4050 + }, + { + "epoch": 1.1943020274994174, + "grad_norm": 0.3499089777469635, + "learning_rate": 0.0005860134071699212, + "loss": 4.0012, + "step": 4100 + }, + { + "epoch": 1.2088673968771848, + "grad_norm": 0.35973456501960754, + "learning_rate": 0.000585838531040513, + "loss": 
3.9843, + "step": 4150 + }, + { + "epoch": 1.2234327662549522, + "grad_norm": 0.364793598651886, + "learning_rate": 0.0005856636549111046, + "loss": 3.9738, + "step": 4200 + }, + { + "epoch": 1.2379981356327197, + "grad_norm": 0.3340921998023987, + "learning_rate": 0.0005854887787816963, + "loss": 3.9804, + "step": 4250 + }, + { + "epoch": 1.252563505010487, + "grad_norm": 0.33135804533958435, + "learning_rate": 0.0005853139026522879, + "loss": 3.9741, + "step": 4300 + }, + { + "epoch": 1.2671288743882545, + "grad_norm": 0.348160058259964, + "learning_rate": 0.0005851390265228796, + "loss": 3.9693, + "step": 4350 + }, + { + "epoch": 1.281694243766022, + "grad_norm": 0.3392653167247772, + "learning_rate": 0.0005849641503934712, + "loss": 3.9645, + "step": 4400 + }, + { + "epoch": 1.2962596131437893, + "grad_norm": 0.33709239959716797, + "learning_rate": 0.0005847892742640629, + "loss": 3.9534, + "step": 4450 + }, + { + "epoch": 1.3108249825215568, + "grad_norm": 0.3292732238769531, + "learning_rate": 0.0005846143981346546, + "loss": 3.9565, + "step": 4500 + }, + { + "epoch": 1.3253903518993242, + "grad_norm": 0.346902459859848, + "learning_rate": 0.0005844395220052462, + "loss": 3.945, + "step": 4550 + }, + { + "epoch": 1.3399557212770916, + "grad_norm": 0.3570997416973114, + "learning_rate": 0.000584264645875838, + "loss": 3.9458, + "step": 4600 + }, + { + "epoch": 1.354521090654859, + "grad_norm": 0.3431386351585388, + "learning_rate": 0.0005840897697464296, + "loss": 3.9343, + "step": 4650 + }, + { + "epoch": 1.3690864600326265, + "grad_norm": 0.32554003596305847, + "learning_rate": 0.0005839148936170212, + "loss": 3.9334, + "step": 4700 + }, + { + "epoch": 1.3836518294103939, + "grad_norm": 0.33119750022888184, + "learning_rate": 0.0005837400174876129, + "loss": 3.9402, + "step": 4750 + }, + { + "epoch": 1.3982171987881613, + "grad_norm": 0.32726046442985535, + "learning_rate": 0.0005835651413582045, + "loss": 3.9414, + "step": 4800 + }, + { + "epoch": 
1.4127825681659287, + "grad_norm": 0.37814900279045105, + "learning_rate": 0.0005833902652287962, + "loss": 3.9381, + "step": 4850 + }, + { + "epoch": 1.4273479375436962, + "grad_norm": 0.35007208585739136, + "learning_rate": 0.0005832153890993879, + "loss": 3.941, + "step": 4900 + }, + { + "epoch": 1.4419133069214636, + "grad_norm": 0.47103646397590637, + "learning_rate": 0.0005830405129699796, + "loss": 3.9365, + "step": 4950 + }, + { + "epoch": 1.456478676299231, + "grad_norm": 0.3166019916534424, + "learning_rate": 0.0005828656368405712, + "loss": 3.9229, + "step": 5000 + }, + { + "epoch": 1.456478676299231, + "eval_accuracy": 0.33251971202484953, + "eval_loss": 3.9095144271850586, + "eval_runtime": 182.8302, + "eval_samples_per_second": 91.041, + "eval_steps_per_second": 5.694, + "step": 5000 + }, + { + "epoch": 1.4710440456769984, + "grad_norm": 0.32818934321403503, + "learning_rate": 0.0005826907607111629, + "loss": 3.9321, + "step": 5050 + }, + { + "epoch": 1.4856094150547658, + "grad_norm": 0.3326047658920288, + "learning_rate": 0.0005825158845817546, + "loss": 3.9226, + "step": 5100 + }, + { + "epoch": 1.500174784432533, + "grad_norm": 0.3376672863960266, + "learning_rate": 0.0005823410084523462, + "loss": 3.93, + "step": 5150 + }, + { + "epoch": 1.5147401538103007, + "grad_norm": 0.3359002470970154, + "learning_rate": 0.0005821661323229379, + "loss": 3.9337, + "step": 5200 + }, + { + "epoch": 1.529305523188068, + "grad_norm": 0.33936458826065063, + "learning_rate": 0.0005819912561935295, + "loss": 3.9042, + "step": 5250 + }, + { + "epoch": 1.5438708925658355, + "grad_norm": 0.34475040435791016, + "learning_rate": 0.0005818163800641212, + "loss": 3.9182, + "step": 5300 + }, + { + "epoch": 1.5584362619436027, + "grad_norm": 0.3286258578300476, + "learning_rate": 0.0005816415039347129, + "loss": 3.9125, + "step": 5350 + }, + { + "epoch": 1.5730016313213704, + "grad_norm": 0.3313068747520447, + "learning_rate": 0.0005814666278053045, + "loss": 3.8986, + 
"step": 5400 + }, + { + "epoch": 1.5875670006991376, + "grad_norm": 0.3305990695953369, + "learning_rate": 0.0005812917516758962, + "loss": 3.9158, + "step": 5450 + }, + { + "epoch": 1.6021323700769052, + "grad_norm": 0.3188944458961487, + "learning_rate": 0.0005811168755464879, + "loss": 3.8946, + "step": 5500 + }, + { + "epoch": 1.6166977394546724, + "grad_norm": 0.353261798620224, + "learning_rate": 0.0005809419994170794, + "loss": 3.9022, + "step": 5550 + }, + { + "epoch": 1.63126310883244, + "grad_norm": 0.3191840648651123, + "learning_rate": 0.0005807671232876712, + "loss": 3.8972, + "step": 5600 + }, + { + "epoch": 1.6458284782102073, + "grad_norm": 0.3437453508377075, + "learning_rate": 0.0005805922471582628, + "loss": 3.8846, + "step": 5650 + }, + { + "epoch": 1.660393847587975, + "grad_norm": 0.3136034309864044, + "learning_rate": 0.0005804173710288545, + "loss": 3.8851, + "step": 5700 + }, + { + "epoch": 1.6749592169657421, + "grad_norm": 0.31763720512390137, + "learning_rate": 0.0005802424948994462, + "loss": 3.8838, + "step": 5750 + }, + { + "epoch": 1.6895245863435098, + "grad_norm": 0.352448970079422, + "learning_rate": 0.0005800676187700379, + "loss": 3.8774, + "step": 5800 + }, + { + "epoch": 1.704089955721277, + "grad_norm": 0.32752853631973267, + "learning_rate": 0.0005798927426406295, + "loss": 3.8735, + "step": 5850 + }, + { + "epoch": 1.7186553250990446, + "grad_norm": 0.31873783469200134, + "learning_rate": 0.0005797178665112212, + "loss": 3.8819, + "step": 5900 + }, + { + "epoch": 1.7332206944768118, + "grad_norm": 0.3134726881980896, + "learning_rate": 0.0005795429903818129, + "loss": 3.8784, + "step": 5950 + }, + { + "epoch": 1.7477860638545795, + "grad_norm": 0.32978641986846924, + "learning_rate": 0.0005793681142524044, + "loss": 3.8805, + "step": 6000 + }, + { + "epoch": 1.7477860638545795, + "eval_accuracy": 0.33774250948934204, + "eval_loss": 3.851607084274292, + "eval_runtime": 183.0051, + "eval_samples_per_second": 90.954, + 
"eval_steps_per_second": 5.688, + "step": 6000 + }, + { + "epoch": 1.7623514332323467, + "grad_norm": 0.31962209939956665, + "learning_rate": 0.0005791932381229961, + "loss": 3.8609, + "step": 6050 + }, + { + "epoch": 1.7769168026101143, + "grad_norm": 0.353483647108078, + "learning_rate": 0.0005790183619935878, + "loss": 3.866, + "step": 6100 + }, + { + "epoch": 1.7914821719878815, + "grad_norm": 0.34389597177505493, + "learning_rate": 0.0005788434858641795, + "loss": 3.8668, + "step": 6150 + }, + { + "epoch": 1.8060475413656492, + "grad_norm": 0.306185245513916, + "learning_rate": 0.0005786686097347712, + "loss": 3.8713, + "step": 6200 + }, + { + "epoch": 1.8206129107434164, + "grad_norm": 0.3174114525318146, + "learning_rate": 0.0005784937336053628, + "loss": 3.854, + "step": 6250 + }, + { + "epoch": 1.835178280121184, + "grad_norm": 0.3356582522392273, + "learning_rate": 0.0005783188574759545, + "loss": 3.8665, + "step": 6300 + }, + { + "epoch": 1.8497436494989512, + "grad_norm": 0.3357125222682953, + "learning_rate": 0.0005781439813465462, + "loss": 3.8615, + "step": 6350 + }, + { + "epoch": 1.8643090188767188, + "grad_norm": 0.3359164297580719, + "learning_rate": 0.0005779691052171379, + "loss": 3.8584, + "step": 6400 + }, + { + "epoch": 1.878874388254486, + "grad_norm": 0.3328222632408142, + "learning_rate": 0.0005777942290877294, + "loss": 3.8494, + "step": 6450 + }, + { + "epoch": 1.8934397576322537, + "grad_norm": 0.3154282569885254, + "learning_rate": 0.0005776193529583211, + "loss": 3.8592, + "step": 6500 + }, + { + "epoch": 1.908005127010021, + "grad_norm": 0.35024163126945496, + "learning_rate": 0.0005774444768289128, + "loss": 3.8619, + "step": 6550 + }, + { + "epoch": 1.9225704963877885, + "grad_norm": 0.3441106379032135, + "learning_rate": 0.0005772696006995045, + "loss": 3.8442, + "step": 6600 + }, + { + "epoch": 1.9371358657655557, + "grad_norm": 0.3300905227661133, + "learning_rate": 0.0005770947245700962, + "loss": 3.8414, + "step": 6650 + }, + 
{ + "epoch": 1.9517012351433234, + "grad_norm": 0.32087442278862, + "learning_rate": 0.0005769198484406878, + "loss": 3.8356, + "step": 6700 + }, + { + "epoch": 1.9662666045210906, + "grad_norm": 0.3275013864040375, + "learning_rate": 0.0005767449723112795, + "loss": 3.837, + "step": 6750 + }, + { + "epoch": 1.9808319738988582, + "grad_norm": 0.3164341151714325, + "learning_rate": 0.0005765700961818712, + "loss": 3.836, + "step": 6800 + }, + { + "epoch": 1.9953973432766254, + "grad_norm": 0.3237488269805908, + "learning_rate": 0.0005763952200524627, + "loss": 3.8226, + "step": 6850 + }, + { + "epoch": 2.0099044511768818, + "grad_norm": 0.3478614091873169, + "learning_rate": 0.0005762203439230544, + "loss": 3.7605, + "step": 6900 + }, + { + "epoch": 2.0244698205546494, + "grad_norm": 0.3280088007450104, + "learning_rate": 0.0005760454677936461, + "loss": 3.7386, + "step": 6950 + }, + { + "epoch": 2.0390351899324166, + "grad_norm": 0.3092538118362427, + "learning_rate": 0.0005758705916642378, + "loss": 3.7313, + "step": 7000 + }, + { + "epoch": 2.0390351899324166, + "eval_accuracy": 0.34221756385061836, + "eval_loss": 3.8092734813690186, + "eval_runtime": 182.6845, + "eval_samples_per_second": 91.113, + "eval_steps_per_second": 5.698, + "step": 7000 + }, + { + "epoch": 2.0536005593101843, + "grad_norm": 0.3326481282711029, + "learning_rate": 0.0005756957155348294, + "loss": 3.7507, + "step": 7050 + }, + { + "epoch": 2.0681659286879515, + "grad_norm": 0.3384568989276886, + "learning_rate": 0.0005755208394054211, + "loss": 3.7393, + "step": 7100 + }, + { + "epoch": 2.082731298065719, + "grad_norm": 0.33243587613105774, + "learning_rate": 0.0005753459632760128, + "loss": 3.7399, + "step": 7150 + }, + { + "epoch": 2.0972966674434863, + "grad_norm": 0.31076061725616455, + "learning_rate": 0.0005751710871466045, + "loss": 3.7378, + "step": 7200 + }, + { + "epoch": 2.111862036821254, + "grad_norm": 0.3274184465408325, + "learning_rate": 0.0005749962110171962, + "loss": 
3.7429, + "step": 7250 + }, + { + "epoch": 2.126427406199021, + "grad_norm": 0.32674604654312134, + "learning_rate": 0.0005748213348877877, + "loss": 3.7583, + "step": 7300 + }, + { + "epoch": 2.140992775576789, + "grad_norm": 0.32339632511138916, + "learning_rate": 0.0005746464587583794, + "loss": 3.7516, + "step": 7350 + }, + { + "epoch": 2.155558144954556, + "grad_norm": 0.3199286162853241, + "learning_rate": 0.0005744715826289711, + "loss": 3.7587, + "step": 7400 + }, + { + "epoch": 2.1701235143323236, + "grad_norm": 0.3267718553543091, + "learning_rate": 0.0005742967064995627, + "loss": 3.749, + "step": 7450 + }, + { + "epoch": 2.184688883710091, + "grad_norm": 0.32316339015960693, + "learning_rate": 0.0005741218303701544, + "loss": 3.747, + "step": 7500 + }, + { + "epoch": 2.1992542530878585, + "grad_norm": 0.3192504048347473, + "learning_rate": 0.0005739469542407461, + "loss": 3.7527, + "step": 7550 + }, + { + "epoch": 2.2138196224656257, + "grad_norm": 0.3137800097465515, + "learning_rate": 0.0005737720781113378, + "loss": 3.7386, + "step": 7600 + }, + { + "epoch": 2.2283849918433933, + "grad_norm": 0.3079003393650055, + "learning_rate": 0.0005735972019819295, + "loss": 3.7412, + "step": 7650 + }, + { + "epoch": 2.2429503612211605, + "grad_norm": 0.317622572183609, + "learning_rate": 0.000573422325852521, + "loss": 3.7524, + "step": 7700 + }, + { + "epoch": 2.257515730598928, + "grad_norm": 0.3800288140773773, + "learning_rate": 0.0005732474497231127, + "loss": 3.7621, + "step": 7750 + }, + { + "epoch": 2.2720810999766954, + "grad_norm": 0.3083147704601288, + "learning_rate": 0.0005730725735937044, + "loss": 3.7586, + "step": 7800 + }, + { + "epoch": 2.286646469354463, + "grad_norm": 0.3407652974128723, + "learning_rate": 0.0005728976974642961, + "loss": 3.7421, + "step": 7850 + }, + { + "epoch": 2.3012118387322302, + "grad_norm": 0.3287487030029297, + "learning_rate": 0.0005727228213348877, + "loss": 3.7566, + "step": 7900 + }, + { + "epoch": 
2.3157772081099974, + "grad_norm": 0.3335205912590027, + "learning_rate": 0.0005725479452054794, + "loss": 3.7335, + "step": 7950 + }, + { + "epoch": 2.330342577487765, + "grad_norm": 0.3264673352241516, + "learning_rate": 0.0005723730690760711, + "loss": 3.7565, + "step": 8000 + }, + { + "epoch": 2.330342577487765, + "eval_accuracy": 0.34492789746043634, + "eval_loss": 3.7804434299468994, + "eval_runtime": 183.2368, + "eval_samples_per_second": 90.839, + "eval_steps_per_second": 5.681, + "step": 8000 + }, + { + "epoch": 2.3449079468655327, + "grad_norm": 0.3331637382507324, + "learning_rate": 0.0005721981929466627, + "loss": 3.7361, + "step": 8050 + }, + { + "epoch": 2.3594733162433, + "grad_norm": 0.332265704870224, + "learning_rate": 0.0005720233168172545, + "loss": 3.7488, + "step": 8100 + }, + { + "epoch": 2.374038685621067, + "grad_norm": 0.3338277041912079, + "learning_rate": 0.000571848440687846, + "loss": 3.749, + "step": 8150 + }, + { + "epoch": 2.3886040549988348, + "grad_norm": 0.3343018889427185, + "learning_rate": 0.0005716735645584377, + "loss": 3.7482, + "step": 8200 + }, + { + "epoch": 2.4031694243766024, + "grad_norm": 0.3533099591732025, + "learning_rate": 0.0005714986884290294, + "loss": 3.7509, + "step": 8250 + }, + { + "epoch": 2.4177347937543696, + "grad_norm": 0.3301255404949188, + "learning_rate": 0.000571323812299621, + "loss": 3.7334, + "step": 8300 + }, + { + "epoch": 2.432300163132137, + "grad_norm": 0.31493982672691345, + "learning_rate": 0.0005711489361702127, + "loss": 3.7405, + "step": 8350 + }, + { + "epoch": 2.4468655325099045, + "grad_norm": 0.3294617235660553, + "learning_rate": 0.0005709740600408044, + "loss": 3.7471, + "step": 8400 + }, + { + "epoch": 2.461430901887672, + "grad_norm": 0.316875696182251, + "learning_rate": 0.0005707991839113961, + "loss": 3.756, + "step": 8450 + }, + { + "epoch": 2.4759962712654393, + "grad_norm": 0.310937762260437, + "learning_rate": 0.0005706243077819877, + "loss": 3.7412, + "step": 8500 + }, 
+ { + "epoch": 2.4905616406432065, + "grad_norm": 0.31915387511253357, + "learning_rate": 0.0005704494316525793, + "loss": 3.7455, + "step": 8550 + }, + { + "epoch": 2.505127010020974, + "grad_norm": 0.32014259696006775, + "learning_rate": 0.000570274555523171, + "loss": 3.7348, + "step": 8600 + }, + { + "epoch": 2.519692379398742, + "grad_norm": 0.32315295934677124, + "learning_rate": 0.0005700996793937627, + "loss": 3.7426, + "step": 8650 + }, + { + "epoch": 2.534257748776509, + "grad_norm": 0.33680716156959534, + "learning_rate": 0.0005699248032643544, + "loss": 3.7458, + "step": 8700 + }, + { + "epoch": 2.548823118154276, + "grad_norm": 0.30035462975502014, + "learning_rate": 0.000569749927134946, + "loss": 3.7355, + "step": 8750 + }, + { + "epoch": 2.563388487532044, + "grad_norm": 0.3163704574108124, + "learning_rate": 0.0005695750510055377, + "loss": 3.745, + "step": 8800 + }, + { + "epoch": 2.5779538569098115, + "grad_norm": 0.3217114806175232, + "learning_rate": 0.0005694001748761294, + "loss": 3.7532, + "step": 8850 + }, + { + "epoch": 2.5925192262875787, + "grad_norm": 0.3296523988246918, + "learning_rate": 0.000569225298746721, + "loss": 3.7536, + "step": 8900 + }, + { + "epoch": 2.607084595665346, + "grad_norm": 0.3380911648273468, + "learning_rate": 0.0005690504226173127, + "loss": 3.7383, + "step": 8950 + }, + { + "epoch": 2.6216499650431135, + "grad_norm": 0.3250406086444855, + "learning_rate": 0.0005688755464879043, + "loss": 3.7287, + "step": 9000 + }, + { + "epoch": 2.6216499650431135, + "eval_accuracy": 0.3478768975009979, + "eval_loss": 3.7482099533081055, + "eval_runtime": 182.7889, + "eval_samples_per_second": 91.061, + "eval_steps_per_second": 5.695, + "step": 9000 + }, + { + "epoch": 2.636215334420881, + "grad_norm": 0.2994716465473175, + "learning_rate": 0.000568700670358496, + "loss": 3.7238, + "step": 9050 + }, + { + "epoch": 2.6507807037986484, + "grad_norm": 0.34974968433380127, + "learning_rate": 0.0005685257942290877, + "loss": 
3.7324, + "step": 9100 + }, + { + "epoch": 2.6653460731764156, + "grad_norm": 0.3160337209701538, + "learning_rate": 0.0005683509180996793, + "loss": 3.7271, + "step": 9150 + }, + { + "epoch": 2.6799114425541832, + "grad_norm": 0.3240397274494171, + "learning_rate": 0.000568176041970271, + "loss": 3.7258, + "step": 9200 + }, + { + "epoch": 2.6944768119319504, + "grad_norm": 0.32082828879356384, + "learning_rate": 0.0005680011658408627, + "loss": 3.7358, + "step": 9250 + }, + { + "epoch": 2.709042181309718, + "grad_norm": 0.31649598479270935, + "learning_rate": 0.0005678262897114544, + "loss": 3.7392, + "step": 9300 + }, + { + "epoch": 2.7236075506874853, + "grad_norm": 0.31169888377189636, + "learning_rate": 0.000567651413582046, + "loss": 3.735, + "step": 9350 + }, + { + "epoch": 2.738172920065253, + "grad_norm": 0.3002908229827881, + "learning_rate": 0.0005674765374526377, + "loss": 3.7288, + "step": 9400 + }, + { + "epoch": 2.75273828944302, + "grad_norm": 0.30253180861473083, + "learning_rate": 0.0005673016613232293, + "loss": 3.7204, + "step": 9450 + }, + { + "epoch": 2.7673036588207878, + "grad_norm": 0.31831657886505127, + "learning_rate": 0.0005671267851938209, + "loss": 3.7258, + "step": 9500 + }, + { + "epoch": 2.781869028198555, + "grad_norm": 0.31231117248535156, + "learning_rate": 0.0005669519090644127, + "loss": 3.721, + "step": 9550 + }, + { + "epoch": 2.7964343975763226, + "grad_norm": 0.3149794936180115, + "learning_rate": 0.0005667770329350043, + "loss": 3.7337, + "step": 9600 + }, + { + "epoch": 2.81099976695409, + "grad_norm": 0.3181709945201874, + "learning_rate": 0.000566602156805596, + "loss": 3.7265, + "step": 9650 + }, + { + "epoch": 2.8255651363318575, + "grad_norm": 0.31284210085868835, + "learning_rate": 0.0005664272806761877, + "loss": 3.7288, + "step": 9700 + }, + { + "epoch": 2.8401305057096247, + "grad_norm": 0.3245266377925873, + "learning_rate": 0.0005662524045467793, + "loss": 3.7222, + "step": 9750 + }, + { + "epoch": 
2.8546958750873923, + "grad_norm": 0.319336473941803, + "learning_rate": 0.000566077528417371, + "loss": 3.7298, + "step": 9800 + }, + { + "epoch": 2.8692612444651595, + "grad_norm": 0.3112030625343323, + "learning_rate": 0.0005659026522879626, + "loss": 3.7303, + "step": 9850 + }, + { + "epoch": 2.883826613842927, + "grad_norm": 0.3266408443450928, + "learning_rate": 0.0005657277761585543, + "loss": 3.7114, + "step": 9900 + }, + { + "epoch": 2.8983919832206944, + "grad_norm": 0.3135480284690857, + "learning_rate": 0.0005655529000291459, + "loss": 3.7203, + "step": 9950 + }, + { + "epoch": 2.912957352598462, + "grad_norm": 0.3140251040458679, + "learning_rate": 0.0005653780238997376, + "loss": 3.7117, + "step": 10000 + }, + { + "epoch": 2.912957352598462, + "eval_accuracy": 0.3501354108677876, + "eval_loss": 3.7232017517089844, + "eval_runtime": 182.9143, + "eval_samples_per_second": 90.999, + "eval_steps_per_second": 5.691, + "step": 10000 + }, + { + "epoch": 2.927522721976229, + "grad_norm": 0.30501222610473633, + "learning_rate": 0.0005652031477703293, + "loss": 3.7205, + "step": 10050 + }, + { + "epoch": 2.942088091353997, + "grad_norm": 0.31396257877349854, + "learning_rate": 0.000565028271640921, + "loss": 3.7231, + "step": 10100 + }, + { + "epoch": 2.956653460731764, + "grad_norm": 0.29565396904945374, + "learning_rate": 0.0005648533955115127, + "loss": 3.7167, + "step": 10150 + }, + { + "epoch": 2.9712188301095317, + "grad_norm": 0.29209980368614197, + "learning_rate": 0.0005646785193821043, + "loss": 3.7011, + "step": 10200 + }, + { + "epoch": 2.985784199487299, + "grad_norm": 0.341580867767334, + "learning_rate": 0.000564503643252696, + "loss": 3.7199, + "step": 10250 + }, + { + "epoch": 3.0002913073875552, + "grad_norm": 0.306345134973526, + "learning_rate": 0.0005643287671232876, + "loss": 3.7135, + "step": 10300 + }, + { + "epoch": 3.014856676765323, + "grad_norm": 0.3232329487800598, + "learning_rate": 0.0005641538909938792, + "loss": 3.602, + "step": 
10350 + }, + { + "epoch": 3.02942204614309, + "grad_norm": 0.3200088441371918, + "learning_rate": 0.0005639790148644709, + "loss": 3.6044, + "step": 10400 + }, + { + "epoch": 3.0439874155208577, + "grad_norm": 0.31947940587997437, + "learning_rate": 0.0005638041387350626, + "loss": 3.612, + "step": 10450 + }, + { + "epoch": 3.058552784898625, + "grad_norm": 0.3416725695133209, + "learning_rate": 0.0005636292626056543, + "loss": 3.6212, + "step": 10500 + }, + { + "epoch": 3.0731181542763926, + "grad_norm": 0.3318784832954407, + "learning_rate": 0.000563454386476246, + "loss": 3.6079, + "step": 10550 + }, + { + "epoch": 3.0876835236541598, + "grad_norm": 0.3215178847312927, + "learning_rate": 0.0005632795103468376, + "loss": 3.623, + "step": 10600 + }, + { + "epoch": 3.1022488930319274, + "grad_norm": 0.3258126974105835, + "learning_rate": 0.0005631046342174293, + "loss": 3.6237, + "step": 10650 + }, + { + "epoch": 3.1168142624096946, + "grad_norm": 0.3248828947544098, + "learning_rate": 0.000562929758088021, + "loss": 3.6206, + "step": 10700 + }, + { + "epoch": 3.1313796317874623, + "grad_norm": 0.3171083331108093, + "learning_rate": 0.0005627548819586126, + "loss": 3.6301, + "step": 10750 + }, + { + "epoch": 3.1459450011652295, + "grad_norm": 0.3476962745189667, + "learning_rate": 0.0005625800058292042, + "loss": 3.6342, + "step": 10800 + }, + { + "epoch": 3.160510370542997, + "grad_norm": 0.32831183075904846, + "learning_rate": 0.0005624051296997959, + "loss": 3.6253, + "step": 10850 + }, + { + "epoch": 3.1750757399207643, + "grad_norm": 0.3208300769329071, + "learning_rate": 0.0005622302535703876, + "loss": 3.6201, + "step": 10900 + }, + { + "epoch": 3.189641109298532, + "grad_norm": 0.34289947152137756, + "learning_rate": 0.0005620553774409792, + "loss": 3.6407, + "step": 10950 + }, + { + "epoch": 3.204206478676299, + "grad_norm": 0.34401383996009827, + "learning_rate": 0.000561880501311571, + "loss": 3.6328, + "step": 11000 + }, + { + "epoch": 
3.204206478676299, + "eval_accuracy": 0.35195491908561366, + "eval_loss": 3.711780548095703, + "eval_runtime": 183.0966, + "eval_samples_per_second": 90.908, + "eval_steps_per_second": 5.686, + "step": 11000 + }, + { + "epoch": 3.218771848054067, + "grad_norm": 0.3176616132259369, + "learning_rate": 0.0005617056251821626, + "loss": 3.6359, + "step": 11050 + }, + { + "epoch": 3.233337217431834, + "grad_norm": 0.3091390132904053, + "learning_rate": 0.0005615307490527543, + "loss": 3.6274, + "step": 11100 + }, + { + "epoch": 3.2479025868096016, + "grad_norm": 0.31460776925086975, + "learning_rate": 0.000561355872923346, + "loss": 3.6288, + "step": 11150 + }, + { + "epoch": 3.262467956187369, + "grad_norm": 0.33371245861053467, + "learning_rate": 0.0005611809967939375, + "loss": 3.6434, + "step": 11200 + }, + { + "epoch": 3.2770333255651365, + "grad_norm": 0.3317493796348572, + "learning_rate": 0.0005610061206645292, + "loss": 3.6385, + "step": 11250 + }, + { + "epoch": 3.2915986949429037, + "grad_norm": 0.3349264860153198, + "learning_rate": 0.0005608312445351209, + "loss": 3.6397, + "step": 11300 + }, + { + "epoch": 3.3061640643206713, + "grad_norm": 0.32256776094436646, + "learning_rate": 0.0005606563684057126, + "loss": 3.6336, + "step": 11350 + }, + { + "epoch": 3.3207294336984385, + "grad_norm": 0.32920846343040466, + "learning_rate": 0.0005604814922763042, + "loss": 3.633, + "step": 11400 + }, + { + "epoch": 3.335294803076206, + "grad_norm": 0.32023611664772034, + "learning_rate": 0.0005603066161468959, + "loss": 3.6273, + "step": 11450 + }, + { + "epoch": 3.3498601724539734, + "grad_norm": 0.3239540755748749, + "learning_rate": 0.0005601317400174876, + "loss": 3.6354, + "step": 11500 + }, + { + "epoch": 3.364425541831741, + "grad_norm": 0.3216831386089325, + "learning_rate": 0.0005599568638880793, + "loss": 3.6423, + "step": 11550 + }, + { + "epoch": 3.3789909112095082, + "grad_norm": 0.3342783749103546, + "learning_rate": 0.0005597819877586709, + "loss": 
3.6189, + "step": 11600 + }, + { + "epoch": 3.393556280587276, + "grad_norm": 0.30918747186660767, + "learning_rate": 0.0005596071116292625, + "loss": 3.6395, + "step": 11650 + }, + { + "epoch": 3.408121649965043, + "grad_norm": 0.3271181583404541, + "learning_rate": 0.0005594322354998542, + "loss": 3.6403, + "step": 11700 + }, + { + "epoch": 3.4226870193428107, + "grad_norm": 0.30378925800323486, + "learning_rate": 0.0005592573593704459, + "loss": 3.6356, + "step": 11750 + }, + { + "epoch": 3.437252388720578, + "grad_norm": 0.3317507803440094, + "learning_rate": 0.0005590824832410375, + "loss": 3.6269, + "step": 11800 + }, + { + "epoch": 3.4518177580983456, + "grad_norm": 0.33540189266204834, + "learning_rate": 0.0005589076071116292, + "loss": 3.64, + "step": 11850 + }, + { + "epoch": 3.4663831274761128, + "grad_norm": 0.2987757921218872, + "learning_rate": 0.0005587327309822209, + "loss": 3.6381, + "step": 11900 + }, + { + "epoch": 3.4809484968538804, + "grad_norm": 0.3357625901699066, + "learning_rate": 0.0005585578548528126, + "loss": 3.6504, + "step": 11950 + }, + { + "epoch": 3.4955138662316476, + "grad_norm": 0.3251186013221741, + "learning_rate": 0.0005583829787234043, + "loss": 3.6393, + "step": 12000 + }, + { + "epoch": 3.4955138662316476, + "eval_accuracy": 0.3538467326506846, + "eval_loss": 3.695300817489624, + "eval_runtime": 183.2049, + "eval_samples_per_second": 90.855, + "eval_steps_per_second": 5.682, + "step": 12000 + }, + { + "epoch": 3.510079235609415, + "grad_norm": 0.3250819742679596, + "learning_rate": 0.0005582081025939958, + "loss": 3.628, + "step": 12050 + }, + { + "epoch": 3.5246446049871825, + "grad_norm": 0.3002704381942749, + "learning_rate": 0.0005580332264645875, + "loss": 3.6323, + "step": 12100 + }, + { + "epoch": 3.53920997436495, + "grad_norm": 0.3166797161102295, + "learning_rate": 0.0005578583503351792, + "loss": 3.6524, + "step": 12150 + }, + { + "epoch": 3.5537753437427173, + "grad_norm": 0.32995665073394775, + 
"learning_rate": 0.0005576834742057709, + "loss": 3.6375, + "step": 12200 + }, + { + "epoch": 3.5683407131204845, + "grad_norm": 0.3259824812412262, + "learning_rate": 0.0005575085980763625, + "loss": 3.6469, + "step": 12250 + }, + { + "epoch": 3.582906082498252, + "grad_norm": 0.35942357778549194, + "learning_rate": 0.0005573337219469542, + "loss": 3.6256, + "step": 12300 + }, + { + "epoch": 3.59747145187602, + "grad_norm": 0.35430946946144104, + "learning_rate": 0.0005571588458175459, + "loss": 3.6376, + "step": 12350 + }, + { + "epoch": 3.612036821253787, + "grad_norm": 0.31567761301994324, + "learning_rate": 0.0005569839696881374, + "loss": 3.6417, + "step": 12400 + }, + { + "epoch": 3.626602190631554, + "grad_norm": 0.32618406414985657, + "learning_rate": 0.0005568090935587292, + "loss": 3.6446, + "step": 12450 + }, + { + "epoch": 3.641167560009322, + "grad_norm": 0.31533339619636536, + "learning_rate": 0.0005566342174293208, + "loss": 3.6602, + "step": 12500 + }, + { + "epoch": 3.6557329293870895, + "grad_norm": 0.3274492621421814, + "learning_rate": 0.0005564593412999125, + "loss": 3.6288, + "step": 12550 + }, + { + "epoch": 3.6702982987648567, + "grad_norm": 0.3271532952785492, + "learning_rate": 0.0005562844651705042, + "loss": 3.6392, + "step": 12600 + }, + { + "epoch": 3.684863668142624, + "grad_norm": 0.3347550630569458, + "learning_rate": 0.0005561095890410958, + "loss": 3.6288, + "step": 12650 + }, + { + "epoch": 3.6994290375203915, + "grad_norm": 0.32583537697792053, + "learning_rate": 0.0005559347129116875, + "loss": 3.6302, + "step": 12700 + }, + { + "epoch": 3.713994406898159, + "grad_norm": 0.30043119192123413, + "learning_rate": 0.0005557598367822792, + "loss": 3.6391, + "step": 12750 + }, + { + "epoch": 3.7285597762759264, + "grad_norm": 0.328071653842926, + "learning_rate": 0.0005555849606528709, + "loss": 3.6402, + "step": 12800 + }, + { + "epoch": 3.7431251456536936, + "grad_norm": 0.3114347755908966, + "learning_rate": 
0.0005554100845234624, + "loss": 3.6368, + "step": 12850 + }, + { + "epoch": 3.7576905150314612, + "grad_norm": 0.3095006048679352, + "learning_rate": 0.0005552352083940541, + "loss": 3.642, + "step": 12900 + }, + { + "epoch": 3.772255884409229, + "grad_norm": 0.341740220785141, + "learning_rate": 0.0005550603322646458, + "loss": 3.6366, + "step": 12950 + }, + { + "epoch": 3.786821253786996, + "grad_norm": 0.3182758390903473, + "learning_rate": 0.0005548854561352375, + "loss": 3.6428, + "step": 13000 + }, + { + "epoch": 3.786821253786996, + "eval_accuracy": 0.35534045531206226, + "eval_loss": 3.67643141746521, + "eval_runtime": 182.9247, + "eval_samples_per_second": 90.994, + "eval_steps_per_second": 5.691, + "step": 13000 + }, + { + "epoch": 3.8013866231647633, + "grad_norm": 0.3386680781841278, + "learning_rate": 0.0005547105800058292, + "loss": 3.6498, + "step": 13050 + }, + { + "epoch": 3.815951992542531, + "grad_norm": 0.317020446062088, + "learning_rate": 0.0005545357038764208, + "loss": 3.6364, + "step": 13100 + }, + { + "epoch": 3.8305173619202986, + "grad_norm": 0.3221585154533386, + "learning_rate": 0.0005543608277470125, + "loss": 3.639, + "step": 13150 + }, + { + "epoch": 3.8450827312980658, + "grad_norm": 0.2992505133152008, + "learning_rate": 0.0005541859516176042, + "loss": 3.6415, + "step": 13200 + }, + { + "epoch": 3.859648100675833, + "grad_norm": 0.30075782537460327, + "learning_rate": 0.0005540110754881958, + "loss": 3.6458, + "step": 13250 + }, + { + "epoch": 3.8742134700536006, + "grad_norm": 0.3180374205112457, + "learning_rate": 0.0005538361993587874, + "loss": 3.6236, + "step": 13300 + }, + { + "epoch": 3.888778839431368, + "grad_norm": 0.3106091022491455, + "learning_rate": 0.0005536613232293791, + "loss": 3.6353, + "step": 13350 + }, + { + "epoch": 3.9033442088091355, + "grad_norm": 0.29708361625671387, + "learning_rate": 0.0005534864470999708, + "loss": 3.6366, + "step": 13400 + }, + { + "epoch": 3.9179095781869027, + "grad_norm": 
0.34170088171958923, + "learning_rate": 0.0005533115709705625, + "loss": 3.6262, + "step": 13450 + }, + { + "epoch": 3.9324749475646703, + "grad_norm": 0.3265573978424072, + "learning_rate": 0.0005531366948411541, + "loss": 3.635, + "step": 13500 + }, + { + "epoch": 3.9470403169424375, + "grad_norm": 0.31741398572921753, + "learning_rate": 0.0005529618187117458, + "loss": 3.631, + "step": 13550 + }, + { + "epoch": 3.961605686320205, + "grad_norm": 0.32067936658859253, + "learning_rate": 0.0005527869425823375, + "loss": 3.6362, + "step": 13600 + }, + { + "epoch": 3.9761710556979724, + "grad_norm": 0.31947606801986694, + "learning_rate": 0.0005526120664529292, + "loss": 3.6329, + "step": 13650 + }, + { + "epoch": 3.99073642507574, + "grad_norm": 0.2974706292152405, + "learning_rate": 0.0005524371903235207, + "loss": 3.636, + "step": 13700 + }, + { + "epoch": 4.005243532975996, + "grad_norm": 0.3165639042854309, + "learning_rate": 0.0005522623141941124, + "loss": 3.5888, + "step": 13750 + }, + { + "epoch": 4.0198089023537635, + "grad_norm": 0.3143453299999237, + "learning_rate": 0.0005520874380647041, + "loss": 3.5185, + "step": 13800 + }, + { + "epoch": 4.034374271731531, + "grad_norm": 0.3318156599998474, + "learning_rate": 0.0005519125619352957, + "loss": 3.52, + "step": 13850 + }, + { + "epoch": 4.048939641109299, + "grad_norm": 0.30892449617385864, + "learning_rate": 0.0005517376858058875, + "loss": 3.5368, + "step": 13900 + }, + { + "epoch": 4.063505010487066, + "grad_norm": 0.3253442645072937, + "learning_rate": 0.0005515628096764791, + "loss": 3.5317, + "step": 13950 + }, + { + "epoch": 4.078070379864833, + "grad_norm": 0.31239765882492065, + "learning_rate": 0.0005513879335470708, + "loss": 3.5319, + "step": 14000 + }, + { + "epoch": 4.078070379864833, + "eval_accuracy": 0.35649757600732224, + "eval_loss": 3.668315887451172, + "eval_runtime": 182.8782, + "eval_samples_per_second": 91.017, + "eval_steps_per_second": 5.692, + "step": 14000 + }, + { + "epoch": 
4.092635749242601, + "grad_norm": 0.32942360639572144, + "learning_rate": 0.0005512130574176625, + "loss": 3.5415, + "step": 14050 + }, + { + "epoch": 4.1072011186203685, + "grad_norm": 0.3166195750236511, + "learning_rate": 0.000551038181288254, + "loss": 3.5305, + "step": 14100 + }, + { + "epoch": 4.121766487998135, + "grad_norm": 0.3233526349067688, + "learning_rate": 0.0005508633051588457, + "loss": 3.5415, + "step": 14150 + }, + { + "epoch": 4.136331857375903, + "grad_norm": 0.32190340757369995, + "learning_rate": 0.0005506884290294374, + "loss": 3.5477, + "step": 14200 + }, + { + "epoch": 4.150897226753671, + "grad_norm": 0.34432554244995117, + "learning_rate": 0.0005505135529000291, + "loss": 3.5525, + "step": 14250 + }, + { + "epoch": 4.165462596131438, + "grad_norm": 0.3212747275829315, + "learning_rate": 0.0005503386767706207, + "loss": 3.5454, + "step": 14300 + }, + { + "epoch": 4.180027965509205, + "grad_norm": 0.3225768804550171, + "learning_rate": 0.0005501638006412124, + "loss": 3.5577, + "step": 14350 + }, + { + "epoch": 4.194593334886973, + "grad_norm": 0.3135558068752289, + "learning_rate": 0.0005499889245118041, + "loss": 3.5534, + "step": 14400 + }, + { + "epoch": 4.20915870426474, + "grad_norm": 0.342723548412323, + "learning_rate": 0.0005498140483823958, + "loss": 3.5381, + "step": 14450 + }, + { + "epoch": 4.223724073642508, + "grad_norm": 0.3262888193130493, + "learning_rate": 0.0005496391722529875, + "loss": 3.5649, + "step": 14500 + }, + { + "epoch": 4.238289443020275, + "grad_norm": 0.3289225399494171, + "learning_rate": 0.000549464296123579, + "loss": 3.5588, + "step": 14550 + }, + { + "epoch": 4.252854812398042, + "grad_norm": 0.30956584215164185, + "learning_rate": 0.0005492894199941707, + "loss": 3.5632, + "step": 14600 + }, + { + "epoch": 4.26742018177581, + "grad_norm": 0.33964231610298157, + "learning_rate": 0.0005491145438647624, + "loss": 3.5687, + "step": 14650 + }, + { + "epoch": 4.281985551153578, + "grad_norm": 
0.33205127716064453, + "learning_rate": 0.000548939667735354, + "loss": 3.5543, + "step": 14700 + }, + { + "epoch": 4.296550920531344, + "grad_norm": null, + "learning_rate": 0.0005487647916059457, + "loss": 3.5658, + "step": 14750 + }, + { + "epoch": 4.311116289909112, + "grad_norm": 0.3393228054046631, + "learning_rate": 0.0005485899154765374, + "loss": 3.5577, + "step": 14800 + }, + { + "epoch": 4.32568165928688, + "grad_norm": 0.31260740756988525, + "learning_rate": 0.0005484150393471291, + "loss": 3.5722, + "step": 14850 + }, + { + "epoch": 4.340247028664647, + "grad_norm": 0.331304132938385, + "learning_rate": 0.0005482401632177208, + "loss": 3.5601, + "step": 14900 + }, + { + "epoch": 4.354812398042414, + "grad_norm": 0.3720129728317261, + "learning_rate": 0.0005480652870883124, + "loss": 3.5754, + "step": 14950 + }, + { + "epoch": 4.369377767420182, + "grad_norm": 0.31915804743766785, + "learning_rate": 0.000547890410958904, + "loss": 3.5417, + "step": 15000 + }, + { + "epoch": 4.369377767420182, + "eval_accuracy": 0.3573578332850318, + "eval_loss": 3.6594953536987305, + "eval_runtime": 183.0024, + "eval_samples_per_second": 90.955, + "eval_steps_per_second": 5.688, + "step": 15000 + }, + { + "epoch": 4.383943136797949, + "grad_norm": 0.3377160429954529, + "learning_rate": 0.0005477155348294957, + "loss": 3.5704, + "step": 15050 + }, + { + "epoch": 4.398508506175717, + "grad_norm": 0.31284624338150024, + "learning_rate": 0.0005475406587000874, + "loss": 3.562, + "step": 15100 + }, + { + "epoch": 4.413073875553484, + "grad_norm": 0.331798255443573, + "learning_rate": 0.000547365782570679, + "loss": 3.5583, + "step": 15150 + }, + { + "epoch": 4.427639244931251, + "grad_norm": 0.31809282302856445, + "learning_rate": 0.0005471909064412707, + "loss": 3.5653, + "step": 15200 + }, + { + "epoch": 4.442204614309019, + "grad_norm": 0.3349161446094513, + "learning_rate": 0.0005470160303118624, + "loss": 3.5674, + "step": 15250 + }, + { + "epoch": 
4.456769983686787, + "grad_norm": 0.32030266523361206, + "learning_rate": 0.000546841154182454, + "loss": 3.5776, + "step": 15300 + }, + { + "epoch": 4.471335353064553, + "grad_norm": null, + "learning_rate": 0.0005466662780530458, + "loss": 3.5804, + "step": 15350 + }, + { + "epoch": 4.485900722442321, + "grad_norm": 0.3176333010196686, + "learning_rate": 0.0005464914019236374, + "loss": 3.5782, + "step": 15400 + }, + { + "epoch": 4.500466091820089, + "grad_norm": 0.3277718126773834, + "learning_rate": 0.000546316525794229, + "loss": 3.5864, + "step": 15450 + }, + { + "epoch": 4.515031461197856, + "grad_norm": 0.3107328712940216, + "learning_rate": 0.0005461416496648207, + "loss": 3.5818, + "step": 15500 + }, + { + "epoch": 4.529596830575623, + "grad_norm": 0.32739174365997314, + "learning_rate": 0.0005459667735354123, + "loss": 3.5574, + "step": 15550 + }, + { + "epoch": 4.544162199953391, + "grad_norm": 0.3131246268749237, + "learning_rate": 0.000545791897406004, + "loss": 3.5691, + "step": 15600 + }, + { + "epoch": 4.558727569331158, + "grad_norm": 0.33344247937202454, + "learning_rate": 0.0005456170212765957, + "loss": 3.5742, + "step": 15650 + }, + { + "epoch": 4.573292938708926, + "grad_norm": 0.3450835347175598, + "learning_rate": 0.0005454421451471874, + "loss": 3.5739, + "step": 15700 + }, + { + "epoch": 4.587858308086693, + "grad_norm": 0.31020715832710266, + "learning_rate": 0.000545267269017779, + "loss": 3.5733, + "step": 15750 + }, + { + "epoch": 4.6024236774644605, + "grad_norm": 0.3077200651168823, + "learning_rate": 0.0005450923928883708, + "loss": 3.5576, + "step": 15800 + }, + { + "epoch": 4.616989046842228, + "grad_norm": 0.3353123068809509, + "learning_rate": 0.0005449175167589623, + "loss": 3.5723, + "step": 15850 + }, + { + "epoch": 4.631554416219995, + "grad_norm": 0.32540038228034973, + "learning_rate": 0.000544742640629554, + "loss": 3.5793, + "step": 15900 + }, + { + "epoch": 4.6461197855977625, + "grad_norm": 0.3153678774833679, + 
"learning_rate": 0.0005445677645001457, + "loss": 3.566, + "step": 15950 + }, + { + "epoch": 4.66068515497553, + "grad_norm": 0.3324628472328186, + "learning_rate": 0.0005443928883707373, + "loss": 3.5679, + "step": 16000 + }, + { + "epoch": 4.66068515497553, + "eval_accuracy": 0.35883380292619155, + "eval_loss": 3.6458113193511963, + "eval_runtime": 182.9054, + "eval_samples_per_second": 91.003, + "eval_steps_per_second": 5.691, + "step": 16000 + }, + { + "epoch": 4.675250524353298, + "grad_norm": 0.30943557620048523, + "learning_rate": 0.000544218012241329, + "loss": 3.5781, + "step": 16050 + }, + { + "epoch": 4.689815893731065, + "grad_norm": 0.33318641781806946, + "learning_rate": 0.0005440431361119207, + "loss": 3.5782, + "step": 16100 + }, + { + "epoch": 4.704381263108832, + "grad_norm": 0.32952025532722473, + "learning_rate": 0.0005438682599825123, + "loss": 3.5731, + "step": 16150 + }, + { + "epoch": 4.7189466324866, + "grad_norm": 0.33214232325553894, + "learning_rate": 0.000543693383853104, + "loss": 3.5754, + "step": 16200 + }, + { + "epoch": 4.7335120018643675, + "grad_norm": 0.3278939723968506, + "learning_rate": 0.0005435185077236957, + "loss": 3.5669, + "step": 16250 + }, + { + "epoch": 4.748077371242134, + "grad_norm": 0.35436153411865234, + "learning_rate": 0.0005433436315942873, + "loss": 3.5705, + "step": 16300 + }, + { + "epoch": 4.762642740619902, + "grad_norm": 0.32666391134262085, + "learning_rate": 0.000543168755464879, + "loss": 3.567, + "step": 16350 + }, + { + "epoch": 4.7772081099976695, + "grad_norm": 0.3220253586769104, + "learning_rate": 0.0005429938793354706, + "loss": 3.5843, + "step": 16400 + }, + { + "epoch": 4.791773479375437, + "grad_norm": 0.3143666982650757, + "learning_rate": 0.0005428190032060623, + "loss": 3.5715, + "step": 16450 + }, + { + "epoch": 4.806338848753205, + "grad_norm": 0.30706289410591125, + "learning_rate": 0.000542644127076654, + "loss": 3.5738, + "step": 16500 + }, + { + "epoch": 4.820904218130972, + 
"grad_norm": 0.3221777677536011, + "learning_rate": 0.0005424692509472457, + "loss": 3.5769, + "step": 16550 + }, + { + "epoch": 4.835469587508739, + "grad_norm": 0.32023727893829346, + "learning_rate": 0.0005422943748178373, + "loss": 3.5685, + "step": 16600 + }, + { + "epoch": 4.850034956886507, + "grad_norm": 0.3428962230682373, + "learning_rate": 0.000542119498688429, + "loss": 3.5825, + "step": 16650 + }, + { + "epoch": 4.864600326264274, + "grad_norm": 0.3187549412250519, + "learning_rate": 0.0005419446225590207, + "loss": 3.5684, + "step": 16700 + }, + { + "epoch": 4.879165695642041, + "grad_norm": 0.32895606756210327, + "learning_rate": 0.0005417697464296122, + "loss": 3.5723, + "step": 16750 + }, + { + "epoch": 4.893731065019809, + "grad_norm": 0.31578466296195984, + "learning_rate": 0.000541594870300204, + "loss": 3.5769, + "step": 16800 + }, + { + "epoch": 4.908296434397577, + "grad_norm": 0.3406703472137451, + "learning_rate": 0.0005414199941707956, + "loss": 3.5636, + "step": 16850 + }, + { + "epoch": 4.922861803775344, + "grad_norm": 0.32653164863586426, + "learning_rate": 0.0005412451180413873, + "loss": 3.5667, + "step": 16900 + }, + { + "epoch": 4.937427173153111, + "grad_norm": 0.31869933009147644, + "learning_rate": 0.000541070241911979, + "loss": 3.5796, + "step": 16950 + }, + { + "epoch": 4.951992542530879, + "grad_norm": 0.3086072504520416, + "learning_rate": 0.0005408953657825706, + "loss": 3.5802, + "step": 17000 + }, + { + "epoch": 4.951992542530879, + "eval_accuracy": 0.36002831077661235, + "eval_loss": 3.6297221183776855, + "eval_runtime": 182.8739, + "eval_samples_per_second": 91.019, + "eval_steps_per_second": 5.692, + "step": 17000 + }, + { + "epoch": 4.966557911908646, + "grad_norm": 0.3254891335964203, + "learning_rate": 0.0005407204896531623, + "loss": 3.5614, + "step": 17050 + }, + { + "epoch": 4.981123281286413, + "grad_norm": 0.3315959870815277, + "learning_rate": 0.000540545613523754, + "loss": 3.5736, + "step": 17100 + }, + { + 
"epoch": 4.995688650664181, + "grad_norm": 0.308977335691452, + "learning_rate": 0.0005403707373943456, + "loss": 3.5687, + "step": 17150 + }, + { + "epoch": 5.010195758564437, + "grad_norm": 0.33981844782829285, + "learning_rate": 0.0005401958612649372, + "loss": 3.5112, + "step": 17200 + }, + { + "epoch": 5.024761127942204, + "grad_norm": 0.32200807332992554, + "learning_rate": 0.000540020985135529, + "loss": 3.4613, + "step": 17250 + }, + { + "epoch": 5.039326497319972, + "grad_norm": 0.30297327041625977, + "learning_rate": 0.0005398461090061206, + "loss": 3.4576, + "step": 17300 + }, + { + "epoch": 5.0538918666977395, + "grad_norm": 0.32989853620529175, + "learning_rate": 0.0005396712328767123, + "loss": 3.4718, + "step": 17350 + }, + { + "epoch": 5.068457236075507, + "grad_norm": 0.31648069620132446, + "learning_rate": 0.000539496356747304, + "loss": 3.4785, + "step": 17400 + }, + { + "epoch": 5.083022605453274, + "grad_norm": 0.3199455440044403, + "learning_rate": 0.0005393214806178956, + "loss": 3.4711, + "step": 17450 + }, + { + "epoch": 5.0975879748310415, + "grad_norm": 0.33137065172195435, + "learning_rate": 0.0005391466044884873, + "loss": 3.491, + "step": 17500 + }, + { + "epoch": 5.112153344208809, + "grad_norm": 0.33032289147377014, + "learning_rate": 0.000538971728359079, + "loss": 3.4838, + "step": 17550 + }, + { + "epoch": 5.126718713586577, + "grad_norm": 0.32575368881225586, + "learning_rate": 0.0005387968522296705, + "loss": 3.492, + "step": 17600 + }, + { + "epoch": 5.141284082964344, + "grad_norm": 0.3349052965641022, + "learning_rate": 0.0005386219761002622, + "loss": 3.4838, + "step": 17650 + }, + { + "epoch": 5.155849452342111, + "grad_norm": 0.3152545094490051, + "learning_rate": 0.0005384470999708539, + "loss": 3.4819, + "step": 17700 + }, + { + "epoch": 5.170414821719879, + "grad_norm": 0.321737676858902, + "learning_rate": 0.0005382722238414456, + "loss": 3.486, + "step": 17750 + }, + { + "epoch": 5.1849801910976465, + "grad_norm": 
0.36409512162208557, + "learning_rate": 0.0005380973477120373, + "loss": 3.4959, + "step": 17800 + }, + { + "epoch": 5.199545560475413, + "grad_norm": 0.32220658659935, + "learning_rate": 0.000537922471582629, + "loss": 3.4938, + "step": 17850 + }, + { + "epoch": 5.214110929853181, + "grad_norm": 0.3427848219871521, + "learning_rate": 0.0005377475954532206, + "loss": 3.4992, + "step": 17900 + }, + { + "epoch": 5.228676299230949, + "grad_norm": 0.33268028497695923, + "learning_rate": 0.0005375727193238123, + "loss": 3.4945, + "step": 17950 + }, + { + "epoch": 5.243241668608716, + "grad_norm": 0.2996683120727539, + "learning_rate": 0.000537397843194404, + "loss": 3.4929, + "step": 18000 + }, + { + "epoch": 5.243241668608716, + "eval_accuracy": 0.36062627011984466, + "eval_loss": 3.6342105865478516, + "eval_runtime": 182.824, + "eval_samples_per_second": 91.044, + "eval_steps_per_second": 5.694, + "step": 18000 + }, + { + "epoch": 5.257807037986483, + "grad_norm": 0.30999842286109924, + "learning_rate": 0.0005372229670649955, + "loss": 3.5096, + "step": 18050 + }, + { + "epoch": 5.272372407364251, + "grad_norm": 0.32671719789505005, + "learning_rate": 0.0005370480909355872, + "loss": 3.5092, + "step": 18100 + }, + { + "epoch": 5.286937776742018, + "grad_norm": 0.32220521569252014, + "learning_rate": 0.0005368732148061789, + "loss": 3.5055, + "step": 18150 + }, + { + "epoch": 5.301503146119786, + "grad_norm": 0.3446556031703949, + "learning_rate": 0.0005366983386767705, + "loss": 3.5103, + "step": 18200 + }, + { + "epoch": 5.316068515497553, + "grad_norm": 0.333375483751297, + "learning_rate": 0.0005365234625473623, + "loss": 3.5227, + "step": 18250 + }, + { + "epoch": 5.33063388487532, + "grad_norm": 0.32386448979377747, + "learning_rate": 0.0005363485864179539, + "loss": 3.5153, + "step": 18300 + }, + { + "epoch": 5.345199254253088, + "grad_norm": 0.31273576617240906, + "learning_rate": 0.0005361737102885456, + "loss": 3.5114, + "step": 18350 + }, + { + "epoch": 
5.359764623630856, + "grad_norm": 0.32386016845703125, + "learning_rate": 0.0005359988341591373, + "loss": 3.5176, + "step": 18400 + }, + { + "epoch": 5.374329993008622, + "grad_norm": 0.3084457516670227, + "learning_rate": 0.000535823958029729, + "loss": 3.5144, + "step": 18450 + }, + { + "epoch": 5.38889536238639, + "grad_norm": 0.34510570764541626, + "learning_rate": 0.0005356490819003205, + "loss": 3.5283, + "step": 18500 + }, + { + "epoch": 5.403460731764158, + "grad_norm": 0.33156850934028625, + "learning_rate": 0.0005354742057709122, + "loss": 3.5198, + "step": 18550 + }, + { + "epoch": 5.418026101141925, + "grad_norm": 0.32253149151802063, + "learning_rate": 0.0005352993296415039, + "loss": 3.5129, + "step": 18600 + }, + { + "epoch": 5.432591470519692, + "grad_norm": 0.343431293964386, + "learning_rate": 0.0005351244535120955, + "loss": 3.5214, + "step": 18650 + }, + { + "epoch": 5.44715683989746, + "grad_norm": 0.320697158575058, + "learning_rate": 0.0005349495773826873, + "loss": 3.5089, + "step": 18700 + }, + { + "epoch": 5.461722209275227, + "grad_norm": 0.3079582154750824, + "learning_rate": 0.0005347747012532789, + "loss": 3.523, + "step": 18750 + }, + { + "epoch": 5.476287578652995, + "grad_norm": 0.33627447485923767, + "learning_rate": 0.0005345998251238706, + "loss": 3.5175, + "step": 18800 + }, + { + "epoch": 5.490852948030762, + "grad_norm": 0.3293239176273346, + "learning_rate": 0.0005344249489944623, + "loss": 3.5214, + "step": 18850 + }, + { + "epoch": 5.505418317408529, + "grad_norm": 0.321121484041214, + "learning_rate": 0.0005342500728650538, + "loss": 3.5227, + "step": 18900 + }, + { + "epoch": 5.519983686786297, + "grad_norm": 0.32323557138442993, + "learning_rate": 0.0005340751967356455, + "loss": 3.5089, + "step": 18950 + }, + { + "epoch": 5.534549056164065, + "grad_norm": 0.3353577256202698, + "learning_rate": 0.0005339003206062372, + "loss": 3.5125, + "step": 19000 + }, + { + "epoch": 5.534549056164065, + "eval_accuracy": 
0.36119366134879455, + "eval_loss": 3.62491774559021, + "eval_runtime": 182.9044, + "eval_samples_per_second": 91.004, + "eval_steps_per_second": 5.691, + "step": 19000 + }, + { + "epoch": 5.549114425541831, + "grad_norm": 0.3373064696788788, + "learning_rate": 0.0005337254444768288, + "loss": 3.5161, + "step": 19050 + }, + { + "epoch": 5.563679794919599, + "grad_norm": 0.3205850422382355, + "learning_rate": 0.0005335505683474205, + "loss": 3.5198, + "step": 19100 + }, + { + "epoch": 5.578245164297367, + "grad_norm": 0.33948105573654175, + "learning_rate": 0.0005333756922180122, + "loss": 3.5155, + "step": 19150 + }, + { + "epoch": 5.592810533675134, + "grad_norm": 0.30776116251945496, + "learning_rate": 0.0005332008160886039, + "loss": 3.5019, + "step": 19200 + }, + { + "epoch": 5.607375903052901, + "grad_norm": 0.42596328258514404, + "learning_rate": 0.0005330259399591956, + "loss": 3.5311, + "step": 19250 + }, + { + "epoch": 5.621941272430669, + "grad_norm": 0.34155476093292236, + "learning_rate": 0.0005328510638297873, + "loss": 3.5215, + "step": 19300 + }, + { + "epoch": 5.636506641808436, + "grad_norm": 0.33613601326942444, + "learning_rate": 0.0005326761877003788, + "loss": 3.5089, + "step": 19350 + }, + { + "epoch": 5.651072011186204, + "grad_norm": 0.32420244812965393, + "learning_rate": 0.0005325013115709705, + "loss": 3.5318, + "step": 19400 + }, + { + "epoch": 5.665637380563971, + "grad_norm": 0.30287185311317444, + "learning_rate": 0.0005323264354415622, + "loss": 3.5207, + "step": 19450 + }, + { + "epoch": 5.6802027499417385, + "grad_norm": 0.3400566577911377, + "learning_rate": 0.0005321515593121538, + "loss": 3.5213, + "step": 19500 + }, + { + "epoch": 5.694768119319506, + "grad_norm": 0.31341907382011414, + "learning_rate": 0.0005319766831827455, + "loss": 3.5249, + "step": 19550 + }, + { + "epoch": 5.709333488697274, + "grad_norm": 0.3233737647533417, + "learning_rate": 0.0005318018070533372, + "loss": 3.5391, + "step": 19600 + }, + { + "epoch": 
5.7238988580750405, + "grad_norm": 0.3121044635772705, + "learning_rate": 0.0005316269309239288, + "loss": 3.5232, + "step": 19650 + }, + { + "epoch": 5.738464227452808, + "grad_norm": 0.32169246673583984, + "learning_rate": 0.0005314520547945206, + "loss": 3.5159, + "step": 19700 + }, + { + "epoch": 5.753029596830576, + "grad_norm": 0.3152818977832794, + "learning_rate": 0.0005312771786651121, + "loss": 3.5194, + "step": 19750 + }, + { + "epoch": 5.7675949662083426, + "grad_norm": 0.31117457151412964, + "learning_rate": 0.0005311023025357038, + "loss": 3.5284, + "step": 19800 + }, + { + "epoch": 5.78216033558611, + "grad_norm": 0.3282223641872406, + "learning_rate": 0.0005309274264062955, + "loss": 3.526, + "step": 19850 + }, + { + "epoch": 5.796725704963878, + "grad_norm": 0.3373311161994934, + "learning_rate": 0.0005307525502768872, + "loss": 3.5291, + "step": 19900 + }, + { + "epoch": 5.8112910743416455, + "grad_norm": 0.3154096007347107, + "learning_rate": 0.0005305776741474788, + "loss": 3.53, + "step": 19950 + }, + { + "epoch": 5.825856443719413, + "grad_norm": 0.31639188528060913, + "learning_rate": 0.0005304027980180705, + "loss": 3.5162, + "step": 20000 + }, + { + "epoch": 5.825856443719413, + "eval_accuracy": 0.3622192215829698, + "eval_loss": 3.612488269805908, + "eval_runtime": 182.8981, + "eval_samples_per_second": 91.007, + "eval_steps_per_second": 5.692, + "step": 20000 + }, + { + "epoch": 5.84042181309718, + "grad_norm": 0.33335795998573303, + "learning_rate": 0.0005302279218886622, + "loss": 3.5267, + "step": 20050 + }, + { + "epoch": 5.8549871824749475, + "grad_norm": 0.32977914810180664, + "learning_rate": 0.0005300530457592538, + "loss": 3.5278, + "step": 20100 + }, + { + "epoch": 5.869552551852715, + "grad_norm": 0.319991797208786, + "learning_rate": 0.0005298781696298456, + "loss": 3.5243, + "step": 20150 + }, + { + "epoch": 5.884117921230482, + "grad_norm": 0.31696072220802307, + "learning_rate": 0.0005297032935004371, + "loss": 3.5255, + 
"step": 20200 + }, + { + "epoch": 5.89868329060825, + "grad_norm": 0.31803423166275024, + "learning_rate": 0.0005295284173710288, + "loss": 3.5163, + "step": 20250 + }, + { + "epoch": 5.913248659986017, + "grad_norm": 0.30268293619155884, + "learning_rate": 0.0005293535412416205, + "loss": 3.5247, + "step": 20300 + }, + { + "epoch": 5.927814029363785, + "grad_norm": 0.3222343325614929, + "learning_rate": 0.0005291786651122121, + "loss": 3.5226, + "step": 20350 + }, + { + "epoch": 5.9423793987415525, + "grad_norm": 0.3210581839084625, + "learning_rate": 0.0005290037889828038, + "loss": 3.5315, + "step": 20400 + }, + { + "epoch": 5.956944768119319, + "grad_norm": 0.30163270235061646, + "learning_rate": 0.0005288289128533955, + "loss": 3.531, + "step": 20450 + }, + { + "epoch": 5.971510137497087, + "grad_norm": 0.3174237906932831, + "learning_rate": 0.0005286540367239872, + "loss": 3.5308, + "step": 20500 + }, + { + "epoch": 5.986075506874855, + "grad_norm": 0.3105069398880005, + "learning_rate": 0.0005284791605945788, + "loss": 3.5229, + "step": 20550 + }, + { + "epoch": 6.0005826147751105, + "grad_norm": 0.35065528750419617, + "learning_rate": 0.0005283042844651704, + "loss": 3.5306, + "step": 20600 + }, + { + "epoch": 6.015147984152878, + "grad_norm": 0.3209701180458069, + "learning_rate": 0.0005281294083357621, + "loss": 3.4202, + "step": 20650 + }, + { + "epoch": 6.029713353530646, + "grad_norm": 0.35732001066207886, + "learning_rate": 0.0005279545322063538, + "loss": 3.4097, + "step": 20700 + }, + { + "epoch": 6.044278722908413, + "grad_norm": 0.3293222188949585, + "learning_rate": 0.0005277796560769455, + "loss": 3.4211, + "step": 20750 + }, + { + "epoch": 6.05884409228618, + "grad_norm": 0.3226439952850342, + "learning_rate": 0.0005276047799475371, + "loss": 3.4286, + "step": 20800 + }, + { + "epoch": 6.073409461663948, + "grad_norm": 0.33143454790115356, + "learning_rate": 0.0005274299038181288, + "loss": 3.4274, + "step": 20850 + }, + { + "epoch": 
6.087974831041715, + "grad_norm": 0.31629180908203125, + "learning_rate": 0.0005272550276887205, + "loss": 3.4214, + "step": 20900 + }, + { + "epoch": 6.102540200419483, + "grad_norm": 0.3248637020587921, + "learning_rate": 0.0005270801515593121, + "loss": 3.448, + "step": 20950 + }, + { + "epoch": 6.11710556979725, + "grad_norm": 0.333915114402771, + "learning_rate": 0.0005269052754299037, + "loss": 3.4304, + "step": 21000 + }, + { + "epoch": 6.11710556979725, + "eval_accuracy": 0.36297096205497675, + "eval_loss": 3.6168880462646484, + "eval_runtime": 183.1564, + "eval_samples_per_second": 90.879, + "eval_steps_per_second": 5.684, + "step": 21000 + }, + { + "epoch": 6.1316709391750175, + "grad_norm": 0.339853435754776, + "learning_rate": 0.0005267303993004954, + "loss": 3.432, + "step": 21050 + }, + { + "epoch": 6.146236308552785, + "grad_norm": 0.3413807153701782, + "learning_rate": 0.000526555523171087, + "loss": 3.4503, + "step": 21100 + }, + { + "epoch": 6.160801677930552, + "grad_norm": 0.3138124942779541, + "learning_rate": 0.0005263806470416788, + "loss": 3.4536, + "step": 21150 + }, + { + "epoch": 6.1753670473083195, + "grad_norm": 0.3247540593147278, + "learning_rate": 0.0005262057709122704, + "loss": 3.4454, + "step": 21200 + }, + { + "epoch": 6.189932416686087, + "grad_norm": 0.30432644486427307, + "learning_rate": 0.0005260308947828621, + "loss": 3.4577, + "step": 21250 + }, + { + "epoch": 6.204497786063855, + "grad_norm": 0.3186846077442169, + "learning_rate": 0.0005258560186534538, + "loss": 3.4504, + "step": 21300 + }, + { + "epoch": 6.219063155441622, + "grad_norm": 0.3390951454639435, + "learning_rate": 0.0005256811425240455, + "loss": 3.4427, + "step": 21350 + }, + { + "epoch": 6.233628524819389, + "grad_norm": 0.3416147828102112, + "learning_rate": 0.0005255062663946371, + "loss": 3.4451, + "step": 21400 + }, + { + "epoch": 6.248193894197157, + "grad_norm": 0.330108106136322, + "learning_rate": 0.0005253313902652287, + "loss": 3.4634, + "step": 
21450 + }, + { + "epoch": 6.2627592635749245, + "grad_norm": 0.32614409923553467, + "learning_rate": 0.0005251565141358204, + "loss": 3.4689, + "step": 21500 + }, + { + "epoch": 6.277324632952691, + "grad_norm": 0.3183272182941437, + "learning_rate": 0.000524981638006412, + "loss": 3.4614, + "step": 21550 + }, + { + "epoch": 6.291890002330459, + "grad_norm": 0.3260941803455353, + "learning_rate": 0.0005248067618770038, + "loss": 3.4687, + "step": 21600 + }, + { + "epoch": 6.306455371708227, + "grad_norm": 0.34689396619796753, + "learning_rate": 0.0005246318857475954, + "loss": 3.4607, + "step": 21650 + }, + { + "epoch": 6.321020741085994, + "grad_norm": 0.3259231150150299, + "learning_rate": 0.0005244570096181871, + "loss": 3.4606, + "step": 21700 + }, + { + "epoch": 6.335586110463761, + "grad_norm": 0.33066433668136597, + "learning_rate": 0.0005242821334887788, + "loss": 3.4666, + "step": 21750 + }, + { + "epoch": 6.350151479841529, + "grad_norm": 0.3282213807106018, + "learning_rate": 0.0005241072573593704, + "loss": 3.4566, + "step": 21800 + }, + { + "epoch": 6.364716849219296, + "grad_norm": 0.32075440883636475, + "learning_rate": 0.000523932381229962, + "loss": 3.4641, + "step": 21850 + }, + { + "epoch": 6.379282218597064, + "grad_norm": 0.3442796468734741, + "learning_rate": 0.0005237575051005537, + "loss": 3.4588, + "step": 21900 + }, + { + "epoch": 6.393847587974831, + "grad_norm": 0.3072233498096466, + "learning_rate": 0.0005235826289711454, + "loss": 3.462, + "step": 21950 + }, + { + "epoch": 6.408412957352598, + "grad_norm": 0.35822081565856934, + "learning_rate": 0.000523407752841737, + "loss": 3.4737, + "step": 22000 + }, + { + "epoch": 6.408412957352598, + "eval_accuracy": 0.3631809414861629, + "eval_loss": 3.6075851917266846, + "eval_runtime": 183.2532, + "eval_samples_per_second": 90.831, + "eval_steps_per_second": 5.681, + "step": 22000 + }, + { + "epoch": 6.422978326730366, + "grad_norm": 0.3440370559692383, + "learning_rate": 
0.0005232328767123287, + "loss": 3.4619, + "step": 22050 + }, + { + "epoch": 6.437543696108134, + "grad_norm": 0.3295625150203705, + "learning_rate": 0.0005230580005829204, + "loss": 3.4756, + "step": 22100 + }, + { + "epoch": 6.4521090654859, + "grad_norm": 0.32679641246795654, + "learning_rate": 0.0005228831244535121, + "loss": 3.4795, + "step": 22150 + }, + { + "epoch": 6.466674434863668, + "grad_norm": 0.3422777056694031, + "learning_rate": 0.0005227082483241038, + "loss": 3.4777, + "step": 22200 + }, + { + "epoch": 6.481239804241436, + "grad_norm": 0.3188982903957367, + "learning_rate": 0.0005225333721946954, + "loss": 3.4789, + "step": 22250 + }, + { + "epoch": 6.495805173619203, + "grad_norm": 0.33180058002471924, + "learning_rate": 0.000522358496065287, + "loss": 3.4764, + "step": 22300 + }, + { + "epoch": 6.51037054299697, + "grad_norm": 0.35259467363357544, + "learning_rate": 0.0005221836199358787, + "loss": 3.4783, + "step": 22350 + }, + { + "epoch": 6.524935912374738, + "grad_norm": 0.31127235293388367, + "learning_rate": 0.0005220087438064703, + "loss": 3.4812, + "step": 22400 + }, + { + "epoch": 6.539501281752505, + "grad_norm": 0.3426322340965271, + "learning_rate": 0.000521833867677062, + "loss": 3.4858, + "step": 22450 + }, + { + "epoch": 6.554066651130273, + "grad_norm": 0.33666297793388367, + "learning_rate": 0.0005216589915476537, + "loss": 3.4842, + "step": 22500 + }, + { + "epoch": 6.56863202050804, + "grad_norm": 0.33777403831481934, + "learning_rate": 0.0005214841154182454, + "loss": 3.4867, + "step": 22550 + }, + { + "epoch": 6.583197389885807, + "grad_norm": 0.33145034313201904, + "learning_rate": 0.0005213092392888371, + "loss": 3.4843, + "step": 22600 + }, + { + "epoch": 6.597762759263575, + "grad_norm": 0.3483746647834778, + "learning_rate": 0.0005211343631594287, + "loss": 3.4842, + "step": 22650 + }, + { + "epoch": 6.612328128641343, + "grad_norm": 0.34439757466316223, + "learning_rate": 0.0005209594870300204, + "loss": 3.4747, + 
"step": 22700 + }, + { + "epoch": 6.626893498019109, + "grad_norm": 0.330509752035141, + "learning_rate": 0.000520784610900612, + "loss": 3.4757, + "step": 22750 + }, + { + "epoch": 6.641458867396877, + "grad_norm": 0.3516590893268585, + "learning_rate": 0.0005206097347712037, + "loss": 3.4689, + "step": 22800 + }, + { + "epoch": 6.656024236774645, + "grad_norm": 0.35408666729927063, + "learning_rate": 0.0005204348586417953, + "loss": 3.4825, + "step": 22850 + }, + { + "epoch": 6.670589606152412, + "grad_norm": 0.35224005579948425, + "learning_rate": 0.000520259982512387, + "loss": 3.4933, + "step": 22900 + }, + { + "epoch": 6.685154975530179, + "grad_norm": 0.33899161219596863, + "learning_rate": 0.0005200851063829787, + "loss": 3.4857, + "step": 22950 + }, + { + "epoch": 6.699720344907947, + "grad_norm": 0.3293270170688629, + "learning_rate": 0.0005199102302535703, + "loss": 3.4775, + "step": 23000 + }, + { + "epoch": 6.699720344907947, + "eval_accuracy": 0.3640601274807935, + "eval_loss": 3.599479913711548, + "eval_runtime": 183.341, + "eval_samples_per_second": 90.787, + "eval_steps_per_second": 5.678, + "step": 23000 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.32254496216773987, + "learning_rate": 0.0005197353541241621, + "loss": 3.4873, + "step": 23050 + }, + { + "epoch": 6.728851083663482, + "grad_norm": 0.32425615191459656, + "learning_rate": 0.0005195604779947537, + "loss": 3.4951, + "step": 23100 + }, + { + "epoch": 6.743416453041249, + "grad_norm": 0.3163357675075531, + "learning_rate": 0.0005193856018653454, + "loss": 3.4845, + "step": 23150 + }, + { + "epoch": 6.7579818224190165, + "grad_norm": 0.3210630416870117, + "learning_rate": 0.000519210725735937, + "loss": 3.4869, + "step": 23200 + }, + { + "epoch": 6.772547191796784, + "grad_norm": 0.33914175629615784, + "learning_rate": 0.0005190358496065286, + "loss": 3.4848, + "step": 23250 + }, + { + "epoch": 6.787112561174552, + "grad_norm": 0.3026992082595825, + "learning_rate": 
0.0005188609734771203, + "loss": 3.4891, + "step": 23300 + }, + { + "epoch": 6.8016779305523185, + "grad_norm": 0.3406318724155426, + "learning_rate": 0.000518686097347712, + "loss": 3.4859, + "step": 23350 + }, + { + "epoch": 6.816243299930086, + "grad_norm": 0.3310200273990631, + "learning_rate": 0.0005185112212183037, + "loss": 3.5022, + "step": 23400 + }, + { + "epoch": 6.830808669307854, + "grad_norm": 0.30199411511421204, + "learning_rate": 0.0005183363450888953, + "loss": 3.4872, + "step": 23450 + }, + { + "epoch": 6.845374038685621, + "grad_norm": 0.3239602744579315, + "learning_rate": 0.000518161468959487, + "loss": 3.4867, + "step": 23500 + }, + { + "epoch": 6.859939408063388, + "grad_norm": 0.32365167140960693, + "learning_rate": 0.0005179865928300787, + "loss": 3.4923, + "step": 23550 + }, + { + "epoch": 6.874504777441156, + "grad_norm": 0.31722819805145264, + "learning_rate": 0.0005178117167006703, + "loss": 3.4779, + "step": 23600 + }, + { + "epoch": 6.8890701468189235, + "grad_norm": 0.3375761806964874, + "learning_rate": 0.000517636840571262, + "loss": 3.4864, + "step": 23650 + }, + { + "epoch": 6.903635516196691, + "grad_norm": 0.30887332558631897, + "learning_rate": 0.0005174619644418536, + "loss": 3.4974, + "step": 23700 + }, + { + "epoch": 6.918200885574458, + "grad_norm": 0.34449508786201477, + "learning_rate": 0.0005172870883124453, + "loss": 3.4916, + "step": 23750 + }, + { + "epoch": 6.9327662549522255, + "grad_norm": 0.3328838348388672, + "learning_rate": 0.000517112212183037, + "loss": 3.4975, + "step": 23800 + }, + { + "epoch": 6.947331624329993, + "grad_norm": 0.33597683906555176, + "learning_rate": 0.0005169373360536286, + "loss": 3.4857, + "step": 23850 + }, + { + "epoch": 6.961896993707761, + "grad_norm": 0.3501330316066742, + "learning_rate": 0.0005167624599242203, + "loss": 3.4972, + "step": 23900 + }, + { + "epoch": 6.976462363085528, + "grad_norm": 0.3559366762638092, + "learning_rate": 0.000516587583794812, + "loss": 3.4917, + 
"step": 23950 + }, + { + "epoch": 6.991027732463295, + "grad_norm": 0.3154732286930084, + "learning_rate": 0.0005164127076654037, + "loss": 3.49, + "step": 24000 + }, + { + "epoch": 6.991027732463295, + "eval_accuracy": 0.36516716349649847, + "eval_loss": 3.586782932281494, + "eval_runtime": 182.9453, + "eval_samples_per_second": 90.983, + "eval_steps_per_second": 5.69, + "step": 24000 + }, + { + "epoch": 7.005534840363552, + "grad_norm": 0.3098287880420685, + "learning_rate": 0.0005162378315359953, + "loss": 3.4496, + "step": 24050 + }, + { + "epoch": 7.020100209741319, + "grad_norm": 0.34997889399528503, + "learning_rate": 0.0005160629554065869, + "loss": 3.3794, + "step": 24100 + }, + { + "epoch": 7.034665579119086, + "grad_norm": 0.33267155289649963, + "learning_rate": 0.0005158880792771786, + "loss": 3.3792, + "step": 24150 + }, + { + "epoch": 7.049230948496854, + "grad_norm": 0.3368155360221863, + "learning_rate": 0.0005157132031477703, + "loss": 3.3876, + "step": 24200 + }, + { + "epoch": 7.063796317874622, + "grad_norm": 0.33117562532424927, + "learning_rate": 0.000515538327018362, + "loss": 3.4, + "step": 24250 + }, + { + "epoch": 7.0783616872523885, + "grad_norm": 0.3448326587677002, + "learning_rate": 0.0005153634508889536, + "loss": 3.4042, + "step": 24300 + }, + { + "epoch": 7.092927056630156, + "grad_norm": 0.3393695652484894, + "learning_rate": 0.0005151885747595453, + "loss": 3.4025, + "step": 24350 + }, + { + "epoch": 7.107492426007924, + "grad_norm": 0.32103875279426575, + "learning_rate": 0.000515013698630137, + "loss": 3.3923, + "step": 24400 + }, + { + "epoch": 7.122057795385691, + "grad_norm": 0.32782039046287537, + "learning_rate": 0.0005148388225007285, + "loss": 3.4004, + "step": 24450 + }, + { + "epoch": 7.136623164763458, + "grad_norm": 0.321836918592453, + "learning_rate": 0.0005146639463713203, + "loss": 3.4083, + "step": 24500 + }, + { + "epoch": 7.151188534141226, + "grad_norm": 0.3271026015281677, + "learning_rate": 
0.0005144890702419119, + "loss": 3.4261, + "step": 24550 + }, + { + "epoch": 7.165753903518993, + "grad_norm": 0.32715457677841187, + "learning_rate": 0.0005143141941125036, + "loss": 3.4129, + "step": 24600 + }, + { + "epoch": 7.180319272896761, + "grad_norm": 0.3357333838939667, + "learning_rate": 0.0005141393179830953, + "loss": 3.4146, + "step": 24650 + }, + { + "epoch": 7.194884642274528, + "grad_norm": 0.3631840646266937, + "learning_rate": 0.0005139644418536869, + "loss": 3.4181, + "step": 24700 + }, + { + "epoch": 7.2094500116522955, + "grad_norm": 0.3274785578250885, + "learning_rate": 0.0005137895657242786, + "loss": 3.4125, + "step": 24750 + }, + { + "epoch": 7.224015381030063, + "grad_norm": 0.3449844419956207, + "learning_rate": 0.0005136146895948703, + "loss": 3.4166, + "step": 24800 + }, + { + "epoch": 7.238580750407831, + "grad_norm": 0.32263556122779846, + "learning_rate": 0.000513439813465462, + "loss": 3.4285, + "step": 24850 + }, + { + "epoch": 7.2531461197855975, + "grad_norm": 0.3413465619087219, + "learning_rate": 0.0005132649373360535, + "loss": 3.4285, + "step": 24900 + }, + { + "epoch": 7.267711489163365, + "grad_norm": 0.3473232388496399, + "learning_rate": 0.0005130900612066452, + "loss": 3.4347, + "step": 24950 + }, + { + "epoch": 7.282276858541133, + "grad_norm": 0.32374686002731323, + "learning_rate": 0.0005129151850772369, + "loss": 3.4352, + "step": 25000 + }, + { + "epoch": 7.282276858541133, + "eval_accuracy": 0.3647126391510529, + "eval_loss": 3.598836898803711, + "eval_runtime": 183.3183, + "eval_samples_per_second": 90.798, + "eval_steps_per_second": 5.679, + "step": 25000 + }, + { + "epoch": 7.2968422279189, + "grad_norm": 0.3352644443511963, + "learning_rate": 0.0005127403089478286, + "loss": 3.4219, + "step": 25050 + }, + { + "epoch": 7.311407597296667, + "grad_norm": 0.36998191475868225, + "learning_rate": 0.0005125654328184203, + "loss": 3.4354, + "step": 25100 + }, + { + "epoch": 7.325972966674435, + "grad_norm": 
0.33111071586608887, + "learning_rate": 0.0005123905566890119, + "loss": 3.4273, + "step": 25150 + }, + { + "epoch": 7.3405383360522025, + "grad_norm": 0.3304993212223053, + "learning_rate": 0.0005122156805596036, + "loss": 3.4285, + "step": 25200 + }, + { + "epoch": 7.35510370542997, + "grad_norm": 0.34959882497787476, + "learning_rate": 0.0005120408044301953, + "loss": 3.4227, + "step": 25250 + }, + { + "epoch": 7.369669074807737, + "grad_norm": 0.36705702543258667, + "learning_rate": 0.0005118659283007868, + "loss": 3.4247, + "step": 25300 + }, + { + "epoch": 7.384234444185505, + "grad_norm": 0.3186073899269104, + "learning_rate": 0.0005116910521713785, + "loss": 3.4439, + "step": 25350 + }, + { + "epoch": 7.398799813563272, + "grad_norm": 0.3507457971572876, + "learning_rate": 0.0005115161760419702, + "loss": 3.4373, + "step": 25400 + }, + { + "epoch": 7.413365182941039, + "grad_norm": 0.3345440626144409, + "learning_rate": 0.0005113412999125619, + "loss": 3.4429, + "step": 25450 + }, + { + "epoch": 7.427930552318807, + "grad_norm": 0.3337697684764862, + "learning_rate": 0.0005111664237831536, + "loss": 3.4366, + "step": 25500 + }, + { + "epoch": 7.442495921696574, + "grad_norm": 0.3243204653263092, + "learning_rate": 0.0005109915476537452, + "loss": 3.451, + "step": 25550 + }, + { + "epoch": 7.457061291074342, + "grad_norm": 0.34631916880607605, + "learning_rate": 0.0005108166715243369, + "loss": 3.4431, + "step": 25600 + }, + { + "epoch": 7.471626660452109, + "grad_norm": 0.31301483511924744, + "learning_rate": 0.0005106417953949286, + "loss": 3.436, + "step": 25650 + }, + { + "epoch": 7.486192029829876, + "grad_norm": 0.3258545994758606, + "learning_rate": 0.0005104669192655203, + "loss": 3.4431, + "step": 25700 + }, + { + "epoch": 7.500757399207644, + "grad_norm": 0.3321167230606079, + "learning_rate": 0.0005102920431361118, + "loss": 3.4506, + "step": 25750 + }, + { + "epoch": 7.515322768585412, + "grad_norm": 0.3389933407306671, + "learning_rate": 
0.0005101171670067035, + "loss": 3.4348, + "step": 25800 + }, + { + "epoch": 7.529888137963178, + "grad_norm": 0.34453973174095154, + "learning_rate": 0.0005099422908772952, + "loss": 3.43, + "step": 25850 + }, + { + "epoch": 7.544453507340946, + "grad_norm": 0.3527055084705353, + "learning_rate": 0.0005097674147478868, + "loss": 3.4518, + "step": 25900 + }, + { + "epoch": 7.559018876718714, + "grad_norm": 0.33843597769737244, + "learning_rate": 0.0005095925386184786, + "loss": 3.4435, + "step": 25950 + }, + { + "epoch": 7.573584246096481, + "grad_norm": 0.3532460927963257, + "learning_rate": 0.0005094176624890702, + "loss": 3.4508, + "step": 26000 + }, + { + "epoch": 7.573584246096481, + "eval_accuracy": 0.36540406638218725, + "eval_loss": 3.589508056640625, + "eval_runtime": 183.429, + "eval_samples_per_second": 90.744, + "eval_steps_per_second": 5.675, + "step": 26000 + }, + { + "epoch": 7.588149615474248, + "grad_norm": 0.33581236004829407, + "learning_rate": 0.0005092427863596619, + "loss": 3.4393, + "step": 26050 + }, + { + "epoch": 7.602714984852016, + "grad_norm": 0.31811484694480896, + "learning_rate": 0.0005090679102302536, + "loss": 3.4466, + "step": 26100 + }, + { + "epoch": 7.617280354229783, + "grad_norm": 0.34038013219833374, + "learning_rate": 0.0005088930341008451, + "loss": 3.4549, + "step": 26150 + }, + { + "epoch": 7.631845723607551, + "grad_norm": 0.314603716135025, + "learning_rate": 0.0005087181579714368, + "loss": 3.4585, + "step": 26200 + }, + { + "epoch": 7.646411092985318, + "grad_norm": 0.3383300006389618, + "learning_rate": 0.0005085432818420285, + "loss": 3.4489, + "step": 26250 + }, + { + "epoch": 7.660976462363085, + "grad_norm": 0.32058727741241455, + "learning_rate": 0.0005083684057126202, + "loss": 3.4562, + "step": 26300 + }, + { + "epoch": 7.675541831740853, + "grad_norm": 0.3361349105834961, + "learning_rate": 0.0005081935295832118, + "loss": 3.4513, + "step": 26350 + }, + { + "epoch": 7.690107201118621, + "grad_norm": 
0.3190141022205353, + "learning_rate": 0.0005080186534538035, + "loss": 3.4377, + "step": 26400 + }, + { + "epoch": 7.704672570496387, + "grad_norm": 0.32522907853126526, + "learning_rate": 0.0005078437773243952, + "loss": 3.4548, + "step": 26450 + }, + { + "epoch": 7.719237939874155, + "grad_norm": 0.3320629596710205, + "learning_rate": 0.0005076689011949869, + "loss": 3.4482, + "step": 26500 + }, + { + "epoch": 7.733803309251923, + "grad_norm": 0.3172106146812439, + "learning_rate": 0.0005074940250655786, + "loss": 3.4546, + "step": 26550 + }, + { + "epoch": 7.74836867862969, + "grad_norm": 0.339695006608963, + "learning_rate": 0.0005073191489361701, + "loss": 3.456, + "step": 26600 + }, + { + "epoch": 7.762934048007457, + "grad_norm": 0.3172706365585327, + "learning_rate": 0.0005071442728067618, + "loss": 3.4525, + "step": 26650 + }, + { + "epoch": 7.777499417385225, + "grad_norm": 0.3277393877506256, + "learning_rate": 0.0005069693966773535, + "loss": 3.4459, + "step": 26700 + }, + { + "epoch": 7.792064786762992, + "grad_norm": 0.3323872685432434, + "learning_rate": 0.0005067945205479451, + "loss": 3.4475, + "step": 26750 + }, + { + "epoch": 7.80663015614076, + "grad_norm": 0.339382529258728, + "learning_rate": 0.0005066196444185368, + "loss": 3.4398, + "step": 26800 + }, + { + "epoch": 7.821195525518527, + "grad_norm": 0.33565738797187805, + "learning_rate": 0.0005064447682891285, + "loss": 3.4522, + "step": 26850 + }, + { + "epoch": 7.8357608948962945, + "grad_norm": 0.33060815930366516, + "learning_rate": 0.0005062698921597202, + "loss": 3.4555, + "step": 26900 + }, + { + "epoch": 7.850326264274062, + "grad_norm": 0.3253958225250244, + "learning_rate": 0.0005060950160303119, + "loss": 3.4451, + "step": 26950 + }, + { + "epoch": 7.86489163365183, + "grad_norm": 0.3636493384838104, + "learning_rate": 0.0005059201399009035, + "loss": 3.4523, + "step": 27000 + }, + { + "epoch": 7.86489163365183, + "eval_accuracy": 0.36607174453991753, + "eval_loss": 
3.5809550285339355, + "eval_runtime": 183.0826, + "eval_samples_per_second": 90.915, + "eval_steps_per_second": 5.686, + "step": 27000 + }, + { + "epoch": 7.8794570030295965, + "grad_norm": 0.3568457365036011, + "learning_rate": 0.0005057452637714951, + "loss": 3.4527, + "step": 27050 + }, + { + "epoch": 7.894022372407364, + "grad_norm": 0.3288189172744751, + "learning_rate": 0.0005055703876420868, + "loss": 3.4552, + "step": 27100 + }, + { + "epoch": 7.908587741785132, + "grad_norm": 0.29840394854545593, + "learning_rate": 0.0005053955115126785, + "loss": 3.4563, + "step": 27150 + }, + { + "epoch": 7.923153111162899, + "grad_norm": 0.37738728523254395, + "learning_rate": 0.0005052206353832701, + "loss": 3.4605, + "step": 27200 + }, + { + "epoch": 7.937718480540666, + "grad_norm": 0.3255780339241028, + "learning_rate": 0.0005050457592538618, + "loss": 3.4607, + "step": 27250 + }, + { + "epoch": 7.952283849918434, + "grad_norm": 0.3199935853481293, + "learning_rate": 0.0005048708831244535, + "loss": 3.4621, + "step": 27300 + }, + { + "epoch": 7.9668492192962015, + "grad_norm": 0.34419822692871094, + "learning_rate": 0.0005046960069950451, + "loss": 3.4547, + "step": 27350 + }, + { + "epoch": 7.981414588673969, + "grad_norm": 0.31108030676841736, + "learning_rate": 0.0005045211308656369, + "loss": 3.4622, + "step": 27400 + }, + { + "epoch": 7.995979958051736, + "grad_norm": 0.3241368532180786, + "learning_rate": 0.0005043462547362284, + "loss": 3.4599, + "step": 27450 + }, + { + "epoch": 8.010487065951992, + "grad_norm": 0.34238871932029724, + "learning_rate": 0.0005041713786068201, + "loss": 3.3724, + "step": 27500 + }, + { + "epoch": 8.02505243532976, + "grad_norm": 0.3241780400276184, + "learning_rate": 0.0005039965024774118, + "loss": 3.3516, + "step": 27550 + }, + { + "epoch": 8.039617804707527, + "grad_norm": 0.33774638175964355, + "learning_rate": 0.0005038216263480034, + "loss": 3.3319, + "step": 27600 + }, + { + "epoch": 8.054183174085296, + "grad_norm": 
0.33839425444602966, + "learning_rate": 0.0005036467502185951, + "loss": 3.3643, + "step": 27650 + }, + { + "epoch": 8.068748543463062, + "grad_norm": 0.33138301968574524, + "learning_rate": 0.0005034718740891868, + "loss": 3.3576, + "step": 27700 + }, + { + "epoch": 8.08331391284083, + "grad_norm": 0.331544429063797, + "learning_rate": 0.0005032969979597785, + "loss": 3.3717, + "step": 27750 + }, + { + "epoch": 8.097879282218598, + "grad_norm": 0.3336641192436218, + "learning_rate": 0.0005031221218303701, + "loss": 3.3637, + "step": 27800 + }, + { + "epoch": 8.112444651596364, + "grad_norm": 0.3461131453514099, + "learning_rate": 0.0005029472457009618, + "loss": 3.367, + "step": 27850 + }, + { + "epoch": 8.127010020974131, + "grad_norm": 0.3179863691329956, + "learning_rate": 0.0005027723695715534, + "loss": 3.3618, + "step": 27900 + }, + { + "epoch": 8.1415753903519, + "grad_norm": 0.3158666491508484, + "learning_rate": 0.0005025974934421451, + "loss": 3.3798, + "step": 27950 + }, + { + "epoch": 8.156140759729666, + "grad_norm": 0.34379681944847107, + "learning_rate": 0.0005024226173127368, + "loss": 3.3762, + "step": 28000 + }, + { + "epoch": 8.156140759729666, + "eval_accuracy": 0.3662919525324213, + "eval_loss": 3.588055372238159, + "eval_runtime": 182.9945, + "eval_samples_per_second": 90.959, + "eval_steps_per_second": 5.689, + "step": 28000 + }, + { + "epoch": 8.170706129107435, + "grad_norm": 0.3444662392139435, + "learning_rate": 0.0005022477411833284, + "loss": 3.3854, + "step": 28050 + }, + { + "epoch": 8.185271498485202, + "grad_norm": 0.3341796398162842, + "learning_rate": 0.0005020728650539201, + "loss": 3.3786, + "step": 28100 + }, + { + "epoch": 8.199836867862969, + "grad_norm": 0.3741398751735687, + "learning_rate": 0.0005018979889245118, + "loss": 3.3783, + "step": 28150 + }, + { + "epoch": 8.214402237240737, + "grad_norm": 0.329504132270813, + "learning_rate": 0.0005017231127951034, + "loss": 3.3892, + "step": 28200 + }, + { + "epoch": 
8.228967606618504, + "grad_norm": 0.37047260999679565, + "learning_rate": 0.0005015482366656951, + "loss": 3.3807, + "step": 28250 + }, + { + "epoch": 8.24353297599627, + "grad_norm": 0.35633140802383423, + "learning_rate": 0.0005013733605362868, + "loss": 3.392, + "step": 28300 + }, + { + "epoch": 8.258098345374039, + "grad_norm": 0.39560022950172424, + "learning_rate": 0.0005011984844068784, + "loss": 3.3931, + "step": 28350 + }, + { + "epoch": 8.272663714751806, + "grad_norm": 0.3520383834838867, + "learning_rate": 0.0005010236082774701, + "loss": 3.404, + "step": 28400 + }, + { + "epoch": 8.287229084129574, + "grad_norm": 0.3606109023094177, + "learning_rate": 0.0005008487321480617, + "loss": 3.3849, + "step": 28450 + }, + { + "epoch": 8.301794453507341, + "grad_norm": 0.35454607009887695, + "learning_rate": 0.0005006738560186534, + "loss": 3.3959, + "step": 28500 + }, + { + "epoch": 8.316359822885108, + "grad_norm": 0.3578483462333679, + "learning_rate": 0.0005004989798892451, + "loss": 3.4044, + "step": 28550 + }, + { + "epoch": 8.330925192262876, + "grad_norm": 0.36204418540000916, + "learning_rate": 0.0005003241037598368, + "loss": 3.3994, + "step": 28600 + }, + { + "epoch": 8.345490561640643, + "grad_norm": 0.35746678709983826, + "learning_rate": 0.0005001492276304284, + "loss": 3.3966, + "step": 28650 + }, + { + "epoch": 8.36005593101841, + "grad_norm": 0.32434630393981934, + "learning_rate": 0.0004999743515010201, + "loss": 3.408, + "step": 28700 + }, + { + "epoch": 8.374621300396178, + "grad_norm": 0.3382759094238281, + "learning_rate": 0.0004997994753716117, + "loss": 3.3965, + "step": 28750 + }, + { + "epoch": 8.389186669773945, + "grad_norm": 0.3254185616970062, + "learning_rate": 0.0004996245992422033, + "loss": 3.4115, + "step": 28800 + }, + { + "epoch": 8.403752039151712, + "grad_norm": 0.3328564763069153, + "learning_rate": 0.0004994497231127951, + "loss": 3.3962, + "step": 28850 + }, + { + "epoch": 8.41831740852948, + "grad_norm": 
0.3341105580329895, + "learning_rate": 0.0004992748469833867, + "loss": 3.4067, + "step": 28900 + }, + { + "epoch": 8.432882777907247, + "grad_norm": 0.3595927655696869, + "learning_rate": 0.0004990999708539784, + "loss": 3.4093, + "step": 28950 + }, + { + "epoch": 8.447448147285016, + "grad_norm": 0.32767578959465027, + "learning_rate": 0.0004989250947245701, + "loss": 3.4033, + "step": 29000 + }, + { + "epoch": 8.447448147285016, + "eval_accuracy": 0.3668947322321366, + "eval_loss": 3.5828919410705566, + "eval_runtime": 183.1656, + "eval_samples_per_second": 90.874, + "eval_steps_per_second": 5.683, + "step": 29000 + }, + { + "epoch": 8.462013516662783, + "grad_norm": 0.335256963968277, + "learning_rate": 0.0004987502185951617, + "loss": 3.4103, + "step": 29050 + }, + { + "epoch": 8.47657888604055, + "grad_norm": 0.328800767660141, + "learning_rate": 0.0004985753424657534, + "loss": 3.4104, + "step": 29100 + }, + { + "epoch": 8.491144255418318, + "grad_norm": 0.3295251429080963, + "learning_rate": 0.000498400466336345, + "loss": 3.4066, + "step": 29150 + }, + { + "epoch": 8.505709624796085, + "grad_norm": 0.3609812259674072, + "learning_rate": 0.0004982255902069367, + "loss": 3.4191, + "step": 29200 + }, + { + "epoch": 8.520274994173853, + "grad_norm": 0.353671669960022, + "learning_rate": 0.0004980507140775283, + "loss": 3.4129, + "step": 29250 + }, + { + "epoch": 8.53484036355162, + "grad_norm": 0.32933372259140015, + "learning_rate": 0.0004978758379481201, + "loss": 3.423, + "step": 29300 + }, + { + "epoch": 8.549405732929387, + "grad_norm": 0.32520967721939087, + "learning_rate": 0.0004977009618187117, + "loss": 3.4124, + "step": 29350 + }, + { + "epoch": 8.563971102307155, + "grad_norm": 0.3547916114330292, + "learning_rate": 0.0004975260856893034, + "loss": 3.4198, + "step": 29400 + }, + { + "epoch": 8.578536471684922, + "grad_norm": 0.34057968854904175, + "learning_rate": 0.0004973512095598951, + "loss": 3.4191, + "step": 29450 + }, + { + "epoch": 
8.593101841062689, + "grad_norm": 0.33685600757598877, + "learning_rate": 0.0004971763334304867, + "loss": 3.4209, + "step": 29500 + }, + { + "epoch": 8.607667210440457, + "grad_norm": 0.34290191531181335, + "learning_rate": 0.0004970014573010784, + "loss": 3.4257, + "step": 29550 + }, + { + "epoch": 8.622232579818224, + "grad_norm": 0.311635822057724, + "learning_rate": 0.00049682658117167, + "loss": 3.4108, + "step": 29600 + }, + { + "epoch": 8.63679794919599, + "grad_norm": 0.3206999599933624, + "learning_rate": 0.0004966517050422616, + "loss": 3.4277, + "step": 29650 + }, + { + "epoch": 8.65136331857376, + "grad_norm": 0.378736674785614, + "learning_rate": 0.0004964768289128533, + "loss": 3.4192, + "step": 29700 + }, + { + "epoch": 8.665928687951526, + "grad_norm": 0.35404956340789795, + "learning_rate": 0.000496301952783445, + "loss": 3.4304, + "step": 29750 + }, + { + "epoch": 8.680494057329295, + "grad_norm": 0.33619192242622375, + "learning_rate": 0.0004961270766540367, + "loss": 3.4157, + "step": 29800 + }, + { + "epoch": 8.695059426707061, + "grad_norm": 0.3306516110897064, + "learning_rate": 0.0004959522005246284, + "loss": 3.4257, + "step": 29850 + }, + { + "epoch": 8.709624796084828, + "grad_norm": 0.314596563577652, + "learning_rate": 0.00049577732439522, + "loss": 3.4326, + "step": 29900 + }, + { + "epoch": 8.724190165462597, + "grad_norm": 0.3481888771057129, + "learning_rate": 0.0004956024482658117, + "loss": 3.4345, + "step": 29950 + }, + { + "epoch": 8.738755534840363, + "grad_norm": 0.33885663747787476, + "learning_rate": 0.0004954275721364034, + "loss": 3.4225, + "step": 30000 + }, + { + "epoch": 8.738755534840363, + "eval_accuracy": 0.3673473754628571, + "eval_loss": 3.574705123901367, + "eval_runtime": 183.0508, + "eval_samples_per_second": 90.931, + "eval_steps_per_second": 5.687, + "step": 30000 + }, + { + "epoch": 8.753320904218132, + "grad_norm": 0.37235426902770996, + "learning_rate": 0.000495252696006995, + "loss": 3.4281, + "step": 
30050 + }, + { + "epoch": 8.767886273595899, + "grad_norm": 0.3520287573337555, + "learning_rate": 0.0004950778198775866, + "loss": 3.4421, + "step": 30100 + }, + { + "epoch": 8.782451642973665, + "grad_norm": 0.33238449692726135, + "learning_rate": 0.0004949029437481783, + "loss": 3.4196, + "step": 30150 + }, + { + "epoch": 8.797017012351434, + "grad_norm": 0.3224245607852936, + "learning_rate": 0.00049472806761877, + "loss": 3.4336, + "step": 30200 + }, + { + "epoch": 8.8115823817292, + "grad_norm": 0.32464221119880676, + "learning_rate": 0.0004945531914893616, + "loss": 3.4363, + "step": 30250 + }, + { + "epoch": 8.826147751106967, + "grad_norm": 0.3441407382488251, + "learning_rate": 0.0004943783153599534, + "loss": 3.4194, + "step": 30300 + }, + { + "epoch": 8.840713120484736, + "grad_norm": 0.33133164048194885, + "learning_rate": 0.000494203439230545, + "loss": 3.4289, + "step": 30350 + }, + { + "epoch": 8.855278489862503, + "grad_norm": 0.38068103790283203, + "learning_rate": 0.0004940285631011367, + "loss": 3.4221, + "step": 30400 + }, + { + "epoch": 8.86984385924027, + "grad_norm": 0.3354259133338928, + "learning_rate": 0.0004938536869717284, + "loss": 3.4192, + "step": 30450 + }, + { + "epoch": 8.884409228618038, + "grad_norm": 0.3407922685146332, + "learning_rate": 0.0004936788108423199, + "loss": 3.4377, + "step": 30500 + }, + { + "epoch": 8.898974597995805, + "grad_norm": 0.31988245248794556, + "learning_rate": 0.0004935039347129116, + "loss": 3.4418, + "step": 30550 + }, + { + "epoch": 8.913539967373573, + "grad_norm": 0.33597180247306824, + "learning_rate": 0.0004933290585835033, + "loss": 3.4441, + "step": 30600 + }, + { + "epoch": 8.92810533675134, + "grad_norm": 0.32772478461265564, + "learning_rate": 0.000493154182454095, + "loss": 3.4356, + "step": 30650 + }, + { + "epoch": 8.942670706129107, + "grad_norm": 0.3514624536037445, + "learning_rate": 0.0004929793063246866, + "loss": 3.4297, + "step": 30700 + }, + { + "epoch": 8.957236075506875, + 
"grad_norm": 0.38539716601371765, + "learning_rate": 0.0004928044301952783, + "loss": 3.4521, + "step": 30750 + }, + { + "epoch": 8.971801444884642, + "grad_norm": 0.33038049936294556, + "learning_rate": 0.00049262955406587, + "loss": 3.4149, + "step": 30800 + }, + { + "epoch": 8.986366814262409, + "grad_norm": 0.3251229226589203, + "learning_rate": 0.0004924546779364617, + "loss": 3.4331, + "step": 30850 + }, + { + "epoch": 9.000873922162667, + "grad_norm": 0.3207502067089081, + "learning_rate": 0.0004922798018070533, + "loss": 3.4394, + "step": 30900 + }, + { + "epoch": 9.015439291540433, + "grad_norm": 0.3693355321884155, + "learning_rate": 0.0004921049256776449, + "loss": 3.3291, + "step": 30950 + }, + { + "epoch": 9.0300046609182, + "grad_norm": 0.3489963114261627, + "learning_rate": 0.0004919300495482366, + "loss": 3.3258, + "step": 31000 + }, + { + "epoch": 9.0300046609182, + "eval_accuracy": 0.3673397334342865, + "eval_loss": 3.5807602405548096, + "eval_runtime": 182.9792, + "eval_samples_per_second": 90.967, + "eval_steps_per_second": 5.689, + "step": 31000 + }, + { + "epoch": 9.044570030295969, + "grad_norm": 0.3296785056591034, + "learning_rate": 0.0004917551734188283, + "loss": 3.3114, + "step": 31050 + }, + { + "epoch": 9.059135399673735, + "grad_norm": 0.3842584788799286, + "learning_rate": 0.0004915802972894199, + "loss": 3.3355, + "step": 31100 + }, + { + "epoch": 9.073700769051504, + "grad_norm": 0.3418821394443512, + "learning_rate": 0.0004914054211600116, + "loss": 3.3305, + "step": 31150 + }, + { + "epoch": 9.08826613842927, + "grad_norm": 0.3381640911102295, + "learning_rate": 0.0004912305450306033, + "loss": 3.3356, + "step": 31200 + }, + { + "epoch": 9.102831507807037, + "grad_norm": 0.33703309297561646, + "learning_rate": 0.000491055668901195, + "loss": 3.3429, + "step": 31250 + }, + { + "epoch": 9.117396877184806, + "grad_norm": 0.33812689781188965, + "learning_rate": 0.0004908807927717865, + "loss": 3.3502, + "step": 31300 + }, + { + 
"epoch": 9.131962246562573, + "grad_norm": 0.3623238801956177, + "learning_rate": 0.0004907059166423783, + "loss": 3.3456, + "step": 31350 + }, + { + "epoch": 9.14652761594034, + "grad_norm": 0.337973415851593, + "learning_rate": 0.0004905310405129699, + "loss": 3.3457, + "step": 31400 + }, + { + "epoch": 9.161092985318108, + "grad_norm": 0.3395603597164154, + "learning_rate": 0.0004903561643835616, + "loss": 3.3571, + "step": 31450 + }, + { + "epoch": 9.175658354695875, + "grad_norm": 0.35803207755088806, + "learning_rate": 0.0004901812882541533, + "loss": 3.3575, + "step": 31500 + }, + { + "epoch": 9.190223724073643, + "grad_norm": 0.3479365110397339, + "learning_rate": 0.0004900064121247449, + "loss": 3.3484, + "step": 31550 + }, + { + "epoch": 9.20478909345141, + "grad_norm": 0.3644011616706848, + "learning_rate": 0.0004898315359953366, + "loss": 3.3551, + "step": 31600 + }, + { + "epoch": 9.219354462829177, + "grad_norm": 0.36317089200019836, + "learning_rate": 0.0004896566598659283, + "loss": 3.3621, + "step": 31650 + }, + { + "epoch": 9.233919832206945, + "grad_norm": 0.33678680658340454, + "learning_rate": 0.0004894817837365199, + "loss": 3.3595, + "step": 31700 + }, + { + "epoch": 9.248485201584712, + "grad_norm": 0.3474813997745514, + "learning_rate": 0.0004893069076071115, + "loss": 3.3684, + "step": 31750 + }, + { + "epoch": 9.263050570962479, + "grad_norm": 0.35735565423965454, + "learning_rate": 0.0004891320314777032, + "loss": 3.3718, + "step": 31800 + }, + { + "epoch": 9.277615940340247, + "grad_norm": 0.3313765227794647, + "learning_rate": 0.0004889571553482949, + "loss": 3.3739, + "step": 31850 + }, + { + "epoch": 9.292181309718014, + "grad_norm": 0.33610573410987854, + "learning_rate": 0.0004887822792188866, + "loss": 3.375, + "step": 31900 + }, + { + "epoch": 9.306746679095783, + "grad_norm": 0.32763785123825073, + "learning_rate": 0.0004886074030894782, + "loss": 3.3646, + "step": 31950 + }, + { + "epoch": 9.32131204847355, + "grad_norm": 
0.3415217995643616, + "learning_rate": 0.0004884325269600699, + "loss": 3.3753, + "step": 32000 + }, + { + "epoch": 9.32131204847355, + "eval_accuracy": 0.3678076607221482, + "eval_loss": 3.5773885250091553, + "eval_runtime": 182.9174, + "eval_samples_per_second": 90.997, + "eval_steps_per_second": 5.691, + "step": 32000 + }, + { + "epoch": 9.335877417851316, + "grad_norm": 0.3389699459075928, + "learning_rate": 0.0004882576508306615, + "loss": 3.3737, + "step": 32050 + }, + { + "epoch": 9.350442787229085, + "grad_norm": 0.3345525860786438, + "learning_rate": 0.00048808277470125327, + "loss": 3.3696, + "step": 32100 + }, + { + "epoch": 9.365008156606851, + "grad_norm": 0.3717873692512512, + "learning_rate": 0.0004879078985718449, + "loss": 3.3813, + "step": 32150 + }, + { + "epoch": 9.379573525984618, + "grad_norm": 0.3748685419559479, + "learning_rate": 0.0004877330224424366, + "loss": 3.3806, + "step": 32200 + }, + { + "epoch": 9.394138895362387, + "grad_norm": 0.34190633893013, + "learning_rate": 0.00048755814631302823, + "loss": 3.3887, + "step": 32250 + }, + { + "epoch": 9.408704264740154, + "grad_norm": 0.3739558756351471, + "learning_rate": 0.00048738327018361987, + "loss": 3.3818, + "step": 32300 + }, + { + "epoch": 9.423269634117922, + "grad_norm": 0.3366689085960388, + "learning_rate": 0.00048720839405421156, + "loss": 3.3763, + "step": 32350 + }, + { + "epoch": 9.437835003495689, + "grad_norm": 0.335202157497406, + "learning_rate": 0.0004870335179248032, + "loss": 3.3885, + "step": 32400 + }, + { + "epoch": 9.452400372873456, + "grad_norm": 0.35205066204071045, + "learning_rate": 0.0004868586417953949, + "loss": 3.3892, + "step": 32450 + }, + { + "epoch": 9.466965742251224, + "grad_norm": 0.3260645270347595, + "learning_rate": 0.0004866837656659865, + "loss": 3.368, + "step": 32500 + }, + { + "epoch": 9.48153111162899, + "grad_norm": 0.35463911294937134, + "learning_rate": 0.00048650888953657816, + "loss": 3.4004, + "step": 32550 + }, + { + "epoch": 
9.496096481006758, + "grad_norm": 0.3363436162471771, + "learning_rate": 0.0004863340134071699, + "loss": 3.3733, + "step": 32600 + }, + { + "epoch": 9.510661850384526, + "grad_norm": 0.32455047965049744, + "learning_rate": 0.00048615913727776154, + "loss": 3.3817, + "step": 32650 + }, + { + "epoch": 9.525227219762293, + "grad_norm": 0.383531779050827, + "learning_rate": 0.00048598426114835323, + "loss": 3.3819, + "step": 32700 + }, + { + "epoch": 9.53979258914006, + "grad_norm": 0.3713645935058594, + "learning_rate": 0.00048580938501894486, + "loss": 3.3917, + "step": 32750 + }, + { + "epoch": 9.554357958517828, + "grad_norm": 0.3357818126678467, + "learning_rate": 0.00048563450888953655, + "loss": 3.3873, + "step": 32800 + }, + { + "epoch": 9.568923327895595, + "grad_norm": 0.3960472047328949, + "learning_rate": 0.0004854596327601282, + "loss": 3.4062, + "step": 32850 + }, + { + "epoch": 9.583488697273363, + "grad_norm": 0.33174315094947815, + "learning_rate": 0.0004852847566307198, + "loss": 3.3952, + "step": 32900 + }, + { + "epoch": 9.59805406665113, + "grad_norm": 0.33647504448890686, + "learning_rate": 0.0004851098805013115, + "loss": 3.3959, + "step": 32950 + }, + { + "epoch": 9.612619436028897, + "grad_norm": 0.32862532138824463, + "learning_rate": 0.00048493500437190315, + "loss": 3.397, + "step": 33000 + }, + { + "epoch": 9.612619436028897, + "eval_accuracy": 0.3678878432373044, + "eval_loss": 3.5697684288024902, + "eval_runtime": 182.9346, + "eval_samples_per_second": 90.989, + "eval_steps_per_second": 5.691, + "step": 33000 + }, + { + "epoch": 9.627184805406666, + "grad_norm": 0.33516356348991394, + "learning_rate": 0.0004847601282424949, + "loss": 3.3865, + "step": 33050 + }, + { + "epoch": 9.641750174784432, + "grad_norm": 0.3567980229854584, + "learning_rate": 0.00048458525211308653, + "loss": 3.401, + "step": 33100 + }, + { + "epoch": 9.6563155441622, + "grad_norm": 0.37242189049720764, + "learning_rate": 0.00048441037598367817, + "loss": 3.3982, + 
"step": 33150 + }, + { + "epoch": 9.670880913539968, + "grad_norm": 0.33064374327659607, + "learning_rate": 0.00048423549985426986, + "loss": 3.393, + "step": 33200 + }, + { + "epoch": 9.685446282917734, + "grad_norm": 0.3263695538043976, + "learning_rate": 0.0004840606237248615, + "loss": 3.396, + "step": 33250 + }, + { + "epoch": 9.700011652295503, + "grad_norm": 0.3375985622406006, + "learning_rate": 0.0004838857475954532, + "loss": 3.3941, + "step": 33300 + }, + { + "epoch": 9.71457702167327, + "grad_norm": 0.3303999900817871, + "learning_rate": 0.0004837108714660448, + "loss": 3.4037, + "step": 33350 + }, + { + "epoch": 9.729142391051036, + "grad_norm": 0.34881582856178284, + "learning_rate": 0.0004835359953366365, + "loss": 3.3999, + "step": 33400 + }, + { + "epoch": 9.743707760428805, + "grad_norm": 0.33368274569511414, + "learning_rate": 0.00048336111920722815, + "loss": 3.4048, + "step": 33450 + }, + { + "epoch": 9.758273129806572, + "grad_norm": 0.3425474166870117, + "learning_rate": 0.0004831862430778198, + "loss": 3.4066, + "step": 33500 + }, + { + "epoch": 9.772838499184338, + "grad_norm": 0.35753095149993896, + "learning_rate": 0.00048301136694841153, + "loss": 3.409, + "step": 33550 + }, + { + "epoch": 9.787403868562107, + "grad_norm": 0.37741026282310486, + "learning_rate": 0.00048283649081900317, + "loss": 3.4035, + "step": 33600 + }, + { + "epoch": 9.801969237939874, + "grad_norm": 0.33492302894592285, + "learning_rate": 0.00048266161468959486, + "loss": 3.4129, + "step": 33650 + }, + { + "epoch": 9.816534607317642, + "grad_norm": 0.3427729308605194, + "learning_rate": 0.0004824867385601865, + "loss": 3.4104, + "step": 33700 + }, + { + "epoch": 9.831099976695409, + "grad_norm": 0.32642075419425964, + "learning_rate": 0.00048231186243077813, + "loss": 3.4067, + "step": 33750 + }, + { + "epoch": 9.845665346073176, + "grad_norm": 0.34097906947135925, + "learning_rate": 0.0004821369863013698, + "loss": 3.4071, + "step": 33800 + }, + { + "epoch": 
9.860230715450944, + "grad_norm": 0.3331773579120636, + "learning_rate": 0.00048196211017196146, + "loss": 3.3962, + "step": 33850 + }, + { + "epoch": 9.874796084828711, + "grad_norm": 0.33215972781181335, + "learning_rate": 0.00048178723404255315, + "loss": 3.4052, + "step": 33900 + }, + { + "epoch": 9.88936145420648, + "grad_norm": 0.3272652328014374, + "learning_rate": 0.0004816123579131448, + "loss": 3.4032, + "step": 33950 + }, + { + "epoch": 9.903926823584246, + "grad_norm": 0.31748270988464355, + "learning_rate": 0.0004814374817837364, + "loss": 3.4104, + "step": 34000 + }, + { + "epoch": 9.903926823584246, + "eval_accuracy": 0.36873904765039955, + "eval_loss": 3.559356212615967, + "eval_runtime": 182.8824, + "eval_samples_per_second": 91.015, + "eval_steps_per_second": 5.692, + "step": 34000 + }, + { + "epoch": 9.918492192962013, + "grad_norm": 0.33125990629196167, + "learning_rate": 0.00048126260565432816, + "loss": 3.4106, + "step": 34050 + }, + { + "epoch": 9.933057562339782, + "grad_norm": 0.32902291417121887, + "learning_rate": 0.0004810877295249198, + "loss": 3.4107, + "step": 34100 + }, + { + "epoch": 9.947622931717548, + "grad_norm": 0.3390238881111145, + "learning_rate": 0.0004809128533955115, + "loss": 3.4169, + "step": 34150 + }, + { + "epoch": 9.962188301095315, + "grad_norm": 0.3341410458087921, + "learning_rate": 0.0004807379772661031, + "loss": 3.4056, + "step": 34200 + }, + { + "epoch": 9.976753670473084, + "grad_norm": 0.3485892415046692, + "learning_rate": 0.0004805631011366948, + "loss": 3.4186, + "step": 34250 + }, + { + "epoch": 9.99131903985085, + "grad_norm": 0.3646714687347412, + "learning_rate": 0.00048038822500728645, + "loss": 3.4096, + "step": 34300 + }, + { + "epoch": 10.005826147751106, + "grad_norm": 0.35494521260261536, + "learning_rate": 0.0004802133488778781, + "loss": 3.367, + "step": 34350 + }, + { + "epoch": 10.020391517128875, + "grad_norm": 0.37137651443481445, + "learning_rate": 0.0004800384727484698, + "loss": 
3.2986, + "step": 34400 + }, + { + "epoch": 10.034956886506642, + "grad_norm": 0.34275004267692566, + "learning_rate": 0.0004798635966190614, + "loss": 3.3048, + "step": 34450 + }, + { + "epoch": 10.049522255884408, + "grad_norm": 0.3744957447052002, + "learning_rate": 0.00047968872048965316, + "loss": 3.3039, + "step": 34500 + }, + { + "epoch": 10.064087625262177, + "grad_norm": 0.3762577772140503, + "learning_rate": 0.0004795138443602448, + "loss": 3.3204, + "step": 34550 + }, + { + "epoch": 10.078652994639944, + "grad_norm": 0.3589562475681305, + "learning_rate": 0.00047933896823083643, + "loss": 3.3209, + "step": 34600 + }, + { + "epoch": 10.093218364017712, + "grad_norm": 0.3450121581554413, + "learning_rate": 0.0004791640921014281, + "loss": 3.3213, + "step": 34650 + }, + { + "epoch": 10.107783733395479, + "grad_norm": 0.35901615023612976, + "learning_rate": 0.00047898921597201976, + "loss": 3.3242, + "step": 34700 + }, + { + "epoch": 10.122349102773246, + "grad_norm": 0.3458753228187561, + "learning_rate": 0.00047881433984261145, + "loss": 3.3272, + "step": 34750 + }, + { + "epoch": 10.136914472151014, + "grad_norm": 0.3470238149166107, + "learning_rate": 0.0004786394637132031, + "loss": 3.3357, + "step": 34800 + }, + { + "epoch": 10.151479841528781, + "grad_norm": 0.3423948585987091, + "learning_rate": 0.0004784645875837948, + "loss": 3.3348, + "step": 34850 + }, + { + "epoch": 10.166045210906548, + "grad_norm": 0.36468738317489624, + "learning_rate": 0.0004782897114543864, + "loss": 3.334, + "step": 34900 + }, + { + "epoch": 10.180610580284316, + "grad_norm": 0.36260437965393066, + "learning_rate": 0.00047811483532497805, + "loss": 3.3367, + "step": 34950 + }, + { + "epoch": 10.195175949662083, + "grad_norm": 0.34456831216812134, + "learning_rate": 0.0004779399591955698, + "loss": 3.3356, + "step": 35000 + }, + { + "epoch": 10.195175949662083, + "eval_accuracy": 0.36854717394844216, + "eval_loss": 3.571223497390747, + "eval_runtime": 182.8992, + 
"eval_samples_per_second": 91.006, + "eval_steps_per_second": 5.692, + "step": 35000 + }, + { + "epoch": 10.209741319039852, + "grad_norm": 0.397850900888443, + "learning_rate": 0.00047776508306616143, + "loss": 3.3288, + "step": 35050 + }, + { + "epoch": 10.224306688417618, + "grad_norm": 0.3417603671550751, + "learning_rate": 0.0004775902069367531, + "loss": 3.3384, + "step": 35100 + }, + { + "epoch": 10.238872057795385, + "grad_norm": 0.37064996361732483, + "learning_rate": 0.00047741533080734476, + "loss": 3.3495, + "step": 35150 + }, + { + "epoch": 10.253437427173154, + "grad_norm": 0.3486710488796234, + "learning_rate": 0.0004772404546779364, + "loss": 3.3439, + "step": 35200 + }, + { + "epoch": 10.26800279655092, + "grad_norm": 0.35850661993026733, + "learning_rate": 0.0004770655785485281, + "loss": 3.3368, + "step": 35250 + }, + { + "epoch": 10.282568165928687, + "grad_norm": 0.3511759638786316, + "learning_rate": 0.0004768907024191197, + "loss": 3.3436, + "step": 35300 + }, + { + "epoch": 10.297133535306456, + "grad_norm": 0.34070059657096863, + "learning_rate": 0.0004767158262897114, + "loss": 3.3356, + "step": 35350 + }, + { + "epoch": 10.311698904684222, + "grad_norm": 0.3479948937892914, + "learning_rate": 0.00047654095016030305, + "loss": 3.3556, + "step": 35400 + }, + { + "epoch": 10.326264274061991, + "grad_norm": 0.33916714787483215, + "learning_rate": 0.0004763660740308948, + "loss": 3.368, + "step": 35450 + }, + { + "epoch": 10.340829643439758, + "grad_norm": 0.3525298833847046, + "learning_rate": 0.0004761911979014864, + "loss": 3.3581, + "step": 35500 + }, + { + "epoch": 10.355395012817525, + "grad_norm": 0.3512914776802063, + "learning_rate": 0.00047601632177207806, + "loss": 3.3598, + "step": 35550 + }, + { + "epoch": 10.369960382195293, + "grad_norm": 0.3474045991897583, + "learning_rate": 0.00047584144564266975, + "loss": 3.3461, + "step": 35600 + }, + { + "epoch": 10.38452575157306, + "grad_norm": 0.3418622612953186, + "learning_rate": 
0.0004756665695132614, + "loss": 3.3611, + "step": 35650 + }, + { + "epoch": 10.399091120950827, + "grad_norm": 0.34689825773239136, + "learning_rate": 0.0004754916933838531, + "loss": 3.3457, + "step": 35700 + }, + { + "epoch": 10.413656490328595, + "grad_norm": 0.32606783509254456, + "learning_rate": 0.0004753168172544447, + "loss": 3.3576, + "step": 35750 + }, + { + "epoch": 10.428221859706362, + "grad_norm": 0.3652861714363098, + "learning_rate": 0.00047514194112503635, + "loss": 3.3603, + "step": 35800 + }, + { + "epoch": 10.44278722908413, + "grad_norm": 0.3363873362541199, + "learning_rate": 0.00047496706499562804, + "loss": 3.356, + "step": 35850 + }, + { + "epoch": 10.457352598461897, + "grad_norm": 0.3396660387516022, + "learning_rate": 0.0004747921888662197, + "loss": 3.3657, + "step": 35900 + }, + { + "epoch": 10.471917967839664, + "grad_norm": 0.3386980891227722, + "learning_rate": 0.0004746173127368114, + "loss": 3.3529, + "step": 35950 + }, + { + "epoch": 10.486483337217432, + "grad_norm": 0.35521814227104187, + "learning_rate": 0.00047444243660740306, + "loss": 3.3582, + "step": 36000 + }, + { + "epoch": 10.486483337217432, + "eval_accuracy": 0.36882287482533554, + "eval_loss": 3.565991163253784, + "eval_runtime": 182.8426, + "eval_samples_per_second": 91.035, + "eval_steps_per_second": 5.693, + "step": 36000 + }, + { + "epoch": 10.5010487065952, + "grad_norm": 0.36773911118507385, + "learning_rate": 0.0004742675604779947, + "loss": 3.3744, + "step": 36050 + }, + { + "epoch": 10.515614075972966, + "grad_norm": 0.3470185101032257, + "learning_rate": 0.0004740926843485864, + "loss": 3.3555, + "step": 36100 + }, + { + "epoch": 10.530179445350734, + "grad_norm": 0.3499913513660431, + "learning_rate": 0.000473917808219178, + "loss": 3.3665, + "step": 36150 + }, + { + "epoch": 10.544744814728501, + "grad_norm": 0.32940107583999634, + "learning_rate": 0.0004737429320897697, + "loss": 3.3777, + "step": 36200 + }, + { + "epoch": 10.55931018410627, + 
"grad_norm": 0.3400062620639801, + "learning_rate": 0.00047356805596036135, + "loss": 3.3633, + "step": 36250 + }, + { + "epoch": 10.573875553484037, + "grad_norm": 0.35213765501976013, + "learning_rate": 0.00047339317983095304, + "loss": 3.3725, + "step": 36300 + }, + { + "epoch": 10.588440922861803, + "grad_norm": 0.3383064270019531, + "learning_rate": 0.0004732183037015447, + "loss": 3.3674, + "step": 36350 + }, + { + "epoch": 10.603006292239572, + "grad_norm": 0.3423946797847748, + "learning_rate": 0.0004730434275721363, + "loss": 3.3744, + "step": 36400 + }, + { + "epoch": 10.617571661617339, + "grad_norm": 0.32633453607559204, + "learning_rate": 0.00047286855144272806, + "loss": 3.3741, + "step": 36450 + }, + { + "epoch": 10.632137030995105, + "grad_norm": 0.3443063199520111, + "learning_rate": 0.0004726936753133197, + "loss": 3.3679, + "step": 36500 + }, + { + "epoch": 10.646702400372874, + "grad_norm": 0.36050254106521606, + "learning_rate": 0.0004725187991839114, + "loss": 3.3732, + "step": 36550 + }, + { + "epoch": 10.66126776975064, + "grad_norm": 0.35534441471099854, + "learning_rate": 0.000472343923054503, + "loss": 3.3749, + "step": 36600 + }, + { + "epoch": 10.675833139128407, + "grad_norm": 0.369785338640213, + "learning_rate": 0.00047216904692509465, + "loss": 3.3807, + "step": 36650 + }, + { + "epoch": 10.690398508506176, + "grad_norm": 0.35582253336906433, + "learning_rate": 0.00047199417079568634, + "loss": 3.3754, + "step": 36700 + }, + { + "epoch": 10.704963877883943, + "grad_norm": 0.36503931879997253, + "learning_rate": 0.000471819294666278, + "loss": 3.373, + "step": 36750 + }, + { + "epoch": 10.719529247261711, + "grad_norm": 0.33128494024276733, + "learning_rate": 0.00047164441853686967, + "loss": 3.3887, + "step": 36800 + }, + { + "epoch": 10.734094616639478, + "grad_norm": 0.32897520065307617, + "learning_rate": 0.0004714695424074613, + "loss": 3.3912, + "step": 36850 + }, + { + "epoch": 10.748659986017245, + "grad_norm": 
0.3286604583263397, + "learning_rate": 0.00047129466627805305, + "loss": 3.3706, + "step": 36900 + }, + { + "epoch": 10.763225355395013, + "grad_norm": 0.3394903242588043, + "learning_rate": 0.0004711197901486447, + "loss": 3.3752, + "step": 36950 + }, + { + "epoch": 10.77779072477278, + "grad_norm": 0.3878060579299927, + "learning_rate": 0.0004709449140192363, + "loss": 3.3867, + "step": 37000 + }, + { + "epoch": 10.77779072477278, + "eval_accuracy": 0.3696466855052468, + "eval_loss": 3.55853533744812, + "eval_runtime": 182.8784, + "eval_samples_per_second": 91.017, + "eval_steps_per_second": 5.692, + "step": 37000 + }, + { + "epoch": 10.792356094150549, + "grad_norm": 0.3377073407173157, + "learning_rate": 0.000470770037889828, + "loss": 3.3732, + "step": 37050 + }, + { + "epoch": 10.806921463528315, + "grad_norm": 0.3298020660877228, + "learning_rate": 0.00047059516176041965, + "loss": 3.3806, + "step": 37100 + }, + { + "epoch": 10.821486832906082, + "grad_norm": 0.33295738697052, + "learning_rate": 0.00047042028563101134, + "loss": 3.389, + "step": 37150 + }, + { + "epoch": 10.83605220228385, + "grad_norm": 0.332865446805954, + "learning_rate": 0.000470245409501603, + "loss": 3.3811, + "step": 37200 + }, + { + "epoch": 10.850617571661617, + "grad_norm": 0.3256121575832367, + "learning_rate": 0.0004700705333721946, + "loss": 3.3895, + "step": 37250 + }, + { + "epoch": 10.865182941039384, + "grad_norm": 0.3128420114517212, + "learning_rate": 0.0004698956572427863, + "loss": 3.3926, + "step": 37300 + }, + { + "epoch": 10.879748310417153, + "grad_norm": 0.3602403402328491, + "learning_rate": 0.00046972078111337794, + "loss": 3.3802, + "step": 37350 + }, + { + "epoch": 10.89431367979492, + "grad_norm": 0.3453914523124695, + "learning_rate": 0.0004695459049839697, + "loss": 3.3903, + "step": 37400 + }, + { + "epoch": 10.908879049172686, + "grad_norm": 0.3409225046634674, + "learning_rate": 0.0004693710288545613, + "loss": 3.386, + "step": 37450 + }, + { + "epoch": 
10.923444418550455, + "grad_norm": 0.33663007616996765, + "learning_rate": 0.000469196152725153, + "loss": 3.3895, + "step": 37500 + }, + { + "epoch": 10.938009787928221, + "grad_norm": 0.3527715504169464, + "learning_rate": 0.00046902127659574465, + "loss": 3.3847, + "step": 37550 + }, + { + "epoch": 10.95257515730599, + "grad_norm": 0.33517366647720337, + "learning_rate": 0.0004688464004663363, + "loss": 3.3846, + "step": 37600 + }, + { + "epoch": 10.967140526683757, + "grad_norm": 0.3335331082344055, + "learning_rate": 0.000468671524336928, + "loss": 3.3903, + "step": 37650 + }, + { + "epoch": 10.981705896061523, + "grad_norm": 0.36070242524147034, + "learning_rate": 0.0004684966482075196, + "loss": 3.3784, + "step": 37700 + }, + { + "epoch": 10.996271265439292, + "grad_norm": 0.359713077545166, + "learning_rate": 0.0004683217720781113, + "loss": 3.3875, + "step": 37750 + }, + { + "epoch": 11.010778373339548, + "grad_norm": 0.3132033050060272, + "learning_rate": 0.00046814689594870294, + "loss": 3.2974, + "step": 37800 + }, + { + "epoch": 11.025343742717315, + "grad_norm": 0.3545316755771637, + "learning_rate": 0.0004679720198192946, + "loss": 3.2811, + "step": 37850 + }, + { + "epoch": 11.039909112095083, + "grad_norm": 0.3404482901096344, + "learning_rate": 0.0004677971436898863, + "loss": 3.287, + "step": 37900 + }, + { + "epoch": 11.05447448147285, + "grad_norm": 0.36774441599845886, + "learning_rate": 0.00046762226756047795, + "loss": 3.2756, + "step": 37950 + }, + { + "epoch": 11.069039850850617, + "grad_norm": 0.3585566580295563, + "learning_rate": 0.00046744739143106964, + "loss": 3.2826, + "step": 38000 + }, + { + "epoch": 11.069039850850617, + "eval_accuracy": 0.36930385234660246, + "eval_loss": 3.566277027130127, + "eval_runtime": 182.9741, + "eval_samples_per_second": 90.969, + "eval_steps_per_second": 5.689, + "step": 38000 + }, + { + "epoch": 11.083605220228385, + "grad_norm": 0.31277865171432495, + "learning_rate": 0.0004672725153016613, + "loss": 
3.2838, + "step": 38050 + }, + { + "epoch": 11.098170589606152, + "grad_norm": 0.3467193841934204, + "learning_rate": 0.00046709763917225297, + "loss": 3.2891, + "step": 38100 + }, + { + "epoch": 11.11273595898392, + "grad_norm": 0.358010470867157, + "learning_rate": 0.0004669227630428446, + "loss": 3.3082, + "step": 38150 + }, + { + "epoch": 11.127301328361687, + "grad_norm": 0.37875378131866455, + "learning_rate": 0.00046674788691343624, + "loss": 3.3049, + "step": 38200 + }, + { + "epoch": 11.141866697739454, + "grad_norm": 0.3386188745498657, + "learning_rate": 0.00046657301078402793, + "loss": 3.3061, + "step": 38250 + }, + { + "epoch": 11.156432067117223, + "grad_norm": 0.3643662929534912, + "learning_rate": 0.00046639813465461957, + "loss": 3.3008, + "step": 38300 + }, + { + "epoch": 11.17099743649499, + "grad_norm": 0.3499019145965576, + "learning_rate": 0.0004662232585252113, + "loss": 3.3057, + "step": 38350 + }, + { + "epoch": 11.185562805872756, + "grad_norm": 0.32544127106666565, + "learning_rate": 0.00046604838239580295, + "loss": 3.3152, + "step": 38400 + }, + { + "epoch": 11.200128175250525, + "grad_norm": 0.3370313048362732, + "learning_rate": 0.0004658735062663946, + "loss": 3.3052, + "step": 38450 + }, + { + "epoch": 11.214693544628291, + "grad_norm": 0.3501981496810913, + "learning_rate": 0.0004656986301369863, + "loss": 3.3075, + "step": 38500 + }, + { + "epoch": 11.22925891400606, + "grad_norm": 0.3423289954662323, + "learning_rate": 0.0004655237540075779, + "loss": 3.3127, + "step": 38550 + }, + { + "epoch": 11.243824283383827, + "grad_norm": 0.3272441625595093, + "learning_rate": 0.0004653488778781696, + "loss": 3.3347, + "step": 38600 + }, + { + "epoch": 11.258389652761593, + "grad_norm": 0.3582228422164917, + "learning_rate": 0.00046517400174876124, + "loss": 3.3142, + "step": 38650 + }, + { + "epoch": 11.272955022139362, + "grad_norm": 0.33023253083229065, + "learning_rate": 0.0004649991256193529, + "loss": 3.3241, + "step": 38700 + }, + 
{ + "epoch": 11.287520391517129, + "grad_norm": 0.342748761177063, + "learning_rate": 0.00046482424948994457, + "loss": 3.3166, + "step": 38750 + }, + { + "epoch": 11.302085760894895, + "grad_norm": 0.34566137194633484, + "learning_rate": 0.0004646493733605362, + "loss": 3.3259, + "step": 38800 + }, + { + "epoch": 11.316651130272664, + "grad_norm": 0.35357779264450073, + "learning_rate": 0.00046447449723112795, + "loss": 3.3259, + "step": 38850 + }, + { + "epoch": 11.33121649965043, + "grad_norm": 0.3932008743286133, + "learning_rate": 0.0004642996211017196, + "loss": 3.3338, + "step": 38900 + }, + { + "epoch": 11.3457818690282, + "grad_norm": 0.3381226062774658, + "learning_rate": 0.0004641247449723113, + "loss": 3.3374, + "step": 38950 + }, + { + "epoch": 11.360347238405966, + "grad_norm": 0.384560763835907, + "learning_rate": 0.0004639498688429029, + "loss": 3.3467, + "step": 39000 + }, + { + "epoch": 11.360347238405966, + "eval_accuracy": 0.36921779134793037, + "eval_loss": 3.5652575492858887, + "eval_runtime": 182.9963, + "eval_samples_per_second": 90.958, + "eval_steps_per_second": 5.689, + "step": 39000 + }, + { + "epoch": 11.374912607783733, + "grad_norm": 0.3471319377422333, + "learning_rate": 0.00046377499271349455, + "loss": 3.3447, + "step": 39050 + }, + { + "epoch": 11.389477977161501, + "grad_norm": 0.34540602564811707, + "learning_rate": 0.00046360011658408624, + "loss": 3.3322, + "step": 39100 + }, + { + "epoch": 11.404043346539268, + "grad_norm": 0.35746103525161743, + "learning_rate": 0.00046342524045467787, + "loss": 3.3347, + "step": 39150 + }, + { + "epoch": 11.418608715917035, + "grad_norm": 0.35586029291152954, + "learning_rate": 0.00046325036432526956, + "loss": 3.3522, + "step": 39200 + }, + { + "epoch": 11.433174085294803, + "grad_norm": 0.34494882822036743, + "learning_rate": 0.0004630754881958612, + "loss": 3.3351, + "step": 39250 + }, + { + "epoch": 11.44773945467257, + "grad_norm": 0.36397963762283325, + "learning_rate": 
0.00046290061206645284, + "loss": 3.3338, + "step": 39300 + }, + { + "epoch": 11.462304824050339, + "grad_norm": 0.3657397925853729, + "learning_rate": 0.0004627257359370446, + "loss": 3.3472, + "step": 39350 + }, + { + "epoch": 11.476870193428105, + "grad_norm": 0.3489623963832855, + "learning_rate": 0.0004625508598076362, + "loss": 3.3422, + "step": 39400 + }, + { + "epoch": 11.491435562805872, + "grad_norm": 0.34582746028900146, + "learning_rate": 0.0004623759836782279, + "loss": 3.3421, + "step": 39450 + }, + { + "epoch": 11.50600093218364, + "grad_norm": 0.3677220046520233, + "learning_rate": 0.00046220110754881954, + "loss": 3.3496, + "step": 39500 + }, + { + "epoch": 11.520566301561407, + "grad_norm": 0.38130640983581543, + "learning_rate": 0.00046202623141941123, + "loss": 3.3369, + "step": 39550 + }, + { + "epoch": 11.535131670939174, + "grad_norm": 0.4037802815437317, + "learning_rate": 0.00046185135529000287, + "loss": 3.353, + "step": 39600 + }, + { + "epoch": 11.549697040316943, + "grad_norm": 0.3462783396244049, + "learning_rate": 0.0004616764791605945, + "loss": 3.3579, + "step": 39650 + }, + { + "epoch": 11.56426240969471, + "grad_norm": 0.35526663064956665, + "learning_rate": 0.0004615016030311862, + "loss": 3.3736, + "step": 39700 + }, + { + "epoch": 11.578827779072478, + "grad_norm": 0.35431742668151855, + "learning_rate": 0.00046132672690177783, + "loss": 3.3481, + "step": 39750 + }, + { + "epoch": 11.593393148450245, + "grad_norm": 0.35547706484794617, + "learning_rate": 0.0004611518507723696, + "loss": 3.3512, + "step": 39800 + }, + { + "epoch": 11.607958517828012, + "grad_norm": 0.33214691281318665, + "learning_rate": 0.0004609769746429612, + "loss": 3.3563, + "step": 39850 + }, + { + "epoch": 11.62252388720578, + "grad_norm": 0.3578963875770569, + "learning_rate": 0.00046080209851355285, + "loss": 3.3504, + "step": 39900 + }, + { + "epoch": 11.637089256583547, + "grad_norm": 0.3476736545562744, + "learning_rate": 0.00046062722238414454, + 
"loss": 3.3642, + "step": 39950 + }, + { + "epoch": 11.651654625961314, + "grad_norm": 0.34914782643318176, + "learning_rate": 0.0004604523462547362, + "loss": 3.3574, + "step": 40000 + }, + { + "epoch": 11.651654625961314, + "eval_accuracy": 0.36975038195446647, + "eval_loss": 3.5585873126983643, + "eval_runtime": 182.9354, + "eval_samples_per_second": 90.988, + "eval_steps_per_second": 5.691, + "step": 40000 + }, + { + "epoch": 11.666219995339082, + "grad_norm": 0.3997795581817627, + "learning_rate": 0.00046027747012532787, + "loss": 3.3593, + "step": 40050 + }, + { + "epoch": 11.680785364716849, + "grad_norm": 0.34242531657218933, + "learning_rate": 0.0004601025939959195, + "loss": 3.351, + "step": 40100 + }, + { + "epoch": 11.695350734094617, + "grad_norm": 0.35713550448417664, + "learning_rate": 0.0004599277178665112, + "loss": 3.3586, + "step": 40150 + }, + { + "epoch": 11.709916103472384, + "grad_norm": 0.37306341528892517, + "learning_rate": 0.00045975284173710283, + "loss": 3.3586, + "step": 40200 + }, + { + "epoch": 11.724481472850151, + "grad_norm": 0.3383656144142151, + "learning_rate": 0.00045957796560769446, + "loss": 3.3614, + "step": 40250 + }, + { + "epoch": 11.73904684222792, + "grad_norm": 0.35948076844215393, + "learning_rate": 0.0004594030894782862, + "loss": 3.3551, + "step": 40300 + }, + { + "epoch": 11.753612211605686, + "grad_norm": 0.360495388507843, + "learning_rate": 0.00045922821334887785, + "loss": 3.3622, + "step": 40350 + }, + { + "epoch": 11.768177580983453, + "grad_norm": 0.34619754552841187, + "learning_rate": 0.00045905333721946954, + "loss": 3.3622, + "step": 40400 + }, + { + "epoch": 11.782742950361222, + "grad_norm": 0.34578973054885864, + "learning_rate": 0.00045887846109006117, + "loss": 3.3726, + "step": 40450 + }, + { + "epoch": 11.797308319738988, + "grad_norm": 0.3434268534183502, + "learning_rate": 0.0004587035849606528, + "loss": 3.3633, + "step": 40500 + }, + { + "epoch": 11.811873689116755, + "grad_norm": 
0.35702431201934814, + "learning_rate": 0.0004585287088312445, + "loss": 3.3524, + "step": 40550 + }, + { + "epoch": 11.826439058494524, + "grad_norm": 0.34746211767196655, + "learning_rate": 0.00045835383270183613, + "loss": 3.377, + "step": 40600 + }, + { + "epoch": 11.84100442787229, + "grad_norm": 0.3523807227611542, + "learning_rate": 0.0004581789565724278, + "loss": 3.3717, + "step": 40650 + }, + { + "epoch": 11.855569797250059, + "grad_norm": 0.33209025859832764, + "learning_rate": 0.00045800408044301946, + "loss": 3.361, + "step": 40700 + }, + { + "epoch": 11.870135166627826, + "grad_norm": 0.3397273123264313, + "learning_rate": 0.0004578292043136111, + "loss": 3.361, + "step": 40750 + }, + { + "epoch": 11.884700536005592, + "grad_norm": 0.335334450006485, + "learning_rate": 0.00045765432818420284, + "loss": 3.3646, + "step": 40800 + }, + { + "epoch": 11.899265905383361, + "grad_norm": 0.34581536054611206, + "learning_rate": 0.0004574794520547945, + "loss": 3.3717, + "step": 40850 + }, + { + "epoch": 11.913831274761128, + "grad_norm": 0.36878204345703125, + "learning_rate": 0.00045730457592538617, + "loss": 3.3697, + "step": 40900 + }, + { + "epoch": 11.928396644138896, + "grad_norm": 0.33850565552711487, + "learning_rate": 0.0004571296997959778, + "loss": 3.3665, + "step": 40950 + }, + { + "epoch": 11.942962013516663, + "grad_norm": 0.3630038797855377, + "learning_rate": 0.0004569548236665695, + "loss": 3.3856, + "step": 41000 + }, + { + "epoch": 11.942962013516663, + "eval_accuracy": 0.37081274149545096, + "eval_loss": 3.5459952354431152, + "eval_runtime": 183.1108, + "eval_samples_per_second": 90.901, + "eval_steps_per_second": 5.685, + "step": 41000 + }, + { + "epoch": 11.95752738289443, + "grad_norm": 0.33172306418418884, + "learning_rate": 0.00045677994753716113, + "loss": 3.3737, + "step": 41050 + }, + { + "epoch": 11.972092752272198, + "grad_norm": 0.34910187125205994, + "learning_rate": 0.00045660507140775277, + "loss": 3.3773, + "step": 41100 + }, 
+ { + "epoch": 11.986658121649965, + "grad_norm": 0.3395317494869232, + "learning_rate": 0.00045643019527834446, + "loss": 3.3751, + "step": 41150 + }, + { + "epoch": 12.001165229550221, + "grad_norm": 0.34665730595588684, + "learning_rate": 0.0004562553191489361, + "loss": 3.3681, + "step": 41200 + }, + { + "epoch": 12.01573059892799, + "grad_norm": 0.3783624768257141, + "learning_rate": 0.00045608044301952784, + "loss": 3.2697, + "step": 41250 + }, + { + "epoch": 12.030295968305756, + "grad_norm": 0.3724413514137268, + "learning_rate": 0.0004559055668901195, + "loss": 3.2685, + "step": 41300 + }, + { + "epoch": 12.044861337683523, + "grad_norm": 0.34241390228271484, + "learning_rate": 0.0004557306907607111, + "loss": 3.2785, + "step": 41350 + }, + { + "epoch": 12.059426707061291, + "grad_norm": 0.3715740442276001, + "learning_rate": 0.0004555558146313028, + "loss": 3.2753, + "step": 41400 + }, + { + "epoch": 12.073992076439058, + "grad_norm": 0.3630043864250183, + "learning_rate": 0.00045538093850189444, + "loss": 3.2737, + "step": 41450 + }, + { + "epoch": 12.088557445816827, + "grad_norm": 0.3790743947029114, + "learning_rate": 0.00045520606237248613, + "loss": 3.2885, + "step": 41500 + }, + { + "epoch": 12.103122815194594, + "grad_norm": 0.37035778164863586, + "learning_rate": 0.00045503118624307776, + "loss": 3.2849, + "step": 41550 + }, + { + "epoch": 12.11768818457236, + "grad_norm": 0.3581169545650482, + "learning_rate": 0.00045485631011366945, + "loss": 3.2934, + "step": 41600 + }, + { + "epoch": 12.132253553950129, + "grad_norm": 0.367798775434494, + "learning_rate": 0.0004546814339842611, + "loss": 3.2741, + "step": 41650 + }, + { + "epoch": 12.146818923327896, + "grad_norm": 0.3496311902999878, + "learning_rate": 0.0004545065578548527, + "loss": 3.2908, + "step": 41700 + }, + { + "epoch": 12.161384292705662, + "grad_norm": 0.3402174413204193, + "learning_rate": 0.00045433168172544447, + "loss": 3.2904, + "step": 41750 + }, + { + "epoch": 
12.17594966208343, + "grad_norm": 0.3760812282562256, + "learning_rate": 0.0004541568055960361, + "loss": 3.2939, + "step": 41800 + }, + { + "epoch": 12.190515031461198, + "grad_norm": 0.36442825198173523, + "learning_rate": 0.0004539819294666278, + "loss": 3.2946, + "step": 41850 + }, + { + "epoch": 12.205080400838966, + "grad_norm": 0.39214181900024414, + "learning_rate": 0.00045380705333721943, + "loss": 3.3044, + "step": 41900 + }, + { + "epoch": 12.219645770216733, + "grad_norm": 0.3448289930820465, + "learning_rate": 0.00045363217720781107, + "loss": 3.2939, + "step": 41950 + }, + { + "epoch": 12.2342111395945, + "grad_norm": 0.3863697946071625, + "learning_rate": 0.00045345730107840276, + "loss": 3.3073, + "step": 42000 + }, + { + "epoch": 12.2342111395945, + "eval_accuracy": 0.36998775511883647, + "eval_loss": 3.563772439956665, + "eval_runtime": 183.0531, + "eval_samples_per_second": 90.93, + "eval_steps_per_second": 5.687, + "step": 42000 + }, + { + "epoch": 12.248776508972268, + "grad_norm": 0.3787136375904083, + "learning_rate": 0.0004532824249489944, + "loss": 3.3172, + "step": 42050 + }, + { + "epoch": 12.263341878350035, + "grad_norm": 0.3432846665382385, + "learning_rate": 0.0004531075488195861, + "loss": 3.3017, + "step": 42100 + }, + { + "epoch": 12.277907247727802, + "grad_norm": 0.3608255982398987, + "learning_rate": 0.0004529326726901777, + "loss": 3.306, + "step": 42150 + }, + { + "epoch": 12.29247261710557, + "grad_norm": 0.3482573628425598, + "learning_rate": 0.00045275779656076947, + "loss": 3.3011, + "step": 42200 + }, + { + "epoch": 12.307037986483337, + "grad_norm": 0.35142436623573303, + "learning_rate": 0.0004525829204313611, + "loss": 3.3088, + "step": 42250 + }, + { + "epoch": 12.321603355861104, + "grad_norm": 0.37184178829193115, + "learning_rate": 0.00045240804430195274, + "loss": 3.3109, + "step": 42300 + }, + { + "epoch": 12.336168725238872, + "grad_norm": 0.3423960208892822, + "learning_rate": 0.00045223316817254443, + "loss": 
3.3097, + "step": 42350 + }, + { + "epoch": 12.350734094616639, + "grad_norm": 0.399601548910141, + "learning_rate": 0.00045205829204313607, + "loss": 3.3125, + "step": 42400 + }, + { + "epoch": 12.365299463994408, + "grad_norm": 0.3392581641674042, + "learning_rate": 0.00045188341591372776, + "loss": 3.3176, + "step": 42450 + }, + { + "epoch": 12.379864833372174, + "grad_norm": 0.3549819886684418, + "learning_rate": 0.0004517085397843194, + "loss": 3.3176, + "step": 42500 + }, + { + "epoch": 12.394430202749941, + "grad_norm": 0.37998539209365845, + "learning_rate": 0.00045153366365491103, + "loss": 3.3305, + "step": 42550 + }, + { + "epoch": 12.40899557212771, + "grad_norm": 0.3557809293270111, + "learning_rate": 0.0004513587875255027, + "loss": 3.3238, + "step": 42600 + }, + { + "epoch": 12.423560941505476, + "grad_norm": 0.3527306914329529, + "learning_rate": 0.00045118391139609436, + "loss": 3.3299, + "step": 42650 + }, + { + "epoch": 12.438126310883243, + "grad_norm": 0.3596922755241394, + "learning_rate": 0.0004510090352666861, + "loss": 3.3251, + "step": 42700 + }, + { + "epoch": 12.452691680261012, + "grad_norm": 0.3422442674636841, + "learning_rate": 0.00045083415913727774, + "loss": 3.321, + "step": 42750 + }, + { + "epoch": 12.467257049638778, + "grad_norm": 0.4016224443912506, + "learning_rate": 0.0004506592830078694, + "loss": 3.3283, + "step": 42800 + }, + { + "epoch": 12.481822419016547, + "grad_norm": 0.34260863065719604, + "learning_rate": 0.00045048440687846106, + "loss": 3.3187, + "step": 42850 + }, + { + "epoch": 12.496387788394314, + "grad_norm": 0.343815416097641, + "learning_rate": 0.0004503095307490527, + "loss": 3.3066, + "step": 42900 + }, + { + "epoch": 12.51095315777208, + "grad_norm": 0.37220168113708496, + "learning_rate": 0.0004501346546196444, + "loss": 3.3269, + "step": 42950 + }, + { + "epoch": 12.525518527149849, + "grad_norm": 0.3391141891479492, + "learning_rate": 0.000449959778490236, + "loss": 3.338, + "step": 43000 + }, + { + 
"epoch": 12.525518527149849, + "eval_accuracy": 0.37058042382690454, + "eval_loss": 3.5549614429473877, + "eval_runtime": 182.7229, + "eval_samples_per_second": 91.094, + "eval_steps_per_second": 5.697, + "step": 43000 + }, + { + "epoch": 12.540083896527616, + "grad_norm": 0.37905097007751465, + "learning_rate": 0.0004497849023608277, + "loss": 3.3293, + "step": 43050 + }, + { + "epoch": 12.554649265905383, + "grad_norm": 0.34309253096580505, + "learning_rate": 0.00044961002623141935, + "loss": 3.3142, + "step": 43100 + }, + { + "epoch": 12.569214635283151, + "grad_norm": 0.35698094964027405, + "learning_rate": 0.000449435150102011, + "loss": 3.3401, + "step": 43150 + }, + { + "epoch": 12.583780004660918, + "grad_norm": 0.3387773036956787, + "learning_rate": 0.00044926027397260273, + "loss": 3.3299, + "step": 43200 + }, + { + "epoch": 12.598345374038686, + "grad_norm": 0.37266799807548523, + "learning_rate": 0.00044908539784319437, + "loss": 3.3354, + "step": 43250 + }, + { + "epoch": 12.612910743416453, + "grad_norm": 0.3330574333667755, + "learning_rate": 0.00044891052171378606, + "loss": 3.3432, + "step": 43300 + }, + { + "epoch": 12.62747611279422, + "grad_norm": 0.34195274114608765, + "learning_rate": 0.0004487356455843777, + "loss": 3.341, + "step": 43350 + }, + { + "epoch": 12.642041482171988, + "grad_norm": 0.36545178294181824, + "learning_rate": 0.00044856076945496933, + "loss": 3.3416, + "step": 43400 + }, + { + "epoch": 12.656606851549755, + "grad_norm": 0.3684341311454773, + "learning_rate": 0.000448385893325561, + "loss": 3.3324, + "step": 43450 + }, + { + "epoch": 12.671172220927522, + "grad_norm": 0.34405165910720825, + "learning_rate": 0.00044821101719615266, + "loss": 3.3484, + "step": 43500 + }, + { + "epoch": 12.68573759030529, + "grad_norm": 0.34472227096557617, + "learning_rate": 0.00044803614106674435, + "loss": 3.3525, + "step": 43550 + }, + { + "epoch": 12.700302959683057, + "grad_norm": 0.39018353819847107, + "learning_rate": 
0.000447861264937336, + "loss": 3.3332, + "step": 43600 + }, + { + "epoch": 12.714868329060826, + "grad_norm": 0.3903914988040924, + "learning_rate": 0.00044768638880792773, + "loss": 3.3509, + "step": 43650 + }, + { + "epoch": 12.729433698438593, + "grad_norm": 0.36807510256767273, + "learning_rate": 0.00044751151267851937, + "loss": 3.3308, + "step": 43700 + }, + { + "epoch": 12.74399906781636, + "grad_norm": 0.3552221953868866, + "learning_rate": 0.000447336636549111, + "loss": 3.3335, + "step": 43750 + }, + { + "epoch": 12.758564437194128, + "grad_norm": 0.36349716782569885, + "learning_rate": 0.0004471617604197027, + "loss": 3.3353, + "step": 43800 + }, + { + "epoch": 12.773129806571895, + "grad_norm": 0.4054705500602722, + "learning_rate": 0.00044698688429029433, + "loss": 3.3506, + "step": 43850 + }, + { + "epoch": 12.787695175949661, + "grad_norm": 0.3815021812915802, + "learning_rate": 0.000446812008160886, + "loss": 3.3482, + "step": 43900 + }, + { + "epoch": 12.80226054532743, + "grad_norm": 0.3345995843410492, + "learning_rate": 0.00044663713203147766, + "loss": 3.3389, + "step": 43950 + }, + { + "epoch": 12.816825914705197, + "grad_norm": 0.3273864984512329, + "learning_rate": 0.0004464622559020693, + "loss": 3.3361, + "step": 44000 + }, + { + "epoch": 12.816825914705197, + "eval_accuracy": 0.3706991691939247, + "eval_loss": 3.5476930141448975, + "eval_runtime": 182.941, + "eval_samples_per_second": 90.986, + "eval_steps_per_second": 5.69, + "step": 44000 + }, + { + "epoch": 12.831391284082965, + "grad_norm": 0.35794728994369507, + "learning_rate": 0.000446287379772661, + "loss": 3.3331, + "step": 44050 + }, + { + "epoch": 12.845956653460732, + "grad_norm": 0.38504233956336975, + "learning_rate": 0.0004461125036432526, + "loss": 3.3451, + "step": 44100 + }, + { + "epoch": 12.860522022838499, + "grad_norm": 0.3586675822734833, + "learning_rate": 0.00044593762751384436, + "loss": 3.3517, + "step": 44150 + }, + { + "epoch": 12.875087392216267, + 
"grad_norm": 0.39704591035842896, + "learning_rate": 0.000445762751384436, + "loss": 3.3519, + "step": 44200 + }, + { + "epoch": 12.889652761594034, + "grad_norm": 0.3523213267326355, + "learning_rate": 0.0004455878752550277, + "loss": 3.3524, + "step": 44250 + }, + { + "epoch": 12.9042181309718, + "grad_norm": 0.3636520802974701, + "learning_rate": 0.0004454129991256193, + "loss": 3.352, + "step": 44300 + }, + { + "epoch": 12.91878350034957, + "grad_norm": 0.34113213419914246, + "learning_rate": 0.00044523812299621096, + "loss": 3.3493, + "step": 44350 + }, + { + "epoch": 12.933348869727336, + "grad_norm": 0.33921322226524353, + "learning_rate": 0.00044506324686680265, + "loss": 3.3483, + "step": 44400 + }, + { + "epoch": 12.947914239105105, + "grad_norm": 0.3314392566680908, + "learning_rate": 0.0004448883707373943, + "loss": 3.3575, + "step": 44450 + }, + { + "epoch": 12.962479608482871, + "grad_norm": 0.33687737584114075, + "learning_rate": 0.000444713494607986, + "loss": 3.3469, + "step": 44500 + }, + { + "epoch": 12.977044977860638, + "grad_norm": 0.35737890005111694, + "learning_rate": 0.0004445386184785776, + "loss": 3.3468, + "step": 44550 + }, + { + "epoch": 12.991610347238407, + "grad_norm": 0.3497537672519684, + "learning_rate": 0.00044436374234916925, + "loss": 3.3558, + "step": 44600 + }, + { + "epoch": 13.006117455138662, + "grad_norm": 0.33811378479003906, + "learning_rate": 0.000444188866219761, + "loss": 3.3249, + "step": 44650 + }, + { + "epoch": 13.02068282451643, + "grad_norm": 0.3885464370250702, + "learning_rate": 0.00044401399009035263, + "loss": 3.246, + "step": 44700 + }, + { + "epoch": 13.035248193894198, + "grad_norm": 0.3625794053077698, + "learning_rate": 0.0004438391139609443, + "loss": 3.2451, + "step": 44750 + }, + { + "epoch": 13.049813563271965, + "grad_norm": 0.3693816363811493, + "learning_rate": 0.00044366423783153596, + "loss": 3.2544, + "step": 44800 + }, + { + "epoch": 13.064378932649731, + "grad_norm": 0.36462539434432983, 
+ "learning_rate": 0.0004434893617021276, + "loss": 3.2496, + "step": 44850 + }, + { + "epoch": 13.0789443020275, + "grad_norm": 0.36766380071640015, + "learning_rate": 0.0004433144855727193, + "loss": 3.2578, + "step": 44900 + }, + { + "epoch": 13.093509671405267, + "grad_norm": 0.3557598888874054, + "learning_rate": 0.0004431396094433109, + "loss": 3.2583, + "step": 44950 + }, + { + "epoch": 13.108075040783035, + "grad_norm": 0.373970091342926, + "learning_rate": 0.0004429647333139026, + "loss": 3.2665, + "step": 45000 + }, + { + "epoch": 13.108075040783035, + "eval_accuracy": 0.370416296567142, + "eval_loss": 3.5617780685424805, + "eval_runtime": 183.1028, + "eval_samples_per_second": 90.905, + "eval_steps_per_second": 5.685, + "step": 45000 + }, + { + "epoch": 13.122640410160802, + "grad_norm": 0.3416595458984375, + "learning_rate": 0.00044278985718449425, + "loss": 3.2524, + "step": 45050 + }, + { + "epoch": 13.137205779538569, + "grad_norm": 0.3591947853565216, + "learning_rate": 0.000442614981055086, + "loss": 3.2659, + "step": 45100 + }, + { + "epoch": 13.151771148916337, + "grad_norm": 0.5230109095573425, + "learning_rate": 0.00044244010492567763, + "loss": 3.2759, + "step": 45150 + }, + { + "epoch": 13.166336518294104, + "grad_norm": 0.33014577627182007, + "learning_rate": 0.00044226522879626927, + "loss": 3.279, + "step": 45200 + }, + { + "epoch": 13.18090188767187, + "grad_norm": 0.3678034543991089, + "learning_rate": 0.00044209035266686096, + "loss": 3.2777, + "step": 45250 + }, + { + "epoch": 13.19546725704964, + "grad_norm": 0.35158881545066833, + "learning_rate": 0.0004419154765374526, + "loss": 3.2766, + "step": 45300 + }, + { + "epoch": 13.210032626427406, + "grad_norm": 0.3579759895801544, + "learning_rate": 0.0004417406004080443, + "loss": 3.2681, + "step": 45350 + }, + { + "epoch": 13.224597995805174, + "grad_norm": 0.36432188749313354, + "learning_rate": 0.0004415657242786359, + "loss": 3.2759, + "step": 45400 + }, + { + "epoch": 
13.239163365182941, + "grad_norm": 0.3609389662742615, + "learning_rate": 0.00044139084814922755, + "loss": 3.2998, + "step": 45450 + }, + { + "epoch": 13.253728734560708, + "grad_norm": 0.3791234791278839, + "learning_rate": 0.00044121597201981924, + "loss": 3.2908, + "step": 45500 + }, + { + "epoch": 13.268294103938477, + "grad_norm": 0.3537718951702118, + "learning_rate": 0.0004410410958904109, + "loss": 3.2842, + "step": 45550 + }, + { + "epoch": 13.282859473316243, + "grad_norm": 0.3571259081363678, + "learning_rate": 0.0004408662197610026, + "loss": 3.306, + "step": 45600 + }, + { + "epoch": 13.29742484269401, + "grad_norm": 0.3450362980365753, + "learning_rate": 0.00044069134363159426, + "loss": 3.2932, + "step": 45650 + }, + { + "epoch": 13.311990212071779, + "grad_norm": 0.37204447388648987, + "learning_rate": 0.00044051646750218595, + "loss": 3.2995, + "step": 45700 + }, + { + "epoch": 13.326555581449545, + "grad_norm": 0.3619859516620636, + "learning_rate": 0.0004403415913727776, + "loss": 3.3131, + "step": 45750 + }, + { + "epoch": 13.341120950827314, + "grad_norm": 0.40078550577163696, + "learning_rate": 0.0004401667152433692, + "loss": 3.3, + "step": 45800 + }, + { + "epoch": 13.35568632020508, + "grad_norm": 0.3776214122772217, + "learning_rate": 0.0004399918391139609, + "loss": 3.3003, + "step": 45850 + }, + { + "epoch": 13.370251689582847, + "grad_norm": 0.34433192014694214, + "learning_rate": 0.00043981696298455255, + "loss": 3.2994, + "step": 45900 + }, + { + "epoch": 13.384817058960616, + "grad_norm": 0.3606876730918884, + "learning_rate": 0.00043964208685514424, + "loss": 3.3046, + "step": 45950 + }, + { + "epoch": 13.399382428338383, + "grad_norm": 0.3861118257045746, + "learning_rate": 0.0004394672107257359, + "loss": 3.2949, + "step": 46000 + }, + { + "epoch": 13.399382428338383, + "eval_accuracy": 0.37078323150820136, + "eval_loss": 3.5553009510040283, + "eval_runtime": 182.8517, + "eval_samples_per_second": 91.03, + 
"eval_steps_per_second": 5.693, + "step": 46000 + }, + { + "epoch": 13.41394779771615, + "grad_norm": 0.4186341464519501, + "learning_rate": 0.0004392923345963275, + "loss": 3.315, + "step": 46050 + }, + { + "epoch": 13.428513167093918, + "grad_norm": 0.33551856875419617, + "learning_rate": 0.00043911745846691926, + "loss": 3.3034, + "step": 46100 + }, + { + "epoch": 13.443078536471685, + "grad_norm": 0.3242250978946686, + "learning_rate": 0.0004389425823375109, + "loss": 3.3139, + "step": 46150 + }, + { + "epoch": 13.457643905849451, + "grad_norm": 0.3546141982078552, + "learning_rate": 0.0004387677062081026, + "loss": 3.3164, + "step": 46200 + }, + { + "epoch": 13.47220927522722, + "grad_norm": 0.3523365557193756, + "learning_rate": 0.0004385928300786942, + "loss": 3.3058, + "step": 46250 + }, + { + "epoch": 13.486774644604987, + "grad_norm": 0.3621370792388916, + "learning_rate": 0.0004384179539492859, + "loss": 3.3242, + "step": 46300 + }, + { + "epoch": 13.501340013982755, + "grad_norm": 0.36154744029045105, + "learning_rate": 0.00043824307781987755, + "loss": 3.3023, + "step": 46350 + }, + { + "epoch": 13.515905383360522, + "grad_norm": 0.34886816143989563, + "learning_rate": 0.0004380682016904692, + "loss": 3.3159, + "step": 46400 + }, + { + "epoch": 13.530470752738289, + "grad_norm": 0.3677714467048645, + "learning_rate": 0.0004378933255610609, + "loss": 3.3182, + "step": 46450 + }, + { + "epoch": 13.545036122116057, + "grad_norm": 0.351188987493515, + "learning_rate": 0.0004377184494316525, + "loss": 3.3209, + "step": 46500 + }, + { + "epoch": 13.559601491493824, + "grad_norm": 0.3655342161655426, + "learning_rate": 0.00043754357330224426, + "loss": 3.3248, + "step": 46550 + }, + { + "epoch": 13.574166860871593, + "grad_norm": 0.3502582907676697, + "learning_rate": 0.0004373686971728359, + "loss": 3.3134, + "step": 46600 + }, + { + "epoch": 13.58873223024936, + "grad_norm": 0.36190810799598694, + "learning_rate": 0.00043719382104342753, + "loss": 3.3175, + 
"step": 46650 + }, + { + "epoch": 13.603297599627126, + "grad_norm": 0.35235005617141724, + "learning_rate": 0.0004370189449140192, + "loss": 3.3199, + "step": 46700 + }, + { + "epoch": 13.617862969004895, + "grad_norm": 0.3493868410587311, + "learning_rate": 0.00043684406878461085, + "loss": 3.3251, + "step": 46750 + }, + { + "epoch": 13.632428338382661, + "grad_norm": 0.3571072816848755, + "learning_rate": 0.00043666919265520254, + "loss": 3.3171, + "step": 46800 + }, + { + "epoch": 13.646993707760428, + "grad_norm": 0.3887588083744049, + "learning_rate": 0.0004364943165257942, + "loss": 3.3209, + "step": 46850 + }, + { + "epoch": 13.661559077138197, + "grad_norm": 0.3646533489227295, + "learning_rate": 0.0004363194403963858, + "loss": 3.3227, + "step": 46900 + }, + { + "epoch": 13.676124446515963, + "grad_norm": 0.35989436507225037, + "learning_rate": 0.0004361445642669775, + "loss": 3.3222, + "step": 46950 + }, + { + "epoch": 13.69068981589373, + "grad_norm": 0.34057942032814026, + "learning_rate": 0.00043596968813756914, + "loss": 3.3255, + "step": 47000 + }, + { + "epoch": 13.69068981589373, + "eval_accuracy": 0.3710363590083939, + "eval_loss": 3.5472984313964844, + "eval_runtime": 182.8591, + "eval_samples_per_second": 91.026, + "eval_steps_per_second": 5.693, + "step": 47000 + }, + { + "epoch": 13.705255185271499, + "grad_norm": 0.3698166310787201, + "learning_rate": 0.0004357948120081609, + "loss": 3.3216, + "step": 47050 + }, + { + "epoch": 13.719820554649266, + "grad_norm": 0.3621658682823181, + "learning_rate": 0.0004356199358787525, + "loss": 3.3184, + "step": 47100 + }, + { + "epoch": 13.734385924027034, + "grad_norm": 0.36228302121162415, + "learning_rate": 0.0004354450597493442, + "loss": 3.3329, + "step": 47150 + }, + { + "epoch": 13.7489512934048, + "grad_norm": 0.33166074752807617, + "learning_rate": 0.00043527018361993585, + "loss": 3.3176, + "step": 47200 + }, + { + "epoch": 13.763516662782568, + "grad_norm": 0.3457214832305908, + 
"learning_rate": 0.0004350953074905275, + "loss": 3.3255, + "step": 47250 + }, + { + "epoch": 13.778082032160336, + "grad_norm": 0.38725757598876953, + "learning_rate": 0.0004349204313611192, + "loss": 3.3119, + "step": 47300 + }, + { + "epoch": 13.792647401538103, + "grad_norm": 0.3344460427761078, + "learning_rate": 0.0004347455552317108, + "loss": 3.3217, + "step": 47350 + }, + { + "epoch": 13.80721277091587, + "grad_norm": 0.3496969938278198, + "learning_rate": 0.0004345706791023025, + "loss": 3.3271, + "step": 47400 + }, + { + "epoch": 13.821778140293638, + "grad_norm": 0.3360762596130371, + "learning_rate": 0.00043439580297289414, + "loss": 3.3266, + "step": 47450 + }, + { + "epoch": 13.836343509671405, + "grad_norm": 0.34943684935569763, + "learning_rate": 0.0004342209268434858, + "loss": 3.3241, + "step": 47500 + }, + { + "epoch": 13.850908879049173, + "grad_norm": 0.3448510468006134, + "learning_rate": 0.0004340460507140775, + "loss": 3.3468, + "step": 47550 + }, + { + "epoch": 13.86547424842694, + "grad_norm": 0.35688409209251404, + "learning_rate": 0.00043387117458466916, + "loss": 3.3362, + "step": 47600 + }, + { + "epoch": 13.880039617804707, + "grad_norm": 0.36410847306251526, + "learning_rate": 0.00043369629845526085, + "loss": 3.3353, + "step": 47650 + }, + { + "epoch": 13.894604987182475, + "grad_norm": 0.33943018317222595, + "learning_rate": 0.0004335214223258525, + "loss": 3.3441, + "step": 47700 + }, + { + "epoch": 13.909170356560242, + "grad_norm": 0.34998592734336853, + "learning_rate": 0.0004333465461964442, + "loss": 3.3327, + "step": 47750 + }, + { + "epoch": 13.923735725938009, + "grad_norm": 0.36426639556884766, + "learning_rate": 0.0004331716700670358, + "loss": 3.3365, + "step": 47800 + }, + { + "epoch": 13.938301095315778, + "grad_norm": 0.3527946472167969, + "learning_rate": 0.00043299679393762745, + "loss": 3.3339, + "step": 47850 + }, + { + "epoch": 13.952866464693544, + "grad_norm": 0.36141130328178406, + "learning_rate": 
0.00043282191780821914, + "loss": 3.3355, + "step": 47900 + }, + { + "epoch": 13.967431834071313, + "grad_norm": 0.35111555457115173, + "learning_rate": 0.00043264704167881077, + "loss": 3.3369, + "step": 47950 + }, + { + "epoch": 13.98199720344908, + "grad_norm": 0.3544767498970032, + "learning_rate": 0.0004324721655494025, + "loss": 3.3499, + "step": 48000 + }, + { + "epoch": 13.98199720344908, + "eval_accuracy": 0.37179233198853223, + "eval_loss": 3.53971004486084, + "eval_runtime": 182.9517, + "eval_samples_per_second": 90.98, + "eval_steps_per_second": 5.69, + "step": 48000 + }, + { + "epoch": 13.996562572826846, + "grad_norm": 0.3594323992729187, + "learning_rate": 0.00043229728941999415, + "loss": 3.3321, + "step": 48050 + }, + { + "epoch": 14.011069680727104, + "grad_norm": 0.37229645252227783, + "learning_rate": 0.0004321224132905858, + "loss": 3.2537, + "step": 48100 + }, + { + "epoch": 14.02563505010487, + "grad_norm": 0.3777659237384796, + "learning_rate": 0.0004319475371611775, + "loss": 3.2282, + "step": 48150 + }, + { + "epoch": 14.040200419482638, + "grad_norm": 0.34410324692726135, + "learning_rate": 0.0004317726610317691, + "loss": 3.237, + "step": 48200 + }, + { + "epoch": 14.054765788860406, + "grad_norm": 0.38047492504119873, + "learning_rate": 0.0004315977849023608, + "loss": 3.2391, + "step": 48250 + }, + { + "epoch": 14.069331158238173, + "grad_norm": 0.3687790334224701, + "learning_rate": 0.00043142290877295244, + "loss": 3.2302, + "step": 48300 + }, + { + "epoch": 14.08389652761594, + "grad_norm": 0.35645702481269836, + "learning_rate": 0.00043124803264354413, + "loss": 3.2453, + "step": 48350 + }, + { + "epoch": 14.098461896993708, + "grad_norm": 0.3644995093345642, + "learning_rate": 0.00043107315651413577, + "loss": 3.2361, + "step": 48400 + }, + { + "epoch": 14.113027266371475, + "grad_norm": 0.3667345643043518, + "learning_rate": 0.0004308982803847274, + "loss": 3.2444, + "step": 48450 + }, + { + "epoch": 14.127592635749243, + 
"grad_norm": 0.3460175693035126, + "learning_rate": 0.00043072340425531915, + "loss": 3.2458, + "step": 48500 + }, + { + "epoch": 14.14215800512701, + "grad_norm": 0.38901305198669434, + "learning_rate": 0.0004305485281259108, + "loss": 3.2477, + "step": 48550 + }, + { + "epoch": 14.156723374504777, + "grad_norm": 0.3598061203956604, + "learning_rate": 0.0004303736519965025, + "loss": 3.2543, + "step": 48600 + }, + { + "epoch": 14.171288743882545, + "grad_norm": 0.37196290493011475, + "learning_rate": 0.0004301987758670941, + "loss": 3.2641, + "step": 48650 + }, + { + "epoch": 14.185854113260312, + "grad_norm": 0.348818302154541, + "learning_rate": 0.00043002389973768575, + "loss": 3.2655, + "step": 48700 + }, + { + "epoch": 14.200419482638079, + "grad_norm": 0.3512094020843506, + "learning_rate": 0.00042984902360827744, + "loss": 3.2692, + "step": 48750 + }, + { + "epoch": 14.214984852015847, + "grad_norm": 0.36507096886634827, + "learning_rate": 0.0004296741474788691, + "loss": 3.2659, + "step": 48800 + }, + { + "epoch": 14.229550221393614, + "grad_norm": 0.3447706997394562, + "learning_rate": 0.00042949927134946077, + "loss": 3.2631, + "step": 48850 + }, + { + "epoch": 14.244115590771383, + "grad_norm": 0.34184086322784424, + "learning_rate": 0.0004293243952200524, + "loss": 3.2789, + "step": 48900 + }, + { + "epoch": 14.25868096014915, + "grad_norm": 0.352250337600708, + "learning_rate": 0.00042914951909064415, + "loss": 3.2707, + "step": 48950 + }, + { + "epoch": 14.273246329526916, + "grad_norm": 0.3569357693195343, + "learning_rate": 0.0004289746429612358, + "loss": 3.2776, + "step": 49000 + }, + { + "epoch": 14.273246329526916, + "eval_accuracy": 0.37109361543783825, + "eval_loss": 3.5551700592041016, + "eval_runtime": 182.8453, + "eval_samples_per_second": 91.033, + "eval_steps_per_second": 5.693, + "step": 49000 + }, + { + "epoch": 14.287811698904685, + "grad_norm": 0.3431905508041382, + "learning_rate": 0.0004287997668318274, + "loss": 3.2826, + "step": 
49050 + }, + { + "epoch": 14.302377068282452, + "grad_norm": 0.3872802257537842, + "learning_rate": 0.0004286248907024191, + "loss": 3.2845, + "step": 49100 + }, + { + "epoch": 14.316942437660218, + "grad_norm": 0.36308524012565613, + "learning_rate": 0.00042845001457301075, + "loss": 3.2846, + "step": 49150 + }, + { + "epoch": 14.331507807037987, + "grad_norm": 0.4181770384311676, + "learning_rate": 0.00042827513844360244, + "loss": 3.2949, + "step": 49200 + }, + { + "epoch": 14.346073176415754, + "grad_norm": 0.33394357562065125, + "learning_rate": 0.00042810026231419407, + "loss": 3.2837, + "step": 49250 + }, + { + "epoch": 14.360638545793522, + "grad_norm": 0.3773725628852844, + "learning_rate": 0.0004279253861847857, + "loss": 3.28, + "step": 49300 + }, + { + "epoch": 14.375203915171289, + "grad_norm": 0.3761826455593109, + "learning_rate": 0.0004277505100553774, + "loss": 3.2864, + "step": 49350 + }, + { + "epoch": 14.389769284549056, + "grad_norm": 0.36056873202323914, + "learning_rate": 0.00042757563392596904, + "loss": 3.2878, + "step": 49400 + }, + { + "epoch": 14.404334653926824, + "grad_norm": 0.3693695366382599, + "learning_rate": 0.0004274007577965608, + "loss": 3.2889, + "step": 49450 + }, + { + "epoch": 14.418900023304591, + "grad_norm": 0.34379875659942627, + "learning_rate": 0.0004272258816671524, + "loss": 3.2927, + "step": 49500 + }, + { + "epoch": 14.433465392682358, + "grad_norm": 0.3781634569168091, + "learning_rate": 0.00042705100553774405, + "loss": 3.3048, + "step": 49550 + }, + { + "epoch": 14.448030762060126, + "grad_norm": 0.3523657023906708, + "learning_rate": 0.00042687612940833574, + "loss": 3.2899, + "step": 49600 + }, + { + "epoch": 14.462596131437893, + "grad_norm": 0.365022212266922, + "learning_rate": 0.0004267012532789274, + "loss": 3.2877, + "step": 49650 + }, + { + "epoch": 14.477161500815662, + "grad_norm": 0.3355906009674072, + "learning_rate": 0.00042652637714951907, + "loss": 3.2973, + "step": 49700 + }, + { + "epoch": 
14.491726870193428, + "grad_norm": 0.37081584334373474, + "learning_rate": 0.0004263515010201107, + "loss": 3.3001, + "step": 49750 + }, + { + "epoch": 14.506292239571195, + "grad_norm": 0.3539626896381378, + "learning_rate": 0.0004261766248907024, + "loss": 3.2871, + "step": 49800 + }, + { + "epoch": 14.520857608948964, + "grad_norm": 0.36710795760154724, + "learning_rate": 0.00042600174876129403, + "loss": 3.2974, + "step": 49850 + }, + { + "epoch": 14.53542297832673, + "grad_norm": 0.34463170170783997, + "learning_rate": 0.00042582687263188567, + "loss": 3.2937, + "step": 49900 + }, + { + "epoch": 14.549988347704497, + "grad_norm": 0.3455180525779724, + "learning_rate": 0.0004256519965024774, + "loss": 3.2996, + "step": 49950 + }, + { + "epoch": 14.564553717082266, + "grad_norm": 0.34833067655563354, + "learning_rate": 0.00042547712037306905, + "loss": 3.2981, + "step": 50000 + }, + { + "epoch": 14.564553717082266, + "eval_accuracy": 0.37169604242854265, + "eval_loss": 3.54777193069458, + "eval_runtime": 182.9961, + "eval_samples_per_second": 90.958, + "eval_steps_per_second": 5.689, + "step": 50000 + }, + { + "epoch": 14.579119086460032, + "grad_norm": 0.3737424314022064, + "learning_rate": 0.00042530224424366074, + "loss": 3.3015, + "step": 50050 + }, + { + "epoch": 14.5936844558378, + "grad_norm": 0.3670142889022827, + "learning_rate": 0.0004251273681142524, + "loss": 3.3052, + "step": 50100 + }, + { + "epoch": 14.608249825215568, + "grad_norm": 0.38956716656684875, + "learning_rate": 0.000424952491984844, + "loss": 3.3031, + "step": 50150 + }, + { + "epoch": 14.622815194593334, + "grad_norm": 0.3635333180427551, + "learning_rate": 0.0004247776158554357, + "loss": 3.3088, + "step": 50200 + }, + { + "epoch": 14.637380563971103, + "grad_norm": 0.3675437569618225, + "learning_rate": 0.00042460273972602734, + "loss": 3.3087, + "step": 50250 + }, + { + "epoch": 14.65194593334887, + "grad_norm": 0.3805491328239441, + "learning_rate": 0.00042442786359661903, + 
"loss": 3.3056, + "step": 50300 + }, + { + "epoch": 14.666511302726637, + "grad_norm": 0.36096301674842834, + "learning_rate": 0.00042425298746721066, + "loss": 3.3121, + "step": 50350 + }, + { + "epoch": 14.681076672104405, + "grad_norm": 0.35494565963745117, + "learning_rate": 0.0004240781113378024, + "loss": 3.3134, + "step": 50400 + }, + { + "epoch": 14.695642041482172, + "grad_norm": 0.3403199017047882, + "learning_rate": 0.00042390323520839405, + "loss": 3.316, + "step": 50450 + }, + { + "epoch": 14.71020741085994, + "grad_norm": 0.3592222332954407, + "learning_rate": 0.0004237283590789857, + "loss": 3.3016, + "step": 50500 + }, + { + "epoch": 14.724772780237707, + "grad_norm": 0.4019494652748108, + "learning_rate": 0.00042355348294957737, + "loss": 3.3111, + "step": 50550 + }, + { + "epoch": 14.739338149615474, + "grad_norm": 0.3454861044883728, + "learning_rate": 0.000423378606820169, + "loss": 3.3105, + "step": 50600 + }, + { + "epoch": 14.753903518993242, + "grad_norm": 0.35580456256866455, + "learning_rate": 0.0004232037306907607, + "loss": 3.3183, + "step": 50650 + }, + { + "epoch": 14.76846888837101, + "grad_norm": 0.3536444306373596, + "learning_rate": 0.00042302885456135233, + "loss": 3.3263, + "step": 50700 + }, + { + "epoch": 14.783034257748776, + "grad_norm": 0.36378782987594604, + "learning_rate": 0.00042285397843194397, + "loss": 3.3226, + "step": 50750 + }, + { + "epoch": 14.797599627126544, + "grad_norm": 0.3512897193431854, + "learning_rate": 0.00042267910230253566, + "loss": 3.3141, + "step": 50800 + }, + { + "epoch": 14.812164996504311, + "grad_norm": 0.3477229177951813, + "learning_rate": 0.0004225042261731273, + "loss": 3.3043, + "step": 50850 + }, + { + "epoch": 14.826730365882078, + "grad_norm": 0.3499688506126404, + "learning_rate": 0.00042232935004371904, + "loss": 3.3226, + "step": 50900 + }, + { + "epoch": 14.841295735259846, + "grad_norm": 0.34953367710113525, + "learning_rate": 0.0004221544739143107, + "loss": 3.3144, + "step": 
50950 + }, + { + "epoch": 14.855861104637613, + "grad_norm": 0.38450056314468384, + "learning_rate": 0.00042197959778490237, + "loss": 3.3145, + "step": 51000 + }, + { + "epoch": 14.855861104637613, + "eval_accuracy": 0.37208954811509365, + "eval_loss": 3.5404186248779297, + "eval_runtime": 230.8513, + "eval_samples_per_second": 72.103, + "eval_steps_per_second": 4.509, + "step": 51000 + }, + { + "epoch": 14.870426474015382, + "grad_norm": 0.37415948510169983, + "learning_rate": 0.000421804721655494, + "loss": 3.3218, + "step": 51050 + }, + { + "epoch": 14.884991843393149, + "grad_norm": 0.36808714270591736, + "learning_rate": 0.00042162984552608564, + "loss": 3.3253, + "step": 51100 + }, + { + "epoch": 14.899557212770915, + "grad_norm": 0.37905970215797424, + "learning_rate": 0.00042145496939667733, + "loss": 3.3039, + "step": 51150 + }, + { + "epoch": 14.914122582148684, + "grad_norm": 0.3744428753852844, + "learning_rate": 0.00042128009326726897, + "loss": 3.3124, + "step": 51200 + }, + { + "epoch": 14.92868795152645, + "grad_norm": 0.38499951362609863, + "learning_rate": 0.00042110521713786066, + "loss": 3.3305, + "step": 51250 + }, + { + "epoch": 14.943253320904217, + "grad_norm": 0.35487884283065796, + "learning_rate": 0.0004209303410084523, + "loss": 3.3269, + "step": 51300 + }, + { + "epoch": 14.957818690281986, + "grad_norm": 0.35819604992866516, + "learning_rate": 0.00042075546487904393, + "loss": 3.3218, + "step": 51350 + }, + { + "epoch": 14.972384059659753, + "grad_norm": 0.37901991605758667, + "learning_rate": 0.0004205805887496357, + "loss": 3.3316, + "step": 51400 + }, + { + "epoch": 14.986949429037521, + "grad_norm": 0.3591151833534241, + "learning_rate": 0.0004204057126202273, + "loss": 3.3292, + "step": 51450 + }, + { + "epoch": 15.001456536937777, + "grad_norm": 0.3722686171531677, + "learning_rate": 0.000420230836490819, + "loss": 3.3101, + "step": 51500 + }, + { + "epoch": 15.016021906315544, + "grad_norm": 0.3850182592868805, + 
"learning_rate": 0.00042005596036141064, + "loss": 3.2084, + "step": 51550 + }, + { + "epoch": 15.030587275693312, + "grad_norm": 0.3716738522052765, + "learning_rate": 0.0004198810842320023, + "loss": 3.2187, + "step": 51600 + }, + { + "epoch": 15.045152645071079, + "grad_norm": 0.3463042974472046, + "learning_rate": 0.00041970620810259396, + "loss": 3.2243, + "step": 51650 + }, + { + "epoch": 15.059718014448846, + "grad_norm": 0.3639398515224457, + "learning_rate": 0.0004195313319731856, + "loss": 3.2268, + "step": 51700 + }, + { + "epoch": 15.074283383826614, + "grad_norm": 0.3452468812465668, + "learning_rate": 0.0004193564558437773, + "loss": 3.222, + "step": 51750 + }, + { + "epoch": 15.088848753204381, + "grad_norm": 0.36396175622940063, + "learning_rate": 0.0004191815797143689, + "loss": 3.2334, + "step": 51800 + }, + { + "epoch": 15.103414122582148, + "grad_norm": 0.37959036231040955, + "learning_rate": 0.00041900670358496067, + "loss": 3.2208, + "step": 51850 + }, + { + "epoch": 15.117979491959916, + "grad_norm": 0.3539676070213318, + "learning_rate": 0.0004188318274555523, + "loss": 3.2322, + "step": 51900 + }, + { + "epoch": 15.132544861337683, + "grad_norm": 0.3693428039550781, + "learning_rate": 0.00041865695132614394, + "loss": 3.2495, + "step": 51950 + }, + { + "epoch": 15.147110230715452, + "grad_norm": 0.3708992004394531, + "learning_rate": 0.00041848207519673563, + "loss": 3.2624, + "step": 52000 + }, + { + "epoch": 15.147110230715452, + "eval_accuracy": 0.37109831822465095, + "eval_loss": 3.5561776161193848, + "eval_runtime": 182.9851, + "eval_samples_per_second": 90.964, + "eval_steps_per_second": 5.689, + "step": 52000 + }, + { + "epoch": 15.161675600093218, + "grad_norm": 0.3789459764957428, + "learning_rate": 0.00041830719906732727, + "loss": 3.2452, + "step": 52050 + }, + { + "epoch": 15.176240969470985, + "grad_norm": 0.3604375720024109, + "learning_rate": 0.00041813232293791896, + "loss": 3.2586, + "step": 52100 + }, + { + "epoch": 
15.190806338848754, + "grad_norm": 0.3920636475086212, + "learning_rate": 0.0004179574468085106, + "loss": 3.2532, + "step": 52150 + }, + { + "epoch": 15.20537170822652, + "grad_norm": 0.36862772703170776, + "learning_rate": 0.00041778257067910223, + "loss": 3.2501, + "step": 52200 + }, + { + "epoch": 15.219937077604287, + "grad_norm": 0.3847063481807709, + "learning_rate": 0.0004176076945496939, + "loss": 3.2635, + "step": 52250 + }, + { + "epoch": 15.234502446982056, + "grad_norm": 0.37286511063575745, + "learning_rate": 0.00041743281842028556, + "loss": 3.2518, + "step": 52300 + }, + { + "epoch": 15.249067816359823, + "grad_norm": 0.35213422775268555, + "learning_rate": 0.0004172579422908773, + "loss": 3.2495, + "step": 52350 + }, + { + "epoch": 15.263633185737591, + "grad_norm": 0.37196075916290283, + "learning_rate": 0.00041708306616146894, + "loss": 3.2574, + "step": 52400 + }, + { + "epoch": 15.278198555115358, + "grad_norm": 0.36524370312690735, + "learning_rate": 0.00041690819003206063, + "loss": 3.2571, + "step": 52450 + }, + { + "epoch": 15.292763924493125, + "grad_norm": 0.3636349141597748, + "learning_rate": 0.00041673331390265227, + "loss": 3.2618, + "step": 52500 + }, + { + "epoch": 15.307329293870893, + "grad_norm": 0.37588223814964294, + "learning_rate": 0.0004165584377732439, + "loss": 3.2782, + "step": 52550 + }, + { + "epoch": 15.32189466324866, + "grad_norm": 0.4110542833805084, + "learning_rate": 0.0004163835616438356, + "loss": 3.2642, + "step": 52600 + }, + { + "epoch": 15.336460032626427, + "grad_norm": 0.3869507908821106, + "learning_rate": 0.00041620868551442723, + "loss": 3.269, + "step": 52650 + }, + { + "epoch": 15.351025402004195, + "grad_norm": 0.3692317306995392, + "learning_rate": 0.0004160338093850189, + "loss": 3.2689, + "step": 52700 + }, + { + "epoch": 15.365590771381962, + "grad_norm": 0.3578970730304718, + "learning_rate": 0.00041585893325561056, + "loss": 3.2703, + "step": 52750 + }, + { + "epoch": 15.38015614075973, + 
"grad_norm": 0.37819942831993103, + "learning_rate": 0.0004156840571262022, + "loss": 3.2626, + "step": 52800 + }, + { + "epoch": 15.394721510137497, + "grad_norm": 0.3595389425754547, + "learning_rate": 0.00041550918099679394, + "loss": 3.2812, + "step": 52850 + }, + { + "epoch": 15.409286879515264, + "grad_norm": 0.36239922046661377, + "learning_rate": 0.0004153343048673856, + "loss": 3.271, + "step": 52900 + }, + { + "epoch": 15.423852248893033, + "grad_norm": 0.3499103784561157, + "learning_rate": 0.00041515942873797726, + "loss": 3.2892, + "step": 52950 + }, + { + "epoch": 15.4384176182708, + "grad_norm": 0.4039655327796936, + "learning_rate": 0.0004149845526085689, + "loss": 3.2688, + "step": 53000 + }, + { + "epoch": 15.4384176182708, + "eval_accuracy": 0.371650190257119, + "eval_loss": 3.551593780517578, + "eval_runtime": 182.8196, + "eval_samples_per_second": 91.046, + "eval_steps_per_second": 5.694, + "step": 53000 + }, + { + "epoch": 15.452982987648566, + "grad_norm": 0.3695296347141266, + "learning_rate": 0.0004148096764791606, + "loss": 3.2855, + "step": 53050 + }, + { + "epoch": 15.467548357026335, + "grad_norm": 0.3560178875923157, + "learning_rate": 0.0004146348003497522, + "loss": 3.2919, + "step": 53100 + }, + { + "epoch": 15.482113726404101, + "grad_norm": 0.37681275606155396, + "learning_rate": 0.00041445992422034386, + "loss": 3.2786, + "step": 53150 + }, + { + "epoch": 15.49667909578187, + "grad_norm": 0.352400004863739, + "learning_rate": 0.00041428504809093555, + "loss": 3.2802, + "step": 53200 + }, + { + "epoch": 15.511244465159637, + "grad_norm": 0.3676430881023407, + "learning_rate": 0.0004141101719615272, + "loss": 3.27, + "step": 53250 + }, + { + "epoch": 15.525809834537403, + "grad_norm": 0.3506219983100891, + "learning_rate": 0.00041393529583211893, + "loss": 3.286, + "step": 53300 + }, + { + "epoch": 15.540375203915172, + "grad_norm": 0.37572669982910156, + "learning_rate": 0.00041376041970271057, + "loss": 3.2762, + "step": 53350 + 
}, + { + "epoch": 15.554940573292939, + "grad_norm": 0.37286514043807983, + "learning_rate": 0.0004135855435733022, + "loss": 3.2919, + "step": 53400 + }, + { + "epoch": 15.569505942670705, + "grad_norm": 0.3498823046684265, + "learning_rate": 0.0004134106674438939, + "loss": 3.281, + "step": 53450 + }, + { + "epoch": 15.584071312048474, + "grad_norm": 0.38120484352111816, + "learning_rate": 0.00041323579131448553, + "loss": 3.2949, + "step": 53500 + }, + { + "epoch": 15.59863668142624, + "grad_norm": 0.35132765769958496, + "learning_rate": 0.0004130609151850772, + "loss": 3.299, + "step": 53550 + }, + { + "epoch": 15.61320205080401, + "grad_norm": 0.39883989095687866, + "learning_rate": 0.00041288603905566886, + "loss": 3.2928, + "step": 53600 + }, + { + "epoch": 15.627767420181776, + "grad_norm": 0.3505145013332367, + "learning_rate": 0.0004127111629262605, + "loss": 3.2905, + "step": 53650 + }, + { + "epoch": 15.642332789559543, + "grad_norm": 0.35722294449806213, + "learning_rate": 0.0004125362867968522, + "loss": 3.3067, + "step": 53700 + }, + { + "epoch": 15.656898158937311, + "grad_norm": 0.34419217705726624, + "learning_rate": 0.0004123614106674438, + "loss": 3.3027, + "step": 53750 + }, + { + "epoch": 15.671463528315078, + "grad_norm": 0.3667759895324707, + "learning_rate": 0.00041218653453803557, + "loss": 3.293, + "step": 53800 + }, + { + "epoch": 15.686028897692845, + "grad_norm": 0.35497573018074036, + "learning_rate": 0.0004120116584086272, + "loss": 3.2875, + "step": 53850 + }, + { + "epoch": 15.700594267070613, + "grad_norm": 0.3888442814350128, + "learning_rate": 0.0004118367822792189, + "loss": 3.2854, + "step": 53900 + }, + { + "epoch": 15.71515963644838, + "grad_norm": 0.3837001621723175, + "learning_rate": 0.00041166190614981053, + "loss": 3.3003, + "step": 53950 + }, + { + "epoch": 15.729725005826147, + "grad_norm": 0.3705950081348419, + "learning_rate": 0.00041148703002040217, + "loss": 3.2895, + "step": 54000 + }, + { + "epoch": 
15.729725005826147, + "eval_accuracy": 0.3725364304319686, + "eval_loss": 3.5416035652160645, + "eval_runtime": 194.42, + "eval_samples_per_second": 85.614, + "eval_steps_per_second": 5.354, + "step": 54000 + }, + { + "epoch": 15.744290375203915, + "grad_norm": 0.3562714159488678, + "learning_rate": 0.00041131215389099386, + "loss": 3.3019, + "step": 54050 + }, + { + "epoch": 15.758855744581682, + "grad_norm": 0.3780875504016876, + "learning_rate": 0.0004111372777615855, + "loss": 3.2982, + "step": 54100 + }, + { + "epoch": 15.77342111395945, + "grad_norm": 0.34928643703460693, + "learning_rate": 0.0004109624016321772, + "loss": 3.293, + "step": 54150 + }, + { + "epoch": 15.787986483337217, + "grad_norm": 0.3641933500766754, + "learning_rate": 0.0004107875255027688, + "loss": 3.2979, + "step": 54200 + }, + { + "epoch": 15.802551852714984, + "grad_norm": 0.34777817130088806, + "learning_rate": 0.00041061264937336045, + "loss": 3.3047, + "step": 54250 + }, + { + "epoch": 15.817117222092753, + "grad_norm": 0.3781992793083191, + "learning_rate": 0.0004104377732439522, + "loss": 3.3023, + "step": 54300 + }, + { + "epoch": 15.83168259147052, + "grad_norm": 0.35115933418273926, + "learning_rate": 0.00041026289711454384, + "loss": 3.3009, + "step": 54350 + }, + { + "epoch": 15.846247960848288, + "grad_norm": 0.3789314925670624, + "learning_rate": 0.0004100880209851355, + "loss": 3.3064, + "step": 54400 + }, + { + "epoch": 15.860813330226055, + "grad_norm": 0.35568493604660034, + "learning_rate": 0.00040991314485572716, + "loss": 3.2978, + "step": 54450 + }, + { + "epoch": 15.875378699603822, + "grad_norm": 0.34552639722824097, + "learning_rate": 0.00040973826872631885, + "loss": 3.3108, + "step": 54500 + }, + { + "epoch": 15.88994406898159, + "grad_norm": 0.34846824407577515, + "learning_rate": 0.0004095633925969105, + "loss": 3.3053, + "step": 54550 + }, + { + "epoch": 15.904509438359357, + "grad_norm": 0.37127169966697693, + "learning_rate": 0.0004093885164675021, + 
"loss": 3.3031, + "step": 54600 + }, + { + "epoch": 15.919074807737124, + "grad_norm": 0.37105128169059753, + "learning_rate": 0.0004092136403380938, + "loss": 3.3211, + "step": 54650 + }, + { + "epoch": 15.933640177114892, + "grad_norm": 0.34885090589523315, + "learning_rate": 0.00040903876420868545, + "loss": 3.3023, + "step": 54700 + }, + { + "epoch": 15.948205546492659, + "grad_norm": 0.3694795072078705, + "learning_rate": 0.00040886388807927714, + "loss": 3.3139, + "step": 54750 + }, + { + "epoch": 15.962770915870426, + "grad_norm": 0.37064146995544434, + "learning_rate": 0.00040868901194986883, + "loss": 3.2984, + "step": 54800 + }, + { + "epoch": 15.977336285248194, + "grad_norm": 0.3710354268550873, + "learning_rate": 0.00040851413582046047, + "loss": 3.314, + "step": 54850 + }, + { + "epoch": 15.991901654625961, + "grad_norm": 0.3885522484779358, + "learning_rate": 0.00040833925969105216, + "loss": 3.3173, + "step": 54900 + }, + { + "epoch": 16.006408762526217, + "grad_norm": 0.3504789173603058, + "learning_rate": 0.0004081643835616438, + "loss": 3.2611, + "step": 54950 + }, + { + "epoch": 16.020974131903984, + "grad_norm": 0.380199670791626, + "learning_rate": 0.0004079895074322355, + "loss": 3.2005, + "step": 55000 + }, + { + "epoch": 16.020974131903984, + "eval_accuracy": 0.37211976352036513, + "eval_loss": 3.5526411533355713, + "eval_runtime": 244.6938, + "eval_samples_per_second": 68.024, + "eval_steps_per_second": 4.254, + "step": 55000 + }, + { + "epoch": 16.035539501281754, + "grad_norm": 0.38104501366615295, + "learning_rate": 0.0004078146313028271, + "loss": 3.2048, + "step": 55050 + }, + { + "epoch": 16.05010487065952, + "grad_norm": 0.3903481364250183, + "learning_rate": 0.0004076397551734188, + "loss": 3.2114, + "step": 55100 + }, + { + "epoch": 16.064670240037287, + "grad_norm": 0.39314690232276917, + "learning_rate": 0.00040746487904401045, + "loss": 3.2053, + "step": 55150 + }, + { + "epoch": 16.079235609415054, + "grad_norm": 
0.38397568464279175, + "learning_rate": 0.0004072900029146021, + "loss": 3.2069, + "step": 55200 + }, + { + "epoch": 16.09380097879282, + "grad_norm": 0.3606434762477875, + "learning_rate": 0.0004071151267851938, + "loss": 3.2255, + "step": 55250 + }, + { + "epoch": 16.10836634817059, + "grad_norm": 0.41309845447540283, + "learning_rate": 0.00040694025065578546, + "loss": 3.2299, + "step": 55300 + }, + { + "epoch": 16.122931717548358, + "grad_norm": 0.3519381582736969, + "learning_rate": 0.00040676537452637716, + "loss": 3.2235, + "step": 55350 + }, + { + "epoch": 16.137497086926125, + "grad_norm": 0.3964149057865143, + "learning_rate": 0.0004065904983969688, + "loss": 3.2243, + "step": 55400 + }, + { + "epoch": 16.15206245630389, + "grad_norm": 0.38055652379989624, + "learning_rate": 0.00040641562226756043, + "loss": 3.2306, + "step": 55450 + }, + { + "epoch": 16.16662782568166, + "grad_norm": 0.3741556406021118, + "learning_rate": 0.0004062407461381521, + "loss": 3.2467, + "step": 55500 + }, + { + "epoch": 16.181193195059425, + "grad_norm": 0.36357757449150085, + "learning_rate": 0.00040606587000874375, + "loss": 3.2398, + "step": 55550 + }, + { + "epoch": 16.195758564437195, + "grad_norm": 0.3765939772129059, + "learning_rate": 0.00040589099387933544, + "loss": 3.2399, + "step": 55600 + }, + { + "epoch": 16.210323933814962, + "grad_norm": 0.4116554856300354, + "learning_rate": 0.0004057161177499271, + "loss": 3.2463, + "step": 55650 + }, + { + "epoch": 16.22488930319273, + "grad_norm": 0.35418593883514404, + "learning_rate": 0.0004055412416205187, + "loss": 3.2358, + "step": 55700 + }, + { + "epoch": 16.239454672570496, + "grad_norm": 0.3857363164424896, + "learning_rate": 0.0004053663654911104, + "loss": 3.2392, + "step": 55750 + }, + { + "epoch": 16.254020041948262, + "grad_norm": 0.38058850169181824, + "learning_rate": 0.0004051914893617021, + "loss": 3.2453, + "step": 55800 + }, + { + "epoch": 16.268585411326033, + "grad_norm": 0.389410138130188, + 
"learning_rate": 0.0004050166132322938, + "loss": 3.2443, + "step": 55850 + }, + { + "epoch": 16.2831507807038, + "grad_norm": 0.3780372440814972, + "learning_rate": 0.0004048417371028854, + "loss": 3.2311, + "step": 55900 + }, + { + "epoch": 16.297716150081566, + "grad_norm": 0.3871661126613617, + "learning_rate": 0.0004046668609734771, + "loss": 3.2533, + "step": 55950 + }, + { + "epoch": 16.312281519459333, + "grad_norm": 0.3906373083591461, + "learning_rate": 0.00040449198484406875, + "loss": 3.2617, + "step": 56000 + }, + { + "epoch": 16.312281519459333, + "eval_accuracy": 0.37166923654371037, + "eval_loss": 3.552480697631836, + "eval_runtime": 183.2175, + "eval_samples_per_second": 90.848, + "eval_steps_per_second": 5.682, + "step": 56000 + }, + { + "epoch": 16.3268468888371, + "grad_norm": 0.38734468817710876, + "learning_rate": 0.0004043171087146604, + "loss": 3.2497, + "step": 56050 + }, + { + "epoch": 16.34141225821487, + "grad_norm": 0.3852713406085968, + "learning_rate": 0.0004041422325852521, + "loss": 3.2587, + "step": 56100 + }, + { + "epoch": 16.355977627592637, + "grad_norm": 0.3770441710948944, + "learning_rate": 0.0004039673564558437, + "loss": 3.2505, + "step": 56150 + }, + { + "epoch": 16.370542996970403, + "grad_norm": 0.36964693665504456, + "learning_rate": 0.0004037924803264354, + "loss": 3.2582, + "step": 56200 + }, + { + "epoch": 16.38510836634817, + "grad_norm": 0.3863416016101837, + "learning_rate": 0.00040361760419702704, + "loss": 3.2596, + "step": 56250 + }, + { + "epoch": 16.399673735725937, + "grad_norm": 0.4353122115135193, + "learning_rate": 0.00040344272806761873, + "loss": 3.2657, + "step": 56300 + }, + { + "epoch": 16.414239105103704, + "grad_norm": 0.3620751202106476, + "learning_rate": 0.0004032678519382104, + "loss": 3.2607, + "step": 56350 + }, + { + "epoch": 16.428804474481474, + "grad_norm": 0.3591553568840027, + "learning_rate": 0.00040309297580880206, + "loss": 3.2657, + "step": 56400 + }, + { + "epoch": 
16.44336984385924, + "grad_norm": 0.3680947721004486, + "learning_rate": 0.00040291809967939375, + "loss": 3.27, + "step": 56450 + }, + { + "epoch": 16.457935213237008, + "grad_norm": 0.3693728446960449, + "learning_rate": 0.0004027432235499854, + "loss": 3.2616, + "step": 56500 + }, + { + "epoch": 16.472500582614774, + "grad_norm": 0.3734280467033386, + "learning_rate": 0.0004025683474205771, + "loss": 3.261, + "step": 56550 + }, + { + "epoch": 16.48706595199254, + "grad_norm": 0.38935521245002747, + "learning_rate": 0.0004023934712911687, + "loss": 3.2739, + "step": 56600 + }, + { + "epoch": 16.50163132137031, + "grad_norm": 0.34762194752693176, + "learning_rate": 0.00040221859516176035, + "loss": 3.2653, + "step": 56650 + }, + { + "epoch": 16.516196690748078, + "grad_norm": 0.36241579055786133, + "learning_rate": 0.00040204371903235204, + "loss": 3.2751, + "step": 56700 + }, + { + "epoch": 16.530762060125845, + "grad_norm": 0.37560877203941345, + "learning_rate": 0.0004018688429029437, + "loss": 3.2705, + "step": 56750 + }, + { + "epoch": 16.54532742950361, + "grad_norm": 0.37584882974624634, + "learning_rate": 0.0004016939667735354, + "loss": 3.2786, + "step": 56800 + }, + { + "epoch": 16.55989279888138, + "grad_norm": 0.38627269864082336, + "learning_rate": 0.00040151909064412705, + "loss": 3.2784, + "step": 56850 + }, + { + "epoch": 16.57445816825915, + "grad_norm": 0.37627264857292175, + "learning_rate": 0.0004013442145147187, + "loss": 3.2719, + "step": 56900 + }, + { + "epoch": 16.589023537636916, + "grad_norm": 0.4065730571746826, + "learning_rate": 0.0004011693383853104, + "loss": 3.2775, + "step": 56950 + }, + { + "epoch": 16.603588907014682, + "grad_norm": 0.382291704416275, + "learning_rate": 0.000400994462255902, + "loss": 3.2742, + "step": 57000 + }, + { + "epoch": 16.603588907014682, + "eval_accuracy": 0.3722539105141968, + "eval_loss": 3.5430514812469482, + "eval_runtime": 183.267, + "eval_samples_per_second": 90.824, + "eval_steps_per_second": 
5.68, + "step": 57000 + }, + { + "epoch": 16.61815427639245, + "grad_norm": 0.3411348760128021, + "learning_rate": 0.0004008195861264937, + "loss": 3.2873, + "step": 57050 + }, + { + "epoch": 16.632719645770216, + "grad_norm": 0.3512347638607025, + "learning_rate": 0.00040064470999708534, + "loss": 3.2737, + "step": 57100 + }, + { + "epoch": 16.647285015147983, + "grad_norm": 0.3661727011203766, + "learning_rate": 0.00040046983386767703, + "loss": 3.2778, + "step": 57150 + }, + { + "epoch": 16.661850384525753, + "grad_norm": 0.36524951457977295, + "learning_rate": 0.00040029495773826867, + "loss": 3.2854, + "step": 57200 + }, + { + "epoch": 16.67641575390352, + "grad_norm": 0.408110111951828, + "learning_rate": 0.0004001200816088603, + "loss": 3.2824, + "step": 57250 + }, + { + "epoch": 16.690981123281286, + "grad_norm": 0.37179291248321533, + "learning_rate": 0.00039994520547945205, + "loss": 3.2841, + "step": 57300 + }, + { + "epoch": 16.705546492659053, + "grad_norm": 0.39645498991012573, + "learning_rate": 0.0003997703293500437, + "loss": 3.2906, + "step": 57350 + }, + { + "epoch": 16.72011186203682, + "grad_norm": 0.5533550977706909, + "learning_rate": 0.0003995954532206354, + "loss": 3.2817, + "step": 57400 + }, + { + "epoch": 16.73467723141459, + "grad_norm": 0.3692794740200043, + "learning_rate": 0.000399420577091227, + "loss": 3.2839, + "step": 57450 + }, + { + "epoch": 16.749242600792357, + "grad_norm": 0.3895152807235718, + "learning_rate": 0.00039924570096181865, + "loss": 3.2873, + "step": 57500 + }, + { + "epoch": 16.763807970170124, + "grad_norm": 0.36398881673812866, + "learning_rate": 0.00039907082483241034, + "loss": 3.2989, + "step": 57550 + }, + { + "epoch": 16.77837333954789, + "grad_norm": 0.3857356309890747, + "learning_rate": 0.000398895948703002, + "loss": 3.2908, + "step": 57600 + }, + { + "epoch": 16.792938708925657, + "grad_norm": 0.37029045820236206, + "learning_rate": 0.00039872107257359367, + "loss": 3.2951, + "step": 57650 + }, + { + 
"epoch": 16.807504078303424, + "grad_norm": 0.3431582450866699, + "learning_rate": 0.0003985461964441853, + "loss": 3.3015, + "step": 57700 + }, + { + "epoch": 16.822069447681194, + "grad_norm": 0.3670969605445862, + "learning_rate": 0.00039837132031477694, + "loss": 3.2909, + "step": 57750 + }, + { + "epoch": 16.83663481705896, + "grad_norm": 0.4164856970310211, + "learning_rate": 0.0003981964441853687, + "loss": 3.2918, + "step": 57800 + }, + { + "epoch": 16.851200186436728, + "grad_norm": 0.3781265318393707, + "learning_rate": 0.0003980215680559603, + "loss": 3.2909, + "step": 57850 + }, + { + "epoch": 16.865765555814495, + "grad_norm": 0.36362481117248535, + "learning_rate": 0.000397846691926552, + "loss": 3.2974, + "step": 57900 + }, + { + "epoch": 16.88033092519226, + "grad_norm": 0.37053555250167847, + "learning_rate": 0.00039767181579714365, + "loss": 3.2951, + "step": 57950 + }, + { + "epoch": 16.89489629457003, + "grad_norm": 0.3637015223503113, + "learning_rate": 0.00039749693966773534, + "loss": 3.288, + "step": 58000 + }, + { + "epoch": 16.89489629457003, + "eval_accuracy": 0.3729018369673139, + "eval_loss": 3.5355286598205566, + "eval_runtime": 183.1123, + "eval_samples_per_second": 90.901, + "eval_steps_per_second": 5.685, + "step": 58000 + }, + { + "epoch": 16.9094616639478, + "grad_norm": 0.3660704791545868, + "learning_rate": 0.00039732206353832697, + "loss": 3.2817, + "step": 58050 + }, + { + "epoch": 16.924027033325565, + "grad_norm": 0.36517202854156494, + "learning_rate": 0.0003971471874089186, + "loss": 3.3008, + "step": 58100 + }, + { + "epoch": 16.938592402703332, + "grad_norm": 0.3487236797809601, + "learning_rate": 0.0003969723112795103, + "loss": 3.2899, + "step": 58150 + }, + { + "epoch": 16.9531577720811, + "grad_norm": 0.3602113127708435, + "learning_rate": 0.00039679743515010194, + "loss": 3.2924, + "step": 58200 + }, + { + "epoch": 16.96772314145887, + "grad_norm": 0.33480265736579895, + "learning_rate": 0.0003966225590206937, + 
"loss": 3.2941, + "step": 58250 + }, + { + "epoch": 16.982288510836636, + "grad_norm": 0.4005352258682251, + "learning_rate": 0.0003964476828912853, + "loss": 3.3045, + "step": 58300 + }, + { + "epoch": 16.996853880214402, + "grad_norm": 0.35570263862609863, + "learning_rate": 0.00039627280676187695, + "loss": 3.2877, + "step": 58350 + }, + { + "epoch": 17.01136098811466, + "grad_norm": 0.3764328062534332, + "learning_rate": 0.00039609793063246864, + "loss": 3.2242, + "step": 58400 + }, + { + "epoch": 17.025926357492427, + "grad_norm": 0.39979997277259827, + "learning_rate": 0.0003959230545030603, + "loss": 3.189, + "step": 58450 + }, + { + "epoch": 17.040491726870194, + "grad_norm": 0.37430837750434875, + "learning_rate": 0.00039574817837365197, + "loss": 3.1966, + "step": 58500 + }, + { + "epoch": 17.05505709624796, + "grad_norm": 0.4127182066440582, + "learning_rate": 0.0003955733022442436, + "loss": 3.1846, + "step": 58550 + }, + { + "epoch": 17.069622465625727, + "grad_norm": 0.38014090061187744, + "learning_rate": 0.0003953984261148353, + "loss": 3.189, + "step": 58600 + }, + { + "epoch": 17.084187835003497, + "grad_norm": 0.36704063415527344, + "learning_rate": 0.00039522354998542693, + "loss": 3.1994, + "step": 58650 + }, + { + "epoch": 17.098753204381264, + "grad_norm": 0.36234790086746216, + "learning_rate": 0.00039504867385601857, + "loss": 3.2003, + "step": 58700 + }, + { + "epoch": 17.11331857375903, + "grad_norm": 0.42512327432632446, + "learning_rate": 0.0003948737977266103, + "loss": 3.197, + "step": 58750 + }, + { + "epoch": 17.127883943136798, + "grad_norm": 0.4088022708892822, + "learning_rate": 0.00039469892159720195, + "loss": 3.2194, + "step": 58800 + }, + { + "epoch": 17.142449312514564, + "grad_norm": 0.36195436120033264, + "learning_rate": 0.00039452404546779364, + "loss": 3.206, + "step": 58850 + }, + { + "epoch": 17.15701468189233, + "grad_norm": 0.3801659643650055, + "learning_rate": 0.0003943491693383853, + "loss": 3.2166, + "step": 
58900 + }, + { + "epoch": 17.1715800512701, + "grad_norm": 0.4113653898239136, + "learning_rate": 0.0003941742932089769, + "loss": 3.2178, + "step": 58950 + }, + { + "epoch": 17.18614542064787, + "grad_norm": 0.38333582878112793, + "learning_rate": 0.0003939994170795686, + "loss": 3.2174, + "step": 59000 + }, + { + "epoch": 17.18614542064787, + "eval_accuracy": 0.37211094579509135, + "eval_loss": 3.5520923137664795, + "eval_runtime": 183.3352, + "eval_samples_per_second": 90.79, + "eval_steps_per_second": 5.678, + "step": 59000 + }, + { + "epoch": 17.200710790025635, + "grad_norm": 0.36782196164131165, + "learning_rate": 0.00039382454095016024, + "loss": 3.227, + "step": 59050 + }, + { + "epoch": 17.215276159403402, + "grad_norm": 0.3619454503059387, + "learning_rate": 0.00039364966482075193, + "loss": 3.2329, + "step": 59100 + }, + { + "epoch": 17.22984152878117, + "grad_norm": 0.38505318760871887, + "learning_rate": 0.00039347478869134356, + "loss": 3.2246, + "step": 59150 + }, + { + "epoch": 17.24440689815894, + "grad_norm": 0.3795805871486664, + "learning_rate": 0.0003932999125619353, + "loss": 3.2382, + "step": 59200 + }, + { + "epoch": 17.258972267536706, + "grad_norm": 0.39559125900268555, + "learning_rate": 0.00039312503643252695, + "loss": 3.231, + "step": 59250 + }, + { + "epoch": 17.273537636914472, + "grad_norm": 0.38150516152381897, + "learning_rate": 0.0003929501603031186, + "loss": 3.2364, + "step": 59300 + }, + { + "epoch": 17.28810300629224, + "grad_norm": 0.38355526328086853, + "learning_rate": 0.00039277528417371027, + "loss": 3.2434, + "step": 59350 + }, + { + "epoch": 17.302668375670006, + "grad_norm": 0.40026649832725525, + "learning_rate": 0.0003926004080443019, + "loss": 3.2365, + "step": 59400 + }, + { + "epoch": 17.317233745047773, + "grad_norm": 0.3706173300743103, + "learning_rate": 0.0003924255319148936, + "loss": 3.2459, + "step": 59450 + }, + { + "epoch": 17.331799114425543, + "grad_norm": 0.38115477561950684, + "learning_rate": 
0.00039225065578548523, + "loss": 3.2438, + "step": 59500 + }, + { + "epoch": 17.34636448380331, + "grad_norm": 0.39307278394699097, + "learning_rate": 0.00039207577965607687, + "loss": 3.2399, + "step": 59550 + }, + { + "epoch": 17.360929853181077, + "grad_norm": 0.4148417115211487, + "learning_rate": 0.00039190090352666856, + "loss": 3.24, + "step": 59600 + }, + { + "epoch": 17.375495222558843, + "grad_norm": 0.3889954388141632, + "learning_rate": 0.0003917260273972602, + "loss": 3.2453, + "step": 59650 + }, + { + "epoch": 17.39006059193661, + "grad_norm": 0.40655389428138733, + "learning_rate": 0.00039155115126785194, + "loss": 3.2551, + "step": 59700 + }, + { + "epoch": 17.40462596131438, + "grad_norm": 0.37711861729621887, + "learning_rate": 0.0003913762751384436, + "loss": 3.2578, + "step": 59750 + }, + { + "epoch": 17.419191330692147, + "grad_norm": 0.3447685241699219, + "learning_rate": 0.00039120139900903527, + "loss": 3.2389, + "step": 59800 + }, + { + "epoch": 17.433756700069914, + "grad_norm": 0.3808634877204895, + "learning_rate": 0.0003910265228796269, + "loss": 3.2547, + "step": 59850 + }, + { + "epoch": 17.44832206944768, + "grad_norm": 0.3830925524234772, + "learning_rate": 0.00039085164675021854, + "loss": 3.2616, + "step": 59900 + }, + { + "epoch": 17.462887438825447, + "grad_norm": 0.38864314556121826, + "learning_rate": 0.00039067677062081023, + "loss": 3.2602, + "step": 59950 + }, + { + "epoch": 17.477452808203218, + "grad_norm": 0.3750167787075043, + "learning_rate": 0.00039050189449140187, + "loss": 3.2585, + "step": 60000 + }, + { + "epoch": 17.477452808203218, + "eval_accuracy": 0.3726959724745888, + "eval_loss": 3.546113967895508, + "eval_runtime": 183.2239, + "eval_samples_per_second": 90.845, + "eval_steps_per_second": 5.682, + "step": 60000 + }, + { + "epoch": 17.492018177580984, + "grad_norm": 0.3868557810783386, + "learning_rate": 0.00039032701836199356, + "loss": 3.2677, + "step": 60050 + }, + { + "epoch": 17.50658354695875, + 
"grad_norm": 0.369974821805954, + "learning_rate": 0.0003901521422325852, + "loss": 3.2681, + "step": 60100 + }, + { + "epoch": 17.521148916336518, + "grad_norm": 0.4058365821838379, + "learning_rate": 0.00038997726610317683, + "loss": 3.2593, + "step": 60150 + }, + { + "epoch": 17.535714285714285, + "grad_norm": 0.3893314599990845, + "learning_rate": 0.0003898023899737686, + "loss": 3.262, + "step": 60200 + }, + { + "epoch": 17.55027965509205, + "grad_norm": 0.41334789991378784, + "learning_rate": 0.0003896275138443602, + "loss": 3.2686, + "step": 60250 + }, + { + "epoch": 17.56484502446982, + "grad_norm": 0.4167849123477936, + "learning_rate": 0.0003894526377149519, + "loss": 3.2655, + "step": 60300 + }, + { + "epoch": 17.57941039384759, + "grad_norm": 0.3605276644229889, + "learning_rate": 0.00038927776158554354, + "loss": 3.2709, + "step": 60350 + }, + { + "epoch": 17.593975763225355, + "grad_norm": 0.3882104158401489, + "learning_rate": 0.0003891028854561352, + "loss": 3.2607, + "step": 60400 + }, + { + "epoch": 17.608541132603122, + "grad_norm": 0.35230183601379395, + "learning_rate": 0.00038892800932672686, + "loss": 3.2739, + "step": 60450 + }, + { + "epoch": 17.62310650198089, + "grad_norm": 0.41730719804763794, + "learning_rate": 0.0003887531331973185, + "loss": 3.2713, + "step": 60500 + }, + { + "epoch": 17.63767187135866, + "grad_norm": 0.3690003752708435, + "learning_rate": 0.0003885782570679102, + "loss": 3.2633, + "step": 60550 + }, + { + "epoch": 17.652237240736426, + "grad_norm": 0.35980063676834106, + "learning_rate": 0.0003884033809385018, + "loss": 3.2724, + "step": 60600 + }, + { + "epoch": 17.666802610114193, + "grad_norm": 0.3813663125038147, + "learning_rate": 0.00038822850480909357, + "loss": 3.2703, + "step": 60650 + }, + { + "epoch": 17.68136797949196, + "grad_norm": 0.35949084162712097, + "learning_rate": 0.0003880536286796852, + "loss": 3.2721, + "step": 60700 + }, + { + "epoch": 17.695933348869726, + "grad_norm": 0.4059002697467804, + 
"learning_rate": 0.00038787875255027684, + "loss": 3.2704, + "step": 60750 + }, + { + "epoch": 17.710498718247496, + "grad_norm": 0.3678555488586426, + "learning_rate": 0.00038770387642086853, + "loss": 3.2785, + "step": 60800 + }, + { + "epoch": 17.725064087625263, + "grad_norm": 0.4052152931690216, + "learning_rate": 0.00038752900029146017, + "loss": 3.2585, + "step": 60850 + }, + { + "epoch": 17.73962945700303, + "grad_norm": 0.36793798208236694, + "learning_rate": 0.00038735412416205186, + "loss": 3.271, + "step": 60900 + }, + { + "epoch": 17.754194826380797, + "grad_norm": 0.3850816786289215, + "learning_rate": 0.0003871792480326435, + "loss": 3.2786, + "step": 60950 + }, + { + "epoch": 17.768760195758563, + "grad_norm": 0.3746644854545593, + "learning_rate": 0.00038700437190323513, + "loss": 3.279, + "step": 61000 + }, + { + "epoch": 17.768760195758563, + "eval_accuracy": 0.3732691246173842, + "eval_loss": 3.5366263389587402, + "eval_runtime": 183.1426, + "eval_samples_per_second": 90.885, + "eval_steps_per_second": 5.684, + "step": 61000 + }, + { + "epoch": 17.78332556513633, + "grad_norm": 0.3772430717945099, + "learning_rate": 0.0003868294957738268, + "loss": 3.2763, + "step": 61050 + }, + { + "epoch": 17.7978909345141, + "grad_norm": 0.3624294698238373, + "learning_rate": 0.00038665461964441846, + "loss": 3.2655, + "step": 61100 + }, + { + "epoch": 17.812456303891867, + "grad_norm": 0.3729947507381439, + "learning_rate": 0.0003864797435150102, + "loss": 3.288, + "step": 61150 + }, + { + "epoch": 17.827021673269634, + "grad_norm": 0.37783336639404297, + "learning_rate": 0.00038630486738560184, + "loss": 3.2851, + "step": 61200 + }, + { + "epoch": 17.8415870426474, + "grad_norm": 0.35818716883659363, + "learning_rate": 0.00038612999125619353, + "loss": 3.2853, + "step": 61250 + }, + { + "epoch": 17.856152412025168, + "grad_norm": 0.39936739206314087, + "learning_rate": 0.00038595511512678517, + "loss": 3.2638, + "step": 61300 + }, + { + "epoch": 
17.870717781402938, + "grad_norm": 0.38293829560279846, + "learning_rate": 0.0003857802389973768, + "loss": 3.2811, + "step": 61350 + }, + { + "epoch": 17.885283150780705, + "grad_norm": 0.3781496286392212, + "learning_rate": 0.0003856053628679685, + "loss": 3.294, + "step": 61400 + }, + { + "epoch": 17.89984852015847, + "grad_norm": 0.44559693336486816, + "learning_rate": 0.00038543048673856013, + "loss": 3.2725, + "step": 61450 + }, + { + "epoch": 17.914413889536238, + "grad_norm": 0.3721054196357727, + "learning_rate": 0.0003852556106091518, + "loss": 3.2932, + "step": 61500 + }, + { + "epoch": 17.928979258914005, + "grad_norm": 0.37438303232192993, + "learning_rate": 0.00038508073447974346, + "loss": 3.2904, + "step": 61550 + }, + { + "epoch": 17.943544628291775, + "grad_norm": 0.37687599658966064, + "learning_rate": 0.0003849058583503351, + "loss": 3.2774, + "step": 61600 + }, + { + "epoch": 17.958109997669542, + "grad_norm": 0.3619229197502136, + "learning_rate": 0.00038473098222092684, + "loss": 3.2736, + "step": 61650 + }, + { + "epoch": 17.97267536704731, + "grad_norm": 0.382866770029068, + "learning_rate": 0.0003845561060915185, + "loss": 3.2737, + "step": 61700 + }, + { + "epoch": 17.987240736425075, + "grad_norm": 0.43251529335975647, + "learning_rate": 0.00038438122996211016, + "loss": 3.2917, + "step": 61750 + }, + { + "epoch": 18.001747844325333, + "grad_norm": 0.41620609164237976, + "learning_rate": 0.0003842063538327018, + "loss": 3.27, + "step": 61800 + }, + { + "epoch": 18.0163132137031, + "grad_norm": 0.3748462200164795, + "learning_rate": 0.0003840314777032935, + "loss": 3.1763, + "step": 61850 + }, + { + "epoch": 18.030878583080867, + "grad_norm": 0.37598931789398193, + "learning_rate": 0.0003838566015738851, + "loss": 3.1863, + "step": 61900 + }, + { + "epoch": 18.045443952458633, + "grad_norm": 0.3749133348464966, + "learning_rate": 0.00038368172544447676, + "loss": 3.1954, + "step": 61950 + }, + { + "epoch": 18.0600093218364, + "grad_norm": 
0.37899747490882874, + "learning_rate": 0.00038350684931506845, + "loss": 3.1786, + "step": 62000 + }, + { + "epoch": 18.0600093218364, + "eval_accuracy": 0.3728517522877588, + "eval_loss": 3.5506234169006348, + "eval_runtime": 183.1708, + "eval_samples_per_second": 90.871, + "eval_steps_per_second": 5.683, + "step": 62000 + }, + { + "epoch": 18.07457469121417, + "grad_norm": 0.37994885444641113, + "learning_rate": 0.0003833319731856601, + "loss": 3.1824, + "step": 62050 + }, + { + "epoch": 18.089140060591937, + "grad_norm": 0.3813993036746979, + "learning_rate": 0.00038315709705625183, + "loss": 3.1725, + "step": 62100 + }, + { + "epoch": 18.103705429969704, + "grad_norm": 0.38244864344596863, + "learning_rate": 0.00038298222092684347, + "loss": 3.1979, + "step": 62150 + }, + { + "epoch": 18.11827079934747, + "grad_norm": 0.38922345638275146, + "learning_rate": 0.0003828073447974351, + "loss": 3.2135, + "step": 62200 + }, + { + "epoch": 18.132836168725238, + "grad_norm": 0.35493120551109314, + "learning_rate": 0.0003826324686680268, + "loss": 3.2028, + "step": 62250 + }, + { + "epoch": 18.147401538103008, + "grad_norm": 0.4383052885532379, + "learning_rate": 0.00038245759253861843, + "loss": 3.2043, + "step": 62300 + }, + { + "epoch": 18.161966907480775, + "grad_norm": 0.38917025923728943, + "learning_rate": 0.0003822827164092101, + "loss": 3.2037, + "step": 62350 + }, + { + "epoch": 18.17653227685854, + "grad_norm": 0.40065309405326843, + "learning_rate": 0.00038210784027980176, + "loss": 3.2053, + "step": 62400 + }, + { + "epoch": 18.191097646236308, + "grad_norm": 0.37339338660240173, + "learning_rate": 0.0003819329641503934, + "loss": 3.2231, + "step": 62450 + }, + { + "epoch": 18.205663015614075, + "grad_norm": 0.4179166853427887, + "learning_rate": 0.0003817580880209851, + "loss": 3.2151, + "step": 62500 + }, + { + "epoch": 18.22022838499184, + "grad_norm": 0.3906039297580719, + "learning_rate": 0.0003815832118915767, + "loss": 3.2223, + "step": 62550 + }, + 
{ + "epoch": 18.234793754369612, + "grad_norm": 0.3898009657859802, + "learning_rate": 0.00038140833576216847, + "loss": 3.2201, + "step": 62600 + }, + { + "epoch": 18.24935912374738, + "grad_norm": 0.3770427405834198, + "learning_rate": 0.0003812334596327601, + "loss": 3.2242, + "step": 62650 + }, + { + "epoch": 18.263924493125145, + "grad_norm": 0.39647871255874634, + "learning_rate": 0.0003810585835033518, + "loss": 3.2251, + "step": 62700 + }, + { + "epoch": 18.278489862502912, + "grad_norm": 0.40589639544487, + "learning_rate": 0.00038088370737394343, + "loss": 3.2276, + "step": 62750 + }, + { + "epoch": 18.29305523188068, + "grad_norm": 0.37991979718208313, + "learning_rate": 0.00038070883124453507, + "loss": 3.2265, + "step": 62800 + }, + { + "epoch": 18.30762060125845, + "grad_norm": 0.38895300030708313, + "learning_rate": 0.00038053395511512676, + "loss": 3.2223, + "step": 62850 + }, + { + "epoch": 18.322185970636216, + "grad_norm": 0.38880106806755066, + "learning_rate": 0.0003803590789857184, + "loss": 3.2355, + "step": 62900 + }, + { + "epoch": 18.336751340013983, + "grad_norm": 0.4023679196834564, + "learning_rate": 0.0003801842028563101, + "loss": 3.2323, + "step": 62950 + }, + { + "epoch": 18.35131670939175, + "grad_norm": 0.38894176483154297, + "learning_rate": 0.0003800093267269017, + "loss": 3.225, + "step": 63000 + }, + { + "epoch": 18.35131670939175, + "eval_accuracy": 0.37240310642582913, + "eval_loss": 3.552517890930176, + "eval_runtime": 183.0843, + "eval_samples_per_second": 90.914, + "eval_steps_per_second": 5.686, + "step": 63000 + }, + { + "epoch": 18.365882078769516, + "grad_norm": 0.4012887179851532, + "learning_rate": 0.00037983445059749335, + "loss": 3.2361, + "step": 63050 + }, + { + "epoch": 18.380447448147287, + "grad_norm": 0.44362279772758484, + "learning_rate": 0.0003796595744680851, + "loss": 3.2371, + "step": 63100 + }, + { + "epoch": 18.395012817525053, + "grad_norm": 0.4012256860733032, + "learning_rate": 
0.00037948469833867674, + "loss": 3.2383, + "step": 63150 + }, + { + "epoch": 18.40957818690282, + "grad_norm": 0.3982083201408386, + "learning_rate": 0.0003793098222092684, + "loss": 3.241, + "step": 63200 + }, + { + "epoch": 18.424143556280587, + "grad_norm": 0.42237627506256104, + "learning_rate": 0.00037913494607986006, + "loss": 3.2331, + "step": 63250 + }, + { + "epoch": 18.438708925658354, + "grad_norm": 0.4271126985549927, + "learning_rate": 0.00037896006995045175, + "loss": 3.2448, + "step": 63300 + }, + { + "epoch": 18.45327429503612, + "grad_norm": 0.3965889811515808, + "learning_rate": 0.0003787851938210434, + "loss": 3.2412, + "step": 63350 + }, + { + "epoch": 18.46783966441389, + "grad_norm": 0.40629059076309204, + "learning_rate": 0.000378610317691635, + "loss": 3.2385, + "step": 63400 + }, + { + "epoch": 18.482405033791657, + "grad_norm": 0.38835418224334717, + "learning_rate": 0.0003784354415622267, + "loss": 3.2388, + "step": 63450 + }, + { + "epoch": 18.496970403169424, + "grad_norm": 0.4206826686859131, + "learning_rate": 0.00037826056543281835, + "loss": 3.2447, + "step": 63500 + }, + { + "epoch": 18.51153577254719, + "grad_norm": 0.37770959734916687, + "learning_rate": 0.0003780856893034101, + "loss": 3.2432, + "step": 63550 + }, + { + "epoch": 18.526101141924958, + "grad_norm": 0.37431496381759644, + "learning_rate": 0.00037791081317400173, + "loss": 3.2454, + "step": 63600 + }, + { + "epoch": 18.540666511302728, + "grad_norm": 0.36141154170036316, + "learning_rate": 0.00037773593704459337, + "loss": 3.2647, + "step": 63650 + }, + { + "epoch": 18.555231880680495, + "grad_norm": 0.37699899077415466, + "learning_rate": 0.00037756106091518506, + "loss": 3.254, + "step": 63700 + }, + { + "epoch": 18.56979725005826, + "grad_norm": 0.3868577778339386, + "learning_rate": 0.0003773861847857767, + "loss": 3.2491, + "step": 63750 + }, + { + "epoch": 18.58436261943603, + "grad_norm": 0.4043353796005249, + "learning_rate": 0.0003772113086563684, + 
"loss": 3.2591, + "step": 63800 + }, + { + "epoch": 18.598927988813795, + "grad_norm": 0.3505975008010864, + "learning_rate": 0.00037703643252696, + "loss": 3.2696, + "step": 63850 + }, + { + "epoch": 18.613493358191565, + "grad_norm": 0.39516180753707886, + "learning_rate": 0.0003768615563975517, + "loss": 3.2635, + "step": 63900 + }, + { + "epoch": 18.628058727569332, + "grad_norm": 0.3725748062133789, + "learning_rate": 0.00037668668026814335, + "loss": 3.2531, + "step": 63950 + }, + { + "epoch": 18.6426240969471, + "grad_norm": 0.40790900588035583, + "learning_rate": 0.000376511804138735, + "loss": 3.2611, + "step": 64000 + }, + { + "epoch": 18.6426240969471, + "eval_accuracy": 0.37318447445475594, + "eval_loss": 3.5376601219177246, + "eval_runtime": 183.0655, + "eval_samples_per_second": 90.924, + "eval_steps_per_second": 5.686, + "step": 64000 + }, + { + "epoch": 18.657189466324866, + "grad_norm": 0.36995676159858704, + "learning_rate": 0.00037633692800932673, + "loss": 3.2652, + "step": 64050 + }, + { + "epoch": 18.671754835702632, + "grad_norm": 0.3847509026527405, + "learning_rate": 0.00037616205187991837, + "loss": 3.2635, + "step": 64100 + }, + { + "epoch": 18.6863202050804, + "grad_norm": 0.3744068443775177, + "learning_rate": 0.00037598717575051006, + "loss": 3.2476, + "step": 64150 + }, + { + "epoch": 18.70088557445817, + "grad_norm": 0.3912813663482666, + "learning_rate": 0.0003758122996211017, + "loss": 3.2559, + "step": 64200 + }, + { + "epoch": 18.715450943835936, + "grad_norm": 0.37561044096946716, + "learning_rate": 0.00037563742349169333, + "loss": 3.2607, + "step": 64250 + }, + { + "epoch": 18.730016313213703, + "grad_norm": 0.37838783860206604, + "learning_rate": 0.000375462547362285, + "loss": 3.2648, + "step": 64300 + }, + { + "epoch": 18.74458168259147, + "grad_norm": 0.3800770044326782, + "learning_rate": 0.00037528767123287665, + "loss": 3.2628, + "step": 64350 + }, + { + "epoch": 18.759147051969236, + "grad_norm": 0.3997965157032013, + 
"learning_rate": 0.00037511279510346834, + "loss": 3.2526, + "step": 64400 + }, + { + "epoch": 18.773712421347007, + "grad_norm": 0.4012726843357086, + "learning_rate": 0.00037493791897406, + "loss": 3.2618, + "step": 64450 + }, + { + "epoch": 18.788277790724774, + "grad_norm": 0.3717740774154663, + "learning_rate": 0.0003747630428446516, + "loss": 3.2546, + "step": 64500 + }, + { + "epoch": 18.80284316010254, + "grad_norm": 0.37287068367004395, + "learning_rate": 0.00037458816671524336, + "loss": 3.2703, + "step": 64550 + }, + { + "epoch": 18.817408529480307, + "grad_norm": 0.35750555992126465, + "learning_rate": 0.000374413290585835, + "loss": 3.2715, + "step": 64600 + }, + { + "epoch": 18.831973898858074, + "grad_norm": 0.418002724647522, + "learning_rate": 0.0003742384144564267, + "loss": 3.2617, + "step": 64650 + }, + { + "epoch": 18.846539268235844, + "grad_norm": 0.3695155680179596, + "learning_rate": 0.0003740635383270183, + "loss": 3.2776, + "step": 64700 + }, + { + "epoch": 18.86110463761361, + "grad_norm": 0.3876601755619049, + "learning_rate": 0.00037388866219761, + "loss": 3.261, + "step": 64750 + }, + { + "epoch": 18.875670006991378, + "grad_norm": 0.38923588395118713, + "learning_rate": 0.00037371378606820165, + "loss": 3.278, + "step": 64800 + }, + { + "epoch": 18.890235376369144, + "grad_norm": 0.3957293629646301, + "learning_rate": 0.0003735389099387933, + "loss": 3.2707, + "step": 64850 + }, + { + "epoch": 18.90480074574691, + "grad_norm": 0.4016440212726593, + "learning_rate": 0.000373364033809385, + "loss": 3.2544, + "step": 64900 + }, + { + "epoch": 18.919366115124678, + "grad_norm": 0.35893428325653076, + "learning_rate": 0.0003731891576799766, + "loss": 3.2673, + "step": 64950 + }, + { + "epoch": 18.93393148450245, + "grad_norm": 0.3813352584838867, + "learning_rate": 0.00037301428155056836, + "loss": 3.2804, + "step": 65000 + }, + { + "epoch": 18.93393148450245, + "eval_accuracy": 0.37387366786215426, + "eval_loss": 3.531456470489502, + 
"eval_runtime": 183.037, + "eval_samples_per_second": 90.938, + "eval_steps_per_second": 5.687, + "step": 65000 + }, + { + "epoch": 18.948496853880215, + "grad_norm": 0.3757934272289276, + "learning_rate": 0.00037283940542116, + "loss": 3.2652, + "step": 65050 + }, + { + "epoch": 18.96306222325798, + "grad_norm": 0.3764069378376007, + "learning_rate": 0.00037266452929175163, + "loss": 3.278, + "step": 65100 + }, + { + "epoch": 18.97762759263575, + "grad_norm": 0.3668430745601654, + "learning_rate": 0.0003724896531623433, + "loss": 3.27, + "step": 65150 + }, + { + "epoch": 18.992192962013515, + "grad_norm": 0.4349370300769806, + "learning_rate": 0.00037231477703293496, + "loss": 3.2825, + "step": 65200 + }, + { + "epoch": 19.006700069913773, + "grad_norm": 0.42744481563568115, + "learning_rate": 0.00037213990090352665, + "loss": 3.2131, + "step": 65250 + }, + { + "epoch": 19.02126543929154, + "grad_norm": 0.3895418643951416, + "learning_rate": 0.0003719650247741183, + "loss": 3.1544, + "step": 65300 + }, + { + "epoch": 19.035830808669306, + "grad_norm": 0.401532381772995, + "learning_rate": 0.00037179014864471, + "loss": 3.1616, + "step": 65350 + }, + { + "epoch": 19.050396178047077, + "grad_norm": 0.38685935735702515, + "learning_rate": 0.0003716152725153016, + "loss": 3.1816, + "step": 65400 + }, + { + "epoch": 19.064961547424844, + "grad_norm": 0.42113545536994934, + "learning_rate": 0.00037144039638589325, + "loss": 3.1864, + "step": 65450 + }, + { + "epoch": 19.07952691680261, + "grad_norm": 0.39501887559890747, + "learning_rate": 0.000371265520256485, + "loss": 3.1961, + "step": 65500 + }, + { + "epoch": 19.094092286180377, + "grad_norm": 0.36792710423469543, + "learning_rate": 0.00037109064412707663, + "loss": 3.1827, + "step": 65550 + }, + { + "epoch": 19.108657655558144, + "grad_norm": 0.39077720046043396, + "learning_rate": 0.0003709157679976683, + "loss": 3.1918, + "step": 65600 + }, + { + "epoch": 19.123223024935914, + "grad_norm": 0.37719228863716125, + 
"learning_rate": 0.00037074089186825995, + "loss": 3.189, + "step": 65650 + }, + { + "epoch": 19.13778839431368, + "grad_norm": 0.38715848326683044, + "learning_rate": 0.0003705660157388516, + "loss": 3.19, + "step": 65700 + }, + { + "epoch": 19.152353763691448, + "grad_norm": 0.4000729024410248, + "learning_rate": 0.0003703911396094433, + "loss": 3.2001, + "step": 65750 + }, + { + "epoch": 19.166919133069214, + "grad_norm": 0.392840176820755, + "learning_rate": 0.0003702162634800349, + "loss": 3.1929, + "step": 65800 + }, + { + "epoch": 19.18148450244698, + "grad_norm": 0.41991594433784485, + "learning_rate": 0.0003700413873506266, + "loss": 3.2054, + "step": 65850 + }, + { + "epoch": 19.196049871824748, + "grad_norm": 0.40436795353889465, + "learning_rate": 0.00036986651122121824, + "loss": 3.1997, + "step": 65900 + }, + { + "epoch": 19.210615241202518, + "grad_norm": 0.37329864501953125, + "learning_rate": 0.00036969163509181, + "loss": 3.1967, + "step": 65950 + }, + { + "epoch": 19.225180610580285, + "grad_norm": 0.41763025522232056, + "learning_rate": 0.0003695167589624016, + "loss": 3.2131, + "step": 66000 + }, + { + "epoch": 19.225180610580285, + "eval_accuracy": 0.3733340230753992, + "eval_loss": 3.547522783279419, + "eval_runtime": 183.2911, + "eval_samples_per_second": 90.812, + "eval_steps_per_second": 5.679, + "step": 66000 + }, + { + "epoch": 19.23974597995805, + "grad_norm": 0.41672950983047485, + "learning_rate": 0.00036934188283299326, + "loss": 3.2026, + "step": 66050 + }, + { + "epoch": 19.25431134933582, + "grad_norm": 0.36991941928863525, + "learning_rate": 0.00036916700670358495, + "loss": 3.2116, + "step": 66100 + }, + { + "epoch": 19.268876718713585, + "grad_norm": 0.3786165118217468, + "learning_rate": 0.0003689921305741766, + "loss": 3.2133, + "step": 66150 + }, + { + "epoch": 19.283442088091356, + "grad_norm": 0.39697492122650146, + "learning_rate": 0.0003688172544447683, + "loss": 3.2199, + "step": 66200 + }, + { + "epoch": 
19.298007457469122, + "grad_norm": 0.38866594433784485, + "learning_rate": 0.0003686423783153599, + "loss": 3.2099, + "step": 66250 + }, + { + "epoch": 19.31257282684689, + "grad_norm": 0.4164191484451294, + "learning_rate": 0.00036846750218595155, + "loss": 3.207, + "step": 66300 + }, + { + "epoch": 19.327138196224656, + "grad_norm": 0.3987915813922882, + "learning_rate": 0.00036829262605654324, + "loss": 3.2254, + "step": 66350 + }, + { + "epoch": 19.341703565602423, + "grad_norm": 0.40145906805992126, + "learning_rate": 0.0003681177499271349, + "loss": 3.2226, + "step": 66400 + }, + { + "epoch": 19.356268934980193, + "grad_norm": 0.3883962035179138, + "learning_rate": 0.0003679428737977266, + "loss": 3.2213, + "step": 66450 + }, + { + "epoch": 19.37083430435796, + "grad_norm": 0.3735280930995941, + "learning_rate": 0.00036776799766831826, + "loss": 3.2262, + "step": 66500 + }, + { + "epoch": 19.385399673735726, + "grad_norm": 0.3815906345844269, + "learning_rate": 0.0003675931215389099, + "loss": 3.2163, + "step": 66550 + }, + { + "epoch": 19.399965043113493, + "grad_norm": 0.4458494484424591, + "learning_rate": 0.0003674182454095016, + "loss": 3.2309, + "step": 66600 + }, + { + "epoch": 19.41453041249126, + "grad_norm": 0.37692028284072876, + "learning_rate": 0.0003672433692800932, + "loss": 3.2361, + "step": 66650 + }, + { + "epoch": 19.429095781869027, + "grad_norm": 0.3867899775505066, + "learning_rate": 0.0003670684931506849, + "loss": 3.2322, + "step": 66700 + }, + { + "epoch": 19.443661151246797, + "grad_norm": 0.3880974054336548, + "learning_rate": 0.00036689361702127655, + "loss": 3.2276, + "step": 66750 + }, + { + "epoch": 19.458226520624564, + "grad_norm": 0.397736519575119, + "learning_rate": 0.00036671874089186824, + "loss": 3.2354, + "step": 66800 + }, + { + "epoch": 19.47279189000233, + "grad_norm": 0.3957333266735077, + "learning_rate": 0.00036654386476245987, + "loss": 3.2387, + "step": 66850 + }, + { + "epoch": 19.487357259380097, + 
"grad_norm": 0.39339420199394226, + "learning_rate": 0.0003663689886330515, + "loss": 3.2303, + "step": 66900 + }, + { + "epoch": 19.501922628757864, + "grad_norm": 0.3895646035671234, + "learning_rate": 0.00036619411250364325, + "loss": 3.2333, + "step": 66950 + }, + { + "epoch": 19.516487998135634, + "grad_norm": 0.39041730761528015, + "learning_rate": 0.0003660192363742349, + "loss": 3.237, + "step": 67000 + }, + { + "epoch": 19.516487998135634, + "eval_accuracy": 0.37304538953477095, + "eval_loss": 3.542279005050659, + "eval_runtime": 183.1165, + "eval_samples_per_second": 90.898, + "eval_steps_per_second": 5.685, + "step": 67000 + }, + { + "epoch": 19.5310533675134, + "grad_norm": 0.4078094959259033, + "learning_rate": 0.0003658443602448266, + "loss": 3.2433, + "step": 67050 + }, + { + "epoch": 19.545618736891168, + "grad_norm": 0.3974609673023224, + "learning_rate": 0.0003656694841154182, + "loss": 3.2438, + "step": 67100 + }, + { + "epoch": 19.560184106268935, + "grad_norm": 0.3790847659111023, + "learning_rate": 0.00036549460798600985, + "loss": 3.2263, + "step": 67150 + }, + { + "epoch": 19.5747494756467, + "grad_norm": 0.3998972177505493, + "learning_rate": 0.00036531973185660154, + "loss": 3.2406, + "step": 67200 + }, + { + "epoch": 19.589314845024468, + "grad_norm": 0.3818765878677368, + "learning_rate": 0.0003651448557271932, + "loss": 3.2381, + "step": 67250 + }, + { + "epoch": 19.60388021440224, + "grad_norm": 0.4024520218372345, + "learning_rate": 0.00036496997959778487, + "loss": 3.2361, + "step": 67300 + }, + { + "epoch": 19.618445583780005, + "grad_norm": 0.42870834469795227, + "learning_rate": 0.0003647951034683765, + "loss": 3.2474, + "step": 67350 + }, + { + "epoch": 19.633010953157772, + "grad_norm": 0.37617725133895874, + "learning_rate": 0.00036462022733896825, + "loss": 3.2467, + "step": 67400 + }, + { + "epoch": 19.64757632253554, + "grad_norm": 0.3796943426132202, + "learning_rate": 0.0003644453512095599, + "loss": 3.2554, + "step": 
67450 + }, + { + "epoch": 19.662141691913305, + "grad_norm": 0.36963871121406555, + "learning_rate": 0.0003642704750801515, + "loss": 3.2466, + "step": 67500 + }, + { + "epoch": 19.676707061291076, + "grad_norm": 0.37516793608665466, + "learning_rate": 0.0003640955989507432, + "loss": 3.2437, + "step": 67550 + }, + { + "epoch": 19.691272430668842, + "grad_norm": 0.3728578984737396, + "learning_rate": 0.00036392072282133485, + "loss": 3.2347, + "step": 67600 + }, + { + "epoch": 19.70583780004661, + "grad_norm": 0.3838897943496704, + "learning_rate": 0.00036374584669192654, + "loss": 3.2424, + "step": 67650 + }, + { + "epoch": 19.720403169424376, + "grad_norm": 0.3947892487049103, + "learning_rate": 0.0003635709705625182, + "loss": 3.252, + "step": 67700 + }, + { + "epoch": 19.734968538802143, + "grad_norm": 0.3661569058895111, + "learning_rate": 0.0003633960944331098, + "loss": 3.2519, + "step": 67750 + }, + { + "epoch": 19.749533908179913, + "grad_norm": 0.35730046033859253, + "learning_rate": 0.0003632212183037015, + "loss": 3.258, + "step": 67800 + }, + { + "epoch": 19.76409927755768, + "grad_norm": 0.41826891899108887, + "learning_rate": 0.00036304634217429314, + "loss": 3.246, + "step": 67850 + }, + { + "epoch": 19.778664646935447, + "grad_norm": 0.40090757608413696, + "learning_rate": 0.0003628714660448849, + "loss": 3.263, + "step": 67900 + }, + { + "epoch": 19.793230016313213, + "grad_norm": 0.4540848731994629, + "learning_rate": 0.0003626965899154765, + "loss": 3.2501, + "step": 67950 + }, + { + "epoch": 19.80779538569098, + "grad_norm": 0.41131654381752014, + "learning_rate": 0.0003625217137860682, + "loss": 3.2455, + "step": 68000 + }, + { + "epoch": 19.80779538569098, + "eval_accuracy": 0.3737583320155733, + "eval_loss": 3.5353798866271973, + "eval_runtime": 183.1485, + "eval_samples_per_second": 90.883, + "eval_steps_per_second": 5.684, + "step": 68000 + }, + { + "epoch": 19.822360755068747, + "grad_norm": 0.40951302647590637, + "learning_rate": 
0.00036234683765665985, + "loss": 3.2595, + "step": 68050 + }, + { + "epoch": 19.836926124446517, + "grad_norm": 0.42168769240379333, + "learning_rate": 0.0003621719615272515, + "loss": 3.2715, + "step": 68100 + }, + { + "epoch": 19.851491493824284, + "grad_norm": 0.3935104310512543, + "learning_rate": 0.00036199708539784317, + "loss": 3.2587, + "step": 68150 + }, + { + "epoch": 19.86605686320205, + "grad_norm": 0.41649124026298523, + "learning_rate": 0.0003618222092684348, + "loss": 3.2611, + "step": 68200 + }, + { + "epoch": 19.880622232579817, + "grad_norm": 0.3767942786216736, + "learning_rate": 0.0003616473331390265, + "loss": 3.2503, + "step": 68250 + }, + { + "epoch": 19.895187601957584, + "grad_norm": 0.3804614543914795, + "learning_rate": 0.00036147245700961813, + "loss": 3.2593, + "step": 68300 + }, + { + "epoch": 19.909752971335354, + "grad_norm": 0.40164914727211, + "learning_rate": 0.00036129758088020977, + "loss": 3.2635, + "step": 68350 + }, + { + "epoch": 19.92431834071312, + "grad_norm": 0.37406373023986816, + "learning_rate": 0.0003611227047508015, + "loss": 3.2568, + "step": 68400 + }, + { + "epoch": 19.938883710090888, + "grad_norm": 0.38547801971435547, + "learning_rate": 0.00036094782862139315, + "loss": 3.2668, + "step": 68450 + }, + { + "epoch": 19.953449079468655, + "grad_norm": 0.40557870268821716, + "learning_rate": 0.00036077295249198484, + "loss": 3.2518, + "step": 68500 + }, + { + "epoch": 19.96801444884642, + "grad_norm": 0.39365437626838684, + "learning_rate": 0.0003605980763625765, + "loss": 3.2751, + "step": 68550 + }, + { + "epoch": 19.982579818224192, + "grad_norm": 0.3988928198814392, + "learning_rate": 0.0003604232002331681, + "loss": 3.2664, + "step": 68600 + }, + { + "epoch": 19.99714518760196, + "grad_norm": 0.3843889534473419, + "learning_rate": 0.0003602483241037598, + "loss": 3.2573, + "step": 68650 + }, + { + "epoch": 20.011652295502213, + "grad_norm": 0.4098789095878601, + "learning_rate": 0.00036007344797435144, + 
"loss": 3.1714, + "step": 68700 + }, + { + "epoch": 20.026217664879983, + "grad_norm": 0.37843582034111023, + "learning_rate": 0.00035989857184494313, + "loss": 3.1703, + "step": 68750 + }, + { + "epoch": 20.04078303425775, + "grad_norm": 0.4209931492805481, + "learning_rate": 0.00035972369571553477, + "loss": 3.163, + "step": 68800 + }, + { + "epoch": 20.055348403635517, + "grad_norm": 0.39720186591148376, + "learning_rate": 0.0003595488195861265, + "loss": 3.17, + "step": 68850 + }, + { + "epoch": 20.069913773013283, + "grad_norm": 0.41874927282333374, + "learning_rate": 0.00035937394345671815, + "loss": 3.1616, + "step": 68900 + }, + { + "epoch": 20.08447914239105, + "grad_norm": 0.3896577060222626, + "learning_rate": 0.0003591990673273098, + "loss": 3.1776, + "step": 68950 + }, + { + "epoch": 20.099044511768817, + "grad_norm": 0.4793306887149811, + "learning_rate": 0.0003590241911979015, + "loss": 3.1751, + "step": 69000 + }, + { + "epoch": 20.099044511768817, + "eval_accuracy": 0.3726623475488781, + "eval_loss": 3.5513575077056885, + "eval_runtime": 182.9863, + "eval_samples_per_second": 90.963, + "eval_steps_per_second": 5.689, + "step": 69000 + }, + { + "epoch": 20.113609881146587, + "grad_norm": 0.38327980041503906, + "learning_rate": 0.0003588493150684931, + "loss": 3.1769, + "step": 69050 + }, + { + "epoch": 20.128175250524354, + "grad_norm": 0.4346780776977539, + "learning_rate": 0.0003586744389390848, + "loss": 3.1907, + "step": 69100 + }, + { + "epoch": 20.14274061990212, + "grad_norm": 0.42224401235580444, + "learning_rate": 0.00035849956280967644, + "loss": 3.187, + "step": 69150 + }, + { + "epoch": 20.157305989279887, + "grad_norm": 0.37276560068130493, + "learning_rate": 0.0003583246866802681, + "loss": 3.1784, + "step": 69200 + }, + { + "epoch": 20.171871358657654, + "grad_norm": 0.40166395902633667, + "learning_rate": 0.00035814981055085976, + "loss": 3.1858, + "step": 69250 + }, + { + "epoch": 20.186436728035424, + "grad_norm": 
0.3963995575904846, + "learning_rate": 0.0003579749344214514, + "loss": 3.1894, + "step": 69300 + }, + { + "epoch": 20.20100209741319, + "grad_norm": 0.39567843079566956, + "learning_rate": 0.00035780005829204315, + "loss": 3.2148, + "step": 69350 + }, + { + "epoch": 20.215567466790958, + "grad_norm": 0.40741077065467834, + "learning_rate": 0.0003576251821626348, + "loss": 3.1976, + "step": 69400 + }, + { + "epoch": 20.230132836168725, + "grad_norm": 0.3944525718688965, + "learning_rate": 0.00035745030603322647, + "loss": 3.1988, + "step": 69450 + }, + { + "epoch": 20.24469820554649, + "grad_norm": 0.3943832516670227, + "learning_rate": 0.0003572754299038181, + "loss": 3.2033, + "step": 69500 + }, + { + "epoch": 20.25926357492426, + "grad_norm": 0.39849257469177246, + "learning_rate": 0.00035710055377440974, + "loss": 3.1942, + "step": 69550 + }, + { + "epoch": 20.27382894430203, + "grad_norm": 0.3904484510421753, + "learning_rate": 0.00035692567764500143, + "loss": 3.1997, + "step": 69600 + }, + { + "epoch": 20.288394313679795, + "grad_norm": 0.4032699763774872, + "learning_rate": 0.00035675080151559307, + "loss": 3.2064, + "step": 69650 + }, + { + "epoch": 20.302959683057562, + "grad_norm": 0.4133860170841217, + "learning_rate": 0.00035657592538618476, + "loss": 3.2082, + "step": 69700 + }, + { + "epoch": 20.31752505243533, + "grad_norm": 0.3864280879497528, + "learning_rate": 0.0003564010492567764, + "loss": 3.207, + "step": 69750 + }, + { + "epoch": 20.332090421813096, + "grad_norm": 0.39940962195396423, + "learning_rate": 0.00035622617312736803, + "loss": 3.2138, + "step": 69800 + }, + { + "epoch": 20.346655791190866, + "grad_norm": 0.43951013684272766, + "learning_rate": 0.0003560512969979598, + "loss": 3.2098, + "step": 69850 + }, + { + "epoch": 20.361221160568633, + "grad_norm": 0.3883720934391022, + "learning_rate": 0.0003558764208685514, + "loss": 3.2001, + "step": 69900 + }, + { + "epoch": 20.3757865299464, + "grad_norm": 0.38097983598709106, + 
"learning_rate": 0.0003557015447391431, + "loss": 3.2136, + "step": 69950 + }, + { + "epoch": 20.390351899324166, + "grad_norm": 0.379999577999115, + "learning_rate": 0.00035552666860973474, + "loss": 3.2249, + "step": 70000 + }, + { + "epoch": 20.390351899324166, + "eval_accuracy": 0.37320645998310525, + "eval_loss": 3.547257423400879, + "eval_runtime": 183.2991, + "eval_samples_per_second": 90.808, + "eval_steps_per_second": 5.679, + "step": 70000 + }, + { + "epoch": 20.404917268701933, + "grad_norm": 0.4156287908554077, + "learning_rate": 0.00035535179248032643, + "loss": 3.2132, + "step": 70050 + }, + { + "epoch": 20.419482638079703, + "grad_norm": 0.43959635496139526, + "learning_rate": 0.00035517691635091807, + "loss": 3.2181, + "step": 70100 + }, + { + "epoch": 20.43404800745747, + "grad_norm": 0.4064927399158478, + "learning_rate": 0.0003550020402215097, + "loss": 3.2219, + "step": 70150 + }, + { + "epoch": 20.448613376835237, + "grad_norm": 0.40533578395843506, + "learning_rate": 0.0003548271640921014, + "loss": 3.2271, + "step": 70200 + }, + { + "epoch": 20.463178746213003, + "grad_norm": 0.4207044541835785, + "learning_rate": 0.00035465228796269303, + "loss": 3.2202, + "step": 70250 + }, + { + "epoch": 20.47774411559077, + "grad_norm": 0.4209517538547516, + "learning_rate": 0.0003544774118332848, + "loss": 3.2265, + "step": 70300 + }, + { + "epoch": 20.49230948496854, + "grad_norm": 0.40417852997779846, + "learning_rate": 0.0003543025357038764, + "loss": 3.2258, + "step": 70350 + }, + { + "epoch": 20.506874854346307, + "grad_norm": 0.4038628041744232, + "learning_rate": 0.00035412765957446805, + "loss": 3.2193, + "step": 70400 + }, + { + "epoch": 20.521440223724074, + "grad_norm": 0.3865199089050293, + "learning_rate": 0.00035395278344505974, + "loss": 3.2313, + "step": 70450 + }, + { + "epoch": 20.53600559310184, + "grad_norm": 0.39419683814048767, + "learning_rate": 0.0003537779073156514, + "loss": 3.2345, + "step": 70500 + }, + { + "epoch": 
20.550570962479608, + "grad_norm": 0.3920558989048004, + "learning_rate": 0.00035360303118624306, + "loss": 3.2299, + "step": 70550 + }, + { + "epoch": 20.565136331857374, + "grad_norm": 0.41333022713661194, + "learning_rate": 0.0003534281550568347, + "loss": 3.2204, + "step": 70600 + }, + { + "epoch": 20.579701701235145, + "grad_norm": 0.3800913393497467, + "learning_rate": 0.0003532532789274264, + "loss": 3.2371, + "step": 70650 + }, + { + "epoch": 20.59426707061291, + "grad_norm": 0.3914024531841278, + "learning_rate": 0.000353078402798018, + "loss": 3.2452, + "step": 70700 + }, + { + "epoch": 20.608832439990678, + "grad_norm": 0.42056038975715637, + "learning_rate": 0.00035290352666860966, + "loss": 3.2355, + "step": 70750 + }, + { + "epoch": 20.623397809368445, + "grad_norm": 0.4147467613220215, + "learning_rate": 0.0003527286505392014, + "loss": 3.2157, + "step": 70800 + }, + { + "epoch": 20.63796317874621, + "grad_norm": 0.41056451201438904, + "learning_rate": 0.00035255377440979304, + "loss": 3.2341, + "step": 70850 + }, + { + "epoch": 20.652528548123982, + "grad_norm": 0.3810015320777893, + "learning_rate": 0.00035237889828038473, + "loss": 3.2236, + "step": 70900 + }, + { + "epoch": 20.66709391750175, + "grad_norm": 0.4104416072368622, + "learning_rate": 0.00035220402215097637, + "loss": 3.2287, + "step": 70950 + }, + { + "epoch": 20.681659286879515, + "grad_norm": 0.4050891101360321, + "learning_rate": 0.000352029146021568, + "loss": 3.233, + "step": 71000 + }, + { + "epoch": 20.681659286879515, + "eval_accuracy": 0.3739369203447848, + "eval_loss": 3.538299083709717, + "eval_runtime": 182.9222, + "eval_samples_per_second": 90.995, + "eval_steps_per_second": 5.691, + "step": 71000 + }, + { + "epoch": 20.696224656257282, + "grad_norm": 0.40285784006118774, + "learning_rate": 0.0003518542698921597, + "loss": 3.2479, + "step": 71050 + }, + { + "epoch": 20.71079002563505, + "grad_norm": 0.40842974185943604, + "learning_rate": 0.00035167939376275133, + "loss": 
3.2222, + "step": 71100 + }, + { + "epoch": 20.72535539501282, + "grad_norm": 0.42046529054641724, + "learning_rate": 0.000351504517633343, + "loss": 3.23, + "step": 71150 + }, + { + "epoch": 20.739920764390586, + "grad_norm": 0.389822781085968, + "learning_rate": 0.00035132964150393466, + "loss": 3.2453, + "step": 71200 + }, + { + "epoch": 20.754486133768353, + "grad_norm": 0.3888729214668274, + "learning_rate": 0.0003511547653745263, + "loss": 3.2398, + "step": 71250 + }, + { + "epoch": 20.76905150314612, + "grad_norm": 0.42378807067871094, + "learning_rate": 0.00035097988924511804, + "loss": 3.2396, + "step": 71300 + }, + { + "epoch": 20.783616872523886, + "grad_norm": 0.3676110506057739, + "learning_rate": 0.0003508050131157097, + "loss": 3.2502, + "step": 71350 + }, + { + "epoch": 20.798182241901653, + "grad_norm": 0.4406678080558777, + "learning_rate": 0.00035063013698630137, + "loss": 3.2417, + "step": 71400 + }, + { + "epoch": 20.812747611279423, + "grad_norm": 0.3739905059337616, + "learning_rate": 0.000350455260856893, + "loss": 3.2365, + "step": 71450 + }, + { + "epoch": 20.82731298065719, + "grad_norm": 0.43861594796180725, + "learning_rate": 0.0003502803847274847, + "loss": 3.2463, + "step": 71500 + }, + { + "epoch": 20.841878350034957, + "grad_norm": 0.4056912064552307, + "learning_rate": 0.00035010550859807633, + "loss": 3.2361, + "step": 71550 + }, + { + "epoch": 20.856443719412724, + "grad_norm": 0.4023033082485199, + "learning_rate": 0.00034993063246866797, + "loss": 3.2484, + "step": 71600 + }, + { + "epoch": 20.87100908879049, + "grad_norm": 0.37357252836227417, + "learning_rate": 0.00034975575633925966, + "loss": 3.2357, + "step": 71650 + }, + { + "epoch": 20.88557445816826, + "grad_norm": 0.40037524700164795, + "learning_rate": 0.0003495808802098513, + "loss": 3.2396, + "step": 71700 + }, + { + "epoch": 20.900139827546028, + "grad_norm": 0.41211217641830444, + "learning_rate": 0.00034940600408044304, + "loss": 3.2631, + "step": 71750 + }, + { 
+ "epoch": 20.914705196923794, + "grad_norm": 0.4259653091430664, + "learning_rate": 0.0003492311279510347, + "loss": 3.2562, + "step": 71800 + }, + { + "epoch": 20.92927056630156, + "grad_norm": 0.3834655284881592, + "learning_rate": 0.0003490562518216263, + "loss": 3.2472, + "step": 71850 + }, + { + "epoch": 20.943835935679328, + "grad_norm": 0.40790706872940063, + "learning_rate": 0.000348881375692218, + "loss": 3.254, + "step": 71900 + }, + { + "epoch": 20.958401305057095, + "grad_norm": 0.3871559202671051, + "learning_rate": 0.00034870649956280964, + "loss": 3.2523, + "step": 71950 + }, + { + "epoch": 20.972966674434865, + "grad_norm": 0.4063284397125244, + "learning_rate": 0.0003485316234334013, + "loss": 3.2472, + "step": 72000 + }, + { + "epoch": 20.972966674434865, + "eval_accuracy": 0.37428234003617616, + "eval_loss": 3.527845621109009, + "eval_runtime": 183.655, + "eval_samples_per_second": 90.632, + "eval_steps_per_second": 5.668, + "step": 72000 + }, + { + "epoch": 20.98753204381263, + "grad_norm": 0.3977331817150116, + "learning_rate": 0.00034835674730399296, + "loss": 3.2603, + "step": 72050 + }, + { + "epoch": 21.002039151712886, + "grad_norm": 0.42167672514915466, + "learning_rate": 0.00034818187117458465, + "loss": 3.2473, + "step": 72100 + }, + { + "epoch": 21.016604521090656, + "grad_norm": 0.38619017601013184, + "learning_rate": 0.0003480069950451763, + "loss": 3.1387, + "step": 72150 + }, + { + "epoch": 21.031169890468423, + "grad_norm": 0.40300482511520386, + "learning_rate": 0.0003478321189157679, + "loss": 3.1455, + "step": 72200 + }, + { + "epoch": 21.04573525984619, + "grad_norm": 0.40839800238609314, + "learning_rate": 0.00034765724278635967, + "loss": 3.1608, + "step": 72250 + }, + { + "epoch": 21.060300629223956, + "grad_norm": 0.38552382588386536, + "learning_rate": 0.0003474823666569513, + "loss": 3.1538, + "step": 72300 + }, + { + "epoch": 21.074865998601723, + "grad_norm": 0.4049597680568695, + "learning_rate": 
0.000347307490527543, + "loss": 3.1621, + "step": 72350 + }, + { + "epoch": 21.089431367979493, + "grad_norm": 0.4319642186164856, + "learning_rate": 0.00034713261439813463, + "loss": 3.16, + "step": 72400 + }, + { + "epoch": 21.10399673735726, + "grad_norm": 0.3880416750907898, + "learning_rate": 0.00034695773826872627, + "loss": 3.155, + "step": 72450 + }, + { + "epoch": 21.118562106735027, + "grad_norm": 0.3752796947956085, + "learning_rate": 0.00034678286213931796, + "loss": 3.1564, + "step": 72500 + }, + { + "epoch": 21.133127476112794, + "grad_norm": 0.396068811416626, + "learning_rate": 0.0003466079860099096, + "loss": 3.1695, + "step": 72550 + }, + { + "epoch": 21.14769284549056, + "grad_norm": 0.3929023742675781, + "learning_rate": 0.0003464331098805013, + "loss": 3.1709, + "step": 72600 + }, + { + "epoch": 21.16225821486833, + "grad_norm": 0.44244226813316345, + "learning_rate": 0.0003462582337510929, + "loss": 3.1791, + "step": 72650 + }, + { + "epoch": 21.176823584246097, + "grad_norm": 0.40291813015937805, + "learning_rate": 0.00034608335762168467, + "loss": 3.1853, + "step": 72700 + }, + { + "epoch": 21.191388953623864, + "grad_norm": 0.39293503761291504, + "learning_rate": 0.0003459084814922763, + "loss": 3.1719, + "step": 72750 + }, + { + "epoch": 21.20595432300163, + "grad_norm": 0.3924226462841034, + "learning_rate": 0.00034573360536286794, + "loss": 3.1875, + "step": 72800 + }, + { + "epoch": 21.220519692379398, + "grad_norm": 0.3741477131843567, + "learning_rate": 0.00034555872923345963, + "loss": 3.2003, + "step": 72850 + }, + { + "epoch": 21.235085061757164, + "grad_norm": 0.40738731622695923, + "learning_rate": 0.00034538385310405127, + "loss": 3.1968, + "step": 72900 + }, + { + "epoch": 21.249650431134935, + "grad_norm": 0.3989109396934509, + "learning_rate": 0.00034520897697464296, + "loss": 3.1861, + "step": 72950 + }, + { + "epoch": 21.2642158005127, + "grad_norm": 0.3871462345123291, + "learning_rate": 0.0003450341008452346, + "loss": 
3.1848, + "step": 73000 + }, + { + "epoch": 21.2642158005127, + "eval_accuracy": 0.3733212079813346, + "eval_loss": 3.550053596496582, + "eval_runtime": 183.2865, + "eval_samples_per_second": 90.814, + "eval_steps_per_second": 5.68, + "step": 73000 + }, + { + "epoch": 21.27878116989047, + "grad_norm": 0.3962025046348572, + "learning_rate": 0.00034485922471582623, + "loss": 3.1968, + "step": 73050 + }, + { + "epoch": 21.293346539268235, + "grad_norm": 0.3977092504501343, + "learning_rate": 0.0003446843485864179, + "loss": 3.1869, + "step": 73100 + }, + { + "epoch": 21.307911908646002, + "grad_norm": 0.37942761182785034, + "learning_rate": 0.00034450947245700955, + "loss": 3.192, + "step": 73150 + }, + { + "epoch": 21.322477278023772, + "grad_norm": 0.4310181438922882, + "learning_rate": 0.0003443345963276013, + "loss": 3.1912, + "step": 73200 + }, + { + "epoch": 21.33704264740154, + "grad_norm": 0.43790924549102783, + "learning_rate": 0.00034415972019819294, + "loss": 3.2091, + "step": 73250 + }, + { + "epoch": 21.351608016779306, + "grad_norm": 0.38981059193611145, + "learning_rate": 0.00034398484406878457, + "loss": 3.2036, + "step": 73300 + }, + { + "epoch": 21.366173386157072, + "grad_norm": 0.37896528840065, + "learning_rate": 0.00034380996793937626, + "loss": 3.1914, + "step": 73350 + }, + { + "epoch": 21.38073875553484, + "grad_norm": 0.4180900752544403, + "learning_rate": 0.0003436350918099679, + "loss": 3.2035, + "step": 73400 + }, + { + "epoch": 21.39530412491261, + "grad_norm": 0.40925297141075134, + "learning_rate": 0.0003434602156805596, + "loss": 3.2148, + "step": 73450 + }, + { + "epoch": 21.409869494290376, + "grad_norm": 0.42755791544914246, + "learning_rate": 0.0003432853395511512, + "loss": 3.2015, + "step": 73500 + }, + { + "epoch": 21.424434863668143, + "grad_norm": 0.4559558629989624, + "learning_rate": 0.0003431104634217429, + "loss": 3.224, + "step": 73550 + }, + { + "epoch": 21.43900023304591, + "grad_norm": 0.394946813583374, + 
"learning_rate": 0.00034293558729233455, + "loss": 3.21, + "step": 73600 + }, + { + "epoch": 21.453565602423676, + "grad_norm": 0.3784257769584656, + "learning_rate": 0.0003427607111629262, + "loss": 3.218, + "step": 73650 + }, + { + "epoch": 21.468130971801443, + "grad_norm": 0.4060593545436859, + "learning_rate": 0.00034258583503351793, + "loss": 3.2158, + "step": 73700 + }, + { + "epoch": 21.482696341179214, + "grad_norm": 0.41588184237480164, + "learning_rate": 0.00034241095890410957, + "loss": 3.2228, + "step": 73750 + }, + { + "epoch": 21.49726171055698, + "grad_norm": 0.3939662277698517, + "learning_rate": 0.00034223608277470126, + "loss": 3.2143, + "step": 73800 + }, + { + "epoch": 21.511827079934747, + "grad_norm": 0.4258659780025482, + "learning_rate": 0.0003420612066452929, + "loss": 3.2068, + "step": 73850 + }, + { + "epoch": 21.526392449312514, + "grad_norm": 0.387615829706192, + "learning_rate": 0.00034188633051588453, + "loss": 3.2207, + "step": 73900 + }, + { + "epoch": 21.54095781869028, + "grad_norm": 0.4352627694606781, + "learning_rate": 0.0003417114543864762, + "loss": 3.2228, + "step": 73950 + }, + { + "epoch": 21.55552318806805, + "grad_norm": 0.4175478219985962, + "learning_rate": 0.00034153657825706786, + "loss": 3.2036, + "step": 74000 + }, + { + "epoch": 21.55552318806805, + "eval_accuracy": 0.3738021855026015, + "eval_loss": 3.54341983795166, + "eval_runtime": 183.3969, + "eval_samples_per_second": 90.759, + "eval_steps_per_second": 5.676, + "step": 74000 + }, + { + "epoch": 21.570088557445818, + "grad_norm": 0.386900395154953, + "learning_rate": 0.00034136170212765955, + "loss": 3.2198, + "step": 74050 + }, + { + "epoch": 21.584653926823584, + "grad_norm": 0.3872682452201843, + "learning_rate": 0.0003411868259982512, + "loss": 3.2253, + "step": 74100 + }, + { + "epoch": 21.59921929620135, + "grad_norm": 0.4258601665496826, + "learning_rate": 0.00034101194986884293, + "loss": 3.2129, + "step": 74150 + }, + { + "epoch": 
21.613784665579118, + "grad_norm": 0.3833194077014923, + "learning_rate": 0.00034083707373943456, + "loss": 3.2152, + "step": 74200 + }, + { + "epoch": 21.62835003495689, + "grad_norm": 0.4214939475059509, + "learning_rate": 0.0003406621976100262, + "loss": 3.2255, + "step": 74250 + }, + { + "epoch": 21.642915404334655, + "grad_norm": 0.4184681475162506, + "learning_rate": 0.0003404873214806179, + "loss": 3.2175, + "step": 74300 + }, + { + "epoch": 21.65748077371242, + "grad_norm": 0.42748090624809265, + "learning_rate": 0.00034031244535120953, + "loss": 3.2199, + "step": 74350 + }, + { + "epoch": 21.67204614309019, + "grad_norm": 0.4194568991661072, + "learning_rate": 0.0003401375692218012, + "loss": 3.2254, + "step": 74400 + }, + { + "epoch": 21.686611512467955, + "grad_norm": 0.4005660116672516, + "learning_rate": 0.00033996269309239285, + "loss": 3.2265, + "step": 74450 + }, + { + "epoch": 21.701176881845722, + "grad_norm": 0.3818527162075043, + "learning_rate": 0.0003397878169629845, + "loss": 3.2187, + "step": 74500 + }, + { + "epoch": 21.715742251223492, + "grad_norm": 0.4204731285572052, + "learning_rate": 0.0003396129408335762, + "loss": 3.2483, + "step": 74550 + }, + { + "epoch": 21.73030762060126, + "grad_norm": 0.4234389364719391, + "learning_rate": 0.0003394380647041678, + "loss": 3.236, + "step": 74600 + }, + { + "epoch": 21.744872989979026, + "grad_norm": 0.4275653064250946, + "learning_rate": 0.00033926318857475956, + "loss": 3.2234, + "step": 74650 + }, + { + "epoch": 21.759438359356793, + "grad_norm": 0.39097073674201965, + "learning_rate": 0.0003390883124453512, + "loss": 3.2285, + "step": 74700 + }, + { + "epoch": 21.77400372873456, + "grad_norm": 0.40437382459640503, + "learning_rate": 0.0003389134363159429, + "loss": 3.2288, + "step": 74750 + }, + { + "epoch": 21.78856909811233, + "grad_norm": 0.40619733929634094, + "learning_rate": 0.0003387385601865345, + "loss": 3.2254, + "step": 74800 + }, + { + "epoch": 21.803134467490096, + "grad_norm": 
0.42823296785354614, + "learning_rate": 0.00033856368405712616, + "loss": 3.2394, + "step": 74850 + }, + { + "epoch": 21.817699836867863, + "grad_norm": 0.4197380542755127, + "learning_rate": 0.00033838880792771785, + "loss": 3.2329, + "step": 74900 + }, + { + "epoch": 21.83226520624563, + "grad_norm": 0.3925802707672119, + "learning_rate": 0.0003382139317983095, + "loss": 3.2434, + "step": 74950 + }, + { + "epoch": 21.846830575623397, + "grad_norm": 0.4069508910179138, + "learning_rate": 0.0003380390556689012, + "loss": 3.2344, + "step": 75000 + }, + { + "epoch": 21.846830575623397, + "eval_accuracy": 0.37450748595483324, + "eval_loss": 3.532540798187256, + "eval_runtime": 183.2285, + "eval_samples_per_second": 90.843, + "eval_steps_per_second": 5.681, + "step": 75000 + }, + { + "epoch": 21.861395945001163, + "grad_norm": 0.4070306718349457, + "learning_rate": 0.0003378641795394928, + "loss": 3.2438, + "step": 75050 + }, + { + "epoch": 21.875961314378934, + "grad_norm": 0.424502432346344, + "learning_rate": 0.00033768930341008445, + "loss": 3.2461, + "step": 75100 + }, + { + "epoch": 21.8905266837567, + "grad_norm": 0.4162021279335022, + "learning_rate": 0.0003375144272806762, + "loss": 3.2439, + "step": 75150 + }, + { + "epoch": 21.905092053134467, + "grad_norm": 0.4465700387954712, + "learning_rate": 0.00033733955115126783, + "loss": 3.2426, + "step": 75200 + }, + { + "epoch": 21.919657422512234, + "grad_norm": 0.37786081433296204, + "learning_rate": 0.0003371646750218595, + "loss": 3.2455, + "step": 75250 + }, + { + "epoch": 21.93422279189, + "grad_norm": 0.40799352526664734, + "learning_rate": 0.00033698979889245116, + "loss": 3.2295, + "step": 75300 + }, + { + "epoch": 21.94878816126777, + "grad_norm": 0.3989873230457306, + "learning_rate": 0.0003368149227630428, + "loss": 3.2471, + "step": 75350 + }, + { + "epoch": 21.963353530645538, + "grad_norm": 0.39666229486465454, + "learning_rate": 0.0003366400466336345, + "loss": 3.2369, + "step": 75400 + }, + { + 
"epoch": 21.977918900023305, + "grad_norm": 0.3877425193786621, + "learning_rate": 0.0003364651705042261, + "loss": 3.243, + "step": 75450 + }, + { + "epoch": 21.99248426940107, + "grad_norm": 0.40185490250587463, + "learning_rate": 0.0003362902943748178, + "loss": 3.2502, + "step": 75500 + }, + { + "epoch": 22.00699137730133, + "grad_norm": 0.41637974977493286, + "learning_rate": 0.00033611541824540945, + "loss": 3.1885, + "step": 75550 + }, + { + "epoch": 22.021556746679096, + "grad_norm": 0.3906157612800598, + "learning_rate": 0.0003359405421160012, + "loss": 3.1367, + "step": 75600 + }, + { + "epoch": 22.036122116056863, + "grad_norm": 0.4338746666908264, + "learning_rate": 0.0003357656659865928, + "loss": 3.1472, + "step": 75650 + }, + { + "epoch": 22.05068748543463, + "grad_norm": 0.39344149827957153, + "learning_rate": 0.00033559078985718446, + "loss": 3.1439, + "step": 75700 + }, + { + "epoch": 22.0652528548124, + "grad_norm": 0.3915230631828308, + "learning_rate": 0.00033541591372777615, + "loss": 3.1467, + "step": 75750 + }, + { + "epoch": 22.079818224190166, + "grad_norm": 0.4206335246562958, + "learning_rate": 0.0003352410375983678, + "loss": 3.1542, + "step": 75800 + }, + { + "epoch": 22.094383593567933, + "grad_norm": 0.40375348925590515, + "learning_rate": 0.0003350661614689595, + "loss": 3.1459, + "step": 75850 + }, + { + "epoch": 22.1089489629457, + "grad_norm": 0.405519962310791, + "learning_rate": 0.0003348912853395511, + "loss": 3.1644, + "step": 75900 + }, + { + "epoch": 22.123514332323467, + "grad_norm": 0.40984925627708435, + "learning_rate": 0.00033471640921014275, + "loss": 3.1715, + "step": 75950 + }, + { + "epoch": 22.138079701701233, + "grad_norm": 0.4426792860031128, + "learning_rate": 0.00033454153308073444, + "loss": 3.162, + "step": 76000 + }, + { + "epoch": 22.138079701701233, + "eval_accuracy": 0.37330615906353404, + "eval_loss": 3.5484302043914795, + "eval_runtime": 182.8859, + "eval_samples_per_second": 91.013, + 
"eval_steps_per_second": 5.692, + "step": 76000 + }, + { + "epoch": 22.152645071079004, + "grad_norm": 0.43078505992889404, + "learning_rate": 0.0003343666569513261, + "loss": 3.1689, + "step": 76050 + }, + { + "epoch": 22.16721044045677, + "grad_norm": 0.3985705077648163, + "learning_rate": 0.0003341917808219178, + "loss": 3.1696, + "step": 76100 + }, + { + "epoch": 22.181775809834537, + "grad_norm": 0.39892059564590454, + "learning_rate": 0.00033401690469250946, + "loss": 3.1683, + "step": 76150 + }, + { + "epoch": 22.196341179212304, + "grad_norm": 0.4003106355667114, + "learning_rate": 0.00033384202856310115, + "loss": 3.1748, + "step": 76200 + }, + { + "epoch": 22.21090654859007, + "grad_norm": 0.41936761140823364, + "learning_rate": 0.0003336671524336928, + "loss": 3.1732, + "step": 76250 + }, + { + "epoch": 22.22547191796784, + "grad_norm": 0.394231915473938, + "learning_rate": 0.0003334922763042844, + "loss": 3.1697, + "step": 76300 + }, + { + "epoch": 22.240037287345608, + "grad_norm": 0.43360409140586853, + "learning_rate": 0.0003333174001748761, + "loss": 3.169, + "step": 76350 + }, + { + "epoch": 22.254602656723375, + "grad_norm": 0.4110454320907593, + "learning_rate": 0.00033314252404546775, + "loss": 3.1752, + "step": 76400 + }, + { + "epoch": 22.26916802610114, + "grad_norm": 0.41084834933280945, + "learning_rate": 0.00033296764791605944, + "loss": 3.1829, + "step": 76450 + }, + { + "epoch": 22.283733395478908, + "grad_norm": 0.4034361243247986, + "learning_rate": 0.0003327927717866511, + "loss": 3.1763, + "step": 76500 + }, + { + "epoch": 22.29829876485668, + "grad_norm": 0.4124561548233032, + "learning_rate": 0.0003326178956572427, + "loss": 3.1874, + "step": 76550 + }, + { + "epoch": 22.312864134234445, + "grad_norm": 0.41186967492103577, + "learning_rate": 0.00033244301952783446, + "loss": 3.1828, + "step": 76600 + }, + { + "epoch": 22.327429503612212, + "grad_norm": 0.4203675389289856, + "learning_rate": 0.0003322681433984261, + "loss": 3.1781, 
+ "step": 76650 + }, + { + "epoch": 22.34199487298998, + "grad_norm": 0.4065147638320923, + "learning_rate": 0.0003320932672690178, + "loss": 3.1785, + "step": 76700 + }, + { + "epoch": 22.356560242367745, + "grad_norm": 0.4006505608558655, + "learning_rate": 0.0003319183911396094, + "loss": 3.198, + "step": 76750 + }, + { + "epoch": 22.371125611745512, + "grad_norm": 0.42733311653137207, + "learning_rate": 0.0003317435150102011, + "loss": 3.1998, + "step": 76800 + }, + { + "epoch": 22.385690981123282, + "grad_norm": 0.4309224784374237, + "learning_rate": 0.00033156863888079275, + "loss": 3.1957, + "step": 76850 + }, + { + "epoch": 22.40025635050105, + "grad_norm": 0.3946070075035095, + "learning_rate": 0.0003313937627513844, + "loss": 3.2073, + "step": 76900 + }, + { + "epoch": 22.414821719878816, + "grad_norm": 0.4475387930870056, + "learning_rate": 0.00033121888662197607, + "loss": 3.2152, + "step": 76950 + }, + { + "epoch": 22.429387089256583, + "grad_norm": 0.4521602988243103, + "learning_rate": 0.0003310440104925677, + "loss": 3.198, + "step": 77000 + }, + { + "epoch": 22.429387089256583, + "eval_accuracy": 0.3736266539848182, + "eval_loss": 3.5453152656555176, + "eval_runtime": 182.8424, + "eval_samples_per_second": 91.035, + "eval_steps_per_second": 5.693, + "step": 77000 + }, + { + "epoch": 22.44395245863435, + "grad_norm": 0.4259926974773407, + "learning_rate": 0.00033086913436315945, + "loss": 3.1972, + "step": 77050 + }, + { + "epoch": 22.45851782801212, + "grad_norm": 0.39332616329193115, + "learning_rate": 0.0003306942582337511, + "loss": 3.2036, + "step": 77100 + }, + { + "epoch": 22.473083197389887, + "grad_norm": 0.4318368434906006, + "learning_rate": 0.0003305193821043427, + "loss": 3.1913, + "step": 77150 + }, + { + "epoch": 22.487648566767653, + "grad_norm": 0.3975502848625183, + "learning_rate": 0.0003303445059749344, + "loss": 3.1994, + "step": 77200 + }, + { + "epoch": 22.50221393614542, + "grad_norm": 0.39913398027420044, + "learning_rate": 
0.00033016962984552605, + "loss": 3.2114, + "step": 77250 + }, + { + "epoch": 22.516779305523187, + "grad_norm": 0.3935025632381439, + "learning_rate": 0.00032999475371611774, + "loss": 3.1939, + "step": 77300 + }, + { + "epoch": 22.531344674900957, + "grad_norm": 0.40351587533950806, + "learning_rate": 0.0003298198775867094, + "loss": 3.2113, + "step": 77350 + }, + { + "epoch": 22.545910044278724, + "grad_norm": 0.4044545292854309, + "learning_rate": 0.000329645001457301, + "loss": 3.1963, + "step": 77400 + }, + { + "epoch": 22.56047541365649, + "grad_norm": 0.41852426528930664, + "learning_rate": 0.0003294701253278927, + "loss": 3.2051, + "step": 77450 + }, + { + "epoch": 22.575040783034257, + "grad_norm": 0.4577685594558716, + "learning_rate": 0.00032929524919848434, + "loss": 3.2122, + "step": 77500 + }, + { + "epoch": 22.589606152412024, + "grad_norm": 0.40526050329208374, + "learning_rate": 0.0003291203730690761, + "loss": 3.2019, + "step": 77550 + }, + { + "epoch": 22.60417152178979, + "grad_norm": 0.39129775762557983, + "learning_rate": 0.0003289454969396677, + "loss": 3.2085, + "step": 77600 + }, + { + "epoch": 22.61873689116756, + "grad_norm": 0.4263695776462555, + "learning_rate": 0.0003287706208102594, + "loss": 3.2036, + "step": 77650 + }, + { + "epoch": 22.633302260545328, + "grad_norm": 0.44772493839263916, + "learning_rate": 0.00032859574468085105, + "loss": 3.2119, + "step": 77700 + }, + { + "epoch": 22.647867629923095, + "grad_norm": 0.4202895164489746, + "learning_rate": 0.0003284208685514427, + "loss": 3.2207, + "step": 77750 + }, + { + "epoch": 22.66243299930086, + "grad_norm": 0.4418281316757202, + "learning_rate": 0.0003282459924220344, + "loss": 3.2265, + "step": 77800 + }, + { + "epoch": 22.67699836867863, + "grad_norm": 0.3960643410682678, + "learning_rate": 0.000328071116292626, + "loss": 3.2172, + "step": 77850 + }, + { + "epoch": 22.6915637380564, + "grad_norm": 0.4430347979068756, + "learning_rate": 0.0003278962401632177, + "loss": 
3.2157, + "step": 77900 + }, + { + "epoch": 22.706129107434165, + "grad_norm": 0.3996141254901886, + "learning_rate": 0.00032772136403380934, + "loss": 3.2088, + "step": 77950 + }, + { + "epoch": 22.720694476811932, + "grad_norm": 0.40546032786369324, + "learning_rate": 0.000327546487904401, + "loss": 3.2291, + "step": 78000 + }, + { + "epoch": 22.720694476811932, + "eval_accuracy": 0.3741603027183871, + "eval_loss": 3.538947820663452, + "eval_runtime": 182.8782, + "eval_samples_per_second": 91.017, + "eval_steps_per_second": 5.692, + "step": 78000 + }, + { + "epoch": 22.7352598461897, + "grad_norm": 0.41795864701271057, + "learning_rate": 0.0003273716117749927, + "loss": 3.2355, + "step": 78050 + }, + { + "epoch": 22.749825215567466, + "grad_norm": 0.3912990391254425, + "learning_rate": 0.00032719673564558435, + "loss": 3.2291, + "step": 78100 + }, + { + "epoch": 22.764390584945236, + "grad_norm": 0.40984243154525757, + "learning_rate": 0.00032702185951617605, + "loss": 3.2333, + "step": 78150 + }, + { + "epoch": 22.778955954323003, + "grad_norm": 0.4171477258205414, + "learning_rate": 0.0003268469833867677, + "loss": 3.2243, + "step": 78200 + }, + { + "epoch": 22.79352132370077, + "grad_norm": 0.4628252685070038, + "learning_rate": 0.00032667210725735937, + "loss": 3.2146, + "step": 78250 + }, + { + "epoch": 22.808086693078536, + "grad_norm": 0.4350678324699402, + "learning_rate": 0.000326497231127951, + "loss": 3.2147, + "step": 78300 + }, + { + "epoch": 22.822652062456303, + "grad_norm": 0.42457160353660583, + "learning_rate": 0.00032632235499854264, + "loss": 3.2326, + "step": 78350 + }, + { + "epoch": 22.83721743183407, + "grad_norm": 0.4339698255062103, + "learning_rate": 0.00032614747886913433, + "loss": 3.2335, + "step": 78400 + }, + { + "epoch": 22.85178280121184, + "grad_norm": 0.41745054721832275, + "learning_rate": 0.00032597260273972597, + "loss": 3.2318, + "step": 78450 + }, + { + "epoch": 22.866348170589607, + "grad_norm": 0.43340957164764404, + 
"learning_rate": 0.0003257977266103177, + "loss": 3.2356, + "step": 78500 + }, + { + "epoch": 22.880913539967374, + "grad_norm": 0.41358107328414917, + "learning_rate": 0.00032562285048090935, + "loss": 3.2305, + "step": 78550 + }, + { + "epoch": 22.89547890934514, + "grad_norm": 0.4206470549106598, + "learning_rate": 0.000325447974351501, + "loss": 3.2347, + "step": 78600 + }, + { + "epoch": 22.910044278722907, + "grad_norm": 0.4107297658920288, + "learning_rate": 0.0003252730982220927, + "loss": 3.2458, + "step": 78650 + }, + { + "epoch": 22.924609648100677, + "grad_norm": 0.4097016453742981, + "learning_rate": 0.0003250982220926843, + "loss": 3.2243, + "step": 78700 + }, + { + "epoch": 22.939175017478444, + "grad_norm": 0.4124782383441925, + "learning_rate": 0.000324923345963276, + "loss": 3.217, + "step": 78750 + }, + { + "epoch": 22.95374038685621, + "grad_norm": 0.43206652998924255, + "learning_rate": 0.00032474846983386764, + "loss": 3.2337, + "step": 78800 + }, + { + "epoch": 22.968305756233978, + "grad_norm": 0.38564333319664, + "learning_rate": 0.00032457359370445933, + "loss": 3.2259, + "step": 78850 + }, + { + "epoch": 22.982871125611744, + "grad_norm": 0.43358278274536133, + "learning_rate": 0.00032439871757505097, + "loss": 3.2241, + "step": 78900 + }, + { + "epoch": 22.997436494989515, + "grad_norm": 0.4222619831562042, + "learning_rate": 0.0003242238414456426, + "loss": 3.2384, + "step": 78950 + }, + { + "epoch": 23.01194360288977, + "grad_norm": 0.4120100736618042, + "learning_rate": 0.00032404896531623435, + "loss": 3.16, + "step": 79000 + }, + { + "epoch": 23.01194360288977, + "eval_accuracy": 0.3737924272199652, + "eval_loss": 3.5487711429595947, + "eval_runtime": 183.134, + "eval_samples_per_second": 90.89, + "eval_steps_per_second": 5.684, + "step": 79000 + }, + { + "epoch": 23.026508972267536, + "grad_norm": 0.41206803917884827, + "learning_rate": 0.000323874089186826, + "loss": 3.1426, + "step": 79050 + }, + { + "epoch": 23.041074341645306, 
+ "grad_norm": 0.421497642993927, + "learning_rate": 0.0003236992130574177, + "loss": 3.1405, + "step": 79100 + }, + { + "epoch": 23.055639711023073, + "grad_norm": 0.41028302907943726, + "learning_rate": 0.0003235243369280093, + "loss": 3.1416, + "step": 79150 + }, + { + "epoch": 23.07020508040084, + "grad_norm": 0.423358291387558, + "learning_rate": 0.00032334946079860095, + "loss": 3.132, + "step": 79200 + }, + { + "epoch": 23.084770449778606, + "grad_norm": 0.42327702045440674, + "learning_rate": 0.00032317458466919264, + "loss": 3.1533, + "step": 79250 + }, + { + "epoch": 23.099335819156373, + "grad_norm": 0.4391932189464569, + "learning_rate": 0.0003229997085397843, + "loss": 3.1389, + "step": 79300 + }, + { + "epoch": 23.11390118853414, + "grad_norm": 0.39794474840164185, + "learning_rate": 0.00032282483241037596, + "loss": 3.1481, + "step": 79350 + }, + { + "epoch": 23.12846655791191, + "grad_norm": 0.4148556590080261, + "learning_rate": 0.0003226499562809676, + "loss": 3.151, + "step": 79400 + }, + { + "epoch": 23.143031927289677, + "grad_norm": 0.4218699038028717, + "learning_rate": 0.00032247508015155924, + "loss": 3.1488, + "step": 79450 + }, + { + "epoch": 23.157597296667443, + "grad_norm": 0.440711110830307, + "learning_rate": 0.000322300204022151, + "loss": 3.161, + "step": 79500 + }, + { + "epoch": 23.17216266604521, + "grad_norm": 0.41110166907310486, + "learning_rate": 0.0003221253278927426, + "loss": 3.1612, + "step": 79550 + }, + { + "epoch": 23.186728035422977, + "grad_norm": 0.4470248520374298, + "learning_rate": 0.0003219504517633343, + "loss": 3.1722, + "step": 79600 + }, + { + "epoch": 23.201293404800747, + "grad_norm": 0.4257759749889374, + "learning_rate": 0.00032177557563392594, + "loss": 3.154, + "step": 79650 + }, + { + "epoch": 23.215858774178514, + "grad_norm": 0.44018658995628357, + "learning_rate": 0.00032160069950451763, + "loss": 3.1628, + "step": 79700 + }, + { + "epoch": 23.23042414355628, + "grad_norm": 0.40629279613494873, + 
"learning_rate": 0.00032142582337510927, + "loss": 3.1663, + "step": 79750 + }, + { + "epoch": 23.244989512934048, + "grad_norm": 0.41147536039352417, + "learning_rate": 0.0003212509472457009, + "loss": 3.1745, + "step": 79800 + }, + { + "epoch": 23.259554882311814, + "grad_norm": 0.43514347076416016, + "learning_rate": 0.0003210760711162926, + "loss": 3.1809, + "step": 79850 + }, + { + "epoch": 23.27412025168958, + "grad_norm": 0.42042967677116394, + "learning_rate": 0.00032090119498688423, + "loss": 3.1808, + "step": 79900 + }, + { + "epoch": 23.28868562106735, + "grad_norm": 0.41074612736701965, + "learning_rate": 0.0003207263188574759, + "loss": 3.1626, + "step": 79950 + }, + { + "epoch": 23.303250990445118, + "grad_norm": 0.41137275099754333, + "learning_rate": 0.0003205514427280676, + "loss": 3.1854, + "step": 80000 + }, + { + "epoch": 23.303250990445118, + "eval_accuracy": 0.3741075139364148, + "eval_loss": 3.5451858043670654, + "eval_runtime": 183.2227, + "eval_samples_per_second": 90.846, + "eval_steps_per_second": 5.682, + "step": 80000 + }, + { + "epoch": 23.317816359822885, + "grad_norm": 0.4325380027294159, + "learning_rate": 0.00032037656659865925, + "loss": 3.1423, + "step": 80050 + }, + { + "epoch": 23.33238172920065, + "grad_norm": 0.4023299813270569, + "learning_rate": 0.00032020169046925094, + "loss": 3.1404, + "step": 80100 + }, + { + "epoch": 23.34694709857842, + "grad_norm": 0.46395108103752136, + "learning_rate": 0.0003200268143398426, + "loss": 3.1341, + "step": 80150 + }, + { + "epoch": 23.36151246795619, + "grad_norm": 0.41562238335609436, + "learning_rate": 0.00031985193821043427, + "loss": 3.137, + "step": 80200 + }, + { + "epoch": 23.376077837333956, + "grad_norm": 0.42964160442352295, + "learning_rate": 0.0003196770620810259, + "loss": 3.1355, + "step": 80250 + }, + { + "epoch": 23.390643206711722, + "grad_norm": 0.4194352328777313, + "learning_rate": 0.0003195021859516176, + "loss": 3.15, + "step": 80300 + }, + { + "epoch": 
23.40520857608949, + "grad_norm": 0.41986915469169617, + "learning_rate": 0.00031932730982220923, + "loss": 3.1585, + "step": 80350 + }, + { + "epoch": 23.419773945467256, + "grad_norm": 0.4277133345603943, + "learning_rate": 0.00031915243369280087, + "loss": 3.1664, + "step": 80400 + }, + { + "epoch": 23.434339314845026, + "grad_norm": 0.4289986491203308, + "learning_rate": 0.00031897755756339256, + "loss": 3.1489, + "step": 80450 + }, + { + "epoch": 23.448904684222793, + "grad_norm": 0.4255027770996094, + "learning_rate": 0.0003188026814339842, + "loss": 3.1757, + "step": 80500 + }, + { + "epoch": 23.46347005360056, + "grad_norm": 0.43643444776535034, + "learning_rate": 0.00031862780530457594, + "loss": 3.1501, + "step": 80550 + }, + { + "epoch": 23.478035422978326, + "grad_norm": 0.4336095452308655, + "learning_rate": 0.0003184529291751676, + "loss": 3.1631, + "step": 80600 + }, + { + "epoch": 23.492600792356093, + "grad_norm": 0.40593820810317993, + "learning_rate": 0.0003182780530457592, + "loss": 3.1595, + "step": 80650 + }, + { + "epoch": 23.50716616173386, + "grad_norm": 0.4076375961303711, + "learning_rate": 0.0003181031769163509, + "loss": 3.1733, + "step": 80700 + }, + { + "epoch": 23.52173153111163, + "grad_norm": 0.4589744508266449, + "learning_rate": 0.00031792830078694254, + "loss": 3.1649, + "step": 80750 + }, + { + "epoch": 23.536296900489397, + "grad_norm": 0.47728487849235535, + "learning_rate": 0.0003177534246575342, + "loss": 3.1545, + "step": 80800 + }, + { + "epoch": 23.550862269867164, + "grad_norm": 0.4282548725605011, + "learning_rate": 0.00031757854852812586, + "loss": 3.1784, + "step": 80850 + }, + { + "epoch": 23.56542763924493, + "grad_norm": 0.40563443303108215, + "learning_rate": 0.00031740367239871755, + "loss": 3.18, + "step": 80900 + }, + { + "epoch": 23.579993008622697, + "grad_norm": 0.4412521421909332, + "learning_rate": 0.0003172287962693092, + "loss": 3.181, + "step": 80950 + }, + { + "epoch": 23.594558378000468, + 
"grad_norm": 0.45530885457992554, + "learning_rate": 0.0003170539201399008, + "loss": 3.1779, + "step": 81000 + }, + { + "epoch": 23.594558378000468, + "eval_accuracy": 0.37350226527362285, + "eval_loss": 3.5499300956726074, + "eval_runtime": 179.6391, + "eval_samples_per_second": 92.658, + "eval_steps_per_second": 5.795, + "step": 81000 + }, + { + "epoch": 23.609123747378234, + "grad_norm": 0.38101035356521606, + "learning_rate": 0.00031687904401049257, + "loss": 3.1674, + "step": 81050 + }, + { + "epoch": 23.623689116756, + "grad_norm": 0.476788729429245, + "learning_rate": 0.0003167041678810842, + "loss": 3.1721, + "step": 81100 + }, + { + "epoch": 23.638254486133768, + "grad_norm": 0.4510898292064667, + "learning_rate": 0.0003165292917516759, + "loss": 3.1877, + "step": 81150 + }, + { + "epoch": 23.652819855511535, + "grad_norm": 0.4234790802001953, + "learning_rate": 0.00031635441562226753, + "loss": 3.1894, + "step": 81200 + }, + { + "epoch": 23.667385224889305, + "grad_norm": 0.4436555504798889, + "learning_rate": 0.00031617953949285917, + "loss": 3.1929, + "step": 81250 + }, + { + "epoch": 23.68195059426707, + "grad_norm": 0.4603196978569031, + "learning_rate": 0.00031600466336345086, + "loss": 3.1905, + "step": 81300 + }, + { + "epoch": 23.69651596364484, + "grad_norm": 0.4264376163482666, + "learning_rate": 0.0003158297872340425, + "loss": 3.1853, + "step": 81350 + }, + { + "epoch": 23.711081333022605, + "grad_norm": 0.46008777618408203, + "learning_rate": 0.0003156549111046342, + "loss": 3.1832, + "step": 81400 + }, + { + "epoch": 23.725646702400372, + "grad_norm": 0.3979647159576416, + "learning_rate": 0.0003154800349752258, + "loss": 3.2036, + "step": 81450 + }, + { + "epoch": 23.74021207177814, + "grad_norm": 0.41289305686950684, + "learning_rate": 0.00031530515884581757, + "loss": 3.2056, + "step": 81500 + }, + { + "epoch": 23.75477744115591, + "grad_norm": 0.3942318260669708, + "learning_rate": 0.0003151302827164092, + "loss": 3.1917, + "step": 
81550 + }, + { + "epoch": 23.769342810533676, + "grad_norm": 0.408755362033844, + "learning_rate": 0.00031495540658700084, + "loss": 3.1889, + "step": 81600 + }, + { + "epoch": 23.783908179911442, + "grad_norm": 0.41167670488357544, + "learning_rate": 0.00031478053045759253, + "loss": 3.1915, + "step": 81650 + }, + { + "epoch": 23.79847354928921, + "grad_norm": 0.4455852508544922, + "learning_rate": 0.00031460565432818417, + "loss": 3.1898, + "step": 81700 + }, + { + "epoch": 23.813038918666976, + "grad_norm": 0.42409783601760864, + "learning_rate": 0.00031443077819877586, + "loss": 3.1875, + "step": 81750 + }, + { + "epoch": 23.827604288044746, + "grad_norm": 0.41861656308174133, + "learning_rate": 0.0003142559020693675, + "loss": 3.2021, + "step": 81800 + }, + { + "epoch": 23.842169657422513, + "grad_norm": 0.44218793511390686, + "learning_rate": 0.00031408102593995913, + "loss": 3.1987, + "step": 81850 + }, + { + "epoch": 23.85673502680028, + "grad_norm": 0.43261197209358215, + "learning_rate": 0.0003139061498105508, + "loss": 3.2028, + "step": 81900 + }, + { + "epoch": 23.871300396178047, + "grad_norm": 0.4348728656768799, + "learning_rate": 0.00031373127368114245, + "loss": 3.1951, + "step": 81950 + }, + { + "epoch": 23.885865765555813, + "grad_norm": 0.3952457010746002, + "learning_rate": 0.0003135563975517342, + "loss": 3.2077, + "step": 82000 + }, + { + "epoch": 23.885865765555813, + "eval_accuracy": 0.37416606363223265, + "eval_loss": 3.5419678688049316, + "eval_runtime": 179.7187, + "eval_samples_per_second": 92.617, + "eval_steps_per_second": 5.792, + "step": 82000 + }, + { + "epoch": 23.900431134933584, + "grad_norm": 0.4126368761062622, + "learning_rate": 0.00031338152142232584, + "loss": 3.2086, + "step": 82050 + }, + { + "epoch": 23.91499650431135, + "grad_norm": 0.42765992879867554, + "learning_rate": 0.00031320664529291747, + "loss": 3.2027, + "step": 82100 + }, + { + "epoch": 23.929561873689117, + "grad_norm": 0.44232431054115295, + 
"learning_rate": 0.00031303176916350916, + "loss": 3.2065, + "step": 82150 + }, + { + "epoch": 23.944127243066884, + "grad_norm": 0.41848278045654297, + "learning_rate": 0.0003128568930341008, + "loss": 3.1995, + "step": 82200 + }, + { + "epoch": 23.95869261244465, + "grad_norm": 0.431692898273468, + "learning_rate": 0.0003126820169046925, + "loss": 3.2061, + "step": 82250 + }, + { + "epoch": 23.973257981822417, + "grad_norm": 0.44336676597595215, + "learning_rate": 0.0003125071407752841, + "loss": 3.2129, + "step": 82300 + }, + { + "epoch": 23.987823351200188, + "grad_norm": 0.42227089405059814, + "learning_rate": 0.0003123322646458758, + "loss": 3.1983, + "step": 82350 + }, + { + "epoch": 24.002621766487998, + "grad_norm": 0.42413151264190674, + "learning_rate": 0.00031215738851646745, + "loss": 3.2515, + "step": 82400 + }, + { + "epoch": 24.017187135865765, + "grad_norm": 0.42558392882347107, + "learning_rate": 0.0003119825123870591, + "loss": 3.1148, + "step": 82450 + }, + { + "epoch": 24.03175250524353, + "grad_norm": 0.4119874835014343, + "learning_rate": 0.00031180763625765083, + "loss": 3.1196, + "step": 82500 + }, + { + "epoch": 24.0463178746213, + "grad_norm": 0.4366919994354248, + "learning_rate": 0.00031163276012824247, + "loss": 3.1312, + "step": 82550 + }, + { + "epoch": 24.06088324399907, + "grad_norm": 0.4563874304294586, + "learning_rate": 0.00031145788399883416, + "loss": 3.1297, + "step": 82600 + }, + { + "epoch": 24.075448613376835, + "grad_norm": 0.4409359395503998, + "learning_rate": 0.0003112830078694258, + "loss": 3.138, + "step": 82650 + }, + { + "epoch": 24.090013982754602, + "grad_norm": 0.419706255197525, + "learning_rate": 0.00031110813174001743, + "loss": 3.143, + "step": 82700 + }, + { + "epoch": 24.10457935213237, + "grad_norm": 0.41549739241600037, + "learning_rate": 0.0003109332556106091, + "loss": 3.1495, + "step": 82750 + }, + { + "epoch": 24.11914472151014, + "grad_norm": 0.4378606379032135, + "learning_rate": 
0.00031075837948120076, + "loss": 3.1369, + "step": 82800 + }, + { + "epoch": 24.133710090887906, + "grad_norm": 0.45427650213241577, + "learning_rate": 0.00031058350335179245, + "loss": 3.159, + "step": 82850 + }, + { + "epoch": 24.148275460265673, + "grad_norm": 0.41194722056388855, + "learning_rate": 0.0003104086272223841, + "loss": 3.147, + "step": 82900 + }, + { + "epoch": 24.16284082964344, + "grad_norm": 0.4255869388580322, + "learning_rate": 0.00031023375109297583, + "loss": 3.1491, + "step": 82950 + }, + { + "epoch": 24.177406199021206, + "grad_norm": 0.4222257435321808, + "learning_rate": 0.00031005887496356746, + "loss": 3.1569, + "step": 83000 + }, + { + "epoch": 24.177406199021206, + "eval_accuracy": 0.3738667312516056, + "eval_loss": 3.5533342361450195, + "eval_runtime": 179.7382, + "eval_samples_per_second": 92.607, + "eval_steps_per_second": 5.792, + "step": 83000 + }, + { + "epoch": 24.191971568398973, + "grad_norm": 0.42919695377349854, + "learning_rate": 0.0003098839988341591, + "loss": 3.1545, + "step": 83050 + }, + { + "epoch": 24.206536937776743, + "grad_norm": 0.4294886291027069, + "learning_rate": 0.0003097091227047508, + "loss": 3.1609, + "step": 83100 + }, + { + "epoch": 24.22110230715451, + "grad_norm": 0.4393008053302765, + "learning_rate": 0.00030953424657534243, + "loss": 3.1681, + "step": 83150 + }, + { + "epoch": 24.235667676532277, + "grad_norm": 0.43206796050071716, + "learning_rate": 0.0003093593704459341, + "loss": 3.175, + "step": 83200 + }, + { + "epoch": 24.250233045910043, + "grad_norm": 0.42569947242736816, + "learning_rate": 0.00030918449431652575, + "loss": 3.1698, + "step": 83250 + }, + { + "epoch": 24.26479841528781, + "grad_norm": 0.4225282669067383, + "learning_rate": 0.0003090096181871174, + "loss": 3.1676, + "step": 83300 + }, + { + "epoch": 24.27936378466558, + "grad_norm": 0.43696901202201843, + "learning_rate": 0.0003088347420577091, + "loss": 3.168, + "step": 83350 + }, + { + "epoch": 24.293929154043347, + 
"grad_norm": 0.43535012006759644, + "learning_rate": 0.0003086598659283007, + "loss": 3.1767, + "step": 83400 + }, + { + "epoch": 24.308494523421114, + "grad_norm": 0.42330238223075867, + "learning_rate": 0.00030848498979889246, + "loss": 3.1706, + "step": 83450 + }, + { + "epoch": 24.32305989279888, + "grad_norm": 0.4490221440792084, + "learning_rate": 0.0003083101136694841, + "loss": 3.1757, + "step": 83500 + }, + { + "epoch": 24.337625262176648, + "grad_norm": 0.4481862783432007, + "learning_rate": 0.0003081352375400758, + "loss": 3.1745, + "step": 83550 + }, + { + "epoch": 24.352190631554418, + "grad_norm": 0.4269773066043854, + "learning_rate": 0.0003079603614106674, + "loss": 3.1707, + "step": 83600 + }, + { + "epoch": 24.366756000932185, + "grad_norm": 0.42694324254989624, + "learning_rate": 0.00030778548528125906, + "loss": 3.1766, + "step": 83650 + }, + { + "epoch": 24.38132137030995, + "grad_norm": 0.41363558173179626, + "learning_rate": 0.00030761060915185075, + "loss": 3.1951, + "step": 83700 + }, + { + "epoch": 24.395886739687718, + "grad_norm": 0.39224985241889954, + "learning_rate": 0.0003074357330224424, + "loss": 3.1796, + "step": 83750 + }, + { + "epoch": 24.410452109065485, + "grad_norm": 0.44722500443458557, + "learning_rate": 0.0003072608568930341, + "loss": 3.1833, + "step": 83800 + }, + { + "epoch": 24.42501747844325, + "grad_norm": 0.44555240869522095, + "learning_rate": 0.0003070859807636257, + "loss": 3.1924, + "step": 83850 + }, + { + "epoch": 24.439582847821022, + "grad_norm": 0.42261767387390137, + "learning_rate": 0.00030691110463421735, + "loss": 3.1853, + "step": 83900 + }, + { + "epoch": 24.45414821719879, + "grad_norm": 0.4288679361343384, + "learning_rate": 0.0003067362285048091, + "loss": 3.1812, + "step": 83950 + }, + { + "epoch": 24.468713586576555, + "grad_norm": 0.41109663248062134, + "learning_rate": 0.00030656135237540073, + "loss": 3.182, + "step": 84000 + }, + { + "epoch": 24.468713586576555, + "eval_accuracy": 
0.37449290731571394, + "eval_loss": 3.5434653759002686, + "eval_runtime": 179.7152, + "eval_samples_per_second": 92.619, + "eval_steps_per_second": 5.792, + "step": 84000 + }, + { + "epoch": 24.483278955954322, + "grad_norm": 0.4189678430557251, + "learning_rate": 0.0003063864762459924, + "loss": 3.1902, + "step": 84050 + }, + { + "epoch": 24.49784432533209, + "grad_norm": 0.42589837312698364, + "learning_rate": 0.00030621160011658406, + "loss": 3.1965, + "step": 84100 + }, + { + "epoch": 24.51240969470986, + "grad_norm": 0.4201832115650177, + "learning_rate": 0.0003060367239871757, + "loss": 3.1925, + "step": 84150 + }, + { + "epoch": 24.526975064087626, + "grad_norm": 0.4573235809803009, + "learning_rate": 0.0003058618478577674, + "loss": 3.1754, + "step": 84200 + }, + { + "epoch": 24.541540433465393, + "grad_norm": 0.44319775700569153, + "learning_rate": 0.000305686971728359, + "loss": 3.1957, + "step": 84250 + }, + { + "epoch": 24.55610580284316, + "grad_norm": 0.43211740255355835, + "learning_rate": 0.0003055120955989507, + "loss": 3.1839, + "step": 84300 + }, + { + "epoch": 24.570671172220926, + "grad_norm": 0.4124164283275604, + "learning_rate": 0.00030533721946954235, + "loss": 3.1993, + "step": 84350 + }, + { + "epoch": 24.585236541598697, + "grad_norm": 0.44055572152137756, + "learning_rate": 0.0003051623433401341, + "loss": 3.1992, + "step": 84400 + }, + { + "epoch": 24.599801910976463, + "grad_norm": 0.39688199758529663, + "learning_rate": 0.00030498746721072573, + "loss": 3.1891, + "step": 84450 + }, + { + "epoch": 24.61436728035423, + "grad_norm": 0.4425677955150604, + "learning_rate": 0.00030481259108131736, + "loss": 3.2038, + "step": 84500 + }, + { + "epoch": 24.628932649731997, + "grad_norm": 0.4194990396499634, + "learning_rate": 0.00030463771495190905, + "loss": 3.1793, + "step": 84550 + }, + { + "epoch": 24.643498019109764, + "grad_norm": 0.4422365427017212, + "learning_rate": 0.0003044628388225007, + "loss": 3.1917, + "step": 84600 + }, + { + 
"epoch": 24.65806338848753, + "grad_norm": 0.43441274762153625, + "learning_rate": 0.0003042879626930924, + "loss": 3.205, + "step": 84650 + }, + { + "epoch": 24.6726287578653, + "grad_norm": 0.41350895166397095, + "learning_rate": 0.000304113086563684, + "loss": 3.2026, + "step": 84700 + }, + { + "epoch": 24.687194127243067, + "grad_norm": 0.45683303475379944, + "learning_rate": 0.00030393821043427565, + "loss": 3.2147, + "step": 84750 + }, + { + "epoch": 24.701759496620834, + "grad_norm": 0.4254341721534729, + "learning_rate": 0.00030376333430486734, + "loss": 3.1934, + "step": 84800 + }, + { + "epoch": 24.7163248659986, + "grad_norm": 0.446032851934433, + "learning_rate": 0.000303588458175459, + "loss": 3.1964, + "step": 84850 + }, + { + "epoch": 24.730890235376368, + "grad_norm": 0.4153694808483124, + "learning_rate": 0.0003034135820460507, + "loss": 3.2111, + "step": 84900 + }, + { + "epoch": 24.745455604754138, + "grad_norm": 0.4243052303791046, + "learning_rate": 0.00030323870591664236, + "loss": 3.2088, + "step": 84950 + }, + { + "epoch": 24.760020974131905, + "grad_norm": 0.4362107515335083, + "learning_rate": 0.00030306382978723405, + "loss": 3.2022, + "step": 85000 + }, + { + "epoch": 24.760020974131905, + "eval_accuracy": 0.3745567476466961, + "eval_loss": 3.536757707595825, + "eval_runtime": 179.8297, + "eval_samples_per_second": 92.56, + "eval_steps_per_second": 5.789, + "step": 85000 + }, + { + "epoch": 24.77458634350967, + "grad_norm": 0.4239880442619324, + "learning_rate": 0.0003028889536578257, + "loss": 3.194, + "step": 85050 + }, + { + "epoch": 24.78915171288744, + "grad_norm": 0.42055609822273254, + "learning_rate": 0.0003027140775284173, + "loss": 3.2129, + "step": 85100 + }, + { + "epoch": 24.803717082265205, + "grad_norm": 0.4157819449901581, + "learning_rate": 0.000302539201399009, + "loss": 3.1931, + "step": 85150 + }, + { + "epoch": 24.818282451642972, + "grad_norm": 0.38243451714515686, + "learning_rate": 0.00030236432526960065, + 
"loss": 3.2069, + "step": 85200 + }, + { + "epoch": 24.832847821020742, + "grad_norm": 0.4196301996707916, + "learning_rate": 0.00030218944914019234, + "loss": 3.2226, + "step": 85250 + }, + { + "epoch": 24.84741319039851, + "grad_norm": 0.43389689922332764, + "learning_rate": 0.000302014573010784, + "loss": 3.2144, + "step": 85300 + }, + { + "epoch": 24.861978559776276, + "grad_norm": 0.42272713780403137, + "learning_rate": 0.0003018396968813756, + "loss": 3.2223, + "step": 85350 + }, + { + "epoch": 24.876543929154042, + "grad_norm": 0.4422394335269928, + "learning_rate": 0.00030166482075196736, + "loss": 3.208, + "step": 85400 + }, + { + "epoch": 24.89110929853181, + "grad_norm": 0.4388560652732849, + "learning_rate": 0.000301489944622559, + "loss": 3.2095, + "step": 85450 + }, + { + "epoch": 24.90567466790958, + "grad_norm": 0.39322882890701294, + "learning_rate": 0.0003013150684931507, + "loss": 3.2127, + "step": 85500 + }, + { + "epoch": 24.920240037287346, + "grad_norm": 0.45943403244018555, + "learning_rate": 0.0003011401923637423, + "loss": 3.2097, + "step": 85550 + }, + { + "epoch": 24.934805406665113, + "grad_norm": 0.4043842852115631, + "learning_rate": 0.000300965316234334, + "loss": 3.2095, + "step": 85600 + }, + { + "epoch": 24.94937077604288, + "grad_norm": 0.41772520542144775, + "learning_rate": 0.00030079044010492565, + "loss": 3.2133, + "step": 85650 + }, + { + "epoch": 24.963936145420647, + "grad_norm": 0.44280874729156494, + "learning_rate": 0.0003006155639755173, + "loss": 3.2232, + "step": 85700 + }, + { + "epoch": 24.978501514798417, + "grad_norm": 0.4212973117828369, + "learning_rate": 0.00030044068784610897, + "loss": 3.2087, + "step": 85750 + }, + { + "epoch": 24.993066884176184, + "grad_norm": 0.4439827501773834, + "learning_rate": 0.0003002658117167006, + "loss": 3.2043, + "step": 85800 + }, + { + "epoch": 25.007573992076438, + "grad_norm": 0.41100841760635376, + "learning_rate": 0.00030009093558729235, + "loss": 3.1611, + "step": 85850 
+ }, + { + "epoch": 25.022139361454208, + "grad_norm": 0.40830469131469727, + "learning_rate": 0.000299916059457884, + "loss": 3.1028, + "step": 85900 + }, + { + "epoch": 25.036704730831975, + "grad_norm": 0.43471017479896545, + "learning_rate": 0.0002997411833284756, + "loss": 3.112, + "step": 85950 + }, + { + "epoch": 25.05127010020974, + "grad_norm": 0.4032357931137085, + "learning_rate": 0.0002995663071990673, + "loss": 3.1186, + "step": 86000 + }, + { + "epoch": 25.05127010020974, + "eval_accuracy": 0.37411198158388687, + "eval_loss": 3.550753116607666, + "eval_runtime": 179.7226, + "eval_samples_per_second": 92.615, + "eval_steps_per_second": 5.792, + "step": 86000 + }, + { + "epoch": 25.06583546958751, + "grad_norm": 0.41526585817337036, + "learning_rate": 0.00029939143106965895, + "loss": 3.127, + "step": 86050 + }, + { + "epoch": 25.080400838965275, + "grad_norm": 0.433913916349411, + "learning_rate": 0.00029921655494025064, + "loss": 3.1225, + "step": 86100 + }, + { + "epoch": 25.094966208343042, + "grad_norm": 0.4391394257545471, + "learning_rate": 0.0002990416788108423, + "loss": 3.1331, + "step": 86150 + }, + { + "epoch": 25.109531577720812, + "grad_norm": 0.4312891364097595, + "learning_rate": 0.00029886680268143397, + "loss": 3.1245, + "step": 86200 + }, + { + "epoch": 25.12409694709858, + "grad_norm": 0.4313848316669464, + "learning_rate": 0.0002986919265520256, + "loss": 3.1307, + "step": 86250 + }, + { + "epoch": 25.138662316476346, + "grad_norm": 0.4232284724712372, + "learning_rate": 0.0002985170504226173, + "loss": 3.1399, + "step": 86300 + }, + { + "epoch": 25.153227685854112, + "grad_norm": 0.46254777908325195, + "learning_rate": 0.00029834217429320893, + "loss": 3.146, + "step": 86350 + }, + { + "epoch": 25.16779305523188, + "grad_norm": 0.42061716318130493, + "learning_rate": 0.0002981672981638006, + "loss": 3.1409, + "step": 86400 + }, + { + "epoch": 25.18235842460965, + "grad_norm": 0.42561444640159607, + "learning_rate": 
0.00029799242203439226, + "loss": 3.1381, + "step": 86450 + }, + { + "epoch": 25.196923793987416, + "grad_norm": 0.43058279156684875, + "learning_rate": 0.00029781754590498395, + "loss": 3.1516, + "step": 86500 + }, + { + "epoch": 25.211489163365183, + "grad_norm": 0.4757503867149353, + "learning_rate": 0.00029764266977557564, + "loss": 3.1511, + "step": 86550 + }, + { + "epoch": 25.22605453274295, + "grad_norm": 0.44378894567489624, + "learning_rate": 0.0002974677936461673, + "loss": 3.141, + "step": 86600 + }, + { + "epoch": 25.240619902120716, + "grad_norm": 0.4458007216453552, + "learning_rate": 0.0002972929175167589, + "loss": 3.1534, + "step": 86650 + }, + { + "epoch": 25.255185271498487, + "grad_norm": 0.43031567335128784, + "learning_rate": 0.0002971180413873506, + "loss": 3.1493, + "step": 86700 + }, + { + "epoch": 25.269750640876254, + "grad_norm": 0.4496525824069977, + "learning_rate": 0.00029694316525794224, + "loss": 3.1595, + "step": 86750 + }, + { + "epoch": 25.28431601025402, + "grad_norm": 0.45122572779655457, + "learning_rate": 0.00029676828912853393, + "loss": 3.1619, + "step": 86800 + }, + { + "epoch": 25.298881379631787, + "grad_norm": 0.4212110638618469, + "learning_rate": 0.0002965934129991256, + "loss": 3.1619, + "step": 86850 + }, + { + "epoch": 25.313446749009554, + "grad_norm": 0.4641132354736328, + "learning_rate": 0.00029641853686971726, + "loss": 3.1727, + "step": 86900 + }, + { + "epoch": 25.32801211838732, + "grad_norm": 0.45251330733299255, + "learning_rate": 0.0002962436607403089, + "loss": 3.169, + "step": 86950 + }, + { + "epoch": 25.34257748776509, + "grad_norm": 0.46525469422340393, + "learning_rate": 0.0002960687846109006, + "loss": 3.1696, + "step": 87000 + }, + { + "epoch": 25.34257748776509, + "eval_accuracy": 0.3741974547342073, + "eval_loss": 3.545832872390747, + "eval_runtime": 179.8303, + "eval_samples_per_second": 92.559, + "eval_steps_per_second": 5.789, + "step": 87000 + }, + { + "epoch": 25.357142857142858, + 
"grad_norm": 0.4239409267902374, + "learning_rate": 0.00029589390848149227, + "loss": 3.168, + "step": 87050 + }, + { + "epoch": 25.371708226520624, + "grad_norm": 0.46311140060424805, + "learning_rate": 0.0002957190323520839, + "loss": 3.1748, + "step": 87100 + }, + { + "epoch": 25.38627359589839, + "grad_norm": 0.4287487268447876, + "learning_rate": 0.0002955441562226756, + "loss": 3.1698, + "step": 87150 + }, + { + "epoch": 25.400838965276158, + "grad_norm": 0.4293224811553955, + "learning_rate": 0.00029536928009326723, + "loss": 3.1713, + "step": 87200 + }, + { + "epoch": 25.41540433465393, + "grad_norm": 0.4797385632991791, + "learning_rate": 0.00029519440396385887, + "loss": 3.178, + "step": 87250 + }, + { + "epoch": 25.429969704031695, + "grad_norm": 0.4255479872226715, + "learning_rate": 0.00029501952783445056, + "loss": 3.1895, + "step": 87300 + }, + { + "epoch": 25.44453507340946, + "grad_norm": 0.44172418117523193, + "learning_rate": 0.00029484465170504225, + "loss": 3.1718, + "step": 87350 + }, + { + "epoch": 25.45910044278723, + "grad_norm": 0.46718594431877136, + "learning_rate": 0.0002946697755756339, + "loss": 3.1744, + "step": 87400 + }, + { + "epoch": 25.473665812164995, + "grad_norm": 0.45305508375167847, + "learning_rate": 0.0002944948994462256, + "loss": 3.1695, + "step": 87450 + }, + { + "epoch": 25.488231181542766, + "grad_norm": 0.4728144109249115, + "learning_rate": 0.00029432002331681727, + "loss": 3.1797, + "step": 87500 + }, + { + "epoch": 25.502796550920532, + "grad_norm": 0.42346832156181335, + "learning_rate": 0.0002941451471874089, + "loss": 3.1763, + "step": 87550 + }, + { + "epoch": 25.5173619202983, + "grad_norm": 0.4842385947704315, + "learning_rate": 0.00029397027105800054, + "loss": 3.1913, + "step": 87600 + }, + { + "epoch": 25.531927289676066, + "grad_norm": 0.41806545853614807, + "learning_rate": 0.00029379539492859223, + "loss": 3.1802, + "step": 87650 + }, + { + "epoch": 25.546492659053833, + "grad_norm": 
0.4152505397796631, + "learning_rate": 0.00029362051879918387, + "loss": 3.189, + "step": 87700 + }, + { + "epoch": 25.5610580284316, + "grad_norm": 0.47798603773117065, + "learning_rate": 0.00029344564266977556, + "loss": 3.195, + "step": 87750 + }, + { + "epoch": 25.57562339780937, + "grad_norm": 0.4499337673187256, + "learning_rate": 0.00029327076654036725, + "loss": 3.1853, + "step": 87800 + }, + { + "epoch": 25.590188767187136, + "grad_norm": 0.4044957458972931, + "learning_rate": 0.0002930958904109589, + "loss": 3.1807, + "step": 87850 + }, + { + "epoch": 25.604754136564903, + "grad_norm": 0.4229501485824585, + "learning_rate": 0.0002929210142815505, + "loss": 3.1816, + "step": 87900 + }, + { + "epoch": 25.61931950594267, + "grad_norm": 0.4179123342037201, + "learning_rate": 0.0002927461381521422, + "loss": 3.1858, + "step": 87950 + }, + { + "epoch": 25.633884875320437, + "grad_norm": 0.4255021810531616, + "learning_rate": 0.0002925712620227339, + "loss": 3.1863, + "step": 88000 + }, + { + "epoch": 25.633884875320437, + "eval_accuracy": 0.37432819220759983, + "eval_loss": 3.543217658996582, + "eval_runtime": 179.9205, + "eval_samples_per_second": 92.513, + "eval_steps_per_second": 5.786, + "step": 88000 + }, + { + "epoch": 25.648450244698207, + "grad_norm": 0.41656792163848877, + "learning_rate": 0.00029239638589332554, + "loss": 3.1984, + "step": 88050 + }, + { + "epoch": 25.663015614075974, + "grad_norm": 0.43181586265563965, + "learning_rate": 0.0002922215097639172, + "loss": 3.1834, + "step": 88100 + }, + { + "epoch": 25.67758098345374, + "grad_norm": 0.42854011058807373, + "learning_rate": 0.00029204663363450886, + "loss": 3.1853, + "step": 88150 + }, + { + "epoch": 25.692146352831507, + "grad_norm": 0.4229859709739685, + "learning_rate": 0.0002918717575051005, + "loss": 3.1944, + "step": 88200 + }, + { + "epoch": 25.706711722209274, + "grad_norm": 0.4565353989601135, + "learning_rate": 0.0002916968813756922, + "loss": 3.1926, + "step": 88250 + }, + { + 
"epoch": 25.721277091587044, + "grad_norm": 0.4147500991821289, + "learning_rate": 0.0002915220052462839, + "loss": 3.1754, + "step": 88300 + }, + { + "epoch": 25.73584246096481, + "grad_norm": 0.4348998963832855, + "learning_rate": 0.0002913471291168755, + "loss": 3.1911, + "step": 88350 + }, + { + "epoch": 25.750407830342578, + "grad_norm": 0.43094000220298767, + "learning_rate": 0.00029117225298746715, + "loss": 3.203, + "step": 88400 + }, + { + "epoch": 25.764973199720345, + "grad_norm": 0.467695415019989, + "learning_rate": 0.00029099737685805884, + "loss": 3.1814, + "step": 88450 + }, + { + "epoch": 25.77953856909811, + "grad_norm": 0.4421153962612152, + "learning_rate": 0.00029082250072865053, + "loss": 3.2027, + "step": 88500 + }, + { + "epoch": 25.794103938475878, + "grad_norm": 0.4013921916484833, + "learning_rate": 0.00029064762459924217, + "loss": 3.1853, + "step": 88550 + }, + { + "epoch": 25.80866930785365, + "grad_norm": 0.41378721594810486, + "learning_rate": 0.00029047274846983386, + "loss": 3.1945, + "step": 88600 + }, + { + "epoch": 25.823234677231415, + "grad_norm": 0.4195496141910553, + "learning_rate": 0.0002902978723404255, + "loss": 3.1907, + "step": 88650 + }, + { + "epoch": 25.837800046609182, + "grad_norm": 0.4184630811214447, + "learning_rate": 0.00029012299621101713, + "loss": 3.19, + "step": 88700 + }, + { + "epoch": 25.85236541598695, + "grad_norm": 0.44368013739585876, + "learning_rate": 0.0002899481200816088, + "loss": 3.2054, + "step": 88750 + }, + { + "epoch": 25.866930785364715, + "grad_norm": 0.41648930311203003, + "learning_rate": 0.0002897732439522005, + "loss": 3.19, + "step": 88800 + }, + { + "epoch": 25.881496154742486, + "grad_norm": 0.4401901662349701, + "learning_rate": 0.00028959836782279215, + "loss": 3.2152, + "step": 88850 + }, + { + "epoch": 25.896061524120253, + "grad_norm": 0.421739786863327, + "learning_rate": 0.00028942349169338384, + "loss": 3.1994, + "step": 88900 + }, + { + "epoch": 25.91062689349802, + 
"grad_norm": 0.45117563009262085, + "learning_rate": 0.00028924861556397553, + "loss": 3.1998, + "step": 88950 + }, + { + "epoch": 25.925192262875786, + "grad_norm": 0.4407546818256378, + "learning_rate": 0.00028907373943456717, + "loss": 3.1981, + "step": 89000 + }, + { + "epoch": 25.925192262875786, + "eval_accuracy": 0.37512190505190995, + "eval_loss": 3.5322093963623047, + "eval_runtime": 179.7889, + "eval_samples_per_second": 92.581, + "eval_steps_per_second": 5.79, + "step": 89000 + }, + { + "epoch": 25.939757632253553, + "grad_norm": 0.42473137378692627, + "learning_rate": 0.0002888988633051588, + "loss": 3.1987, + "step": 89050 + }, + { + "epoch": 25.954323001631323, + "grad_norm": 0.42266032099723816, + "learning_rate": 0.0002887239871757505, + "loss": 3.2087, + "step": 89100 + }, + { + "epoch": 25.96888837100909, + "grad_norm": 0.43083688616752625, + "learning_rate": 0.00028854911104634213, + "loss": 3.1976, + "step": 89150 + }, + { + "epoch": 25.983453740386857, + "grad_norm": 0.4433348476886749, + "learning_rate": 0.0002883742349169338, + "loss": 3.2047, + "step": 89200 + }, + { + "epoch": 25.998019109764623, + "grad_norm": 0.40691253542900085, + "learning_rate": 0.0002881993587875255, + "loss": 3.2031, + "step": 89250 + }, + { + "epoch": 26.01252621766488, + "grad_norm": 0.470239520072937, + "learning_rate": 0.00028802448265811715, + "loss": 3.1109, + "step": 89300 + }, + { + "epoch": 26.027091587042648, + "grad_norm": 0.43022575974464417, + "learning_rate": 0.0002878496065287088, + "loss": 3.1127, + "step": 89350 + }, + { + "epoch": 26.041656956420415, + "grad_norm": 0.4232766926288605, + "learning_rate": 0.0002876747303993005, + "loss": 3.1152, + "step": 89400 + }, + { + "epoch": 26.05622232579818, + "grad_norm": 0.449580579996109, + "learning_rate": 0.00028749985426989216, + "loss": 3.1077, + "step": 89450 + }, + { + "epoch": 26.070787695175948, + "grad_norm": 0.46129152178764343, + "learning_rate": 0.0002873249781404838, + "loss": 3.1017, + "step": 
89500 + }, + { + "epoch": 26.08535306455372, + "grad_norm": 0.4587564468383789, + "learning_rate": 0.0002871501020110755, + "loss": 3.1181, + "step": 89550 + }, + { + "epoch": 26.099918433931485, + "grad_norm": 0.44854047894477844, + "learning_rate": 0.0002869752258816671, + "loss": 3.1179, + "step": 89600 + }, + { + "epoch": 26.114483803309252, + "grad_norm": 0.4396362006664276, + "learning_rate": 0.00028680034975225876, + "loss": 3.1191, + "step": 89650 + }, + { + "epoch": 26.12904917268702, + "grad_norm": 0.42307373881340027, + "learning_rate": 0.00028662547362285045, + "loss": 3.1332, + "step": 89700 + }, + { + "epoch": 26.143614542064785, + "grad_norm": 0.4569436311721802, + "learning_rate": 0.00028645059749344214, + "loss": 3.1235, + "step": 89750 + }, + { + "epoch": 26.158179911442556, + "grad_norm": 0.44630715250968933, + "learning_rate": 0.0002862757213640338, + "loss": 3.1303, + "step": 89800 + }, + { + "epoch": 26.172745280820322, + "grad_norm": 0.4254963994026184, + "learning_rate": 0.00028610084523462547, + "loss": 3.138, + "step": 89850 + }, + { + "epoch": 26.18731065019809, + "grad_norm": 0.4579327404499054, + "learning_rate": 0.0002859259691052171, + "loss": 3.1222, + "step": 89900 + }, + { + "epoch": 26.201876019575856, + "grad_norm": 0.4161672294139862, + "learning_rate": 0.0002857510929758088, + "loss": 3.1452, + "step": 89950 + }, + { + "epoch": 26.216441388953623, + "grad_norm": 0.4550322890281677, + "learning_rate": 0.00028557621684640043, + "loss": 3.1484, + "step": 90000 + }, + { + "epoch": 26.216441388953623, + "eval_accuracy": 0.37423554730739, + "eval_loss": 3.546004295349121, + "eval_runtime": 179.8845, + "eval_samples_per_second": 92.532, + "eval_steps_per_second": 5.787, + "step": 90000 + }, + { + "epoch": 26.23100675833139, + "grad_norm": 0.4519776403903961, + "learning_rate": 0.0002854013407169921, + "loss": 3.1474, + "step": 90050 + }, + { + "epoch": 26.24557212770916, + "grad_norm": 0.4274028539657593, + "learning_rate": 
0.00028522646458758376, + "loss": 3.1445, + "step": 90100 + }, + { + "epoch": 26.260137497086927, + "grad_norm": 0.4484902620315552, + "learning_rate": 0.00028505158845817545, + "loss": 3.1477, + "step": 90150 + }, + { + "epoch": 26.274702866464693, + "grad_norm": 0.4268806576728821, + "learning_rate": 0.0002848767123287671, + "loss": 3.144, + "step": 90200 + }, + { + "epoch": 26.28926823584246, + "grad_norm": 0.4616459310054779, + "learning_rate": 0.0002847018361993588, + "loss": 3.1451, + "step": 90250 + }, + { + "epoch": 26.303833605220227, + "grad_norm": 0.4314412772655487, + "learning_rate": 0.0002845269600699504, + "loss": 3.1629, + "step": 90300 + }, + { + "epoch": 26.318398974597997, + "grad_norm": 0.4529995918273926, + "learning_rate": 0.0002843520839405421, + "loss": 3.1517, + "step": 90350 + }, + { + "epoch": 26.332964343975764, + "grad_norm": 0.4490029811859131, + "learning_rate": 0.0002841772078111338, + "loss": 3.1627, + "step": 90400 + }, + { + "epoch": 26.34752971335353, + "grad_norm": 0.46888235211372375, + "learning_rate": 0.00028400233168172543, + "loss": 3.1591, + "step": 90450 + }, + { + "epoch": 26.362095082731297, + "grad_norm": 0.43123456835746765, + "learning_rate": 0.00028382745555231707, + "loss": 3.1538, + "step": 90500 + }, + { + "epoch": 26.376660452109064, + "grad_norm": 0.4406403601169586, + "learning_rate": 0.00028365257942290876, + "loss": 3.1637, + "step": 90550 + }, + { + "epoch": 26.391225821486834, + "grad_norm": 0.44110873341560364, + "learning_rate": 0.0002834777032935004, + "loss": 3.1546, + "step": 90600 + }, + { + "epoch": 26.4057911908646, + "grad_norm": 0.45763951539993286, + "learning_rate": 0.0002833028271640921, + "loss": 3.167, + "step": 90650 + }, + { + "epoch": 26.420356560242368, + "grad_norm": 0.4703918695449829, + "learning_rate": 0.00028312795103468377, + "loss": 3.1606, + "step": 90700 + }, + { + "epoch": 26.434921929620135, + "grad_norm": 0.4339727759361267, + "learning_rate": 0.0002829530749052754, + "loss": 
3.1597, + "step": 90750 + }, + { + "epoch": 26.4494872989979, + "grad_norm": 0.4744960367679596, + "learning_rate": 0.00028277819877586705, + "loss": 3.1638, + "step": 90800 + }, + { + "epoch": 26.46405266837567, + "grad_norm": 0.4523671269416809, + "learning_rate": 0.00028260332264645874, + "loss": 3.156, + "step": 90850 + }, + { + "epoch": 26.47861803775344, + "grad_norm": 0.4665144383907318, + "learning_rate": 0.0002824284465170504, + "loss": 3.1772, + "step": 90900 + }, + { + "epoch": 26.493183407131205, + "grad_norm": 0.5128541588783264, + "learning_rate": 0.00028225357038764206, + "loss": 3.167, + "step": 90950 + }, + { + "epoch": 26.507748776508972, + "grad_norm": 0.4322499930858612, + "learning_rate": 0.00028207869425823375, + "loss": 3.1698, + "step": 91000 + }, + { + "epoch": 26.507748776508972, + "eval_accuracy": 0.3743130257201289, + "eval_loss": 3.545729875564575, + "eval_runtime": 179.6993, + "eval_samples_per_second": 92.627, + "eval_steps_per_second": 5.793, + "step": 91000 + }, + { + "epoch": 26.52231414588674, + "grad_norm": 0.4499381482601166, + "learning_rate": 0.0002819038181288254, + "loss": 3.1626, + "step": 91050 + }, + { + "epoch": 26.536879515264506, + "grad_norm": 0.4228927195072174, + "learning_rate": 0.000281728941999417, + "loss": 3.161, + "step": 91100 + }, + { + "epoch": 26.551444884642276, + "grad_norm": 0.44955986738204956, + "learning_rate": 0.0002815540658700087, + "loss": 3.18, + "step": 91150 + }, + { + "epoch": 26.566010254020043, + "grad_norm": 0.4338165521621704, + "learning_rate": 0.0002813791897406004, + "loss": 3.1676, + "step": 91200 + }, + { + "epoch": 26.58057562339781, + "grad_norm": 0.4299832880496979, + "learning_rate": 0.00028120431361119204, + "loss": 3.1764, + "step": 91250 + }, + { + "epoch": 26.595140992775576, + "grad_norm": 0.4154253304004669, + "learning_rate": 0.00028102943748178373, + "loss": 3.1798, + "step": 91300 + }, + { + "epoch": 26.609706362153343, + "grad_norm": 0.44844770431518555, + 
"learning_rate": 0.00028085456135237537, + "loss": 3.1812, + "step": 91350 + }, + { + "epoch": 26.624271731531113, + "grad_norm": 0.45521214604377747, + "learning_rate": 0.00028067968522296706, + "loss": 3.1815, + "step": 91400 + }, + { + "epoch": 26.63883710090888, + "grad_norm": 0.41416242718696594, + "learning_rate": 0.0002805048090935587, + "loss": 3.1811, + "step": 91450 + }, + { + "epoch": 26.653402470286647, + "grad_norm": 0.4366213381290436, + "learning_rate": 0.0002803299329641504, + "loss": 3.1767, + "step": 91500 + }, + { + "epoch": 26.667967839664414, + "grad_norm": 0.4341444969177246, + "learning_rate": 0.000280155056834742, + "loss": 3.191, + "step": 91550 + }, + { + "epoch": 26.68253320904218, + "grad_norm": 0.4299844205379486, + "learning_rate": 0.0002799801807053337, + "loss": 3.1736, + "step": 91600 + }, + { + "epoch": 26.697098578419947, + "grad_norm": 0.4490987956523895, + "learning_rate": 0.00027980530457592535, + "loss": 3.1789, + "step": 91650 + }, + { + "epoch": 26.711663947797717, + "grad_norm": 0.4265900254249573, + "learning_rate": 0.00027963042844651704, + "loss": 3.169, + "step": 91700 + }, + { + "epoch": 26.726229317175484, + "grad_norm": 0.4504244923591614, + "learning_rate": 0.0002794555523171087, + "loss": 3.19, + "step": 91750 + }, + { + "epoch": 26.74079468655325, + "grad_norm": 0.441986620426178, + "learning_rate": 0.00027928067618770037, + "loss": 3.1873, + "step": 91800 + }, + { + "epoch": 26.755360055931018, + "grad_norm": 0.46930548548698425, + "learning_rate": 0.00027910580005829206, + "loss": 3.1911, + "step": 91850 + }, + { + "epoch": 26.769925425308784, + "grad_norm": 0.4317058026790619, + "learning_rate": 0.0002789309239288837, + "loss": 3.1826, + "step": 91900 + }, + { + "epoch": 26.784490794686555, + "grad_norm": 0.4412967562675476, + "learning_rate": 0.00027875604779947533, + "loss": 3.186, + "step": 91950 + }, + { + "epoch": 26.79905616406432, + "grad_norm": 0.4440974295139313, + "learning_rate": 
0.000278581171670067, + "loss": 3.1977, + "step": 92000 + }, + { + "epoch": 26.79905616406432, + "eval_accuracy": 0.3749948122382973, + "eval_loss": 3.535423994064331, + "eval_runtime": 179.7826, + "eval_samples_per_second": 92.584, + "eval_steps_per_second": 5.79, + "step": 92000 + }, + { + "epoch": 26.813621533442088, + "grad_norm": 0.4377935826778412, + "learning_rate": 0.00027840629554065865, + "loss": 3.1886, + "step": 92050 + }, + { + "epoch": 26.828186902819855, + "grad_norm": 0.4575270414352417, + "learning_rate": 0.00027823141941125034, + "loss": 3.19, + "step": 92100 + }, + { + "epoch": 26.84275227219762, + "grad_norm": 0.45877814292907715, + "learning_rate": 0.00027805654328184204, + "loss": 3.191, + "step": 92150 + }, + { + "epoch": 26.857317641575392, + "grad_norm": 0.4479317367076874, + "learning_rate": 0.00027788166715243367, + "loss": 3.2019, + "step": 92200 + }, + { + "epoch": 26.87188301095316, + "grad_norm": 0.42838388681411743, + "learning_rate": 0.0002777067910230253, + "loss": 3.1834, + "step": 92250 + }, + { + "epoch": 26.886448380330926, + "grad_norm": 0.4574011564254761, + "learning_rate": 0.000277531914893617, + "loss": 3.1953, + "step": 92300 + }, + { + "epoch": 26.901013749708692, + "grad_norm": 0.43894481658935547, + "learning_rate": 0.00027735703876420863, + "loss": 3.1959, + "step": 92350 + }, + { + "epoch": 26.91557911908646, + "grad_norm": 0.42568439245224, + "learning_rate": 0.0002771821626348003, + "loss": 3.1886, + "step": 92400 + }, + { + "epoch": 26.930144488464226, + "grad_norm": 0.4158543348312378, + "learning_rate": 0.000277007286505392, + "loss": 3.1995, + "step": 92450 + }, + { + "epoch": 26.944709857841996, + "grad_norm": 0.4558238685131073, + "learning_rate": 0.00027683241037598365, + "loss": 3.1805, + "step": 92500 + }, + { + "epoch": 26.959275227219763, + "grad_norm": 0.4516817331314087, + "learning_rate": 0.0002766575342465753, + "loss": 3.1962, + "step": 92550 + }, + { + "epoch": 26.97384059659753, + "grad_norm": 
0.42139333486557007, + "learning_rate": 0.000276482658117167, + "loss": 3.1903, + "step": 92600 + }, + { + "epoch": 26.988405965975296, + "grad_norm": 0.40730905532836914, + "learning_rate": 0.00027630778198775867, + "loss": 3.1924, + "step": 92650 + }, + { + "epoch": 27.002913073875554, + "grad_norm": 0.4469035863876343, + "learning_rate": 0.0002761329058583503, + "loss": 3.1777, + "step": 92700 + }, + { + "epoch": 27.01747844325332, + "grad_norm": 0.5051560997962952, + "learning_rate": 0.000275958029728942, + "loss": 3.0968, + "step": 92750 + }, + { + "epoch": 27.032043812631088, + "grad_norm": 0.4543512165546417, + "learning_rate": 0.00027578315359953363, + "loss": 3.0962, + "step": 92800 + }, + { + "epoch": 27.046609182008854, + "grad_norm": 0.4353955388069153, + "learning_rate": 0.00027560827747012527, + "loss": 3.1153, + "step": 92850 + }, + { + "epoch": 27.061174551386625, + "grad_norm": 0.48911988735198975, + "learning_rate": 0.00027543340134071696, + "loss": 3.1041, + "step": 92900 + }, + { + "epoch": 27.07573992076439, + "grad_norm": 0.4155479371547699, + "learning_rate": 0.00027525852521130865, + "loss": 3.0933, + "step": 92950 + }, + { + "epoch": 27.090305290142158, + "grad_norm": 0.43685251474380493, + "learning_rate": 0.0002750836490819003, + "loss": 3.1125, + "step": 93000 + }, + { + "epoch": 27.090305290142158, + "eval_accuracy": 0.3741177424977324, + "eval_loss": 3.550111770629883, + "eval_runtime": 179.8551, + "eval_samples_per_second": 92.547, + "eval_steps_per_second": 5.788, + "step": 93000 + }, + { + "epoch": 27.104870659519925, + "grad_norm": 0.4509027600288391, + "learning_rate": 0.000274908772952492, + "loss": 3.1063, + "step": 93050 + }, + { + "epoch": 27.11943602889769, + "grad_norm": 0.4444672465324402, + "learning_rate": 0.0002747338968230836, + "loss": 3.1204, + "step": 93100 + }, + { + "epoch": 27.134001398275462, + "grad_norm": 0.45079129934310913, + "learning_rate": 0.0002745590206936753, + "loss": 3.1147, + "step": 93150 + }, + { + 
"epoch": 27.14856676765323, + "grad_norm": 0.4966619610786438, + "learning_rate": 0.00027438414456426694, + "loss": 3.1341, + "step": 93200 + }, + { + "epoch": 27.163132137030995, + "grad_norm": 0.48015525937080383, + "learning_rate": 0.00027420926843485863, + "loss": 3.13, + "step": 93250 + }, + { + "epoch": 27.177697506408762, + "grad_norm": 0.483812540769577, + "learning_rate": 0.00027403439230545026, + "loss": 3.1373, + "step": 93300 + }, + { + "epoch": 27.19226287578653, + "grad_norm": 0.4629931151866913, + "learning_rate": 0.00027385951617604195, + "loss": 3.1216, + "step": 93350 + }, + { + "epoch": 27.206828245164296, + "grad_norm": 0.4691258668899536, + "learning_rate": 0.0002736846400466336, + "loss": 3.1361, + "step": 93400 + }, + { + "epoch": 27.221393614542066, + "grad_norm": 0.42496374249458313, + "learning_rate": 0.0002735097639172253, + "loss": 3.133, + "step": 93450 + }, + { + "epoch": 27.235958983919833, + "grad_norm": 0.46841350197792053, + "learning_rate": 0.0002733348877878169, + "loss": 3.1306, + "step": 93500 + }, + { + "epoch": 27.2505243532976, + "grad_norm": 0.43265077471733093, + "learning_rate": 0.0002731600116584086, + "loss": 3.1355, + "step": 93550 + }, + { + "epoch": 27.265089722675366, + "grad_norm": 0.47501933574676514, + "learning_rate": 0.0002729851355290003, + "loss": 3.1378, + "step": 93600 + }, + { + "epoch": 27.279655092053133, + "grad_norm": 0.469482958316803, + "learning_rate": 0.00027281025939959193, + "loss": 3.1349, + "step": 93650 + }, + { + "epoch": 27.294220461430903, + "grad_norm": 0.4316178262233734, + "learning_rate": 0.00027263538327018357, + "loss": 3.1486, + "step": 93700 + }, + { + "epoch": 27.30878583080867, + "grad_norm": 0.45734158158302307, + "learning_rate": 0.00027246050714077526, + "loss": 3.1382, + "step": 93750 + }, + { + "epoch": 27.323351200186437, + "grad_norm": 0.44604817032814026, + "learning_rate": 0.0002722856310113669, + "loss": 3.153, + "step": 93800 + }, + { + "epoch": 27.337916569564204, + 
"grad_norm": 0.4513714015483856, + "learning_rate": 0.0002721107548819586, + "loss": 3.1443, + "step": 93850 + }, + { + "epoch": 27.35248193894197, + "grad_norm": 0.4659077227115631, + "learning_rate": 0.0002719358787525503, + "loss": 3.135, + "step": 93900 + }, + { + "epoch": 27.36704730831974, + "grad_norm": 0.4294387698173523, + "learning_rate": 0.0002717610026231419, + "loss": 3.1445, + "step": 93950 + }, + { + "epoch": 27.381612677697508, + "grad_norm": 0.4388628900051117, + "learning_rate": 0.00027158612649373355, + "loss": 3.1546, + "step": 94000 + }, + { + "epoch": 27.381612677697508, + "eval_accuracy": 0.3744574012752782, + "eval_loss": 3.5467703342437744, + "eval_runtime": 179.9446, + "eval_samples_per_second": 92.501, + "eval_steps_per_second": 5.785, + "step": 94000 + }, + { + "epoch": 27.396178047075274, + "grad_norm": 0.4699251353740692, + "learning_rate": 0.00027141125036432524, + "loss": 3.1437, + "step": 94050 + }, + { + "epoch": 27.41074341645304, + "grad_norm": 0.44644948840141296, + "learning_rate": 0.00027123637423491693, + "loss": 3.1516, + "step": 94100 + }, + { + "epoch": 27.425308785830808, + "grad_norm": 0.4366833567619324, + "learning_rate": 0.00027106149810550857, + "loss": 3.157, + "step": 94150 + }, + { + "epoch": 27.439874155208575, + "grad_norm": 0.47586560249328613, + "learning_rate": 0.00027088662197610026, + "loss": 3.158, + "step": 94200 + }, + { + "epoch": 27.454439524586345, + "grad_norm": 0.449791818857193, + "learning_rate": 0.0002707117458466919, + "loss": 3.1498, + "step": 94250 + }, + { + "epoch": 27.46900489396411, + "grad_norm": 0.46725863218307495, + "learning_rate": 0.00027053686971728353, + "loss": 3.1569, + "step": 94300 + }, + { + "epoch": 27.48357026334188, + "grad_norm": 0.44953715801239014, + "learning_rate": 0.0002703619935878752, + "loss": 3.1548, + "step": 94350 + }, + { + "epoch": 27.498135632719645, + "grad_norm": 0.4506903290748596, + "learning_rate": 0.0002701871174584669, + "loss": 3.1679, + "step": 94400 
+ }, + { + "epoch": 27.512701002097412, + "grad_norm": 0.44679129123687744, + "learning_rate": 0.00027001224132905855, + "loss": 3.1668, + "step": 94450 + }, + { + "epoch": 27.527266371475182, + "grad_norm": 0.43930813670158386, + "learning_rate": 0.00026983736519965024, + "loss": 3.1537, + "step": 94500 + }, + { + "epoch": 27.54183174085295, + "grad_norm": 0.43353012204170227, + "learning_rate": 0.0002696624890702419, + "loss": 3.1702, + "step": 94550 + }, + { + "epoch": 27.556397110230716, + "grad_norm": 0.491400808095932, + "learning_rate": 0.00026948761294083356, + "loss": 3.1703, + "step": 94600 + }, + { + "epoch": 27.570962479608482, + "grad_norm": 0.4375465214252472, + "learning_rate": 0.0002693127368114252, + "loss": 3.1697, + "step": 94650 + }, + { + "epoch": 27.58552784898625, + "grad_norm": 0.42356833815574646, + "learning_rate": 0.0002691378606820169, + "loss": 3.1576, + "step": 94700 + }, + { + "epoch": 27.600093218364016, + "grad_norm": 0.4304982125759125, + "learning_rate": 0.0002689629845526085, + "loss": 3.1705, + "step": 94750 + }, + { + "epoch": 27.614658587741786, + "grad_norm": 0.4731658399105072, + "learning_rate": 0.0002687881084232002, + "loss": 3.1662, + "step": 94800 + }, + { + "epoch": 27.629223957119553, + "grad_norm": 0.44949278235435486, + "learning_rate": 0.00026861323229379185, + "loss": 3.165, + "step": 94850 + }, + { + "epoch": 27.64378932649732, + "grad_norm": 0.4739612638950348, + "learning_rate": 0.00026843835616438354, + "loss": 3.1635, + "step": 94900 + }, + { + "epoch": 27.658354695875087, + "grad_norm": 0.48769617080688477, + "learning_rate": 0.0002682634800349752, + "loss": 3.1613, + "step": 94950 + }, + { + "epoch": 27.672920065252853, + "grad_norm": 0.4974506199359894, + "learning_rate": 0.00026808860390556687, + "loss": 3.1704, + "step": 95000 + }, + { + "epoch": 27.672920065252853, + "eval_accuracy": 0.37493602740313875, + "eval_loss": 3.5381674766540527, + "eval_runtime": 179.802, + "eval_samples_per_second": 92.574, + 
"eval_steps_per_second": 5.79, + "step": 95000 + }, + { + "epoch": 27.687485434630624, + "grad_norm": 0.4498634934425354, + "learning_rate": 0.00026791372777615856, + "loss": 3.1687, + "step": 95050 + }, + { + "epoch": 27.70205080400839, + "grad_norm": 0.4463023841381073, + "learning_rate": 0.0002677388516467502, + "loss": 3.1677, + "step": 95100 + }, + { + "epoch": 27.716616173386157, + "grad_norm": 0.47234046459198, + "learning_rate": 0.00026756397551734183, + "loss": 3.1763, + "step": 95150 + }, + { + "epoch": 27.731181542763924, + "grad_norm": 0.4431990385055542, + "learning_rate": 0.0002673890993879335, + "loss": 3.1777, + "step": 95200 + }, + { + "epoch": 27.74574691214169, + "grad_norm": 0.43977609276771545, + "learning_rate": 0.00026721422325852516, + "loss": 3.1732, + "step": 95250 + }, + { + "epoch": 27.76031228151946, + "grad_norm": 0.43646240234375, + "learning_rate": 0.00026703934712911685, + "loss": 3.1871, + "step": 95300 + }, + { + "epoch": 27.774877650897228, + "grad_norm": 0.45789384841918945, + "learning_rate": 0.00026686447099970854, + "loss": 3.1791, + "step": 95350 + }, + { + "epoch": 27.789443020274994, + "grad_norm": 0.45078974962234497, + "learning_rate": 0.0002666895948703002, + "loss": 3.1786, + "step": 95400 + }, + { + "epoch": 27.80400838965276, + "grad_norm": 0.4329206347465515, + "learning_rate": 0.0002665147187408918, + "loss": 3.1785, + "step": 95450 + }, + { + "epoch": 27.818573759030528, + "grad_norm": 0.4348940849304199, + "learning_rate": 0.0002663398426114835, + "loss": 3.1867, + "step": 95500 + }, + { + "epoch": 27.833139128408295, + "grad_norm": 0.45594125986099243, + "learning_rate": 0.0002661649664820752, + "loss": 3.187, + "step": 95550 + }, + { + "epoch": 27.847704497786065, + "grad_norm": 0.4796610176563263, + "learning_rate": 0.00026599009035266683, + "loss": 3.1756, + "step": 95600 + }, + { + "epoch": 27.862269867163832, + "grad_norm": 0.4777950644493103, + "learning_rate": 0.0002658152142232585, + "loss": 3.1781, + 
"step": 95650 + }, + { + "epoch": 27.8768352365416, + "grad_norm": 0.46176958084106445, + "learning_rate": 0.00026564033809385016, + "loss": 3.1788, + "step": 95700 + }, + { + "epoch": 27.891400605919365, + "grad_norm": 0.434174120426178, + "learning_rate": 0.0002654654619644418, + "loss": 3.1862, + "step": 95750 + }, + { + "epoch": 27.905965975297132, + "grad_norm": 0.44971075654029846, + "learning_rate": 0.0002652905858350335, + "loss": 3.1892, + "step": 95800 + }, + { + "epoch": 27.920531344674902, + "grad_norm": 0.43806204199790955, + "learning_rate": 0.00026511570970562517, + "loss": 3.1733, + "step": 95850 + }, + { + "epoch": 27.93509671405267, + "grad_norm": 0.4868404269218445, + "learning_rate": 0.0002649408335762168, + "loss": 3.1825, + "step": 95900 + }, + { + "epoch": 27.949662083430436, + "grad_norm": 0.45844024419784546, + "learning_rate": 0.0002647659574468085, + "loss": 3.1906, + "step": 95950 + }, + { + "epoch": 27.964227452808203, + "grad_norm": 0.424188494682312, + "learning_rate": 0.0002645910813174002, + "loss": 3.1875, + "step": 96000 + }, + { + "epoch": 27.964227452808203, + "eval_accuracy": 0.3754160643670431, + "eval_loss": 3.5307044982910156, + "eval_runtime": 179.9575, + "eval_samples_per_second": 92.494, + "eval_steps_per_second": 5.785, + "step": 96000 + }, + { + "epoch": 27.97879282218597, + "grad_norm": 0.46329542994499207, + "learning_rate": 0.0002644162051879918, + "loss": 3.1886, + "step": 96050 + }, + { + "epoch": 27.99335819156374, + "grad_norm": 0.4756506681442261, + "learning_rate": 0.00026424132905858346, + "loss": 3.1786, + "step": 96100 + }, + { + "epoch": 28.007865299463994, + "grad_norm": 0.44213542342185974, + "learning_rate": 0.00026406645292917515, + "loss": 3.1369, + "step": 96150 + }, + { + "epoch": 28.02243066884176, + "grad_norm": 0.4677712917327881, + "learning_rate": 0.0002638915767997668, + "loss": 3.0854, + "step": 96200 + }, + { + "epoch": 28.03699603821953, + "grad_norm": 0.4562908709049225, + "learning_rate": 
0.0002637167006703585, + "loss": 3.0957, + "step": 96250 + }, + { + "epoch": 28.051561407597298, + "grad_norm": 0.4334988594055176, + "learning_rate": 0.00026354182454095017, + "loss": 3.0935, + "step": 96300 + }, + { + "epoch": 28.066126776975064, + "grad_norm": 0.45454612374305725, + "learning_rate": 0.0002633669484115418, + "loss": 3.0987, + "step": 96350 + }, + { + "epoch": 28.08069214635283, + "grad_norm": 0.4465283453464508, + "learning_rate": 0.00026319207228213344, + "loss": 3.1039, + "step": 96400 + }, + { + "epoch": 28.095257515730598, + "grad_norm": 0.5014046430587769, + "learning_rate": 0.00026301719615272513, + "loss": 3.1036, + "step": 96450 + }, + { + "epoch": 28.109822885108365, + "grad_norm": 0.45270198583602905, + "learning_rate": 0.0002628423200233168, + "loss": 3.1143, + "step": 96500 + }, + { + "epoch": 28.124388254486135, + "grad_norm": 0.46294882893562317, + "learning_rate": 0.00026266744389390846, + "loss": 3.1056, + "step": 96550 + }, + { + "epoch": 28.1389536238639, + "grad_norm": 0.4628591239452362, + "learning_rate": 0.00026249256776450015, + "loss": 3.1132, + "step": 96600 + }, + { + "epoch": 28.15351899324167, + "grad_norm": 0.4873234033584595, + "learning_rate": 0.0002623176916350918, + "loss": 3.1179, + "step": 96650 + }, + { + "epoch": 28.168084362619435, + "grad_norm": 0.44500336050987244, + "learning_rate": 0.0002621428155056834, + "loss": 3.1162, + "step": 96700 + }, + { + "epoch": 28.182649731997202, + "grad_norm": 0.43908581137657166, + "learning_rate": 0.0002619679393762751, + "loss": 3.1182, + "step": 96750 + }, + { + "epoch": 28.197215101374972, + "grad_norm": 0.4487292170524597, + "learning_rate": 0.0002617930632468668, + "loss": 3.1227, + "step": 96800 + }, + { + "epoch": 28.21178047075274, + "grad_norm": 0.4873857796192169, + "learning_rate": 0.00026161818711745844, + "loss": 3.1296, + "step": 96850 + }, + { + "epoch": 28.226345840130506, + "grad_norm": 0.4577917754650116, + "learning_rate": 0.0002614433109880501, + 
"loss": 3.1267, + "step": 96900 + }, + { + "epoch": 28.240911209508273, + "grad_norm": 0.44587984681129456, + "learning_rate": 0.00026126843485864176, + "loss": 3.142, + "step": 96950 + }, + { + "epoch": 28.25547657888604, + "grad_norm": 0.44689640402793884, + "learning_rate": 0.00026109355872923345, + "loss": 3.1274, + "step": 97000 + }, + { + "epoch": 28.25547657888604, + "eval_accuracy": 0.3748532583552356, + "eval_loss": 3.5460498332977295, + "eval_runtime": 180.0191, + "eval_samples_per_second": 92.462, + "eval_steps_per_second": 5.783, + "step": 97000 + }, + { + "epoch": 28.27004194826381, + "grad_norm": 0.461459755897522, + "learning_rate": 0.0002609186825998251, + "loss": 3.1274, + "step": 97050 + }, + { + "epoch": 28.284607317641576, + "grad_norm": 0.4559226632118225, + "learning_rate": 0.0002607438064704168, + "loss": 3.1259, + "step": 97100 + }, + { + "epoch": 28.299172687019343, + "grad_norm": 0.4652230739593506, + "learning_rate": 0.0002605689303410084, + "loss": 3.1397, + "step": 97150 + }, + { + "epoch": 28.31373805639711, + "grad_norm": 0.4658327102661133, + "learning_rate": 0.00026039405421160005, + "loss": 3.1371, + "step": 97200 + }, + { + "epoch": 28.328303425774877, + "grad_norm": 0.44982901215553284, + "learning_rate": 0.00026021917808219174, + "loss": 3.1404, + "step": 97250 + }, + { + "epoch": 28.342868795152643, + "grad_norm": 0.45533254742622375, + "learning_rate": 0.00026004430195278343, + "loss": 3.1315, + "step": 97300 + }, + { + "epoch": 28.357434164530414, + "grad_norm": 0.4789443910121918, + "learning_rate": 0.00025986942582337507, + "loss": 3.1545, + "step": 97350 + }, + { + "epoch": 28.37199953390818, + "grad_norm": 0.4561361074447632, + "learning_rate": 0.00025969454969396676, + "loss": 3.1378, + "step": 97400 + }, + { + "epoch": 28.386564903285947, + "grad_norm": 0.48362991213798523, + "learning_rate": 0.00025951967356455845, + "loss": 3.1287, + "step": 97450 + }, + { + "epoch": 28.401130272663714, + "grad_norm": 
0.4448145925998688, + "learning_rate": 0.0002593447974351501, + "loss": 3.1475, + "step": 97500 + }, + { + "epoch": 28.41569564204148, + "grad_norm": 0.4719396233558655, + "learning_rate": 0.0002591699213057417, + "loss": 3.1403, + "step": 97550 + }, + { + "epoch": 28.43026101141925, + "grad_norm": 0.4635832905769348, + "learning_rate": 0.0002589950451763334, + "loss": 3.1464, + "step": 97600 + }, + { + "epoch": 28.444826380797018, + "grad_norm": 0.44658058881759644, + "learning_rate": 0.00025882016904692505, + "loss": 3.1514, + "step": 97650 + }, + { + "epoch": 28.459391750174785, + "grad_norm": 0.4418274462223053, + "learning_rate": 0.00025864529291751674, + "loss": 3.1456, + "step": 97700 + }, + { + "epoch": 28.47395711955255, + "grad_norm": 0.4710671901702881, + "learning_rate": 0.00025847041678810843, + "loss": 3.1439, + "step": 97750 + }, + { + "epoch": 28.488522488930318, + "grad_norm": 0.4393328130245209, + "learning_rate": 0.00025829554065870007, + "loss": 3.1564, + "step": 97800 + }, + { + "epoch": 28.503087858308085, + "grad_norm": 0.4646104574203491, + "learning_rate": 0.0002581206645292917, + "loss": 3.1444, + "step": 97850 + }, + { + "epoch": 28.517653227685855, + "grad_norm": 0.46661272644996643, + "learning_rate": 0.0002579457883998834, + "loss": 3.1503, + "step": 97900 + }, + { + "epoch": 28.532218597063622, + "grad_norm": 0.4600259065628052, + "learning_rate": 0.0002577709122704751, + "loss": 3.1562, + "step": 97950 + }, + { + "epoch": 28.54678396644139, + "grad_norm": 0.4362897574901581, + "learning_rate": 0.0002575960361410667, + "loss": 3.1492, + "step": 98000 + }, + { + "epoch": 28.54678396644139, + "eval_accuracy": 0.37473874549634684, + "eval_loss": 3.5448083877563477, + "eval_runtime": 179.7838, + "eval_samples_per_second": 92.583, + "eval_steps_per_second": 5.79, + "step": 98000 + }, + { + "epoch": 28.561349335819155, + "grad_norm": 0.44857919216156006, + "learning_rate": 0.0002574211600116584, + "loss": 3.157, + "step": 98050 + }, + { + 
"epoch": 28.575914705196922, + "grad_norm": 0.4593052268028259, + "learning_rate": 0.00025724628388225005, + "loss": 3.1505, + "step": 98100 + }, + { + "epoch": 28.590480074574693, + "grad_norm": 0.42843860387802124, + "learning_rate": 0.0002570714077528417, + "loss": 3.1613, + "step": 98150 + }, + { + "epoch": 28.60504544395246, + "grad_norm": 0.44706371426582336, + "learning_rate": 0.0002568965316234334, + "loss": 3.1625, + "step": 98200 + }, + { + "epoch": 28.619610813330226, + "grad_norm": 0.47429358959198, + "learning_rate": 0.00025672165549402506, + "loss": 3.1648, + "step": 98250 + }, + { + "epoch": 28.634176182707993, + "grad_norm": 0.46534454822540283, + "learning_rate": 0.0002565467793646167, + "loss": 3.1579, + "step": 98300 + }, + { + "epoch": 28.64874155208576, + "grad_norm": 0.4708279073238373, + "learning_rate": 0.0002563719032352084, + "loss": 3.1612, + "step": 98350 + }, + { + "epoch": 28.66330692146353, + "grad_norm": 0.4480709135532379, + "learning_rate": 0.0002561970271058, + "loss": 3.1623, + "step": 98400 + }, + { + "epoch": 28.677872290841297, + "grad_norm": 0.5100323557853699, + "learning_rate": 0.0002560221509763917, + "loss": 3.1586, + "step": 98450 + }, + { + "epoch": 28.692437660219063, + "grad_norm": 0.46166589856147766, + "learning_rate": 0.00025584727484698335, + "loss": 3.1564, + "step": 98500 + }, + { + "epoch": 28.70700302959683, + "grad_norm": 0.4602072238922119, + "learning_rate": 0.00025567239871757504, + "loss": 3.1748, + "step": 98550 + }, + { + "epoch": 28.721568398974597, + "grad_norm": 0.44640272855758667, + "learning_rate": 0.0002554975225881667, + "loss": 3.1635, + "step": 98600 + }, + { + "epoch": 28.736133768352367, + "grad_norm": 0.45697900652885437, + "learning_rate": 0.00025532264645875837, + "loss": 3.1582, + "step": 98650 + }, + { + "epoch": 28.750699137730134, + "grad_norm": 0.45967525243759155, + "learning_rate": 0.00025514777032935, + "loss": 3.1723, + "step": 98700 + }, + { + "epoch": 28.7652645071079, + 
"grad_norm": 0.44456747174263, + "learning_rate": 0.0002549728941999417, + "loss": 3.1739, + "step": 98750 + }, + { + "epoch": 28.779829876485667, + "grad_norm": 0.449693888425827, + "learning_rate": 0.00025479801807053333, + "loss": 3.1562, + "step": 98800 + }, + { + "epoch": 28.794395245863434, + "grad_norm": 0.4497344195842743, + "learning_rate": 0.000254623141941125, + "loss": 3.1548, + "step": 98850 + }, + { + "epoch": 28.8089606152412, + "grad_norm": 0.45112860202789307, + "learning_rate": 0.0002544482658117167, + "loss": 3.1702, + "step": 98900 + }, + { + "epoch": 28.82352598461897, + "grad_norm": 0.4688662886619568, + "learning_rate": 0.00025427338968230835, + "loss": 3.1639, + "step": 98950 + }, + { + "epoch": 28.838091353996738, + "grad_norm": 0.4467281103134155, + "learning_rate": 0.0002540985135529, + "loss": 3.1687, + "step": 99000 + }, + { + "epoch": 28.838091353996738, + "eval_accuracy": 0.37525193710728055, + "eval_loss": 3.536275863647461, + "eval_runtime": 179.7842, + "eval_samples_per_second": 92.583, + "eval_steps_per_second": 5.79, + "step": 99000 + }, + { + "epoch": 28.852656723374505, + "grad_norm": 0.5201435685157776, + "learning_rate": 0.0002539236374234917, + "loss": 3.182, + "step": 99050 + }, + { + "epoch": 28.86722209275227, + "grad_norm": 0.48700618743896484, + "learning_rate": 0.0002537487612940833, + "loss": 3.1711, + "step": 99100 + }, + { + "epoch": 28.88178746213004, + "grad_norm": 0.427386611700058, + "learning_rate": 0.000253573885164675, + "loss": 3.1761, + "step": 99150 + }, + { + "epoch": 28.89635283150781, + "grad_norm": 0.4574876129627228, + "learning_rate": 0.0002533990090352667, + "loss": 3.1807, + "step": 99200 + }, + { + "epoch": 28.910918200885575, + "grad_norm": 0.4709291458129883, + "learning_rate": 0.00025322413290585833, + "loss": 3.1662, + "step": 99250 + }, + { + "epoch": 28.925483570263342, + "grad_norm": 0.5189015865325928, + "learning_rate": 0.00025304925677644997, + "loss": 3.1808, + "step": 99300 + }, + { + 
"epoch": 28.94004893964111, + "grad_norm": 0.4347054064273834, + "learning_rate": 0.00025287438064704166, + "loss": 3.1713, + "step": 99350 + }, + { + "epoch": 28.954614309018876, + "grad_norm": 0.4408940076828003, + "learning_rate": 0.00025269950451763335, + "loss": 3.1781, + "step": 99400 + }, + { + "epoch": 28.969179678396642, + "grad_norm": 0.44796082377433777, + "learning_rate": 0.000252524628388225, + "loss": 3.1815, + "step": 99450 + }, + { + "epoch": 28.983745047774413, + "grad_norm": 0.4597788453102112, + "learning_rate": 0.0002523497522588167, + "loss": 3.1787, + "step": 99500 + }, + { + "epoch": 28.99831041715218, + "grad_norm": 0.4454861581325531, + "learning_rate": 0.0002521748761294083, + "loss": 3.1821, + "step": 99550 + }, + { + "epoch": 29.012817525052434, + "grad_norm": 0.46412530541419983, + "learning_rate": 0.00025199999999999995, + "loss": 3.103, + "step": 99600 + }, + { + "epoch": 29.027382894430204, + "grad_norm": 0.46690133213996887, + "learning_rate": 0.00025182512387059164, + "loss": 3.0811, + "step": 99650 + }, + { + "epoch": 29.04194826380797, + "grad_norm": 0.4453943371772766, + "learning_rate": 0.0002516502477411833, + "loss": 3.0869, + "step": 99700 + }, + { + "epoch": 29.056513633185737, + "grad_norm": 0.45379212498664856, + "learning_rate": 0.00025147537161177496, + "loss": 3.1016, + "step": 99750 + }, + { + "epoch": 29.071079002563504, + "grad_norm": 0.4479667544364929, + "learning_rate": 0.00025130049548236665, + "loss": 3.0876, + "step": 99800 + }, + { + "epoch": 29.08564437194127, + "grad_norm": 0.44696569442749023, + "learning_rate": 0.0002511256193529583, + "loss": 3.0893, + "step": 99850 + }, + { + "epoch": 29.10020974131904, + "grad_norm": 0.4410049617290497, + "learning_rate": 0.00025095074322355, + "loss": 3.0996, + "step": 99900 + }, + { + "epoch": 29.114775110696808, + "grad_norm": 0.501434326171875, + "learning_rate": 0.0002507758670941416, + "loss": 3.0975, + "step": 99950 + }, + { + "epoch": 29.129340480074575, + 
"grad_norm": 0.4482145309448242, + "learning_rate": 0.0002506009909647333, + "loss": 3.0999, + "step": 100000 + }, + { + "epoch": 29.129340480074575, + "eval_accuracy": 0.37459178340845056, + "eval_loss": 3.550246000289917, + "eval_runtime": 180.3156, + "eval_samples_per_second": 92.31, + "eval_steps_per_second": 5.773, + "step": 100000 + } + ], + "logging_steps": 50, + "max_steps": 171650, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 20 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.090213187452928e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}