diff --git "a/last_to_hit_frequency_3591/checkpoint-30000/trainer_state.json" "b/last_to_hit_frequency_3591/checkpoint-30000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last_to_hit_frequency_3591/checkpoint-30000/trainer_state.json" @@ -0,0 +1,4513 @@ +{ + "best_global_step": 30000, + "best_metric": 3.574705123901367, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_hit_frequency_3591/checkpoint-30000", + "epoch": 8.738755534840363, + "eval_steps": 1000, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01456536937776742, + "grad_norm": 1.7487624883651733, + "learning_rate": 0.000294, + "loss": 8.4085, + "step": 50 + }, + { + "epoch": 0.02913073875553484, + "grad_norm": 0.8792372941970825, + "learning_rate": 0.0005939999999999999, + "loss": 6.7474, + "step": 100 + }, + { + "epoch": 0.04369610813330226, + "grad_norm": 0.49055325984954834, + "learning_rate": 0.0005998286213931798, + "loss": 6.369, + "step": 150 + }, + { + "epoch": 0.05826147751106968, + "grad_norm": 0.47406330704689026, + "learning_rate": 0.0005996537452637714, + "loss": 6.152, + "step": 200 + }, + { + "epoch": 0.0728268468888371, + "grad_norm": 0.5468786358833313, + "learning_rate": 0.0005994788691343632, + "loss": 6.0121, + "step": 250 + }, + { + "epoch": 0.08739221626660452, + "grad_norm": 0.5636491775512695, + "learning_rate": 0.0005993039930049548, + "loss": 5.8836, + "step": 300 + }, + { + "epoch": 0.10195758564437195, + "grad_norm": 0.47112441062927246, + "learning_rate": 0.0005991291168755465, + "loss": 5.7826, + "step": 350 + }, + { + "epoch": 0.11652295502213936, + "grad_norm": 0.614771842956543, + "learning_rate": 0.0005989542407461382, + "loss": 5.6472, + "step": 400 + }, + { + "epoch": 0.13108832439990678, + "grad_norm": 0.49487683176994324, + "learning_rate": 0.0005987793646167297, + "loss": 5.5373, + "step": 450 + }, + { + "epoch": 0.1456536937776742, + "grad_norm": 0.4133830964565277, + "learning_rate": 0.0005986044884873214, + "loss": 5.4464, + "step": 500 + }, + { + "epoch": 0.16021906315544163, + "grad_norm": 0.43713563680648804, + "learning_rate": 0.0005984296123579131, + "loss": 5.3584, + "step": 550 + }, + { + "epoch": 0.17478443253320905, + "grad_norm": 0.4758622348308563, + "learning_rate": 0.0005982547362285047, + "loss": 5.2639, + "step": 600 + }, + { + "epoch": 0.18934980191097647, + "grad_norm": 0.5036144852638245, + "learning_rate": 0.0005980798600990964, + "loss": 5.2065, + "step": 650 + }, + { + "epoch": 0.2039151712887439, + "grad_norm": 0.4153907597064972, + "learning_rate": 0.0005979049839696881, + "loss": 5.1452, + "step": 700 + }, + { + "epoch": 0.2184805406665113, + "grad_norm": 0.49257349967956543, + "learning_rate": 0.0005977301078402798, + "loss": 5.0858, + "step": 750 + }, + { + "epoch": 0.23304591004427871, + "grad_norm": 0.39646583795547485, + "learning_rate": 0.0005975552317108715, + "loss": 5.0535, + "step": 800 + }, + { + "epoch": 0.24761127942204614, + "grad_norm": 0.4354263246059418, + "learning_rate": 0.0005973803555814631, + "loss": 4.9802, + "step": 850 + }, + { + "epoch": 0.26217664879981356, + "grad_norm": 0.45780736207962036, + "learning_rate": 0.0005972054794520547, + "loss": 4.9416, + "step": 900 + }, + { + "epoch": 0.276742018177581, + "grad_norm": 0.4360142946243286, + "learning_rate": 0.0005970306033226464, + "loss": 4.8913, + "step": 950 + }, + { + "epoch": 0.2913073875553484, + "grad_norm": 0.49000921845436096, + "learning_rate": 0.0005968557271932381, + "loss": 4.8516, + "step": 1000 + }, + { + "epoch": 0.2913073875553484, + "eval_accuracy": 0.25072978433607523, + "eval_loss": 4.782108783721924, + "eval_runtime": 182.688, + "eval_samples_per_second": 91.112, + "eval_steps_per_second": 5.698, + "step": 1000 + }, + { + "epoch": 0.30587275693311583, + "grad_norm": 0.4772098958492279, + "learning_rate": 0.0005966808510638297, + "loss": 4.7909, + "step": 1050 + }, + { + "epoch": 0.32043812631088325, + "grad_norm": 0.4113984704017639, + "learning_rate": 0.0005965059749344214, + "loss": 4.7556, + "step": 1100 + }, + { + "epoch": 0.3350034956886507, + "grad_norm": 0.4781721830368042, + "learning_rate": 0.0005963310988050131, + "loss": 4.7197, + "step": 1150 + }, + { + "epoch": 0.3495688650664181, + "grad_norm": 0.5447441935539246, + "learning_rate": 0.0005961562226756047, + "loss": 4.6751, + "step": 1200 + }, + { + "epoch": 0.3641342344441855, + "grad_norm": 0.41747575998306274, + "learning_rate": 0.0005959813465461965, + "loss": 4.6392, + "step": 1250 + }, + { + "epoch": 0.37869960382195295, + "grad_norm": 0.44658181071281433, + "learning_rate": 0.000595806470416788, + "loss": 4.6204, + "step": 1300 + }, + { + "epoch": 0.39326497319972037, + "grad_norm": 0.4609386622905731, + "learning_rate": 0.0005956315942873797, + "loss": 4.5836, + "step": 1350 + }, + { + "epoch": 0.4078303425774878, + "grad_norm": 0.4060616195201874, + "learning_rate": 0.0005954567181579714, + "loss": 4.5615, + "step": 1400 + }, + { + "epoch": 0.42239571195525516, + "grad_norm": 0.4222695827484131, + "learning_rate": 0.000595281842028563, + "loss": 4.5376, + "step": 1450 + }, + { + "epoch": 0.4369610813330226, + "grad_norm": 0.4395248591899872, + "learning_rate": 0.0005951069658991547, + "loss": 4.5107, + "step": 1500 + }, + { + "epoch": 0.45152645071079, + "grad_norm": 0.4157288372516632, + "learning_rate": 0.0005949320897697464, + "loss": 4.5016, + "step": 1550 + }, + { + "epoch": 0.46609182008855743, + "grad_norm": 0.4212968349456787, + "learning_rate": 0.0005947572136403381, + "loss": 4.4616, + "step": 1600 + }, + { + "epoch": 0.48065718946632485, + "grad_norm": 0.49588635563850403, + "learning_rate": 0.0005945823375109297, + "loss": 4.4497, + "step": 1650 + }, + { + "epoch": 0.4952225588440923, + "grad_norm": 0.44670262932777405, + "learning_rate": 0.0005944074613815215, + "loss": 4.4344, + "step": 1700 + }, + { + "epoch": 0.5097879282218597, + "grad_norm": 0.36793598532676697, + "learning_rate": 0.000594232585252113, + "loss": 4.4265, + "step": 1750 + }, + { + "epoch": 0.5243532975996271, + "grad_norm": 0.4189833700656891, + "learning_rate": 0.0005940577091227047, + "loss": 4.4012, + "step": 1800 + }, + { + "epoch": 0.5389186669773945, + "grad_norm": 0.4342043995857239, + "learning_rate": 0.0005938828329932964, + "loss": 4.3671, + "step": 1850 + }, + { + "epoch": 0.553484036355162, + "grad_norm": 0.4385491609573364, + "learning_rate": 0.000593707956863888, + "loss": 4.3797, + "step": 1900 + }, + { + "epoch": 0.5680494057329294, + "grad_norm": 0.3876365125179291, + "learning_rate": 0.0005935330807344797, + "loss": 4.3617, + "step": 1950 + }, + { + "epoch": 0.5826147751106968, + "grad_norm": 0.41458868980407715, + "learning_rate": 0.0005933582046050714, + "loss": 4.342, + "step": 2000 + }, + { + "epoch": 0.5826147751106968, + "eval_accuracy": 0.2995414195009285, + "eval_loss": 4.284348011016846, + "eval_runtime": 182.6917, + "eval_samples_per_second": 91.11, + "eval_steps_per_second": 5.698, + "step": 2000 + }, + { + "epoch": 0.5971801444884642, + "grad_norm": 0.41804736852645874, + "learning_rate": 0.000593183328475663, + "loss": 4.3264, + "step": 2050 + }, + { + "epoch": 0.6117455138662317, + "grad_norm": 0.36194130778312683, + "learning_rate": 0.0005930084523462546, + "loss": 4.3039, + "step": 2100 + }, + { + "epoch": 0.6263108832439991, + "grad_norm": 0.3918125629425049, + "learning_rate": 0.0005928335762168463, + "loss": 4.2908, + "step": 2150 + }, + { + "epoch": 0.6408762526217665, + "grad_norm": 0.40795278549194336, + "learning_rate": 0.000592658700087438, + "loss": 4.3007, + "step": 2200 + }, + { + "epoch": 0.6554416219995339, + "grad_norm": 0.38976508378982544, + "learning_rate": 0.0005924838239580297, + "loss": 4.2836, + "step": 2250 + }, + { + "epoch": 0.6700069913773014, + "grad_norm": 0.40438467264175415, + "learning_rate": 0.0005923089478286214, + "loss": 4.2674, + "step": 2300 + }, + { + "epoch": 0.6845723607550688, + "grad_norm": 0.41360440850257874, + "learning_rate": 0.000592134071699213, + "loss": 4.2654, + "step": 2350 + }, + { + "epoch": 0.6991377301328362, + "grad_norm": 0.377794474363327, + "learning_rate": 0.0005919591955698047, + "loss": 4.247, + "step": 2400 + }, + { + "epoch": 0.7137030995106036, + "grad_norm": 0.391387015581131, + "learning_rate": 0.0005917843194403964, + "loss": 4.2377, + "step": 2450 + }, + { + "epoch": 0.728268468888371, + "grad_norm": 0.355221688747406, + "learning_rate": 0.000591609443310988, + "loss": 4.2396, + "step": 2500 + }, + { + "epoch": 0.7428338382661385, + "grad_norm": 0.4049054682254791, + "learning_rate": 0.0005914345671815796, + "loss": 4.225, + "step": 2550 + }, + { + "epoch": 0.7573992076439059, + "grad_norm": 0.3985639214515686, + "learning_rate": 0.0005912596910521713, + "loss": 4.204, + "step": 2600 + }, + { + "epoch": 0.7719645770216733, + "grad_norm": 0.3753962814807892, + "learning_rate": 0.0005910848149227629, + "loss": 4.207, + "step": 2650 + }, + { + "epoch": 0.7865299463994407, + "grad_norm": 0.3719504773616791, + "learning_rate": 0.0005909099387933547, + "loss": 4.1872, + "step": 2700 + }, + { + "epoch": 0.8010953157772082, + "grad_norm": 0.3820745348930359, + "learning_rate": 0.0005907350626639463, + "loss": 4.1728, + "step": 2750 + }, + { + "epoch": 0.8156606851549756, + "grad_norm": 0.39611145853996277, + "learning_rate": 0.000590560186534538, + "loss": 4.175, + "step": 2800 + }, + { + "epoch": 0.8302260545327429, + "grad_norm": 0.3846186697483063, + "learning_rate": 0.0005903853104051297, + "loss": 4.171, + "step": 2850 + }, + { + "epoch": 0.8447914239105103, + "grad_norm": 0.37356239557266235, + "learning_rate": 0.0005902104342757214, + "loss": 4.1733, + "step": 2900 + }, + { + "epoch": 0.8593567932882777, + "grad_norm": 0.3760608434677124, + "learning_rate": 0.000590035558146313, + "loss": 4.1446, + "step": 2950 + }, + { + "epoch": 0.8739221626660452, + "grad_norm": 0.36522382497787476, + "learning_rate": 0.0005898606820169046, + "loss": 4.1349, + "step": 3000 + }, + { + "epoch": 0.8739221626660452, + "eval_accuracy": 0.3161861104367184, + "eval_loss": 4.093681335449219, + "eval_runtime": 182.834, + "eval_samples_per_second": 91.039, + "eval_steps_per_second": 5.694, + "step": 3000 + }, + { + "epoch": 0.8884875320438126, + "grad_norm": 0.3807234764099121, + "learning_rate": 0.0005896858058874963, + "loss": 4.1488, + "step": 3050 + }, + { + "epoch": 0.90305290142158, + "grad_norm": 0.37441378831863403, + "learning_rate": 0.0005895109297580879, + "loss": 4.1366, + "step": 3100 + }, + { + "epoch": 0.9176182707993474, + "grad_norm": 0.3428540825843811, + "learning_rate": 0.0005893360536286797, + "loss": 4.1134, + "step": 3150 + }, + { + "epoch": 0.9321836401771149, + "grad_norm": 0.37319910526275635, + "learning_rate": 0.0005891611774992713, + "loss": 4.1224, + "step": 3200 + }, + { + "epoch": 0.9467490095548823, + "grad_norm": 0.3176666796207428, + "learning_rate": 0.000588986301369863, + "loss": 4.1011, + "step": 3250 + }, + { + "epoch": 0.9613143789326497, + "grad_norm": 0.33169302344322205, + "learning_rate": 0.0005888114252404547, + "loss": 4.109, + "step": 3300 + }, + { + "epoch": 0.9758797483104171, + "grad_norm": 0.37893936038017273, + "learning_rate": 0.0005886365491110463, + "loss": 4.0925, + "step": 3350 + }, + { + "epoch": 0.9904451176881846, + "grad_norm": 0.37214741110801697, + "learning_rate": 0.000588461672981638, + "loss": 4.0892, + "step": 3400 + }, + { + "epoch": 1.0049522255884409, + "grad_norm": 0.36634066700935364, + "learning_rate": 0.0005882867968522296, + "loss": 4.0616, + "step": 3450 + }, + { + "epoch": 1.0195175949662083, + "grad_norm": 0.35482263565063477, + "learning_rate": 0.0005881119207228212, + "loss": 4.0124, + "step": 3500 + }, + { + "epoch": 1.0340829643439757, + "grad_norm": 0.3779667913913727, + "learning_rate": 0.0005879370445934129, + "loss": 4.016, + "step": 3550 + }, + { + "epoch": 1.0486483337217432, + "grad_norm": 0.3397766947746277, + "learning_rate": 0.0005877621684640046, + "loss": 4.0033, + "step": 3600 + }, + { + "epoch": 1.0632137030995106, + "grad_norm": 0.3722945749759674, + "learning_rate": 0.0005875872923345963, + "loss": 4.0129, + "step": 3650 + }, + { + "epoch": 1.077779072477278, + "grad_norm": 0.34330493211746216, + "learning_rate": 0.000587412416205188, + "loss": 4.003, + "step": 3700 + }, + { + "epoch": 1.0923444418550454, + "grad_norm": 0.3765234649181366, + "learning_rate": 0.0005872375400757797, + "loss": 4.0107, + "step": 3750 + }, + { + "epoch": 1.1069098112328128, + "grad_norm": 0.35264670848846436, + "learning_rate": 0.0005870626639463713, + "loss": 4.0043, + "step": 3800 + }, + { + "epoch": 1.1214751806105803, + "grad_norm": 0.3567696213722229, + "learning_rate": 0.0005868877878169629, + "loss": 4.0016, + "step": 3850 + }, + { + "epoch": 1.1360405499883477, + "grad_norm": 0.37721458077430725, + "learning_rate": 0.0005867129116875546, + "loss": 4.0014, + "step": 3900 + }, + { + "epoch": 1.1506059193661151, + "grad_norm": 0.34036508202552795, + "learning_rate": 0.0005865380355581462, + "loss": 3.9731, + "step": 3950 + }, + { + "epoch": 1.1651712887438825, + "grad_norm": 0.33936476707458496, + "learning_rate": 0.0005863631594287379, + "loss": 3.9879, + "step": 4000 + }, + { + "epoch": 1.1651712887438825, + "eval_accuracy": 0.3259847194699489, + "eval_loss": 3.9844980239868164, + "eval_runtime": 182.8731, + "eval_samples_per_second": 91.019, + "eval_steps_per_second": 5.692, + "step": 4000 + }, + { + "epoch": 1.17973665812165, + "grad_norm": 0.35038861632347107, + "learning_rate": 0.0005861882832993296, + "loss": 3.9796, + "step": 4050 + }, + { + "epoch": 1.1943020274994174, + "grad_norm": 0.3499089777469635, + "learning_rate": 0.0005860134071699212, + "loss": 4.0012, + "step": 4100 + }, + { + "epoch": 1.2088673968771848, + "grad_norm": 0.35973456501960754, + "learning_rate": 0.000585838531040513, + "loss": 3.9843, + "step": 4150 + }, + { + "epoch": 1.2234327662549522, + "grad_norm": 0.364793598651886, + "learning_rate": 0.0005856636549111046, + "loss": 3.9738, + "step": 4200 + }, + { + "epoch": 1.2379981356327197, + "grad_norm": 0.3340921998023987, + "learning_rate": 0.0005854887787816963, + "loss": 3.9804, + "step": 4250 + }, + { + "epoch": 1.252563505010487, + "grad_norm": 0.33135804533958435, + "learning_rate": 0.0005853139026522879, + "loss": 3.9741, + "step": 4300 + }, + { + "epoch": 1.2671288743882545, + "grad_norm": 0.348160058259964, + "learning_rate": 0.0005851390265228796, + "loss": 3.9693, + "step": 4350 + }, + { + "epoch": 1.281694243766022, + "grad_norm": 0.3392653167247772, + "learning_rate": 0.0005849641503934712, + "loss": 3.9645, + "step": 4400 + }, + { + "epoch": 1.2962596131437893, + "grad_norm": 0.33709239959716797, + "learning_rate": 0.0005847892742640629, + "loss": 3.9534, + "step": 4450 + }, + { + "epoch": 1.3108249825215568, + "grad_norm": 0.3292732238769531, + "learning_rate": 0.0005846143981346546, + "loss": 3.9565, + "step": 4500 + }, + { + "epoch": 1.3253903518993242, + "grad_norm": 0.346902459859848, + "learning_rate": 0.0005844395220052462, + "loss": 3.945, + "step": 4550 + }, + { + "epoch": 1.3399557212770916, + "grad_norm": 0.3570997416973114, + "learning_rate": 0.000584264645875838, + "loss": 3.9458, + "step": 4600 + }, + { + "epoch": 1.354521090654859, + "grad_norm": 0.3431386351585388, + "learning_rate": 0.0005840897697464296, + "loss": 3.9343, + "step": 4650 + }, + { + "epoch": 1.3690864600326265, + "grad_norm": 0.32554003596305847, + "learning_rate": 0.0005839148936170212, + "loss": 3.9334, + "step": 4700 + }, + { + "epoch": 1.3836518294103939, + "grad_norm": 0.33119750022888184, + "learning_rate": 0.0005837400174876129, + "loss": 3.9402, + "step": 4750 + }, + { + "epoch": 1.3982171987881613, + "grad_norm": 0.32726046442985535, + "learning_rate": 0.0005835651413582045, + "loss": 3.9414, + "step": 4800 + }, + { + "epoch": 1.4127825681659287, + "grad_norm": 0.37814900279045105, + "learning_rate": 0.0005833902652287962, + "loss": 3.9381, + "step": 4850 + }, + { + "epoch": 1.4273479375436962, + "grad_norm": 0.35007208585739136, + "learning_rate": 0.0005832153890993879, + "loss": 3.941, + "step": 4900 + }, + { + "epoch": 1.4419133069214636, + "grad_norm": 0.47103646397590637, + "learning_rate": 0.0005830405129699796, + "loss": 3.9365, + "step": 4950 + }, + { + "epoch": 1.456478676299231, + "grad_norm": 0.3166019916534424, + "learning_rate": 0.0005828656368405712, + "loss": 3.9229, + "step": 5000 + }, + { + "epoch": 1.456478676299231, + "eval_accuracy": 0.33251971202484953, + "eval_loss": 3.9095144271850586, + "eval_runtime": 182.8302, + "eval_samples_per_second": 91.041, + "eval_steps_per_second": 5.694, + "step": 5000 + }, + { + "epoch": 1.4710440456769984, + "grad_norm": 0.32818934321403503, + "learning_rate": 0.0005826907607111629, + "loss": 3.9321, + "step": 5050 + }, + { + "epoch": 1.4856094150547658, + "grad_norm": 0.3326047658920288, + "learning_rate": 0.0005825158845817546, + "loss": 3.9226, + "step": 5100 + }, + { + "epoch": 1.500174784432533, + "grad_norm": 0.3376672863960266, + "learning_rate": 0.0005823410084523462, + "loss": 3.93, + "step": 5150 + }, + { + "epoch": 1.5147401538103007, + "grad_norm": 0.3359002470970154, + "learning_rate": 0.0005821661323229379, + "loss": 3.9337, + "step": 5200 + }, + { + "epoch": 1.529305523188068, + "grad_norm": 0.33936458826065063, + "learning_rate": 0.0005819912561935295, + "loss": 3.9042, + "step": 5250 + }, + { + "epoch": 1.5438708925658355, + "grad_norm": 0.34475040435791016, + "learning_rate": 0.0005818163800641212, + "loss": 3.9182, + "step": 5300 + }, + { + "epoch": 1.5584362619436027, + "grad_norm": 0.3286258578300476, + "learning_rate": 0.0005816415039347129, + "loss": 3.9125, + "step": 5350 + }, + { + "epoch": 1.5730016313213704, + "grad_norm": 0.3313068747520447, + "learning_rate": 0.0005814666278053045, + "loss": 3.8986, + "step": 5400 + }, + { + "epoch": 1.5875670006991376, + "grad_norm": 0.3305990695953369, + "learning_rate": 0.0005812917516758962, + "loss": 3.9158, + "step": 5450 + }, + { + "epoch": 1.6021323700769052, + "grad_norm": 0.3188944458961487, + "learning_rate": 0.0005811168755464879, + "loss": 3.8946, + "step": 5500 + }, + { + "epoch": 1.6166977394546724, + "grad_norm": 0.353261798620224, + "learning_rate": 0.0005809419994170794, + "loss": 3.9022, + "step": 5550 + }, + { + "epoch": 1.63126310883244, + "grad_norm": 0.3191840648651123, + "learning_rate": 0.0005807671232876712, + "loss": 3.8972, + "step": 5600 + }, + { + "epoch": 1.6458284782102073, + "grad_norm": 0.3437453508377075, + "learning_rate": 0.0005805922471582628, + "loss": 3.8846, + "step": 5650 + }, + { + "epoch": 1.660393847587975, + "grad_norm": 0.3136034309864044, + "learning_rate": 0.0005804173710288545, + "loss": 3.8851, + "step": 5700 + }, + { + "epoch": 1.6749592169657421, + "grad_norm": 0.31763720512390137, + "learning_rate": 0.0005802424948994462, + "loss": 3.8838, + "step": 5750 + }, + { + "epoch": 1.6895245863435098, + "grad_norm": 0.352448970079422, + "learning_rate": 0.0005800676187700379, + "loss": 3.8774, + "step": 5800 + }, + { + "epoch": 1.704089955721277, + "grad_norm": 0.32752853631973267, + "learning_rate": 0.0005798927426406295, + "loss": 3.8735, + "step": 5850 + }, + { + "epoch": 1.7186553250990446, + "grad_norm": 0.31873783469200134, + "learning_rate": 0.0005797178665112212, + "loss": 3.8819, + "step": 5900 + }, + { + "epoch": 1.7332206944768118, + "grad_norm": 0.3134726881980896, + "learning_rate": 0.0005795429903818129, + "loss": 3.8784, + "step": 5950 + }, + { + "epoch": 1.7477860638545795, + "grad_norm": 0.32978641986846924, + "learning_rate": 0.0005793681142524044, + "loss": 3.8805, + "step": 6000 + }, + { + "epoch": 1.7477860638545795, + "eval_accuracy": 0.33774250948934204, + "eval_loss": 3.851607084274292, + "eval_runtime": 183.0051, + "eval_samples_per_second": 90.954, + "eval_steps_per_second": 5.688, + "step": 6000 + }, + { + "epoch": 1.7623514332323467, + "grad_norm": 0.31962209939956665, + "learning_rate": 0.0005791932381229961, + "loss": 3.8609, + "step": 6050 + }, + { + "epoch": 1.7769168026101143, + "grad_norm": 0.353483647108078, + "learning_rate": 0.0005790183619935878, + "loss": 3.866, + "step": 6100 + }, + { + "epoch": 1.7914821719878815, + "grad_norm": 0.34389597177505493, + "learning_rate": 0.0005788434858641795, + "loss": 3.8668, + "step": 6150 + }, + { + "epoch": 1.8060475413656492, + "grad_norm": 0.306185245513916, + "learning_rate": 0.0005786686097347712, + "loss": 3.8713, + "step": 6200 + }, + { + "epoch": 1.8206129107434164, + "grad_norm": 0.3174114525318146, + "learning_rate": 0.0005784937336053628, + "loss": 3.854, + "step": 6250 + }, + { + "epoch": 1.835178280121184, + "grad_norm": 0.3356582522392273, + "learning_rate": 0.0005783188574759545, + "loss": 3.8665, + "step": 6300 + }, + { + "epoch": 1.8497436494989512, + "grad_norm": 0.3357125222682953, + "learning_rate": 0.0005781439813465462, + "loss": 3.8615, + "step": 6350 + }, + { + "epoch": 1.8643090188767188, + "grad_norm": 0.3359164297580719, + "learning_rate": 0.0005779691052171379, + "loss": 3.8584, + "step": 6400 + }, + { + "epoch": 1.878874388254486, + "grad_norm": 0.3328222632408142, + "learning_rate": 0.0005777942290877294, + "loss": 3.8494, + "step": 6450 + }, + { + "epoch": 1.8934397576322537, + "grad_norm": 0.3154282569885254, + "learning_rate": 0.0005776193529583211, + "loss": 3.8592, + "step": 6500 + }, + { + "epoch": 1.908005127010021, + "grad_norm": 0.35024163126945496, + "learning_rate": 0.0005774444768289128, + "loss": 3.8619, + "step": 6550 + }, + { + "epoch": 1.9225704963877885, + "grad_norm": 0.3441106379032135, + "learning_rate": 0.0005772696006995045, + "loss": 3.8442, + "step": 6600 + }, + { + "epoch": 1.9371358657655557, + "grad_norm": 0.3300905227661133, + "learning_rate": 0.0005770947245700962, + "loss": 3.8414, + "step": 6650 + }, + { + "epoch": 1.9517012351433234, + "grad_norm": 0.32087442278862, + "learning_rate": 0.0005769198484406878, + "loss": 3.8356, + "step": 6700 + }, + { + "epoch": 1.9662666045210906, + "grad_norm": 0.3275013864040375, + "learning_rate": 0.0005767449723112795, + "loss": 3.837, + "step": 6750 + }, + { + "epoch": 1.9808319738988582, + "grad_norm": 0.3164341151714325, + "learning_rate": 0.0005765700961818712, + "loss": 3.836, + "step": 6800 + }, + { + "epoch": 1.9953973432766254, + "grad_norm": 0.3237488269805908, + "learning_rate": 0.0005763952200524627, + "loss": 3.8226, + "step": 6850 + }, + { + "epoch": 2.0099044511768818, + "grad_norm": 0.3478614091873169, + "learning_rate": 0.0005762203439230544, + "loss": 3.7605, + "step": 6900 + }, + { + "epoch": 2.0244698205546494, + "grad_norm": 0.3280088007450104, + "learning_rate": 0.0005760454677936461, + "loss": 3.7386, + "step": 6950 + }, + { + "epoch": 2.0390351899324166, + "grad_norm": 0.3092538118362427, + "learning_rate": 0.0005758705916642378, + "loss": 3.7313, + "step": 7000 + }, + { + "epoch": 2.0390351899324166, + "eval_accuracy": 0.34221756385061836, + "eval_loss": 3.8092734813690186, + "eval_runtime": 182.6845, + "eval_samples_per_second": 91.113, + "eval_steps_per_second": 5.698, + "step": 7000 + }, + { + "epoch": 2.0536005593101843, + "grad_norm": 0.3326481282711029, + "learning_rate": 0.0005756957155348294, + "loss": 3.7507, + "step": 7050 + }, + { + "epoch": 2.0681659286879515, + "grad_norm": 0.3384568989276886, + "learning_rate": 0.0005755208394054211, + "loss": 3.7393, + "step": 7100 + }, + { + "epoch": 2.082731298065719, + "grad_norm": 0.33243587613105774, + "learning_rate": 0.0005753459632760128, + "loss": 3.7399, + "step": 7150 + }, + { + "epoch": 2.0972966674434863, + "grad_norm": 0.31076061725616455, + "learning_rate": 0.0005751710871466045, + "loss": 3.7378, + "step": 7200 + }, + { + "epoch": 2.111862036821254, + "grad_norm": 0.3274184465408325, + "learning_rate": 0.0005749962110171962, + "loss": 3.7429, + "step": 7250 + }, + { + "epoch": 2.126427406199021, + "grad_norm": 0.32674604654312134, + "learning_rate": 0.0005748213348877877, + "loss": 3.7583, + "step": 7300 + }, + { + "epoch": 2.140992775576789, + "grad_norm": 0.32339632511138916, + "learning_rate": 0.0005746464587583794, + "loss": 3.7516, + "step": 7350 + }, + { + "epoch": 2.155558144954556, + "grad_norm": 0.3199286162853241, + "learning_rate": 0.0005744715826289711, + "loss": 3.7587, + "step": 7400 + }, + { + "epoch": 2.1701235143323236, + "grad_norm": 0.3267718553543091, + "learning_rate": 0.0005742967064995627, + "loss": 3.749, + "step": 7450 + }, + { + "epoch": 2.184688883710091, + "grad_norm": 0.32316339015960693, + "learning_rate": 0.0005741218303701544, + "loss": 3.747, + "step": 7500 + }, + { + "epoch": 2.1992542530878585, + "grad_norm": 0.3192504048347473, + "learning_rate": 0.0005739469542407461, + "loss": 3.7527, + "step": 7550 + }, + { + "epoch": 2.2138196224656257, + "grad_norm": 0.3137800097465515, + "learning_rate": 0.0005737720781113378, + "loss": 3.7386, + "step": 7600 + }, + { + "epoch": 2.2283849918433933, + "grad_norm": 0.3079003393650055, + "learning_rate": 0.0005735972019819295, + "loss": 3.7412, + "step": 7650 + }, + { + "epoch": 2.2429503612211605, + "grad_norm": 0.317622572183609, + "learning_rate": 0.000573422325852521, + "loss": 3.7524, + "step": 7700 + }, + { + "epoch": 2.257515730598928, + "grad_norm": 0.3800288140773773, + "learning_rate": 0.0005732474497231127, + "loss": 3.7621, + "step": 7750 + }, + { + "epoch": 2.2720810999766954, + "grad_norm": 0.3083147704601288, + "learning_rate": 0.0005730725735937044, + "loss": 3.7586, + "step": 7800 + }, + { + "epoch": 2.286646469354463, + "grad_norm": 0.3407652974128723, + "learning_rate": 0.0005728976974642961, + "loss": 3.7421, + "step": 7850 + }, + { + "epoch": 2.3012118387322302, + "grad_norm": 0.3287487030029297, + "learning_rate": 0.0005727228213348877, + "loss": 3.7566, + "step": 7900 + }, + { + "epoch": 2.3157772081099974, + "grad_norm": 0.3335205912590027, + "learning_rate": 0.0005725479452054794, + "loss": 3.7335, + "step": 7950 + }, + { + "epoch": 2.330342577487765, + "grad_norm": 0.3264673352241516, + "learning_rate": 0.0005723730690760711, + "loss": 3.7565, + "step": 8000 + }, + { + "epoch": 2.330342577487765, + "eval_accuracy": 0.34492789746043634, + "eval_loss": 3.7804434299468994, + "eval_runtime": 183.2368, + "eval_samples_per_second": 90.839, + "eval_steps_per_second": 5.681, + "step": 8000 + }, + { + "epoch": 2.3449079468655327, + "grad_norm": 0.3331637382507324, + "learning_rate": 0.0005721981929466627, + "loss": 3.7361, + "step": 8050 + }, + { + "epoch": 2.3594733162433, + "grad_norm": 0.332265704870224, + "learning_rate": 0.0005720233168172545, + "loss": 3.7488, + "step": 8100 + }, + { + "epoch": 2.374038685621067, + "grad_norm": 0.3338277041912079, + "learning_rate": 0.000571848440687846, + "loss": 3.749, + "step": 8150 + }, + { + "epoch": 2.3886040549988348, + "grad_norm": 0.3343018889427185, + "learning_rate": 0.0005716735645584377, + "loss": 3.7482, + "step": 8200 + }, + { + "epoch": 2.4031694243766024, + "grad_norm": 0.3533099591732025, + "learning_rate": 0.0005714986884290294, + "loss": 3.7509, + "step": 8250 + }, + { + "epoch": 2.4177347937543696, + "grad_norm": 0.3301255404949188, + "learning_rate": 0.000571323812299621, + "loss": 3.7334, + "step": 8300 + }, + { + "epoch": 2.432300163132137, + "grad_norm": 0.31493982672691345, + "learning_rate": 0.0005711489361702127, + "loss": 3.7405, + "step": 8350 + }, + { + "epoch": 2.4468655325099045, + "grad_norm": 0.3294617235660553, + "learning_rate": 0.0005709740600408044, + "loss": 3.7471, + "step": 8400 + }, + { + "epoch": 2.461430901887672, + "grad_norm": 0.316875696182251, + "learning_rate": 0.0005707991839113961, + "loss": 3.756, + "step": 8450 + }, + { + "epoch": 2.4759962712654393, + "grad_norm": 0.310937762260437, + "learning_rate": 0.0005706243077819877, + "loss": 3.7412, + "step": 8500 + }, + { + "epoch": 2.4905616406432065, + "grad_norm": 0.31915387511253357, + "learning_rate": 0.0005704494316525793, + "loss": 3.7455, + "step": 8550 + }, + { + "epoch": 2.505127010020974, + "grad_norm": 0.32014259696006775, + "learning_rate": 0.000570274555523171, + "loss": 3.7348, + "step": 8600 + }, + { + "epoch": 2.519692379398742, + "grad_norm": 0.32315295934677124, + "learning_rate": 0.0005700996793937627, + "loss": 3.7426, + "step": 8650 + }, + { + "epoch": 2.534257748776509, + "grad_norm": 0.33680716156959534, + "learning_rate": 0.0005699248032643544, + "loss": 3.7458, + "step": 8700 + }, + { + "epoch": 2.548823118154276, + "grad_norm": 0.30035462975502014, + "learning_rate": 0.000569749927134946, + "loss": 3.7355, + "step": 8750 + }, + { + "epoch": 2.563388487532044, + "grad_norm": 0.3163704574108124, + "learning_rate": 0.0005695750510055377, + "loss": 3.745, + "step": 8800 + }, + { + "epoch": 2.5779538569098115, + "grad_norm": 0.3217114806175232, + "learning_rate": 0.0005694001748761294, + "loss": 3.7532, + "step": 8850 + }, + { + "epoch": 2.5925192262875787, + "grad_norm": 0.3296523988246918, + "learning_rate": 0.000569225298746721, + "loss": 3.7536, + "step": 8900 + }, + { + "epoch": 2.607084595665346, + "grad_norm": 0.3380911648273468, + "learning_rate": 0.0005690504226173127, + "loss": 3.7383, + "step": 8950 + }, + { + "epoch": 2.6216499650431135, + "grad_norm": 0.3250406086444855, + "learning_rate": 0.0005688755464879043, + "loss": 3.7287, + "step": 9000 + }, + { + "epoch": 2.6216499650431135, + "eval_accuracy": 0.3478768975009979, + "eval_loss": 3.7482099533081055, + "eval_runtime": 182.7889, + "eval_samples_per_second": 91.061, + "eval_steps_per_second": 5.695, + "step": 9000 + }, + { + "epoch": 2.636215334420881, + "grad_norm": 0.2994716465473175, + "learning_rate": 0.000568700670358496, + "loss": 3.7238, + "step": 9050 + }, + { + "epoch": 2.6507807037986484, + "grad_norm": 0.34974968433380127, + "learning_rate": 0.0005685257942290877, + "loss": 3.7324, + "step": 9100 + }, + { + "epoch": 2.6653460731764156, + "grad_norm": 0.3160337209701538, + "learning_rate": 0.0005683509180996793, + "loss": 3.7271, + "step": 9150 + }, + { + "epoch": 2.6799114425541832, + "grad_norm": 0.3240397274494171, + "learning_rate": 0.000568176041970271, + "loss": 3.7258, + "step": 9200 + }, + { + "epoch": 2.6944768119319504, + "grad_norm": 0.32082828879356384, + "learning_rate": 0.0005680011658408627, + "loss": 3.7358, + "step": 9250 + }, + { + "epoch": 2.709042181309718, + "grad_norm": 0.31649598479270935, + "learning_rate": 0.0005678262897114544, + "loss": 3.7392, + "step": 9300 + }, + { + "epoch": 2.7236075506874853, + "grad_norm": 0.31169888377189636, + "learning_rate": 0.000567651413582046, + "loss": 3.735, + "step": 9350 + }, + { + "epoch": 2.738172920065253, + "grad_norm": 0.3002908229827881, + "learning_rate": 0.0005674765374526377, + "loss": 3.7288, + "step": 9400 + }, + { + "epoch": 2.75273828944302, + "grad_norm": 0.30253180861473083, + "learning_rate": 0.0005673016613232293, + "loss": 3.7204, + "step": 9450 + }, + { + "epoch": 2.7673036588207878, + "grad_norm": 0.31831657886505127, + "learning_rate": 0.0005671267851938209, + "loss": 3.7258, + "step": 9500 + }, + { + "epoch": 2.781869028198555, + "grad_norm": 0.31231117248535156, + "learning_rate": 0.0005669519090644127, + "loss": 3.721, + "step": 9550 + }, + { + "epoch": 2.7964343975763226, + "grad_norm": 0.3149794936180115, + "learning_rate": 0.0005667770329350043, + "loss": 3.7337, + "step": 9600 + }, + { + "epoch": 2.81099976695409, + "grad_norm": 0.3181709945201874, + "learning_rate": 0.000566602156805596, + "loss": 3.7265, + "step": 9650 + }, + { + "epoch": 2.8255651363318575, + "grad_norm": 0.31284210085868835, + "learning_rate": 0.0005664272806761877, + "loss": 3.7288, + "step": 9700 + }, + { + "epoch": 2.8401305057096247, + "grad_norm": 0.3245266377925873, + "learning_rate": 0.0005662524045467793, + "loss": 3.7222, + "step": 9750 + }, + { + "epoch": 2.8546958750873923, + "grad_norm": 0.319336473941803, + "learning_rate": 0.000566077528417371, + "loss": 3.7298, + "step": 9800 + }, + { + "epoch": 2.8692612444651595, + "grad_norm": 0.3112030625343323, + "learning_rate": 0.0005659026522879626, + "loss": 3.7303, + "step": 9850 + }, + { + "epoch": 2.883826613842927, + "grad_norm": 0.3266408443450928, + "learning_rate": 0.0005657277761585543, + "loss": 3.7114, + "step": 9900 + }, + { + "epoch": 2.8983919832206944, + "grad_norm": 0.3135480284690857, + "learning_rate": 0.0005655529000291459, + "loss": 3.7203, + "step": 9950 + }, + { + "epoch": 2.912957352598462, + "grad_norm": 0.3140251040458679, + "learning_rate": 0.0005653780238997376, + "loss": 3.7117, + "step": 10000 + }, + { + "epoch": 2.912957352598462, + "eval_accuracy": 0.3501354108677876, + "eval_loss": 3.7232017517089844, + "eval_runtime": 182.9143, + "eval_samples_per_second": 90.999, + "eval_steps_per_second": 5.691, + "step": 10000 + }, + { + "epoch": 2.927522721976229, + "grad_norm": 0.30501222610473633, + "learning_rate": 0.0005652031477703293, + "loss": 3.7205, + "step": 10050 + }, + { + "epoch": 2.942088091353997, + "grad_norm": 0.31396257877349854, + "learning_rate": 0.000565028271640921, + "loss": 3.7231, + "step": 10100 + }, + { + "epoch": 2.956653460731764, + "grad_norm": 0.29565396904945374, + "learning_rate": 0.0005648533955115127, + "loss": 3.7167, + "step": 10150 + }, + { + "epoch": 2.9712188301095317, + "grad_norm": 0.29209980368614197, + "learning_rate": 0.0005646785193821043, + "loss": 3.7011, + "step": 10200 + }, + { + "epoch": 2.985784199487299, + "grad_norm": 0.341580867767334, + "learning_rate": 0.000564503643252696, + "loss": 3.7199, + "step": 10250 + }, + { + "epoch": 3.0002913073875552, + "grad_norm": 0.306345134973526, + "learning_rate": 0.0005643287671232876, + "loss": 3.7135, + "step": 10300 + }, + { + "epoch": 3.014856676765323, + "grad_norm": 0.3232329487800598, + "learning_rate": 0.0005641538909938792, + "loss": 3.602, + "step": 10350 + }, + { + "epoch": 3.02942204614309, + "grad_norm": 0.3200088441371918, + "learning_rate": 0.0005639790148644709, + "loss": 3.6044, + "step": 10400 + }, + { + "epoch": 3.0439874155208577, + "grad_norm": 0.31947940587997437, + "learning_rate": 0.0005638041387350626, + "loss": 3.612, + "step": 10450 + }, + { + "epoch": 3.058552784898625, + "grad_norm": 0.3416725695133209, + "learning_rate": 0.0005636292626056543, + "loss": 3.6212, + "step": 10500 + }, + { + "epoch": 3.0731181542763926, + "grad_norm": 0.3318784832954407, + "learning_rate": 0.000563454386476246, + "loss": 3.6079, + "step": 10550 + }, + { + "epoch": 3.0876835236541598, + "grad_norm": 0.3215178847312927, + "learning_rate": 0.0005632795103468376, + "loss": 3.623, + "step": 10600 + }, + { + "epoch": 3.1022488930319274, + "grad_norm": 0.3258126974105835, + "learning_rate": 0.0005631046342174293, + "loss": 3.6237, + "step": 10650 + }, + { + "epoch": 3.1168142624096946, + "grad_norm": 0.3248828947544098, + "learning_rate": 0.000562929758088021, + "loss": 3.6206, + "step": 10700 + }, + { + "epoch": 3.1313796317874623, + "grad_norm": 0.3171083331108093, + "learning_rate": 0.0005627548819586126, + "loss": 3.6301, + "step": 10750 + }, + { + "epoch": 3.1459450011652295, + "grad_norm": 0.3476962745189667, + "learning_rate": 0.0005625800058292042, + "loss": 3.6342, + "step": 10800 + }, + { + "epoch": 3.160510370542997, + "grad_norm": 0.32831183075904846, + "learning_rate": 0.0005624051296997959, + "loss": 3.6253, + "step": 10850 + }, + { + "epoch": 3.1750757399207643, + "grad_norm": 0.3208300769329071, + "learning_rate": 0.0005622302535703876, + "loss": 3.6201, + "step": 10900 + }, + { + "epoch": 3.189641109298532, + "grad_norm": 0.34289947152137756, + "learning_rate": 0.0005620553774409792, + "loss": 3.6407, + "step": 10950 + }, + { + "epoch": 3.204206478676299, + "grad_norm": 0.34401383996009827, + "learning_rate": 0.000561880501311571, + "loss": 3.6328, + "step": 11000 + }, + { + "epoch": 3.204206478676299, + "eval_accuracy": 0.35195491908561366, + "eval_loss": 3.711780548095703, + "eval_runtime": 183.0966, + "eval_samples_per_second": 90.908, + "eval_steps_per_second": 5.686, + "step": 11000 + }, + { + "epoch": 3.218771848054067, + "grad_norm": 0.3176616132259369, + "learning_rate": 0.0005617056251821626, + "loss": 3.6359, + "step": 11050 + }, + { + "epoch": 3.233337217431834, + "grad_norm": 0.3091390132904053, + "learning_rate": 0.0005615307490527543, + "loss": 3.6274, + "step": 11100 + }, + { + "epoch": 3.2479025868096016, + "grad_norm": 0.31460776925086975, + "learning_rate": 0.000561355872923346, + "loss": 3.6288, + "step": 11150 + }, + { + "epoch": 3.262467956187369, + "grad_norm": 0.33371245861053467, + "learning_rate": 0.0005611809967939375, + "loss": 3.6434, + "step": 11200 + }, + { + "epoch": 3.2770333255651365, + "grad_norm": 0.3317493796348572, + "learning_rate": 0.0005610061206645292, + "loss": 3.6385, + "step": 11250 + }, + { + "epoch": 3.2915986949429037, + "grad_norm": 0.3349264860153198, + "learning_rate": 0.0005608312445351209, + "loss": 3.6397, + "step": 11300 + }, + { + "epoch": 3.3061640643206713, + "grad_norm": 0.32256776094436646, + "learning_rate": 0.0005606563684057126, + "loss": 3.6336, + "step": 11350 + }, + { + "epoch": 3.3207294336984385, + "grad_norm": 0.32920846343040466, + "learning_rate": 0.0005604814922763042, + "loss": 3.633, + "step": 11400 + }, + { + "epoch": 3.335294803076206, + "grad_norm": 0.32023611664772034, + "learning_rate": 0.0005603066161468959, + "loss": 3.6273, + "step": 11450 + }, + { + "epoch": 3.3498601724539734, + "grad_norm": 0.3239540755748749, + "learning_rate": 0.0005601317400174876, + "loss": 3.6354, + "step": 11500 + }, + { + "epoch": 3.364425541831741, + "grad_norm": 0.3216831386089325, + "learning_rate": 0.0005599568638880793, + "loss": 3.6423, + "step": 11550 + }, + { + "epoch": 3.3789909112095082, + "grad_norm": 0.3342783749103546, + "learning_rate": 0.0005597819877586709, + "loss": 3.6189, + "step": 11600 + }, + { + "epoch": 3.393556280587276, + "grad_norm": 0.30918747186660767, + "learning_rate": 0.0005596071116292625, + "loss": 3.6395, + "step": 11650 + }, + { + "epoch": 3.408121649965043, + "grad_norm": 0.3271181583404541, + "learning_rate": 0.0005594322354998542, + "loss": 3.6403, + "step": 11700 + }, + { + "epoch": 3.4226870193428107, + "grad_norm": 0.30378925800323486, + "learning_rate": 0.0005592573593704459, + "loss": 3.6356, + "step": 11750 + }, + { + "epoch": 3.437252388720578, + "grad_norm": 0.3317507803440094, + "learning_rate": 0.0005590824832410375, + "loss": 3.6269, + "step": 11800 + }, + { + "epoch": 3.4518177580983456, + "grad_norm": 0.33540189266204834, + "learning_rate": 0.0005589076071116292, + "loss": 3.64, + "step": 11850 + }, + { + "epoch": 3.4663831274761128, + "grad_norm": 0.2987757921218872, + "learning_rate": 0.0005587327309822209, + "loss": 3.6381, + "step": 11900 + }, + { + "epoch": 3.4809484968538804, + "grad_norm": 0.3357625901699066, + "learning_rate": 0.0005585578548528126, + "loss": 3.6504, + "step": 11950 + }, + { + "epoch": 3.4955138662316476, + "grad_norm": 0.3251186013221741, + "learning_rate": 0.0005583829787234043, + "loss": 3.6393, + "step": 12000 + }, + { + "epoch": 3.4955138662316476, + "eval_accuracy": 0.3538467326506846, + "eval_loss": 3.695300817489624, + "eval_runtime": 183.2049, + "eval_samples_per_second": 90.855, + "eval_steps_per_second": 5.682, + "step": 12000 + }, + { + "epoch": 3.510079235609415, + "grad_norm": 0.3250819742679596, + "learning_rate": 0.0005582081025939958, + "loss": 3.628, + "step": 12050 + }, + { + "epoch": 3.5246446049871825, + "grad_norm": 0.3002704381942749, + "learning_rate": 0.0005580332264645875, + "loss": 3.6323, + "step": 12100 + }, + { + "epoch": 3.53920997436495, + "grad_norm": 0.3166797161102295, + "learning_rate": 0.0005578583503351792, + "loss": 3.6524, + "step": 12150 + }, + { + "epoch": 3.5537753437427173, + "grad_norm": 0.32995665073394775, + "learning_rate": 0.0005576834742057709, + "loss": 3.6375, + "step": 12200 + }, + { + "epoch": 3.5683407131204845, + "grad_norm": 0.3259824812412262, + "learning_rate": 0.0005575085980763625, + "loss": 3.6469, + "step": 12250 + }, + { + "epoch": 3.582906082498252, + "grad_norm": 0.35942357778549194, + "learning_rate": 0.0005573337219469542, + "loss": 3.6256, + "step": 12300 + }, + { + "epoch": 3.59747145187602, + "grad_norm": 0.35430946946144104, + "learning_rate": 0.0005571588458175459, + "loss": 3.6376, + "step": 12350 + }, + { + "epoch": 3.612036821253787, + "grad_norm": 0.31567761301994324, + "learning_rate": 0.0005569839696881374, + "loss": 3.6417, + "step": 12400 + }, + { + "epoch": 3.626602190631554, + "grad_norm": 0.32618406414985657, + "learning_rate": 0.0005568090935587292, + "loss": 3.6446, + "step": 12450 + }, + { + "epoch": 3.641167560009322, + "grad_norm": 0.31533339619636536, + "learning_rate": 0.0005566342174293208, + "loss": 3.6602, + "step": 12500 + }, + { + "epoch": 3.6557329293870895, + "grad_norm": 0.3274492621421814, + "learning_rate": 0.0005564593412999125, + "loss": 3.6288, + "step": 12550 + }, + { + "epoch": 3.6702982987648567, + "grad_norm": 0.3271532952785492, + "learning_rate": 0.0005562844651705042, + "loss": 3.6392, + "step": 12600 + }, + { + "epoch": 3.684863668142624, + "grad_norm": 0.3347550630569458, + "learning_rate": 0.0005561095890410958, + "loss": 3.6288, + "step": 12650 + }, + { + "epoch": 3.6994290375203915, + "grad_norm": 0.32583537697792053, + "learning_rate": 0.0005559347129116875, + "loss": 3.6302, + "step": 12700 + }, + { + "epoch": 3.713994406898159, + "grad_norm": 0.30043119192123413, + "learning_rate": 0.0005557598367822792, + "loss": 3.6391, + "step": 12750 + }, + { + "epoch": 3.7285597762759264, + "grad_norm": 0.328071653842926, + "learning_rate": 0.0005555849606528709, + "loss": 3.6402, + "step": 12800 + }, + { + "epoch": 3.7431251456536936, + "grad_norm": 0.3114347755908966, + "learning_rate": 0.0005554100845234624, + "loss": 3.6368, + "step": 12850 + }, + { + "epoch": 3.7576905150314612, + "grad_norm": 0.3095006048679352, + "learning_rate": 0.0005552352083940541, + "loss": 3.642, + "step": 12900 + }, + { + "epoch": 3.772255884409229, + "grad_norm": 0.341740220785141, + "learning_rate": 0.0005550603322646458, + "loss": 3.6366, + "step": 12950 + }, + { + "epoch": 3.786821253786996, + "grad_norm": 0.3182758390903473, + "learning_rate": 0.0005548854561352375, + "loss": 3.6428, + "step": 13000 + }, + { + "epoch": 3.786821253786996, + "eval_accuracy": 0.35534045531206226, + "eval_loss": 3.67643141746521, + "eval_runtime": 182.9247, + "eval_samples_per_second": 90.994, + "eval_steps_per_second": 5.691, + "step": 13000 + }, + { + "epoch": 3.8013866231647633, + "grad_norm": 0.3386680781841278, + "learning_rate": 0.0005547105800058292, + "loss": 3.6498, + "step": 13050 + }, + { + "epoch": 3.815951992542531, + "grad_norm": 0.317020446062088, + "learning_rate": 0.0005545357038764208, + "loss": 3.6364, + "step": 13100 + }, + { + "epoch": 3.8305173619202986, + "grad_norm": 0.3221585154533386, + "learning_rate": 0.0005543608277470125, + "loss": 3.639, + "step": 13150 + }, + { + "epoch": 3.8450827312980658, + "grad_norm": 0.2992505133152008, + "learning_rate": 0.0005541859516176042, + "loss": 3.6415, + "step": 13200 + }, + { + "epoch": 3.859648100675833, + "grad_norm": 0.30075782537460327, + "learning_rate": 0.0005540110754881958, + "loss": 3.6458, + "step": 13250 + }, + { + "epoch": 3.8742134700536006, + "grad_norm": 0.3180374205112457, + "learning_rate": 0.0005538361993587874, + "loss": 3.6236, + "step": 13300 + }, + { + "epoch": 3.888778839431368, + "grad_norm": 0.3106091022491455, + "learning_rate": 0.0005536613232293791, + "loss": 3.6353, + "step": 13350 + }, + { + "epoch": 3.9033442088091355, + "grad_norm": 0.29708361625671387, + "learning_rate": 0.0005534864470999708, + "loss": 3.6366, + "step": 13400 + }, + { + "epoch": 3.9179095781869027, + "grad_norm": 0.34170088171958923, + "learning_rate": 0.0005533115709705625, + "loss": 3.6262, + "step": 13450 + }, + { + "epoch": 3.9324749475646703, + "grad_norm": 0.3265573978424072, + "learning_rate": 0.0005531366948411541, + "loss": 3.635, + "step": 13500 + }, + { + "epoch": 3.9470403169424375, + "grad_norm": 0.31741398572921753, + "learning_rate": 0.0005529618187117458, + "loss": 3.631, + "step": 13550 + }, + { + "epoch": 3.961605686320205, + "grad_norm": 0.32067936658859253, + "learning_rate": 0.0005527869425823375, + "loss": 3.6362, + "step": 13600 + }, + { + "epoch": 3.9761710556979724, + "grad_norm": 0.31947606801986694, + "learning_rate": 0.0005526120664529292, + "loss": 3.6329, + "step": 13650 + }, + { + "epoch": 3.99073642507574, + "grad_norm": 0.2974706292152405, + "learning_rate": 0.0005524371903235207, + "loss": 3.636, + "step": 13700 + }, + { + "epoch": 4.005243532975996, + "grad_norm": 0.3165639042854309, + "learning_rate": 0.0005522623141941124, + "loss": 3.5888, + "step": 13750 + }, + { + "epoch": 4.0198089023537635, + "grad_norm": 0.3143453299999237, + "learning_rate": 0.0005520874380647041, + "loss": 3.5185, + "step": 13800 + }, + { + "epoch": 4.034374271731531, + "grad_norm": 0.3318156599998474, + "learning_rate": 0.0005519125619352957, + "loss": 3.52, + "step": 13850 + }, + { + "epoch": 4.048939641109299, + "grad_norm": 0.30892449617385864, + "learning_rate": 0.0005517376858058875, + "loss": 3.5368, + "step": 13900 + }, + { + "epoch": 4.063505010487066, + "grad_norm": 0.3253442645072937, + "learning_rate": 0.0005515628096764791, + "loss": 3.5317, + "step": 13950 + }, + { + "epoch": 4.078070379864833, + "grad_norm": 0.31239765882492065, + "learning_rate": 0.0005513879335470708, + "loss": 3.5319, + "step": 14000 + }, + { + "epoch": 4.078070379864833, + "eval_accuracy": 0.35649757600732224, + "eval_loss": 3.668315887451172, + "eval_runtime": 182.8782, + "eval_samples_per_second": 91.017, + "eval_steps_per_second": 5.692, + "step": 14000 + }, + { + "epoch": 4.092635749242601, + "grad_norm": 0.32942360639572144, + "learning_rate": 0.0005512130574176625, + "loss": 3.5415, + "step": 14050 + }, + { + "epoch": 4.1072011186203685, + "grad_norm": 0.3166195750236511, + "learning_rate": 0.000551038181288254, + "loss": 3.5305, + "step": 14100 + }, + { + "epoch": 4.121766487998135, + "grad_norm": 0.3233526349067688, + "learning_rate": 0.0005508633051588457, + "loss": 3.5415, + "step": 14150 + }, + { + "epoch": 4.136331857375903, + "grad_norm": 0.32190340757369995, + "learning_rate": 0.0005506884290294374, + "loss": 3.5477, + "step": 14200 + }, + { + "epoch": 4.150897226753671, + "grad_norm": 0.34432554244995117, + "learning_rate": 0.0005505135529000291, + "loss": 3.5525, + "step": 14250 + }, + { + "epoch": 4.165462596131438, + "grad_norm": 0.3212747275829315, + "learning_rate": 0.0005503386767706207, + "loss": 3.5454, + "step": 14300 + }, + { + "epoch": 4.180027965509205, + "grad_norm": 0.3225768804550171, + "learning_rate": 0.0005501638006412124, + "loss": 3.5577, + "step": 14350 + }, + { + "epoch": 4.194593334886973, + "grad_norm": 0.3135558068752289, + "learning_rate": 0.0005499889245118041, + "loss": 3.5534, + "step": 14400 + }, + { + "epoch": 4.20915870426474, + "grad_norm": 0.342723548412323, + "learning_rate": 0.0005498140483823958, + "loss": 3.5381, + "step": 14450 + }, + { + "epoch": 4.223724073642508, + "grad_norm": 0.3262888193130493, + "learning_rate": 0.0005496391722529875, + "loss": 3.5649, + "step": 14500 + }, + { + "epoch": 4.238289443020275, + "grad_norm": 0.3289225399494171, + "learning_rate": 0.000549464296123579, + "loss": 3.5588, + "step": 14550 + }, + { + "epoch": 4.252854812398042, + "grad_norm": 0.30956584215164185, + "learning_rate": 0.0005492894199941707, + "loss": 3.5632, + "step": 14600 + }, + { + "epoch": 4.26742018177581, + "grad_norm": 0.33964231610298157, + "learning_rate": 0.0005491145438647624, + "loss": 3.5687, + "step": 14650 + }, + { + "epoch": 4.281985551153578, + "grad_norm": 0.33205127716064453, + "learning_rate": 0.000548939667735354, + "loss": 3.5543, + "step": 14700 + }, + { + "epoch": 4.296550920531344, + "grad_norm": Infinity, + "learning_rate": 0.0005487647916059457, + "loss": 3.5658, + "step": 14750 + }, + { + "epoch": 4.311116289909112, + "grad_norm": 0.3393228054046631, + "learning_rate": 0.0005485899154765374, + "loss": 3.5577, + "step": 14800 + }, + { + "epoch": 4.32568165928688, + "grad_norm": 0.31260740756988525, + "learning_rate": 0.0005484150393471291, + "loss": 3.5722, + "step": 14850 + }, + { + "epoch": 4.340247028664647, + "grad_norm": 0.331304132938385, + "learning_rate": 0.0005482401632177208, + "loss": 3.5601, + "step": 14900 + }, + { + "epoch": 4.354812398042414, + "grad_norm": 0.3720129728317261, + "learning_rate": 0.0005480652870883124, + "loss": 3.5754, + "step": 14950 + }, + { + "epoch": 4.369377767420182, + "grad_norm": 0.31915804743766785, + "learning_rate": 0.000547890410958904, + "loss": 3.5417, + "step": 15000 + }, + { + "epoch": 4.369377767420182, + "eval_accuracy": 0.3573578332850318, + "eval_loss": 3.6594953536987305, + "eval_runtime": 183.0024, + "eval_samples_per_second": 90.955, + "eval_steps_per_second": 5.688, + "step": 15000 + }, + { + "epoch": 4.383943136797949, + "grad_norm": 0.3377160429954529, + "learning_rate": 0.0005477155348294957, + "loss": 3.5704, + "step": 15050 + }, + { + "epoch": 4.398508506175717, + "grad_norm": 0.31284624338150024, + "learning_rate": 0.0005475406587000874, + "loss": 3.562, + "step": 15100 + }, + { + "epoch": 4.413073875553484, + "grad_norm": 0.331798255443573, + "learning_rate": 0.000547365782570679, + "loss": 3.5583, + "step": 15150 + }, + { + "epoch": 4.427639244931251, + "grad_norm": 0.31809282302856445, + "learning_rate": 0.0005471909064412707, + "loss": 3.5653, + "step": 15200 + }, + { + "epoch": 4.442204614309019, + "grad_norm": 0.3349161446094513, + "learning_rate": 0.0005470160303118624, + "loss": 3.5674, + "step": 15250 + }, + { + "epoch": 4.456769983686787, + "grad_norm": 0.32030266523361206, + "learning_rate": 0.000546841154182454, + "loss": 3.5776, + "step": 15300 + }, + { + "epoch": 4.471335353064553, + "grad_norm": NaN, + "learning_rate": 0.0005466662780530458, + "loss": 3.5804, + "step": 15350 + }, + { + "epoch": 4.485900722442321, + "grad_norm": 0.3176333010196686, + "learning_rate": 0.0005464914019236374, + "loss": 3.5782, + "step": 15400 + }, + { + "epoch": 4.500466091820089, + "grad_norm": 0.3277718126773834, + "learning_rate": 0.000546316525794229, + "loss": 3.5864, + "step": 15450 + }, + { + "epoch": 4.515031461197856, + "grad_norm": 0.3107328712940216, + "learning_rate": 0.0005461416496648207, + "loss": 3.5818, + "step": 15500 + }, + { + "epoch": 4.529596830575623, + "grad_norm": 0.32739174365997314, + "learning_rate": 0.0005459667735354123, + "loss": 3.5574, + "step": 15550 + }, + { + "epoch": 4.544162199953391, + "grad_norm": 0.3131246268749237, + "learning_rate": 0.000545791897406004, + "loss": 3.5691, + "step": 15600 + }, + { + "epoch": 4.558727569331158, + "grad_norm": 0.33344247937202454, + "learning_rate": 0.0005456170212765957, + "loss": 3.5742, + "step": 15650 + }, + { + "epoch": 4.573292938708926, + "grad_norm": 0.3450835347175598, + "learning_rate": 0.0005454421451471874, + "loss": 3.5739, + "step": 15700 + }, + { + "epoch": 4.587858308086693, + "grad_norm": 0.31020715832710266, + "learning_rate": 0.000545267269017779, + "loss": 3.5733, + "step": 15750 + }, + { + "epoch": 4.6024236774644605, + "grad_norm": 0.3077200651168823, + "learning_rate": 0.0005450923928883708, + "loss": 3.5576, + "step": 15800 + }, + { + "epoch": 4.616989046842228, + "grad_norm": 0.3353123068809509, + "learning_rate": 0.0005449175167589623, + "loss": 3.5723, + "step": 15850 + }, + { + "epoch": 4.631554416219995, + "grad_norm": 0.32540038228034973, + "learning_rate": 0.000544742640629554, + "loss": 3.5793, + "step": 15900 + }, + { + "epoch": 4.6461197855977625, + "grad_norm": 0.3153678774833679, + "learning_rate": 0.0005445677645001457, + "loss": 3.566, + "step": 15950 + }, + { + "epoch": 4.66068515497553, + "grad_norm": 0.3324628472328186, + "learning_rate": 0.0005443928883707373, + "loss": 3.5679, + "step": 16000 + }, + { + "epoch": 4.66068515497553, + "eval_accuracy": 0.35883380292619155, + "eval_loss": 3.6458113193511963, + "eval_runtime": 182.9054, + "eval_samples_per_second": 91.003, + "eval_steps_per_second": 5.691, + "step": 16000 + }, + { + "epoch": 4.675250524353298, + "grad_norm": 0.30943557620048523, + "learning_rate": 0.000544218012241329, + "loss": 3.5781, + "step": 16050 + }, + { + "epoch": 4.689815893731065, + "grad_norm": 0.33318641781806946, + "learning_rate": 0.0005440431361119207, + "loss": 3.5782, + "step": 16100 + }, + { + "epoch": 4.704381263108832, + "grad_norm": 0.32952025532722473, + "learning_rate": 0.0005438682599825123, + "loss": 3.5731, + "step": 16150 + }, + { + "epoch": 4.7189466324866, + "grad_norm": 0.33214232325553894, + "learning_rate": 0.000543693383853104, + "loss": 3.5754, + "step": 16200 + }, + { + "epoch": 4.7335120018643675, + "grad_norm": 0.3278939723968506, + "learning_rate": 0.0005435185077236957, + "loss": 3.5669, + "step": 16250 + }, + { + "epoch": 4.748077371242134, + "grad_norm": 0.35436153411865234, + "learning_rate": 0.0005433436315942873, + "loss": 3.5705, + "step": 16300 + }, + { + "epoch": 4.762642740619902, + "grad_norm": 0.32666391134262085, + "learning_rate": 0.000543168755464879, + "loss": 3.567, + "step": 16350 + }, + { + "epoch": 4.7772081099976695, + "grad_norm": 0.3220253586769104, + "learning_rate": 0.0005429938793354706, + "loss": 3.5843, + "step": 16400 + }, + { + "epoch": 4.791773479375437, + "grad_norm": 0.3143666982650757, + "learning_rate": 0.0005428190032060623, + "loss": 3.5715, + "step": 16450 + }, + { + "epoch": 4.806338848753205, + "grad_norm": 0.30706289410591125, + "learning_rate": 0.000542644127076654, + "loss": 3.5738, + "step": 16500 + }, + { + "epoch": 4.820904218130972, + "grad_norm": 0.3221777677536011, + "learning_rate": 0.0005424692509472457, + "loss": 3.5769, + "step": 16550 + }, + { + "epoch": 4.835469587508739, + "grad_norm": 0.32023727893829346, + "learning_rate": 0.0005422943748178373, + "loss": 3.5685, + "step": 16600 + }, + { + "epoch": 4.850034956886507, + "grad_norm": 0.3428962230682373, + "learning_rate": 0.000542119498688429, + "loss": 3.5825, + "step": 16650 + }, + { + "epoch": 4.864600326264274, + "grad_norm": 0.3187549412250519, + "learning_rate": 0.0005419446225590207, + "loss": 3.5684, + "step": 16700 + }, + { + "epoch": 4.879165695642041, + "grad_norm": 0.32895606756210327, + "learning_rate": 0.0005417697464296122, + "loss": 3.5723, + "step": 16750 + }, + { + "epoch": 4.893731065019809, + "grad_norm": 0.31578466296195984, + "learning_rate": 0.000541594870300204, + "loss": 3.5769, + "step": 16800 + }, + { + "epoch": 4.908296434397577, + "grad_norm": 0.3406703472137451, + "learning_rate": 0.0005414199941707956, + "loss": 3.5636, + "step": 16850 + }, + { + "epoch": 4.922861803775344, + "grad_norm": 0.32653164863586426, + "learning_rate": 0.0005412451180413873, + "loss": 3.5667, + "step": 16900 + }, + { + "epoch": 4.937427173153111, + "grad_norm": 0.31869933009147644, + "learning_rate": 0.000541070241911979, + "loss": 3.5796, + "step": 16950 + }, + { + "epoch": 4.951992542530879, + "grad_norm": 0.3086072504520416, + "learning_rate": 0.0005408953657825706, + "loss": 3.5802, + "step": 17000 + }, + { + "epoch": 4.951992542530879, + "eval_accuracy": 0.36002831077661235, + "eval_loss": 3.6297221183776855, + "eval_runtime": 182.8739, + "eval_samples_per_second": 91.019, + "eval_steps_per_second": 5.692, + "step": 17000 + }, + { + "epoch": 4.966557911908646, + "grad_norm": 0.3254891335964203, + "learning_rate": 0.0005407204896531623, + "loss": 3.5614, + "step": 17050 + }, + { + "epoch": 4.981123281286413, + "grad_norm": 0.3315959870815277, + "learning_rate": 0.000540545613523754, + "loss": 3.5736, + "step": 17100 + }, + { + "epoch": 4.995688650664181, + "grad_norm": 0.308977335691452, + "learning_rate": 0.0005403707373943456, + "loss": 3.5687, + "step": 17150 + }, + { + "epoch": 5.010195758564437, + "grad_norm": 0.33981844782829285, + "learning_rate": 0.0005401958612649372, + "loss": 3.5112, + "step": 17200 + }, + { + "epoch": 5.024761127942204, + "grad_norm": 0.32200807332992554, + "learning_rate": 0.000540020985135529, + "loss": 3.4613, + "step": 17250 + }, + { + "epoch": 5.039326497319972, + "grad_norm": 0.30297327041625977, + "learning_rate": 0.0005398461090061206, + "loss": 3.4576, + "step": 17300 + }, + { + "epoch": 5.0538918666977395, + "grad_norm": 0.32989853620529175, + "learning_rate": 0.0005396712328767123, + "loss": 3.4718, + "step": 17350 + }, + { + "epoch": 5.068457236075507, + "grad_norm": 0.31648069620132446, + "learning_rate": 0.000539496356747304, + "loss": 3.4785, + "step": 17400 + }, + { + "epoch": 5.083022605453274, + "grad_norm": 0.3199455440044403, + "learning_rate": 0.0005393214806178956, + "loss": 3.4711, + "step": 17450 + }, + { + "epoch": 5.0975879748310415, + "grad_norm": 0.33137065172195435, + "learning_rate": 0.0005391466044884873, + "loss": 3.491, + "step": 17500 + }, + { + "epoch": 5.112153344208809, + "grad_norm": 0.33032289147377014, + "learning_rate": 0.000538971728359079, + "loss": 3.4838, + "step": 17550 + }, + { + "epoch": 5.126718713586577, + "grad_norm": 0.32575368881225586, + "learning_rate": 0.0005387968522296705, + "loss": 3.492, + "step": 17600 + }, + { + "epoch": 5.141284082964344, + "grad_norm": 0.3349052965641022, + "learning_rate": 0.0005386219761002622, + "loss": 3.4838, + "step": 17650 + }, + { + "epoch": 5.155849452342111, + "grad_norm": 0.3152545094490051, + "learning_rate": 0.0005384470999708539, + "loss": 3.4819, + "step": 17700 + }, + { + "epoch": 5.170414821719879, + "grad_norm": 0.321737676858902, + "learning_rate": 0.0005382722238414456, + "loss": 3.486, + "step": 17750 + }, + { + "epoch": 5.1849801910976465, + "grad_norm": 0.36409512162208557, + "learning_rate": 0.0005380973477120373, + "loss": 3.4959, + "step": 17800 + }, + { + "epoch": 5.199545560475413, + "grad_norm": 0.32220658659935, + "learning_rate": 0.000537922471582629, + "loss": 3.4938, + "step": 17850 + }, + { + "epoch": 5.214110929853181, + "grad_norm": 0.3427848219871521, + "learning_rate": 0.0005377475954532206, + "loss": 3.4992, + "step": 17900 + }, + { + "epoch": 5.228676299230949, + "grad_norm": 0.33268028497695923, + "learning_rate": 0.0005375727193238123, + "loss": 3.4945, + "step": 17950 + }, + { + "epoch": 5.243241668608716, + "grad_norm": 0.2996683120727539, + "learning_rate": 0.000537397843194404, + "loss": 3.4929, + "step": 18000 + }, + { + "epoch": 5.243241668608716, + "eval_accuracy": 0.36062627011984466, + "eval_loss": 3.6342105865478516, + "eval_runtime": 182.824, + "eval_samples_per_second": 91.044, + "eval_steps_per_second": 5.694, + "step": 18000 + }, + { + "epoch": 5.257807037986483, + "grad_norm": 0.30999842286109924, + "learning_rate": 0.0005372229670649955, + "loss": 3.5096, + "step": 18050 + }, + { + "epoch": 5.272372407364251, + "grad_norm": 0.32671719789505005, + "learning_rate": 0.0005370480909355872, + "loss": 3.5092, + "step": 18100 + }, + { + "epoch": 5.286937776742018, + "grad_norm": 0.32220521569252014, + "learning_rate": 0.0005368732148061789, + "loss": 3.5055, + "step": 18150 + }, + { + "epoch": 5.301503146119786, + "grad_norm": 0.3446556031703949, + "learning_rate": 0.0005366983386767705, + "loss": 3.5103, + "step": 18200 + }, + { + "epoch": 5.316068515497553, + "grad_norm": 0.333375483751297, + "learning_rate": 0.0005365234625473623, + "loss": 3.5227, + "step": 18250 + }, + { + "epoch": 5.33063388487532, + "grad_norm": 0.32386448979377747, + "learning_rate": 0.0005363485864179539, + "loss": 3.5153, + "step": 18300 + }, + { + "epoch": 5.345199254253088, + "grad_norm": 0.31273576617240906, + "learning_rate": 0.0005361737102885456, + "loss": 3.5114, + "step": 18350 + }, + { + "epoch": 5.359764623630856, + "grad_norm": 0.32386016845703125, + "learning_rate": 0.0005359988341591373, + "loss": 3.5176, + "step": 18400 + }, + { + "epoch": 5.374329993008622, + "grad_norm": 0.3084457516670227, + "learning_rate": 0.000535823958029729, + "loss": 3.5144, + "step": 18450 + }, + { + "epoch": 5.38889536238639, + "grad_norm": 0.34510570764541626, + "learning_rate": 0.0005356490819003205, + "loss": 3.5283, + "step": 18500 + }, + { + "epoch": 5.403460731764158, + "grad_norm": 0.33156850934028625, + "learning_rate": 0.0005354742057709122, + "loss": 3.5198, + "step": 18550 + }, + { + "epoch": 5.418026101141925, + "grad_norm": 0.32253149151802063, + "learning_rate": 0.0005352993296415039, + "loss": 3.5129, + "step": 18600 + }, + { + "epoch": 5.432591470519692, + "grad_norm": 0.343431293964386, + "learning_rate": 0.0005351244535120955, + "loss": 3.5214, + "step": 18650 + }, + { + "epoch": 5.44715683989746, + "grad_norm": 0.320697158575058, + "learning_rate": 0.0005349495773826873, + "loss": 3.5089, + "step": 18700 + }, + { + "epoch": 5.461722209275227, + "grad_norm": 0.3079582154750824, + "learning_rate": 0.0005347747012532789, + "loss": 3.523, + "step": 18750 + }, + { + "epoch": 5.476287578652995, + "grad_norm": 0.33627447485923767, + "learning_rate": 0.0005345998251238706, + "loss": 3.5175, + "step": 18800 + }, + { + "epoch": 5.490852948030762, + "grad_norm": 0.3293239176273346, + "learning_rate": 0.0005344249489944623, + "loss": 3.5214, + "step": 18850 + }, + { + "epoch": 5.505418317408529, + "grad_norm": 0.321121484041214, + "learning_rate": 0.0005342500728650538, + "loss": 3.5227, + "step": 18900 + }, + { + "epoch": 5.519983686786297, + "grad_norm": 0.32323557138442993, + "learning_rate": 0.0005340751967356455, + "loss": 3.5089, + "step": 18950 + }, + { + "epoch": 5.534549056164065, + "grad_norm": 0.3353577256202698, + "learning_rate": 0.0005339003206062372, + "loss": 3.5125, + "step": 19000 + }, + { + "epoch": 5.534549056164065, + "eval_accuracy": 0.36119366134879455, + "eval_loss": 3.62491774559021, + "eval_runtime": 182.9044, + "eval_samples_per_second": 91.004, + "eval_steps_per_second": 5.691, + "step": 19000 + }, + { + "epoch": 5.549114425541831, + "grad_norm": 0.3373064696788788, + "learning_rate": 0.0005337254444768288, + "loss": 3.5161, + "step": 19050 + }, + { + "epoch": 5.563679794919599, + "grad_norm": 0.3205850422382355, + "learning_rate": 0.0005335505683474205, + "loss": 3.5198, + "step": 19100 + }, + { + "epoch": 5.578245164297367, + "grad_norm": 0.33948105573654175, + "learning_rate": 0.0005333756922180122, + "loss": 3.5155, + "step": 19150 + }, + { + "epoch": 5.592810533675134, + "grad_norm": 0.30776116251945496, + "learning_rate": 0.0005332008160886039, + "loss": 3.5019, + "step": 19200 + }, + { + "epoch": 5.607375903052901, + "grad_norm": 0.42596328258514404, + "learning_rate": 0.0005330259399591956, + "loss": 3.5311, + "step": 19250 + }, + { + "epoch": 5.621941272430669, + "grad_norm": 0.34155476093292236, + "learning_rate": 0.0005328510638297873, + "loss": 3.5215, + "step": 19300 + }, + { + "epoch": 5.636506641808436, + "grad_norm": 0.33613601326942444, + "learning_rate": 0.0005326761877003788, + "loss": 3.5089, + "step": 19350 + }, + { + "epoch": 5.651072011186204, + "grad_norm": 0.32420244812965393, + "learning_rate": 0.0005325013115709705, + "loss": 3.5318, + "step": 19400 + }, + { + "epoch": 5.665637380563971, + "grad_norm": 0.30287185311317444, + "learning_rate": 0.0005323264354415622, + "loss": 3.5207, + "step": 19450 + }, + { + "epoch": 5.6802027499417385, + "grad_norm": 0.3400566577911377, + "learning_rate": 0.0005321515593121538, + "loss": 3.5213, + "step": 19500 + }, + { + "epoch": 5.694768119319506, + "grad_norm": 0.31341907382011414, + "learning_rate": 0.0005319766831827455, + "loss": 3.5249, + "step": 19550 + }, + { + "epoch": 5.709333488697274, + "grad_norm": 0.3233737647533417, + "learning_rate": 0.0005318018070533372, + "loss": 3.5391, + "step": 19600 + }, + { + "epoch": 5.7238988580750405, + "grad_norm": 0.3121044635772705, + "learning_rate": 0.0005316269309239288, + "loss": 3.5232, + "step": 19650 + }, + { + "epoch": 5.738464227452808, + "grad_norm": 0.32169246673583984, + "learning_rate": 0.0005314520547945206, + "loss": 3.5159, + "step": 19700 + }, + { + "epoch": 5.753029596830576, + "grad_norm": 0.3152818977832794, + "learning_rate": 0.0005312771786651121, + "loss": 3.5194, + "step": 19750 + }, + { + "epoch": 5.7675949662083426, + "grad_norm": 0.31117457151412964, + "learning_rate": 0.0005311023025357038, + "loss": 3.5284, + "step": 19800 + }, + { + "epoch": 5.78216033558611, + "grad_norm": 0.3282223641872406, + "learning_rate": 0.0005309274264062955, + "loss": 3.526, + "step": 19850 + }, + { + "epoch": 5.796725704963878, + "grad_norm": 0.3373311161994934, + "learning_rate": 0.0005307525502768872, + "loss": 3.5291, + "step": 19900 + }, + { + "epoch": 5.8112910743416455, + "grad_norm": 0.3154096007347107, + "learning_rate": 0.0005305776741474788, + "loss": 3.53, + "step": 19950 + }, + { + "epoch": 5.825856443719413, + "grad_norm": 0.31639188528060913, + "learning_rate": 0.0005304027980180705, + "loss": 3.5162, + "step": 20000 + }, + { + "epoch": 5.825856443719413, + "eval_accuracy": 0.3622192215829698, + "eval_loss": 3.612488269805908, + "eval_runtime": 182.8981, + "eval_samples_per_second": 91.007, + "eval_steps_per_second": 5.692, + "step": 20000 + }, + { + "epoch": 5.84042181309718, + "grad_norm": 0.33335795998573303, + "learning_rate": 0.0005302279218886622, + "loss": 3.5267, + "step": 20050 + }, + { + "epoch": 5.8549871824749475, + "grad_norm": 0.32977914810180664, + "learning_rate": 0.0005300530457592538, + "loss": 3.5278, + "step": 20100 + }, + { + "epoch": 5.869552551852715, + "grad_norm": 0.319991797208786, + "learning_rate": 0.0005298781696298456, + "loss": 3.5243, + "step": 20150 + }, + { + "epoch": 5.884117921230482, + "grad_norm": 0.31696072220802307, + "learning_rate": 0.0005297032935004371, + "loss": 3.5255, + "step": 20200 + }, + { + "epoch": 5.89868329060825, + "grad_norm": 0.31803423166275024, + "learning_rate": 0.0005295284173710288, + "loss": 3.5163, + "step": 20250 + }, + { + "epoch": 5.913248659986017, + "grad_norm": 0.30268293619155884, + "learning_rate": 0.0005293535412416205, + "loss": 3.5247, + "step": 20300 + }, + { + "epoch": 5.927814029363785, + "grad_norm": 0.3222343325614929, + "learning_rate": 0.0005291786651122121, + "loss": 3.5226, + "step": 20350 + }, + { + "epoch": 5.9423793987415525, + "grad_norm": 0.3210581839084625, + "learning_rate": 0.0005290037889828038, + "loss": 3.5315, + "step": 20400 + }, + { + "epoch": 5.956944768119319, + "grad_norm": 0.30163270235061646, + "learning_rate": 0.0005288289128533955, + "loss": 3.531, + "step": 20450 + }, + { + "epoch": 5.971510137497087, + "grad_norm": 0.3174237906932831, + "learning_rate": 0.0005286540367239872, + "loss": 3.5308, + "step": 20500 + }, + { + "epoch": 5.986075506874855, + "grad_norm": 0.3105069398880005, + "learning_rate": 0.0005284791605945788, + "loss": 3.5229, + "step": 20550 + }, + { + "epoch": 6.0005826147751105, + "grad_norm": 0.35065528750419617, + "learning_rate": 0.0005283042844651704, + "loss": 3.5306, + "step": 20600 + }, + { + "epoch": 6.015147984152878, + "grad_norm": 0.3209701180458069, + "learning_rate": 0.0005281294083357621, + "loss": 3.4202, + "step": 20650 + }, + { + "epoch": 6.029713353530646, + "grad_norm": 0.35732001066207886, + "learning_rate": 0.0005279545322063538, + "loss": 3.4097, + "step": 20700 + }, + { + "epoch": 6.044278722908413, + "grad_norm": 0.3293222188949585, + "learning_rate": 0.0005277796560769455, + "loss": 3.4211, + "step": 20750 + }, + { + "epoch": 6.05884409228618, + "grad_norm": 0.3226439952850342, + "learning_rate": 0.0005276047799475371, + "loss": 3.4286, + "step": 20800 + }, + { + "epoch": 6.073409461663948, + "grad_norm": 0.33143454790115356, + "learning_rate": 0.0005274299038181288, + "loss": 3.4274, + "step": 20850 + }, + { + "epoch": 6.087974831041715, + "grad_norm": 0.31629180908203125, + "learning_rate": 0.0005272550276887205, + "loss": 3.4214, + "step": 20900 + }, + { + "epoch": 6.102540200419483, + "grad_norm": 0.3248637020587921, + "learning_rate": 0.0005270801515593121, + "loss": 3.448, + "step": 20950 + }, + { + "epoch": 6.11710556979725, + "grad_norm": 0.333915114402771, + "learning_rate": 0.0005269052754299037, + "loss": 3.4304, + "step": 21000 + }, + { + "epoch": 6.11710556979725, + "eval_accuracy": 0.36297096205497675, + "eval_loss": 3.6168880462646484, + "eval_runtime": 183.1564, + "eval_samples_per_second": 90.879, + "eval_steps_per_second": 5.684, + "step": 21000 + }, + { + "epoch": 6.1316709391750175, + "grad_norm": 0.339853435754776, + "learning_rate": 0.0005267303993004954, + "loss": 3.432, + "step": 21050 + }, + { + "epoch": 6.146236308552785, + "grad_norm": 0.3413807153701782, + "learning_rate": 0.000526555523171087, + "loss": 3.4503, + "step": 21100 + }, + { + "epoch": 6.160801677930552, + "grad_norm": 0.3138124942779541, + "learning_rate": 0.0005263806470416788, + "loss": 3.4536, + "step": 21150 + }, + { + "epoch": 6.1753670473083195, + "grad_norm": 0.3247540593147278, + "learning_rate": 0.0005262057709122704, + "loss": 3.4454, + "step": 21200 + }, + { + "epoch": 6.189932416686087, + "grad_norm": 0.30432644486427307, + "learning_rate": 0.0005260308947828621, + "loss": 3.4577, + "step": 21250 + }, + { + "epoch": 6.204497786063855, + "grad_norm": 0.3186846077442169, + "learning_rate": 0.0005258560186534538, + "loss": 3.4504, + "step": 21300 + }, + { + "epoch": 6.219063155441622, + "grad_norm": 0.3390951454639435, + "learning_rate": 0.0005256811425240455, + "loss": 3.4427, + "step": 21350 + }, + { + "epoch": 6.233628524819389, + "grad_norm": 0.3416147828102112, + "learning_rate": 0.0005255062663946371, + "loss": 3.4451, + "step": 21400 + }, + { + "epoch": 6.248193894197157, + "grad_norm": 0.330108106136322, + "learning_rate": 0.0005253313902652287, + "loss": 3.4634, + "step": 21450 + }, + { + "epoch": 6.2627592635749245, + "grad_norm": 0.32614409923553467, + "learning_rate": 0.0005251565141358204, + "loss": 3.4689, + "step": 21500 + }, + { + "epoch": 6.277324632952691, + "grad_norm": 0.3183272182941437, + "learning_rate": 0.000524981638006412, + "loss": 3.4614, + "step": 21550 + }, + { + "epoch": 6.291890002330459, + "grad_norm": 0.3260941803455353, + "learning_rate": 0.0005248067618770038, + "loss": 3.4687, + "step": 21600 + }, + { + "epoch": 6.306455371708227, + "grad_norm": 0.34689396619796753, + "learning_rate": 0.0005246318857475954, + "loss": 3.4607, + "step": 21650 + }, + { + "epoch": 6.321020741085994, + "grad_norm": 0.3259231150150299, + "learning_rate": 0.0005244570096181871, + "loss": 3.4606, + "step": 21700 + }, + { + "epoch": 6.335586110463761, + "grad_norm": 0.33066433668136597, + "learning_rate": 0.0005242821334887788, + "loss": 3.4666, + "step": 21750 + }, + { + "epoch": 6.350151479841529, + "grad_norm": 0.3282213807106018, + "learning_rate": 0.0005241072573593704, + "loss": 3.4566, + "step": 21800 + }, + { + "epoch": 6.364716849219296, + "grad_norm": 0.32075440883636475, + "learning_rate": 0.000523932381229962, + "loss": 3.4641, + "step": 21850 + }, + { + "epoch": 6.379282218597064, + "grad_norm": 0.3442796468734741, + "learning_rate": 0.0005237575051005537, + "loss": 3.4588, + "step": 21900 + }, + { + "epoch": 6.393847587974831, + "grad_norm": 0.3072233498096466, + "learning_rate": 0.0005235826289711454, + "loss": 3.462, + "step": 21950 + }, + { + "epoch": 6.408412957352598, + "grad_norm": 0.35822081565856934, + "learning_rate": 0.000523407752841737, + "loss": 3.4737, + "step": 22000 + }, + { + "epoch": 6.408412957352598, + "eval_accuracy": 0.3631809414861629, + "eval_loss": 3.6075851917266846, + "eval_runtime": 183.2532, + "eval_samples_per_second": 90.831, + "eval_steps_per_second": 5.681, + "step": 22000 + }, + { + "epoch": 6.422978326730366, + "grad_norm": 0.3440370559692383, + "learning_rate": 0.0005232328767123287, + "loss": 3.4619, + "step": 22050 + }, + { + "epoch": 6.437543696108134, + "grad_norm": 0.3295625150203705, + "learning_rate": 0.0005230580005829204, + "loss": 3.4756, + "step": 22100 + }, + { + "epoch": 6.4521090654859, + "grad_norm": 0.32679641246795654, + "learning_rate": 0.0005228831244535121, + "loss": 3.4795, + "step": 22150 + }, + { + "epoch": 6.466674434863668, + "grad_norm": 0.3422777056694031, + "learning_rate": 0.0005227082483241038, + "loss": 3.4777, + "step": 22200 + }, + { + "epoch": 6.481239804241436, + "grad_norm": 0.3188982903957367, + "learning_rate": 0.0005225333721946954, + "loss": 3.4789, + "step": 22250 + }, + { + "epoch": 6.495805173619203, + "grad_norm": 0.33180058002471924, + "learning_rate": 0.000522358496065287, + "loss": 3.4764, + "step": 22300 + }, + { + "epoch": 6.51037054299697, + "grad_norm": 0.35259467363357544, + "learning_rate": 0.0005221836199358787, + "loss": 3.4783, + "step": 22350 + }, + { + "epoch": 6.524935912374738, + "grad_norm": 0.31127235293388367, + "learning_rate": 0.0005220087438064703, + "loss": 3.4812, + "step": 22400 + }, + { + "epoch": 6.539501281752505, + "grad_norm": 0.3426322340965271, + "learning_rate": 0.000521833867677062, + "loss": 3.4858, + "step": 22450 + }, + { + "epoch": 6.554066651130273, + "grad_norm": 0.33666297793388367, + "learning_rate": 0.0005216589915476537, + "loss": 3.4842, + "step": 22500 + }, + { + "epoch": 6.56863202050804, + "grad_norm": 0.33777403831481934, + "learning_rate": 0.0005214841154182454, + "loss": 3.4867, + "step": 22550 + }, + { + "epoch": 6.583197389885807, + "grad_norm": 0.33145034313201904, + "learning_rate": 0.0005213092392888371, + "loss": 3.4843, + "step": 22600 + }, + { + "epoch": 6.597762759263575, + "grad_norm": 0.3483746647834778, + "learning_rate": 0.0005211343631594287, + "loss": 3.4842, + "step": 22650 + }, + { + "epoch": 6.612328128641343, + "grad_norm": 0.34439757466316223, + "learning_rate": 0.0005209594870300204, + "loss": 3.4747, + "step": 22700 + }, + { + "epoch": 6.626893498019109, + "grad_norm": 0.330509752035141, + "learning_rate": 0.000520784610900612, + "loss": 3.4757, + "step": 22750 + }, + { + "epoch": 6.641458867396877, + "grad_norm": 0.3516590893268585, + "learning_rate": 0.0005206097347712037, + "loss": 3.4689, + "step": 22800 + }, + { + "epoch": 6.656024236774645, + "grad_norm": 0.35408666729927063, + "learning_rate": 0.0005204348586417953, + "loss": 3.4825, + "step": 22850 + }, + { + "epoch": 6.670589606152412, + "grad_norm": 0.35224005579948425, + "learning_rate": 0.000520259982512387, + "loss": 3.4933, + "step": 22900 + }, + { + "epoch": 6.685154975530179, + "grad_norm": 0.33899161219596863, + "learning_rate": 0.0005200851063829787, + "loss": 3.4857, + "step": 22950 + }, + { + "epoch": 6.699720344907947, + "grad_norm": 0.3293270170688629, + "learning_rate": 0.0005199102302535703, + "loss": 3.4775, + "step": 23000 + }, + { + "epoch": 6.699720344907947, + "eval_accuracy": 0.3640601274807935, + "eval_loss": 3.599479913711548, + "eval_runtime": 183.341, + "eval_samples_per_second": 90.787, + "eval_steps_per_second": 5.678, + "step": 23000 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.32254496216773987, + "learning_rate": 0.0005197353541241621, + "loss": 3.4873, + "step": 23050 + }, + { + "epoch": 6.728851083663482, + "grad_norm": 0.32425615191459656, + "learning_rate": 0.0005195604779947537, + "loss": 3.4951, + "step": 23100 + }, + { + "epoch": 6.743416453041249, + "grad_norm": 0.3163357675075531, + "learning_rate": 0.0005193856018653454, + "loss": 3.4845, + "step": 23150 + }, + { + "epoch": 6.7579818224190165, + "grad_norm": 0.3210630416870117, + "learning_rate": 0.000519210725735937, + "loss": 3.4869, + "step": 23200 + }, + { + "epoch": 6.772547191796784, + "grad_norm": 0.33914175629615784, + "learning_rate": 0.0005190358496065286, + "loss": 3.4848, + "step": 23250 + }, + { + "epoch": 6.787112561174552, + "grad_norm": 0.3026992082595825, + "learning_rate": 0.0005188609734771203, + "loss": 3.4891, + "step": 23300 + }, + { + "epoch": 6.8016779305523185, + "grad_norm": 0.3406318724155426, + "learning_rate": 0.000518686097347712, + "loss": 3.4859, + "step": 23350 + }, + { + "epoch": 6.816243299930086, + "grad_norm": 0.3310200273990631, + "learning_rate": 0.0005185112212183037, + "loss": 3.5022, + "step": 23400 + }, + { + "epoch": 6.830808669307854, + "grad_norm": 0.30199411511421204, + "learning_rate": 0.0005183363450888953, + "loss": 3.4872, + "step": 23450 + }, + { + "epoch": 6.845374038685621, + "grad_norm": 0.3239602744579315, + "learning_rate": 0.000518161468959487, + "loss": 3.4867, + "step": 23500 + }, + { + "epoch": 6.859939408063388, + "grad_norm": 0.32365167140960693, + "learning_rate": 0.0005179865928300787, + "loss": 3.4923, + "step": 23550 + }, + { + "epoch": 6.874504777441156, + "grad_norm": 0.31722819805145264, + "learning_rate": 0.0005178117167006703, + "loss": 3.4779, + "step": 23600 + }, + { + "epoch": 6.8890701468189235, + "grad_norm": 0.3375761806964874, + "learning_rate": 0.000517636840571262, + "loss": 3.4864, + "step": 23650 + }, + { + "epoch": 6.903635516196691, + "grad_norm": 0.30887332558631897, + "learning_rate": 0.0005174619644418536, + "loss": 3.4974, + "step": 23700 + }, + { + "epoch": 6.918200885574458, + "grad_norm": 0.34449508786201477, + "learning_rate": 0.0005172870883124453, + "loss": 3.4916, + "step": 23750 + }, + { + "epoch": 6.9327662549522255, + "grad_norm": 0.3328838348388672, + "learning_rate": 0.000517112212183037, + "loss": 3.4975, + "step": 23800 + }, + { + "epoch": 6.947331624329993, + "grad_norm": 0.33597683906555176, + "learning_rate": 0.0005169373360536286, + "loss": 3.4857, + "step": 23850 + }, + { + "epoch": 6.961896993707761, + "grad_norm": 0.3501330316066742, + "learning_rate": 0.0005167624599242203, + "loss": 3.4972, + "step": 23900 + }, + { + "epoch": 6.976462363085528, + "grad_norm": 0.3559366762638092, + "learning_rate": 0.000516587583794812, + "loss": 3.4917, + "step": 23950 + }, + { + "epoch": 6.991027732463295, + "grad_norm": 0.3154732286930084, + "learning_rate": 0.0005164127076654037, + "loss": 3.49, + "step": 24000 + }, + { + "epoch": 6.991027732463295, + "eval_accuracy": 0.36516716349649847, + "eval_loss": 3.586782932281494, + "eval_runtime": 182.9453, + "eval_samples_per_second": 90.983, + "eval_steps_per_second": 5.69, + "step": 24000 + }, + { + "epoch": 7.005534840363552, + "grad_norm": 0.3098287880420685, + "learning_rate": 0.0005162378315359953, + "loss": 3.4496, + "step": 24050 + }, + { + "epoch": 7.020100209741319, + "grad_norm": 0.34997889399528503, + "learning_rate": 0.0005160629554065869, + "loss": 3.3794, + "step": 24100 + }, + { + "epoch": 7.034665579119086, + "grad_norm": 0.33267155289649963, + "learning_rate": 0.0005158880792771786, + "loss": 3.3792, + "step": 24150 + }, + { + "epoch": 7.049230948496854, + "grad_norm": 0.3368155360221863, + "learning_rate": 0.0005157132031477703, + "loss": 3.3876, + "step": 24200 + }, + { + "epoch": 7.063796317874622, + "grad_norm": 0.33117562532424927, + "learning_rate": 0.000515538327018362, + "loss": 3.4, + "step": 24250 + }, + { + "epoch": 7.0783616872523885, + "grad_norm": 0.3448326587677002, + "learning_rate": 0.0005153634508889536, + "loss": 3.4042, + "step": 24300 + }, + { + "epoch": 7.092927056630156, + "grad_norm": 0.3393695652484894, + "learning_rate": 0.0005151885747595453, + "loss": 3.4025, + "step": 24350 + }, + { + "epoch": 7.107492426007924, + "grad_norm": 0.32103875279426575, + "learning_rate": 0.000515013698630137, + "loss": 3.3923, + "step": 24400 + }, + { + "epoch": 7.122057795385691, + "grad_norm": 0.32782039046287537, + "learning_rate": 0.0005148388225007285, + "loss": 3.4004, + "step": 24450 + }, + { + "epoch": 7.136623164763458, + "grad_norm": 0.321836918592453, + "learning_rate": 0.0005146639463713203, + "loss": 3.4083, + "step": 24500 + }, + { + "epoch": 7.151188534141226, + "grad_norm": 0.3271026015281677, + "learning_rate": 0.0005144890702419119, + "loss": 3.4261, + "step": 24550 + }, + { + "epoch": 7.165753903518993, + "grad_norm": 0.32715457677841187, + "learning_rate": 0.0005143141941125036, + "loss": 3.4129, + "step": 24600 + }, + { + "epoch": 7.180319272896761, + "grad_norm": 0.3357333838939667, + "learning_rate": 0.0005141393179830953, + "loss": 3.4146, + "step": 24650 + }, + { + "epoch": 7.194884642274528, + "grad_norm": 0.3631840646266937, + "learning_rate": 0.0005139644418536869, + "loss": 3.4181, + "step": 24700 + }, + { + "epoch": 7.2094500116522955, + "grad_norm": 0.3274785578250885, + "learning_rate": 0.0005137895657242786, + "loss": 3.4125, + "step": 24750 + }, + { + "epoch": 7.224015381030063, + "grad_norm": 0.3449844419956207, + "learning_rate": 0.0005136146895948703, + "loss": 3.4166, + "step": 24800 + }, + { + "epoch": 7.238580750407831, + "grad_norm": 0.32263556122779846, + "learning_rate": 0.000513439813465462, + "loss": 3.4285, + "step": 24850 + }, + { + "epoch": 7.2531461197855975, + "grad_norm": 0.3413465619087219, + "learning_rate": 0.0005132649373360535, + "loss": 3.4285, + "step": 24900 + }, + { + "epoch": 7.267711489163365, + "grad_norm": 0.3473232388496399, + "learning_rate": 0.0005130900612066452, + "loss": 3.4347, + "step": 24950 + }, + { + "epoch": 7.282276858541133, + "grad_norm": 0.32374686002731323, + "learning_rate": 0.0005129151850772369, + "loss": 3.4352, + "step": 25000 + }, + { + "epoch": 7.282276858541133, + "eval_accuracy": 0.3647126391510529, + "eval_loss": 3.598836898803711, + "eval_runtime": 183.3183, + "eval_samples_per_second": 90.798, + "eval_steps_per_second": 5.679, + "step": 25000 + }, + { + "epoch": 7.2968422279189, + "grad_norm": 0.3352644443511963, + "learning_rate": 0.0005127403089478286, + "loss": 3.4219, + "step": 25050 + }, + { + "epoch": 7.311407597296667, + "grad_norm": 0.36998191475868225, + "learning_rate": 0.0005125654328184203, + "loss": 3.4354, + "step": 25100 + }, + { + "epoch": 7.325972966674435, + "grad_norm": 0.33111071586608887, + "learning_rate": 0.0005123905566890119, + "loss": 3.4273, + "step": 25150 + }, + { + "epoch": 7.3405383360522025, + "grad_norm": 0.3304993212223053, + "learning_rate": 0.0005122156805596036, + "loss": 3.4285, + "step": 25200 + }, + { + "epoch": 7.35510370542997, + "grad_norm": 0.34959882497787476, + "learning_rate": 0.0005120408044301953, + "loss": 3.4227, + "step": 25250 + }, + { + "epoch": 7.369669074807737, + "grad_norm": 0.36705702543258667, + "learning_rate": 0.0005118659283007868, + "loss": 3.4247, + "step": 25300 + }, + { + "epoch": 7.384234444185505, + "grad_norm": 0.3186073899269104, + "learning_rate": 0.0005116910521713785, + "loss": 3.4439, + "step": 25350 + }, + { + "epoch": 7.398799813563272, + "grad_norm": 0.3507457971572876, + "learning_rate": 0.0005115161760419702, + "loss": 3.4373, + "step": 25400 + }, + { + "epoch": 7.413365182941039, + "grad_norm": 0.3345440626144409, + "learning_rate": 0.0005113412999125619, + "loss": 3.4429, + "step": 25450 + }, + { + "epoch": 7.427930552318807, + "grad_norm": 0.3337697684764862, + "learning_rate": 0.0005111664237831536, + "loss": 3.4366, + "step": 25500 + }, + { + "epoch": 7.442495921696574, + "grad_norm": 0.3243204653263092, + "learning_rate": 0.0005109915476537452, + "loss": 3.451, + "step": 25550 + }, + { + "epoch": 7.457061291074342, + "grad_norm": 0.34631916880607605, + "learning_rate": 0.0005108166715243369, + "loss": 3.4431, + "step": 25600 + }, + { + "epoch": 7.471626660452109, + "grad_norm": 0.31301483511924744, + "learning_rate": 0.0005106417953949286, + "loss": 3.436, + "step": 25650 + }, + { + "epoch": 7.486192029829876, + "grad_norm": 0.3258545994758606, + "learning_rate": 0.0005104669192655203, + "loss": 3.4431, + "step": 25700 + }, + { + "epoch": 7.500757399207644, + "grad_norm": 0.3321167230606079, + "learning_rate": 0.0005102920431361118, + "loss": 3.4506, + "step": 25750 + }, + { + "epoch": 7.515322768585412, + "grad_norm": 0.3389933407306671, + "learning_rate": 0.0005101171670067035, + "loss": 3.4348, + "step": 25800 + }, + { + "epoch": 7.529888137963178, + "grad_norm": 0.34453973174095154, + "learning_rate": 0.0005099422908772952, + "loss": 3.43, + "step": 25850 + }, + { + "epoch": 7.544453507340946, + "grad_norm": 0.3527055084705353, + "learning_rate": 0.0005097674147478868, + "loss": 3.4518, + "step": 25900 + }, + { + "epoch": 7.559018876718714, + "grad_norm": 0.33843597769737244, + "learning_rate": 0.0005095925386184786, + "loss": 3.4435, + "step": 25950 + }, + { + "epoch": 7.573584246096481, + "grad_norm": 0.3532460927963257, + "learning_rate": 0.0005094176624890702, + "loss": 3.4508, + "step": 26000 + }, + { + "epoch": 7.573584246096481, + "eval_accuracy": 0.36540406638218725, + "eval_loss": 3.589508056640625, + "eval_runtime": 183.429, + "eval_samples_per_second": 90.744, + "eval_steps_per_second": 5.675, + "step": 26000 + }, + { + "epoch": 7.588149615474248, + "grad_norm": 0.33581236004829407, + "learning_rate": 0.0005092427863596619, + "loss": 3.4393, + "step": 26050 + }, + { + "epoch": 7.602714984852016, + "grad_norm": 0.31811484694480896, + "learning_rate": 0.0005090679102302536, + "loss": 3.4466, + "step": 26100 + }, + { + "epoch": 7.617280354229783, + "grad_norm": 0.34038013219833374, + "learning_rate": 0.0005088930341008451, + "loss": 3.4549, + "step": 26150 + }, + { + "epoch": 7.631845723607551, + "grad_norm": 0.314603716135025, + "learning_rate": 0.0005087181579714368, + "loss": 3.4585, + "step": 26200 + }, + { + "epoch": 7.646411092985318, + "grad_norm": 0.3383300006389618, + "learning_rate": 0.0005085432818420285, + "loss": 3.4489, + "step": 26250 + }, + { + "epoch": 7.660976462363085, + "grad_norm": 0.32058727741241455, + "learning_rate": 0.0005083684057126202, + "loss": 3.4562, + "step": 26300 + }, + { + "epoch": 7.675541831740853, + "grad_norm": 0.3361349105834961, + "learning_rate": 0.0005081935295832118, + "loss": 3.4513, + "step": 26350 + }, + { + "epoch": 7.690107201118621, + "grad_norm": 0.3190141022205353, + "learning_rate": 0.0005080186534538035, + "loss": 3.4377, + "step": 26400 + }, + { + "epoch": 7.704672570496387, + "grad_norm": 0.32522907853126526, + "learning_rate": 0.0005078437773243952, + "loss": 3.4548, + "step": 26450 + }, + { + "epoch": 7.719237939874155, + "grad_norm": 0.3320629596710205, + "learning_rate": 0.0005076689011949869, + "loss": 3.4482, + "step": 26500 + }, + { + "epoch": 7.733803309251923, + "grad_norm": 0.3172106146812439, + "learning_rate": 0.0005074940250655786, + "loss": 3.4546, + "step": 26550 + }, + { + "epoch": 7.74836867862969, + "grad_norm": 0.339695006608963, + "learning_rate": 0.0005073191489361701, + "loss": 3.456, + "step": 26600 + }, + { + "epoch": 7.762934048007457, + "grad_norm": 0.3172706365585327, + "learning_rate": 0.0005071442728067618, + "loss": 3.4525, + "step": 26650 + }, + { + "epoch": 7.777499417385225, + "grad_norm": 0.3277393877506256, + "learning_rate": 0.0005069693966773535, + "loss": 3.4459, + "step": 26700 + }, + { + "epoch": 7.792064786762992, + "grad_norm": 0.3323872685432434, + "learning_rate": 0.0005067945205479451, + "loss": 3.4475, + "step": 26750 + }, + { + "epoch": 7.80663015614076, + "grad_norm": 0.339382529258728, + "learning_rate": 0.0005066196444185368, + "loss": 3.4398, + "step": 26800 + }, + { + "epoch": 7.821195525518527, + "grad_norm": 0.33565738797187805, + "learning_rate": 0.0005064447682891285, + "loss": 3.4522, + "step": 26850 + }, + { + "epoch": 7.8357608948962945, + "grad_norm": 0.33060815930366516, + "learning_rate": 0.0005062698921597202, + "loss": 3.4555, + "step": 26900 + }, + { + "epoch": 7.850326264274062, + "grad_norm": 0.3253958225250244, + "learning_rate": 0.0005060950160303119, + "loss": 3.4451, + "step": 26950 + }, + { + "epoch": 7.86489163365183, + "grad_norm": 0.3636493384838104, + "learning_rate": 0.0005059201399009035, + "loss": 3.4523, + "step": 27000 + }, + { + "epoch": 7.86489163365183, + "eval_accuracy": 0.36607174453991753, + "eval_loss": 3.5809550285339355, + "eval_runtime": 183.0826, + "eval_samples_per_second": 90.915, + "eval_steps_per_second": 5.686, + "step": 27000 + }, + { + "epoch": 7.8794570030295965, + "grad_norm": 0.3568457365036011, + "learning_rate": 0.0005057452637714951, + "loss": 3.4527, + "step": 27050 + }, + { + "epoch": 7.894022372407364, + "grad_norm": 0.3288189172744751, + "learning_rate": 0.0005055703876420868, + "loss": 3.4552, + "step": 27100 + }, + { + "epoch": 7.908587741785132, + "grad_norm": 0.29840394854545593, + "learning_rate": 0.0005053955115126785, + "loss": 3.4563, + "step": 27150 + }, + { + "epoch": 7.923153111162899, + "grad_norm": 0.37738728523254395, + "learning_rate": 0.0005052206353832701, + "loss": 3.4605, + "step": 27200 + }, + { + "epoch": 7.937718480540666, + "grad_norm": 0.3255780339241028, + "learning_rate": 0.0005050457592538618, + "loss": 3.4607, + "step": 27250 + }, + { + "epoch": 7.952283849918434, + "grad_norm": 0.3199935853481293, + "learning_rate": 0.0005048708831244535, + "loss": 3.4621, + "step": 27300 + }, + { + "epoch": 7.9668492192962015, + "grad_norm": 0.34419822692871094, + "learning_rate": 0.0005046960069950451, + "loss": 3.4547, + "step": 27350 + }, + { + "epoch": 7.981414588673969, + "grad_norm": 0.31108030676841736, + "learning_rate": 0.0005045211308656369, + "loss": 3.4622, + "step": 27400 + }, + { + "epoch": 7.995979958051736, + "grad_norm": 0.3241368532180786, + "learning_rate": 0.0005043462547362284, + "loss": 3.4599, + "step": 27450 + }, + { + "epoch": 8.010487065951992, + "grad_norm": 0.34238871932029724, + "learning_rate": 0.0005041713786068201, + "loss": 3.3724, + "step": 27500 + }, + { + "epoch": 8.02505243532976, + "grad_norm": 0.3241780400276184, + "learning_rate": 0.0005039965024774118, + "loss": 3.3516, + "step": 27550 + }, + { + "epoch": 8.039617804707527, + "grad_norm": 0.33774638175964355, + "learning_rate": 0.0005038216263480034, + "loss": 3.3319, + "step": 27600 + }, + { + "epoch": 8.054183174085296, + "grad_norm": 0.33839425444602966, + "learning_rate": 0.0005036467502185951, + "loss": 3.3643, + "step": 27650 + }, + { + "epoch": 8.068748543463062, + "grad_norm": 0.33138301968574524, + "learning_rate": 0.0005034718740891868, + "loss": 3.3576, + "step": 27700 + }, + { + "epoch": 8.08331391284083, + "grad_norm": 0.331544429063797, + "learning_rate": 0.0005032969979597785, + "loss": 3.3717, + "step": 27750 + }, + { + "epoch": 8.097879282218598, + "grad_norm": 0.3336641192436218, + "learning_rate": 0.0005031221218303701, + "loss": 3.3637, + "step": 27800 + }, + { + "epoch": 8.112444651596364, + "grad_norm": 0.3461131453514099, + "learning_rate": 0.0005029472457009618, + "loss": 3.367, + "step": 27850 + }, + { + "epoch": 8.127010020974131, + "grad_norm": 0.3179863691329956, + "learning_rate": 0.0005027723695715534, + "loss": 3.3618, + "step": 27900 + }, + { + "epoch": 8.1415753903519, + "grad_norm": 0.3158666491508484, + "learning_rate": 0.0005025974934421451, + "loss": 3.3798, + "step": 27950 + }, + { + "epoch": 8.156140759729666, + "grad_norm": 0.34379681944847107, + "learning_rate": 0.0005024226173127368, + "loss": 3.3762, + "step": 28000 + }, + { + "epoch": 8.156140759729666, + "eval_accuracy": 0.3662919525324213, + "eval_loss": 3.588055372238159, + "eval_runtime": 182.9945, + "eval_samples_per_second": 90.959, + "eval_steps_per_second": 5.689, + "step": 28000 + }, + { + "epoch": 8.170706129107435, + "grad_norm": 0.3444662392139435, + "learning_rate": 0.0005022477411833284, + "loss": 3.3854, + "step": 28050 + }, + { + "epoch": 8.185271498485202, + "grad_norm": 0.3341796398162842, + "learning_rate": 0.0005020728650539201, + "loss": 3.3786, + "step": 28100 + }, + { + "epoch": 8.199836867862969, + "grad_norm": 0.3741398751735687, + "learning_rate": 0.0005018979889245118, + "loss": 3.3783, + "step": 28150 + }, + { + "epoch": 8.214402237240737, + "grad_norm": 0.329504132270813, + "learning_rate": 0.0005017231127951034, + "loss": 3.3892, + "step": 28200 + }, + { + "epoch": 8.228967606618504, + "grad_norm": 0.37047260999679565, + "learning_rate": 0.0005015482366656951, + "loss": 3.3807, + "step": 28250 + }, + { + "epoch": 8.24353297599627, + "grad_norm": 0.35633140802383423, + "learning_rate": 0.0005013733605362868, + "loss": 3.392, + "step": 28300 + }, + { + "epoch": 8.258098345374039, + "grad_norm": 0.39560022950172424, + "learning_rate": 0.0005011984844068784, + "loss": 3.3931, + "step": 28350 + }, + { + "epoch": 8.272663714751806, + "grad_norm": 0.3520383834838867, + "learning_rate": 0.0005010236082774701, + "loss": 3.404, + "step": 28400 + }, + { + "epoch": 8.287229084129574, + "grad_norm": 0.3606109023094177, + "learning_rate": 0.0005008487321480617, + "loss": 3.3849, + "step": 28450 + }, + { + "epoch": 8.301794453507341, + "grad_norm": 0.35454607009887695, + "learning_rate": 0.0005006738560186534, + "loss": 3.3959, + "step": 28500 + }, + { + "epoch": 8.316359822885108, + "grad_norm": 0.3578483462333679, + "learning_rate": 0.0005004989798892451, + "loss": 3.4044, + "step": 28550 + }, + { + "epoch": 8.330925192262876, + "grad_norm": 0.36204418540000916, + "learning_rate": 0.0005003241037598368, + "loss": 3.3994, + "step": 28600 + }, + { + "epoch": 8.345490561640643, + "grad_norm": 0.35746678709983826, + "learning_rate": 0.0005001492276304284, + "loss": 3.3966, + "step": 28650 + }, + { + "epoch": 8.36005593101841, + "grad_norm": 0.32434630393981934, + "learning_rate": 0.0004999743515010201, + "loss": 3.408, + "step": 28700 + }, + { + "epoch": 8.374621300396178, + "grad_norm": 0.3382759094238281, + "learning_rate": 0.0004997994753716117, + "loss": 3.3965, + "step": 28750 + }, + { + "epoch": 8.389186669773945, + "grad_norm": 0.3254185616970062, + "learning_rate": 0.0004996245992422033, + "loss": 3.4115, + "step": 28800 + }, + { + "epoch": 8.403752039151712, + "grad_norm": 0.3328564763069153, + "learning_rate": 0.0004994497231127951, + "loss": 3.3962, + "step": 28850 + }, + { + "epoch": 8.41831740852948, + "grad_norm": 0.3341105580329895, + "learning_rate": 0.0004992748469833867, + "loss": 3.4067, + "step": 28900 + }, + { + "epoch": 8.432882777907247, + "grad_norm": 0.3595927655696869, + "learning_rate": 0.0004990999708539784, + "loss": 3.4093, + "step": 28950 + }, + { + "epoch": 8.447448147285016, + "grad_norm": 0.32767578959465027, + "learning_rate": 0.0004989250947245701, + "loss": 3.4033, + "step": 29000 + }, + { + "epoch": 8.447448147285016, + "eval_accuracy": 0.3668947322321366, + "eval_loss": 3.5828919410705566, + "eval_runtime": 183.1656, + "eval_samples_per_second": 90.874, + "eval_steps_per_second": 5.683, + "step": 29000 + }, + { + "epoch": 8.462013516662783, + "grad_norm": 0.335256963968277, + "learning_rate": 0.0004987502185951617, + "loss": 3.4103, + "step": 29050 + }, + { + "epoch": 8.47657888604055, + "grad_norm": 0.328800767660141, + "learning_rate": 0.0004985753424657534, + "loss": 3.4104, + "step": 29100 + }, + { + "epoch": 8.491144255418318, + "grad_norm": 0.3295251429080963, + "learning_rate": 0.000498400466336345, + "loss": 3.4066, + "step": 29150 + }, + { + "epoch": 8.505709624796085, + "grad_norm": 0.3609812259674072, + "learning_rate": 0.0004982255902069367, + "loss": 3.4191, + "step": 29200 + }, + { + "epoch": 8.520274994173853, + "grad_norm": 0.353671669960022, + "learning_rate": 0.0004980507140775283, + "loss": 3.4129, + "step": 29250 + }, + { + "epoch": 8.53484036355162, + "grad_norm": 0.32933372259140015, + "learning_rate": 0.0004978758379481201, + "loss": 3.423, + "step": 29300 + }, + { + "epoch": 8.549405732929387, + "grad_norm": 0.32520967721939087, + "learning_rate": 0.0004977009618187117, + "loss": 3.4124, + "step": 29350 + }, + { + "epoch": 8.563971102307155, + "grad_norm": 0.3547916114330292, + "learning_rate": 0.0004975260856893034, + "loss": 3.4198, + "step": 29400 + }, + { + "epoch": 8.578536471684922, + "grad_norm": 0.34057968854904175, + "learning_rate": 0.0004973512095598951, + "loss": 3.4191, + "step": 29450 + }, + { + "epoch": 8.593101841062689, + "grad_norm": 0.33685600757598877, + "learning_rate": 0.0004971763334304867, + "loss": 3.4209, + "step": 29500 + }, + { + "epoch": 8.607667210440457, + "grad_norm": 0.34290191531181335, + "learning_rate": 0.0004970014573010784, + "loss": 3.4257, + "step": 29550 + }, + { + "epoch": 8.622232579818224, + "grad_norm": 0.311635822057724, + "learning_rate": 0.00049682658117167, + "loss": 3.4108, + "step": 29600 + }, + { + "epoch": 8.63679794919599, + "grad_norm": 0.3206999599933624, + "learning_rate": 0.0004966517050422616, + "loss": 3.4277, + "step": 29650 + }, + { + "epoch": 8.65136331857376, + "grad_norm": 0.378736674785614, + "learning_rate": 0.0004964768289128533, + "loss": 3.4192, + "step": 29700 + }, + { + "epoch": 8.665928687951526, + "grad_norm": 0.35404956340789795, + "learning_rate": 0.000496301952783445, + "loss": 3.4304, + "step": 29750 + }, + { + "epoch": 8.680494057329295, + "grad_norm": 0.33619192242622375, + "learning_rate": 0.0004961270766540367, + "loss": 3.4157, + "step": 29800 + }, + { + "epoch": 8.695059426707061, + "grad_norm": 0.3306516110897064, + "learning_rate": 0.0004959522005246284, + "loss": 3.4257, + "step": 29850 + }, + { + "epoch": 8.709624796084828, + "grad_norm": 0.314596563577652, + "learning_rate": 0.00049577732439522, + "loss": 3.4326, + "step": 29900 + }, + { + "epoch": 8.724190165462597, + "grad_norm": 0.3481888771057129, + "learning_rate": 0.0004956024482658117, + "loss": 3.4345, + "step": 29950 + }, + { + "epoch": 8.738755534840363, + "grad_norm": 0.33885663747787476, + "learning_rate": 0.0004954275721364034, + "loss": 3.4225, + "step": 30000 + }, + { + "epoch": 8.738755534840363, + "eval_accuracy": 0.3673473754628571, + "eval_loss": 3.574705123901367, + "eval_runtime": 183.0508, + "eval_samples_per_second": 90.931, + "eval_steps_per_second": 5.687, + "step": 30000 + } + ], + "logging_steps": 50, + "max_steps": 171650, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.27061160411136e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}