diff --git "a/last_to_hit_frequency_1001/checkpoint-70000/trainer_state.json" "b/last_to_hit_frequency_1001/checkpoint-70000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last_to_hit_frequency_1001/checkpoint-70000/trainer_state.json" @@ -0,0 +1,10473 @@ +{ + "best_global_step": 65000, + "best_metric": 3.5310399532318115, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_hit_frequency_1001/checkpoint-40000", + "epoch": 20.390351899324166, + "eval_steps": 1000, + "global_step": 70000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01456536937776742, + "grad_norm": 0.9732208251953125, + "learning_rate": 0.000294, + "loss": 8.429, + "step": 50 + }, + { + "epoch": 0.02913073875553484, + "grad_norm": 0.9100438356399536, + "learning_rate": 0.0005939999999999999, + "loss": 6.7354, + "step": 100 + }, + { + "epoch": 0.04369610813330226, + "grad_norm": 0.792998194694519, + "learning_rate": 0.0005998286213931798, + "loss": 6.3577, + "step": 150 + }, + { + "epoch": 0.05826147751106968, + "grad_norm": 0.47094255685806274, + "learning_rate": 0.0005996537452637714, + "loss": 6.1522, + "step": 200 + }, + { + "epoch": 0.0728268468888371, + "grad_norm": 0.5149641633033752, + "learning_rate": 0.0005994788691343632, + "loss": 6.0111, + "step": 250 + }, + { + "epoch": 0.08739221626660452, + "grad_norm": 0.4672093391418457, + "learning_rate": 0.0005993039930049548, + "loss": 5.8874, + "step": 300 + }, + { + "epoch": 0.10195758564437195, + "grad_norm": 0.47426706552505493, + "learning_rate": 0.0005991291168755465, + "loss": 5.7519, + "step": 350 + }, + { + "epoch": 0.11652295502213936, + "grad_norm": 0.447353333234787, + "learning_rate": 0.0005989542407461382, + "loss": 5.6271, + "step": 400 + }, + { + "epoch": 0.13108832439990678, + "grad_norm": 0.48666468262672424, + "learning_rate": 0.0005987793646167297, + "loss": 5.5233, + "step": 450 + }, + { + "epoch": 0.1456536937776742, + "grad_norm": 0.4567582607269287, + "learning_rate": 0.0005986044884873214, + "loss": 5.4158, + "step": 500 + }, + { + "epoch": 0.16021906315544163, + "grad_norm": 0.42497554421424866, + "learning_rate": 0.0005984296123579131, + "loss": 5.3416, + "step": 550 + }, + { + "epoch": 0.17478443253320905, + "grad_norm": 0.5244524478912354, + "learning_rate": 0.0005982547362285047, + "loss": 5.2606, + "step": 600 + }, + { + "epoch": 0.18934980191097647, + "grad_norm": 0.43386486172676086, + "learning_rate": 0.0005980798600990964, + "loss": 5.2099, + "step": 650 + }, + { + "epoch": 0.2039151712887439, + "grad_norm": 0.4284716248512268, + "learning_rate": 0.0005979049839696881, + "loss": 5.141, + "step": 700 + }, + { + "epoch": 0.2184805406665113, + "grad_norm": 0.7061876058578491, + "learning_rate": 0.0005977301078402798, + "loss": 5.0658, + "step": 750 + }, + { + "epoch": 0.23304591004427871, + "grad_norm": 0.38007256388664246, + "learning_rate": 0.0005975552317108715, + "loss": 5.0179, + "step": 800 + }, + { + "epoch": 0.24761127942204614, + "grad_norm": 0.4920196533203125, + "learning_rate": 0.0005973803555814631, + "loss": 4.9658, + "step": 850 + }, + { + "epoch": 0.26217664879981356, + "grad_norm": 0.41616320610046387, + "learning_rate": 0.0005972054794520547, + "loss": 4.9388, + "step": 900 + }, + { + "epoch": 0.276742018177581, + "grad_norm": 0.45925629138946533, + "learning_rate": 0.0005970306033226464, + "loss": 4.8788, + "step": 950 + }, + { + "epoch": 0.2913073875553484, + "grad_norm": 0.43381017446517944, + "learning_rate": 0.0005968557271932381, + "loss": 4.8292, + "step": 1000 + }, + { + "epoch": 0.2913073875553484, + "eval_accuracy": 0.25263935092136414, + "eval_loss": 4.762474536895752, + "eval_runtime": 180.7317, + "eval_samples_per_second": 92.098, + "eval_steps_per_second": 5.76, + "step": 1000 + }, + { + "epoch": 0.30587275693311583, + "grad_norm": 0.6765235662460327, + "learning_rate": 0.0005966808510638297, + "loss": 4.7894, + "step": 1050 + }, + { + "epoch": 0.32043812631088325, + "grad_norm": 0.4921708405017853, + "learning_rate": 0.0005965059749344214, + "loss": 4.7573, + "step": 1100 + }, + { + "epoch": 0.3350034956886507, + "grad_norm": 0.42672333121299744, + "learning_rate": 0.0005963310988050131, + "loss": 4.7064, + "step": 1150 + }, + { + "epoch": 0.3495688650664181, + "grad_norm": 0.44009578227996826, + "learning_rate": 0.0005961562226756047, + "loss": 4.6818, + "step": 1200 + }, + { + "epoch": 0.3641342344441855, + "grad_norm": 0.45628976821899414, + "learning_rate": 0.0005959813465461965, + "loss": 4.64, + "step": 1250 + }, + { + "epoch": 0.37869960382195295, + "grad_norm": 0.4491446912288666, + "learning_rate": 0.000595806470416788, + "loss": 4.6006, + "step": 1300 + }, + { + "epoch": 0.39326497319972037, + "grad_norm": 0.4491466283798218, + "learning_rate": 0.0005956315942873797, + "loss": 4.5835, + "step": 1350 + }, + { + "epoch": 0.4078303425774878, + "grad_norm": 0.4405411183834076, + "learning_rate": 0.0005954567181579714, + "loss": 4.5493, + "step": 1400 + }, + { + "epoch": 0.42239571195525516, + "grad_norm": 0.45015254616737366, + "learning_rate": 0.000595281842028563, + "loss": 4.5427, + "step": 1450 + }, + { + "epoch": 0.4369610813330226, + "grad_norm": 0.4123348593711853, + "learning_rate": 0.0005951069658991547, + "loss": 4.5047, + "step": 1500 + }, + { + "epoch": 0.45152645071079, + "grad_norm": 0.4243234395980835, + "learning_rate": 0.0005949320897697464, + "loss": 4.4801, + "step": 1550 + }, + { + "epoch": 0.46609182008855743, + "grad_norm": 0.4439113140106201, + "learning_rate": 0.0005947572136403381, + "loss": 4.476, + "step": 1600 + }, + { + "epoch": 0.48065718946632485, + "grad_norm": 0.38935843110084534, + "learning_rate": 0.0005945823375109297, + "loss": 4.4492, + "step": 1650 + }, + { + "epoch": 0.4952225588440923, + "grad_norm": 0.4441373348236084, + "learning_rate": 0.0005944074613815215, + "loss": 4.4366, + "step": 1700 + }, + { + "epoch": 0.5097879282218597, + "grad_norm": 0.4098525643348694, + "learning_rate": 0.000594232585252113, + "loss": 4.4203, + "step": 1750 + }, + { + "epoch": 0.5243532975996271, + "grad_norm": 0.5222740173339844, + "learning_rate": 0.0005940577091227047, + "loss": 4.3938, + "step": 1800 + }, + { + "epoch": 0.5389186669773945, + "grad_norm": 0.42251360416412354, + "learning_rate": 0.0005938828329932964, + "loss": 4.3761, + "step": 1850 + }, + { + "epoch": 0.553484036355162, + "grad_norm": 0.4452977180480957, + "learning_rate": 0.000593707956863888, + "loss": 4.3494, + "step": 1900 + }, + { + "epoch": 0.5680494057329294, + "grad_norm": 0.3798687160015106, + "learning_rate": 0.0005935330807344797, + "loss": 4.347, + "step": 1950 + }, + { + "epoch": 0.5826147751106968, + "grad_norm": 0.39384129643440247, + "learning_rate": 0.0005933582046050714, + "loss": 4.343, + "step": 2000 + }, + { + "epoch": 0.5826147751106968, + "eval_accuracy": 0.30013608689339194, + "eval_loss": 4.277587890625, + "eval_runtime": 180.1511, + "eval_samples_per_second": 92.395, + "eval_steps_per_second": 5.778, + "step": 2000 + }, + { + "epoch": 0.5971801444884642, + "grad_norm": 0.3870452344417572, + "learning_rate": 0.000593183328475663, + "loss": 4.3225, + "step": 2050 + }, + { + "epoch": 0.6117455138662317, + "grad_norm": 0.3716549277305603, + "learning_rate": 0.0005930084523462546, + "loss": 4.3048, + "step": 2100 + }, + { + "epoch": 0.6263108832439991, + "grad_norm": 0.3902703523635864, + "learning_rate": 0.0005928335762168463, + "loss": 4.2988, + "step": 2150 + }, + { + "epoch": 0.6408762526217665, + "grad_norm": 0.4183005690574646, + "learning_rate": 0.000592658700087438, + "loss": 4.2755, + "step": 2200 + }, + { + "epoch": 0.6554416219995339, + "grad_norm": 0.40300023555755615, + "learning_rate": 0.0005924838239580297, + "loss": 4.2868, + "step": 2250 + }, + { + "epoch": 0.6700069913773014, + "grad_norm": 0.3854156732559204, + "learning_rate": 0.0005923089478286214, + "loss": 4.2729, + "step": 2300 + }, + { + "epoch": 0.6845723607550688, + "grad_norm": 0.38318294286727905, + "learning_rate": 0.000592134071699213, + "loss": 4.2674, + "step": 2350 + }, + { + "epoch": 0.6991377301328362, + "grad_norm": 0.3910939395427704, + "learning_rate": 0.0005919591955698047, + "loss": 4.2482, + "step": 2400 + }, + { + "epoch": 0.7137030995106036, + "grad_norm": 0.380487322807312, + "learning_rate": 0.0005917843194403964, + "loss": 4.2344, + "step": 2450 + }, + { + "epoch": 0.728268468888371, + "grad_norm": 0.3697279095649719, + "learning_rate": 0.000591609443310988, + "loss": 4.231, + "step": 2500 + }, + { + "epoch": 0.7428338382661385, + "grad_norm": 0.3669740557670593, + "learning_rate": 0.0005914345671815796, + "loss": 4.221, + "step": 2550 + }, + { + "epoch": 0.7573992076439059, + "grad_norm": 0.4364064931869507, + "learning_rate": 0.0005912596910521713, + "loss": 4.2146, + "step": 2600 + }, + { + "epoch": 0.7719645770216733, + "grad_norm": 0.36517658829689026, + "learning_rate": 0.0005910848149227629, + "loss": 4.1886, + "step": 2650 + }, + { + "epoch": 0.7865299463994407, + "grad_norm": 0.3450356423854828, + "learning_rate": 0.0005909099387933547, + "loss": 4.1916, + "step": 2700 + }, + { + "epoch": 0.8010953157772082, + "grad_norm": 0.38316479325294495, + "learning_rate": 0.0005907350626639463, + "loss": 4.1823, + "step": 2750 + }, + { + "epoch": 0.8156606851549756, + "grad_norm": 0.39378035068511963, + "learning_rate": 0.000590560186534538, + "loss": 4.1847, + "step": 2800 + }, + { + "epoch": 0.8302260545327429, + "grad_norm": 0.36056962609291077, + "learning_rate": 0.0005903853104051297, + "loss": 4.1647, + "step": 2850 + }, + { + "epoch": 0.8447914239105103, + "grad_norm": 0.3993781507015228, + "learning_rate": 0.0005902104342757214, + "loss": 4.1662, + "step": 2900 + }, + { + "epoch": 0.8593567932882777, + "grad_norm": 0.3729277551174164, + "learning_rate": 0.000590035558146313, + "loss": 4.1351, + "step": 2950 + }, + { + "epoch": 0.8739221626660452, + "grad_norm": 0.3638637065887451, + "learning_rate": 0.0005898606820169046, + "loss": 4.1519, + "step": 3000 + }, + { + "epoch": 0.8739221626660452, + "eval_accuracy": 0.31632108041824236, + "eval_loss": 4.091220855712891, + "eval_runtime": 180.2511, + "eval_samples_per_second": 92.343, + "eval_steps_per_second": 5.775, + "step": 3000 + }, + { + "epoch": 0.8884875320438126, + "grad_norm": 0.3821454346179962, + "learning_rate": 0.0005896858058874963, + "loss": 4.1453, + "step": 3050 + }, + { + "epoch": 0.90305290142158, + "grad_norm": 0.35036420822143555, + "learning_rate": 0.0005895109297580879, + "loss": 4.1283, + "step": 3100 + }, + { + "epoch": 0.9176182707993474, + "grad_norm": 0.3927556276321411, + "learning_rate": 0.0005893360536286797, + "loss": 4.1126, + "step": 3150 + }, + { + "epoch": 0.9321836401771149, + "grad_norm": 0.36200571060180664, + "learning_rate": 0.0005891611774992713, + "loss": 4.1101, + "step": 3200 + }, + { + "epoch": 0.9467490095548823, + "grad_norm": 0.39992815256118774, + "learning_rate": 0.000588986301369863, + "loss": 4.095, + "step": 3250 + }, + { + "epoch": 0.9613143789326497, + "grad_norm": 0.3586205840110779, + "learning_rate": 0.0005888114252404547, + "loss": 4.1105, + "step": 3300 + }, + { + "epoch": 0.9758797483104171, + "grad_norm": 0.35303160548210144, + "learning_rate": 0.0005886365491110463, + "loss": 4.0899, + "step": 3350 + }, + { + "epoch": 0.9904451176881846, + "grad_norm": 0.401599258184433, + "learning_rate": 0.000588461672981638, + "loss": 4.0786, + "step": 3400 + }, + { + "epoch": 1.0049522255884409, + "grad_norm": 0.3272145986557007, + "learning_rate": 0.0005882867968522296, + "loss": 4.0585, + "step": 3450 + }, + { + "epoch": 1.0195175949662083, + "grad_norm": 0.38398465514183044, + "learning_rate": 0.0005881119207228212, + "loss": 4.0137, + "step": 3500 + }, + { + "epoch": 1.0340829643439757, + "grad_norm": 0.35750123858451843, + "learning_rate": 0.0005879370445934129, + "loss": 4.0097, + "step": 3550 + }, + { + "epoch": 1.0486483337217432, + "grad_norm": 0.3321663737297058, + "learning_rate": 0.0005877621684640046, + "loss": 4.005, + "step": 3600 + }, + { + "epoch": 1.0632137030995106, + "grad_norm": 0.3508462607860565, + "learning_rate": 0.0005875872923345963, + "loss": 4.0109, + "step": 3650 + }, + { + "epoch": 1.077779072477278, + "grad_norm": 0.3782167434692383, + "learning_rate": 0.000587412416205188, + "loss": 3.998, + "step": 3700 + }, + { + "epoch": 1.0923444418550454, + "grad_norm": 0.33389317989349365, + "learning_rate": 0.0005872375400757797, + "loss": 4.0072, + "step": 3750 + }, + { + "epoch": 1.1069098112328128, + "grad_norm": 0.33711186051368713, + "learning_rate": 0.0005870626639463713, + "loss": 3.9995, + "step": 3800 + }, + { + "epoch": 1.1214751806105803, + "grad_norm": 0.38124826550483704, + "learning_rate": 0.0005868877878169629, + "loss": 3.9936, + "step": 3850 + }, + { + "epoch": 1.1360405499883477, + "grad_norm": 0.37443825602531433, + "learning_rate": 0.0005867129116875546, + "loss": 3.9935, + "step": 3900 + }, + { + "epoch": 1.1506059193661151, + "grad_norm": 0.34981346130371094, + "learning_rate": 0.0005865380355581462, + "loss": 4.0015, + "step": 3950 + }, + { + "epoch": 1.1651712887438825, + "grad_norm": 0.3286009132862091, + "learning_rate": 0.0005863631594287379, + "loss": 3.9832, + "step": 4000 + }, + { + "epoch": 1.1651712887438825, + "eval_accuracy": 0.32566798677811487, + "eval_loss": 3.987405300140381, + "eval_runtime": 180.1237, + "eval_samples_per_second": 92.409, + "eval_steps_per_second": 5.779, + "step": 4000 + }, + { + "epoch": 1.17973665812165, + "grad_norm": 0.3387763798236847, + "learning_rate": 0.0005861882832993296, + "loss": 3.9905, + "step": 4050 + }, + { + "epoch": 1.1943020274994174, + "grad_norm": 0.32717615365982056, + "learning_rate": 0.0005860134071699212, + "loss": 3.9715, + "step": 4100 + }, + { + "epoch": 1.2088673968771848, + "grad_norm": 0.3719293177127838, + "learning_rate": 0.000585838531040513, + "loss": 3.9672, + "step": 4150 + }, + { + "epoch": 1.2234327662549522, + "grad_norm": 0.3480449318885803, + "learning_rate": 0.0005856636549111046, + "loss": 3.9802, + "step": 4200 + }, + { + "epoch": 1.2379981356327197, + "grad_norm": 0.35181939601898193, + "learning_rate": 0.0005854887787816963, + "loss": 3.9749, + "step": 4250 + }, + { + "epoch": 1.252563505010487, + "grad_norm": 0.3547437787055969, + "learning_rate": 0.0005853139026522879, + "loss": 3.968, + "step": 4300 + }, + { + "epoch": 1.2671288743882545, + "grad_norm": 0.34064048528671265, + "learning_rate": 0.0005851390265228796, + "loss": 3.9684, + "step": 4350 + }, + { + "epoch": 1.281694243766022, + "grad_norm": 0.34372982382774353, + "learning_rate": 0.0005849641503934712, + "loss": 3.9613, + "step": 4400 + }, + { + "epoch": 1.2962596131437893, + "grad_norm": 0.36687299609184265, + "learning_rate": 0.0005847892742640629, + "loss": 3.9657, + "step": 4450 + }, + { + "epoch": 1.3108249825215568, + "grad_norm": 0.3261411786079407, + "learning_rate": 0.0005846143981346546, + "loss": 3.9638, + "step": 4500 + }, + { + "epoch": 1.3253903518993242, + "grad_norm": 0.34175243973731995, + "learning_rate": 0.0005844395220052462, + "loss": 3.9506, + "step": 4550 + }, + { + "epoch": 1.3399557212770916, + "grad_norm": 0.3380016088485718, + "learning_rate": 0.000584264645875838, + "loss": 3.9473, + "step": 4600 + }, + { + "epoch": 1.354521090654859, + "grad_norm": 0.31564953923225403, + "learning_rate": 0.0005840897697464296, + "loss": 3.9512, + "step": 4650 + }, + { + "epoch": 1.3690864600326265, + "grad_norm": 0.3103531002998352, + "learning_rate": 0.0005839148936170212, + "loss": 3.942, + "step": 4700 + }, + { + "epoch": 1.3836518294103939, + "grad_norm": 0.3472263216972351, + "learning_rate": 0.0005837400174876129, + "loss": 3.9413, + "step": 4750 + }, + { + "epoch": 1.3982171987881613, + "grad_norm": 0.3497699201107025, + "learning_rate": 0.0005835651413582045, + "loss": 3.9362, + "step": 4800 + }, + { + "epoch": 1.4127825681659287, + "grad_norm": 0.347032368183136, + "learning_rate": 0.0005833902652287962, + "loss": 3.935, + "step": 4850 + }, + { + "epoch": 1.4273479375436962, + "grad_norm": 0.33469849824905396, + "learning_rate": 0.0005832153890993879, + "loss": 3.9344, + "step": 4900 + }, + { + "epoch": 1.4419133069214636, + "grad_norm": 0.3481560945510864, + "learning_rate": 0.0005830405129699796, + "loss": 3.9247, + "step": 4950 + }, + { + "epoch": 1.456478676299231, + "grad_norm": 0.3438446521759033, + "learning_rate": 0.0005828656368405712, + "loss": 3.9231, + "step": 5000 + }, + { + "epoch": 1.456478676299231, + "eval_accuracy": 0.33212456036291405, + "eval_loss": 3.913280487060547, + "eval_runtime": 180.0039, + "eval_samples_per_second": 92.47, + "eval_steps_per_second": 5.783, + "step": 5000 + }, + { + "epoch": 1.4710440456769984, + "grad_norm": 0.3285478949546814, + "learning_rate": 0.0005826907607111629, + "loss": 3.9283, + "step": 5050 + }, + { + "epoch": 1.4856094150547658, + "grad_norm": 0.3346214294433594, + "learning_rate": 0.0005825158845817546, + "loss": 3.9257, + "step": 5100 + }, + { + "epoch": 1.500174784432533, + "grad_norm": 0.34109315276145935, + "learning_rate": 0.0005823410084523462, + "loss": 3.9206, + "step": 5150 + }, + { + "epoch": 1.5147401538103007, + "grad_norm": 0.32906198501586914, + "learning_rate": 0.0005821661323229379, + "loss": 3.9209, + "step": 5200 + }, + { + "epoch": 1.529305523188068, + "grad_norm": 0.3165450394153595, + "learning_rate": 0.0005819912561935295, + "loss": 3.9042, + "step": 5250 + }, + { + "epoch": 1.5438708925658355, + "grad_norm": 0.3268945515155792, + "learning_rate": 0.0005818163800641212, + "loss": 3.9095, + "step": 5300 + }, + { + "epoch": 1.5584362619436027, + "grad_norm": 0.34344053268432617, + "learning_rate": 0.0005816415039347129, + "loss": 3.9064, + "step": 5350 + }, + { + "epoch": 1.5730016313213704, + "grad_norm": 0.33473026752471924, + "learning_rate": 0.0005814666278053045, + "loss": 3.9084, + "step": 5400 + }, + { + "epoch": 1.5875670006991376, + "grad_norm": 0.3239467442035675, + "learning_rate": 0.0005812917516758962, + "loss": 3.9066, + "step": 5450 + }, + { + "epoch": 1.6021323700769052, + "grad_norm": 0.32284918427467346, + "learning_rate": 0.0005811168755464879, + "loss": 3.8854, + "step": 5500 + }, + { + "epoch": 1.6166977394546724, + "grad_norm": 0.34899482131004333, + "learning_rate": 0.0005809419994170794, + "loss": 3.8977, + "step": 5550 + }, + { + "epoch": 1.63126310883244, + "grad_norm": 0.3288831114768982, + "learning_rate": 0.0005807671232876712, + "loss": 3.9047, + "step": 5600 + }, + { + "epoch": 1.6458284782102073, + "grad_norm": 0.32496726512908936, + "learning_rate": 0.0005805922471582628, + "loss": 3.8927, + "step": 5650 + }, + { + "epoch": 1.660393847587975, + "grad_norm": 0.3319661021232605, + "learning_rate": 0.0005804173710288545, + "loss": 3.8783, + "step": 5700 + }, + { + "epoch": 1.6749592169657421, + "grad_norm": 0.32201242446899414, + "learning_rate": 0.0005802424948994462, + "loss": 3.8909, + "step": 5750 + }, + { + "epoch": 1.6895245863435098, + "grad_norm": 0.3138631284236908, + "learning_rate": 0.0005800676187700379, + "loss": 3.8901, + "step": 5800 + }, + { + "epoch": 1.704089955721277, + "grad_norm": 0.3181164264678955, + "learning_rate": 0.0005798927426406295, + "loss": 3.8813, + "step": 5850 + }, + { + "epoch": 1.7186553250990446, + "grad_norm": 0.33025944232940674, + "learning_rate": 0.0005797178665112212, + "loss": 3.873, + "step": 5900 + }, + { + "epoch": 1.7332206944768118, + "grad_norm": 0.3095895051956177, + "learning_rate": 0.0005795429903818129, + "loss": 3.884, + "step": 5950 + }, + { + "epoch": 1.7477860638545795, + "grad_norm": 0.3384963572025299, + "learning_rate": 0.0005793681142524044, + "loss": 3.8709, + "step": 6000 + }, + { + "epoch": 1.7477860638545795, + "eval_accuracy": 0.3377239334814319, + "eval_loss": 3.851867914199829, + "eval_runtime": 180.0309, + "eval_samples_per_second": 92.456, + "eval_steps_per_second": 5.782, + "step": 6000 + }, + { + "epoch": 1.7623514332323467, + "grad_norm": 0.33476731181144714, + "learning_rate": 0.0005791932381229961, + "loss": 3.8649, + "step": 6050 + }, + { + "epoch": 1.7769168026101143, + "grad_norm": 0.318613737821579, + "learning_rate": 0.0005790183619935878, + "loss": 3.8848, + "step": 6100 + }, + { + "epoch": 1.7914821719878815, + "grad_norm": 0.31928250193595886, + "learning_rate": 0.0005788434858641795, + "loss": 3.8653, + "step": 6150 + }, + { + "epoch": 1.8060475413656492, + "grad_norm": 0.32100650668144226, + "learning_rate": 0.0005786686097347712, + "loss": 3.8764, + "step": 6200 + }, + { + "epoch": 1.8206129107434164, + "grad_norm": 0.3220023214817047, + "learning_rate": 0.0005784937336053628, + "loss": 3.8634, + "step": 6250 + }, + { + "epoch": 1.835178280121184, + "grad_norm": 0.31344911456108093, + "learning_rate": 0.0005783188574759545, + "loss": 3.8532, + "step": 6300 + }, + { + "epoch": 1.8497436494989512, + "grad_norm": 0.32198983430862427, + "learning_rate": 0.0005781439813465462, + "loss": 3.8574, + "step": 6350 + }, + { + "epoch": 1.8643090188767188, + "grad_norm": 0.3163566589355469, + "learning_rate": 0.0005779691052171379, + "loss": 3.8587, + "step": 6400 + }, + { + "epoch": 1.878874388254486, + "grad_norm": 0.32121846079826355, + "learning_rate": 0.0005777942290877294, + "loss": 3.8634, + "step": 6450 + }, + { + "epoch": 1.8934397576322537, + "grad_norm": 0.33083975315093994, + "learning_rate": 0.0005776193529583211, + "loss": 3.8503, + "step": 6500 + }, + { + "epoch": 1.908005127010021, + "grad_norm": 0.30773088335990906, + "learning_rate": 0.0005774444768289128, + "loss": 3.8501, + "step": 6550 + }, + { + "epoch": 1.9225704963877885, + "grad_norm": 0.39235207438468933, + "learning_rate": 0.0005772696006995045, + "loss": 3.8508, + "step": 6600 + }, + { + "epoch": 1.9371358657655557, + "grad_norm": 0.32834747433662415, + "learning_rate": 0.0005770947245700962, + "loss": 3.8371, + "step": 6650 + }, + { + "epoch": 1.9517012351433234, + "grad_norm": 0.3106667995452881, + "learning_rate": 0.0005769198484406878, + "loss": 3.868, + "step": 6700 + }, + { + "epoch": 1.9662666045210906, + "grad_norm": 0.31455743312835693, + "learning_rate": 0.0005767449723112795, + "loss": 3.8564, + "step": 6750 + }, + { + "epoch": 1.9808319738988582, + "grad_norm": 0.33098679780960083, + "learning_rate": 0.0005765700961818712, + "loss": 3.8458, + "step": 6800 + }, + { + "epoch": 1.9953973432766254, + "grad_norm": 0.3470974266529083, + "learning_rate": 0.0005763952200524627, + "loss": 3.8377, + "step": 6850 + }, + { + "epoch": 2.0099044511768818, + "grad_norm": 0.32027867436408997, + "learning_rate": 0.0005762203439230544, + "loss": 3.7672, + "step": 6900 + }, + { + "epoch": 2.0244698205546494, + "grad_norm": 0.33726635575294495, + "learning_rate": 0.0005760454677936461, + "loss": 3.7396, + "step": 6950 + }, + { + "epoch": 2.0390351899324166, + "grad_norm": 0.3217654526233673, + "learning_rate": 0.0005758705916642378, + "loss": 3.753, + "step": 7000 + }, + { + "epoch": 2.0390351899324166, + "eval_accuracy": 0.34205719882030594, + "eval_loss": 3.81199049949646, + "eval_runtime": 180.0121, + "eval_samples_per_second": 92.466, + "eval_steps_per_second": 5.783, + "step": 7000 + }, + { + "epoch": 2.0536005593101843, + "grad_norm": 0.33013007044792175, + "learning_rate": 0.0005756957155348294, + "loss": 3.7468, + "step": 7050 + }, + { + "epoch": 2.0681659286879515, + "grad_norm": 0.3293043375015259, + "learning_rate": 0.0005755208394054211, + "loss": 3.7574, + "step": 7100 + }, + { + "epoch": 2.082731298065719, + "grad_norm": 0.32888132333755493, + "learning_rate": 0.0005753459632760128, + "loss": 3.7502, + "step": 7150 + }, + { + "epoch": 2.0972966674434863, + "grad_norm": 0.32330965995788574, + "learning_rate": 0.0005751710871466045, + "loss": 3.7581, + "step": 7200 + }, + { + "epoch": 2.111862036821254, + "grad_norm": 0.3227783739566803, + "learning_rate": 0.0005749962110171962, + "loss": 3.7375, + "step": 7250 + }, + { + "epoch": 2.126427406199021, + "grad_norm": 0.3198298215866089, + "learning_rate": 0.0005748213348877877, + "loss": 3.7408, + "step": 7300 + }, + { + "epoch": 2.140992775576789, + "grad_norm": 0.31929996609687805, + "learning_rate": 0.0005746464587583794, + "loss": 3.7467, + "step": 7350 + }, + { + "epoch": 2.155558144954556, + "grad_norm": 0.3041769564151764, + "learning_rate": 0.0005744715826289711, + "loss": 3.7418, + "step": 7400 + }, + { + "epoch": 2.1701235143323236, + "grad_norm": 0.32159364223480225, + "learning_rate": 0.0005742967064995627, + "loss": 3.7448, + "step": 7450 + }, + { + "epoch": 2.184688883710091, + "grad_norm": 0.32719555497169495, + "learning_rate": 0.0005741218303701544, + "loss": 3.7598, + "step": 7500 + }, + { + "epoch": 2.1992542530878585, + "grad_norm": 0.32142728567123413, + "learning_rate": 0.0005739469542407461, + "loss": 3.7616, + "step": 7550 + }, + { + "epoch": 2.2138196224656257, + "grad_norm": 0.3248656392097473, + "learning_rate": 0.0005737720781113378, + "loss": 3.7569, + "step": 7600 + }, + { + "epoch": 2.2283849918433933, + "grad_norm": 0.32039493322372437, + "learning_rate": 0.0005735972019819295, + "loss": 3.7451, + "step": 7650 + }, + { + "epoch": 2.2429503612211605, + "grad_norm": 0.31409886479377747, + "learning_rate": 0.000573422325852521, + "loss": 3.7631, + "step": 7700 + }, + { + "epoch": 2.257515730598928, + "grad_norm": 0.32058414816856384, + "learning_rate": 0.0005732474497231127, + "loss": 3.7544, + "step": 7750 + }, + { + "epoch": 2.2720810999766954, + "grad_norm": 0.3249547481536865, + "learning_rate": 0.0005730725735937044, + "loss": 3.7484, + "step": 7800 + }, + { + "epoch": 2.286646469354463, + "grad_norm": 0.33606165647506714, + "learning_rate": 0.0005728976974642961, + "loss": 3.7481, + "step": 7850 + }, + { + "epoch": 2.3012118387322302, + "grad_norm": 0.3388115465641022, + "learning_rate": 0.0005727228213348877, + "loss": 3.7435, + "step": 7900 + }, + { + "epoch": 2.3157772081099974, + "grad_norm": 0.3214958906173706, + "learning_rate": 0.0005725479452054794, + "loss": 3.7479, + "step": 7950 + }, + { + "epoch": 2.330342577487765, + "grad_norm": 0.32727494835853577, + "learning_rate": 0.0005723730690760711, + "loss": 3.7469, + "step": 8000 + }, + { + "epoch": 2.330342577487765, + "eval_accuracy": 0.3452295812344698, + "eval_loss": 3.780815839767456, + "eval_runtime": 180.1741, + "eval_samples_per_second": 92.383, + "eval_steps_per_second": 5.778, + "step": 8000 + }, + { + "epoch": 2.3449079468655327, + "grad_norm": 0.31999602913856506, + "learning_rate": 0.0005721981929466627, + "loss": 3.7439, + "step": 8050 + }, + { + "epoch": 2.3594733162433, + "grad_norm": 0.3452329635620117, + "learning_rate": 0.0005720233168172545, + "loss": 3.7511, + "step": 8100 + }, + { + "epoch": 2.374038685621067, + "grad_norm": 0.3058473765850067, + "learning_rate": 0.000571848440687846, + "loss": 3.7567, + "step": 8150 + }, + { + "epoch": 2.3886040549988348, + "grad_norm": 0.338764488697052, + "learning_rate": 0.0005716735645584377, + "loss": 3.7437, + "step": 8200 + }, + { + "epoch": 2.4031694243766024, + "grad_norm": 0.3135814964771271, + "learning_rate": 0.0005714986884290294, + "loss": 3.7574, + "step": 8250 + }, + { + "epoch": 2.4177347937543696, + "grad_norm": 0.32909590005874634, + "learning_rate": 0.000571323812299621, + "loss": 3.7459, + "step": 8300 + }, + { + "epoch": 2.432300163132137, + "grad_norm": 0.3048388361930847, + "learning_rate": 0.0005711489361702127, + "loss": 3.737, + "step": 8350 + }, + { + "epoch": 2.4468655325099045, + "grad_norm": 0.3441195785999298, + "learning_rate": 0.0005709740600408044, + "loss": 3.7551, + "step": 8400 + }, + { + "epoch": 2.461430901887672, + "grad_norm": 0.32695695757865906, + "learning_rate": 0.0005707991839113961, + "loss": 3.7378, + "step": 8450 + }, + { + "epoch": 2.4759962712654393, + "grad_norm": 0.3172362446784973, + "learning_rate": 0.0005706243077819877, + "loss": 3.7473, + "step": 8500 + }, + { + "epoch": 2.4905616406432065, + "grad_norm": 0.3251330256462097, + "learning_rate": 0.0005704494316525793, + "loss": 3.7485, + "step": 8550 + }, + { + "epoch": 2.505127010020974, + "grad_norm": 0.3160916268825531, + "learning_rate": 0.000570274555523171, + "loss": 3.7484, + "step": 8600 + }, + { + "epoch": 2.519692379398742, + "grad_norm": 0.3058483898639679, + "learning_rate": 0.0005700996793937627, + "loss": 3.7459, + "step": 8650 + }, + { + "epoch": 2.534257748776509, + "grad_norm": 0.3010760247707367, + "learning_rate": 0.0005699248032643544, + "loss": 3.7445, + "step": 8700 + }, + { + "epoch": 2.548823118154276, + "grad_norm": 0.31458958983421326, + "learning_rate": 0.000569749927134946, + "loss": 3.7402, + "step": 8750 + }, + { + "epoch": 2.563388487532044, + "grad_norm": 0.3097609877586365, + "learning_rate": 0.0005695750510055377, + "loss": 3.7328, + "step": 8800 + }, + { + "epoch": 2.5779538569098115, + "grad_norm": 0.3084356188774109, + "learning_rate": 0.0005694001748761294, + "loss": 3.7358, + "step": 8850 + }, + { + "epoch": 2.5925192262875787, + "grad_norm": 0.3240303695201874, + "learning_rate": 0.000569225298746721, + "loss": 3.7288, + "step": 8900 + }, + { + "epoch": 2.607084595665346, + "grad_norm": 0.30645912885665894, + "learning_rate": 0.0005690504226173127, + "loss": 3.7361, + "step": 8950 + }, + { + "epoch": 2.6216499650431135, + "grad_norm": 0.3031748831272125, + "learning_rate": 0.0005688755464879043, + "loss": 3.7452, + "step": 9000 + }, + { + "epoch": 2.6216499650431135, + "eval_accuracy": 0.34781717210847685, + "eval_loss": 3.7518084049224854, + "eval_runtime": 180.2025, + "eval_samples_per_second": 92.368, + "eval_steps_per_second": 5.777, + "step": 9000 + }, + { + "epoch": 2.636215334420881, + "grad_norm": 0.30560678243637085, + "learning_rate": 0.000568700670358496, + "loss": 3.7391, + "step": 9050 + }, + { + "epoch": 2.6507807037986484, + "grad_norm": 0.33370503783226013, + "learning_rate": 0.0005685257942290877, + "loss": 3.7257, + "step": 9100 + }, + { + "epoch": 2.6653460731764156, + "grad_norm": 0.32426562905311584, + "learning_rate": 0.0005683509180996793, + "loss": 3.7313, + "step": 9150 + }, + { + "epoch": 2.6799114425541832, + "grad_norm": 0.31184735894203186, + "learning_rate": 0.000568176041970271, + "loss": 3.7244, + "step": 9200 + }, + { + "epoch": 2.6944768119319504, + "grad_norm": 0.30784913897514343, + "learning_rate": 0.0005680011658408627, + "loss": 3.7268, + "step": 9250 + }, + { + "epoch": 2.709042181309718, + "grad_norm": 0.2977924942970276, + "learning_rate": 0.0005678262897114544, + "loss": 3.7179, + "step": 9300 + }, + { + "epoch": 2.7236075506874853, + "grad_norm": 0.3281992971897125, + "learning_rate": 0.000567651413582046, + "loss": 3.7368, + "step": 9350 + }, + { + "epoch": 2.738172920065253, + "grad_norm": 0.32184773683547974, + "learning_rate": 0.0005674765374526377, + "loss": 3.7217, + "step": 9400 + }, + { + "epoch": 2.75273828944302, + "grad_norm": 0.3028261363506317, + "learning_rate": 0.0005673016613232293, + "loss": 3.7288, + "step": 9450 + }, + { + "epoch": 2.7673036588207878, + "grad_norm": 0.36276352405548096, + "learning_rate": 0.0005671267851938209, + "loss": 3.7211, + "step": 9500 + }, + { + "epoch": 2.781869028198555, + "grad_norm": 0.3130638301372528, + "learning_rate": 0.0005669519090644127, + "loss": 3.7186, + "step": 9550 + }, + { + "epoch": 2.7964343975763226, + "grad_norm": 0.31529319286346436, + "learning_rate": 0.0005667770329350043, + "loss": 3.7311, + "step": 9600 + }, + { + "epoch": 2.81099976695409, + "grad_norm": 0.3072699010372162, + "learning_rate": 0.000566602156805596, + "loss": 3.7297, + "step": 9650 + }, + { + "epoch": 2.8255651363318575, + "grad_norm": 0.3379790186882019, + "learning_rate": 0.0005664272806761877, + "loss": 3.7178, + "step": 9700 + }, + { + "epoch": 2.8401305057096247, + "grad_norm": 0.3173193037509918, + "learning_rate": 0.0005662524045467793, + "loss": 3.7132, + "step": 9750 + }, + { + "epoch": 2.8546958750873923, + "grad_norm": 0.33217954635620117, + "learning_rate": 0.000566077528417371, + "loss": 3.7239, + "step": 9800 + }, + { + "epoch": 2.8692612444651595, + "grad_norm": 0.331225723028183, + "learning_rate": 0.0005659026522879626, + "loss": 3.7334, + "step": 9850 + }, + { + "epoch": 2.883826613842927, + "grad_norm": 0.31492242217063904, + "learning_rate": 0.0005657277761585543, + "loss": 3.7225, + "step": 9900 + }, + { + "epoch": 2.8983919832206944, + "grad_norm": 0.327841192483902, + "learning_rate": 0.0005655529000291459, + "loss": 3.7265, + "step": 9950 + }, + { + "epoch": 2.912957352598462, + "grad_norm": 0.3129492998123169, + "learning_rate": 0.0005653780238997376, + "loss": 3.7285, + "step": 10000 + }, + { + "epoch": 2.912957352598462, + "eval_accuracy": 0.34999950032890115, + "eval_loss": 3.7232048511505127, + "eval_runtime": 179.9799, + "eval_samples_per_second": 92.483, + "eval_steps_per_second": 5.784, + "step": 10000 + }, + { + "epoch": 2.927522721976229, + "grad_norm": 0.3213813006877899, + "learning_rate": 0.0005652031477703293, + "loss": 3.7164, + "step": 10050 + }, + { + "epoch": 2.942088091353997, + "grad_norm": 0.313442200422287, + "learning_rate": 0.000565028271640921, + "loss": 3.7237, + "step": 10100 + }, + { + "epoch": 2.956653460731764, + "grad_norm": 0.31209588050842285, + "learning_rate": 0.0005648533955115127, + "loss": 3.7229, + "step": 10150 + }, + { + "epoch": 2.9712188301095317, + "grad_norm": 0.309455007314682, + "learning_rate": 0.0005646785193821043, + "loss": 3.7254, + "step": 10200 + }, + { + "epoch": 2.985784199487299, + "grad_norm": 0.30592969059944153, + "learning_rate": 0.000564503643252696, + "loss": 3.7176, + "step": 10250 + }, + { + "epoch": 3.0002913073875552, + "grad_norm": 0.32156163454055786, + "learning_rate": 0.0005643287671232876, + "loss": 3.7071, + "step": 10300 + }, + { + "epoch": 3.014856676765323, + "grad_norm": 0.31477323174476624, + "learning_rate": 0.0005641538909938792, + "loss": 3.6161, + "step": 10350 + }, + { + "epoch": 3.02942204614309, + "grad_norm": 0.3122144043445587, + "learning_rate": 0.0005639790148644709, + "loss": 3.6061, + "step": 10400 + }, + { + "epoch": 3.0439874155208577, + "grad_norm": 0.30838510394096375, + "learning_rate": 0.0005638041387350626, + "loss": 3.615, + "step": 10450 + }, + { + "epoch": 3.058552784898625, + "grad_norm": 0.3112572133541107, + "learning_rate": 0.0005636292626056543, + "loss": 3.6097, + "step": 10500 + }, + { + "epoch": 3.0731181542763926, + "grad_norm": 0.31443509459495544, + "learning_rate": 0.000563454386476246, + "loss": 3.6103, + "step": 10550 + }, + { + "epoch": 3.0876835236541598, + "grad_norm": 0.3189432621002197, + "learning_rate": 0.0005632795103468376, + "loss": 3.6205, + "step": 10600 + }, + { + "epoch": 3.1022488930319274, + "grad_norm": 0.3305325210094452, + "learning_rate": 0.0005631046342174293, + "loss": 3.6255, + "step": 10650 + }, + { + "epoch": 3.1168142624096946, + "grad_norm": 0.33095782995224, + "learning_rate": 0.000562929758088021, + "loss": 3.6204, + "step": 10700 + }, + { + "epoch": 3.1313796317874623, + "grad_norm": 0.3245634436607361, + "learning_rate": 0.0005627548819586126, + "loss": 3.6275, + "step": 10750 + }, + { + "epoch": 3.1459450011652295, + "grad_norm": 0.3105281889438629, + "learning_rate": 0.0005625800058292042, + "loss": 3.6391, + "step": 10800 + }, + { + "epoch": 3.160510370542997, + "grad_norm": 0.31318241357803345, + "learning_rate": 0.0005624051296997959, + "loss": 3.6262, + "step": 10850 + }, + { + "epoch": 3.1750757399207643, + "grad_norm": 0.3430389165878296, + "learning_rate": 0.0005622302535703876, + "loss": 3.6341, + "step": 10900 + }, + { + "epoch": 3.189641109298532, + "grad_norm": 0.3307541608810425, + "learning_rate": 0.0005620553774409792, + "loss": 3.6337, + "step": 10950 + }, + { + "epoch": 3.204206478676299, + "grad_norm": 0.3157060444355011, + "learning_rate": 0.000561880501311571, + "loss": 3.6361, + "step": 11000 + }, + { + "epoch": 3.204206478676299, + "eval_accuracy": 0.35203486646142923, + "eval_loss": 3.712493419647217, + "eval_runtime": 180.1969, + "eval_samples_per_second": 92.371, + "eval_steps_per_second": 5.777, + "step": 11000 + }, + { + "epoch": 3.218771848054067, + "grad_norm": 0.31381699442863464, + "learning_rate": 0.0005617056251821626, + "loss": 3.6288, + "step": 11050 + }, + { + "epoch": 3.233337217431834, + "grad_norm": 0.3294530510902405, + "learning_rate": 0.0005615307490527543, + "loss": 3.6322, + "step": 11100 + }, + { + "epoch": 3.2479025868096016, + "grad_norm": 0.34993889927864075, + "learning_rate": 0.000561355872923346, + "loss": 3.6331, + "step": 11150 + }, + { + "epoch": 3.262467956187369, + "grad_norm": 0.31110501289367676, + "learning_rate": 0.0005611809967939375, + "loss": 3.6428, + "step": 11200 + }, + { + "epoch": 3.2770333255651365, + "grad_norm": 0.3065360486507416, + "learning_rate": 0.0005610061206645292, + "loss": 3.6328, + "step": 11250 + }, + { + "epoch": 3.2915986949429037, + "grad_norm": 0.30535629391670227, + "learning_rate": 0.0005608312445351209, + "loss": 3.6385, + "step": 11300 + }, + { + "epoch": 3.3061640643206713, + "grad_norm": 0.31860092282295227, + "learning_rate": 0.0005606563684057126, + "loss": 3.6457, + "step": 11350 + }, + { + "epoch": 3.3207294336984385, + "grad_norm": 0.30683571100234985, + "learning_rate": 0.0005604814922763042, + "loss": 3.6281, + "step": 11400 + }, + { + "epoch": 3.335294803076206, + "grad_norm": 0.34014976024627686, + "learning_rate": 0.0005603066161468959, + "loss": 3.6465, + "step": 11450 + }, + { + "epoch": 3.3498601724539734, + "grad_norm": 0.32539984583854675, + "learning_rate": 0.0005601317400174876, + "loss": 3.6275, + "step": 11500 + }, + { + "epoch": 3.364425541831741, + "grad_norm": 0.32786324620246887, + "learning_rate": 0.0005599568638880793, + "loss": 3.6468, + "step": 11550 + }, + { + "epoch": 3.3789909112095082, + "grad_norm": 0.31274884939193726, + "learning_rate": 0.0005597819877586709, + "loss": 3.6499, + "step": 11600 + }, + { + "epoch": 3.393556280587276, + "grad_norm": 0.32175853848457336, + "learning_rate": 0.0005596071116292625, + "loss": 3.6414, + "step": 11650 + }, + { + "epoch": 3.408121649965043, + "grad_norm": 0.3347005248069763, + "learning_rate": 0.0005594322354998542, + "loss": 3.6366, + "step": 11700 + }, + { + "epoch": 3.4226870193428107, + "grad_norm": 0.3298018276691437, + "learning_rate": 0.0005592573593704459, + "loss": 3.6363, + "step": 11750 + }, + { + "epoch": 3.437252388720578, + "grad_norm": 0.3186033368110657, + "learning_rate": 0.0005590824832410375, + "loss": 3.6394, + "step": 11800 + }, + { + "epoch": 3.4518177580983456, + "grad_norm": 0.317862868309021, + "learning_rate": 0.0005589076071116292, + "loss": 3.6429, + "step": 11850 + }, + { + "epoch": 3.4663831274761128, + "grad_norm": 0.3186003267765045, + "learning_rate": 0.0005587327309822209, + "loss": 3.6498, + "step": 11900 + }, + { + "epoch": 3.4809484968538804, + "grad_norm": 0.3352317810058594, + "learning_rate": 0.0005585578548528126, + "loss": 3.649, + "step": 11950 + }, + { + "epoch": 3.4955138662316476, + "grad_norm": 0.31718602776527405, + "learning_rate": 0.0005583829787234043, + "loss": 3.6428, + "step": 12000 + }, + { + "epoch": 3.4955138662316476, + "eval_accuracy": 0.3535988957856564, + "eval_loss": 3.6945624351501465, + "eval_runtime": 180.0673, + "eval_samples_per_second": 92.438, + "eval_steps_per_second": 5.781, + "step": 12000 + }, + { + "epoch": 3.510079235609415, + "grad_norm": 0.31061556935310364, + "learning_rate": 0.0005582081025939958, + "loss": 3.6393, + "step": 12050 + }, + { + "epoch": 3.5246446049871825, + "grad_norm": 0.30603349208831787, + "learning_rate": 0.0005580332264645875, + "loss": 3.6306, + "step": 12100 + }, + { + "epoch": 3.53920997436495, + "grad_norm": 0.329946368932724, + "learning_rate": 0.0005578583503351792, + "loss": 3.645, + "step": 12150 + }, + { + "epoch": 3.5537753437427173, + "grad_norm": 0.3200969099998474, + "learning_rate": 0.0005576834742057709, + "loss": 3.6239, + "step": 12200 + }, + { + "epoch": 3.5683407131204845, + "grad_norm": 0.34068846702575684, + "learning_rate": 0.0005575085980763625, + "loss": 3.6462, + "step": 12250 + }, + { + "epoch": 3.582906082498252, + "grad_norm": 0.32341474294662476, + "learning_rate": 0.0005573337219469542, + "loss": 3.6488, + "step": 12300 + }, + { + "epoch": 3.59747145187602, + "grad_norm": 0.3050375282764435, + "learning_rate": 0.0005571588458175459, + "loss": 3.6434, + "step": 12350 + }, + { + "epoch": 3.612036821253787, + "grad_norm": 0.32204675674438477, + "learning_rate": 0.0005569839696881374, + "loss": 3.6311, + "step": 12400 + }, + { + "epoch": 3.626602190631554, + "grad_norm": 0.3137733042240143, + "learning_rate": 0.0005568090935587292, + "loss": 3.6475, + "step": 12450 + }, + { + "epoch": 3.641167560009322, + "grad_norm": 0.3006216585636139, + "learning_rate": 0.0005566342174293208, + "loss": 3.6297, + "step": 12500 + }, + { + "epoch": 3.6557329293870895, + "grad_norm": 0.3186657428741455, + "learning_rate": 0.0005564593412999125, + "loss": 3.6436, + "step": 12550 + }, + { + "epoch": 3.6702982987648567, + "grad_norm": 0.316648930311203, + "learning_rate": 0.0005562844651705042, + "loss": 3.6409, + "step": 12600 + }, + { + "epoch": 3.684863668142624, + "grad_norm": 0.3677234649658203, + "learning_rate": 0.0005561095890410958, + "loss": 3.6333, + "step": 12650 + }, + { + "epoch": 3.6994290375203915, + "grad_norm": 0.33473852276802063, + "learning_rate": 0.0005559347129116875, + "loss": 3.6376, + "step": 12700 + }, + { + "epoch": 3.713994406898159, + "grad_norm": 0.3314206600189209, + "learning_rate": 0.0005557598367822792, + "loss": 3.6366, + "step": 12750 + }, + { + "epoch": 3.7285597762759264, + "grad_norm": 0.3139813244342804, + "learning_rate": 0.0005555849606528709, + "loss": 3.6416, + "step": 12800 + }, + { + "epoch": 3.7431251456536936, + "grad_norm": 0.3106994032859802, + "learning_rate": 0.0005554100845234624, + "loss": 3.641, + "step": 12850 + }, + { + "epoch": 3.7576905150314612, + "grad_norm": 0.3093438446521759, + "learning_rate": 0.0005552352083940541, + "loss": 3.6312, + "step": 12900 + }, + { + "epoch": 3.772255884409229, + "grad_norm": 0.30269870162010193, + "learning_rate": 0.0005550603322646458, + "loss": 3.6342, + "step": 12950 + }, + { + "epoch": 3.786821253786996, + "grad_norm": 0.34101831912994385, + "learning_rate": 0.0005548854561352375, + "loss": 3.6387, + "step": 13000 + }, + { + "epoch": 3.786821253786996, + "eval_accuracy": 0.35501196565319654, + "eval_loss": 3.6745212078094482, + "eval_runtime": 180.0817, + "eval_samples_per_second": 92.43, + "eval_steps_per_second": 5.781, + "step": 13000 + }, + { + "epoch": 3.8013866231647633, + "grad_norm": 0.3471841812133789, + "learning_rate": 0.0005547105800058292, + "loss": 3.6363, + "step": 13050 + }, + { + "epoch": 3.815951992542531, + "grad_norm": 0.3232862651348114, + "learning_rate": 0.0005545357038764208, + "loss": 3.6401, + "step": 13100 + }, + { + "epoch": 3.8305173619202986, + "grad_norm": 0.32816365361213684, + "learning_rate": 0.0005543608277470125, + "loss": 3.6346, + "step": 13150 + }, + { + "epoch": 3.8450827312980658, + "grad_norm": 0.3071276545524597, + "learning_rate": 0.0005541859516176042, + "loss": 3.6315, + "step": 13200 + }, + { + "epoch": 3.859648100675833, + "grad_norm": 0.31473508477211, + "learning_rate": 0.0005540110754881958, + "loss": 3.6417, + "step": 13250 + }, + { + "epoch": 3.8742134700536006, + "grad_norm": 0.33018672466278076, + "learning_rate": 0.0005538361993587874, + "loss": 3.6422, + "step": 13300 + }, + { + "epoch": 3.888778839431368, + "grad_norm": 0.29861611127853394, + "learning_rate": 0.0005536613232293791, + "loss": 3.6341, + "step": 13350 + }, + { + "epoch": 3.9033442088091355, + "grad_norm": 0.3709903955459595, + "learning_rate": 0.0005534864470999708, + "loss": 3.6334, + "step": 13400 + }, + { + "epoch": 3.9179095781869027, + "grad_norm": 0.30411627888679504, + "learning_rate": 0.0005533115709705625, + "loss": 3.6261, + "step": 13450 + }, + { + "epoch": 3.9324749475646703, + "grad_norm": 0.30915218591690063, + "learning_rate": 0.0005531366948411541, + "loss": 3.6405, + "step": 13500 + }, + { + "epoch": 3.9470403169424375, + "grad_norm": 0.3229111135005951, + "learning_rate": 0.0005529618187117458, + "loss": 3.6408, + "step": 13550 + }, + { + "epoch": 3.961605686320205, + "grad_norm": 0.3003472089767456, + "learning_rate": 0.0005527869425823375, + "loss": 3.6375, + "step": 13600 + }, + { + "epoch": 3.9761710556979724, + "grad_norm": 0.31997328996658325, + "learning_rate": 0.0005526120664529292, + "loss": 3.6224, + "step": 13650 + }, + { + "epoch": 3.99073642507574, + "grad_norm": 0.3155164420604706, + "learning_rate": 0.0005524371903235207, + "loss": 3.64, + "step": 13700 + }, + { + "epoch": 4.005243532975996, + "grad_norm": 0.30095911026000977, + "learning_rate": 0.0005522623141941124, + "loss": 3.5881, + "step": 13750 + }, + { + "epoch": 4.0198089023537635, + "grad_norm": 0.3097637891769409, + "learning_rate": 0.0005520874380647041, + "loss": 3.5144, + "step": 13800 + }, + { + "epoch": 4.034374271731531, + "grad_norm": 0.3112574815750122, + "learning_rate": 0.0005519125619352957, + "loss": 3.5128, + "step": 13850 + }, + { + "epoch": 4.048939641109299, + "grad_norm": 0.3345850110054016, + "learning_rate": 0.0005517376858058875, + "loss": 3.5277, + "step": 13900 + }, + { + "epoch": 4.063505010487066, + "grad_norm": 0.3319241404533386, + "learning_rate": 0.0005515628096764791, + "loss": 3.5387, + "step": 13950 + }, + { + "epoch": 4.078070379864833, + "grad_norm": 0.32741302251815796, + "learning_rate": 0.0005513879335470708, + "loss": 3.5437, + "step": 14000 + }, + { + "epoch": 4.078070379864833, + "eval_accuracy": 0.3564399668688669, + "eval_loss": 3.6676642894744873, + "eval_runtime": 180.2205, + "eval_samples_per_second": 92.359, + "eval_steps_per_second": 5.776, + "step": 14000 + }, + { + "epoch": 4.092635749242601, + "grad_norm": 0.3242168426513672, + "learning_rate": 0.0005512130574176625, + "loss": 3.5313, + "step": 14050 + }, + { + "epoch": 4.1072011186203685, + "grad_norm": 0.3130347728729248, + "learning_rate": 0.000551038181288254, + "loss": 3.5486, + "step": 14100 + }, + { + "epoch": 4.121766487998135, + "grad_norm": 0.31881386041641235, + "learning_rate": 0.0005508633051588457, + "loss": 3.5395, + "step": 14150 + }, + { + "epoch": 4.136331857375903, + "grad_norm": 0.3165348768234253, + "learning_rate": 0.0005506884290294374, + "loss": 3.5563, + "step": 14200 + }, + { + "epoch": 4.150897226753671, + "grad_norm": 0.31050053238868713, + "learning_rate": 0.0005505135529000291, + "loss": 3.5441, + "step": 14250 + }, + { + "epoch": 4.165462596131438, + "grad_norm": 0.31978413462638855, + "learning_rate": 0.0005503386767706207, + "loss": 3.5413, + "step": 14300 + }, + { + "epoch": 4.180027965509205, + "grad_norm": 0.3227480947971344, + "learning_rate": 0.0005501638006412124, + "loss": 3.5426, + "step": 14350 + }, + { + "epoch": 4.194593334886973, + "grad_norm": 0.30381980538368225, + "learning_rate": 0.0005499889245118041, + "loss": 3.5495, + "step": 14400 + }, + { + "epoch": 4.20915870426474, + "grad_norm": 0.3120647966861725, + "learning_rate": 0.0005498140483823958, + "loss": 3.5646, + "step": 14450 + }, + { + "epoch": 4.223724073642508, + "grad_norm": 0.3166692852973938, + "learning_rate": 0.0005496391722529875, + "loss": 3.5624, + "step": 14500 + }, + { + "epoch": 4.238289443020275, + "grad_norm": 0.31486088037490845, + "learning_rate": 0.000549464296123579, + "loss": 3.5641, + "step": 14550 + }, + { + "epoch": 4.252854812398042, + "grad_norm": 0.32626253366470337, + "learning_rate": 0.0005492894199941707, + "loss": 3.5515, + "step": 14600 + }, + { + "epoch": 4.26742018177581, + "grad_norm": 0.30358177423477173, + "learning_rate": 0.0005491145438647624, + "loss": 3.5685, + "step": 14650 + }, + { + "epoch": 4.281985551153578, + "grad_norm": 0.3053493797779083, + "learning_rate": 0.000548939667735354, + "loss": 3.5679, + "step": 14700 + }, + { + "epoch": 4.296550920531344, + "grad_norm": 0.3187207579612732, + "learning_rate": 0.0005487647916059457, + "loss": 3.5632, + "step": 14750 + }, + { + "epoch": 4.311116289909112, + "grad_norm": 0.32234564423561096, + "learning_rate": 0.0005485899154765374, + "loss": 3.5622, + "step": 14800 + }, + { + "epoch": 4.32568165928688, + "grad_norm": 0.31007739901542664, + "learning_rate": 0.0005484150393471291, + "loss": 3.5543, + "step": 14850 + }, + { + "epoch": 4.340247028664647, + "grad_norm": 0.315043181180954, + "learning_rate": 0.0005482401632177208, + "loss": 3.5664, + "step": 14900 + }, + { + "epoch": 4.354812398042414, + "grad_norm": 0.30842092633247375, + "learning_rate": 0.0005480652870883124, + "loss": 3.5638, + "step": 14950 + }, + { + "epoch": 4.369377767420182, + "grad_norm": 0.31273534893989563, + "learning_rate": 0.000547890410958904, + "loss": 3.5684, + "step": 15000 + }, + { + "epoch": 4.369377767420182, + "eval_accuracy": 0.3577464010454295, + "eval_loss": 3.658200979232788, + "eval_runtime": 180.2541, + "eval_samples_per_second": 92.342, + "eval_steps_per_second": 5.775, + "step": 15000 + }, + { + "epoch": 4.383943136797949, + "grad_norm": 0.32522329688072205, + "learning_rate": 0.0005477155348294957, + "loss": 3.5632, + "step": 15050 + }, + { + "epoch": 4.398508506175717, + "grad_norm": 0.3170122504234314, + "learning_rate": 0.0005475406587000874, + "loss": 3.5591, + "step": 15100 + }, + { + "epoch": 4.413073875553484, + "grad_norm": 0.30741000175476074, + "learning_rate": 0.000547365782570679, + "loss": 3.5775, + "step": 15150 + }, + { + "epoch": 4.427639244931251, + "grad_norm": 0.32984650135040283, + "learning_rate": 0.0005471909064412707, + "loss": 3.5699, + "step": 15200 + }, + { + "epoch": 4.442204614309019, + "grad_norm": 0.32355785369873047, + "learning_rate": 0.0005470160303118624, + "loss": 3.5782, + "step": 15250 + }, + { + "epoch": 4.456769983686787, + "grad_norm": 0.31487616896629333, + "learning_rate": 0.000546841154182454, + "loss": 3.5612, + "step": 15300 + }, + { + "epoch": 4.471335353064553, + "grad_norm": 0.3270156681537628, + "learning_rate": 0.0005466662780530458, + "loss": 3.5596, + "step": 15350 + }, + { + "epoch": 4.485900722442321, + "grad_norm": 0.3146279752254486, + "learning_rate": 0.0005464914019236374, + "loss": 3.5732, + "step": 15400 + }, + { + "epoch": 4.500466091820089, + "grad_norm": 0.297494113445282, + "learning_rate": 0.000546316525794229, + "loss": 3.5721, + "step": 15450 + }, + { + "epoch": 4.515031461197856, + "grad_norm": 0.3217732012271881, + "learning_rate": 0.0005461416496648207, + "loss": 3.5773, + "step": 15500 + }, + { + "epoch": 4.529596830575623, + "grad_norm": 0.3060513734817505, + "learning_rate": 0.0005459667735354123, + "loss": 3.5638, + "step": 15550 + }, + { + "epoch": 4.544162199953391, + "grad_norm": 0.33157843351364136, + "learning_rate": 0.000545791897406004, + "loss": 3.5654, + "step": 15600 + }, + { + "epoch": 4.558727569331158, + "grad_norm": 0.3019466698169708, + "learning_rate": 0.0005456170212765957, + "loss": 3.5716, + "step": 15650 + }, + { + "epoch": 4.573292938708926, + "grad_norm": 0.3082275688648224, + "learning_rate": 0.0005454421451471874, + "loss": 3.5811, + "step": 15700 + }, + { + "epoch": 4.587858308086693, + "grad_norm": 0.3220166563987732, + "learning_rate": 0.000545267269017779, + "loss": 3.5781, + "step": 15750 + }, + { + "epoch": 4.6024236774644605, + "grad_norm": 0.30748656392097473, + "learning_rate": 0.0005450923928883708, + "loss": 3.5713, + "step": 15800 + }, + { + "epoch": 4.616989046842228, + "grad_norm": 0.3191044330596924, + "learning_rate": 0.0005449175167589623, + "loss": 3.5861, + "step": 15850 + }, + { + "epoch": 4.631554416219995, + "grad_norm": 0.31128981709480286, + "learning_rate": 0.000544742640629554, + "loss": 3.5644, + "step": 15900 + }, + { + "epoch": 4.6461197855977625, + "grad_norm": 0.3463725447654724, + "learning_rate": 0.0005445677645001457, + "loss": 3.5708, + "step": 15950 + }, + { + "epoch": 4.66068515497553, + "grad_norm": 0.32188108563423157, + "learning_rate": 0.0005443928883707373, + "loss": 3.5745, + "step": 16000 + }, + { + "epoch": 4.66068515497553, + "eval_accuracy": 0.35857467937281284, + "eval_loss": 3.6439030170440674, + "eval_runtime": 180.24, + "eval_samples_per_second": 92.349, + "eval_steps_per_second": 5.776, + "step": 16000 + }, + { + "epoch": 4.675250524353298, + "grad_norm": 0.32182472944259644, + "learning_rate": 0.000544218012241329, + "loss": 3.5742, + "step": 16050 + }, + { + "epoch": 4.689815893731065, + "grad_norm": 0.33084315061569214, + "learning_rate": 0.0005440431361119207, + "loss": 3.577, + "step": 16100 + }, + { + "epoch": 4.704381263108832, + "grad_norm": 0.3214638829231262, + "learning_rate": 0.0005438682599825123, + "loss": 3.5804, + "step": 16150 + }, + { + "epoch": 4.7189466324866, + "grad_norm": 0.30823513865470886, + "learning_rate": 0.000543693383853104, + "loss": 3.5711, + "step": 16200 + }, + { + "epoch": 4.7335120018643675, + "grad_norm": 0.31343257427215576, + "learning_rate": 0.0005435185077236957, + "loss": 3.5697, + "step": 16250 + }, + { + "epoch": 4.748077371242134, + "grad_norm": 0.30805066227912903, + "learning_rate": 0.0005433436315942873, + "loss": 3.5826, + "step": 16300 + }, + { + "epoch": 4.762642740619902, + "grad_norm": 0.31302279233932495, + "learning_rate": 0.000543168755464879, + "loss": 3.5666, + "step": 16350 + }, + { + "epoch": 4.7772081099976695, + "grad_norm": 0.32919782400131226, + "learning_rate": 0.0005429938793354706, + "loss": 3.5658, + "step": 16400 + }, + { + "epoch": 4.791773479375437, + "grad_norm": 0.31369245052337646, + "learning_rate": 0.0005428190032060623, + "loss": 3.5781, + "step": 16450 + }, + { + "epoch": 4.806338848753205, + "grad_norm": 0.31175729632377625, + "learning_rate": 0.000542644127076654, + "loss": 3.5749, + "step": 16500 + }, + { + "epoch": 4.820904218130972, + "grad_norm": 0.3278850317001343, + "learning_rate": 0.0005424692509472457, + "loss": 3.5751, + "step": 16550 + }, + { + "epoch": 4.835469587508739, + "grad_norm": 0.30360278487205505, + "learning_rate": 0.0005422943748178373, + "loss": 3.5759, + "step": 16600 + }, + { + "epoch": 4.850034956886507, + "grad_norm": 0.30331000685691833, + "learning_rate": 0.000542119498688429, + "loss": 3.5919, + "step": 16650 + }, + { + "epoch": 4.864600326264274, + "grad_norm": 0.3112633526325226, + "learning_rate": 0.0005419446225590207, + "loss": 3.5798, + "step": 16700 + }, + { + "epoch": 4.879165695642041, + "grad_norm": 0.29969000816345215, + "learning_rate": 0.0005417697464296122, + "loss": 3.5732, + "step": 16750 + }, + { + "epoch": 4.893731065019809, + "grad_norm": 0.30441269278526306, + "learning_rate": 0.000541594870300204, + "loss": 3.5842, + "step": 16800 + }, + { + "epoch": 4.908296434397577, + "grad_norm": 0.292208731174469, + "learning_rate": 0.0005414199941707956, + "loss": 3.5711, + "step": 16850 + }, + { + "epoch": 4.922861803775344, + "grad_norm": 0.3113518953323364, + "learning_rate": 0.0005412451180413873, + "loss": 3.5742, + "step": 16900 + }, + { + "epoch": 4.937427173153111, + "grad_norm": 0.3084389567375183, + "learning_rate": 0.000541070241911979, + "loss": 3.5705, + "step": 16950 + }, + { + "epoch": 4.951992542530879, + "grad_norm": 0.3114304542541504, + "learning_rate": 0.0005408953657825706, + "loss": 3.5827, + "step": 17000 + }, + { + "epoch": 4.951992542530879, + "eval_accuracy": 0.3599802247814527, + "eval_loss": 3.6305477619171143, + "eval_runtime": 180.3299, + "eval_samples_per_second": 92.303, + "eval_steps_per_second": 5.773, + "step": 17000 + }, + { + "epoch": 4.966557911908646, + "grad_norm": 0.3147881329059601, + "learning_rate": 0.0005407204896531623, + "loss": 3.5661, + "step": 17050 + }, + { + "epoch": 4.981123281286413, + "grad_norm": 0.33159729838371277, + "learning_rate": 0.000540545613523754, + "loss": 3.5697, + "step": 17100 + }, + { + "epoch": 4.995688650664181, + "grad_norm": 0.31183063983917236, + "learning_rate": 0.0005403707373943456, + "loss": 3.5775, + "step": 17150 + }, + { + "epoch": 5.010195758564437, + "grad_norm": 0.3240616023540497, + "learning_rate": 0.0005401958612649372, + "loss": 3.4929, + "step": 17200 + }, + { + "epoch": 5.024761127942204, + "grad_norm": 0.367702454328537, + "learning_rate": 0.000540020985135529, + "loss": 3.4545, + "step": 17250 + }, + { + "epoch": 5.039326497319972, + "grad_norm": 0.2979840636253357, + "learning_rate": 0.0005398461090061206, + "loss": 3.4727, + "step": 17300 + }, + { + "epoch": 5.0538918666977395, + "grad_norm": 0.3102603852748871, + "learning_rate": 0.0005396712328767123, + "loss": 3.472, + "step": 17350 + }, + { + "epoch": 5.068457236075507, + "grad_norm": 0.3156759738922119, + "learning_rate": 0.000539496356747304, + "loss": 3.4673, + "step": 17400 + }, + { + "epoch": 5.083022605453274, + "grad_norm": 0.31477898359298706, + "learning_rate": 0.0005393214806178956, + "loss": 3.4812, + "step": 17450 + }, + { + "epoch": 5.0975879748310415, + "grad_norm": 0.3182123601436615, + "learning_rate": 0.0005391466044884873, + "loss": 3.4805, + "step": 17500 + }, + { + "epoch": 5.112153344208809, + "grad_norm": 0.33958199620246887, + "learning_rate": 0.000538971728359079, + "loss": 3.4871, + "step": 17550 + }, + { + "epoch": 5.126718713586577, + "grad_norm": 0.30561456084251404, + "learning_rate": 0.0005387968522296705, + "loss": 3.4898, + "step": 17600 + }, + { + "epoch": 5.141284082964344, + "grad_norm": 0.33050698041915894, + "learning_rate": 0.0005386219761002622, + "loss": 3.4846, + "step": 17650 + }, + { + "epoch": 5.155849452342111, + "grad_norm": 0.3352583944797516, + "learning_rate": 0.0005384470999708539, + "loss": 3.4927, + "step": 17700 + }, + { + "epoch": 5.170414821719879, + "grad_norm": 0.3153831958770752, + "learning_rate": 0.0005382722238414456, + "loss": 3.4836, + "step": 17750 + }, + { + "epoch": 5.1849801910976465, + "grad_norm": 0.3268112540245056, + "learning_rate": 0.0005380973477120373, + "loss": 3.4846, + "step": 17800 + }, + { + "epoch": 5.199545560475413, + "grad_norm": 0.3340913951396942, + "learning_rate": 0.000537922471582629, + "loss": 3.5008, + "step": 17850 + }, + { + "epoch": 5.214110929853181, + "grad_norm": 0.330852210521698, + "learning_rate": 0.0005377475954532206, + "loss": 3.5119, + "step": 17900 + }, + { + "epoch": 5.228676299230949, + "grad_norm": 0.32476457953453064, + "learning_rate": 0.0005375727193238123, + "loss": 3.4917, + "step": 17950 + }, + { + "epoch": 5.243241668608716, + "grad_norm": 0.30598244071006775, + "learning_rate": 0.000537397843194404, + "loss": 3.4979, + "step": 18000 + }, + { + "epoch": 5.243241668608716, + "eval_accuracy": 0.359971054347168, + "eval_loss": 3.6366615295410156, + "eval_runtime": 180.2334, + "eval_samples_per_second": 92.352, + "eval_steps_per_second": 5.776, + "step": 18000 + }, + { + "epoch": 5.257807037986483, + "grad_norm": 0.3335484266281128, + "learning_rate": 0.0005372229670649955, + "loss": 3.4986, + "step": 18050 + }, + { + "epoch": 5.272372407364251, + "grad_norm": 0.3260400593280792, + "learning_rate": 0.0005370480909355872, + "loss": 3.4974, + "step": 18100 + }, + { + "epoch": 5.286937776742018, + "grad_norm": 0.3084673583507538, + "learning_rate": 0.0005368732148061789, + "loss": 3.506, + "step": 18150 + }, + { + "epoch": 5.301503146119786, + "grad_norm": 0.32834941148757935, + "learning_rate": 0.0005366983386767705, + "loss": 3.5106, + "step": 18200 + }, + { + "epoch": 5.316068515497553, + "grad_norm": 0.3387486934661865, + "learning_rate": 0.0005365234625473623, + "loss": 3.5101, + "step": 18250 + }, + { + "epoch": 5.33063388487532, + "grad_norm": 0.30053550004959106, + "learning_rate": 0.0005363485864179539, + "loss": 3.5012, + "step": 18300 + }, + { + "epoch": 5.345199254253088, + "grad_norm": 0.3416799306869507, + "learning_rate": 0.0005361737102885456, + "loss": 3.5223, + "step": 18350 + }, + { + "epoch": 5.359764623630856, + "grad_norm": 0.3232312500476837, + "learning_rate": 0.0005359988341591373, + "loss": 3.504, + "step": 18400 + }, + { + "epoch": 5.374329993008622, + "grad_norm": 0.3403994143009186, + "learning_rate": 0.000535823958029729, + "loss": 3.505, + "step": 18450 + }, + { + "epoch": 5.38889536238639, + "grad_norm": 0.3213067352771759, + "learning_rate": 0.0005356490819003205, + "loss": 3.5181, + "step": 18500 + }, + { + "epoch": 5.403460731764158, + "grad_norm": 0.31317710876464844, + "learning_rate": 0.0005354742057709122, + "loss": 3.5133, + "step": 18550 + }, + { + "epoch": 5.418026101141925, + "grad_norm": 0.3116888701915741, + "learning_rate": 0.0005352993296415039, + "loss": 3.5131, + "step": 18600 + }, + { + "epoch": 5.432591470519692, + "grad_norm": 0.31915172934532166, + "learning_rate": 0.0005351244535120955, + "loss": 3.5048, + "step": 18650 + }, + { + "epoch": 5.44715683989746, + "grad_norm": 0.33018389344215393, + "learning_rate": 0.0005349495773826873, + "loss": 3.5235, + "step": 18700 + }, + { + "epoch": 5.461722209275227, + "grad_norm": 0.32426100969314575, + "learning_rate": 0.0005347747012532789, + "loss": 3.5113, + "step": 18750 + }, + { + "epoch": 5.476287578652995, + "grad_norm": 0.31989872455596924, + "learning_rate": 0.0005345998251238706, + "loss": 3.5174, + "step": 18800 + }, + { + "epoch": 5.490852948030762, + "grad_norm": 0.30927300453186035, + "learning_rate": 0.0005344249489944623, + "loss": 3.5063, + "step": 18850 + }, + { + "epoch": 5.505418317408529, + "grad_norm": 0.3078673183917999, + "learning_rate": 0.0005342500728650538, + "loss": 3.5206, + "step": 18900 + }, + { + "epoch": 5.519983686786297, + "grad_norm": 0.3324156701564789, + "learning_rate": 0.0005340751967356455, + "loss": 3.5154, + "step": 18950 + }, + { + "epoch": 5.534549056164065, + "grad_norm": 0.3175108730792999, + "learning_rate": 0.0005339003206062372, + "loss": 3.5219, + "step": 19000 + }, + { + "epoch": 5.534549056164065, + "eval_accuracy": 0.36131487567889137, + "eval_loss": 3.623619556427002, + "eval_runtime": 180.1823, + "eval_samples_per_second": 92.379, + "eval_steps_per_second": 5.777, + "step": 19000 + }, + { + "epoch": 5.549114425541831, + "grad_norm": 0.3166046440601349, + "learning_rate": 0.0005337254444768288, + "loss": 3.5227, + "step": 19050 + }, + { + "epoch": 5.563679794919599, + "grad_norm": 0.3559969961643219, + "learning_rate": 0.0005335505683474205, + "loss": 3.5278, + "step": 19100 + }, + { + "epoch": 5.578245164297367, + "grad_norm": 0.31637391448020935, + "learning_rate": 0.0005333756922180122, + "loss": 3.5251, + "step": 19150 + }, + { + "epoch": 5.592810533675134, + "grad_norm": 0.33826208114624023, + "learning_rate": 0.0005332008160886039, + "loss": 3.5306, + "step": 19200 + }, + { + "epoch": 5.607375903052901, + "grad_norm": 0.3146003484725952, + "learning_rate": 0.0005330259399591956, + "loss": 3.5176, + "step": 19250 + }, + { + "epoch": 5.621941272430669, + "grad_norm": 0.33475756645202637, + "learning_rate": 0.0005328510638297873, + "loss": 3.5252, + "step": 19300 + }, + { + "epoch": 5.636506641808436, + "grad_norm": 0.31839892268180847, + "learning_rate": 0.0005326761877003788, + "loss": 3.5227, + "step": 19350 + }, + { + "epoch": 5.651072011186204, + "grad_norm": 0.3179383873939514, + "learning_rate": 0.0005325013115709705, + "loss": 3.5276, + "step": 19400 + }, + { + "epoch": 5.665637380563971, + "grad_norm": 0.32419490814208984, + "learning_rate": 0.0005323264354415622, + "loss": 3.5182, + "step": 19450 + }, + { + "epoch": 5.6802027499417385, + "grad_norm": 0.30967098474502563, + "learning_rate": 0.0005321515593121538, + "loss": 3.5231, + "step": 19500 + }, + { + "epoch": 5.694768119319506, + "grad_norm": 0.33187001943588257, + "learning_rate": 0.0005319766831827455, + "loss": 3.5163, + "step": 19550 + }, + { + "epoch": 5.709333488697274, + "grad_norm": 0.3205404281616211, + "learning_rate": 0.0005318018070533372, + "loss": 3.5162, + "step": 19600 + }, + { + "epoch": 5.7238988580750405, + "grad_norm": 0.3075636923313141, + "learning_rate": 0.0005316269309239288, + "loss": 3.5245, + "step": 19650 + }, + { + "epoch": 5.738464227452808, + "grad_norm": 0.3177931010723114, + "learning_rate": 0.0005314520547945206, + "loss": 3.5179, + "step": 19700 + }, + { + "epoch": 5.753029596830576, + "grad_norm": 0.3217032849788666, + "learning_rate": 0.0005312771786651121, + "loss": 3.5328, + "step": 19750 + }, + { + "epoch": 5.7675949662083426, + "grad_norm": 0.3182378113269806, + "learning_rate": 0.0005311023025357038, + "loss": 3.5269, + "step": 19800 + }, + { + "epoch": 5.78216033558611, + "grad_norm": 0.3116445541381836, + "learning_rate": 0.0005309274264062955, + "loss": 3.5313, + "step": 19850 + }, + { + "epoch": 5.796725704963878, + "grad_norm": 0.3098820447921753, + "learning_rate": 0.0005307525502768872, + "loss": 3.5347, + "step": 19900 + }, + { + "epoch": 5.8112910743416455, + "grad_norm": 0.3134811818599701, + "learning_rate": 0.0005305776741474788, + "loss": 3.5424, + "step": 19950 + }, + { + "epoch": 5.825856443719413, + "grad_norm": 0.3062841296195984, + "learning_rate": 0.0005304027980180705, + "loss": 3.5406, + "step": 20000 + }, + { + "epoch": 5.825856443719413, + "eval_accuracy": 0.36257216573326145, + "eval_loss": 3.6113312244415283, + "eval_runtime": 180.3944, + "eval_samples_per_second": 92.27, + "eval_steps_per_second": 5.771, + "step": 20000 + }, + { + "epoch": 5.84042181309718, + "grad_norm": 0.32947519421577454, + "learning_rate": 0.0005302279218886622, + "loss": 3.5347, + "step": 20050 + }, + { + "epoch": 5.8549871824749475, + "grad_norm": 0.3123909533023834, + "learning_rate": 0.0005300530457592538, + "loss": 3.5286, + "step": 20100 + }, + { + "epoch": 5.869552551852715, + "grad_norm": 0.34147006273269653, + "learning_rate": 0.0005298781696298456, + "loss": 3.5263, + "step": 20150 + }, + { + "epoch": 5.884117921230482, + "grad_norm": 0.3259049654006958, + "learning_rate": 0.0005297032935004371, + "loss": 3.5195, + "step": 20200 + }, + { + "epoch": 5.89868329060825, + "grad_norm": 0.31503719091415405, + "learning_rate": 0.0005295284173710288, + "loss": 3.5295, + "step": 20250 + }, + { + "epoch": 5.913248659986017, + "grad_norm": 0.32215699553489685, + "learning_rate": 0.0005293535412416205, + "loss": 3.5369, + "step": 20300 + }, + { + "epoch": 5.927814029363785, + "grad_norm": 0.33477988839149475, + "learning_rate": 0.0005291786651122121, + "loss": 3.5262, + "step": 20350 + }, + { + "epoch": 5.9423793987415525, + "grad_norm": 0.3120667040348053, + "learning_rate": 0.0005290037889828038, + "loss": 3.5411, + "step": 20400 + }, + { + "epoch": 5.956944768119319, + "grad_norm": 0.31322258710861206, + "learning_rate": 0.0005288289128533955, + "loss": 3.5162, + "step": 20450 + }, + { + "epoch": 5.971510137497087, + "grad_norm": 0.3116031885147095, + "learning_rate": 0.0005286540367239872, + "loss": 3.5246, + "step": 20500 + }, + { + "epoch": 5.986075506874855, + "grad_norm": 0.3136381506919861, + "learning_rate": 0.0005284791605945788, + "loss": 3.5164, + "step": 20550 + }, + { + "epoch": 6.0005826147751105, + "grad_norm": 0.314301460981369, + "learning_rate": 0.0005283042844651704, + "loss": 3.5269, + "step": 20600 + }, + { + "epoch": 6.015147984152878, + "grad_norm": 0.3147795498371124, + "learning_rate": 0.0005281294083357621, + "loss": 3.4183, + "step": 20650 + }, + { + "epoch": 6.029713353530646, + "grad_norm": 0.31358394026756287, + "learning_rate": 0.0005279545322063538, + "loss": 3.4052, + "step": 20700 + }, + { + "epoch": 6.044278722908413, + "grad_norm": 0.3503838777542114, + "learning_rate": 0.0005277796560769455, + "loss": 3.4225, + "step": 20750 + }, + { + "epoch": 6.05884409228618, + "grad_norm": 0.336975634098053, + "learning_rate": 0.0005276047799475371, + "loss": 3.4233, + "step": 20800 + }, + { + "epoch": 6.073409461663948, + "grad_norm": 0.34221938252449036, + "learning_rate": 0.0005274299038181288, + "loss": 3.4379, + "step": 20850 + }, + { + "epoch": 6.087974831041715, + "grad_norm": 0.30639681220054626, + "learning_rate": 0.0005272550276887205, + "loss": 3.4234, + "step": 20900 + }, + { + "epoch": 6.102540200419483, + "grad_norm": 0.3100702464580536, + "learning_rate": 0.0005270801515593121, + "loss": 3.4353, + "step": 20950 + }, + { + "epoch": 6.11710556979725, + "grad_norm": 0.31649187207221985, + "learning_rate": 0.0005269052754299037, + "loss": 3.4266, + "step": 21000 + }, + { + "epoch": 6.11710556979725, + "eval_accuracy": 0.3626251896545744, + "eval_loss": 3.617746114730835, + "eval_runtime": 180.2077, + "eval_samples_per_second": 92.366, + "eval_steps_per_second": 5.777, + "step": 21000 + }, + { + "epoch": 6.1316709391750175, + "grad_norm": 0.31790605187416077, + "learning_rate": 0.0005267303993004954, + "loss": 3.4448, + "step": 21050 + }, + { + "epoch": 6.146236308552785, + "grad_norm": 0.3185005486011505, + "learning_rate": 0.000526555523171087, + "loss": 3.4435, + "step": 21100 + }, + { + "epoch": 6.160801677930552, + "grad_norm": 0.32815855741500854, + "learning_rate": 0.0005263806470416788, + "loss": 3.4373, + "step": 21150 + }, + { + "epoch": 6.1753670473083195, + "grad_norm": 0.3747202754020691, + "learning_rate": 0.0005262057709122704, + "loss": 3.4414, + "step": 21200 + }, + { + "epoch": 6.189932416686087, + "grad_norm": 0.3081320524215698, + "learning_rate": 0.0005260308947828621, + "loss": 3.4516, + "step": 21250 + }, + { + "epoch": 6.204497786063855, + "grad_norm": 0.33076637983322144, + "learning_rate": 0.0005258560186534538, + "loss": 3.445, + "step": 21300 + }, + { + "epoch": 6.219063155441622, + "grad_norm": 0.34362319111824036, + "learning_rate": 0.0005256811425240455, + "loss": 3.4611, + "step": 21350 + }, + { + "epoch": 6.233628524819389, + "grad_norm": 0.31307855248451233, + "learning_rate": 0.0005255062663946371, + "loss": 3.4547, + "step": 21400 + }, + { + "epoch": 6.248193894197157, + "grad_norm": 0.33912956714630127, + "learning_rate": 0.0005253313902652287, + "loss": 3.4538, + "step": 21450 + }, + { + "epoch": 6.2627592635749245, + "grad_norm": 0.33405831456184387, + "learning_rate": 0.0005251565141358204, + "loss": 3.4433, + "step": 21500 + }, + { + "epoch": 6.277324632952691, + "grad_norm": 0.32273417711257935, + "learning_rate": 0.000524981638006412, + "loss": 3.4635, + "step": 21550 + }, + { + "epoch": 6.291890002330459, + "grad_norm": 0.3156605660915375, + "learning_rate": 0.0005248067618770038, + "loss": 3.4561, + "step": 21600 + }, + { + "epoch": 6.306455371708227, + "grad_norm": 0.3214246928691864, + "learning_rate": 0.0005246318857475954, + "loss": 3.4539, + "step": 21650 + }, + { + "epoch": 6.321020741085994, + "grad_norm": 0.30894410610198975, + "learning_rate": 0.0005244570096181871, + "loss": 3.4651, + "step": 21700 + }, + { + "epoch": 6.335586110463761, + "grad_norm": 0.36027565598487854, + "learning_rate": 0.0005242821334887788, + "loss": 3.4633, + "step": 21750 + }, + { + "epoch": 6.350151479841529, + "grad_norm": 0.33685246109962463, + "learning_rate": 0.0005241072573593704, + "loss": 3.4651, + "step": 21800 + }, + { + "epoch": 6.364716849219296, + "grad_norm": 0.3304831087589264, + "learning_rate": 0.000523932381229962, + "loss": 3.4685, + "step": 21850 + }, + { + "epoch": 6.379282218597064, + "grad_norm": 0.30572712421417236, + "learning_rate": 0.0005237575051005537, + "loss": 3.4666, + "step": 21900 + }, + { + "epoch": 6.393847587974831, + "grad_norm": 0.3293927013874054, + "learning_rate": 0.0005235826289711454, + "loss": 3.4804, + "step": 21950 + }, + { + "epoch": 6.408412957352598, + "grad_norm": 0.31569355726242065, + "learning_rate": 0.000523407752841737, + "loss": 3.4681, + "step": 22000 + }, + { + "epoch": 6.408412957352598, + "eval_accuracy": 0.3631648344413295, + "eval_loss": 3.6071176528930664, + "eval_runtime": 180.3173, + "eval_samples_per_second": 92.31, + "eval_steps_per_second": 5.773, + "step": 22000 + }, + { + "epoch": 6.422978326730366, + "grad_norm": 0.3268168568611145, + "learning_rate": 0.0005232328767123287, + "loss": 3.4681, + "step": 22050 + }, + { + "epoch": 6.437543696108134, + "grad_norm": 0.3143610656261444, + "learning_rate": 0.0005230580005829204, + "loss": 3.4788, + "step": 22100 + }, + { + "epoch": 6.4521090654859, + "grad_norm": 0.3352193534374237, + "learning_rate": 0.0005228831244535121, + "loss": 3.4648, + "step": 22150 + }, + { + "epoch": 6.466674434863668, + "grad_norm": 0.3402239680290222, + "learning_rate": 0.0005227082483241038, + "loss": 3.4847, + "step": 22200 + }, + { + "epoch": 6.481239804241436, + "grad_norm": 0.32882171869277954, + "learning_rate": 0.0005225333721946954, + "loss": 3.4692, + "step": 22250 + }, + { + "epoch": 6.495805173619203, + "grad_norm": 0.3083683252334595, + "learning_rate": 0.000522358496065287, + "loss": 3.4799, + "step": 22300 + }, + { + "epoch": 6.51037054299697, + "grad_norm": 0.32959502935409546, + "learning_rate": 0.0005221836199358787, + "loss": 3.4769, + "step": 22350 + }, + { + "epoch": 6.524935912374738, + "grad_norm": 0.31663578748703003, + "learning_rate": 0.0005220087438064703, + "loss": 3.4759, + "step": 22400 + }, + { + "epoch": 6.539501281752505, + "grad_norm": 0.3328782916069031, + "learning_rate": 0.000521833867677062, + "loss": 3.476, + "step": 22450 + }, + { + "epoch": 6.554066651130273, + "grad_norm": 0.32036277651786804, + "learning_rate": 0.0005216589915476537, + "loss": 3.4838, + "step": 22500 + }, + { + "epoch": 6.56863202050804, + "grad_norm": 0.31284037232398987, + "learning_rate": 0.0005214841154182454, + "loss": 3.4818, + "step": 22550 + }, + { + "epoch": 6.583197389885807, + "grad_norm": 0.3226317763328552, + "learning_rate": 0.0005213092392888371, + "loss": 3.4757, + "step": 22600 + }, + { + "epoch": 6.597762759263575, + "grad_norm": 0.31363436579704285, + "learning_rate": 0.0005211343631594287, + "loss": 3.4836, + "step": 22650 + }, + { + "epoch": 6.612328128641343, + "grad_norm": 0.33146846294403076, + "learning_rate": 0.0005209594870300204, + "loss": 3.4733, + "step": 22700 + }, + { + "epoch": 6.626893498019109, + "grad_norm": 0.3308202624320984, + "learning_rate": 0.000520784610900612, + "loss": 3.4909, + "step": 22750 + }, + { + "epoch": 6.641458867396877, + "grad_norm": 0.331926554441452, + "learning_rate": 0.0005206097347712037, + "loss": 3.4868, + "step": 22800 + }, + { + "epoch": 6.656024236774645, + "grad_norm": 0.31595379114151, + "learning_rate": 0.0005204348586417953, + "loss": 3.4817, + "step": 22850 + }, + { + "epoch": 6.670589606152412, + "grad_norm": 0.3580048680305481, + "learning_rate": 0.000520259982512387, + "loss": 3.484, + "step": 22900 + }, + { + "epoch": 6.685154975530179, + "grad_norm": 0.30390664935112, + "learning_rate": 0.0005200851063829787, + "loss": 3.4853, + "step": 22950 + }, + { + "epoch": 6.699720344907947, + "grad_norm": 0.323244571685791, + "learning_rate": 0.0005199102302535703, + "loss": 3.4797, + "step": 23000 + }, + { + "epoch": 6.699720344907947, + "eval_accuracy": 0.36416934970451803, + "eval_loss": 3.600242853164673, + "eval_runtime": 180.297, + "eval_samples_per_second": 92.32, + "eval_steps_per_second": 5.774, + "step": 23000 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.3307446539402008, + "learning_rate": 0.0005197353541241621, + "loss": 3.4901, + "step": 23050 + }, + { + "epoch": 6.728851083663482, + "grad_norm": 0.3092699944972992, + "learning_rate": 0.0005195604779947537, + "loss": 3.4878, + "step": 23100 + }, + { + "epoch": 6.743416453041249, + "grad_norm": 0.33178552985191345, + "learning_rate": 0.0005193856018653454, + "loss": 3.4827, + "step": 23150 + }, + { + "epoch": 6.7579818224190165, + "grad_norm": 0.35661131143569946, + "learning_rate": 0.000519210725735937, + "loss": 3.4937, + "step": 23200 + }, + { + "epoch": 6.772547191796784, + "grad_norm": 0.319477915763855, + "learning_rate": 0.0005190358496065286, + "loss": 3.4894, + "step": 23250 + }, + { + "epoch": 6.787112561174552, + "grad_norm": 0.3135737478733063, + "learning_rate": 0.0005188609734771203, + "loss": 3.5, + "step": 23300 + }, + { + "epoch": 6.8016779305523185, + "grad_norm": 0.3119906187057495, + "learning_rate": 0.000518686097347712, + "loss": 3.4882, + "step": 23350 + }, + { + "epoch": 6.816243299930086, + "grad_norm": 0.31904417276382446, + "learning_rate": 0.0005185112212183037, + "loss": 3.4952, + "step": 23400 + }, + { + "epoch": 6.830808669307854, + "grad_norm": 0.33189624547958374, + "learning_rate": 0.0005183363450888953, + "loss": 3.4895, + "step": 23450 + }, + { + "epoch": 6.845374038685621, + "grad_norm": 0.3348432779312134, + "learning_rate": 0.000518161468959487, + "loss": 3.4911, + "step": 23500 + }, + { + "epoch": 6.859939408063388, + "grad_norm": 0.3141877055168152, + "learning_rate": 0.0005179865928300787, + "loss": 3.5002, + "step": 23550 + }, + { + "epoch": 6.874504777441156, + "grad_norm": 0.3227143883705139, + "learning_rate": 0.0005178117167006703, + "loss": 3.4831, + "step": 23600 + }, + { + "epoch": 6.8890701468189235, + "grad_norm": 0.317911833524704, + "learning_rate": 0.000517636840571262, + "loss": 3.494, + "step": 23650 + }, + { + "epoch": 6.903635516196691, + "grad_norm": 0.3200852572917938, + "learning_rate": 0.0005174619644418536, + "loss": 3.4797, + "step": 23700 + }, + { + "epoch": 6.918200885574458, + "grad_norm": 0.32029202580451965, + "learning_rate": 0.0005172870883124453, + "loss": 3.4846, + "step": 23750 + }, + { + "epoch": 6.9327662549522255, + "grad_norm": 0.322200208902359, + "learning_rate": 0.000517112212183037, + "loss": 3.4984, + "step": 23800 + }, + { + "epoch": 6.947331624329993, + "grad_norm": 0.3308181166648865, + "learning_rate": 0.0005169373360536286, + "loss": 3.4858, + "step": 23850 + }, + { + "epoch": 6.961896993707761, + "grad_norm": 0.3258877992630005, + "learning_rate": 0.0005167624599242203, + "loss": 3.4966, + "step": 23900 + }, + { + "epoch": 6.976462363085528, + "grad_norm": 0.32055163383483887, + "learning_rate": 0.000516587583794812, + "loss": 3.4928, + "step": 23950 + }, + { + "epoch": 6.991027732463295, + "grad_norm": 0.32862454652786255, + "learning_rate": 0.0005164127076654037, + "loss": 3.4969, + "step": 24000 + }, + { + "epoch": 6.991027732463295, + "eval_accuracy": 0.3648933437343302, + "eval_loss": 3.586930274963379, + "eval_runtime": 185.7598, + "eval_samples_per_second": 89.605, + "eval_steps_per_second": 5.604, + "step": 24000 + }, + { + "epoch": 7.005534840363552, + "grad_norm": 0.3120695948600769, + "learning_rate": 0.0005162378315359953, + "loss": 3.463, + "step": 24050 + }, + { + "epoch": 7.020100209741319, + "grad_norm": 0.32572463154792786, + "learning_rate": 0.0005160629554065869, + "loss": 3.3853, + "step": 24100 + }, + { + "epoch": 7.034665579119086, + "grad_norm": 0.3158482015132904, + "learning_rate": 0.0005158880792771786, + "loss": 3.3814, + "step": 24150 + }, + { + "epoch": 7.049230948496854, + "grad_norm": 0.322257936000824, + "learning_rate": 0.0005157132031477703, + "loss": 3.3902, + "step": 24200 + }, + { + "epoch": 7.063796317874622, + "grad_norm": 0.3240783214569092, + "learning_rate": 0.000515538327018362, + "loss": 3.3809, + "step": 24250 + }, + { + "epoch": 7.0783616872523885, + "grad_norm": 0.33611229062080383, + "learning_rate": 0.0005153634508889536, + "loss": 3.3813, + "step": 24300 + }, + { + "epoch": 7.092927056630156, + "grad_norm": 0.3258345425128937, + "learning_rate": 0.0005151885747595453, + "loss": 3.4111, + "step": 24350 + }, + { + "epoch": 7.107492426007924, + "grad_norm": 0.3187943994998932, + "learning_rate": 0.000515013698630137, + "loss": 3.3978, + "step": 24400 + }, + { + "epoch": 7.122057795385691, + "grad_norm": 0.3566177487373352, + "learning_rate": 0.0005148388225007285, + "loss": 3.4093, + "step": 24450 + }, + { + "epoch": 7.136623164763458, + "grad_norm": 0.32738471031188965, + "learning_rate": 0.0005146639463713203, + "loss": 3.4058, + "step": 24500 + }, + { + "epoch": 7.151188534141226, + "grad_norm": 0.3418164849281311, + "learning_rate": 0.0005144890702419119, + "loss": 3.4025, + "step": 24550 + }, + { + "epoch": 7.165753903518993, + "grad_norm": 0.3443870544433594, + "learning_rate": 0.0005143141941125036, + "loss": 3.4091, + "step": 24600 + }, + { + "epoch": 7.180319272896761, + "grad_norm": 0.3457808494567871, + "learning_rate": 0.0005141393179830953, + "loss": 3.4135, + "step": 24650 + }, + { + "epoch": 7.194884642274528, + "grad_norm": 0.3336106240749359, + "learning_rate": 0.0005139644418536869, + "loss": 3.4172, + "step": 24700 + }, + { + "epoch": 7.2094500116522955, + "grad_norm": 0.33773717284202576, + "learning_rate": 0.0005137895657242786, + "loss": 3.4075, + "step": 24750 + }, + { + "epoch": 7.224015381030063, + "grad_norm": 0.3136851489543915, + "learning_rate": 0.0005136146895948703, + "loss": 3.4221, + "step": 24800 + }, + { + "epoch": 7.238580750407831, + "grad_norm": 0.34105175733566284, + "learning_rate": 0.000513439813465462, + "loss": 3.4201, + "step": 24850 + }, + { + "epoch": 7.2531461197855975, + "grad_norm": 0.34829896688461304, + "learning_rate": 0.0005132649373360535, + "loss": 3.4305, + "step": 24900 + }, + { + "epoch": 7.267711489163365, + "grad_norm": 0.32784610986709595, + "learning_rate": 0.0005130900612066452, + "loss": 3.4303, + "step": 24950 + }, + { + "epoch": 7.282276858541133, + "grad_norm": 0.34191837906837463, + "learning_rate": 0.0005129151850772369, + "loss": 3.4076, + "step": 25000 + }, + { + "epoch": 7.282276858541133, + "eval_accuracy": 0.36452899532601774, + "eval_loss": 3.5990021228790283, + "eval_runtime": 182.0119, + "eval_samples_per_second": 91.45, + "eval_steps_per_second": 5.719, + "step": 25000 + }, + { + "epoch": 7.2968422279189, + "grad_norm": 0.32772010564804077, + "learning_rate": 0.0005127403089478286, + "loss": 3.4265, + "step": 25050 + }, + { + "epoch": 7.311407597296667, + "grad_norm": 0.33544066548347473, + "learning_rate": 0.0005125654328184203, + "loss": 3.4307, + "step": 25100 + }, + { + "epoch": 7.325972966674435, + "grad_norm": 0.3163904845714569, + "learning_rate": 0.0005123905566890119, + "loss": 3.4129, + "step": 25150 + }, + { + "epoch": 7.3405383360522025, + "grad_norm": 0.3212714195251465, + "learning_rate": 0.0005122156805596036, + "loss": 3.4289, + "step": 25200 + }, + { + "epoch": 7.35510370542997, + "grad_norm": 0.3180087208747864, + "learning_rate": 0.0005120408044301953, + "loss": 3.441, + "step": 25250 + }, + { + "epoch": 7.369669074807737, + "grad_norm": 0.3278086483478546, + "learning_rate": 0.0005118659283007868, + "loss": 3.4319, + "step": 25300 + }, + { + "epoch": 7.384234444185505, + "grad_norm": 0.3221387267112732, + "learning_rate": 0.0005116910521713785, + "loss": 3.435, + "step": 25350 + }, + { + "epoch": 7.398799813563272, + "grad_norm": 0.3392038941383362, + "learning_rate": 0.0005115161760419702, + "loss": 3.4346, + "step": 25400 + }, + { + "epoch": 7.413365182941039, + "grad_norm": 0.30694445967674255, + "learning_rate": 0.0005113412999125619, + "loss": 3.4151, + "step": 25450 + }, + { + "epoch": 7.427930552318807, + "grad_norm": 0.3072811961174011, + "learning_rate": 0.0005111664237831536, + "loss": 3.4429, + "step": 25500 + }, + { + "epoch": 7.442495921696574, + "grad_norm": 0.354181170463562, + "learning_rate": 0.0005109915476537452, + "loss": 3.4374, + "step": 25550 + }, + { + "epoch": 7.457061291074342, + "grad_norm": 0.33871990442276, + "learning_rate": 0.0005108166715243369, + "loss": 3.4306, + "step": 25600 + }, + { + "epoch": 7.471626660452109, + "grad_norm": 0.33174610137939453, + "learning_rate": 0.0005106417953949286, + "loss": 3.4468, + "step": 25650 + }, + { + "epoch": 7.486192029829876, + "grad_norm": 0.3390611410140991, + "learning_rate": 0.0005104669192655203, + "loss": 3.4303, + "step": 25700 + }, + { + "epoch": 7.500757399207644, + "grad_norm": 0.3334279954433441, + "learning_rate": 0.0005102920431361118, + "loss": 3.4352, + "step": 25750 + }, + { + "epoch": 7.515322768585412, + "grad_norm": 0.31724613904953003, + "learning_rate": 0.0005101171670067035, + "loss": 3.4353, + "step": 25800 + }, + { + "epoch": 7.529888137963178, + "grad_norm": 0.3118076026439667, + "learning_rate": 0.0005099422908772952, + "loss": 3.4293, + "step": 25850 + }, + { + "epoch": 7.544453507340946, + "grad_norm": 0.3568933606147766, + "learning_rate": 0.0005097674147478868, + "loss": 3.4445, + "step": 25900 + }, + { + "epoch": 7.559018876718714, + "grad_norm": 0.3219817578792572, + "learning_rate": 0.0005095925386184786, + "loss": 3.4461, + "step": 25950 + }, + { + "epoch": 7.573584246096481, + "grad_norm": 0.3226459324359894, + "learning_rate": 0.0005094176624890702, + "loss": 3.4535, + "step": 26000 + }, + { + "epoch": 7.573584246096481, + "eval_accuracy": 0.3653061308468132, + "eval_loss": 3.5887749195098877, + "eval_runtime": 180.3689, + "eval_samples_per_second": 92.283, + "eval_steps_per_second": 5.772, + "step": 26000 + }, + { + "epoch": 7.588149615474248, + "grad_norm": 0.33434975147247314, + "learning_rate": 0.0005092427863596619, + "loss": 3.433, + "step": 26050 + }, + { + "epoch": 7.602714984852016, + "grad_norm": 0.33903586864471436, + "learning_rate": 0.0005090679102302536, + "loss": 3.4554, + "step": 26100 + }, + { + "epoch": 7.617280354229783, + "grad_norm": 0.3249543309211731, + "learning_rate": 0.0005088930341008451, + "loss": 3.4551, + "step": 26150 + }, + { + "epoch": 7.631845723607551, + "grad_norm": 0.33772045373916626, + "learning_rate": 0.0005087181579714368, + "loss": 3.4496, + "step": 26200 + }, + { + "epoch": 7.646411092985318, + "grad_norm": 0.3366949260234833, + "learning_rate": 0.0005085432818420285, + "loss": 3.453, + "step": 26250 + }, + { + "epoch": 7.660976462363085, + "grad_norm": 0.32443875074386597, + "learning_rate": 0.0005083684057126202, + "loss": 3.448, + "step": 26300 + }, + { + "epoch": 7.675541831740853, + "grad_norm": 0.322132408618927, + "learning_rate": 0.0005081935295832118, + "loss": 3.4588, + "step": 26350 + }, + { + "epoch": 7.690107201118621, + "grad_norm": 0.35065576434135437, + "learning_rate": 0.0005080186534538035, + "loss": 3.445, + "step": 26400 + }, + { + "epoch": 7.704672570496387, + "grad_norm": 0.32592296600341797, + "learning_rate": 0.0005078437773243952, + "loss": 3.4581, + "step": 26450 + }, + { + "epoch": 7.719237939874155, + "grad_norm": 0.3161769509315491, + "learning_rate": 0.0005076689011949869, + "loss": 3.4519, + "step": 26500 + }, + { + "epoch": 7.733803309251923, + "grad_norm": 0.3316858410835266, + "learning_rate": 0.0005074940250655786, + "loss": 3.4564, + "step": 26550 + }, + { + "epoch": 7.74836867862969, + "grad_norm": 0.3109246790409088, + "learning_rate": 0.0005073191489361701, + "loss": 3.4531, + "step": 26600 + }, + { + "epoch": 7.762934048007457, + "grad_norm": 0.33062729239463806, + "learning_rate": 0.0005071442728067618, + "loss": 3.4741, + "step": 26650 + }, + { + "epoch": 7.777499417385225, + "grad_norm": 0.31517359614372253, + "learning_rate": 0.0005069693966773535, + "loss": 3.4568, + "step": 26700 + }, + { + "epoch": 7.792064786762992, + "grad_norm": 0.3394618332386017, + "learning_rate": 0.0005067945205479451, + "loss": 3.4625, + "step": 26750 + }, + { + "epoch": 7.80663015614076, + "grad_norm": 0.3200554847717285, + "learning_rate": 0.0005066196444185368, + "loss": 3.4538, + "step": 26800 + }, + { + "epoch": 7.821195525518527, + "grad_norm": 0.30703428387641907, + "learning_rate": 0.0005064447682891285, + "loss": 3.4588, + "step": 26850 + }, + { + "epoch": 7.8357608948962945, + "grad_norm": 0.31843116879463196, + "learning_rate": 0.0005062698921597202, + "loss": 3.4522, + "step": 26900 + }, + { + "epoch": 7.850326264274062, + "grad_norm": 0.3061220943927765, + "learning_rate": 0.0005060950160303119, + "loss": 3.4653, + "step": 26950 + }, + { + "epoch": 7.86489163365183, + "grad_norm": 0.3118315637111664, + "learning_rate": 0.0005059201399009035, + "loss": 3.4522, + "step": 27000 + }, + { + "epoch": 7.86489163365183, + "eval_accuracy": 0.3663362762981308, + "eval_loss": 3.5790724754333496, + "eval_runtime": 180.2755, + "eval_samples_per_second": 92.331, + "eval_steps_per_second": 5.774, + "step": 27000 + }, + { + "epoch": 7.8794570030295965, + "grad_norm": 0.31885090470314026, + "learning_rate": 0.0005057452637714951, + "loss": 3.4557, + "step": 27050 + }, + { + "epoch": 7.894022372407364, + "grad_norm": 0.29833894968032837, + "learning_rate": 0.0005055703876420868, + "loss": 3.4556, + "step": 27100 + }, + { + "epoch": 7.908587741785132, + "grad_norm": 0.3107469975948334, + "learning_rate": 0.0005053955115126785, + "loss": 3.4487, + "step": 27150 + }, + { + "epoch": 7.923153111162899, + "grad_norm": 0.3197011649608612, + "learning_rate": 0.0005052206353832701, + "loss": 3.4516, + "step": 27200 + }, + { + "epoch": 7.937718480540666, + "grad_norm": 0.3332655727863312, + "learning_rate": 0.0005050457592538618, + "loss": 3.4542, + "step": 27250 + }, + { + "epoch": 7.952283849918434, + "grad_norm": 0.3343665301799774, + "learning_rate": 0.0005048708831244535, + "loss": 3.4726, + "step": 27300 + }, + { + "epoch": 7.9668492192962015, + "grad_norm": 0.33328115940093994, + "learning_rate": 0.0005046960069950451, + "loss": 3.4682, + "step": 27350 + }, + { + "epoch": 7.981414588673969, + "grad_norm": 0.30137884616851807, + "learning_rate": 0.0005045211308656369, + "loss": 3.457, + "step": 27400 + }, + { + "epoch": 7.995979958051736, + "grad_norm": 0.33149558305740356, + "learning_rate": 0.0005043462547362284, + "loss": 3.4767, + "step": 27450 + }, + { + "epoch": 8.010487065951992, + "grad_norm": 0.32484644651412964, + "learning_rate": 0.0005041713786068201, + "loss": 3.3776, + "step": 27500 + }, + { + "epoch": 8.02505243532976, + "grad_norm": 0.3401671051979065, + "learning_rate": 0.0005039965024774118, + "loss": 3.3496, + "step": 27550 + }, + { + "epoch": 8.039617804707527, + "grad_norm": 0.32481470704078674, + "learning_rate": 0.0005038216263480034, + "loss": 3.3514, + "step": 27600 + }, + { + "epoch": 8.054183174085296, + "grad_norm": 0.33603399991989136, + "learning_rate": 0.0005036467502185951, + "loss": 3.3551, + "step": 27650 + }, + { + "epoch": 8.068748543463062, + "grad_norm": 0.3104502558708191, + "learning_rate": 0.0005034718740891868, + "loss": 3.3531, + "step": 27700 + }, + { + "epoch": 8.08331391284083, + "grad_norm": 0.3129752278327942, + "learning_rate": 0.0005032969979597785, + "loss": 3.3728, + "step": 27750 + }, + { + "epoch": 8.097879282218598, + "grad_norm": 0.3313758075237274, + "learning_rate": 0.0005031221218303701, + "loss": 3.3705, + "step": 27800 + }, + { + "epoch": 8.112444651596364, + "grad_norm": 0.3319389522075653, + "learning_rate": 0.0005029472457009618, + "loss": 3.3781, + "step": 27850 + }, + { + "epoch": 8.127010020974131, + "grad_norm": 0.33893296122550964, + "learning_rate": 0.0005027723695715534, + "loss": 3.3757, + "step": 27900 + }, + { + "epoch": 8.1415753903519, + "grad_norm": 0.34440794587135315, + "learning_rate": 0.0005025974934421451, + "loss": 3.3669, + "step": 27950 + }, + { + "epoch": 8.156140759729666, + "grad_norm": 0.3383232057094574, + "learning_rate": 0.0005024226173127368, + "loss": 3.3902, + "step": 28000 + }, + { + "epoch": 8.156140759729666, + "eval_accuracy": 0.36587481534213656, + "eval_loss": 3.5893173217773438, + "eval_runtime": 180.5544, + "eval_samples_per_second": 92.188, + "eval_steps_per_second": 5.766, + "step": 28000 + }, + { + "epoch": 8.170706129107435, + "grad_norm": 0.3452877104282379, + "learning_rate": 0.0005022477411833284, + "loss": 3.3954, + "step": 28050 + }, + { + "epoch": 8.185271498485202, + "grad_norm": 0.31373968720436096, + "learning_rate": 0.0005020728650539201, + "loss": 3.3883, + "step": 28100 + }, + { + "epoch": 8.199836867862969, + "grad_norm": 0.3412425220012665, + "learning_rate": 0.0005018979889245118, + "loss": 3.3806, + "step": 28150 + }, + { + "epoch": 8.214402237240737, + "grad_norm": 0.3370135724544525, + "learning_rate": 0.0005017231127951034, + "loss": 3.3744, + "step": 28200 + }, + { + "epoch": 8.228967606618504, + "grad_norm": 0.32666200399398804, + "learning_rate": 0.0005015482366656951, + "loss": 3.3909, + "step": 28250 + }, + { + "epoch": 8.24353297599627, + "grad_norm": 0.35053926706314087, + "learning_rate": 0.0005013733605362868, + "loss": 3.3971, + "step": 28300 + }, + { + "epoch": 8.258098345374039, + "grad_norm": 0.32368576526641846, + "learning_rate": 0.0005011984844068784, + "loss": 3.3959, + "step": 28350 + }, + { + "epoch": 8.272663714751806, + "grad_norm": 0.3504972755908966, + "learning_rate": 0.0005010236082774701, + "loss": 3.4046, + "step": 28400 + }, + { + "epoch": 8.287229084129574, + "grad_norm": 0.3331746459007263, + "learning_rate": 0.0005008487321480617, + "loss": 3.3945, + "step": 28450 + }, + { + "epoch": 8.301794453507341, + "grad_norm": 0.33394086360931396, + "learning_rate": 0.0005006738560186534, + "loss": 3.3922, + "step": 28500 + }, + { + "epoch": 8.316359822885108, + "grad_norm": 0.3699552118778229, + "learning_rate": 0.0005004989798892451, + "loss": 3.3974, + "step": 28550 + }, + { + "epoch": 8.330925192262876, + "grad_norm": 0.3196350038051605, + "learning_rate": 0.0005003241037598368, + "loss": 3.3833, + "step": 28600 + }, + { + "epoch": 8.345490561640643, + "grad_norm": 0.3315056264400482, + "learning_rate": 0.0005001492276304284, + "loss": 3.4032, + "step": 28650 + }, + { + "epoch": 8.36005593101841, + "grad_norm": 0.322210431098938, + "learning_rate": 0.0004999743515010201, + "loss": 3.4025, + "step": 28700 + }, + { + "epoch": 8.374621300396178, + "grad_norm": 0.31940722465515137, + "learning_rate": 0.0004997994753716117, + "loss": 3.4013, + "step": 28750 + }, + { + "epoch": 8.389186669773945, + "grad_norm": 0.3116293251514435, + "learning_rate": 0.0004996245992422033, + "loss": 3.4114, + "step": 28800 + }, + { + "epoch": 8.403752039151712, + "grad_norm": 0.33897754549980164, + "learning_rate": 0.0004994497231127951, + "loss": 3.409, + "step": 28850 + }, + { + "epoch": 8.41831740852948, + "grad_norm": 0.3248927891254425, + "learning_rate": 0.0004992748469833867, + "loss": 3.4177, + "step": 28900 + }, + { + "epoch": 8.432882777907247, + "grad_norm": 0.34883642196655273, + "learning_rate": 0.0004990999708539784, + "loss": 3.412, + "step": 28950 + }, + { + "epoch": 8.447448147285016, + "grad_norm": 0.3144770860671997, + "learning_rate": 0.0004989250947245701, + "loss": 3.4024, + "step": 29000 + }, + { + "epoch": 8.447448147285016, + "eval_accuracy": 0.36662279358469335, + "eval_loss": 3.5800533294677734, + "eval_runtime": 180.2514, + "eval_samples_per_second": 92.343, + "eval_steps_per_second": 5.775, + "step": 29000 + }, + { + "epoch": 8.462013516662783, + "grad_norm": 0.3243623375892639, + "learning_rate": 0.0004987502185951617, + "loss": 3.4076, + "step": 29050 + }, + { + "epoch": 8.47657888604055, + "grad_norm": 0.33650779724121094, + "learning_rate": 0.0004985753424657534, + "loss": 3.4107, + "step": 29100 + }, + { + "epoch": 8.491144255418318, + "grad_norm": 0.3219531178474426, + "learning_rate": 0.000498400466336345, + "loss": 3.4279, + "step": 29150 + }, + { + "epoch": 8.505709624796085, + "grad_norm": 0.33923518657684326, + "learning_rate": 0.0004982255902069367, + "loss": 3.4111, + "step": 29200 + }, + { + "epoch": 8.520274994173853, + "grad_norm": 0.3253156244754791, + "learning_rate": 0.0004980507140775283, + "loss": 3.4206, + "step": 29250 + }, + { + "epoch": 8.53484036355162, + "grad_norm": 0.32380804419517517, + "learning_rate": 0.0004978758379481201, + "loss": 3.418, + "step": 29300 + }, + { + "epoch": 8.549405732929387, + "grad_norm": 0.3461204767227173, + "learning_rate": 0.0004977009618187117, + "loss": 3.423, + "step": 29350 + }, + { + "epoch": 8.563971102307155, + "grad_norm": 0.31913691759109497, + "learning_rate": 0.0004975260856893034, + "loss": 3.4112, + "step": 29400 + }, + { + "epoch": 8.578536471684922, + "grad_norm": 0.3127973675727844, + "learning_rate": 0.0004973512095598951, + "loss": 3.4102, + "step": 29450 + }, + { + "epoch": 8.593101841062689, + "grad_norm": 0.35194820165634155, + "learning_rate": 0.0004971763334304867, + "loss": 3.4137, + "step": 29500 + }, + { + "epoch": 8.607667210440457, + "grad_norm": 0.322832852602005, + "learning_rate": 0.0004970014573010784, + "loss": 3.4251, + "step": 29550 + }, + { + "epoch": 8.622232579818224, + "grad_norm": 0.3397376537322998, + "learning_rate": 0.00049682658117167, + "loss": 3.4134, + "step": 29600 + }, + { + "epoch": 8.63679794919599, + "grad_norm": 0.34631067514419556, + "learning_rate": 0.0004966517050422616, + "loss": 3.4247, + "step": 29650 + }, + { + "epoch": 8.65136331857376, + "grad_norm": 0.30945879220962524, + "learning_rate": 0.0004964768289128533, + "loss": 3.4248, + "step": 29700 + }, + { + "epoch": 8.665928687951526, + "grad_norm": 0.3211749792098999, + "learning_rate": 0.000496301952783445, + "loss": 3.4199, + "step": 29750 + }, + { + "epoch": 8.680494057329295, + "grad_norm": 0.3627347946166992, + "learning_rate": 0.0004961270766540367, + "loss": 3.4054, + "step": 29800 + }, + { + "epoch": 8.695059426707061, + "grad_norm": 0.32827696204185486, + "learning_rate": 0.0004959522005246284, + "loss": 3.4218, + "step": 29850 + }, + { + "epoch": 8.709624796084828, + "grad_norm": 0.3287445902824402, + "learning_rate": 0.00049577732439522, + "loss": 3.4173, + "step": 29900 + }, + { + "epoch": 8.724190165462597, + "grad_norm": 0.3574911952018738, + "learning_rate": 0.0004956024482658117, + "loss": 3.4242, + "step": 29950 + }, + { + "epoch": 8.738755534840363, + "grad_norm": 0.33523955941200256, + "learning_rate": 0.0004954275721364034, + "loss": 3.4241, + "step": 30000 + }, + { + "epoch": 8.738755534840363, + "eval_accuracy": 0.36724073977187954, + "eval_loss": 3.5715675354003906, + "eval_runtime": 184.6641, + "eval_samples_per_second": 90.137, + "eval_steps_per_second": 5.637, + "step": 30000 + }, + { + "epoch": 8.753320904218132, + "grad_norm": 0.3205890655517578, + "learning_rate": 0.000495252696006995, + "loss": 3.4306, + "step": 30050 + }, + { + "epoch": 8.767886273595899, + "grad_norm": 0.37505707144737244, + "learning_rate": 0.0004950778198775866, + "loss": 3.431, + "step": 30100 + }, + { + "epoch": 8.782451642973665, + "grad_norm": 0.34911301732063293, + "learning_rate": 0.0004949029437481783, + "loss": 3.4269, + "step": 30150 + }, + { + "epoch": 8.797017012351434, + "grad_norm": 0.3340999186038971, + "learning_rate": 0.00049472806761877, + "loss": 3.4223, + "step": 30200 + }, + { + "epoch": 8.8115823817292, + "grad_norm": 0.34287843108177185, + "learning_rate": 0.0004945531914893616, + "loss": 3.431, + "step": 30250 + }, + { + "epoch": 8.826147751106967, + "grad_norm": 0.3214537799358368, + "learning_rate": 0.0004943783153599534, + "loss": 3.428, + "step": 30300 + }, + { + "epoch": 8.840713120484736, + "grad_norm": 0.33591654896736145, + "learning_rate": 0.000494203439230545, + "loss": 3.4299, + "step": 30350 + }, + { + "epoch": 8.855278489862503, + "grad_norm": 0.3329029679298401, + "learning_rate": 0.0004940285631011367, + "loss": 3.4187, + "step": 30400 + }, + { + "epoch": 8.86984385924027, + "grad_norm": 0.3495054543018341, + "learning_rate": 0.0004938536869717284, + "loss": 3.4225, + "step": 30450 + }, + { + "epoch": 8.884409228618038, + "grad_norm": 0.32809290289878845, + "learning_rate": 0.0004936788108423199, + "loss": 3.4434, + "step": 30500 + }, + { + "epoch": 8.898974597995805, + "grad_norm": 0.3029363751411438, + "learning_rate": 0.0004935039347129116, + "loss": 3.4226, + "step": 30550 + }, + { + "epoch": 8.913539967373573, + "grad_norm": 0.3327055871486664, + "learning_rate": 0.0004933290585835033, + "loss": 3.4335, + "step": 30600 + }, + { + "epoch": 8.92810533675134, + "grad_norm": 0.3230193555355072, + "learning_rate": 0.000493154182454095, + "loss": 3.4221, + "step": 30650 + }, + { + "epoch": 8.942670706129107, + "grad_norm": 0.3522266745567322, + "learning_rate": 0.0004929793063246866, + "loss": 3.4195, + "step": 30700 + }, + { + "epoch": 8.957236075506875, + "grad_norm": 0.32179632782936096, + "learning_rate": 0.0004928044301952783, + "loss": 3.437, + "step": 30750 + }, + { + "epoch": 8.971801444884642, + "grad_norm": 0.32391178607940674, + "learning_rate": 0.00049262955406587, + "loss": 3.4254, + "step": 30800 + }, + { + "epoch": 8.986366814262409, + "grad_norm": 0.31046491861343384, + "learning_rate": 0.0004924546779364617, + "loss": 3.4317, + "step": 30850 + }, + { + "epoch": 9.000873922162667, + "grad_norm": 0.341305673122406, + "learning_rate": 0.0004922798018070533, + "loss": 3.4248, + "step": 30900 + }, + { + "epoch": 9.015439291540433, + "grad_norm": 0.3179487884044647, + "learning_rate": 0.0004921049256776449, + "loss": 3.317, + "step": 30950 + }, + { + "epoch": 9.0300046609182, + "grad_norm": 0.3541235029697418, + "learning_rate": 0.0004919300495482366, + "loss": 3.3306, + "step": 31000 + }, + { + "epoch": 9.0300046609182, + "eval_accuracy": 0.3668760386545562, + "eval_loss": 3.5815300941467285, + "eval_runtime": 180.3287, + "eval_samples_per_second": 92.304, + "eval_steps_per_second": 5.773, + "step": 31000 + }, + { + "epoch": 9.044570030295969, + "grad_norm": 0.3544383943080902, + "learning_rate": 0.0004917551734188283, + "loss": 3.3247, + "step": 31050 + }, + { + "epoch": 9.059135399673735, + "grad_norm": 0.32461535930633545, + "learning_rate": 0.0004915802972894199, + "loss": 3.3379, + "step": 31100 + }, + { + "epoch": 9.073700769051504, + "grad_norm": 0.3214743137359619, + "learning_rate": 0.0004914054211600116, + "loss": 3.3344, + "step": 31150 + }, + { + "epoch": 9.08826613842927, + "grad_norm": 0.3329453468322754, + "learning_rate": 0.0004912305450306033, + "loss": 3.3366, + "step": 31200 + }, + { + "epoch": 9.102831507807037, + "grad_norm": 0.33429789543151855, + "learning_rate": 0.000491055668901195, + "loss": 3.3535, + "step": 31250 + }, + { + "epoch": 9.117396877184806, + "grad_norm": 0.3591921031475067, + "learning_rate": 0.0004908807927717865, + "loss": 3.3407, + "step": 31300 + }, + { + "epoch": 9.131962246562573, + "grad_norm": 0.3514149785041809, + "learning_rate": 0.0004907059166423783, + "loss": 3.3391, + "step": 31350 + }, + { + "epoch": 9.14652761594034, + "grad_norm": 0.3230554163455963, + "learning_rate": 0.0004905310405129699, + "loss": 3.36, + "step": 31400 + }, + { + "epoch": 9.161092985318108, + "grad_norm": 0.3351898193359375, + "learning_rate": 0.0004903561643835616, + "loss": 3.3516, + "step": 31450 + }, + { + "epoch": 9.175658354695875, + "grad_norm": 0.32898926734924316, + "learning_rate": 0.0004901812882541533, + "loss": 3.3637, + "step": 31500 + }, + { + "epoch": 9.190223724073643, + "grad_norm": 0.37594887614250183, + "learning_rate": 0.0004900064121247449, + "loss": 3.3559, + "step": 31550 + }, + { + "epoch": 9.20478909345141, + "grad_norm": 0.3707999289035797, + "learning_rate": 0.0004898315359953366, + "loss": 3.3511, + "step": 31600 + }, + { + "epoch": 9.219354462829177, + "grad_norm": 0.34326040744781494, + "learning_rate": 0.0004896566598659283, + "loss": 3.3577, + "step": 31650 + }, + { + "epoch": 9.233919832206945, + "grad_norm": 0.3109116554260254, + "learning_rate": 0.0004894817837365199, + "loss": 3.3553, + "step": 31700 + }, + { + "epoch": 9.248485201584712, + "grad_norm": 0.343200147151947, + "learning_rate": 0.0004893069076071115, + "loss": 3.3755, + "step": 31750 + }, + { + "epoch": 9.263050570962479, + "grad_norm": 0.34142202138900757, + "learning_rate": 0.0004891320314777032, + "loss": 3.3705, + "step": 31800 + }, + { + "epoch": 9.277615940340247, + "grad_norm": 0.35971346497535706, + "learning_rate": 0.0004889571553482949, + "loss": 3.3715, + "step": 31850 + }, + { + "epoch": 9.292181309718014, + "grad_norm": 0.3268764913082123, + "learning_rate": 0.0004887822792188866, + "loss": 3.3794, + "step": 31900 + }, + { + "epoch": 9.306746679095783, + "grad_norm": 0.3492770493030548, + "learning_rate": 0.0004886074030894782, + "loss": 3.3712, + "step": 31950 + }, + { + "epoch": 9.32131204847355, + "grad_norm": 0.34961649775505066, + "learning_rate": 0.0004884325269600699, + "loss": 3.3808, + "step": 32000 + }, + { + "epoch": 9.32131204847355, + "eval_accuracy": 0.36758063368876603, + "eval_loss": 3.577639102935791, + "eval_runtime": 180.0042, + "eval_samples_per_second": 92.47, + "eval_steps_per_second": 5.783, + "step": 32000 + }, + { + "epoch": 9.335877417851316, + "grad_norm": 0.354534387588501, + "learning_rate": 0.0004882576508306615, + "loss": 3.3778, + "step": 32050 + }, + { + "epoch": 9.350442787229085, + "grad_norm": 0.3517109453678131, + "learning_rate": 0.00048808277470125327, + "loss": 3.3701, + "step": 32100 + }, + { + "epoch": 9.365008156606851, + "grad_norm": 0.333987295627594, + "learning_rate": 0.0004879078985718449, + "loss": 3.3867, + "step": 32150 + }, + { + "epoch": 9.379573525984618, + "grad_norm": 0.34107351303100586, + "learning_rate": 0.0004877330224424366, + "loss": 3.374, + "step": 32200 + }, + { + "epoch": 9.394138895362387, + "grad_norm": 0.3421863615512848, + "learning_rate": 0.00048755814631302823, + "loss": 3.377, + "step": 32250 + }, + { + "epoch": 9.408704264740154, + "grad_norm": 0.3399357795715332, + "learning_rate": 0.00048738327018361987, + "loss": 3.3739, + "step": 32300 + }, + { + "epoch": 9.423269634117922, + "grad_norm": 0.3419632613658905, + "learning_rate": 0.00048720839405421156, + "loss": 3.381, + "step": 32350 + }, + { + "epoch": 9.437835003495689, + "grad_norm": 0.3296915292739868, + "learning_rate": 0.0004870335179248032, + "loss": 3.3788, + "step": 32400 + }, + { + "epoch": 9.452400372873456, + "grad_norm": 0.3411202132701874, + "learning_rate": 0.0004868586417953949, + "loss": 3.3895, + "step": 32450 + }, + { + "epoch": 9.466965742251224, + "grad_norm": 0.35072267055511475, + "learning_rate": 0.0004866837656659865, + "loss": 3.3781, + "step": 32500 + }, + { + "epoch": 9.48153111162899, + "grad_norm": 0.3408583700656891, + "learning_rate": 0.00048650888953657816, + "loss": 3.3765, + "step": 32550 + }, + { + "epoch": 9.496096481006758, + "grad_norm": 0.3342673182487488, + "learning_rate": 0.0004863340134071699, + "loss": 3.384, + "step": 32600 + }, + { + "epoch": 9.510661850384526, + "grad_norm": 0.3424014151096344, + "learning_rate": 0.00048615913727776154, + "loss": 3.3787, + "step": 32650 + }, + { + "epoch": 9.525227219762293, + "grad_norm": 0.3425311744213104, + "learning_rate": 0.00048598426114835323, + "loss": 3.3784, + "step": 32700 + }, + { + "epoch": 9.53979258914006, + "grad_norm": 0.35819771885871887, + "learning_rate": 0.00048580938501894486, + "loss": 3.3871, + "step": 32750 + }, + { + "epoch": 9.554357958517828, + "grad_norm": 0.3209281265735626, + "learning_rate": 0.00048563450888953655, + "loss": 3.3878, + "step": 32800 + }, + { + "epoch": 9.568923327895595, + "grad_norm": 0.35813918709754944, + "learning_rate": 0.0004854596327601282, + "loss": 3.3865, + "step": 32850 + }, + { + "epoch": 9.583488697273363, + "grad_norm": 0.3399278521537781, + "learning_rate": 0.0004852847566307198, + "loss": 3.3913, + "step": 32900 + }, + { + "epoch": 9.59805406665113, + "grad_norm": 0.3297010362148285, + "learning_rate": 0.0004851098805013115, + "loss": 3.3961, + "step": 32950 + }, + { + "epoch": 9.612619436028897, + "grad_norm": 0.3605840504169464, + "learning_rate": 0.00048493500437190315, + "loss": 3.3982, + "step": 33000 + }, + { + "epoch": 9.612619436028897, + "eval_accuracy": 0.3680854778531073, + "eval_loss": 3.5693137645721436, + "eval_runtime": 181.9882, + "eval_samples_per_second": 91.462, + "eval_steps_per_second": 5.72, + "step": 33000 + }, + { + "epoch": 9.627184805406666, + "grad_norm": 0.3284258246421814, + "learning_rate": 0.0004847601282424949, + "loss": 3.3981, + "step": 33050 + }, + { + "epoch": 9.641750174784432, + "grad_norm": 0.3391687273979187, + "learning_rate": 0.00048458525211308653, + "loss": 3.4001, + "step": 33100 + }, + { + "epoch": 9.6563155441622, + "grad_norm": 0.33698102831840515, + "learning_rate": 0.00048441037598367817, + "loss": 3.3997, + "step": 33150 + }, + { + "epoch": 9.670880913539968, + "grad_norm": 0.34788277745246887, + "learning_rate": 0.00048423549985426986, + "loss": 3.4001, + "step": 33200 + }, + { + "epoch": 9.685446282917734, + "grad_norm": 0.3518361747264862, + "learning_rate": 0.0004840606237248615, + "loss": 3.3944, + "step": 33250 + }, + { + "epoch": 9.700011652295503, + "grad_norm": 0.3476645052433014, + "learning_rate": 0.0004838857475954532, + "loss": 3.4001, + "step": 33300 + }, + { + "epoch": 9.71457702167327, + "grad_norm": 0.34337055683135986, + "learning_rate": 0.0004837108714660448, + "loss": 3.3963, + "step": 33350 + }, + { + "epoch": 9.729142391051036, + "grad_norm": 0.37030595541000366, + "learning_rate": 0.0004835359953366365, + "loss": 3.4053, + "step": 33400 + }, + { + "epoch": 9.743707760428805, + "grad_norm": 0.343307763338089, + "learning_rate": 0.00048336111920722815, + "loss": 3.4102, + "step": 33450 + }, + { + "epoch": 9.758273129806572, + "grad_norm": 0.3450109660625458, + "learning_rate": 0.0004831862430778198, + "loss": 3.3953, + "step": 33500 + }, + { + "epoch": 9.772838499184338, + "grad_norm": 0.3278052508831024, + "learning_rate": 0.00048301136694841153, + "loss": 3.3951, + "step": 33550 + }, + { + "epoch": 9.787403868562107, + "grad_norm": 0.33179140090942383, + "learning_rate": 0.00048283649081900317, + "loss": 3.4122, + "step": 33600 + }, + { + "epoch": 9.801969237939874, + "grad_norm": 0.32797035574913025, + "learning_rate": 0.00048266161468959486, + "loss": 3.4051, + "step": 33650 + }, + { + "epoch": 9.816534607317642, + "grad_norm": 0.3269261419773102, + "learning_rate": 0.0004824867385601865, + "loss": 3.4172, + "step": 33700 + }, + { + "epoch": 9.831099976695409, + "grad_norm": 0.3510269522666931, + "learning_rate": 0.00048231186243077813, + "loss": 3.4084, + "step": 33750 + }, + { + "epoch": 9.845665346073176, + "grad_norm": 0.31174078583717346, + "learning_rate": 0.0004821369863013698, + "loss": 3.3988, + "step": 33800 + }, + { + "epoch": 9.860230715450944, + "grad_norm": 0.3384522497653961, + "learning_rate": 0.00048196211017196146, + "loss": 3.4053, + "step": 33850 + }, + { + "epoch": 9.874796084828711, + "grad_norm": 0.3284479081630707, + "learning_rate": 0.00048178723404255315, + "loss": 3.4068, + "step": 33900 + }, + { + "epoch": 9.88936145420648, + "grad_norm": 0.3625960052013397, + "learning_rate": 0.0004816123579131448, + "loss": 3.4109, + "step": 33950 + }, + { + "epoch": 9.903926823584246, + "grad_norm": 0.31597065925598145, + "learning_rate": 0.0004814374817837364, + "loss": 3.4131, + "step": 34000 + }, + { + "epoch": 9.903926823584246, + "eval_accuracy": 0.3685431765796514, + "eval_loss": 3.559751510620117, + "eval_runtime": 180.1509, + "eval_samples_per_second": 92.395, + "eval_steps_per_second": 5.778, + "step": 34000 + }, + { + "epoch": 9.918492192962013, + "grad_norm": 0.3383018672466278, + "learning_rate": 0.00048126260565432816, + "loss": 3.3986, + "step": 34050 + }, + { + "epoch": 9.933057562339782, + "grad_norm": 0.3599507510662079, + "learning_rate": 0.0004810877295249198, + "loss": 3.4032, + "step": 34100 + }, + { + "epoch": 9.947622931717548, + "grad_norm": 0.31639835238456726, + "learning_rate": 0.0004809128533955115, + "loss": 3.4118, + "step": 34150 + }, + { + "epoch": 9.962188301095315, + "grad_norm": 0.3275424540042877, + "learning_rate": 0.0004807379772661031, + "loss": 3.407, + "step": 34200 + }, + { + "epoch": 9.976753670473084, + "grad_norm": 0.33506444096565247, + "learning_rate": 0.0004805631011366948, + "loss": 3.4083, + "step": 34250 + }, + { + "epoch": 9.99131903985085, + "grad_norm": 0.3339884877204895, + "learning_rate": 0.00048038822500728645, + "loss": 3.4199, + "step": 34300 + }, + { + "epoch": 10.005826147751106, + "grad_norm": 0.35206592082977295, + "learning_rate": 0.0004802133488778781, + "loss": 3.3579, + "step": 34350 + }, + { + "epoch": 10.020391517128875, + "grad_norm": 0.3488442301750183, + "learning_rate": 0.0004800384727484698, + "loss": 3.2863, + "step": 34400 + }, + { + "epoch": 10.034956886506642, + "grad_norm": 0.35724684596061707, + "learning_rate": 0.0004798635966190614, + "loss": 3.3027, + "step": 34450 + }, + { + "epoch": 10.049522255884408, + "grad_norm": 0.3650865852832794, + "learning_rate": 0.00047968872048965316, + "loss": 3.315, + "step": 34500 + }, + { + "epoch": 10.064087625262177, + "grad_norm": 0.34090808033943176, + "learning_rate": 0.0004795138443602448, + "loss": 3.3156, + "step": 34550 + }, + { + "epoch": 10.078652994639944, + "grad_norm": 0.3337085247039795, + "learning_rate": 0.00047933896823083643, + "loss": 3.3165, + "step": 34600 + }, + { + "epoch": 10.093218364017712, + "grad_norm": 0.3569350838661194, + "learning_rate": 0.0004791640921014281, + "loss": 3.3215, + "step": 34650 + }, + { + "epoch": 10.107783733395479, + "grad_norm": 0.35704872012138367, + "learning_rate": 0.00047898921597201976, + "loss": 3.3143, + "step": 34700 + }, + { + "epoch": 10.122349102773246, + "grad_norm": 0.3637121617794037, + "learning_rate": 0.00047881433984261145, + "loss": 3.3123, + "step": 34750 + }, + { + "epoch": 10.136914472151014, + "grad_norm": 0.37336090207099915, + "learning_rate": 0.0004786394637132031, + "loss": 3.3213, + "step": 34800 + }, + { + "epoch": 10.151479841528781, + "grad_norm": 0.35010263323783875, + "learning_rate": 0.0004784645875837948, + "loss": 3.3336, + "step": 34850 + }, + { + "epoch": 10.166045210906548, + "grad_norm": 0.33684661984443665, + "learning_rate": 0.0004782897114543864, + "loss": 3.3302, + "step": 34900 + }, + { + "epoch": 10.180610580284316, + "grad_norm": 0.34124529361724854, + "learning_rate": 0.00047811483532497805, + "loss": 3.3294, + "step": 34950 + }, + { + "epoch": 10.195175949662083, + "grad_norm": 0.35172775387763977, + "learning_rate": 0.0004779399591955698, + "loss": 3.3494, + "step": 35000 + }, + { + "epoch": 10.195175949662083, + "eval_accuracy": 0.36845782099900126, + "eval_loss": 3.5712480545043945, + "eval_runtime": 180.0647, + "eval_samples_per_second": 92.439, + "eval_steps_per_second": 5.781, + "step": 35000 + }, + { + "epoch": 10.209741319039852, + "grad_norm": 0.33524951338768005, + "learning_rate": 0.00047776508306616143, + "loss": 3.3295, + "step": 35050 + }, + { + "epoch": 10.224306688417618, + "grad_norm": 0.32747432589530945, + "learning_rate": 0.0004775902069367531, + "loss": 3.3427, + "step": 35100 + }, + { + "epoch": 10.238872057795385, + "grad_norm": 0.37668707966804504, + "learning_rate": 0.00047741533080734476, + "loss": 3.3447, + "step": 35150 + }, + { + "epoch": 10.253437427173154, + "grad_norm": 0.35878944396972656, + "learning_rate": 0.0004772404546779364, + "loss": 3.3322, + "step": 35200 + }, + { + "epoch": 10.26800279655092, + "grad_norm": 0.343294233083725, + "learning_rate": 0.0004770655785485281, + "loss": 3.3489, + "step": 35250 + }, + { + "epoch": 10.282568165928687, + "grad_norm": 0.35359030961990356, + "learning_rate": 0.0004768907024191197, + "loss": 3.3337, + "step": 35300 + }, + { + "epoch": 10.297133535306456, + "grad_norm": 0.3531194031238556, + "learning_rate": 0.0004767158262897114, + "loss": 3.3502, + "step": 35350 + }, + { + "epoch": 10.311698904684222, + "grad_norm": 0.34489333629608154, + "learning_rate": 0.00047654095016030305, + "loss": 3.3373, + "step": 35400 + }, + { + "epoch": 10.326264274061991, + "grad_norm": 0.35143405199050903, + "learning_rate": 0.0004763660740308948, + "loss": 3.3543, + "step": 35450 + }, + { + "epoch": 10.340829643439758, + "grad_norm": 0.345109760761261, + "learning_rate": 0.0004761911979014864, + "loss": 3.3522, + "step": 35500 + }, + { + "epoch": 10.355395012817525, + "grad_norm": 0.3299683928489685, + "learning_rate": 0.00047601632177207806, + "loss": 3.3578, + "step": 35550 + }, + { + "epoch": 10.369960382195293, + "grad_norm": 0.3354707360267639, + "learning_rate": 0.00047584144564266975, + "loss": 3.3613, + "step": 35600 + }, + { + "epoch": 10.38452575157306, + "grad_norm": 0.36335381865501404, + "learning_rate": 0.0004756665695132614, + "loss": 3.3644, + "step": 35650 + }, + { + "epoch": 10.399091120950827, + "grad_norm": 0.3393089771270752, + "learning_rate": 0.0004754916933838531, + "loss": 3.3578, + "step": 35700 + }, + { + "epoch": 10.413656490328595, + "grad_norm": 0.3224678337574005, + "learning_rate": 0.0004753168172544447, + "loss": 3.3585, + "step": 35750 + }, + { + "epoch": 10.428221859706362, + "grad_norm": 0.34779009222984314, + "learning_rate": 0.00047514194112503635, + "loss": 3.3551, + "step": 35800 + }, + { + "epoch": 10.44278722908413, + "grad_norm": 0.34187746047973633, + "learning_rate": 0.00047496706499562804, + "loss": 3.3558, + "step": 35850 + }, + { + "epoch": 10.457352598461897, + "grad_norm": 0.34698963165283203, + "learning_rate": 0.0004747921888662197, + "loss": 3.3626, + "step": 35900 + }, + { + "epoch": 10.471917967839664, + "grad_norm": 0.333051860332489, + "learning_rate": 0.0004746173127368114, + "loss": 3.3626, + "step": 35950 + }, + { + "epoch": 10.486483337217432, + "grad_norm": 0.3579745888710022, + "learning_rate": 0.00047444243660740306, + "loss": 3.3611, + "step": 36000 + }, + { + "epoch": 10.486483337217432, + "eval_accuracy": 0.3689163427132376, + "eval_loss": 3.564375638961792, + "eval_runtime": 180.2426, + "eval_samples_per_second": 92.348, + "eval_steps_per_second": 5.776, + "step": 36000 + }, + { + "epoch": 10.5010487065952, + "grad_norm": 0.3685716986656189, + "learning_rate": 0.0004742675604779947, + "loss": 3.3582, + "step": 36050 + }, + { + "epoch": 10.515614075972966, + "grad_norm": 0.3455582559108734, + "learning_rate": 0.0004740926843485864, + "loss": 3.3756, + "step": 36100 + }, + { + "epoch": 10.530179445350734, + "grad_norm": 0.34760144352912903, + "learning_rate": 0.000473917808219178, + "loss": 3.3706, + "step": 36150 + }, + { + "epoch": 10.544744814728501, + "grad_norm": 0.3610580861568451, + "learning_rate": 0.0004737429320897697, + "loss": 3.3661, + "step": 36200 + }, + { + "epoch": 10.55931018410627, + "grad_norm": 0.37062379717826843, + "learning_rate": 0.00047356805596036135, + "loss": 3.3746, + "step": 36250 + }, + { + "epoch": 10.573875553484037, + "grad_norm": 0.33157944679260254, + "learning_rate": 0.00047339317983095304, + "loss": 3.3633, + "step": 36300 + }, + { + "epoch": 10.588440922861803, + "grad_norm": 0.33509373664855957, + "learning_rate": 0.0004732183037015447, + "loss": 3.3758, + "step": 36350 + }, + { + "epoch": 10.603006292239572, + "grad_norm": 0.33611050248146057, + "learning_rate": 0.0004730434275721363, + "loss": 3.3778, + "step": 36400 + }, + { + "epoch": 10.617571661617339, + "grad_norm": 0.34995800256729126, + "learning_rate": 0.00047286855144272806, + "loss": 3.3576, + "step": 36450 + }, + { + "epoch": 10.632137030995105, + "grad_norm": 0.3561765253543854, + "learning_rate": 0.0004726936753133197, + "loss": 3.376, + "step": 36500 + }, + { + "epoch": 10.646702400372874, + "grad_norm": 0.34412702918052673, + "learning_rate": 0.0004725187991839114, + "loss": 3.3736, + "step": 36550 + }, + { + "epoch": 10.66126776975064, + "grad_norm": 0.34151721000671387, + "learning_rate": 0.000472343923054503, + "loss": 3.3746, + "step": 36600 + }, + { + "epoch": 10.675833139128407, + "grad_norm": 0.3609941601753235, + "learning_rate": 0.00047216904692509465, + "loss": 3.3746, + "step": 36650 + }, + { + "epoch": 10.690398508506176, + "grad_norm": 0.31918060779571533, + "learning_rate": 0.00047199417079568634, + "loss": 3.3827, + "step": 36700 + }, + { + "epoch": 10.704963877883943, + "grad_norm": 0.35245707631111145, + "learning_rate": 0.000471819294666278, + "loss": 3.3743, + "step": 36750 + }, + { + "epoch": 10.719529247261711, + "grad_norm": 0.333524614572525, + "learning_rate": 0.00047164441853686967, + "loss": 3.3766, + "step": 36800 + }, + { + "epoch": 10.734094616639478, + "grad_norm": 0.3567027747631073, + "learning_rate": 0.0004714695424074613, + "loss": 3.3738, + "step": 36850 + }, + { + "epoch": 10.748659986017245, + "grad_norm": 0.35120370984077454, + "learning_rate": 0.00047129466627805305, + "loss": 3.3758, + "step": 36900 + }, + { + "epoch": 10.763225355395013, + "grad_norm": 0.35132184624671936, + "learning_rate": 0.0004711197901486447, + "loss": 3.3646, + "step": 36950 + }, + { + "epoch": 10.77779072477278, + "grad_norm": 0.3338523507118225, + "learning_rate": 0.0004709449140192363, + "loss": 3.3847, + "step": 37000 + }, + { + "epoch": 10.77779072477278, + "eval_accuracy": 0.3693834470134071, + "eval_loss": 3.5580570697784424, + "eval_runtime": 180.1912, + "eval_samples_per_second": 92.374, + "eval_steps_per_second": 5.777, + "step": 37000 + }, + { + "epoch": 10.792356094150549, + "grad_norm": 0.339985728263855, + "learning_rate": 0.000470770037889828, + "loss": 3.3951, + "step": 37050 + }, + { + "epoch": 10.806921463528315, + "grad_norm": 0.34119653701782227, + "learning_rate": 0.00047059516176041965, + "loss": 3.375, + "step": 37100 + }, + { + "epoch": 10.821486832906082, + "grad_norm": 0.3279690742492676, + "learning_rate": 0.00047042028563101134, + "loss": 3.3907, + "step": 37150 + }, + { + "epoch": 10.83605220228385, + "grad_norm": 0.34971579909324646, + "learning_rate": 0.000470245409501603, + "loss": 3.3908, + "step": 37200 + }, + { + "epoch": 10.850617571661617, + "grad_norm": 0.3415312170982361, + "learning_rate": 0.0004700705333721946, + "loss": 3.3816, + "step": 37250 + }, + { + "epoch": 10.865182941039384, + "grad_norm": 0.3437788188457489, + "learning_rate": 0.0004698956572427863, + "loss": 3.3843, + "step": 37300 + }, + { + "epoch": 10.879748310417153, + "grad_norm": 0.3201417028903961, + "learning_rate": 0.00046972078111337794, + "loss": 3.3878, + "step": 37350 + }, + { + "epoch": 10.89431367979492, + "grad_norm": 0.3246171176433563, + "learning_rate": 0.0004695459049839697, + "loss": 3.3897, + "step": 37400 + }, + { + "epoch": 10.908879049172686, + "grad_norm": 0.3374573886394501, + "learning_rate": 0.0004693710288545613, + "loss": 3.3913, + "step": 37450 + }, + { + "epoch": 10.923444418550455, + "grad_norm": 0.3322266638278961, + "learning_rate": 0.000469196152725153, + "loss": 3.3877, + "step": 37500 + }, + { + "epoch": 10.938009787928221, + "grad_norm": 0.32962697744369507, + "learning_rate": 0.00046902127659574465, + "loss": 3.3924, + "step": 37550 + }, + { + "epoch": 10.95257515730599, + "grad_norm": 0.34244483709335327, + "learning_rate": 0.0004688464004663363, + "loss": 3.394, + "step": 37600 + }, + { + "epoch": 10.967140526683757, + "grad_norm": 0.3375890552997589, + "learning_rate": 0.000468671524336928, + "loss": 3.3964, + "step": 37650 + }, + { + "epoch": 10.981705896061523, + "grad_norm": 0.33612722158432007, + "learning_rate": 0.0004684966482075196, + "loss": 3.3835, + "step": 37700 + }, + { + "epoch": 10.996271265439292, + "grad_norm": 0.30385395884513855, + "learning_rate": 0.0004683217720781113, + "loss": 3.384, + "step": 37750 + }, + { + "epoch": 11.010778373339548, + "grad_norm": 0.3476569652557373, + "learning_rate": 0.00046814689594870294, + "loss": 3.3007, + "step": 37800 + }, + { + "epoch": 11.025343742717315, + "grad_norm": 0.3526822626590729, + "learning_rate": 0.0004679720198192946, + "loss": 3.2748, + "step": 37850 + }, + { + "epoch": 11.039909112095083, + "grad_norm": 0.33929720520973206, + "learning_rate": 0.0004677971436898863, + "loss": 3.2772, + "step": 37900 + }, + { + "epoch": 11.05447448147285, + "grad_norm": 0.34976741671562195, + "learning_rate": 0.00046762226756047795, + "loss": 3.2923, + "step": 37950 + }, + { + "epoch": 11.069039850850617, + "grad_norm": 0.362759530544281, + "learning_rate": 0.00046744739143106964, + "loss": 3.2988, + "step": 38000 + }, + { + "epoch": 11.069039850850617, + "eval_accuracy": 0.36924154042133445, + "eval_loss": 3.5658700466156006, + "eval_runtime": 180.2854, + "eval_samples_per_second": 92.326, + "eval_steps_per_second": 5.774, + "step": 38000 + }, + { + "epoch": 11.083605220228385, + "grad_norm": 0.33938461542129517, + "learning_rate": 0.0004672725153016613, + "loss": 3.2849, + "step": 38050 + }, + { + "epoch": 11.098170589606152, + "grad_norm": 0.32288795709609985, + "learning_rate": 0.00046709763917225297, + "loss": 3.3015, + "step": 38100 + }, + { + "epoch": 11.11273595898392, + "grad_norm": 0.3679756820201874, + "learning_rate": 0.0004669227630428446, + "loss": 3.3041, + "step": 38150 + }, + { + "epoch": 11.127301328361687, + "grad_norm": 0.3594905734062195, + "learning_rate": 0.00046674788691343624, + "loss": 3.2987, + "step": 38200 + }, + { + "epoch": 11.141866697739454, + "grad_norm": 0.3440781235694885, + "learning_rate": 0.00046657301078402793, + "loss": 3.3041, + "step": 38250 + }, + { + "epoch": 11.156432067117223, + "grad_norm": 0.35526901483535767, + "learning_rate": 0.00046639813465461957, + "loss": 3.3127, + "step": 38300 + }, + { + "epoch": 11.17099743649499, + "grad_norm": 0.3565421998500824, + "learning_rate": 0.0004662232585252113, + "loss": 3.3124, + "step": 38350 + }, + { + "epoch": 11.185562805872756, + "grad_norm": 0.3327281177043915, + "learning_rate": 0.00046604838239580295, + "loss": 3.3171, + "step": 38400 + }, + { + "epoch": 11.200128175250525, + "grad_norm": 0.35897812247276306, + "learning_rate": 0.0004658735062663946, + "loss": 3.3073, + "step": 38450 + }, + { + "epoch": 11.214693544628291, + "grad_norm": 0.36023351550102234, + "learning_rate": 0.0004656986301369863, + "loss": 3.3196, + "step": 38500 + }, + { + "epoch": 11.22925891400606, + "grad_norm": 0.3633512556552887, + "learning_rate": 0.0004655237540075779, + "loss": 3.333, + "step": 38550 + }, + { + "epoch": 11.243824283383827, + "grad_norm": 0.36204254627227783, + "learning_rate": 0.0004653488778781696, + "loss": 3.3122, + "step": 38600 + }, + { + "epoch": 11.258389652761593, + "grad_norm": 0.3590092360973358, + "learning_rate": 0.00046517400174876124, + "loss": 3.3238, + "step": 38650 + }, + { + "epoch": 11.272955022139362, + "grad_norm": 0.33919012546539307, + "learning_rate": 0.0004649991256193529, + "loss": 3.3255, + "step": 38700 + }, + { + "epoch": 11.287520391517129, + "grad_norm": 0.3492119312286377, + "learning_rate": 0.00046482424948994457, + "loss": 3.3148, + "step": 38750 + }, + { + "epoch": 11.302085760894895, + "grad_norm": 0.3562859296798706, + "learning_rate": 0.0004646493733605362, + "loss": 3.3275, + "step": 38800 + }, + { + "epoch": 11.316651130272664, + "grad_norm": 0.3738594055175781, + "learning_rate": 0.00046447449723112795, + "loss": 3.3265, + "step": 38850 + }, + { + "epoch": 11.33121649965043, + "grad_norm": 0.34161487221717834, + "learning_rate": 0.0004642996211017196, + "loss": 3.3333, + "step": 38900 + }, + { + "epoch": 11.3457818690282, + "grad_norm": 0.3317897617816925, + "learning_rate": 0.0004641247449723113, + "loss": 3.3255, + "step": 38950 + }, + { + "epoch": 11.360347238405966, + "grad_norm": 0.3387396037578583, + "learning_rate": 0.0004639498688429029, + "loss": 3.3317, + "step": 39000 + }, + { + "epoch": 11.360347238405966, + "eval_accuracy": 0.3696292851940399, + "eval_loss": 3.560234785079956, + "eval_runtime": 180.1583, + "eval_samples_per_second": 92.391, + "eval_steps_per_second": 5.778, + "step": 39000 + }, + { + "epoch": 11.374912607783733, + "grad_norm": 0.3764457106590271, + "learning_rate": 0.00046377499271349455, + "loss": 3.33, + "step": 39050 + }, + { + "epoch": 11.389477977161501, + "grad_norm": 0.3497089147567749, + "learning_rate": 0.00046360011658408624, + "loss": 3.3435, + "step": 39100 + }, + { + "epoch": 11.404043346539268, + "grad_norm": 0.3582172095775604, + "learning_rate": 0.00046342524045467787, + "loss": 3.3332, + "step": 39150 + }, + { + "epoch": 11.418608715917035, + "grad_norm": 0.3324296176433563, + "learning_rate": 0.00046325036432526956, + "loss": 3.3399, + "step": 39200 + }, + { + "epoch": 11.433174085294803, + "grad_norm": 0.33598825335502625, + "learning_rate": 0.0004630754881958612, + "loss": 3.3347, + "step": 39250 + }, + { + "epoch": 11.44773945467257, + "grad_norm": 0.3521103858947754, + "learning_rate": 0.00046290061206645284, + "loss": 3.3471, + "step": 39300 + }, + { + "epoch": 11.462304824050339, + "grad_norm": 0.364227831363678, + "learning_rate": 0.0004627257359370446, + "loss": 3.3419, + "step": 39350 + }, + { + "epoch": 11.476870193428105, + "grad_norm": 0.34679660201072693, + "learning_rate": 0.0004625508598076362, + "loss": 3.336, + "step": 39400 + }, + { + "epoch": 11.491435562805872, + "grad_norm": 0.3476027250289917, + "learning_rate": 0.0004623759836782279, + "loss": 3.336, + "step": 39450 + }, + { + "epoch": 11.50600093218364, + "grad_norm": 0.3519826829433441, + "learning_rate": 0.00046220110754881954, + "loss": 3.3531, + "step": 39500 + }, + { + "epoch": 11.520566301561407, + "grad_norm": 0.3578990697860718, + "learning_rate": 0.00046202623141941123, + "loss": 3.3458, + "step": 39550 + }, + { + "epoch": 11.535131670939174, + "grad_norm": 0.3664044439792633, + "learning_rate": 0.00046185135529000287, + "loss": 3.3502, + "step": 39600 + }, + { + "epoch": 11.549697040316943, + "grad_norm": 0.3508833050727844, + "learning_rate": 0.0004616764791605945, + "loss": 3.354, + "step": 39650 + }, + { + "epoch": 11.56426240969471, + "grad_norm": 0.3304528295993805, + "learning_rate": 0.0004615016030311862, + "loss": 3.3462, + "step": 39700 + }, + { + "epoch": 11.578827779072478, + "grad_norm": 0.3349263668060303, + "learning_rate": 0.00046132672690177783, + "loss": 3.3452, + "step": 39750 + }, + { + "epoch": 11.593393148450245, + "grad_norm": 0.3545154631137848, + "learning_rate": 0.0004611518507723696, + "loss": 3.3546, + "step": 39800 + }, + { + "epoch": 11.607958517828012, + "grad_norm": 0.3606700301170349, + "learning_rate": 0.0004609769746429612, + "loss": 3.3595, + "step": 39850 + }, + { + "epoch": 11.62252388720578, + "grad_norm": 0.36492571234703064, + "learning_rate": 0.00046080209851355285, + "loss": 3.3623, + "step": 39900 + }, + { + "epoch": 11.637089256583547, + "grad_norm": 0.3475785255432129, + "learning_rate": 0.00046062722238414454, + "loss": 3.3574, + "step": 39950 + }, + { + "epoch": 11.651654625961314, + "grad_norm": 0.33172130584716797, + "learning_rate": 0.0004604523462547362, + "loss": 3.3584, + "step": 40000 + }, + { + "epoch": 11.651654625961314, + "eval_accuracy": 0.369777305408969, + "eval_loss": 3.554752826690674, + "eval_runtime": 180.1092, + "eval_samples_per_second": 92.416, + "eval_steps_per_second": 5.78, + "step": 40000 + }, + { + "epoch": 11.666219995339082, + "grad_norm": 0.3251342475414276, + "learning_rate": 0.00046027747012532787, + "loss": 3.3516, + "step": 40050 + }, + { + "epoch": 11.680785364716849, + "grad_norm": 0.3509256839752197, + "learning_rate": 0.0004601025939959195, + "loss": 3.3697, + "step": 40100 + }, + { + "epoch": 11.695350734094617, + "grad_norm": 0.33280062675476074, + "learning_rate": 0.0004599277178665112, + "loss": 3.3534, + "step": 40150 + }, + { + "epoch": 11.709916103472384, + "grad_norm": 0.3410335183143616, + "learning_rate": 0.00045975284173710283, + "loss": 3.358, + "step": 40200 + }, + { + "epoch": 11.724481472850151, + "grad_norm": 0.3339652419090271, + "learning_rate": 0.00045957796560769446, + "loss": 3.3617, + "step": 40250 + }, + { + "epoch": 11.73904684222792, + "grad_norm": 0.34843549132347107, + "learning_rate": 0.0004594030894782862, + "loss": 3.3564, + "step": 40300 + }, + { + "epoch": 11.753612211605686, + "grad_norm": 0.3603808879852295, + "learning_rate": 0.00045922821334887785, + "loss": 3.3563, + "step": 40350 + }, + { + "epoch": 11.768177580983453, + "grad_norm": 0.35512664914131165, + "learning_rate": 0.00045905333721946954, + "loss": 3.3686, + "step": 40400 + }, + { + "epoch": 11.782742950361222, + "grad_norm": 0.3335427939891815, + "learning_rate": 0.00045887846109006117, + "loss": 3.3768, + "step": 40450 + }, + { + "epoch": 11.797308319738988, + "grad_norm": 0.34287896752357483, + "learning_rate": 0.0004587035849606528, + "loss": 3.3561, + "step": 40500 + }, + { + "epoch": 11.811873689116755, + "grad_norm": 0.3359990417957306, + "learning_rate": 0.0004585287088312445, + "loss": 3.3572, + "step": 40550 + }, + { + "epoch": 11.826439058494524, + "grad_norm": 0.3517856299877167, + "learning_rate": 0.00045835383270183613, + "loss": 3.3605, + "step": 40600 + }, + { + "epoch": 11.84100442787229, + "grad_norm": 0.36212751269340515, + "learning_rate": 0.0004581789565724278, + "loss": 3.3682, + "step": 40650 + }, + { + "epoch": 11.855569797250059, + "grad_norm": 0.3393063545227051, + "learning_rate": 0.00045800408044301946, + "loss": 3.3724, + "step": 40700 + }, + { + "epoch": 11.870135166627826, + "grad_norm": 0.3907620310783386, + "learning_rate": 0.0004578292043136111, + "loss": 3.3667, + "step": 40750 + }, + { + "epoch": 11.884700536005592, + "grad_norm": 0.3694191873073578, + "learning_rate": 0.00045765432818420284, + "loss": 3.3633, + "step": 40800 + }, + { + "epoch": 11.899265905383361, + "grad_norm": 0.3315177261829376, + "learning_rate": 0.0004574794520547945, + "loss": 3.3846, + "step": 40850 + }, + { + "epoch": 11.913831274761128, + "grad_norm": 0.37399667501449585, + "learning_rate": 0.00045730457592538617, + "loss": 3.3641, + "step": 40900 + }, + { + "epoch": 11.928396644138896, + "grad_norm": 0.35480326414108276, + "learning_rate": 0.0004571296997959778, + "loss": 3.3734, + "step": 40950 + }, + { + "epoch": 11.942962013516663, + "grad_norm": 0.3346291482448578, + "learning_rate": 0.0004569548236665695, + "loss": 3.3695, + "step": 41000 + }, + { + "epoch": 11.942962013516663, + "eval_accuracy": 0.37045638782472007, + "eval_loss": 3.548724412918091, + "eval_runtime": 180.0916, + "eval_samples_per_second": 92.425, + "eval_steps_per_second": 5.78, + "step": 41000 + }, + { + "epoch": 11.95752738289443, + "grad_norm": 0.3365858197212219, + "learning_rate": 0.00045677994753716113, + "loss": 3.3701, + "step": 41050 + }, + { + "epoch": 11.972092752272198, + "grad_norm": 0.3484339118003845, + "learning_rate": 0.00045660507140775277, + "loss": 3.3669, + "step": 41100 + }, + { + "epoch": 11.986658121649965, + "grad_norm": 0.35748663544654846, + "learning_rate": 0.00045643019527834446, + "loss": 3.3608, + "step": 41150 + }, + { + "epoch": 12.001165229550221, + "grad_norm": 0.3389853537082672, + "learning_rate": 0.0004562553191489361, + "loss": 3.3647, + "step": 41200 + }, + { + "epoch": 12.01573059892799, + "grad_norm": 0.3526709973812103, + "learning_rate": 0.00045608044301952784, + "loss": 3.2626, + "step": 41250 + }, + { + "epoch": 12.030295968305756, + "grad_norm": 0.3826402425765991, + "learning_rate": 0.0004559055668901195, + "loss": 3.2511, + "step": 41300 + }, + { + "epoch": 12.044861337683523, + "grad_norm": 0.35080841183662415, + "learning_rate": 0.0004557306907607111, + "loss": 3.2537, + "step": 41350 + }, + { + "epoch": 12.059426707061291, + "grad_norm": 0.34643131494522095, + "learning_rate": 0.0004555558146313028, + "loss": 3.279, + "step": 41400 + }, + { + "epoch": 12.073992076439058, + "grad_norm": 0.35819053649902344, + "learning_rate": 0.00045538093850189444, + "loss": 3.2798, + "step": 41450 + }, + { + "epoch": 12.088557445816827, + "grad_norm": 0.35073381662368774, + "learning_rate": 0.00045520606237248613, + "loss": 3.282, + "step": 41500 + }, + { + "epoch": 12.103122815194594, + "grad_norm": 0.36466553807258606, + "learning_rate": 0.00045503118624307776, + "loss": 3.2781, + "step": 41550 + }, + { + "epoch": 12.11768818457236, + "grad_norm": 0.3526218831539154, + "learning_rate": 0.00045485631011366945, + "loss": 3.2747, + "step": 41600 + }, + { + "epoch": 12.132253553950129, + "grad_norm": 0.34587806463241577, + "learning_rate": 0.0004546814339842611, + "loss": 3.2836, + "step": 41650 + }, + { + "epoch": 12.146818923327896, + "grad_norm": 0.3643037974834442, + "learning_rate": 0.0004545065578548527, + "loss": 3.2837, + "step": 41700 + }, + { + "epoch": 12.161384292705662, + "grad_norm": 0.3476613759994507, + "learning_rate": 0.00045433168172544447, + "loss": 3.2871, + "step": 41750 + }, + { + "epoch": 12.17594966208343, + "grad_norm": 0.35576480627059937, + "learning_rate": 0.0004541568055960361, + "loss": 3.2914, + "step": 41800 + }, + { + "epoch": 12.190515031461198, + "grad_norm": 0.33326926827430725, + "learning_rate": 0.0004539819294666278, + "loss": 3.3, + "step": 41850 + }, + { + "epoch": 12.205080400838966, + "grad_norm": 0.3574065864086151, + "learning_rate": 0.00045380705333721943, + "loss": 3.296, + "step": 41900 + }, + { + "epoch": 12.219645770216733, + "grad_norm": 0.33230534195899963, + "learning_rate": 0.00045363217720781107, + "loss": 3.2943, + "step": 41950 + }, + { + "epoch": 12.2342111395945, + "grad_norm": 0.3553365170955658, + "learning_rate": 0.00045345730107840276, + "loss": 3.3134, + "step": 42000 + }, + { + "epoch": 12.2342111395945, + "eval_accuracy": 0.37017045838650914, + "eval_loss": 3.5607879161834717, + "eval_runtime": 180.0939, + "eval_samples_per_second": 92.424, + "eval_steps_per_second": 5.78, + "step": 42000 + }, + { + "epoch": 12.248776508972268, + "grad_norm": 0.3781476616859436, + "learning_rate": 0.0004532824249489944, + "loss": 3.3071, + "step": 42050 + }, + { + "epoch": 12.263341878350035, + "grad_norm": 0.34425944089889526, + "learning_rate": 0.0004531075488195861, + "loss": 3.3064, + "step": 42100 + }, + { + "epoch": 12.277907247727802, + "grad_norm": 0.3801526129245758, + "learning_rate": 0.0004529326726901777, + "loss": 3.302, + "step": 42150 + }, + { + "epoch": 12.29247261710557, + "grad_norm": 0.3539813160896301, + "learning_rate": 0.00045275779656076947, + "loss": 3.3085, + "step": 42200 + }, + { + "epoch": 12.307037986483337, + "grad_norm": 0.3623591661453247, + "learning_rate": 0.0004525829204313611, + "loss": 3.3184, + "step": 42250 + }, + { + "epoch": 12.321603355861104, + "grad_norm": 0.4033423364162445, + "learning_rate": 0.00045240804430195274, + "loss": 3.2947, + "step": 42300 + }, + { + "epoch": 12.336168725238872, + "grad_norm": 0.4017917811870575, + "learning_rate": 0.00045223316817254443, + "loss": 3.3224, + "step": 42350 + }, + { + "epoch": 12.350734094616639, + "grad_norm": 0.3531772792339325, + "learning_rate": 0.00045205829204313607, + "loss": 3.3133, + "step": 42400 + }, + { + "epoch": 12.365299463994408, + "grad_norm": 0.3572141230106354, + "learning_rate": 0.00045188341591372776, + "loss": 3.2987, + "step": 42450 + }, + { + "epoch": 12.379864833372174, + "grad_norm": 0.3440505266189575, + "learning_rate": 0.0004517085397843194, + "loss": 3.3061, + "step": 42500 + }, + { + "epoch": 12.394430202749941, + "grad_norm": 0.38698717951774597, + "learning_rate": 0.00045153366365491103, + "loss": 3.3218, + "step": 42550 + }, + { + "epoch": 12.40899557212771, + "grad_norm": 0.34767866134643555, + "learning_rate": 0.0004513587875255027, + "loss": 3.3256, + "step": 42600 + }, + { + "epoch": 12.423560941505476, + "grad_norm": 0.36052611470222473, + "learning_rate": 0.00045118391139609436, + "loss": 3.3373, + "step": 42650 + }, + { + "epoch": 12.438126310883243, + "grad_norm": 0.36833488941192627, + "learning_rate": 0.0004510090352666861, + "loss": 3.3225, + "step": 42700 + }, + { + "epoch": 12.452691680261012, + "grad_norm": 0.36538413166999817, + "learning_rate": 0.00045083415913727774, + "loss": 3.3232, + "step": 42750 + }, + { + "epoch": 12.467257049638778, + "grad_norm": 0.38341739773750305, + "learning_rate": 0.0004506592830078694, + "loss": 3.3253, + "step": 42800 + }, + { + "epoch": 12.481822419016547, + "grad_norm": 0.35237112641334534, + "learning_rate": 0.00045048440687846106, + "loss": 3.3285, + "step": 42850 + }, + { + "epoch": 12.496387788394314, + "grad_norm": 0.3648768961429596, + "learning_rate": 0.0004503095307490527, + "loss": 3.3253, + "step": 42900 + }, + { + "epoch": 12.51095315777208, + "grad_norm": 0.37519994378089905, + "learning_rate": 0.0004501346546196444, + "loss": 3.3266, + "step": 42950 + }, + { + "epoch": 12.525518527149849, + "grad_norm": 0.342909574508667, + "learning_rate": 0.000449959778490236, + "loss": 3.3341, + "step": 43000 + }, + { + "epoch": 12.525518527149849, + "eval_accuracy": 0.3702159578489218, + "eval_loss": 3.556406021118164, + "eval_runtime": 180.1154, + "eval_samples_per_second": 92.413, + "eval_steps_per_second": 5.78, + "step": 43000 + }, + { + "epoch": 12.540083896527616, + "grad_norm": 0.34145408868789673, + "learning_rate": 0.0004497849023608277, + "loss": 3.3171, + "step": 43050 + }, + { + "epoch": 12.554649265905383, + "grad_norm": 0.34982040524482727, + "learning_rate": 0.00044961002623141935, + "loss": 3.3283, + "step": 43100 + }, + { + "epoch": 12.569214635283151, + "grad_norm": 0.35648754239082336, + "learning_rate": 0.000449435150102011, + "loss": 3.3203, + "step": 43150 + }, + { + "epoch": 12.583780004660918, + "grad_norm": 0.35109570622444153, + "learning_rate": 0.00044926027397260273, + "loss": 3.333, + "step": 43200 + }, + { + "epoch": 12.598345374038686, + "grad_norm": 0.36943650245666504, + "learning_rate": 0.00044908539784319437, + "loss": 3.343, + "step": 43250 + }, + { + "epoch": 12.612910743416453, + "grad_norm": 0.33632126450538635, + "learning_rate": 0.00044891052171378606, + "loss": 3.3266, + "step": 43300 + }, + { + "epoch": 12.62747611279422, + "grad_norm": 0.34395912289619446, + "learning_rate": 0.0004487356455843777, + "loss": 3.3298, + "step": 43350 + }, + { + "epoch": 12.642041482171988, + "grad_norm": 0.3842009902000427, + "learning_rate": 0.00044856076945496933, + "loss": 3.3248, + "step": 43400 + }, + { + "epoch": 12.656606851549755, + "grad_norm": 0.33698466420173645, + "learning_rate": 0.000448385893325561, + "loss": 3.34, + "step": 43450 + }, + { + "epoch": 12.671172220927522, + "grad_norm": 0.36925747990608215, + "learning_rate": 0.00044821101719615266, + "loss": 3.3425, + "step": 43500 + }, + { + "epoch": 12.68573759030529, + "grad_norm": 0.34613460302352905, + "learning_rate": 0.00044803614106674435, + "loss": 3.334, + "step": 43550 + }, + { + "epoch": 12.700302959683057, + "grad_norm": 0.34533941745758057, + "learning_rate": 0.000447861264937336, + "loss": 3.3415, + "step": 43600 + }, + { + "epoch": 12.714868329060826, + "grad_norm": 0.35583844780921936, + "learning_rate": 0.00044768638880792773, + "loss": 3.3443, + "step": 43650 + }, + { + "epoch": 12.729433698438593, + "grad_norm": 0.3499312996864319, + "learning_rate": 0.00044751151267851937, + "loss": 3.3507, + "step": 43700 + }, + { + "epoch": 12.74399906781636, + "grad_norm": 0.3817194402217865, + "learning_rate": 0.000447336636549111, + "loss": 3.3313, + "step": 43750 + }, + { + "epoch": 12.758564437194128, + "grad_norm": 0.3556305170059204, + "learning_rate": 0.0004471617604197027, + "loss": 3.3493, + "step": 43800 + }, + { + "epoch": 12.773129806571895, + "grad_norm": 0.3326449394226074, + "learning_rate": 0.00044698688429029433, + "loss": 3.3476, + "step": 43850 + }, + { + "epoch": 12.787695175949661, + "grad_norm": 0.34188127517700195, + "learning_rate": 0.000446812008160886, + "loss": 3.3484, + "step": 43900 + }, + { + "epoch": 12.80226054532743, + "grad_norm": 0.36247870326042175, + "learning_rate": 0.00044663713203147766, + "loss": 3.3548, + "step": 43950 + }, + { + "epoch": 12.816825914705197, + "grad_norm": 0.3323318660259247, + "learning_rate": 0.0004464622559020693, + "loss": 3.34, + "step": 44000 + }, + { + "epoch": 12.816825914705197, + "eval_accuracy": 0.3710601080817979, + "eval_loss": 3.5469486713409424, + "eval_runtime": 180.4604, + "eval_samples_per_second": 92.236, + "eval_steps_per_second": 5.769, + "step": 44000 + }, + { + "epoch": 12.831391284082965, + "grad_norm": 0.3398094177246094, + "learning_rate": 0.000446287379772661, + "loss": 3.3637, + "step": 44050 + }, + { + "epoch": 12.845956653460732, + "grad_norm": 0.3441350758075714, + "learning_rate": 0.0004461125036432526, + "loss": 3.3565, + "step": 44100 + }, + { + "epoch": 12.860522022838499, + "grad_norm": 0.3864867091178894, + "learning_rate": 0.00044593762751384436, + "loss": 3.3531, + "step": 44150 + }, + { + "epoch": 12.875087392216267, + "grad_norm": 0.33451637625694275, + "learning_rate": 0.000445762751384436, + "loss": 3.3514, + "step": 44200 + }, + { + "epoch": 12.889652761594034, + "grad_norm": 0.35273078083992004, + "learning_rate": 0.0004455878752550277, + "loss": 3.3488, + "step": 44250 + }, + { + "epoch": 12.9042181309718, + "grad_norm": 0.35807234048843384, + "learning_rate": 0.0004454129991256193, + "loss": 3.3618, + "step": 44300 + }, + { + "epoch": 12.91878350034957, + "grad_norm": 0.37384292483329773, + "learning_rate": 0.00044523812299621096, + "loss": 3.355, + "step": 44350 + }, + { + "epoch": 12.933348869727336, + "grad_norm": 0.356778085231781, + "learning_rate": 0.00044506324686680265, + "loss": 3.3615, + "step": 44400 + }, + { + "epoch": 12.947914239105105, + "grad_norm": 0.3216100335121155, + "learning_rate": 0.0004448883707373943, + "loss": 3.3523, + "step": 44450 + }, + { + "epoch": 12.962479608482871, + "grad_norm": 0.34460195899009705, + "learning_rate": 0.000444713494607986, + "loss": 3.3419, + "step": 44500 + }, + { + "epoch": 12.977044977860638, + "grad_norm": 0.3426755368709564, + "learning_rate": 0.0004445386184785776, + "loss": 3.3618, + "step": 44550 + }, + { + "epoch": 12.991610347238407, + "grad_norm": 0.33371907472610474, + "learning_rate": 0.00044436374234916925, + "loss": 3.3606, + "step": 44600 + }, + { + "epoch": 13.006117455138662, + "grad_norm": 0.3647765815258026, + "learning_rate": 0.000444188866219761, + "loss": 3.3039, + "step": 44650 + }, + { + "epoch": 13.02068282451643, + "grad_norm": 0.35740309953689575, + "learning_rate": 0.00044401399009035263, + "loss": 3.2546, + "step": 44700 + }, + { + "epoch": 13.035248193894198, + "grad_norm": 0.34486448764801025, + "learning_rate": 0.0004438391139609443, + "loss": 3.2578, + "step": 44750 + }, + { + "epoch": 13.049813563271965, + "grad_norm": 0.3458172380924225, + "learning_rate": 0.00044366423783153596, + "loss": 3.2525, + "step": 44800 + }, + { + "epoch": 13.064378932649731, + "grad_norm": 0.3752981126308441, + "learning_rate": 0.0004434893617021276, + "loss": 3.2551, + "step": 44850 + }, + { + "epoch": 13.0789443020275, + "grad_norm": 0.3851625919342041, + "learning_rate": 0.0004433144855727193, + "loss": 3.2575, + "step": 44900 + }, + { + "epoch": 13.093509671405267, + "grad_norm": 0.3515491187572479, + "learning_rate": 0.0004431396094433109, + "loss": 3.2561, + "step": 44950 + }, + { + "epoch": 13.108075040783035, + "grad_norm": 0.3828083872795105, + "learning_rate": 0.0004429647333139026, + "loss": 3.2644, + "step": 45000 + }, + { + "epoch": 13.108075040783035, + "eval_accuracy": 0.3702774467864976, + "eval_loss": 3.5636725425720215, + "eval_runtime": 180.05, + "eval_samples_per_second": 92.447, + "eval_steps_per_second": 5.782, + "step": 45000 + }, + { + "epoch": 13.122640410160802, + "grad_norm": 0.3796162009239197, + "learning_rate": 0.00044278985718449425, + "loss": 3.2588, + "step": 45050 + }, + { + "epoch": 13.137205779538569, + "grad_norm": 0.3526037633419037, + "learning_rate": 0.000442614981055086, + "loss": 3.2679, + "step": 45100 + }, + { + "epoch": 13.151771148916337, + "grad_norm": 0.3708899915218353, + "learning_rate": 0.00044244010492567763, + "loss": 3.2736, + "step": 45150 + }, + { + "epoch": 13.166336518294104, + "grad_norm": 0.36107999086380005, + "learning_rate": 0.00044226522879626927, + "loss": 3.2702, + "step": 45200 + }, + { + "epoch": 13.18090188767187, + "grad_norm": 0.36529046297073364, + "learning_rate": 0.00044209035266686096, + "loss": 3.2764, + "step": 45250 + }, + { + "epoch": 13.19546725704964, + "grad_norm": 0.32587742805480957, + "learning_rate": 0.0004419154765374526, + "loss": 3.2871, + "step": 45300 + }, + { + "epoch": 13.210032626427406, + "grad_norm": 0.3842274844646454, + "learning_rate": 0.0004417406004080443, + "loss": 3.2774, + "step": 45350 + }, + { + "epoch": 13.224597995805174, + "grad_norm": 0.3462216556072235, + "learning_rate": 0.0004415657242786359, + "loss": 3.2834, + "step": 45400 + }, + { + "epoch": 13.239163365182941, + "grad_norm": 0.3876231610774994, + "learning_rate": 0.00044139084814922755, + "loss": 3.2794, + "step": 45450 + }, + { + "epoch": 13.253728734560708, + "grad_norm": 0.3549770712852478, + "learning_rate": 0.00044121597201981924, + "loss": 3.29, + "step": 45500 + }, + { + "epoch": 13.268294103938477, + "grad_norm": 0.36762481927871704, + "learning_rate": 0.0004410410958904109, + "loss": 3.2889, + "step": 45550 + }, + { + "epoch": 13.282859473316243, + "grad_norm": 0.3711779713630676, + "learning_rate": 0.0004408662197610026, + "loss": 3.2911, + "step": 45600 + }, + { + "epoch": 13.29742484269401, + "grad_norm": 0.38503533601760864, + "learning_rate": 0.00044069134363159426, + "loss": 3.2988, + "step": 45650 + }, + { + "epoch": 13.311990212071779, + "grad_norm": 0.35451096296310425, + "learning_rate": 0.00044051646750218595, + "loss": 3.2966, + "step": 45700 + }, + { + "epoch": 13.326555581449545, + "grad_norm": 0.36363229155540466, + "learning_rate": 0.0004403415913727776, + "loss": 3.2889, + "step": 45750 + }, + { + "epoch": 13.341120950827314, + "grad_norm": 0.3751690089702606, + "learning_rate": 0.0004401667152433692, + "loss": 3.3034, + "step": 45800 + }, + { + "epoch": 13.35568632020508, + "grad_norm": 0.3827272057533264, + "learning_rate": 0.0004399918391139609, + "loss": 3.2939, + "step": 45850 + }, + { + "epoch": 13.370251689582847, + "grad_norm": 0.3678208887577057, + "learning_rate": 0.00043981696298455255, + "loss": 3.2945, + "step": 45900 + }, + { + "epoch": 13.384817058960616, + "grad_norm": 0.367930144071579, + "learning_rate": 0.00043964208685514424, + "loss": 3.3054, + "step": 45950 + }, + { + "epoch": 13.399382428338383, + "grad_norm": 0.4058651924133301, + "learning_rate": 0.0004394672107257359, + "loss": 3.3033, + "step": 46000 + }, + { + "epoch": 13.399382428338383, + "eval_accuracy": 0.37058559689239845, + "eval_loss": 3.558558940887451, + "eval_runtime": 180.1666, + "eval_samples_per_second": 92.387, + "eval_steps_per_second": 5.778, + "step": 46000 + }, + { + "epoch": 13.41394779771615, + "grad_norm": 0.33443596959114075, + "learning_rate": 0.0004392923345963275, + "loss": 3.3057, + "step": 46050 + }, + { + "epoch": 13.428513167093918, + "grad_norm": 0.35828691720962524, + "learning_rate": 0.00043911745846691926, + "loss": 3.3038, + "step": 46100 + }, + { + "epoch": 13.443078536471685, + "grad_norm": 0.3267059922218323, + "learning_rate": 0.0004389425823375109, + "loss": 3.3025, + "step": 46150 + }, + { + "epoch": 13.457643905849451, + "grad_norm": 0.3633759319782257, + "learning_rate": 0.0004387677062081026, + "loss": 3.3072, + "step": 46200 + }, + { + "epoch": 13.47220927522722, + "grad_norm": 0.3840404748916626, + "learning_rate": 0.0004385928300786942, + "loss": 3.3114, + "step": 46250 + }, + { + "epoch": 13.486774644604987, + "grad_norm": 0.37473759055137634, + "learning_rate": 0.0004384179539492859, + "loss": 3.3016, + "step": 46300 + }, + { + "epoch": 13.501340013982755, + "grad_norm": 0.3564804792404175, + "learning_rate": 0.00043824307781987755, + "loss": 3.3094, + "step": 46350 + }, + { + "epoch": 13.515905383360522, + "grad_norm": 0.3550237715244293, + "learning_rate": 0.0004380682016904692, + "loss": 3.3267, + "step": 46400 + }, + { + "epoch": 13.530470752738289, + "grad_norm": 0.3277883529663086, + "learning_rate": 0.0004378933255610609, + "loss": 3.3101, + "step": 46450 + }, + { + "epoch": 13.545036122116057, + "grad_norm": 0.3435348570346832, + "learning_rate": 0.0004377184494316525, + "loss": 3.3102, + "step": 46500 + }, + { + "epoch": 13.559601491493824, + "grad_norm": 0.3904513120651245, + "learning_rate": 0.00043754357330224426, + "loss": 3.3174, + "step": 46550 + }, + { + "epoch": 13.574166860871593, + "grad_norm": 0.3408344089984894, + "learning_rate": 0.0004373686971728359, + "loss": 3.3124, + "step": 46600 + }, + { + "epoch": 13.58873223024936, + "grad_norm": 0.35491397976875305, + "learning_rate": 0.00043719382104342753, + "loss": 3.3261, + "step": 46650 + }, + { + "epoch": 13.603297599627126, + "grad_norm": 0.374520868062973, + "learning_rate": 0.0004370189449140192, + "loss": 3.3192, + "step": 46700 + }, + { + "epoch": 13.617862969004895, + "grad_norm": 0.36937081813812256, + "learning_rate": 0.00043684406878461085, + "loss": 3.3193, + "step": 46750 + }, + { + "epoch": 13.632428338382661, + "grad_norm": 0.36104243993759155, + "learning_rate": 0.00043666919265520254, + "loss": 3.3299, + "step": 46800 + }, + { + "epoch": 13.646993707760428, + "grad_norm": 0.38521620631217957, + "learning_rate": 0.0004364943165257942, + "loss": 3.3275, + "step": 46850 + }, + { + "epoch": 13.661559077138197, + "grad_norm": 0.36809036135673523, + "learning_rate": 0.0004363194403963858, + "loss": 3.3141, + "step": 46900 + }, + { + "epoch": 13.676124446515963, + "grad_norm": 0.36899736523628235, + "learning_rate": 0.0004361445642669775, + "loss": 3.3134, + "step": 46950 + }, + { + "epoch": 13.69068981589373, + "grad_norm": 0.340524286031723, + "learning_rate": 0.00043596968813756914, + "loss": 3.321, + "step": 47000 + }, + { + "epoch": 13.69068981589373, + "eval_accuracy": 0.3713288723481426, + "eval_loss": 3.546154022216797, + "eval_runtime": 180.3194, + "eval_samples_per_second": 92.308, + "eval_steps_per_second": 5.773, + "step": 47000 + }, + { + "epoch": 13.705255185271499, + "grad_norm": 0.4061141610145569, + "learning_rate": 0.0004357948120081609, + "loss": 3.3105, + "step": 47050 + }, + { + "epoch": 13.719820554649266, + "grad_norm": 0.38260596990585327, + "learning_rate": 0.0004356199358787525, + "loss": 3.3253, + "step": 47100 + }, + { + "epoch": 13.734385924027034, + "grad_norm": 0.33172404766082764, + "learning_rate": 0.0004354450597493442, + "loss": 3.3317, + "step": 47150 + }, + { + "epoch": 13.7489512934048, + "grad_norm": 0.37275978922843933, + "learning_rate": 0.00043527018361993585, + "loss": 3.316, + "step": 47200 + }, + { + "epoch": 13.763516662782568, + "grad_norm": 0.34196189045906067, + "learning_rate": 0.0004350953074905275, + "loss": 3.3284, + "step": 47250 + }, + { + "epoch": 13.778082032160336, + "grad_norm": 0.38566258549690247, + "learning_rate": 0.0004349204313611192, + "loss": 3.3284, + "step": 47300 + }, + { + "epoch": 13.792647401538103, + "grad_norm": 0.34335869550704956, + "learning_rate": 0.0004347455552317108, + "loss": 3.3268, + "step": 47350 + }, + { + "epoch": 13.80721277091587, + "grad_norm": 0.347888320684433, + "learning_rate": 0.0004345706791023025, + "loss": 3.3291, + "step": 47400 + }, + { + "epoch": 13.821778140293638, + "grad_norm": 0.34273475408554077, + "learning_rate": 0.00043439580297289414, + "loss": 3.3313, + "step": 47450 + }, + { + "epoch": 13.836343509671405, + "grad_norm": 0.35187360644340515, + "learning_rate": 0.0004342209268434858, + "loss": 3.3271, + "step": 47500 + }, + { + "epoch": 13.850908879049173, + "grad_norm": 0.35880064964294434, + "learning_rate": 0.0004340460507140775, + "loss": 3.3399, + "step": 47550 + }, + { + "epoch": 13.86547424842694, + "grad_norm": 0.3588651418685913, + "learning_rate": 0.00043387117458466916, + "loss": 3.3369, + "step": 47600 + }, + { + "epoch": 13.880039617804707, + "grad_norm": 0.3750631809234619, + "learning_rate": 0.00043369629845526085, + "loss": 3.3353, + "step": 47650 + }, + { + "epoch": 13.894604987182475, + "grad_norm": 0.3625471889972687, + "learning_rate": 0.0004335214223258525, + "loss": 3.3288, + "step": 47700 + }, + { + "epoch": 13.909170356560242, + "grad_norm": 0.34764859080314636, + "learning_rate": 0.0004333465461964442, + "loss": 3.3396, + "step": 47750 + }, + { + "epoch": 13.923735725938009, + "grad_norm": 0.37005481123924255, + "learning_rate": 0.0004331716700670358, + "loss": 3.345, + "step": 47800 + }, + { + "epoch": 13.938301095315778, + "grad_norm": 0.36900705099105835, + "learning_rate": 0.00043299679393762745, + "loss": 3.3468, + "step": 47850 + }, + { + "epoch": 13.952866464693544, + "grad_norm": 0.36026182770729065, + "learning_rate": 0.00043282191780821914, + "loss": 3.3357, + "step": 47900 + }, + { + "epoch": 13.967431834071313, + "grad_norm": 0.41666218638420105, + "learning_rate": 0.00043264704167881077, + "loss": 3.3406, + "step": 47950 + }, + { + "epoch": 13.98199720344908, + "grad_norm": 0.3536786735057831, + "learning_rate": 0.0004324721655494025, + "loss": 3.3478, + "step": 48000 + }, + { + "epoch": 13.98199720344908, + "eval_accuracy": 0.37184794244259217, + "eval_loss": 3.5400092601776123, + "eval_runtime": 180.0491, + "eval_samples_per_second": 92.447, + "eval_steps_per_second": 5.782, + "step": 48000 + }, + { + "epoch": 13.996562572826846, + "grad_norm": 0.36019057035446167, + "learning_rate": 0.00043229728941999415, + "loss": 3.3448, + "step": 48050 + }, + { + "epoch": 14.011069680727104, + "grad_norm": 0.36364710330963135, + "learning_rate": 0.0004321224132905858, + "loss": 3.2384, + "step": 48100 + }, + { + "epoch": 14.02563505010487, + "grad_norm": 0.37141087651252747, + "learning_rate": 0.0004319475371611775, + "loss": 3.2325, + "step": 48150 + }, + { + "epoch": 14.040200419482638, + "grad_norm": 0.3692861497402191, + "learning_rate": 0.0004317726610317691, + "loss": 3.2346, + "step": 48200 + }, + { + "epoch": 14.054765788860406, + "grad_norm": 0.3691665530204773, + "learning_rate": 0.0004315977849023608, + "loss": 3.2326, + "step": 48250 + }, + { + "epoch": 14.069331158238173, + "grad_norm": 0.3210379481315613, + "learning_rate": 0.00043142290877295244, + "loss": 3.2295, + "step": 48300 + }, + { + "epoch": 14.08389652761594, + "grad_norm": 0.3676183819770813, + "learning_rate": 0.00043124803264354413, + "loss": 3.2278, + "step": 48350 + }, + { + "epoch": 14.098461896993708, + "grad_norm": 0.3678940236568451, + "learning_rate": 0.00043107315651413577, + "loss": 3.2426, + "step": 48400 + }, + { + "epoch": 14.113027266371475, + "grad_norm": 0.35546252131462097, + "learning_rate": 0.0004308982803847274, + "loss": 3.2504, + "step": 48450 + }, + { + "epoch": 14.127592635749243, + "grad_norm": 0.36175820231437683, + "learning_rate": 0.00043072340425531915, + "loss": 3.2673, + "step": 48500 + }, + { + "epoch": 14.14215800512701, + "grad_norm": 0.3764038681983948, + "learning_rate": 0.0004305485281259108, + "loss": 3.2708, + "step": 48550 + }, + { + "epoch": 14.156723374504777, + "grad_norm": 0.35129106044769287, + "learning_rate": 0.0004303736519965025, + "loss": 3.2607, + "step": 48600 + }, + { + "epoch": 14.171288743882545, + "grad_norm": 0.3690101206302643, + "learning_rate": 0.0004301987758670941, + "loss": 3.2637, + "step": 48650 + }, + { + "epoch": 14.185854113260312, + "grad_norm": 0.3953830897808075, + "learning_rate": 0.00043002389973768575, + "loss": 3.2679, + "step": 48700 + }, + { + "epoch": 14.200419482638079, + "grad_norm": 0.4228529930114746, + "learning_rate": 0.00042984902360827744, + "loss": 3.2749, + "step": 48750 + }, + { + "epoch": 14.214984852015847, + "grad_norm": 0.3526473641395569, + "learning_rate": 0.0004296741474788691, + "loss": 3.2685, + "step": 48800 + }, + { + "epoch": 14.229550221393614, + "grad_norm": 0.3725534677505493, + "learning_rate": 0.00042949927134946077, + "loss": 3.2709, + "step": 48850 + }, + { + "epoch": 14.244115590771383, + "grad_norm": 0.4134677052497864, + "learning_rate": 0.0004293243952200524, + "loss": 3.2807, + "step": 48900 + }, + { + "epoch": 14.25868096014915, + "grad_norm": 0.373789519071579, + "learning_rate": 0.00042914951909064415, + "loss": 3.2732, + "step": 48950 + }, + { + "epoch": 14.273246329526916, + "grad_norm": 0.3695909082889557, + "learning_rate": 0.0004289746429612358, + "loss": 3.2776, + "step": 49000 + }, + { + "epoch": 14.273246329526916, + "eval_accuracy": 0.37125997652133685, + "eval_loss": 3.555926561355591, + "eval_runtime": 180.0211, + "eval_samples_per_second": 92.461, + "eval_steps_per_second": 5.783, + "step": 49000 + }, + { + "epoch": 14.287811698904685, + "grad_norm": 0.3744010031223297, + "learning_rate": 0.0004287997668318274, + "loss": 3.2625, + "step": 49050 + }, + { + "epoch": 14.302377068282452, + "grad_norm": 0.3698079288005829, + "learning_rate": 0.0004286248907024191, + "loss": 3.2764, + "step": 49100 + }, + { + "epoch": 14.316942437660218, + "grad_norm": 0.40776345133781433, + "learning_rate": 0.00042845001457301075, + "loss": 3.2732, + "step": 49150 + }, + { + "epoch": 14.331507807037987, + "grad_norm": 0.3719671368598938, + "learning_rate": 0.00042827513844360244, + "loss": 3.2849, + "step": 49200 + }, + { + "epoch": 14.346073176415754, + "grad_norm": 0.36249688267707825, + "learning_rate": 0.00042810026231419407, + "loss": 3.2847, + "step": 49250 + }, + { + "epoch": 14.360638545793522, + "grad_norm": 0.3670303523540497, + "learning_rate": 0.0004279253861847857, + "loss": 3.278, + "step": 49300 + }, + { + "epoch": 14.375203915171289, + "grad_norm": 0.36930960416793823, + "learning_rate": 0.0004277505100553774, + "loss": 3.3051, + "step": 49350 + }, + { + "epoch": 14.389769284549056, + "grad_norm": 0.3947696387767792, + "learning_rate": 0.00042757563392596904, + "loss": 3.2951, + "step": 49400 + }, + { + "epoch": 14.404334653926824, + "grad_norm": 0.3678928315639496, + "learning_rate": 0.0004274007577965608, + "loss": 3.2893, + "step": 49450 + }, + { + "epoch": 14.418900023304591, + "grad_norm": 0.3470092713832855, + "learning_rate": 0.0004272258816671524, + "loss": 3.3077, + "step": 49500 + }, + { + "epoch": 14.433465392682358, + "grad_norm": 0.380835622549057, + "learning_rate": 0.00042705100553774405, + "loss": 3.29, + "step": 49550 + }, + { + "epoch": 14.448030762060126, + "grad_norm": 0.3495819568634033, + "learning_rate": 0.00042687612940833574, + "loss": 3.289, + "step": 49600 + }, + { + "epoch": 14.462596131437893, + "grad_norm": 0.3957236707210541, + "learning_rate": 0.0004267012532789274, + "loss": 3.3002, + "step": 49650 + }, + { + "epoch": 14.477161500815662, + "grad_norm": 0.35682642459869385, + "learning_rate": 0.00042652637714951907, + "loss": 3.2979, + "step": 49700 + }, + { + "epoch": 14.491726870193428, + "grad_norm": 0.37553074955940247, + "learning_rate": 0.0004263515010201107, + "loss": 3.2958, + "step": 49750 + }, + { + "epoch": 14.506292239571195, + "grad_norm": 0.35635843873023987, + "learning_rate": 0.0004261766248907024, + "loss": 3.2884, + "step": 49800 + }, + { + "epoch": 14.520857608948964, + "grad_norm": 0.3419688940048218, + "learning_rate": 0.00042600174876129403, + "loss": 3.301, + "step": 49850 + }, + { + "epoch": 14.53542297832673, + "grad_norm": 0.34367799758911133, + "learning_rate": 0.00042582687263188567, + "loss": 3.2852, + "step": 49900 + }, + { + "epoch": 14.549988347704497, + "grad_norm": 0.40431395173072815, + "learning_rate": 0.0004256519965024774, + "loss": 3.2959, + "step": 49950 + }, + { + "epoch": 14.564553717082266, + "grad_norm": 0.35024210810661316, + "learning_rate": 0.00042547712037306905, + "loss": 3.3024, + "step": 50000 + }, + { + "epoch": 14.564553717082266, + "eval_accuracy": 0.3717955063696308, + "eval_loss": 3.5450422763824463, + "eval_runtime": 179.9316, + "eval_samples_per_second": 92.507, + "eval_steps_per_second": 5.786, + "step": 50000 + }, + { + "epoch": 14.579119086460032, + "grad_norm": 0.3777245581150055, + "learning_rate": 0.00042530224424366074, + "loss": 3.2999, + "step": 50050 + }, + { + "epoch": 14.5936844558378, + "grad_norm": 0.38056281208992004, + "learning_rate": 0.0004251273681142524, + "loss": 3.3083, + "step": 50100 + }, + { + "epoch": 14.608249825215568, + "grad_norm": 0.34788084030151367, + "learning_rate": 0.000424952491984844, + "loss": 3.31, + "step": 50150 + }, + { + "epoch": 14.622815194593334, + "grad_norm": 0.3699731230735779, + "learning_rate": 0.0004247776158554357, + "loss": 3.3078, + "step": 50200 + }, + { + "epoch": 14.637380563971103, + "grad_norm": 0.37978971004486084, + "learning_rate": 0.00042460273972602734, + "loss": 3.308, + "step": 50250 + }, + { + "epoch": 14.65194593334887, + "grad_norm": 0.34603598713874817, + "learning_rate": 0.00042442786359661903, + "loss": 3.3014, + "step": 50300 + }, + { + "epoch": 14.666511302726637, + "grad_norm": 0.36627814173698425, + "learning_rate": 0.00042425298746721066, + "loss": 3.3084, + "step": 50350 + }, + { + "epoch": 14.681076672104405, + "grad_norm": 0.37918820977211, + "learning_rate": 0.0004240781113378024, + "loss": 3.3007, + "step": 50400 + }, + { + "epoch": 14.695642041482172, + "grad_norm": 0.373757004737854, + "learning_rate": 0.00042390323520839405, + "loss": 3.3043, + "step": 50450 + }, + { + "epoch": 14.71020741085994, + "grad_norm": 0.36418765783309937, + "learning_rate": 0.0004237283590789857, + "loss": 3.31, + "step": 50500 + }, + { + "epoch": 14.724772780237707, + "grad_norm": 0.36318641901016235, + "learning_rate": 0.00042355348294957737, + "loss": 3.313, + "step": 50550 + }, + { + "epoch": 14.739338149615474, + "grad_norm": 0.3450626730918884, + "learning_rate": 0.000423378606820169, + "loss": 3.3003, + "step": 50600 + }, + { + "epoch": 14.753903518993242, + "grad_norm": 0.35747483372688293, + "learning_rate": 0.0004232037306907607, + "loss": 3.3018, + "step": 50650 + }, + { + "epoch": 14.76846888837101, + "grad_norm": 0.37540534138679504, + "learning_rate": 0.00042302885456135233, + "loss": 3.3205, + "step": 50700 + }, + { + "epoch": 14.783034257748776, + "grad_norm": 0.3449225425720215, + "learning_rate": 0.00042285397843194397, + "loss": 3.3091, + "step": 50750 + }, + { + "epoch": 14.797599627126544, + "grad_norm": 0.3874026834964752, + "learning_rate": 0.00042267910230253566, + "loss": 3.3202, + "step": 50800 + }, + { + "epoch": 14.812164996504311, + "grad_norm": 0.37947866320610046, + "learning_rate": 0.0004225042261731273, + "loss": 3.313, + "step": 50850 + }, + { + "epoch": 14.826730365882078, + "grad_norm": 0.337933212518692, + "learning_rate": 0.00042232935004371904, + "loss": 3.316, + "step": 50900 + }, + { + "epoch": 14.841295735259846, + "grad_norm": 0.3563149869441986, + "learning_rate": 0.0004221544739143107, + "loss": 3.323, + "step": 50950 + }, + { + "epoch": 14.855861104637613, + "grad_norm": 0.38767126202583313, + "learning_rate": 0.00042197959778490237, + "loss": 3.3119, + "step": 51000 + }, + { + "epoch": 14.855861104637613, + "eval_accuracy": 0.3718384192992965, + "eval_loss": 3.544301748275757, + "eval_runtime": 180.083, + "eval_samples_per_second": 92.43, + "eval_steps_per_second": 5.781, + "step": 51000 + }, + { + "epoch": 14.870426474015382, + "grad_norm": 0.37552863359451294, + "learning_rate": 0.000421804721655494, + "loss": 3.3259, + "step": 51050 + }, + { + "epoch": 14.884991843393149, + "grad_norm": 0.36282673478126526, + "learning_rate": 0.00042162984552608564, + "loss": 3.3247, + "step": 51100 + }, + { + "epoch": 14.899557212770915, + "grad_norm": 0.3442082703113556, + "learning_rate": 0.00042145496939667733, + "loss": 3.3353, + "step": 51150 + }, + { + "epoch": 14.914122582148684, + "grad_norm": 0.37182241678237915, + "learning_rate": 0.00042128009326726897, + "loss": 3.3221, + "step": 51200 + }, + { + "epoch": 14.92868795152645, + "grad_norm": 0.35627925395965576, + "learning_rate": 0.00042110521713786066, + "loss": 3.3225, + "step": 51250 + }, + { + "epoch": 14.943253320904217, + "grad_norm": 0.3497479557991028, + "learning_rate": 0.0004209303410084523, + "loss": 3.3091, + "step": 51300 + }, + { + "epoch": 14.957818690281986, + "grad_norm": 0.3572853207588196, + "learning_rate": 0.00042075546487904393, + "loss": 3.3194, + "step": 51350 + }, + { + "epoch": 14.972384059659753, + "grad_norm": 0.34908217191696167, + "learning_rate": 0.0004205805887496357, + "loss": 3.3197, + "step": 51400 + }, + { + "epoch": 14.986949429037521, + "grad_norm": 0.3747841417789459, + "learning_rate": 0.0004204057126202273, + "loss": 3.3196, + "step": 51450 + }, + { + "epoch": 15.001456536937777, + "grad_norm": 0.3852183520793915, + "learning_rate": 0.000420230836490819, + "loss": 3.3223, + "step": 51500 + }, + { + "epoch": 15.016021906315544, + "grad_norm": 0.36252638697624207, + "learning_rate": 0.00042005596036141064, + "loss": 3.2195, + "step": 51550 + }, + { + "epoch": 15.030587275693312, + "grad_norm": 0.37796682119369507, + "learning_rate": 0.0004198810842320023, + "loss": 3.2084, + "step": 51600 + }, + { + "epoch": 15.045152645071079, + "grad_norm": 0.35127121210098267, + "learning_rate": 0.00041970620810259396, + "loss": 3.2168, + "step": 51650 + }, + { + "epoch": 15.059718014448846, + "grad_norm": 0.3487504720687866, + "learning_rate": 0.0004195313319731856, + "loss": 3.2213, + "step": 51700 + }, + { + "epoch": 15.074283383826614, + "grad_norm": 0.3657950460910797, + "learning_rate": 0.0004193564558437773, + "loss": 3.2238, + "step": 51750 + }, + { + "epoch": 15.088848753204381, + "grad_norm": 0.3831111490726471, + "learning_rate": 0.0004191815797143689, + "loss": 3.2249, + "step": 51800 + }, + { + "epoch": 15.103414122582148, + "grad_norm": 0.35564419627189636, + "learning_rate": 0.00041900670358496067, + "loss": 3.2333, + "step": 51850 + }, + { + "epoch": 15.117979491959916, + "grad_norm": 0.36560797691345215, + "learning_rate": 0.0004188318274555523, + "loss": 3.2305, + "step": 51900 + }, + { + "epoch": 15.132544861337683, + "grad_norm": 0.34069278836250305, + "learning_rate": 0.00041865695132614394, + "loss": 3.2335, + "step": 51950 + }, + { + "epoch": 15.147110230715452, + "grad_norm": 0.3655010163784027, + "learning_rate": 0.00041848207519673563, + "loss": 3.242, + "step": 52000 + }, + { + "epoch": 15.147110230715452, + "eval_accuracy": 0.37159199327031206, + "eval_loss": 3.5560638904571533, + "eval_runtime": 180.1032, + "eval_samples_per_second": 92.419, + "eval_steps_per_second": 5.78, + "step": 52000 + }, + { + "epoch": 15.161675600093218, + "grad_norm": 0.35667118430137634, + "learning_rate": 0.00041830719906732727, + "loss": 3.2559, + "step": 52050 + }, + { + "epoch": 15.176240969470985, + "grad_norm": 0.38437923789024353, + "learning_rate": 0.00041813232293791896, + "loss": 3.2499, + "step": 52100 + }, + { + "epoch": 15.190806338848754, + "grad_norm": 0.361551433801651, + "learning_rate": 0.0004179574468085106, + "loss": 3.2595, + "step": 52150 + }, + { + "epoch": 15.20537170822652, + "grad_norm": 0.3916930854320526, + "learning_rate": 0.00041778257067910223, + "loss": 3.2515, + "step": 52200 + }, + { + "epoch": 15.219937077604287, + "grad_norm": 0.3800903558731079, + "learning_rate": 0.0004176076945496939, + "loss": 3.2494, + "step": 52250 + }, + { + "epoch": 15.234502446982056, + "grad_norm": 0.3692256808280945, + "learning_rate": 0.00041743281842028556, + "loss": 3.2523, + "step": 52300 + }, + { + "epoch": 15.249067816359823, + "grad_norm": 0.3822937607765198, + "learning_rate": 0.0004172579422908773, + "loss": 3.242, + "step": 52350 + }, + { + "epoch": 15.263633185737591, + "grad_norm": 0.3688431680202484, + "learning_rate": 0.00041708306616146894, + "loss": 3.2595, + "step": 52400 + }, + { + "epoch": 15.278198555115358, + "grad_norm": 0.37444230914115906, + "learning_rate": 0.00041690819003206063, + "loss": 3.2575, + "step": 52450 + }, + { + "epoch": 15.292763924493125, + "grad_norm": 0.36975809931755066, + "learning_rate": 0.00041673331390265227, + "loss": 3.2686, + "step": 52500 + }, + { + "epoch": 15.307329293870893, + "grad_norm": 0.363016813993454, + "learning_rate": 0.0004165584377732439, + "loss": 3.258, + "step": 52550 + }, + { + "epoch": 15.32189466324866, + "grad_norm": 0.39108654856681824, + "learning_rate": 0.0004163835616438356, + "loss": 3.2627, + "step": 52600 + }, + { + "epoch": 15.336460032626427, + "grad_norm": 0.3677827715873718, + "learning_rate": 0.00041620868551442723, + "loss": 3.2786, + "step": 52650 + }, + { + "epoch": 15.351025402004195, + "grad_norm": 0.3940332531929016, + "learning_rate": 0.0004160338093850189, + "loss": 3.2634, + "step": 52700 + }, + { + "epoch": 15.365590771381962, + "grad_norm": 0.3877134919166565, + "learning_rate": 0.00041585893325561056, + "loss": 3.2748, + "step": 52750 + }, + { + "epoch": 15.38015614075973, + "grad_norm": 0.37534767389297485, + "learning_rate": 0.0004156840571262022, + "loss": 3.2739, + "step": 52800 + }, + { + "epoch": 15.394721510137497, + "grad_norm": 0.35570940375328064, + "learning_rate": 0.00041550918099679394, + "loss": 3.2879, + "step": 52850 + }, + { + "epoch": 15.409286879515264, + "grad_norm": 0.39126497507095337, + "learning_rate": 0.0004153343048673856, + "loss": 3.2646, + "step": 52900 + }, + { + "epoch": 15.423852248893033, + "grad_norm": 0.3982384204864502, + "learning_rate": 0.00041515942873797726, + "loss": 3.2762, + "step": 52950 + }, + { + "epoch": 15.4384176182708, + "grad_norm": 0.37969404458999634, + "learning_rate": 0.0004149845526085689, + "loss": 3.2813, + "step": 53000 + }, + { + "epoch": 15.4384176182708, + "eval_accuracy": 0.37195399028521814, + "eval_loss": 3.5497677326202393, + "eval_runtime": 179.951, + "eval_samples_per_second": 92.497, + "eval_steps_per_second": 5.785, + "step": 53000 + }, + { + "epoch": 15.452982987648566, + "grad_norm": 0.35420140624046326, + "learning_rate": 0.0004148096764791606, + "loss": 3.2798, + "step": 53050 + }, + { + "epoch": 15.467548357026335, + "grad_norm": 0.37509068846702576, + "learning_rate": 0.0004146348003497522, + "loss": 3.28, + "step": 53100 + }, + { + "epoch": 15.482113726404101, + "grad_norm": 0.3744305372238159, + "learning_rate": 0.00041445992422034386, + "loss": 3.2838, + "step": 53150 + }, + { + "epoch": 15.49667909578187, + "grad_norm": 0.39247238636016846, + "learning_rate": 0.00041428504809093555, + "loss": 3.2829, + "step": 53200 + }, + { + "epoch": 15.511244465159637, + "grad_norm": 0.35992783308029175, + "learning_rate": 0.0004141101719615272, + "loss": 3.2892, + "step": 53250 + }, + { + "epoch": 15.525809834537403, + "grad_norm": 0.3768855035305023, + "learning_rate": 0.00041393529583211893, + "loss": 3.2853, + "step": 53300 + }, + { + "epoch": 15.540375203915172, + "grad_norm": 0.4003032147884369, + "learning_rate": 0.00041376041970271057, + "loss": 3.2819, + "step": 53350 + }, + { + "epoch": 15.554940573292939, + "grad_norm": 0.3577634394168854, + "learning_rate": 0.0004135855435733022, + "loss": 3.2958, + "step": 53400 + }, + { + "epoch": 15.569505942670705, + "grad_norm": 0.3649856150150299, + "learning_rate": 0.0004134106674438939, + "loss": 3.2893, + "step": 53450 + }, + { + "epoch": 15.584071312048474, + "grad_norm": 0.363534539937973, + "learning_rate": 0.00041323579131448553, + "loss": 3.2882, + "step": 53500 + }, + { + "epoch": 15.59863668142624, + "grad_norm": 0.36388495564460754, + "learning_rate": 0.0004130609151850772, + "loss": 3.2978, + "step": 53550 + }, + { + "epoch": 15.61320205080401, + "grad_norm": 0.37562793493270874, + "learning_rate": 0.00041288603905566886, + "loss": 3.3022, + "step": 53600 + }, + { + "epoch": 15.627767420181776, + "grad_norm": 0.38429683446884155, + "learning_rate": 0.0004127111629262605, + "loss": 3.2914, + "step": 53650 + }, + { + "epoch": 15.642332789559543, + "grad_norm": 0.3696308434009552, + "learning_rate": 0.0004125362867968522, + "loss": 3.3008, + "step": 53700 + }, + { + "epoch": 15.656898158937311, + "grad_norm": 0.3845422565937042, + "learning_rate": 0.0004123614106674438, + "loss": 3.3035, + "step": 53750 + }, + { + "epoch": 15.671463528315078, + "grad_norm": 0.372842401266098, + "learning_rate": 0.00041218653453803557, + "loss": 3.2908, + "step": 53800 + }, + { + "epoch": 15.686028897692845, + "grad_norm": 0.34964442253112793, + "learning_rate": 0.0004120116584086272, + "loss": 3.2873, + "step": 53850 + }, + { + "epoch": 15.700594267070613, + "grad_norm": 0.3756067752838135, + "learning_rate": 0.0004118367822792189, + "loss": 3.3061, + "step": 53900 + }, + { + "epoch": 15.71515963644838, + "grad_norm": 0.3714199960231781, + "learning_rate": 0.00041166190614981053, + "loss": 3.2919, + "step": 53950 + }, + { + "epoch": 15.729725005826147, + "grad_norm": 0.36421453952789307, + "learning_rate": 0.00041148703002040217, + "loss": 3.3013, + "step": 54000 + }, + { + "epoch": 15.729725005826147, + "eval_accuracy": 0.3723953468275882, + "eval_loss": 3.541537046432495, + "eval_runtime": 180.0597, + "eval_samples_per_second": 92.442, + "eval_steps_per_second": 5.781, + "step": 54000 + }, + { + "epoch": 15.744290375203915, + "grad_norm": 0.3969421982765198, + "learning_rate": 0.00041131215389099386, + "loss": 3.2974, + "step": 54050 + }, + { + "epoch": 15.758855744581682, + "grad_norm": 0.357675701379776, + "learning_rate": 0.0004111372777615855, + "loss": 3.3099, + "step": 54100 + }, + { + "epoch": 15.77342111395945, + "grad_norm": 0.37822505831718445, + "learning_rate": 0.0004109624016321772, + "loss": 3.2978, + "step": 54150 + }, + { + "epoch": 15.787986483337217, + "grad_norm": 0.3701728880405426, + "learning_rate": 0.0004107875255027688, + "loss": 3.2945, + "step": 54200 + }, + { + "epoch": 15.802551852714984, + "grad_norm": 0.39551228284835815, + "learning_rate": 0.00041061264937336045, + "loss": 3.2964, + "step": 54250 + }, + { + "epoch": 15.817117222092753, + "grad_norm": 0.36080774664878845, + "learning_rate": 0.0004104377732439522, + "loss": 3.3008, + "step": 54300 + }, + { + "epoch": 15.83168259147052, + "grad_norm": 0.37071558833122253, + "learning_rate": 0.00041026289711454384, + "loss": 3.3056, + "step": 54350 + }, + { + "epoch": 15.846247960848288, + "grad_norm": 0.36288759112358093, + "learning_rate": 0.0004100880209851355, + "loss": 3.2979, + "step": 54400 + }, + { + "epoch": 15.860813330226055, + "grad_norm": 0.3693920075893402, + "learning_rate": 0.00040991314485572716, + "loss": 3.3088, + "step": 54450 + }, + { + "epoch": 15.875378699603822, + "grad_norm": 0.37087926268577576, + "learning_rate": 0.00040973826872631885, + "loss": 3.3055, + "step": 54500 + }, + { + "epoch": 15.88994406898159, + "grad_norm": 0.36043593287467957, + "learning_rate": 0.0004095633925969105, + "loss": 3.3004, + "step": 54550 + }, + { + "epoch": 15.904509438359357, + "grad_norm": 0.3625832200050354, + "learning_rate": 0.0004093885164675021, + "loss": 3.3118, + "step": 54600 + }, + { + "epoch": 15.919074807737124, + "grad_norm": 0.36735785007476807, + "learning_rate": 0.0004092136403380938, + "loss": 3.2942, + "step": 54650 + }, + { + "epoch": 15.933640177114892, + "grad_norm": 0.3763796389102936, + "learning_rate": 0.00040903876420868545, + "loss": 3.3066, + "step": 54700 + }, + { + "epoch": 15.948205546492659, + "grad_norm": 0.36403748393058777, + "learning_rate": 0.00040886388807927714, + "loss": 3.3197, + "step": 54750 + }, + { + "epoch": 15.962770915870426, + "grad_norm": 0.38384634256362915, + "learning_rate": 0.00040868901194986883, + "loss": 3.3177, + "step": 54800 + }, + { + "epoch": 15.977336285248194, + "grad_norm": 0.38456591963768005, + "learning_rate": 0.00040851413582046047, + "loss": 3.3092, + "step": 54850 + }, + { + "epoch": 15.991901654625961, + "grad_norm": 0.36568573117256165, + "learning_rate": 0.00040833925969105216, + "loss": 3.308, + "step": 54900 + }, + { + "epoch": 16.006408762526217, + "grad_norm": 0.3781602680683136, + "learning_rate": 0.0004081643835616438, + "loss": 3.2568, + "step": 54950 + }, + { + "epoch": 16.020974131903984, + "grad_norm": 0.3532155454158783, + "learning_rate": 0.0004079895074322355, + "loss": 3.2009, + "step": 55000 + }, + { + "epoch": 16.020974131903984, + "eval_accuracy": 0.3720104237269703, + "eval_loss": 3.548476219177246, + "eval_runtime": 180.062, + "eval_samples_per_second": 92.44, + "eval_steps_per_second": 5.781, + "step": 55000 + }, + { + "epoch": 16.035539501281754, + "grad_norm": 0.37421002984046936, + "learning_rate": 0.0004078146313028271, + "loss": 3.1894, + "step": 55050 + }, + { + "epoch": 16.05010487065952, + "grad_norm": 0.38773152232170105, + "learning_rate": 0.0004076397551734188, + "loss": 3.2052, + "step": 55100 + }, + { + "epoch": 16.064670240037287, + "grad_norm": 0.40175601840019226, + "learning_rate": 0.00040746487904401045, + "loss": 3.2037, + "step": 55150 + }, + { + "epoch": 16.079235609415054, + "grad_norm": 0.4081617593765259, + "learning_rate": 0.0004072900029146021, + "loss": 3.2144, + "step": 55200 + }, + { + "epoch": 16.09380097879282, + "grad_norm": 0.40000903606414795, + "learning_rate": 0.0004071151267851938, + "loss": 3.2147, + "step": 55250 + }, + { + "epoch": 16.10836634817059, + "grad_norm": 0.3925228416919708, + "learning_rate": 0.00040694025065578546, + "loss": 3.2222, + "step": 55300 + }, + { + "epoch": 16.122931717548358, + "grad_norm": 0.3875502943992615, + "learning_rate": 0.00040676537452637716, + "loss": 3.2168, + "step": 55350 + }, + { + "epoch": 16.137497086926125, + "grad_norm": 0.3993126153945923, + "learning_rate": 0.0004065904983969688, + "loss": 3.2285, + "step": 55400 + }, + { + "epoch": 16.15206245630389, + "grad_norm": 0.3767448365688324, + "learning_rate": 0.00040641562226756043, + "loss": 3.2361, + "step": 55450 + }, + { + "epoch": 16.16662782568166, + "grad_norm": 0.3723169267177582, + "learning_rate": 0.0004062407461381521, + "loss": 3.2316, + "step": 55500 + }, + { + "epoch": 16.181193195059425, + "grad_norm": 0.3401069939136505, + "learning_rate": 0.00040606587000874375, + "loss": 3.2341, + "step": 55550 + }, + { + "epoch": 16.195758564437195, + "grad_norm": 0.3760260343551636, + "learning_rate": 0.00040589099387933544, + "loss": 3.2417, + "step": 55600 + }, + { + "epoch": 16.210323933814962, + "grad_norm": 0.39141103625297546, + "learning_rate": 0.0004057161177499271, + "loss": 3.2435, + "step": 55650 + }, + { + "epoch": 16.22488930319273, + "grad_norm": 0.36476460099220276, + "learning_rate": 0.0004055412416205187, + "loss": 3.2431, + "step": 55700 + }, + { + "epoch": 16.239454672570496, + "grad_norm": 0.391431599855423, + "learning_rate": 0.0004053663654911104, + "loss": 3.2551, + "step": 55750 + }, + { + "epoch": 16.254020041948262, + "grad_norm": 0.3892253041267395, + "learning_rate": 0.0004051914893617021, + "loss": 3.2572, + "step": 55800 + }, + { + "epoch": 16.268585411326033, + "grad_norm": 0.39836394786834717, + "learning_rate": 0.0004050166132322938, + "loss": 3.2445, + "step": 55850 + }, + { + "epoch": 16.2831507807038, + "grad_norm": 0.4005641043186188, + "learning_rate": 0.0004048417371028854, + "loss": 3.2473, + "step": 55900 + }, + { + "epoch": 16.297716150081566, + "grad_norm": 0.35440683364868164, + "learning_rate": 0.0004046668609734771, + "loss": 3.2538, + "step": 55950 + }, + { + "epoch": 16.312281519459333, + "grad_norm": 0.37304195761680603, + "learning_rate": 0.00040449198484406875, + "loss": 3.2631, + "step": 56000 + }, + { + "epoch": 16.312281519459333, + "eval_accuracy": 0.3721091822500366, + "eval_loss": 3.550828456878662, + "eval_runtime": 180.285, + "eval_samples_per_second": 92.326, + "eval_steps_per_second": 5.774, + "step": 56000 + }, + { + "epoch": 16.3268468888371, + "grad_norm": 0.4370158612728119, + "learning_rate": 0.0004043171087146604, + "loss": 3.2598, + "step": 56050 + }, + { + "epoch": 16.34141225821487, + "grad_norm": 0.40278568863868713, + "learning_rate": 0.0004041422325852521, + "loss": 3.2515, + "step": 56100 + }, + { + "epoch": 16.355977627592637, + "grad_norm": 0.36991235613822937, + "learning_rate": 0.0004039673564558437, + "loss": 3.2632, + "step": 56150 + }, + { + "epoch": 16.370542996970403, + "grad_norm": 0.35354509949684143, + "learning_rate": 0.0004037924803264354, + "loss": 3.265, + "step": 56200 + }, + { + "epoch": 16.38510836634817, + "grad_norm": 0.3710688650608063, + "learning_rate": 0.00040361760419702704, + "loss": 3.2659, + "step": 56250 + }, + { + "epoch": 16.399673735725937, + "grad_norm": 0.3730112910270691, + "learning_rate": 0.00040344272806761873, + "loss": 3.2531, + "step": 56300 + }, + { + "epoch": 16.414239105103704, + "grad_norm": 0.4005134403705597, + "learning_rate": 0.0004032678519382104, + "loss": 3.2692, + "step": 56350 + }, + { + "epoch": 16.428804474481474, + "grad_norm": 0.3491421937942505, + "learning_rate": 0.00040309297580880206, + "loss": 3.2599, + "step": 56400 + }, + { + "epoch": 16.44336984385924, + "grad_norm": 0.35259345173835754, + "learning_rate": 0.00040291809967939375, + "loss": 3.2715, + "step": 56450 + }, + { + "epoch": 16.457935213237008, + "grad_norm": 0.3865564167499542, + "learning_rate": 0.0004027432235499854, + "loss": 3.2702, + "step": 56500 + }, + { + "epoch": 16.472500582614774, + "grad_norm": 0.3678795099258423, + "learning_rate": 0.0004025683474205771, + "loss": 3.2733, + "step": 56550 + }, + { + "epoch": 16.48706595199254, + "grad_norm": 0.36398905515670776, + "learning_rate": 0.0004023934712911687, + "loss": 3.2619, + "step": 56600 + }, + { + "epoch": 16.50163132137031, + "grad_norm": 0.3686988949775696, + "learning_rate": 0.00040221859516176035, + "loss": 3.2789, + "step": 56650 + }, + { + "epoch": 16.516196690748078, + "grad_norm": 0.3928549587726593, + "learning_rate": 0.00040204371903235204, + "loss": 3.2703, + "step": 56700 + }, + { + "epoch": 16.530762060125845, + "grad_norm": 0.38307324051856995, + "learning_rate": 0.0004018688429029437, + "loss": 3.2629, + "step": 56750 + }, + { + "epoch": 16.54532742950361, + "grad_norm": 0.3929668366909027, + "learning_rate": 0.0004016939667735354, + "loss": 3.2684, + "step": 56800 + }, + { + "epoch": 16.55989279888138, + "grad_norm": 0.36345812678337097, + "learning_rate": 0.00040151909064412705, + "loss": 3.2756, + "step": 56850 + }, + { + "epoch": 16.57445816825915, + "grad_norm": 0.3730520009994507, + "learning_rate": 0.0004013442145147187, + "loss": 3.2684, + "step": 56900 + }, + { + "epoch": 16.589023537636916, + "grad_norm": 0.3729363977909088, + "learning_rate": 0.0004011693383853104, + "loss": 3.2854, + "step": 56950 + }, + { + "epoch": 16.603588907014682, + "grad_norm": 0.37262773513793945, + "learning_rate": 0.000400994462255902, + "loss": 3.2698, + "step": 57000 + }, + { + "epoch": 16.603588907014682, + "eval_accuracy": 0.3724431976834072, + "eval_loss": 3.54331111907959, + "eval_runtime": 180.2789, + "eval_samples_per_second": 92.329, + "eval_steps_per_second": 5.774, + "step": 57000 + }, + { + "epoch": 16.61815427639245, + "grad_norm": 0.3967365026473999, + "learning_rate": 0.0004008195861264937, + "loss": 3.2847, + "step": 57050 + }, + { + "epoch": 16.632719645770216, + "grad_norm": 0.3768179416656494, + "learning_rate": 0.00040064470999708534, + "loss": 3.2724, + "step": 57100 + }, + { + "epoch": 16.647285015147983, + "grad_norm": 0.3650953769683838, + "learning_rate": 0.00040046983386767703, + "loss": 3.2813, + "step": 57150 + }, + { + "epoch": 16.661850384525753, + "grad_norm": 0.35720765590667725, + "learning_rate": 0.00040029495773826867, + "loss": 3.2817, + "step": 57200 + }, + { + "epoch": 16.67641575390352, + "grad_norm": 0.39140841364860535, + "learning_rate": 0.0004001200816088603, + "loss": 3.2895, + "step": 57250 + }, + { + "epoch": 16.690981123281286, + "grad_norm": 0.3720303475856781, + "learning_rate": 0.00039994520547945205, + "loss": 3.2812, + "step": 57300 + }, + { + "epoch": 16.705546492659053, + "grad_norm": 0.37557291984558105, + "learning_rate": 0.0003997703293500437, + "loss": 3.2861, + "step": 57350 + }, + { + "epoch": 16.72011186203682, + "grad_norm": 0.36764585971832275, + "learning_rate": 0.0003995954532206354, + "loss": 3.2853, + "step": 57400 + }, + { + "epoch": 16.73467723141459, + "grad_norm": 0.38060253858566284, + "learning_rate": 0.000399420577091227, + "loss": 3.2809, + "step": 57450 + }, + { + "epoch": 16.749242600792357, + "grad_norm": 0.3591197729110718, + "learning_rate": 0.00039924570096181865, + "loss": 3.2897, + "step": 57500 + }, + { + "epoch": 16.763807970170124, + "grad_norm": 0.3743264079093933, + "learning_rate": 0.00039907082483241034, + "loss": 3.2838, + "step": 57550 + }, + { + "epoch": 16.77837333954789, + "grad_norm": 0.3858964145183563, + "learning_rate": 0.000398895948703002, + "loss": 3.2924, + "step": 57600 + }, + { + "epoch": 16.792938708925657, + "grad_norm": 0.372662216424942, + "learning_rate": 0.00039872107257359367, + "loss": 3.2809, + "step": 57650 + }, + { + "epoch": 16.807504078303424, + "grad_norm": 0.3536543846130371, + "learning_rate": 0.0003985461964441853, + "loss": 3.2989, + "step": 57700 + }, + { + "epoch": 16.822069447681194, + "grad_norm": 0.38918742537498474, + "learning_rate": 0.00039837132031477694, + "loss": 3.2845, + "step": 57750 + }, + { + "epoch": 16.83663481705896, + "grad_norm": 0.3828890323638916, + "learning_rate": 0.0003981964441853687, + "loss": 3.2902, + "step": 57800 + }, + { + "epoch": 16.851200186436728, + "grad_norm": 0.36972659826278687, + "learning_rate": 0.0003980215680559603, + "loss": 3.2828, + "step": 57850 + }, + { + "epoch": 16.865765555814495, + "grad_norm": 0.36914464831352234, + "learning_rate": 0.000397846691926552, + "loss": 3.2819, + "step": 57900 + }, + { + "epoch": 16.88033092519226, + "grad_norm": 0.41757625341415405, + "learning_rate": 0.00039767181579714365, + "loss": 3.2939, + "step": 57950 + }, + { + "epoch": 16.89489629457003, + "grad_norm": 0.3668311536312103, + "learning_rate": 0.00039749693966773534, + "loss": 3.2943, + "step": 58000 + }, + { + "epoch": 16.89489629457003, + "eval_accuracy": 0.37297237876950406, + "eval_loss": 3.535468101501465, + "eval_runtime": 180.2238, + "eval_samples_per_second": 92.357, + "eval_steps_per_second": 5.776, + "step": 58000 + }, + { + "epoch": 16.9094616639478, + "grad_norm": 0.3850747346878052, + "learning_rate": 0.00039732206353832697, + "loss": 3.3026, + "step": 58050 + }, + { + "epoch": 16.924027033325565, + "grad_norm": 0.3471890091896057, + "learning_rate": 0.0003971471874089186, + "loss": 3.3096, + "step": 58100 + }, + { + "epoch": 16.938592402703332, + "grad_norm": 0.371502548456192, + "learning_rate": 0.0003969723112795103, + "loss": 3.2999, + "step": 58150 + }, + { + "epoch": 16.9531577720811, + "grad_norm": 0.36119985580444336, + "learning_rate": 0.00039679743515010194, + "loss": 3.2996, + "step": 58200 + }, + { + "epoch": 16.96772314145887, + "grad_norm": 0.38665369153022766, + "learning_rate": 0.0003966225590206937, + "loss": 3.2906, + "step": 58250 + }, + { + "epoch": 16.982288510836636, + "grad_norm": 0.3815176784992218, + "learning_rate": 0.0003964476828912853, + "loss": 3.2889, + "step": 58300 + }, + { + "epoch": 16.996853880214402, + "grad_norm": 0.3887826204299927, + "learning_rate": 0.00039627280676187695, + "loss": 3.2914, + "step": 58350 + }, + { + "epoch": 17.01136098811466, + "grad_norm": 0.4290904402732849, + "learning_rate": 0.00039609793063246864, + "loss": 3.2189, + "step": 58400 + }, + { + "epoch": 17.025926357492427, + "grad_norm": 0.37529537081718445, + "learning_rate": 0.0003959230545030603, + "loss": 3.1944, + "step": 58450 + }, + { + "epoch": 17.040491726870194, + "grad_norm": 0.364271879196167, + "learning_rate": 0.00039574817837365197, + "loss": 3.1825, + "step": 58500 + }, + { + "epoch": 17.05505709624796, + "grad_norm": 0.37614959478378296, + "learning_rate": 0.0003955733022442436, + "loss": 3.2019, + "step": 58550 + }, + { + "epoch": 17.069622465625727, + "grad_norm": 0.3827800750732422, + "learning_rate": 0.0003953984261148353, + "loss": 3.211, + "step": 58600 + }, + { + "epoch": 17.084187835003497, + "grad_norm": 0.38689175248146057, + "learning_rate": 0.00039522354998542693, + "loss": 3.2085, + "step": 58650 + }, + { + "epoch": 17.098753204381264, + "grad_norm": 0.4140167534351349, + "learning_rate": 0.00039504867385601857, + "loss": 3.2028, + "step": 58700 + }, + { + "epoch": 17.11331857375903, + "grad_norm": 0.396028608083725, + "learning_rate": 0.0003948737977266103, + "loss": 3.2235, + "step": 58750 + }, + { + "epoch": 17.127883943136798, + "grad_norm": 0.37700241804122925, + "learning_rate": 0.00039469892159720195, + "loss": 3.192, + "step": 58800 + }, + { + "epoch": 17.142449312514564, + "grad_norm": 0.41701769828796387, + "learning_rate": 0.00039452404546779364, + "loss": 3.2142, + "step": 58850 + }, + { + "epoch": 17.15701468189233, + "grad_norm": 0.3866487741470337, + "learning_rate": 0.0003943491693383853, + "loss": 3.2286, + "step": 58900 + }, + { + "epoch": 17.1715800512701, + "grad_norm": 0.4011995494365692, + "learning_rate": 0.0003941742932089769, + "loss": 3.2263, + "step": 58950 + }, + { + "epoch": 17.18614542064787, + "grad_norm": 0.37379980087280273, + "learning_rate": 0.0003939994170795686, + "loss": 3.2083, + "step": 59000 + }, + { + "epoch": 17.18614542064787, + "eval_accuracy": 0.37220688264607005, + "eval_loss": 3.5509185791015625, + "eval_runtime": 180.2102, + "eval_samples_per_second": 92.364, + "eval_steps_per_second": 5.777, + "step": 59000 + }, + { + "epoch": 17.200710790025635, + "grad_norm": 0.38473978638648987, + "learning_rate": 0.00039382454095016024, + "loss": 3.224, + "step": 59050 + }, + { + "epoch": 17.215276159403402, + "grad_norm": 0.3696572184562683, + "learning_rate": 0.00039364966482075193, + "loss": 3.2346, + "step": 59100 + }, + { + "epoch": 17.22984152878117, + "grad_norm": 0.3958011269569397, + "learning_rate": 0.00039347478869134356, + "loss": 3.2397, + "step": 59150 + }, + { + "epoch": 17.24440689815894, + "grad_norm": 0.38348454236984253, + "learning_rate": 0.0003932999125619353, + "loss": 3.2252, + "step": 59200 + }, + { + "epoch": 17.258972267536706, + "grad_norm": 0.3725033402442932, + "learning_rate": 0.00039312503643252695, + "loss": 3.2384, + "step": 59250 + }, + { + "epoch": 17.273537636914472, + "grad_norm": 0.4087284803390503, + "learning_rate": 0.0003929501603031186, + "loss": 3.2321, + "step": 59300 + }, + { + "epoch": 17.28810300629224, + "grad_norm": 0.3712259531021118, + "learning_rate": 0.00039277528417371027, + "loss": 3.2538, + "step": 59350 + }, + { + "epoch": 17.302668375670006, + "grad_norm": 0.38870859146118164, + "learning_rate": 0.0003926004080443019, + "loss": 3.2529, + "step": 59400 + }, + { + "epoch": 17.317233745047773, + "grad_norm": 0.4209974706172943, + "learning_rate": 0.0003924255319148936, + "loss": 3.2472, + "step": 59450 + }, + { + "epoch": 17.331799114425543, + "grad_norm": 0.3777424097061157, + "learning_rate": 0.00039225065578548523, + "loss": 3.2365, + "step": 59500 + }, + { + "epoch": 17.34636448380331, + "grad_norm": 0.37616270780563354, + "learning_rate": 0.00039207577965607687, + "loss": 3.2432, + "step": 59550 + }, + { + "epoch": 17.360929853181077, + "grad_norm": 0.4067479968070984, + "learning_rate": 0.00039190090352666856, + "loss": 3.2597, + "step": 59600 + }, + { + "epoch": 17.375495222558843, + "grad_norm": 0.3870598375797272, + "learning_rate": 0.0003917260273972602, + "loss": 3.2533, + "step": 59650 + }, + { + "epoch": 17.39006059193661, + "grad_norm": 0.39382418990135193, + "learning_rate": 0.00039155115126785194, + "loss": 3.2419, + "step": 59700 + }, + { + "epoch": 17.40462596131438, + "grad_norm": 0.36751487851142883, + "learning_rate": 0.0003913762751384436, + "loss": 3.233, + "step": 59750 + }, + { + "epoch": 17.419191330692147, + "grad_norm": 0.3841138184070587, + "learning_rate": 0.00039120139900903527, + "loss": 3.2467, + "step": 59800 + }, + { + "epoch": 17.433756700069914, + "grad_norm": 0.37457749247550964, + "learning_rate": 0.0003910265228796269, + "loss": 3.2391, + "step": 59850 + }, + { + "epoch": 17.44832206944768, + "grad_norm": 0.3810558021068573, + "learning_rate": 0.00039085164675021854, + "loss": 3.2447, + "step": 59900 + }, + { + "epoch": 17.462887438825447, + "grad_norm": 0.3752453625202179, + "learning_rate": 0.00039067677062081023, + "loss": 3.2476, + "step": 59950 + }, + { + "epoch": 17.477452808203218, + "grad_norm": 0.3891676664352417, + "learning_rate": 0.00039050189449140187, + "loss": 3.2538, + "step": 60000 + }, + { + "epoch": 17.477452808203218, + "eval_accuracy": 0.3728713864227018, + "eval_loss": 3.544705629348755, + "eval_runtime": 180.1568, + "eval_samples_per_second": 92.392, + "eval_steps_per_second": 5.778, + "step": 60000 + }, + { + "epoch": 17.492018177580984, + "grad_norm": 0.38783255219459534, + "learning_rate": 0.00039032701836199356, + "loss": 3.2714, + "step": 60050 + }, + { + "epoch": 17.50658354695875, + "grad_norm": 0.36609965562820435, + "learning_rate": 0.0003901521422325852, + "loss": 3.2597, + "step": 60100 + }, + { + "epoch": 17.521148916336518, + "grad_norm": 0.38936126232147217, + "learning_rate": 0.00038997726610317683, + "loss": 3.2579, + "step": 60150 + }, + { + "epoch": 17.535714285714285, + "grad_norm": 0.3598592281341553, + "learning_rate": 0.0003898023899737686, + "loss": 3.2579, + "step": 60200 + }, + { + "epoch": 17.55027965509205, + "grad_norm": 0.37373456358909607, + "learning_rate": 0.0003896275138443602, + "loss": 3.2591, + "step": 60250 + }, + { + "epoch": 17.56484502446982, + "grad_norm": 0.3937729299068451, + "learning_rate": 0.0003894526377149519, + "loss": 3.2699, + "step": 60300 + }, + { + "epoch": 17.57941039384759, + "grad_norm": 0.434922993183136, + "learning_rate": 0.00038927776158554354, + "loss": 3.2521, + "step": 60350 + }, + { + "epoch": 17.593975763225355, + "grad_norm": 0.3775573670864105, + "learning_rate": 0.0003891028854561352, + "loss": 3.2672, + "step": 60400 + }, + { + "epoch": 17.608541132603122, + "grad_norm": 0.386683851480484, + "learning_rate": 0.00038892800932672686, + "loss": 3.2778, + "step": 60450 + }, + { + "epoch": 17.62310650198089, + "grad_norm": 0.3631393015384674, + "learning_rate": 0.0003887531331973185, + "loss": 3.2626, + "step": 60500 + }, + { + "epoch": 17.63767187135866, + "grad_norm": 0.3855550289154053, + "learning_rate": 0.0003885782570679102, + "loss": 3.2725, + "step": 60550 + }, + { + "epoch": 17.652237240736426, + "grad_norm": 0.41759514808654785, + "learning_rate": 0.0003884033809385018, + "loss": 3.2726, + "step": 60600 + }, + { + "epoch": 17.666802610114193, + "grad_norm": 0.37082067131996155, + "learning_rate": 0.00038822850480909357, + "loss": 3.2743, + "step": 60650 + }, + { + "epoch": 17.68136797949196, + "grad_norm": 0.3791321814060211, + "learning_rate": 0.0003880536286796852, + "loss": 3.2711, + "step": 60700 + }, + { + "epoch": 17.695933348869726, + "grad_norm": 0.3907877504825592, + "learning_rate": 0.00038787875255027684, + "loss": 3.2657, + "step": 60750 + }, + { + "epoch": 17.710498718247496, + "grad_norm": 0.3546347916126251, + "learning_rate": 0.00038770387642086853, + "loss": 3.2659, + "step": 60800 + }, + { + "epoch": 17.725064087625263, + "grad_norm": 0.393719881772995, + "learning_rate": 0.00038752900029146017, + "loss": 3.277, + "step": 60850 + }, + { + "epoch": 17.73962945700303, + "grad_norm": 0.3926842510700226, + "learning_rate": 0.00038735412416205186, + "loss": 3.2704, + "step": 60900 + }, + { + "epoch": 17.754194826380797, + "grad_norm": 0.3844298720359802, + "learning_rate": 0.0003871792480326435, + "loss": 3.271, + "step": 60950 + }, + { + "epoch": 17.768760195758563, + "grad_norm": 0.3942425549030304, + "learning_rate": 0.00038700437190323513, + "loss": 3.2726, + "step": 61000 + }, + { + "epoch": 17.768760195758563, + "eval_accuracy": 0.37308395238663494, + "eval_loss": 3.5373306274414062, + "eval_runtime": 180.211, + "eval_samples_per_second": 92.364, + "eval_steps_per_second": 5.777, + "step": 61000 + }, + { + "epoch": 17.78332556513633, + "grad_norm": 0.416614294052124, + "learning_rate": 0.0003868294957738268, + "loss": 3.2786, + "step": 61050 + }, + { + "epoch": 17.7978909345141, + "grad_norm": 0.3919004201889038, + "learning_rate": 0.00038665461964441846, + "loss": 3.2755, + "step": 61100 + }, + { + "epoch": 17.812456303891867, + "grad_norm": 0.3852967321872711, + "learning_rate": 0.0003864797435150102, + "loss": 3.2774, + "step": 61150 + }, + { + "epoch": 17.827021673269634, + "grad_norm": 0.4006671607494354, + "learning_rate": 0.00038630486738560184, + "loss": 3.2849, + "step": 61200 + }, + { + "epoch": 17.8415870426474, + "grad_norm": 0.3728589713573456, + "learning_rate": 0.00038612999125619353, + "loss": 3.2795, + "step": 61250 + }, + { + "epoch": 17.856152412025168, + "grad_norm": 0.38556793332099915, + "learning_rate": 0.00038595511512678517, + "loss": 3.2734, + "step": 61300 + }, + { + "epoch": 17.870717781402938, + "grad_norm": 0.3805278241634369, + "learning_rate": 0.0003857802389973768, + "loss": 3.2765, + "step": 61350 + }, + { + "epoch": 17.885283150780705, + "grad_norm": 0.3954722583293915, + "learning_rate": 0.0003856053628679685, + "loss": 3.296, + "step": 61400 + }, + { + "epoch": 17.89984852015847, + "grad_norm": 0.37796393036842346, + "learning_rate": 0.00038543048673856013, + "loss": 3.283, + "step": 61450 + }, + { + "epoch": 17.914413889536238, + "grad_norm": 0.35778191685676575, + "learning_rate": 0.0003852556106091518, + "loss": 3.2872, + "step": 61500 + }, + { + "epoch": 17.928979258914005, + "grad_norm": 0.3738497197628021, + "learning_rate": 0.00038508073447974346, + "loss": 3.2867, + "step": 61550 + }, + { + "epoch": 17.943544628291775, + "grad_norm": 0.3807421624660492, + "learning_rate": 0.0003849058583503351, + "loss": 3.2681, + "step": 61600 + }, + { + "epoch": 17.958109997669542, + "grad_norm": 0.3870691657066345, + "learning_rate": 0.00038473098222092684, + "loss": 3.2799, + "step": 61650 + }, + { + "epoch": 17.97267536704731, + "grad_norm": 0.3574218153953552, + "learning_rate": 0.0003845561060915185, + "loss": 3.2848, + "step": 61700 + }, + { + "epoch": 17.987240736425075, + "grad_norm": 0.37810570001602173, + "learning_rate": 0.00038438122996211016, + "loss": 3.2866, + "step": 61750 + }, + { + "epoch": 18.001747844325333, + "grad_norm": 0.42944374680519104, + "learning_rate": 0.0003842063538327018, + "loss": 3.272, + "step": 61800 + }, + { + "epoch": 18.0163132137031, + "grad_norm": 0.3770248293876648, + "learning_rate": 0.0003840314777032935, + "loss": 3.1656, + "step": 61850 + }, + { + "epoch": 18.030878583080867, + "grad_norm": 0.38981911540031433, + "learning_rate": 0.0003838566015738851, + "loss": 3.1825, + "step": 61900 + }, + { + "epoch": 18.045443952458633, + "grad_norm": 0.41429778933525085, + "learning_rate": 0.00038368172544447676, + "loss": 3.185, + "step": 61950 + }, + { + "epoch": 18.0600093218364, + "grad_norm": 0.3987773060798645, + "learning_rate": 0.00038350684931506845, + "loss": 3.1734, + "step": 62000 + }, + { + "epoch": 18.0600093218364, + "eval_accuracy": 0.37259674367284124, + "eval_loss": 3.5511462688446045, + "eval_runtime": 180.2134, + "eval_samples_per_second": 92.363, + "eval_steps_per_second": 5.776, + "step": 62000 + }, + { + "epoch": 18.07457469121417, + "grad_norm": 0.3764216899871826, + "learning_rate": 0.0003833319731856601, + "loss": 3.1908, + "step": 62050 + }, + { + "epoch": 18.089140060591937, + "grad_norm": 0.5347649455070496, + "learning_rate": 0.00038315709705625183, + "loss": 3.1914, + "step": 62100 + }, + { + "epoch": 18.103705429969704, + "grad_norm": 0.3660869300365448, + "learning_rate": 0.00038298222092684347, + "loss": 3.2015, + "step": 62150 + }, + { + "epoch": 18.11827079934747, + "grad_norm": 0.3871309161186218, + "learning_rate": 0.0003828073447974351, + "loss": 3.1985, + "step": 62200 + }, + { + "epoch": 18.132836168725238, + "grad_norm": 0.41517186164855957, + "learning_rate": 0.0003826324686680268, + "loss": 3.2121, + "step": 62250 + }, + { + "epoch": 18.147401538103008, + "grad_norm": 0.3585100471973419, + "learning_rate": 0.00038245759253861843, + "loss": 3.2091, + "step": 62300 + }, + { + "epoch": 18.161966907480775, + "grad_norm": 0.3985573351383209, + "learning_rate": 0.0003822827164092101, + "loss": 3.2183, + "step": 62350 + }, + { + "epoch": 18.17653227685854, + "grad_norm": 0.40456002950668335, + "learning_rate": 0.00038210784027980176, + "loss": 3.2072, + "step": 62400 + }, + { + "epoch": 18.191097646236308, + "grad_norm": 0.3861040472984314, + "learning_rate": 0.0003819329641503934, + "loss": 3.2051, + "step": 62450 + }, + { + "epoch": 18.205663015614075, + "grad_norm": 0.4029352366924286, + "learning_rate": 0.0003817580880209851, + "loss": 3.213, + "step": 62500 + }, + { + "epoch": 18.22022838499184, + "grad_norm": 0.36406001448631287, + "learning_rate": 0.0003815832118915767, + "loss": 3.2182, + "step": 62550 + }, + { + "epoch": 18.234793754369612, + "grad_norm": 0.38128963112831116, + "learning_rate": 0.00038140833576216847, + "loss": 3.2251, + "step": 62600 + }, + { + "epoch": 18.24935912374738, + "grad_norm": 0.3720012307167053, + "learning_rate": 0.0003812334596327601, + "loss": 3.2148, + "step": 62650 + }, + { + "epoch": 18.263924493125145, + "grad_norm": 0.38644638657569885, + "learning_rate": 0.0003810585835033518, + "loss": 3.2211, + "step": 62700 + }, + { + "epoch": 18.278489862502912, + "grad_norm": 0.36919474601745605, + "learning_rate": 0.00038088370737394343, + "loss": 3.2195, + "step": 62750 + }, + { + "epoch": 18.29305523188068, + "grad_norm": 0.3620702028274536, + "learning_rate": 0.00038070883124453507, + "loss": 3.2203, + "step": 62800 + }, + { + "epoch": 18.30762060125845, + "grad_norm": 0.3722609281539917, + "learning_rate": 0.00038053395511512676, + "loss": 3.2249, + "step": 62850 + }, + { + "epoch": 18.322185970636216, + "grad_norm": 0.3975137770175934, + "learning_rate": 0.0003803590789857184, + "loss": 3.2353, + "step": 62900 + }, + { + "epoch": 18.336751340013983, + "grad_norm": 0.37322407960891724, + "learning_rate": 0.0003801842028563101, + "loss": 3.2471, + "step": 62950 + }, + { + "epoch": 18.35131670939175, + "grad_norm": 0.3992110788822174, + "learning_rate": 0.0003800093267269017, + "loss": 3.2302, + "step": 63000 + }, + { + "epoch": 18.35131670939175, + "eval_accuracy": 0.3729018369673139, + "eval_loss": 3.544276714324951, + "eval_runtime": 180.3309, + "eval_samples_per_second": 92.303, + "eval_steps_per_second": 5.773, + "step": 63000 + }, + { + "epoch": 18.365882078769516, + "grad_norm": 0.37217405438423157, + "learning_rate": 0.00037983445059749335, + "loss": 3.2365, + "step": 63050 + }, + { + "epoch": 18.380447448147287, + "grad_norm": 0.38584333658218384, + "learning_rate": 0.0003796595744680851, + "loss": 3.2501, + "step": 63100 + }, + { + "epoch": 18.395012817525053, + "grad_norm": 0.3655035197734833, + "learning_rate": 0.00037948469833867674, + "loss": 3.2413, + "step": 63150 + }, + { + "epoch": 18.40957818690282, + "grad_norm": 0.3870564103126526, + "learning_rate": 0.0003793098222092684, + "loss": 3.2297, + "step": 63200 + }, + { + "epoch": 18.424143556280587, + "grad_norm": 0.4136461615562439, + "learning_rate": 0.00037913494607986006, + "loss": 3.2377, + "step": 63250 + }, + { + "epoch": 18.438708925658354, + "grad_norm": 0.39346110820770264, + "learning_rate": 0.00037896006995045175, + "loss": 3.2505, + "step": 63300 + }, + { + "epoch": 18.45327429503612, + "grad_norm": 0.3884267508983612, + "learning_rate": 0.0003787851938210434, + "loss": 3.254, + "step": 63350 + }, + { + "epoch": 18.46783966441389, + "grad_norm": 0.3669024705886841, + "learning_rate": 0.000378610317691635, + "loss": 3.2379, + "step": 63400 + }, + { + "epoch": 18.482405033791657, + "grad_norm": 0.38779205083847046, + "learning_rate": 0.0003784354415622267, + "loss": 3.2469, + "step": 63450 + }, + { + "epoch": 18.496970403169424, + "grad_norm": 0.3925683796405792, + "learning_rate": 0.00037826056543281835, + "loss": 3.2475, + "step": 63500 + }, + { + "epoch": 18.51153577254719, + "grad_norm": 0.3939642906188965, + "learning_rate": 0.0003780856893034101, + "loss": 3.2455, + "step": 63550 + }, + { + "epoch": 18.526101141924958, + "grad_norm": 0.36159586906433105, + "learning_rate": 0.00037791081317400173, + "loss": 3.2528, + "step": 63600 + }, + { + "epoch": 18.540666511302728, + "grad_norm": 0.40784958004951477, + "learning_rate": 0.00037773593704459337, + "loss": 3.2448, + "step": 63650 + }, + { + "epoch": 18.555231880680495, + "grad_norm": 0.3873050808906555, + "learning_rate": 0.00037756106091518506, + "loss": 3.2595, + "step": 63700 + }, + { + "epoch": 18.56979725005826, + "grad_norm": 0.4017407298088074, + "learning_rate": 0.0003773861847857767, + "loss": 3.2543, + "step": 63750 + }, + { + "epoch": 18.58436261943603, + "grad_norm": 0.4253888428211212, + "learning_rate": 0.0003772113086563684, + "loss": 3.2584, + "step": 63800 + }, + { + "epoch": 18.598927988813795, + "grad_norm": 0.3869839012622833, + "learning_rate": 0.00037703643252696, + "loss": 3.2514, + "step": 63850 + }, + { + "epoch": 18.613493358191565, + "grad_norm": 0.4242589771747589, + "learning_rate": 0.0003768615563975517, + "loss": 3.2537, + "step": 63900 + }, + { + "epoch": 18.628058727569332, + "grad_norm": 0.4035606384277344, + "learning_rate": 0.00037668668026814335, + "loss": 3.2601, + "step": 63950 + }, + { + "epoch": 18.6426240969471, + "grad_norm": 0.3770582675933838, + "learning_rate": 0.000376511804138735, + "loss": 3.2384, + "step": 64000 + }, + { + "epoch": 18.6426240969471, + "eval_accuracy": 0.37323761594573923, + "eval_loss": 3.538996934890747, + "eval_runtime": 180.1189, + "eval_samples_per_second": 92.411, + "eval_steps_per_second": 5.78, + "step": 64000 + }, + { + "epoch": 18.657189466324866, + "grad_norm": 0.3780357837677002, + "learning_rate": 0.00037633692800932673, + "loss": 3.2646, + "step": 64050 + }, + { + "epoch": 18.671754835702632, + "grad_norm": 0.4115695655345917, + "learning_rate": 0.00037616205187991837, + "loss": 3.2593, + "step": 64100 + }, + { + "epoch": 18.6863202050804, + "grad_norm": 0.37910208106040955, + "learning_rate": 0.00037598717575051006, + "loss": 3.2559, + "step": 64150 + }, + { + "epoch": 18.70088557445817, + "grad_norm": 0.40785324573516846, + "learning_rate": 0.0003758122996211017, + "loss": 3.2566, + "step": 64200 + }, + { + "epoch": 18.715450943835936, + "grad_norm": 0.3921259343624115, + "learning_rate": 0.00037563742349169333, + "loss": 3.2651, + "step": 64250 + }, + { + "epoch": 18.730016313213703, + "grad_norm": 0.3851292133331299, + "learning_rate": 0.000375462547362285, + "loss": 3.2619, + "step": 64300 + }, + { + "epoch": 18.74458168259147, + "grad_norm": 0.39476004242897034, + "learning_rate": 0.00037528767123287665, + "loss": 3.259, + "step": 64350 + }, + { + "epoch": 18.759147051969236, + "grad_norm": 0.3848125636577606, + "learning_rate": 0.00037511279510346834, + "loss": 3.2541, + "step": 64400 + }, + { + "epoch": 18.773712421347007, + "grad_norm": 0.3958471417427063, + "learning_rate": 0.00037493791897406, + "loss": 3.2738, + "step": 64450 + }, + { + "epoch": 18.788277790724774, + "grad_norm": 0.3779659867286682, + "learning_rate": 0.0003747630428446516, + "loss": 3.2601, + "step": 64500 + }, + { + "epoch": 18.80284316010254, + "grad_norm": 0.37562546133995056, + "learning_rate": 0.00037458816671524336, + "loss": 3.2631, + "step": 64550 + }, + { + "epoch": 18.817408529480307, + "grad_norm": 0.3678816556930542, + "learning_rate": 0.000374413290585835, + "loss": 3.2564, + "step": 64600 + }, + { + "epoch": 18.831973898858074, + "grad_norm": 0.3767731487751007, + "learning_rate": 0.0003742384144564267, + "loss": 3.2594, + "step": 64650 + }, + { + "epoch": 18.846539268235844, + "grad_norm": 0.3676002025604248, + "learning_rate": 0.0003740635383270183, + "loss": 3.2694, + "step": 64700 + }, + { + "epoch": 18.86110463761361, + "grad_norm": 0.3892149329185486, + "learning_rate": 0.00037388866219761, + "loss": 3.2799, + "step": 64750 + }, + { + "epoch": 18.875670006991378, + "grad_norm": 0.39518481492996216, + "learning_rate": 0.00037371378606820165, + "loss": 3.2705, + "step": 64800 + }, + { + "epoch": 18.890235376369144, + "grad_norm": 0.416162371635437, + "learning_rate": 0.0003735389099387933, + "loss": 3.2622, + "step": 64850 + }, + { + "epoch": 18.90480074574691, + "grad_norm": 0.3584899306297302, + "learning_rate": 0.000373364033809385, + "loss": 3.2664, + "step": 64900 + }, + { + "epoch": 18.919366115124678, + "grad_norm": 0.3977811634540558, + "learning_rate": 0.0003731891576799766, + "loss": 3.2779, + "step": 64950 + }, + { + "epoch": 18.93393148450245, + "grad_norm": 0.37734439969062805, + "learning_rate": 0.00037301428155056836, + "loss": 3.2662, + "step": 65000 + }, + { + "epoch": 18.93393148450245, + "eval_accuracy": 0.3737098933114027, + "eval_loss": 3.5310399532318115, + "eval_runtime": 180.1554, + "eval_samples_per_second": 92.392, + "eval_steps_per_second": 5.778, + "step": 65000 + }, + { + "epoch": 18.948496853880215, + "grad_norm": 0.38468220829963684, + "learning_rate": 0.00037283940542116, + "loss": 3.2678, + "step": 65050 + }, + { + "epoch": 18.96306222325798, + "grad_norm": 0.40911436080932617, + "learning_rate": 0.00037266452929175163, + "loss": 3.28, + "step": 65100 + }, + { + "epoch": 18.97762759263575, + "grad_norm": 0.3881330192089081, + "learning_rate": 0.0003724896531623433, + "loss": 3.2711, + "step": 65150 + }, + { + "epoch": 18.992192962013515, + "grad_norm": 0.3778620958328247, + "learning_rate": 0.00037231477703293496, + "loss": 3.284, + "step": 65200 + }, + { + "epoch": 19.006700069913773, + "grad_norm": 0.421502023935318, + "learning_rate": 0.00037213990090352665, + "loss": 3.2223, + "step": 65250 + }, + { + "epoch": 19.02126543929154, + "grad_norm": 0.38426366448402405, + "learning_rate": 0.0003719650247741183, + "loss": 3.1653, + "step": 65300 + }, + { + "epoch": 19.035830808669306, + "grad_norm": 0.3845437169075012, + "learning_rate": 0.00037179014864471, + "loss": 3.186, + "step": 65350 + }, + { + "epoch": 19.050396178047077, + "grad_norm": 0.38343438506126404, + "learning_rate": 0.0003716152725153016, + "loss": 3.1675, + "step": 65400 + }, + { + "epoch": 19.064961547424844, + "grad_norm": 0.41841766238212585, + "learning_rate": 0.00037144039638589325, + "loss": 3.168, + "step": 65450 + }, + { + "epoch": 19.07952691680261, + "grad_norm": 0.4013581871986389, + "learning_rate": 0.000371265520256485, + "loss": 3.1991, + "step": 65500 + }, + { + "epoch": 19.094092286180377, + "grad_norm": 0.3751412630081177, + "learning_rate": 0.00037109064412707663, + "loss": 3.194, + "step": 65550 + }, + { + "epoch": 19.108657655558144, + "grad_norm": 0.3733639717102051, + "learning_rate": 0.0003709157679976683, + "loss": 3.1903, + "step": 65600 + }, + { + "epoch": 19.123223024935914, + "grad_norm": 0.37872931361198425, + "learning_rate": 0.00037074089186825995, + "loss": 3.1894, + "step": 65650 + }, + { + "epoch": 19.13778839431368, + "grad_norm": 0.3800518810749054, + "learning_rate": 0.0003705660157388516, + "loss": 3.2034, + "step": 65700 + }, + { + "epoch": 19.152353763691448, + "grad_norm": 0.38501694798469543, + "learning_rate": 0.0003703911396094433, + "loss": 3.2032, + "step": 65750 + }, + { + "epoch": 19.166919133069214, + "grad_norm": 0.3732318580150604, + "learning_rate": 0.0003702162634800349, + "loss": 3.199, + "step": 65800 + }, + { + "epoch": 19.18148450244698, + "grad_norm": 0.39963558316230774, + "learning_rate": 0.0003700413873506266, + "loss": 3.1866, + "step": 65850 + }, + { + "epoch": 19.196049871824748, + "grad_norm": 0.39880403876304626, + "learning_rate": 0.00036986651122121824, + "loss": 3.2026, + "step": 65900 + }, + { + "epoch": 19.210615241202518, + "grad_norm": 0.3877936601638794, + "learning_rate": 0.00036969163509181, + "loss": 3.1999, + "step": 65950 + }, + { + "epoch": 19.225180610580285, + "grad_norm": 0.40352022647857666, + "learning_rate": 0.0003695167589624016, + "loss": 3.1971, + "step": 66000 + }, + { + "epoch": 19.225180610580285, + "eval_accuracy": 0.37293699029873867, + "eval_loss": 3.546077013015747, + "eval_runtime": 180.0981, + "eval_samples_per_second": 92.422, + "eval_steps_per_second": 5.78, + "step": 66000 + }, + { + "epoch": 19.23974597995805, + "grad_norm": 0.39953070878982544, + "learning_rate": 0.00036934188283299326, + "loss": 3.2065, + "step": 66050 + }, + { + "epoch": 19.25431134933582, + "grad_norm": 0.4177154004573822, + "learning_rate": 0.00036916700670358495, + "loss": 3.2163, + "step": 66100 + }, + { + "epoch": 19.268876718713585, + "grad_norm": 0.3941349387168884, + "learning_rate": 0.0003689921305741766, + "loss": 3.2171, + "step": 66150 + }, + { + "epoch": 19.283442088091356, + "grad_norm": 0.38486939668655396, + "learning_rate": 0.0003688172544447683, + "loss": 3.2216, + "step": 66200 + }, + { + "epoch": 19.298007457469122, + "grad_norm": 0.40340861678123474, + "learning_rate": 0.0003686423783153599, + "loss": 3.209, + "step": 66250 + }, + { + "epoch": 19.31257282684689, + "grad_norm": 0.4014577567577362, + "learning_rate": 0.00036846750218595155, + "loss": 3.2142, + "step": 66300 + }, + { + "epoch": 19.327138196224656, + "grad_norm": 0.37454017996788025, + "learning_rate": 0.00036829262605654324, + "loss": 3.2138, + "step": 66350 + }, + { + "epoch": 19.341703565602423, + "grad_norm": 0.40584367513656616, + "learning_rate": 0.0003681177499271349, + "loss": 3.2271, + "step": 66400 + }, + { + "epoch": 19.356268934980193, + "grad_norm": 0.46146562695503235, + "learning_rate": 0.0003679428737977266, + "loss": 3.224, + "step": 66450 + }, + { + "epoch": 19.37083430435796, + "grad_norm": 0.4199482798576355, + "learning_rate": 0.00036776799766831826, + "loss": 3.2247, + "step": 66500 + }, + { + "epoch": 19.385399673735726, + "grad_norm": 0.4005260467529297, + "learning_rate": 0.0003675931215389099, + "loss": 3.2315, + "step": 66550 + }, + { + "epoch": 19.399965043113493, + "grad_norm": 0.3830159902572632, + "learning_rate": 0.0003674182454095016, + "loss": 3.2393, + "step": 66600 + }, + { + "epoch": 19.41453041249126, + "grad_norm": 0.3859393298625946, + "learning_rate": 0.0003672433692800932, + "loss": 3.224, + "step": 66650 + }, + { + "epoch": 19.429095781869027, + "grad_norm": 0.3998680114746094, + "learning_rate": 0.0003670684931506849, + "loss": 3.2305, + "step": 66700 + }, + { + "epoch": 19.443661151246797, + "grad_norm": 0.4194350838661194, + "learning_rate": 0.00036689361702127655, + "loss": 3.2375, + "step": 66750 + }, + { + "epoch": 19.458226520624564, + "grad_norm": 0.41730183362960815, + "learning_rate": 0.00036671874089186824, + "loss": 3.2294, + "step": 66800 + }, + { + "epoch": 19.47279189000233, + "grad_norm": 0.4091123640537262, + "learning_rate": 0.00036654386476245987, + "loss": 3.2356, + "step": 66850 + }, + { + "epoch": 19.487357259380097, + "grad_norm": 0.37244871258735657, + "learning_rate": 0.0003663689886330515, + "loss": 3.2232, + "step": 66900 + }, + { + "epoch": 19.501922628757864, + "grad_norm": 0.38801804184913635, + "learning_rate": 0.00036619411250364325, + "loss": 3.2317, + "step": 66950 + }, + { + "epoch": 19.516487998135634, + "grad_norm": 0.3837634027004242, + "learning_rate": 0.0003660192363742349, + "loss": 3.233, + "step": 67000 + }, + { + "epoch": 19.516487998135634, + "eval_accuracy": 0.3734149110085773, + "eval_loss": 3.5416438579559326, + "eval_runtime": 180.1033, + "eval_samples_per_second": 92.419, + "eval_steps_per_second": 5.78, + "step": 67000 + }, + { + "epoch": 19.5310533675134, + "grad_norm": 0.38826125860214233, + "learning_rate": 0.0003658443602448266, + "loss": 3.2486, + "step": 67050 + }, + { + "epoch": 19.545618736891168, + "grad_norm": 0.37006881833076477, + "learning_rate": 0.0003656694841154182, + "loss": 3.2385, + "step": 67100 + }, + { + "epoch": 19.560184106268935, + "grad_norm": 0.3804363012313843, + "learning_rate": 0.00036549460798600985, + "loss": 3.2371, + "step": 67150 + }, + { + "epoch": 19.5747494756467, + "grad_norm": 0.3897872567176819, + "learning_rate": 0.00036531973185660154, + "loss": 3.24, + "step": 67200 + }, + { + "epoch": 19.589314845024468, + "grad_norm": 0.360208123922348, + "learning_rate": 0.0003651448557271932, + "loss": 3.2373, + "step": 67250 + }, + { + "epoch": 19.60388021440224, + "grad_norm": 0.3896685242652893, + "learning_rate": 0.00036496997959778487, + "loss": 3.2419, + "step": 67300 + }, + { + "epoch": 19.618445583780005, + "grad_norm": 0.3791717290878296, + "learning_rate": 0.0003647951034683765, + "loss": 3.2349, + "step": 67350 + }, + { + "epoch": 19.633010953157772, + "grad_norm": 0.40495508909225464, + "learning_rate": 0.00036462022733896825, + "loss": 3.25, + "step": 67400 + }, + { + "epoch": 19.64757632253554, + "grad_norm": 0.3995152711868286, + "learning_rate": 0.0003644453512095599, + "loss": 3.2518, + "step": 67450 + }, + { + "epoch": 19.662141691913305, + "grad_norm": 0.379250168800354, + "learning_rate": 0.0003642704750801515, + "loss": 3.2473, + "step": 67500 + }, + { + "epoch": 19.676707061291076, + "grad_norm": 0.37855687737464905, + "learning_rate": 0.0003640955989507432, + "loss": 3.2479, + "step": 67550 + }, + { + "epoch": 19.691272430668842, + "grad_norm": 0.37955552339553833, + "learning_rate": 0.00036392072282133485, + "loss": 3.2494, + "step": 67600 + }, + { + "epoch": 19.70583780004661, + "grad_norm": 0.4204575717449188, + "learning_rate": 0.00036374584669192654, + "loss": 3.2508, + "step": 67650 + }, + { + "epoch": 19.720403169424376, + "grad_norm": 0.3943386971950531, + "learning_rate": 0.0003635709705625182, + "loss": 3.2509, + "step": 67700 + }, + { + "epoch": 19.734968538802143, + "grad_norm": 0.41143128275871277, + "learning_rate": 0.0003633960944331098, + "loss": 3.2468, + "step": 67750 + }, + { + "epoch": 19.749533908179913, + "grad_norm": 0.4092387855052948, + "learning_rate": 0.0003632212183037015, + "loss": 3.2534, + "step": 67800 + }, + { + "epoch": 19.76409927755768, + "grad_norm": 0.42127180099487305, + "learning_rate": 0.00036304634217429314, + "loss": 3.2494, + "step": 67850 + }, + { + "epoch": 19.778664646935447, + "grad_norm": 0.3770343065261841, + "learning_rate": 0.0003628714660448849, + "loss": 3.2586, + "step": 67900 + }, + { + "epoch": 19.793230016313213, + "grad_norm": 0.41852423548698425, + "learning_rate": 0.0003626965899154765, + "loss": 3.2537, + "step": 67950 + }, + { + "epoch": 19.80779538569098, + "grad_norm": 0.4298582375049591, + "learning_rate": 0.0003625217137860682, + "loss": 3.2554, + "step": 68000 + }, + { + "epoch": 19.80779538569098, + "eval_accuracy": 0.37402227592543497, + "eval_loss": 3.5344889163970947, + "eval_runtime": 180.1298, + "eval_samples_per_second": 92.406, + "eval_steps_per_second": 5.779, + "step": 68000 + }, + { + "epoch": 19.822360755068747, + "grad_norm": 0.3977230191230774, + "learning_rate": 0.00036234683765665985, + "loss": 3.2665, + "step": 68050 + }, + { + "epoch": 19.836926124446517, + "grad_norm": 0.38390326499938965, + "learning_rate": 0.0003621719615272515, + "loss": 3.2562, + "step": 68100 + }, + { + "epoch": 19.851491493824284, + "grad_norm": 0.3879960775375366, + "learning_rate": 0.00036199708539784317, + "loss": 3.2422, + "step": 68150 + }, + { + "epoch": 19.86605686320205, + "grad_norm": 0.4085167646408081, + "learning_rate": 0.0003618222092684348, + "loss": 3.2567, + "step": 68200 + }, + { + "epoch": 19.880622232579817, + "grad_norm": 0.4254435896873474, + "learning_rate": 0.0003616473331390265, + "loss": 3.2562, + "step": 68250 + }, + { + "epoch": 19.895187601957584, + "grad_norm": 0.38896510004997253, + "learning_rate": 0.00036147245700961813, + "loss": 3.2491, + "step": 68300 + }, + { + "epoch": 19.909752971335354, + "grad_norm": 0.3670238256454468, + "learning_rate": 0.00036129758088020977, + "loss": 3.2599, + "step": 68350 + }, + { + "epoch": 19.92431834071312, + "grad_norm": 0.38788822293281555, + "learning_rate": 0.0003611227047508015, + "loss": 3.2658, + "step": 68400 + }, + { + "epoch": 19.938883710090888, + "grad_norm": 0.3823579251766205, + "learning_rate": 0.00036094782862139315, + "loss": 3.2518, + "step": 68450 + }, + { + "epoch": 19.953449079468655, + "grad_norm": 0.419264018535614, + "learning_rate": 0.00036077295249198484, + "loss": 3.265, + "step": 68500 + }, + { + "epoch": 19.96801444884642, + "grad_norm": 0.41140833497047424, + "learning_rate": 0.0003605980763625765, + "loss": 3.2592, + "step": 68550 + }, + { + "epoch": 19.982579818224192, + "grad_norm": 0.4099409878253937, + "learning_rate": 0.0003604232002331681, + "loss": 3.2693, + "step": 68600 + }, + { + "epoch": 19.99714518760196, + "grad_norm": 0.37093162536621094, + "learning_rate": 0.0003602483241037598, + "loss": 3.2544, + "step": 68650 + }, + { + "epoch": 20.011652295502213, + "grad_norm": 0.42578721046447754, + "learning_rate": 0.00036007344797435144, + "loss": 3.1794, + "step": 68700 + }, + { + "epoch": 20.026217664879983, + "grad_norm": 0.3819316625595093, + "learning_rate": 0.00035989857184494313, + "loss": 3.1559, + "step": 68750 + }, + { + "epoch": 20.04078303425775, + "grad_norm": 0.40451663732528687, + "learning_rate": 0.00035972369571553477, + "loss": 3.1716, + "step": 68800 + }, + { + "epoch": 20.055348403635517, + "grad_norm": 0.39993008971214294, + "learning_rate": 0.0003595488195861265, + "loss": 3.1791, + "step": 68850 + }, + { + "epoch": 20.069913773013283, + "grad_norm": 0.38017529249191284, + "learning_rate": 0.00035937394345671815, + "loss": 3.1703, + "step": 68900 + }, + { + "epoch": 20.08447914239105, + "grad_norm": 0.3801606297492981, + "learning_rate": 0.0003591990673273098, + "loss": 3.167, + "step": 68950 + }, + { + "epoch": 20.099044511768817, + "grad_norm": 0.4289296865463257, + "learning_rate": 0.0003590241911979015, + "loss": 3.1718, + "step": 69000 + }, + { + "epoch": 20.099044511768817, + "eval_accuracy": 0.37306749263279054, + "eval_loss": 3.5509676933288574, + "eval_runtime": 180.195, + "eval_samples_per_second": 92.372, + "eval_steps_per_second": 5.777, + "step": 69000 + }, + { + "epoch": 20.113609881146587, + "grad_norm": 0.4497009217739105, + "learning_rate": 0.0003588493150684931, + "loss": 3.1754, + "step": 69050 + }, + { + "epoch": 20.128175250524354, + "grad_norm": 0.39954957365989685, + "learning_rate": 0.0003586744389390848, + "loss": 3.1792, + "step": 69100 + }, + { + "epoch": 20.14274061990212, + "grad_norm": 0.4464070200920105, + "learning_rate": 0.00035849956280967644, + "loss": 3.1865, + "step": 69150 + }, + { + "epoch": 20.157305989279887, + "grad_norm": 0.42317837476730347, + "learning_rate": 0.0003583246866802681, + "loss": 3.1917, + "step": 69200 + }, + { + "epoch": 20.171871358657654, + "grad_norm": 0.3977357745170593, + "learning_rate": 0.00035814981055085976, + "loss": 3.1804, + "step": 69250 + }, + { + "epoch": 20.186436728035424, + "grad_norm": 0.39570218324661255, + "learning_rate": 0.0003579749344214514, + "loss": 3.2007, + "step": 69300 + }, + { + "epoch": 20.20100209741319, + "grad_norm": 0.3908149302005768, + "learning_rate": 0.00035780005829204315, + "loss": 3.1905, + "step": 69350 + }, + { + "epoch": 20.215567466790958, + "grad_norm": 0.41018813848495483, + "learning_rate": 0.0003576251821626348, + "loss": 3.1864, + "step": 69400 + }, + { + "epoch": 20.230132836168725, + "grad_norm": 0.42226818203926086, + "learning_rate": 0.00035745030603322647, + "loss": 3.2023, + "step": 69450 + }, + { + "epoch": 20.24469820554649, + "grad_norm": 0.40277135372161865, + "learning_rate": 0.0003572754299038181, + "loss": 3.1942, + "step": 69500 + }, + { + "epoch": 20.25926357492426, + "grad_norm": 0.4245125353336334, + "learning_rate": 0.00035710055377440974, + "loss": 3.1983, + "step": 69550 + }, + { + "epoch": 20.27382894430203, + "grad_norm": 0.3877376914024353, + "learning_rate": 0.00035692567764500143, + "loss": 3.1959, + "step": 69600 + }, + { + "epoch": 20.288394313679795, + "grad_norm": 0.40770015120506287, + "learning_rate": 0.00035675080151559307, + "loss": 3.2, + "step": 69650 + }, + { + "epoch": 20.302959683057562, + "grad_norm": 0.3683469891548157, + "learning_rate": 0.00035657592538618476, + "loss": 3.2096, + "step": 69700 + }, + { + "epoch": 20.31752505243533, + "grad_norm": 0.3850116431713104, + "learning_rate": 0.0003564010492567764, + "loss": 3.2068, + "step": 69750 + }, + { + "epoch": 20.332090421813096, + "grad_norm": 0.39738941192626953, + "learning_rate": 0.00035622617312736803, + "loss": 3.2022, + "step": 69800 + }, + { + "epoch": 20.346655791190866, + "grad_norm": 0.39597588777542114, + "learning_rate": 0.0003560512969979598, + "loss": 3.2073, + "step": 69850 + }, + { + "epoch": 20.361221160568633, + "grad_norm": 0.38561490178108215, + "learning_rate": 0.0003558764208685514, + "loss": 3.21, + "step": 69900 + }, + { + "epoch": 20.3757865299464, + "grad_norm": 0.39177340269088745, + "learning_rate": 0.0003557015447391431, + "loss": 3.2072, + "step": 69950 + }, + { + "epoch": 20.390351899324166, + "grad_norm": 0.37735575437545776, + "learning_rate": 0.00035552666860973474, + "loss": 3.2171, + "step": 70000 + }, + { + "epoch": 20.390351899324166, + "eval_accuracy": 0.3732674786419998, + "eval_loss": 3.5471997261047363, + "eval_runtime": 180.1445, + "eval_samples_per_second": 92.398, + "eval_steps_per_second": 5.779, + "step": 70000 + } + ], + "logging_steps": 50, + "max_steps": 171650, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 5 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.46313608822784e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}