diff --git "a/cost_to_carry_frequency_2128/checkpoint-40000/trainer_state.json" "b/cost_to_carry_frequency_2128/checkpoint-40000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/cost_to_carry_frequency_2128/checkpoint-40000/trainer_state.json" @@ -0,0 +1,6003 @@ +{ + "best_global_step": 40000, + "best_metric": 3.5568654537200928, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_carry_frequency_2128/checkpoint-40000", + "epoch": 11.651654625961314, + "eval_steps": 1000, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01456536937776742, + "grad_norm": 0.8238836526870728, + "learning_rate": 0.000294, + "loss": 8.4647, + "step": 50 + }, + { + "epoch": 0.02913073875553484, + "grad_norm": 0.8511333465576172, + "learning_rate": 0.0005939999999999999, + "loss": 6.7154, + "step": 100 + }, + { + "epoch": 0.04369610813330226, + "grad_norm": 0.5358927249908447, + "learning_rate": 0.0005998286213931798, + "loss": 6.3402, + "step": 150 + }, + { + "epoch": 0.05826147751106968, + "grad_norm": 0.393213152885437, + "learning_rate": 0.0005996537452637714, + "loss": 6.128, + "step": 200 + }, + { + "epoch": 0.0728268468888371, + "grad_norm": 0.44214680790901184, + "learning_rate": 0.0005994788691343632, + "loss": 5.9984, + "step": 250 + }, + { + "epoch": 0.08739221626660452, + "grad_norm": 0.39625659584999084, + "learning_rate": 0.0005993039930049548, + "loss": 5.8476, + "step": 300 + }, + { + "epoch": 0.10195758564437195, + "grad_norm": 0.47089600563049316, + "learning_rate": 0.0005991291168755465, + "loss": 5.717, + "step": 350 + }, + { + "epoch": 0.11652295502213936, + "grad_norm": 0.49012529850006104, + "learning_rate": 0.0005989542407461382, + "loss": 5.6004, + "step": 400 + }, + { + "epoch": 0.13108832439990678, + "grad_norm": 0.44449353218078613, + "learning_rate": 0.0005987793646167297, + "loss": 5.5036, + "step": 450 + }, + { + "epoch": 0.1456536937776742, + "grad_norm": 0.4581308662891388, + "learning_rate": 0.0005986044884873214, + "loss": 5.4036, + "step": 500 + }, + { + "epoch": 0.16021906315544163, + "grad_norm": 0.42073458433151245, + "learning_rate": 0.0005984296123579131, + "loss": 5.3157, + "step": 550 + }, + { + "epoch": 0.17478443253320905, + "grad_norm": 0.40689051151275635, + "learning_rate": 0.0005982547362285047, + "loss": 5.2389, + "step": 600 + }, + { + "epoch": 0.18934980191097647, + "grad_norm": 0.43460237979888916, + "learning_rate": 0.0005980798600990964, + "loss": 5.1757, + "step": 650 + }, + { + "epoch": 0.2039151712887439, + "grad_norm": 0.45589685440063477, + "learning_rate": 0.0005979049839696881, + "loss": 5.1372, + "step": 700 + }, + { + "epoch": 0.2184805406665113, + "grad_norm": 0.3859228491783142, + "learning_rate": 0.0005977301078402798, + "loss": 5.0648, + "step": 750 + }, + { + "epoch": 0.23304591004427871, + "grad_norm": 0.4326412081718445, + "learning_rate": 0.0005975552317108715, + "loss": 5.0189, + "step": 800 + }, + { + "epoch": 0.24761127942204614, + "grad_norm": 0.4279559552669525, + "learning_rate": 0.0005973803555814631, + "loss": 4.9551, + "step": 850 + }, + { + "epoch": 0.26217664879981356, + "grad_norm": 0.4713323414325714, + "learning_rate": 0.0005972054794520547, + "loss": 4.9013, + "step": 900 + }, + { + "epoch": 0.276742018177581, + "grad_norm": 0.43930691480636597, + "learning_rate": 0.0005970306033226464, + "loss": 4.8775, + "step": 950 + }, + { + "epoch": 0.2913073875553484, + "grad_norm": 0.4358115792274475, + "learning_rate": 0.0005968557271932381, + "loss": 4.8034, + "step": 1000 + }, + { + "epoch": 0.2913073875553484, + "eval_accuracy": 0.2553606134660685, + "eval_loss": 4.747249603271484, + "eval_runtime": 181.1133, + "eval_samples_per_second": 91.882, + "eval_steps_per_second": 5.748, + "step": 1000 + }, + { + "epoch": 0.30587275693311583, + "grad_norm": 0.4385921061038971, + "learning_rate": 0.0005966808510638297, + "loss": 4.764, + "step": 1050 + }, + { + "epoch": 0.32043812631088325, + "grad_norm": 0.4408250153064728, + "learning_rate": 0.0005965059749344214, + "loss": 4.7228, + "step": 1100 + }, + { + "epoch": 0.3350034956886507, + "grad_norm": 0.42670053243637085, + "learning_rate": 0.0005963310988050131, + "loss": 4.6831, + "step": 1150 + }, + { + "epoch": 0.3495688650664181, + "grad_norm": 0.45188194513320923, + "learning_rate": 0.0005961562226756047, + "loss": 4.6466, + "step": 1200 + }, + { + "epoch": 0.3641342344441855, + "grad_norm": 0.45061755180358887, + "learning_rate": 0.0005959813465461965, + "loss": 4.6151, + "step": 1250 + }, + { + "epoch": 0.37869960382195295, + "grad_norm": 0.4137357175350189, + "learning_rate": 0.000595806470416788, + "loss": 4.594, + "step": 1300 + }, + { + "epoch": 0.39326497319972037, + "grad_norm": 0.4820528030395508, + "learning_rate": 0.0005956315942873797, + "loss": 4.5574, + "step": 1350 + }, + { + "epoch": 0.4078303425774878, + "grad_norm": 0.4236258864402771, + "learning_rate": 0.0005954567181579714, + "loss": 4.547, + "step": 1400 + }, + { + "epoch": 0.42239571195525516, + "grad_norm": 0.46075788140296936, + "learning_rate": 0.000595281842028563, + "loss": 4.5196, + "step": 1450 + }, + { + "epoch": 0.4369610813330226, + "grad_norm": 0.4334660470485687, + "learning_rate": 0.0005951069658991547, + "loss": 4.4987, + "step": 1500 + }, + { + "epoch": 0.45152645071079, + "grad_norm": 0.4550553858280182, + "learning_rate": 0.0005949320897697464, + "loss": 4.4661, + "step": 1550 + }, + { + "epoch": 0.46609182008855743, + "grad_norm": 0.43172529339790344, + "learning_rate": 0.0005947572136403381, + "loss": 4.4598, + "step": 1600 + }, + { + "epoch": 0.48065718946632485, + "grad_norm": 0.43075188994407654, + "learning_rate": 0.0005945823375109297, + "loss": 4.4466, + "step": 1650 + }, + { + "epoch": 0.4952225588440923, + "grad_norm": 0.39069297909736633, + "learning_rate": 0.0005944074613815215, + "loss": 4.4363, + "step": 1700 + }, + { + "epoch": 0.5097879282218597, + "grad_norm": 0.4138490855693817, + "learning_rate": 0.000594232585252113, + "loss": 4.4134, + "step": 1750 + }, + { + "epoch": 0.5243532975996271, + "grad_norm": 0.39502963423728943, + "learning_rate": 0.0005940577091227047, + "loss": 4.3856, + "step": 1800 + }, + { + "epoch": 0.5389186669773945, + "grad_norm": 0.43347927927970886, + "learning_rate": 0.0005938828329932964, + "loss": 4.3706, + "step": 1850 + }, + { + "epoch": 0.553484036355162, + "grad_norm": 0.3977086842060089, + "learning_rate": 0.000593707956863888, + "loss": 4.3561, + "step": 1900 + }, + { + "epoch": 0.5680494057329294, + "grad_norm": 0.41190075874328613, + "learning_rate": 0.0005935330807344797, + "loss": 4.3474, + "step": 1950 + }, + { + "epoch": 0.5826147751106968, + "grad_norm": 0.46395954489707947, + "learning_rate": 0.0005933582046050714, + "loss": 4.3307, + "step": 2000 + }, + { + "epoch": 0.5826147751106968, + "eval_accuracy": 0.3005255098722875, + "eval_loss": 4.274885654449463, + "eval_runtime": 181.1908, + "eval_samples_per_second": 91.842, + "eval_steps_per_second": 5.745, + "step": 2000 + }, + { + "epoch": 0.5971801444884642, + "grad_norm": 0.38975879549980164, + "learning_rate": 0.000593183328475663, + "loss": 4.3244, + "step": 2050 + }, + { + "epoch": 0.6117455138662317, + "grad_norm": 0.43265265226364136, + "learning_rate": 0.0005930084523462546, + "loss": 4.3044, + "step": 2100 + }, + { + "epoch": 0.6263108832439991, + "grad_norm": 0.39545658230781555, + "learning_rate": 0.0005928335762168463, + "loss": 4.3117, + "step": 2150 + }, + { + "epoch": 0.6408762526217665, + "grad_norm": 0.3943400979042053, + "learning_rate": 0.000592658700087438, + "loss": 4.2746, + "step": 2200 + }, + { + "epoch": 0.6554416219995339, + "grad_norm": 0.38592591881752014, + "learning_rate": 0.0005924838239580297, + "loss": 4.2631, + "step": 2250 + }, + { + "epoch": 0.6700069913773014, + "grad_norm": 0.4045238792896271, + "learning_rate": 0.0005923089478286214, + "loss": 4.252, + "step": 2300 + }, + { + "epoch": 0.6845723607550688, + "grad_norm": 0.3675093650817871, + "learning_rate": 0.000592134071699213, + "loss": 4.2618, + "step": 2350 + }, + { + "epoch": 0.6991377301328362, + "grad_norm": 0.3889940679073334, + "learning_rate": 0.0005919591955698047, + "loss": 4.2367, + "step": 2400 + }, + { + "epoch": 0.7137030995106036, + "grad_norm": 0.3890587091445923, + "learning_rate": 0.0005917843194403964, + "loss": 4.2284, + "step": 2450 + }, + { + "epoch": 0.728268468888371, + "grad_norm": 0.3815591633319855, + "learning_rate": 0.000591609443310988, + "loss": 4.2156, + "step": 2500 + }, + { + "epoch": 0.7428338382661385, + "grad_norm": 0.37089216709136963, + "learning_rate": 0.0005914345671815796, + "loss": 4.212, + "step": 2550 + }, + { + "epoch": 0.7573992076439059, + "grad_norm": 0.3704032003879547, + "learning_rate": 0.0005912596910521713, + "loss": 4.2067, + "step": 2600 + }, + { + "epoch": 0.7719645770216733, + "grad_norm": 0.35531026124954224, + "learning_rate": 0.0005910848149227629, + "loss": 4.1983, + "step": 2650 + }, + { + "epoch": 0.7865299463994407, + "grad_norm": 0.3560468554496765, + "learning_rate": 0.0005909099387933547, + "loss": 4.1943, + "step": 2700 + }, + { + "epoch": 0.8010953157772082, + "grad_norm": 0.34763428568840027, + "learning_rate": 0.0005907350626639463, + "loss": 4.1821, + "step": 2750 + }, + { + "epoch": 0.8156606851549756, + "grad_norm": 0.3786865174770355, + "learning_rate": 0.000590560186534538, + "loss": 4.182, + "step": 2800 + }, + { + "epoch": 0.8302260545327429, + "grad_norm": 0.36594879627227783, + "learning_rate": 0.0005903853104051297, + "loss": 4.1746, + "step": 2850 + }, + { + "epoch": 0.8447914239105103, + "grad_norm": 0.3726920485496521, + "learning_rate": 0.0005902104342757214, + "loss": 4.1519, + "step": 2900 + }, + { + "epoch": 0.8593567932882777, + "grad_norm": 0.3717030882835388, + "learning_rate": 0.000590035558146313, + "loss": 4.1468, + "step": 2950 + }, + { + "epoch": 0.8739221626660452, + "grad_norm": 0.3748721480369568, + "learning_rate": 0.0005898606820169046, + "loss": 4.133, + "step": 3000 + }, + { + "epoch": 0.8739221626660452, + "eval_accuracy": 0.31594671449609696, + "eval_loss": 4.093606948852539, + "eval_runtime": 181.8584, + "eval_samples_per_second": 91.505, + "eval_steps_per_second": 5.724, + "step": 3000 + }, + { + "epoch": 0.8884875320438126, + "grad_norm": 0.3633127510547638, + "learning_rate": 0.0005896858058874963, + "loss": 4.1293, + "step": 3050 + }, + { + "epoch": 0.90305290142158, + "grad_norm": 0.3572661280632019, + "learning_rate": 0.0005895109297580879, + "loss": 4.1391, + "step": 3100 + }, + { + "epoch": 0.9176182707993474, + "grad_norm": 0.36137935519218445, + "learning_rate": 0.0005893360536286797, + "loss": 4.1105, + "step": 3150 + }, + { + "epoch": 0.9321836401771149, + "grad_norm": 0.32254934310913086, + "learning_rate": 0.0005891611774992713, + "loss": 4.1254, + "step": 3200 + }, + { + "epoch": 0.9467490095548823, + "grad_norm": 0.39124587178230286, + "learning_rate": 0.000588986301369863, + "loss": 4.1111, + "step": 3250 + }, + { + "epoch": 0.9613143789326497, + "grad_norm": 0.3515588343143463, + "learning_rate": 0.0005888114252404547, + "loss": 4.1108, + "step": 3300 + }, + { + "epoch": 0.9758797483104171, + "grad_norm": 0.34326955676078796, + "learning_rate": 0.0005886365491110463, + "loss": 4.093, + "step": 3350 + }, + { + "epoch": 0.9904451176881846, + "grad_norm": 0.355753630399704, + "learning_rate": 0.000588461672981638, + "loss": 4.0918, + "step": 3400 + }, + { + "epoch": 1.0049522255884409, + "grad_norm": 0.3507901728153229, + "learning_rate": 0.0005882867968522296, + "loss": 4.0691, + "step": 3450 + }, + { + "epoch": 1.0195175949662083, + "grad_norm": 0.3766329288482666, + "learning_rate": 0.0005881119207228212, + "loss": 4.0107, + "step": 3500 + }, + { + "epoch": 1.0340829643439757, + "grad_norm": 0.3579021692276001, + "learning_rate": 0.0005879370445934129, + "loss": 4.007, + "step": 3550 + }, + { + "epoch": 1.0486483337217432, + "grad_norm": 0.3645196855068207, + "learning_rate": 0.0005877621684640046, + "loss": 3.9979, + "step": 3600 + }, + { + "epoch": 1.0632137030995106, + "grad_norm": 0.3481312096118927, + "learning_rate": 0.0005875872923345963, + "loss": 4.0211, + "step": 3650 + }, + { + "epoch": 1.077779072477278, + "grad_norm": 0.3510258197784424, + "learning_rate": 0.000587412416205188, + "loss": 4.0166, + "step": 3700 + }, + { + "epoch": 1.0923444418550454, + "grad_norm": 0.3350948393344879, + "learning_rate": 0.0005872375400757797, + "loss": 4.0069, + "step": 3750 + }, + { + "epoch": 1.1069098112328128, + "grad_norm": 0.3625650405883789, + "learning_rate": 0.0005870626639463713, + "loss": 4.0113, + "step": 3800 + }, + { + "epoch": 1.1214751806105803, + "grad_norm": 0.3299196660518646, + "learning_rate": 0.0005868877878169629, + "loss": 4.0002, + "step": 3850 + }, + { + "epoch": 1.1360405499883477, + "grad_norm": 0.3324016034603119, + "learning_rate": 0.0005867129116875546, + "loss": 3.9859, + "step": 3900 + }, + { + "epoch": 1.1506059193661151, + "grad_norm": 0.3453924059867859, + "learning_rate": 0.0005865380355581462, + "loss": 3.9777, + "step": 3950 + }, + { + "epoch": 1.1651712887438825, + "grad_norm": 0.3689613342285156, + "learning_rate": 0.0005863631594287379, + "loss": 3.9994, + "step": 4000 + }, + { + "epoch": 1.1651712887438825, + "eval_accuracy": 0.3255163636932383, + "eval_loss": 3.986950635910034, + "eval_runtime": 181.1582, + "eval_samples_per_second": 91.859, + "eval_steps_per_second": 5.746, + "step": 4000 + }, + { + "epoch": 1.17973665812165, + "grad_norm": 0.33577021956443787, + "learning_rate": 0.0005861882832993296, + "loss": 3.9865, + "step": 4050 + }, + { + "epoch": 1.1943020274994174, + "grad_norm": 0.36526042222976685, + "learning_rate": 0.0005860134071699212, + "loss": 3.9788, + "step": 4100 + }, + { + "epoch": 1.2088673968771848, + "grad_norm": 0.38221442699432373, + "learning_rate": 0.000585838531040513, + "loss": 3.9812, + "step": 4150 + }, + { + "epoch": 1.2234327662549522, + "grad_norm": 0.33103591203689575, + "learning_rate": 0.0005856636549111046, + "loss": 3.9718, + "step": 4200 + }, + { + "epoch": 1.2379981356327197, + "grad_norm": 0.35662996768951416, + "learning_rate": 0.0005854887787816963, + "loss": 3.9494, + "step": 4250 + }, + { + "epoch": 1.252563505010487, + "grad_norm": 0.34380409121513367, + "learning_rate": 0.0005853139026522879, + "loss": 3.9743, + "step": 4300 + }, + { + "epoch": 1.2671288743882545, + "grad_norm": 0.37011435627937317, + "learning_rate": 0.0005851390265228796, + "loss": 3.9679, + "step": 4350 + }, + { + "epoch": 1.281694243766022, + "grad_norm": 0.3393860161304474, + "learning_rate": 0.0005849641503934712, + "loss": 3.9668, + "step": 4400 + }, + { + "epoch": 1.2962596131437893, + "grad_norm": 0.37732088565826416, + "learning_rate": 0.0005847892742640629, + "loss": 3.954, + "step": 4450 + }, + { + "epoch": 1.3108249825215568, + "grad_norm": 0.34702256321907043, + "learning_rate": 0.0005846143981346546, + "loss": 3.945, + "step": 4500 + }, + { + "epoch": 1.3253903518993242, + "grad_norm": 0.347993940114975, + "learning_rate": 0.0005844395220052462, + "loss": 3.9482, + "step": 4550 + }, + { + "epoch": 1.3399557212770916, + "grad_norm": 0.36191609501838684, + "learning_rate": 0.000584264645875838, + "loss": 3.9631, + "step": 4600 + }, + { + "epoch": 1.354521090654859, + "grad_norm": 0.3485741913318634, + "learning_rate": 0.0005840897697464296, + "loss": 3.9519, + "step": 4650 + }, + { + "epoch": 1.3690864600326265, + "grad_norm": 0.35182255506515503, + "learning_rate": 0.0005839148936170212, + "loss": 3.9376, + "step": 4700 + }, + { + "epoch": 1.3836518294103939, + "grad_norm": 0.3750380277633667, + "learning_rate": 0.0005837400174876129, + "loss": 3.9428, + "step": 4750 + }, + { + "epoch": 1.3982171987881613, + "grad_norm": 0.360505610704422, + "learning_rate": 0.0005835651413582045, + "loss": 3.9449, + "step": 4800 + }, + { + "epoch": 1.4127825681659287, + "grad_norm": 0.33969995379447937, + "learning_rate": 0.0005833902652287962, + "loss": 3.9371, + "step": 4850 + }, + { + "epoch": 1.4273479375436962, + "grad_norm": 0.35013002157211304, + "learning_rate": 0.0005832153890993879, + "loss": 3.9337, + "step": 4900 + }, + { + "epoch": 1.4419133069214636, + "grad_norm": 0.3549332916736603, + "learning_rate": 0.0005830405129699796, + "loss": 3.9288, + "step": 4950 + }, + { + "epoch": 1.456478676299231, + "grad_norm": 0.350067675113678, + "learning_rate": 0.0005828656368405712, + "loss": 3.9189, + "step": 5000 + }, + { + "epoch": 1.456478676299231, + "eval_accuracy": 0.3319192182183655, + "eval_loss": 3.9136765003204346, + "eval_runtime": 180.9327, + "eval_samples_per_second": 91.973, + "eval_steps_per_second": 5.754, + "step": 5000 + }, + { + "epoch": 1.4710440456769984, + "grad_norm": 0.3185882866382599, + "learning_rate": 0.0005826907607111629, + "loss": 3.9184, + "step": 5050 + }, + { + "epoch": 1.4856094150547658, + "grad_norm": 0.32343989610671997, + "learning_rate": 0.0005825158845817546, + "loss": 3.9226, + "step": 5100 + }, + { + "epoch": 1.500174784432533, + "grad_norm": 0.3393322825431824, + "learning_rate": 0.0005823410084523462, + "loss": 3.9186, + "step": 5150 + }, + { + "epoch": 1.5147401538103007, + "grad_norm": 0.33320391178131104, + "learning_rate": 0.0005821661323229379, + "loss": 3.9163, + "step": 5200 + }, + { + "epoch": 1.529305523188068, + "grad_norm": 0.32511457800865173, + "learning_rate": 0.0005819912561935295, + "loss": 3.9171, + "step": 5250 + }, + { + "epoch": 1.5438708925658355, + "grad_norm": 0.3350389897823334, + "learning_rate": 0.0005818163800641212, + "loss": 3.9138, + "step": 5300 + }, + { + "epoch": 1.5584362619436027, + "grad_norm": 0.33055445551872253, + "learning_rate": 0.0005816415039347129, + "loss": 3.9016, + "step": 5350 + }, + { + "epoch": 1.5730016313213704, + "grad_norm": 0.33436310291290283, + "learning_rate": 0.0005814666278053045, + "loss": 3.9029, + "step": 5400 + }, + { + "epoch": 1.5875670006991376, + "grad_norm": 0.3497842252254486, + "learning_rate": 0.0005812917516758962, + "loss": 3.9082, + "step": 5450 + }, + { + "epoch": 1.6021323700769052, + "grad_norm": 0.345002144575119, + "learning_rate": 0.0005811168755464879, + "loss": 3.9131, + "step": 5500 + }, + { + "epoch": 1.6166977394546724, + "grad_norm": 0.34684985876083374, + "learning_rate": 0.0005809419994170794, + "loss": 3.8984, + "step": 5550 + }, + { + "epoch": 1.63126310883244, + "grad_norm": 0.31695857644081116, + "learning_rate": 0.0005807671232876712, + "loss": 3.883, + "step": 5600 + }, + { + "epoch": 1.6458284782102073, + "grad_norm": 0.36025357246398926, + "learning_rate": 0.0005805922471582628, + "loss": 3.9014, + "step": 5650 + }, + { + "epoch": 1.660393847587975, + "grad_norm": 0.34583941102027893, + "learning_rate": 0.0005804173710288545, + "loss": 3.8942, + "step": 5700 + }, + { + "epoch": 1.6749592169657421, + "grad_norm": 0.33271774649620056, + "learning_rate": 0.0005802424948994462, + "loss": 3.8925, + "step": 5750 + }, + { + "epoch": 1.6895245863435098, + "grad_norm": 0.3309691548347473, + "learning_rate": 0.0005800676187700379, + "loss": 3.8858, + "step": 5800 + }, + { + "epoch": 1.704089955721277, + "grad_norm": 0.3122190833091736, + "learning_rate": 0.0005798927426406295, + "loss": 3.8657, + "step": 5850 + }, + { + "epoch": 1.7186553250990446, + "grad_norm": 0.3420509397983551, + "learning_rate": 0.0005797178665112212, + "loss": 3.8769, + "step": 5900 + }, + { + "epoch": 1.7332206944768118, + "grad_norm": 0.32377851009368896, + "learning_rate": 0.0005795429903818129, + "loss": 3.8841, + "step": 5950 + }, + { + "epoch": 1.7477860638545795, + "grad_norm": 0.31988197565078735, + "learning_rate": 0.0005793681142524044, + "loss": 3.873, + "step": 6000 + }, + { + "epoch": 1.7477860638545795, + "eval_accuracy": 0.33766140756961416, + "eval_loss": 3.8517301082611084, + "eval_runtime": 180.5226, + "eval_samples_per_second": 92.182, + "eval_steps_per_second": 5.767, + "step": 6000 + }, + { + "epoch": 1.7623514332323467, + "grad_norm": 0.3411223888397217, + "learning_rate": 0.0005791932381229961, + "loss": 3.8641, + "step": 6050 + }, + { + "epoch": 1.7769168026101143, + "grad_norm": 0.34019729495048523, + "learning_rate": 0.0005790183619935878, + "loss": 3.861, + "step": 6100 + }, + { + "epoch": 1.7914821719878815, + "grad_norm": 0.3440145254135132, + "learning_rate": 0.0005788434858641795, + "loss": 3.872, + "step": 6150 + }, + { + "epoch": 1.8060475413656492, + "grad_norm": 0.33574172854423523, + "learning_rate": 0.0005786686097347712, + "loss": 3.8661, + "step": 6200 + }, + { + "epoch": 1.8206129107434164, + "grad_norm": 0.3292637765407562, + "learning_rate": 0.0005784937336053628, + "loss": 3.8686, + "step": 6250 + }, + { + "epoch": 1.835178280121184, + "grad_norm": 0.3391307294368744, + "learning_rate": 0.0005783188574759545, + "loss": 3.8548, + "step": 6300 + }, + { + "epoch": 1.8497436494989512, + "grad_norm": 0.32597413659095764, + "learning_rate": 0.0005781439813465462, + "loss": 3.8698, + "step": 6350 + }, + { + "epoch": 1.8643090188767188, + "grad_norm": 0.3270658552646637, + "learning_rate": 0.0005779691052171379, + "loss": 3.8609, + "step": 6400 + }, + { + "epoch": 1.878874388254486, + "grad_norm": 0.3589136302471161, + "learning_rate": 0.0005777942290877294, + "loss": 3.8593, + "step": 6450 + }, + { + "epoch": 1.8934397576322537, + "grad_norm": 0.33831045031547546, + "learning_rate": 0.0005776193529583211, + "loss": 3.8622, + "step": 6500 + }, + { + "epoch": 1.908005127010021, + "grad_norm": 0.3330910801887512, + "learning_rate": 0.0005774444768289128, + "loss": 3.8481, + "step": 6550 + }, + { + "epoch": 1.9225704963877885, + "grad_norm": 0.3293820321559906, + "learning_rate": 0.0005772696006995045, + "loss": 3.8514, + "step": 6600 + }, + { + "epoch": 1.9371358657655557, + "grad_norm": 0.3244706094264984, + "learning_rate": 0.0005770947245700962, + "loss": 3.8415, + "step": 6650 + }, + { + "epoch": 1.9517012351433234, + "grad_norm": 0.3254324495792389, + "learning_rate": 0.0005769198484406878, + "loss": 3.8378, + "step": 6700 + }, + { + "epoch": 1.9662666045210906, + "grad_norm": 0.3237997889518738, + "learning_rate": 0.0005767449723112795, + "loss": 3.8317, + "step": 6750 + }, + { + "epoch": 1.9808319738988582, + "grad_norm": 0.3228047788143158, + "learning_rate": 0.0005765700961818712, + "loss": 3.8368, + "step": 6800 + }, + { + "epoch": 1.9953973432766254, + "grad_norm": 0.32064223289489746, + "learning_rate": 0.0005763952200524627, + "loss": 3.8394, + "step": 6850 + }, + { + "epoch": 2.0099044511768818, + "grad_norm": 0.3343140184879303, + "learning_rate": 0.0005762203439230544, + "loss": 3.777, + "step": 6900 + }, + { + "epoch": 2.0244698205546494, + "grad_norm": 0.3371165692806244, + "learning_rate": 0.0005760454677936461, + "loss": 3.7455, + "step": 6950 + }, + { + "epoch": 2.0390351899324166, + "grad_norm": 0.32197079062461853, + "learning_rate": 0.0005758705916642378, + "loss": 3.7513, + "step": 7000 + }, + { + "epoch": 2.0390351899324166, + "eval_accuracy": 0.34149063138446517, + "eval_loss": 3.810184955596924, + "eval_runtime": 180.5526, + "eval_samples_per_second": 92.167, + "eval_steps_per_second": 5.766, + "step": 7000 + }, + { + "epoch": 2.0536005593101843, + "grad_norm": 0.33044517040252686, + "learning_rate": 0.0005756957155348294, + "loss": 3.7375, + "step": 7050 + }, + { + "epoch": 2.0681659286879515, + "grad_norm": 0.3351292014122009, + "learning_rate": 0.0005755208394054211, + "loss": 3.7394, + "step": 7100 + }, + { + "epoch": 2.082731298065719, + "grad_norm": 0.3348188102245331, + "learning_rate": 0.0005753459632760128, + "loss": 3.7432, + "step": 7150 + }, + { + "epoch": 2.0972966674434863, + "grad_norm": 0.33414652943611145, + "learning_rate": 0.0005751710871466045, + "loss": 3.7535, + "step": 7200 + }, + { + "epoch": 2.111862036821254, + "grad_norm": 0.34815260767936707, + "learning_rate": 0.0005749962110171962, + "loss": 3.748, + "step": 7250 + }, + { + "epoch": 2.126427406199021, + "grad_norm": 0.3307594954967499, + "learning_rate": 0.0005748213348877877, + "loss": 3.7448, + "step": 7300 + }, + { + "epoch": 2.140992775576789, + "grad_norm": 0.34988632798194885, + "learning_rate": 0.0005746464587583794, + "loss": 3.7542, + "step": 7350 + }, + { + "epoch": 2.155558144954556, + "grad_norm": 0.3482917249202728, + "learning_rate": 0.0005744715826289711, + "loss": 3.7581, + "step": 7400 + }, + { + "epoch": 2.1701235143323236, + "grad_norm": 0.3216814696788788, + "learning_rate": 0.0005742967064995627, + "loss": 3.7627, + "step": 7450 + }, + { + "epoch": 2.184688883710091, + "grad_norm": 0.3261340856552124, + "learning_rate": 0.0005741218303701544, + "loss": 3.7539, + "step": 7500 + }, + { + "epoch": 2.1992542530878585, + "grad_norm": 0.32120323181152344, + "learning_rate": 0.0005739469542407461, + "loss": 3.7448, + "step": 7550 + }, + { + "epoch": 2.2138196224656257, + "grad_norm": 0.3328768312931061, + "learning_rate": 0.0005737720781113378, + "loss": 3.7542, + "step": 7600 + }, + { + "epoch": 2.2283849918433933, + "grad_norm": 0.32364141941070557, + "learning_rate": 0.0005735972019819295, + "loss": 3.7494, + "step": 7650 + }, + { + "epoch": 2.2429503612211605, + "grad_norm": 0.3354133367538452, + "learning_rate": 0.000573422325852521, + "loss": 3.7409, + "step": 7700 + }, + { + "epoch": 2.257515730598928, + "grad_norm": 0.3489600718021393, + "learning_rate": 0.0005732474497231127, + "loss": 3.7527, + "step": 7750 + }, + { + "epoch": 2.2720810999766954, + "grad_norm": 0.33028727769851685, + "learning_rate": 0.0005730725735937044, + "loss": 3.746, + "step": 7800 + }, + { + "epoch": 2.286646469354463, + "grad_norm": 0.3276793658733368, + "learning_rate": 0.0005728976974642961, + "loss": 3.7565, + "step": 7850 + }, + { + "epoch": 2.3012118387322302, + "grad_norm": 0.3518573045730591, + "learning_rate": 0.0005727228213348877, + "loss": 3.7492, + "step": 7900 + }, + { + "epoch": 2.3157772081099974, + "grad_norm": 0.31932392716407776, + "learning_rate": 0.0005725479452054794, + "loss": 3.7511, + "step": 7950 + }, + { + "epoch": 2.330342577487765, + "grad_norm": 0.306573748588562, + "learning_rate": 0.0005723730690760711, + "loss": 3.7592, + "step": 8000 + }, + { + "epoch": 2.330342577487765, + "eval_accuracy": 0.34500433995162727, + "eval_loss": 3.7794785499572754, + "eval_runtime": 180.6748, + "eval_samples_per_second": 92.105, + "eval_steps_per_second": 5.762, + "step": 8000 + }, + { + "epoch": 2.3449079468655327, + "grad_norm": 0.3530043959617615, + "learning_rate": 0.0005721981929466627, + "loss": 3.7412, + "step": 8050 + }, + { + "epoch": 2.3594733162433, + "grad_norm": 0.31992873549461365, + "learning_rate": 0.0005720233168172545, + "loss": 3.7336, + "step": 8100 + }, + { + "epoch": 2.374038685621067, + "grad_norm": 0.3247889578342438, + "learning_rate": 0.000571848440687846, + "loss": 3.7426, + "step": 8150 + }, + { + "epoch": 2.3886040549988348, + "grad_norm": 0.32873016595840454, + "learning_rate": 0.0005716735645584377, + "loss": 3.7411, + "step": 8200 + }, + { + "epoch": 2.4031694243766024, + "grad_norm": 0.34048840403556824, + "learning_rate": 0.0005714986884290294, + "loss": 3.7446, + "step": 8250 + }, + { + "epoch": 2.4177347937543696, + "grad_norm": 0.3255440890789032, + "learning_rate": 0.000571323812299621, + "loss": 3.7486, + "step": 8300 + }, + { + "epoch": 2.432300163132137, + "grad_norm": 0.3284962475299835, + "learning_rate": 0.0005711489361702127, + "loss": 3.7478, + "step": 8350 + }, + { + "epoch": 2.4468655325099045, + "grad_norm": 0.31856876611709595, + "learning_rate": 0.0005709740600408044, + "loss": 3.7488, + "step": 8400 + }, + { + "epoch": 2.461430901887672, + "grad_norm": 0.34519293904304504, + "learning_rate": 0.0005707991839113961, + "loss": 3.7563, + "step": 8450 + }, + { + "epoch": 2.4759962712654393, + "grad_norm": 0.32581546902656555, + "learning_rate": 0.0005706243077819877, + "loss": 3.7507, + "step": 8500 + }, + { + "epoch": 2.4905616406432065, + "grad_norm": 0.3163212537765503, + "learning_rate": 0.0005704494316525793, + "loss": 3.738, + "step": 8550 + }, + { + "epoch": 2.505127010020974, + "grad_norm": 0.3154032826423645, + "learning_rate": 0.000570274555523171, + "loss": 3.7477, + "step": 8600 + }, + { + "epoch": 2.519692379398742, + "grad_norm": 0.3356831967830658, + "learning_rate": 0.0005700996793937627, + "loss": 3.7488, + "step": 8650 + }, + { + "epoch": 2.534257748776509, + "grad_norm": 0.35420989990234375, + "learning_rate": 0.0005699248032643544, + "loss": 3.7355, + "step": 8700 + }, + { + "epoch": 2.548823118154276, + "grad_norm": 0.34032773971557617, + "learning_rate": 0.000569749927134946, + "loss": 3.7363, + "step": 8750 + }, + { + "epoch": 2.563388487532044, + "grad_norm": 0.3459492623806, + "learning_rate": 0.0005695750510055377, + "loss": 3.7408, + "step": 8800 + }, + { + "epoch": 2.5779538569098115, + "grad_norm": 0.31637832522392273, + "learning_rate": 0.0005694001748761294, + "loss": 3.739, + "step": 8850 + }, + { + "epoch": 2.5925192262875787, + "grad_norm": 0.34592801332473755, + "learning_rate": 0.000569225298746721, + "loss": 3.7549, + "step": 8900 + }, + { + "epoch": 2.607084595665346, + "grad_norm": 0.34058457612991333, + "learning_rate": 0.0005690504226173127, + "loss": 3.7433, + "step": 8950 + }, + { + "epoch": 2.6216499650431135, + "grad_norm": 0.3215969204902649, + "learning_rate": 0.0005688755464879043, + "loss": 3.7318, + "step": 9000 + }, + { + "epoch": 2.6216499650431135, + "eval_accuracy": 0.3473946354881625, + "eval_loss": 3.7510855197906494, + "eval_runtime": 180.685, + "eval_samples_per_second": 92.1, + "eval_steps_per_second": 5.761, + "step": 9000 + }, + { + "epoch": 2.636215334420881, + "grad_norm": 0.31068679690361023, + "learning_rate": 0.000568700670358496, + "loss": 3.7287, + "step": 9050 + }, + { + "epoch": 2.6507807037986484, + "grad_norm": 0.3394843637943268, + "learning_rate": 0.0005685257942290877, + "loss": 3.7376, + "step": 9100 + }, + { + "epoch": 2.6653460731764156, + "grad_norm": 0.31130099296569824, + "learning_rate": 0.0005683509180996793, + "loss": 3.7278, + "step": 9150 + }, + { + "epoch": 2.6799114425541832, + "grad_norm": 0.32781389355659485, + "learning_rate": 0.000568176041970271, + "loss": 3.7263, + "step": 9200 + }, + { + "epoch": 2.6944768119319504, + "grad_norm": 0.3248126208782196, + "learning_rate": 0.0005680011658408627, + "loss": 3.7386, + "step": 9250 + }, + { + "epoch": 2.709042181309718, + "grad_norm": 0.32771995663642883, + "learning_rate": 0.0005678262897114544, + "loss": 3.7293, + "step": 9300 + }, + { + "epoch": 2.7236075506874853, + "grad_norm": 0.3164122998714447, + "learning_rate": 0.000567651413582046, + "loss": 3.7312, + "step": 9350 + }, + { + "epoch": 2.738172920065253, + "grad_norm": 0.3185890018939972, + "learning_rate": 0.0005674765374526377, + "loss": 3.7251, + "step": 9400 + }, + { + "epoch": 2.75273828944302, + "grad_norm": 0.3235971927642822, + "learning_rate": 0.0005673016613232293, + "loss": 3.7237, + "step": 9450 + }, + { + "epoch": 2.7673036588207878, + "grad_norm": 0.3337843418121338, + "learning_rate": 0.0005671267851938209, + "loss": 3.7302, + "step": 9500 + }, + { + "epoch": 2.781869028198555, + "grad_norm": 0.32673659920692444, + "learning_rate": 0.0005669519090644127, + "loss": 3.7265, + "step": 9550 + }, + { + "epoch": 2.7964343975763226, + "grad_norm": 0.31469056010246277, + "learning_rate": 0.0005667770329350043, + "loss": 3.7272, + "step": 9600 + }, + { + "epoch": 2.81099976695409, + "grad_norm": 0.33078935742378235, + "learning_rate": 0.000566602156805596, + "loss": 3.7226, + "step": 9650 + }, + { + "epoch": 2.8255651363318575, + "grad_norm": 0.3159051239490509, + "learning_rate": 0.0005664272806761877, + "loss": 3.7284, + "step": 9700 + }, + { + "epoch": 2.8401305057096247, + "grad_norm": 0.32226207852363586, + "learning_rate": 0.0005662524045467793, + "loss": 3.7184, + "step": 9750 + }, + { + "epoch": 2.8546958750873923, + "grad_norm": 0.3349704444408417, + "learning_rate": 0.000566077528417371, + "loss": 3.7276, + "step": 9800 + }, + { + "epoch": 2.8692612444651595, + "grad_norm": 0.3188948333263397, + "learning_rate": 0.0005659026522879626, + "loss": 3.7167, + "step": 9850 + }, + { + "epoch": 2.883826613842927, + "grad_norm": 0.3187043368816376, + "learning_rate": 0.0005657277761585543, + "loss": 3.7229, + "step": 9900 + }, + { + "epoch": 2.8983919832206944, + "grad_norm": 0.3272295296192169, + "learning_rate": 0.0005655529000291459, + "loss": 3.7178, + "step": 9950 + }, + { + "epoch": 2.912957352598462, + "grad_norm": 0.33525437116622925, + "learning_rate": 0.0005653780238997376, + "loss": 3.7092, + "step": 10000 + }, + { + "epoch": 2.912957352598462, + "eval_accuracy": 0.3501877039368612, + "eval_loss": 3.723299741744995, + "eval_runtime": 181.1596, + "eval_samples_per_second": 91.858, + "eval_steps_per_second": 5.746, + "step": 10000 + }, + { + "epoch": 2.927522721976229, + "grad_norm": 0.3185238242149353, + "learning_rate": 0.0005652031477703293, + "loss": 3.721, + "step": 10050 + }, + { + "epoch": 2.942088091353997, + "grad_norm": 0.3339935541152954, + "learning_rate": 0.000565028271640921, + "loss": 3.7196, + "step": 10100 + }, + { + "epoch": 2.956653460731764, + "grad_norm": 0.3274375796318054, + "learning_rate": 0.0005648533955115127, + "loss": 3.7089, + "step": 10150 + }, + { + "epoch": 2.9712188301095317, + "grad_norm": 0.33105677366256714, + "learning_rate": 0.0005646785193821043, + "loss": 3.7093, + "step": 10200 + }, + { + "epoch": 2.985784199487299, + "grad_norm": 0.3180982768535614, + "learning_rate": 0.000564503643252696, + "loss": 3.7031, + "step": 10250 + }, + { + "epoch": 3.0002913073875552, + "grad_norm": 0.342707097530365, + "learning_rate": 0.0005643287671232876, + "loss": 3.7172, + "step": 10300 + }, + { + "epoch": 3.014856676765323, + "grad_norm": 0.32696056365966797, + "learning_rate": 0.0005641538909938792, + "loss": 3.6025, + "step": 10350 + }, + { + "epoch": 3.02942204614309, + "grad_norm": 0.33608219027519226, + "learning_rate": 0.0005639790148644709, + "loss": 3.614, + "step": 10400 + }, + { + "epoch": 3.0439874155208577, + "grad_norm": 0.31899672746658325, + "learning_rate": 0.0005638041387350626, + "loss": 3.6131, + "step": 10450 + }, + { + "epoch": 3.058552784898625, + "grad_norm": 0.3347124457359314, + "learning_rate": 0.0005636292626056543, + "loss": 3.6169, + "step": 10500 + }, + { + "epoch": 3.0731181542763926, + "grad_norm": 0.3240697383880615, + "learning_rate": 0.000563454386476246, + "loss": 3.6303, + "step": 10550 + }, + { + "epoch": 3.0876835236541598, + "grad_norm": 0.3579652011394501, + "learning_rate": 0.0005632795103468376, + "loss": 3.6228, + "step": 10600 + }, + { + "epoch": 3.1022488930319274, + "grad_norm": 0.3257977366447449, + "learning_rate": 0.0005631046342174293, + "loss": 3.6246, + "step": 10650 + }, + { + "epoch": 3.1168142624096946, + "grad_norm": 0.33490806818008423, + "learning_rate": 0.000562929758088021, + "loss": 3.6311, + "step": 10700 + }, + { + "epoch": 3.1313796317874623, + "grad_norm": 0.3260871469974518, + "learning_rate": 0.0005627548819586126, + "loss": 3.6349, + "step": 10750 + }, + { + "epoch": 3.1459450011652295, + "grad_norm": 0.34829798340797424, + "learning_rate": 0.0005625800058292042, + "loss": 3.6296, + "step": 10800 + }, + { + "epoch": 3.160510370542997, + "grad_norm": 0.39637139439582825, + "learning_rate": 0.0005624051296997959, + "loss": 3.6221, + "step": 10850 + }, + { + "epoch": 3.1750757399207643, + "grad_norm": 0.361925333738327, + "learning_rate": 0.0005622302535703876, + "loss": 3.6232, + "step": 10900 + }, + { + "epoch": 3.189641109298532, + "grad_norm": 0.3251033127307892, + "learning_rate": 0.0005620553774409792, + "loss": 3.6361, + "step": 10950 + }, + { + "epoch": 3.204206478676299, + "grad_norm": 0.32738086581230164, + "learning_rate": 0.000561880501311571, + "loss": 3.6303, + "step": 11000 + }, + { + "epoch": 3.204206478676299, + "eval_accuracy": 0.35179714921448696, + "eval_loss": 3.71109676361084, + "eval_runtime": 180.569, + "eval_samples_per_second": 92.159, + "eval_steps_per_second": 5.765, + "step": 11000 + }, + { + "epoch": 3.218771848054067, + "grad_norm": 0.3152799606323242, + "learning_rate": 0.0005617056251821626, + "loss": 3.6392, + "step": 11050 + }, + { + "epoch": 3.233337217431834, + "grad_norm": 0.31268447637557983, + "learning_rate": 0.0005615307490527543, + "loss": 3.6429, + "step": 11100 + }, + { + "epoch": 3.2479025868096016, + "grad_norm": 0.32059621810913086, + "learning_rate": 0.000561355872923346, + "loss": 3.6429, + "step": 11150 + }, + { + "epoch": 3.262467956187369, + "grad_norm": 0.33950409293174744, + "learning_rate": 0.0005611809967939375, + "loss": 3.6292, + "step": 11200 + }, + { + "epoch": 3.2770333255651365, + "grad_norm": 0.34160423278808594, + "learning_rate": 0.0005610061206645292, + "loss": 3.6465, + "step": 11250 + }, + { + "epoch": 3.2915986949429037, + "grad_norm": 0.32576170563697815, + "learning_rate": 0.0005608312445351209, + "loss": 3.6273, + "step": 11300 + }, + { + "epoch": 3.3061640643206713, + "grad_norm": 0.34006011486053467, + "learning_rate": 0.0005606563684057126, + "loss": 3.6418, + "step": 11350 + }, + { + "epoch": 3.3207294336984385, + "grad_norm": 0.3233993947505951, + "learning_rate": 0.0005604814922763042, + "loss": 3.6329, + "step": 11400 + }, + { + "epoch": 3.335294803076206, + "grad_norm": 0.33062100410461426, + "learning_rate": 0.0005603066161468959, + "loss": 3.6387, + "step": 11450 + }, + { + "epoch": 3.3498601724539734, + "grad_norm": 0.3202456533908844, + "learning_rate": 0.0005601317400174876, + "loss": 3.6509, + "step": 11500 + }, + { + "epoch": 3.364425541831741, + "grad_norm": 0.325444757938385, + "learning_rate": 0.0005599568638880793, + "loss": 3.6396, + "step": 11550 + }, + { + "epoch": 3.3789909112095082, + "grad_norm": 0.32571959495544434, + "learning_rate": 0.0005597819877586709, + "loss": 3.6497, + "step": 11600 + }, + { + "epoch": 3.393556280587276, + "grad_norm": 0.3135475218296051, + "learning_rate": 0.0005596071116292625, + "loss": 3.6388, + "step": 11650 + }, + { + "epoch": 3.408121649965043, + "grad_norm": 0.34956151247024536, + "learning_rate": 0.0005594322354998542, + "loss": 3.6315, + "step": 11700 + }, + { + "epoch": 3.4226870193428107, + "grad_norm": 0.3408454656600952, + "learning_rate": 0.0005592573593704459, + "loss": 3.639, + "step": 11750 + }, + { + "epoch": 3.437252388720578, + "grad_norm": 0.32927653193473816, + "learning_rate": 0.0005590824832410375, + "loss": 3.6389, + "step": 11800 + }, + { + "epoch": 3.4518177580983456, + "grad_norm": 0.331126868724823, + "learning_rate": 0.0005589076071116292, + "loss": 3.6399, + "step": 11850 + }, + { + "epoch": 3.4663831274761128, + "grad_norm": 0.32898014783859253, + "learning_rate": 0.0005587327309822209, + "loss": 3.6464, + "step": 11900 + }, + { + "epoch": 3.4809484968538804, + "grad_norm": 0.3311103582382202, + "learning_rate": 0.0005585578548528126, + "loss": 3.647, + "step": 11950 + }, + { + "epoch": 3.4955138662316476, + "grad_norm": 0.32251957058906555, + "learning_rate": 0.0005583829787234043, + "loss": 3.634, + "step": 12000 + }, + { + "epoch": 3.4955138662316476, + "eval_accuracy": 0.35319656458813503, + "eval_loss": 3.6934473514556885, + "eval_runtime": 180.9126, + "eval_samples_per_second": 91.984, + "eval_steps_per_second": 5.754, + "step": 12000 + }, + { + "epoch": 3.510079235609415, + "grad_norm": 0.31824731826782227, + "learning_rate": 0.0005582081025939958, + "loss": 3.6325, + "step": 12050 + }, + { + "epoch": 3.5246446049871825, + "grad_norm": 0.323544979095459, + "learning_rate": 0.0005580332264645875, + "loss": 3.6328, + "step": 12100 + }, + { + "epoch": 3.53920997436495, + "grad_norm": 0.3168325424194336, + "learning_rate": 0.0005578583503351792, + "loss": 3.6318, + "step": 12150 + }, + { + "epoch": 3.5537753437427173, + "grad_norm": 0.29865562915802, + "learning_rate": 0.0005576834742057709, + "loss": 3.6316, + "step": 12200 + }, + { + "epoch": 3.5683407131204845, + "grad_norm": 0.32099124789237976, + "learning_rate": 0.0005575085980763625, + "loss": 3.641, + "step": 12250 + }, + { + "epoch": 3.582906082498252, + "grad_norm": 0.31370314955711365, + "learning_rate": 0.0005573337219469542, + "loss": 3.6376, + "step": 12300 + }, + { + "epoch": 3.59747145187602, + "grad_norm": 0.34374183416366577, + "learning_rate": 0.0005571588458175459, + "loss": 3.6366, + "step": 12350 + }, + { + "epoch": 3.612036821253787, + "grad_norm": 0.32212623953819275, + "learning_rate": 0.0005569839696881374, + "loss": 3.6539, + "step": 12400 + }, + { + "epoch": 3.626602190631554, + "grad_norm": 0.3175927400588989, + "learning_rate": 0.0005568090935587292, + "loss": 3.6245, + "step": 12450 + }, + { + "epoch": 3.641167560009322, + "grad_norm": 0.32453852891921997, + "learning_rate": 0.0005566342174293208, + "loss": 3.632, + "step": 12500 + }, + { + "epoch": 3.6557329293870895, + "grad_norm": 0.3191271424293518, + "learning_rate": 0.0005564593412999125, + "loss": 3.6513, + "step": 12550 + }, + { + "epoch": 3.6702982987648567, + "grad_norm": 0.3355797827243805, + "learning_rate": 0.0005562844651705042, + "loss": 3.6365, + "step": 12600 + }, + { + "epoch": 3.684863668142624, + "grad_norm": 0.3287925124168396, + "learning_rate": 0.0005561095890410958, + "loss": 3.6323, + "step": 12650 + }, + { + "epoch": 3.6994290375203915, + "grad_norm": 0.3259026110172272, + "learning_rate": 0.0005559347129116875, + "loss": 3.6389, + "step": 12700 + }, + { + "epoch": 3.713994406898159, + "grad_norm": 0.3568135201931, + "learning_rate": 0.0005557598367822792, + "loss": 3.6406, + "step": 12750 + }, + { + "epoch": 3.7285597762759264, + "grad_norm": 0.3320167362689972, + "learning_rate": 0.0005555849606528709, + "loss": 3.6365, + "step": 12800 + }, + { + "epoch": 3.7431251456536936, + "grad_norm": 0.31746959686279297, + "learning_rate": 0.0005554100845234624, + "loss": 3.635, + "step": 12850 + }, + { + "epoch": 3.7576905150314612, + "grad_norm": 0.32215115427970886, + "learning_rate": 0.0005552352083940541, + "loss": 3.6346, + "step": 12900 + }, + { + "epoch": 3.772255884409229, + "grad_norm": 0.3303179442882538, + "learning_rate": 0.0005550603322646458, + "loss": 3.6321, + "step": 12950 + }, + { + "epoch": 3.786821253786996, + "grad_norm": 0.3192589581012726, + "learning_rate": 0.0005548854561352375, + "loss": 3.6417, + "step": 13000 + }, + { + "epoch": 3.786821253786996, + "eval_accuracy": 0.3552138394889382, + "eval_loss": 3.6750006675720215, + "eval_runtime": 180.7632, + "eval_samples_per_second": 92.06, + "eval_steps_per_second": 5.759, + "step": 13000 + }, + { + "epoch": 3.8013866231647633, + "grad_norm": 0.32486605644226074, + "learning_rate": 0.0005547105800058292, + "loss": 3.6201, + "step": 13050 + }, + { + "epoch": 3.815951992542531, + "grad_norm": 0.32540053129196167, + "learning_rate": 0.0005545357038764208, + "loss": 3.6341, + "step": 13100 + }, + { + "epoch": 3.8305173619202986, + "grad_norm": 0.31797096133232117, + "learning_rate": 0.0005543608277470125, + "loss": 3.6461, + "step": 13150 + }, + { + "epoch": 3.8450827312980658, + "grad_norm": 0.3220556676387787, + "learning_rate": 0.0005541859516176042, + "loss": 3.6295, + "step": 13200 + }, + { + "epoch": 3.859648100675833, + "grad_norm": 0.3416280746459961, + "learning_rate": 0.0005540110754881958, + "loss": 3.6403, + "step": 13250 + }, + { + "epoch": 3.8742134700536006, + "grad_norm": 0.31618985533714294, + "learning_rate": 0.0005538361993587874, + "loss": 3.6357, + "step": 13300 + }, + { + "epoch": 3.888778839431368, + "grad_norm": 0.32899636030197144, + "learning_rate": 0.0005536613232293791, + "loss": 3.6354, + "step": 13350 + }, + { + "epoch": 3.9033442088091355, + "grad_norm": 0.3490321636199951, + "learning_rate": 0.0005534864470999708, + "loss": 3.6209, + "step": 13400 + }, + { + "epoch": 3.9179095781869027, + "grad_norm": 0.31270232796669006, + "learning_rate": 0.0005533115709705625, + "loss": 3.6345, + "step": 13450 + }, + { + "epoch": 3.9324749475646703, + "grad_norm": 0.3164960443973541, + "learning_rate": 0.0005531366948411541, + "loss": 3.6336, + "step": 13500 + }, + { + "epoch": 3.9470403169424375, + "grad_norm": 0.31384697556495667, + "learning_rate": 0.0005529618187117458, + "loss": 3.6306, + "step": 13550 + }, + { + "epoch": 3.961605686320205, + "grad_norm": 0.3088199198246002, + "learning_rate": 0.0005527869425823375, + "loss": 3.6351, + "step": 13600 + }, + { + "epoch": 3.9761710556979724, + "grad_norm": 0.3234373927116394, + "learning_rate": 0.0005526120664529292, + "loss": 3.6327, + "step": 13650 + }, + { + "epoch": 3.99073642507574, + "grad_norm": 0.3382308781147003, + "learning_rate": 0.0005524371903235207, + "loss": 3.6321, + "step": 13700 + }, + { + "epoch": 4.005243532975996, + "grad_norm": 0.32010650634765625, + "learning_rate": 0.0005522623141941124, + "loss": 3.5813, + "step": 13750 + }, + { + "epoch": 4.0198089023537635, + "grad_norm": 0.3284594714641571, + "learning_rate": 0.0005520874380647041, + "loss": 3.5195, + "step": 13800 + }, + { + "epoch": 4.034374271731531, + "grad_norm": 0.3116910457611084, + "learning_rate": 0.0005519125619352957, + "loss": 3.5323, + "step": 13850 + }, + { + "epoch": 4.048939641109299, + "grad_norm": 0.30614858865737915, + "learning_rate": 0.0005517376858058875, + "loss": 3.526, + "step": 13900 + }, + { + "epoch": 4.063505010487066, + "grad_norm": 0.30232539772987366, + "learning_rate": 0.0005515628096764791, + "loss": 3.5272, + "step": 13950 + }, + { + "epoch": 4.078070379864833, + "grad_norm": 0.34007346630096436, + "learning_rate": 0.0005513879335470708, + "loss": 3.5387, + "step": 14000 + }, + { + "epoch": 4.078070379864833, + "eval_accuracy": 0.35625963788539633, + "eval_loss": 3.6710259914398193, + "eval_runtime": 180.6469, + "eval_samples_per_second": 92.119, + "eval_steps_per_second": 5.763, + "step": 14000 + }, + { + "epoch": 4.092635749242601, + "grad_norm": 0.31514182686805725, + "learning_rate": 0.0005512130574176625, + "loss": 3.5323, + "step": 14050 + }, + { + "epoch": 4.1072011186203685, + "grad_norm": 0.31488853693008423, + "learning_rate": 0.000551038181288254, + "loss": 3.5381, + "step": 14100 + }, + { + "epoch": 4.121766487998135, + "grad_norm": 0.3174729347229004, + "learning_rate": 0.0005508633051588457, + "loss": 3.5473, + "step": 14150 + }, + { + "epoch": 4.136331857375903, + "grad_norm": 0.3296089470386505, + "learning_rate": 0.0005506884290294374, + "loss": 3.532, + "step": 14200 + }, + { + "epoch": 4.150897226753671, + "grad_norm": 0.3251158893108368, + "learning_rate": 0.0005505135529000291, + "loss": 3.5579, + "step": 14250 + }, + { + "epoch": 4.165462596131438, + "grad_norm": 0.3330046236515045, + "learning_rate": 0.0005503386767706207, + "loss": 3.5448, + "step": 14300 + }, + { + "epoch": 4.180027965509205, + "grad_norm": 0.329488605260849, + "learning_rate": 0.0005501638006412124, + "loss": 3.5567, + "step": 14350 + }, + { + "epoch": 4.194593334886973, + "grad_norm": 0.32068127393722534, + "learning_rate": 0.0005499889245118041, + "loss": 3.5561, + "step": 14400 + }, + { + "epoch": 4.20915870426474, + "grad_norm": 0.3509705662727356, + "learning_rate": 0.0005498140483823958, + "loss": 3.5629, + "step": 14450 + }, + { + "epoch": 4.223724073642508, + "grad_norm": 0.33269885182380676, + "learning_rate": 0.0005496391722529875, + "loss": 3.5629, + "step": 14500 + }, + { + "epoch": 4.238289443020275, + "grad_norm": 0.33287978172302246, + "learning_rate": 0.000549464296123579, + "loss": 3.5532, + "step": 14550 + }, + { + "epoch": 4.252854812398042, + "grad_norm": 0.3189695477485657, + "learning_rate": 0.0005492894199941707, + "loss": 3.5571, + "step": 14600 + }, + { + "epoch": 4.26742018177581, + "grad_norm": 0.33141258358955383, + "learning_rate": 0.0005491145438647624, + "loss": 3.5586, + "step": 14650 + }, + { + "epoch": 4.281985551153578, + "grad_norm": 0.319828063249588, + "learning_rate": 0.000548939667735354, + "loss": 3.5612, + "step": 14700 + }, + { + "epoch": 4.296550920531344, + "grad_norm": 0.334999144077301, + "learning_rate": 0.0005487647916059457, + "loss": 3.553, + "step": 14750 + }, + { + "epoch": 4.311116289909112, + "grad_norm": 0.3195878565311432, + "learning_rate": 0.0005485899154765374, + "loss": 3.5709, + "step": 14800 + }, + { + "epoch": 4.32568165928688, + "grad_norm": 0.3178456723690033, + "learning_rate": 0.0005484150393471291, + "loss": 3.5663, + "step": 14850 + }, + { + "epoch": 4.340247028664647, + "grad_norm": 0.3257606327533722, + "learning_rate": 0.0005482401632177208, + "loss": 3.5596, + "step": 14900 + }, + { + "epoch": 4.354812398042414, + "grad_norm": 0.3173132836818695, + "learning_rate": 0.0005480652870883124, + "loss": 3.5624, + "step": 14950 + }, + { + "epoch": 4.369377767420182, + "grad_norm": 0.3216117322444916, + "learning_rate": 0.000547890410958904, + "loss": 3.5645, + "step": 15000 + }, + { + "epoch": 4.369377767420182, + "eval_accuracy": 0.35712233630397466, + "eval_loss": 3.6592366695404053, + "eval_runtime": 180.8389, + "eval_samples_per_second": 92.021, + "eval_steps_per_second": 5.757, + "step": 15000 + }, + { + "epoch": 4.383943136797949, + "grad_norm": 0.3293343186378479, + "learning_rate": 0.0005477155348294957, + "loss": 3.5799, + "step": 15050 + }, + { + "epoch": 4.398508506175717, + "grad_norm": 0.3351048231124878, + "learning_rate": 0.0005475406587000874, + "loss": 3.5695, + "step": 15100 + }, + { + "epoch": 4.413073875553484, + "grad_norm": 0.3400932252407074, + "learning_rate": 0.000547365782570679, + "loss": 3.5654, + "step": 15150 + }, + { + "epoch": 4.427639244931251, + "grad_norm": 0.3267037272453308, + "learning_rate": 0.0005471909064412707, + "loss": 3.5663, + "step": 15200 + }, + { + "epoch": 4.442204614309019, + "grad_norm": 0.32394009828567505, + "learning_rate": 0.0005470160303118624, + "loss": 3.5739, + "step": 15250 + }, + { + "epoch": 4.456769983686787, + "grad_norm": 0.3179683983325958, + "learning_rate": 0.000546841154182454, + "loss": 3.5629, + "step": 15300 + }, + { + "epoch": 4.471335353064553, + "grad_norm": 0.33053240180015564, + "learning_rate": 0.0005466662780530458, + "loss": 3.5767, + "step": 15350 + }, + { + "epoch": 4.485900722442321, + "grad_norm": 0.34838345646858215, + "learning_rate": 0.0005464914019236374, + "loss": 3.5734, + "step": 15400 + }, + { + "epoch": 4.500466091820089, + "grad_norm": 0.34467417001724243, + "learning_rate": 0.000546316525794229, + "loss": 3.5659, + "step": 15450 + }, + { + "epoch": 4.515031461197856, + "grad_norm": 0.317242830991745, + "learning_rate": 0.0005461416496648207, + "loss": 3.5814, + "step": 15500 + }, + { + "epoch": 4.529596830575623, + "grad_norm": 0.3329235911369324, + "learning_rate": 0.0005459667735354123, + "loss": 3.5601, + "step": 15550 + }, + { + "epoch": 4.544162199953391, + "grad_norm": 0.3190889358520508, + "learning_rate": 0.000545791897406004, + "loss": 3.5674, + "step": 15600 + }, + { + "epoch": 4.558727569331158, + "grad_norm": 0.34190186858177185, + "learning_rate": 0.0005456170212765957, + "loss": 3.5614, + "step": 15650 + }, + { + "epoch": 4.573292938708926, + "grad_norm": 0.3235403895378113, + "learning_rate": 0.0005454421451471874, + "loss": 3.5748, + "step": 15700 + }, + { + "epoch": 4.587858308086693, + "grad_norm": 0.32054969668388367, + "learning_rate": 0.000545267269017779, + "loss": 3.5636, + "step": 15750 + }, + { + "epoch": 4.6024236774644605, + "grad_norm": 0.33121681213378906, + "learning_rate": 0.0005450923928883708, + "loss": 3.5788, + "step": 15800 + }, + { + "epoch": 4.616989046842228, + "grad_norm": 0.3060116469860077, + "learning_rate": 0.0005449175167589623, + "loss": 3.5646, + "step": 15850 + }, + { + "epoch": 4.631554416219995, + "grad_norm": 0.31824249029159546, + "learning_rate": 0.000544742640629554, + "loss": 3.5675, + "step": 15900 + }, + { + "epoch": 4.6461197855977625, + "grad_norm": 0.3265272080898285, + "learning_rate": 0.0005445677645001457, + "loss": 3.5737, + "step": 15950 + }, + { + "epoch": 4.66068515497553, + "grad_norm": 0.3331158757209778, + "learning_rate": 0.0005443928883707373, + "loss": 3.5712, + "step": 16000 + }, + { + "epoch": 4.66068515497553, + "eval_accuracy": 0.35869673739829394, + "eval_loss": 3.6470656394958496, + "eval_runtime": 180.6963, + "eval_samples_per_second": 92.094, + "eval_steps_per_second": 5.761, + "step": 16000 + }, + { + "epoch": 4.675250524353298, + "grad_norm": 0.34401053190231323, + "learning_rate": 0.000544218012241329, + "loss": 3.5747, + "step": 16050 + }, + { + "epoch": 4.689815893731065, + "grad_norm": 0.33307909965515137, + "learning_rate": 0.0005440431361119207, + "loss": 3.5658, + "step": 16100 + }, + { + "epoch": 4.704381263108832, + "grad_norm": 0.34673160314559937, + "learning_rate": 0.0005438682599825123, + "loss": 3.5709, + "step": 16150 + }, + { + "epoch": 4.7189466324866, + "grad_norm": 0.3231510818004608, + "learning_rate": 0.000543693383853104, + "loss": 3.5722, + "step": 16200 + }, + { + "epoch": 4.7335120018643675, + "grad_norm": 0.3428361117839813, + "learning_rate": 0.0005435185077236957, + "loss": 3.5723, + "step": 16250 + }, + { + "epoch": 4.748077371242134, + "grad_norm": 0.328531414270401, + "learning_rate": 0.0005433436315942873, + "loss": 3.5885, + "step": 16300 + }, + { + "epoch": 4.762642740619902, + "grad_norm": 0.32748499512672424, + "learning_rate": 0.000543168755464879, + "loss": 3.5731, + "step": 16350 + }, + { + "epoch": 4.7772081099976695, + "grad_norm": 0.32989397644996643, + "learning_rate": 0.0005429938793354706, + "loss": 3.5731, + "step": 16400 + }, + { + "epoch": 4.791773479375437, + "grad_norm": 0.3433181345462799, + "learning_rate": 0.0005428190032060623, + "loss": 3.5764, + "step": 16450 + }, + { + "epoch": 4.806338848753205, + "grad_norm": 0.3318285644054413, + "learning_rate": 0.000542644127076654, + "loss": 3.5753, + "step": 16500 + }, + { + "epoch": 4.820904218130972, + "grad_norm": 0.3202374279499054, + "learning_rate": 0.0005424692509472457, + "loss": 3.5821, + "step": 16550 + }, + { + "epoch": 4.835469587508739, + "grad_norm": 0.32062003016471863, + "learning_rate": 0.0005422943748178373, + "loss": 3.5817, + "step": 16600 + }, + { + "epoch": 4.850034956886507, + "grad_norm": 0.33390170335769653, + "learning_rate": 0.000542119498688429, + "loss": 3.5706, + "step": 16650 + }, + { + "epoch": 4.864600326264274, + "grad_norm": 0.31805121898651123, + "learning_rate": 0.0005419446225590207, + "loss": 3.5652, + "step": 16700 + }, + { + "epoch": 4.879165695642041, + "grad_norm": 0.31974926590919495, + "learning_rate": 0.0005417697464296122, + "loss": 3.5732, + "step": 16750 + }, + { + "epoch": 4.893731065019809, + "grad_norm": 0.3215559124946594, + "learning_rate": 0.000541594870300204, + "loss": 3.5705, + "step": 16800 + }, + { + "epoch": 4.908296434397577, + "grad_norm": 0.31701287627220154, + "learning_rate": 0.0005414199941707956, + "loss": 3.5707, + "step": 16850 + }, + { + "epoch": 4.922861803775344, + "grad_norm": 0.29899007081985474, + "learning_rate": 0.0005412451180413873, + "loss": 3.5722, + "step": 16900 + }, + { + "epoch": 4.937427173153111, + "grad_norm": 0.3220870792865753, + "learning_rate": 0.000541070241911979, + "loss": 3.5804, + "step": 16950 + }, + { + "epoch": 4.951992542530879, + "grad_norm": 0.31385698914527893, + "learning_rate": 0.0005408953657825706, + "loss": 3.5678, + "step": 17000 + }, + { + "epoch": 4.951992542530879, + "eval_accuracy": 0.35995856319318836, + "eval_loss": 3.631695508956909, + "eval_runtime": 180.6546, + "eval_samples_per_second": 92.115, + "eval_steps_per_second": 5.762, + "step": 17000 + }, + { + "epoch": 4.966557911908646, + "grad_norm": 0.30999556183815, + "learning_rate": 0.0005407204896531623, + "loss": 3.5824, + "step": 17050 + }, + { + "epoch": 4.981123281286413, + "grad_norm": 0.31185469031333923, + "learning_rate": 0.000540545613523754, + "loss": 3.5701, + "step": 17100 + }, + { + "epoch": 4.995688650664181, + "grad_norm": 0.31638017296791077, + "learning_rate": 0.0005403707373943456, + "loss": 3.5758, + "step": 17150 + }, + { + "epoch": 5.010195758564437, + "grad_norm": 0.3234923779964447, + "learning_rate": 0.0005401958612649372, + "loss": 3.4972, + "step": 17200 + }, + { + "epoch": 5.024761127942204, + "grad_norm": 0.3230464458465576, + "learning_rate": 0.000540020985135529, + "loss": 3.4543, + "step": 17250 + }, + { + "epoch": 5.039326497319972, + "grad_norm": 0.32958754897117615, + "learning_rate": 0.0005398461090061206, + "loss": 3.4646, + "step": 17300 + }, + { + "epoch": 5.0538918666977395, + "grad_norm": 0.3155558705329895, + "learning_rate": 0.0005396712328767123, + "loss": 3.4774, + "step": 17350 + }, + { + "epoch": 5.068457236075507, + "grad_norm": 0.3168497681617737, + "learning_rate": 0.000539496356747304, + "loss": 3.4642, + "step": 17400 + }, + { + "epoch": 5.083022605453274, + "grad_norm": 0.3253559172153473, + "learning_rate": 0.0005393214806178956, + "loss": 3.4729, + "step": 17450 + }, + { + "epoch": 5.0975879748310415, + "grad_norm": 0.32378828525543213, + "learning_rate": 0.0005391466044884873, + "loss": 3.4818, + "step": 17500 + }, + { + "epoch": 5.112153344208809, + "grad_norm": 0.3335542380809784, + "learning_rate": 0.000538971728359079, + "loss": 3.4832, + "step": 17550 + }, + { + "epoch": 5.126718713586577, + "grad_norm": 0.3030463755130768, + "learning_rate": 0.0005387968522296705, + "loss": 3.4849, + "step": 17600 + }, + { + "epoch": 5.141284082964344, + "grad_norm": 0.31400778889656067, + "learning_rate": 0.0005386219761002622, + "loss": 3.4826, + "step": 17650 + }, + { + "epoch": 5.155849452342111, + "grad_norm": 0.3250598907470703, + "learning_rate": 0.0005384470999708539, + "loss": 3.4884, + "step": 17700 + }, + { + "epoch": 5.170414821719879, + "grad_norm": 0.345520555973053, + "learning_rate": 0.0005382722238414456, + "loss": 3.4921, + "step": 17750 + }, + { + "epoch": 5.1849801910976465, + "grad_norm": 0.3516606390476227, + "learning_rate": 0.0005380973477120373, + "loss": 3.4985, + "step": 17800 + }, + { + "epoch": 5.199545560475413, + "grad_norm": 0.352478563785553, + "learning_rate": 0.000537922471582629, + "loss": 3.4826, + "step": 17850 + }, + { + "epoch": 5.214110929853181, + "grad_norm": 0.35319584608078003, + "learning_rate": 0.0005377475954532206, + "loss": 3.4955, + "step": 17900 + }, + { + "epoch": 5.228676299230949, + "grad_norm": 0.3407022953033447, + "learning_rate": 0.0005375727193238123, + "loss": 3.5012, + "step": 17950 + }, + { + "epoch": 5.243241668608716, + "grad_norm": 0.35426145792007446, + "learning_rate": 0.000537397843194404, + "loss": 3.5042, + "step": 18000 + }, + { + "epoch": 5.243241668608716, + "eval_accuracy": 0.3603563969922683, + "eval_loss": 3.6395750045776367, + "eval_runtime": 180.8321, + "eval_samples_per_second": 92.025, + "eval_steps_per_second": 5.757, + "step": 18000 + }, + { + "epoch": 5.257807037986483, + "grad_norm": 0.35406365990638733, + "learning_rate": 0.0005372229670649955, + "loss": 3.5002, + "step": 18050 + }, + { + "epoch": 5.272372407364251, + "grad_norm": 0.31914931535720825, + "learning_rate": 0.0005370480909355872, + "loss": 3.5036, + "step": 18100 + }, + { + "epoch": 5.286937776742018, + "grad_norm": 0.3213772773742676, + "learning_rate": 0.0005368732148061789, + "loss": 3.5054, + "step": 18150 + }, + { + "epoch": 5.301503146119786, + "grad_norm": 0.34711068868637085, + "learning_rate": 0.0005366983386767705, + "loss": 3.5062, + "step": 18200 + }, + { + "epoch": 5.316068515497553, + "grad_norm": 0.31905606389045715, + "learning_rate": 0.0005365234625473623, + "loss": 3.5136, + "step": 18250 + }, + { + "epoch": 5.33063388487532, + "grad_norm": 0.34507957100868225, + "learning_rate": 0.0005363485864179539, + "loss": 3.5083, + "step": 18300 + }, + { + "epoch": 5.345199254253088, + "grad_norm": 0.33473140001296997, + "learning_rate": 0.0005361737102885456, + "loss": 3.5181, + "step": 18350 + }, + { + "epoch": 5.359764623630856, + "grad_norm": 0.32312649488449097, + "learning_rate": 0.0005359988341591373, + "loss": 3.5086, + "step": 18400 + }, + { + "epoch": 5.374329993008622, + "grad_norm": 0.3478247821331024, + "learning_rate": 0.000535823958029729, + "loss": 3.5049, + "step": 18450 + }, + { + "epoch": 5.38889536238639, + "grad_norm": 0.3395753502845764, + "learning_rate": 0.0005356490819003205, + "loss": 3.5144, + "step": 18500 + }, + { + "epoch": 5.403460731764158, + "grad_norm": 0.35865122079849243, + "learning_rate": 0.0005354742057709122, + "loss": 3.5177, + "step": 18550 + }, + { + "epoch": 5.418026101141925, + "grad_norm": 0.3185734748840332, + "learning_rate": 0.0005352993296415039, + "loss": 3.5112, + "step": 18600 + }, + { + "epoch": 5.432591470519692, + "grad_norm": 0.35112476348876953, + "learning_rate": 0.0005351244535120955, + "loss": 3.5156, + "step": 18650 + }, + { + "epoch": 5.44715683989746, + "grad_norm": 0.3340792953968048, + "learning_rate": 0.0005349495773826873, + "loss": 3.5167, + "step": 18700 + }, + { + "epoch": 5.461722209275227, + "grad_norm": 0.3263596296310425, + "learning_rate": 0.0005347747012532789, + "loss": 3.4987, + "step": 18750 + }, + { + "epoch": 5.476287578652995, + "grad_norm": 0.3156237304210663, + "learning_rate": 0.0005345998251238706, + "loss": 3.5255, + "step": 18800 + }, + { + "epoch": 5.490852948030762, + "grad_norm": 0.34373199939727783, + "learning_rate": 0.0005344249489944623, + "loss": 3.5092, + "step": 18850 + }, + { + "epoch": 5.505418317408529, + "grad_norm": 0.3335568606853485, + "learning_rate": 0.0005342500728650538, + "loss": 3.5306, + "step": 18900 + }, + { + "epoch": 5.519983686786297, + "grad_norm": 0.34739938378334045, + "learning_rate": 0.0005340751967356455, + "loss": 3.5186, + "step": 18950 + }, + { + "epoch": 5.534549056164065, + "grad_norm": 0.3066945970058441, + "learning_rate": 0.0005339003206062372, + "loss": 3.5238, + "step": 19000 + }, + { + "epoch": 5.534549056164065, + "eval_accuracy": 0.3612617834596394, + "eval_loss": 3.6238057613372803, + "eval_runtime": 181.0108, + "eval_samples_per_second": 91.934, + "eval_steps_per_second": 5.751, + "step": 19000 + }, + { + "epoch": 5.549114425541831, + "grad_norm": 0.35906171798706055, + "learning_rate": 0.0005337254444768288, + "loss": 3.5188, + "step": 19050 + }, + { + "epoch": 5.563679794919599, + "grad_norm": 0.3499426245689392, + "learning_rate": 0.0005335505683474205, + "loss": 3.5172, + "step": 19100 + }, + { + "epoch": 5.578245164297367, + "grad_norm": 0.32113078236579895, + "learning_rate": 0.0005333756922180122, + "loss": 3.5124, + "step": 19150 + }, + { + "epoch": 5.592810533675134, + "grad_norm": 0.3309899866580963, + "learning_rate": 0.0005332008160886039, + "loss": 3.521, + "step": 19200 + }, + { + "epoch": 5.607375903052901, + "grad_norm": 0.33870729804039, + "learning_rate": 0.0005330259399591956, + "loss": 3.5212, + "step": 19250 + }, + { + "epoch": 5.621941272430669, + "grad_norm": 0.3483949899673462, + "learning_rate": 0.0005328510638297873, + "loss": 3.5272, + "step": 19300 + }, + { + "epoch": 5.636506641808436, + "grad_norm": 0.31153854727745056, + "learning_rate": 0.0005326761877003788, + "loss": 3.5116, + "step": 19350 + }, + { + "epoch": 5.651072011186204, + "grad_norm": 0.32981279492378235, + "learning_rate": 0.0005325013115709705, + "loss": 3.5192, + "step": 19400 + }, + { + "epoch": 5.665637380563971, + "grad_norm": 0.336296409368515, + "learning_rate": 0.0005323264354415622, + "loss": 3.525, + "step": 19450 + }, + { + "epoch": 5.6802027499417385, + "grad_norm": 0.34458428621292114, + "learning_rate": 0.0005321515593121538, + "loss": 3.5289, + "step": 19500 + }, + { + "epoch": 5.694768119319506, + "grad_norm": 0.3392544388771057, + "learning_rate": 0.0005319766831827455, + "loss": 3.5259, + "step": 19550 + }, + { + "epoch": 5.709333488697274, + "grad_norm": 0.3356161117553711, + "learning_rate": 0.0005318018070533372, + "loss": 3.5266, + "step": 19600 + }, + { + "epoch": 5.7238988580750405, + "grad_norm": 0.34901565313339233, + "learning_rate": 0.0005316269309239288, + "loss": 3.5224, + "step": 19650 + }, + { + "epoch": 5.738464227452808, + "grad_norm": 0.33175837993621826, + "learning_rate": 0.0005314520547945206, + "loss": 3.5364, + "step": 19700 + }, + { + "epoch": 5.753029596830576, + "grad_norm": 0.31978893280029297, + "learning_rate": 0.0005312771786651121, + "loss": 3.5287, + "step": 19750 + }, + { + "epoch": 5.7675949662083426, + "grad_norm": 0.3431372344493866, + "learning_rate": 0.0005311023025357038, + "loss": 3.5381, + "step": 19800 + }, + { + "epoch": 5.78216033558611, + "grad_norm": 0.3272165358066559, + "learning_rate": 0.0005309274264062955, + "loss": 3.5206, + "step": 19850 + }, + { + "epoch": 5.796725704963878, + "grad_norm": 0.3220779597759247, + "learning_rate": 0.0005307525502768872, + "loss": 3.5213, + "step": 19900 + }, + { + "epoch": 5.8112910743416455, + "grad_norm": 0.32556435465812683, + "learning_rate": 0.0005305776741474788, + "loss": 3.5191, + "step": 19950 + }, + { + "epoch": 5.825856443719413, + "grad_norm": 0.32084694504737854, + "learning_rate": 0.0005304027980180705, + "loss": 3.5195, + "step": 20000 + }, + { + "epoch": 5.825856443719413, + "eval_accuracy": 0.36219539343034457, + "eval_loss": 3.6152045726776123, + "eval_runtime": 181.196, + "eval_samples_per_second": 91.84, + "eval_steps_per_second": 5.745, + "step": 20000 + }, + { + "epoch": 5.84042181309718, + "grad_norm": 0.3214913606643677, + "learning_rate": 0.0005302279218886622, + "loss": 3.5319, + "step": 20050 + }, + { + "epoch": 5.8549871824749475, + "grad_norm": 0.3432092070579529, + "learning_rate": 0.0005300530457592538, + "loss": 3.5317, + "step": 20100 + }, + { + "epoch": 5.869552551852715, + "grad_norm": 0.351404070854187, + "learning_rate": 0.0005298781696298456, + "loss": 3.5237, + "step": 20150 + }, + { + "epoch": 5.884117921230482, + "grad_norm": 0.36213329434394836, + "learning_rate": 0.0005297032935004371, + "loss": 3.5253, + "step": 20200 + }, + { + "epoch": 5.89868329060825, + "grad_norm": 0.3420363664627075, + "learning_rate": 0.0005295284173710288, + "loss": 3.5339, + "step": 20250 + }, + { + "epoch": 5.913248659986017, + "grad_norm": 0.31901925802230835, + "learning_rate": 0.0005293535412416205, + "loss": 3.533, + "step": 20300 + }, + { + "epoch": 5.927814029363785, + "grad_norm": 0.3374883234500885, + "learning_rate": 0.0005291786651122121, + "loss": 3.5282, + "step": 20350 + }, + { + "epoch": 5.9423793987415525, + "grad_norm": 0.33261144161224365, + "learning_rate": 0.0005290037889828038, + "loss": 3.5292, + "step": 20400 + }, + { + "epoch": 5.956944768119319, + "grad_norm": 0.32766902446746826, + "learning_rate": 0.0005288289128533955, + "loss": 3.5325, + "step": 20450 + }, + { + "epoch": 5.971510137497087, + "grad_norm": 0.2985290288925171, + "learning_rate": 0.0005286540367239872, + "loss": 3.5208, + "step": 20500 + }, + { + "epoch": 5.986075506874855, + "grad_norm": 0.3742130994796753, + "learning_rate": 0.0005284791605945788, + "loss": 3.5248, + "step": 20550 + }, + { + "epoch": 6.0005826147751105, + "grad_norm": 0.3418431580066681, + "learning_rate": 0.0005283042844651704, + "loss": 3.5182, + "step": 20600 + }, + { + "epoch": 6.015147984152878, + "grad_norm": 0.33612483739852905, + "learning_rate": 0.0005281294083357621, + "loss": 3.4176, + "step": 20650 + }, + { + "epoch": 6.029713353530646, + "grad_norm": 0.3870946168899536, + "learning_rate": 0.0005279545322063538, + "loss": 3.4103, + "step": 20700 + }, + { + "epoch": 6.044278722908413, + "grad_norm": 0.33531826734542847, + "learning_rate": 0.0005277796560769455, + "loss": 3.4213, + "step": 20750 + }, + { + "epoch": 6.05884409228618, + "grad_norm": 0.3558143973350525, + "learning_rate": 0.0005276047799475371, + "loss": 3.4305, + "step": 20800 + }, + { + "epoch": 6.073409461663948, + "grad_norm": 0.3327952027320862, + "learning_rate": 0.0005274299038181288, + "loss": 3.428, + "step": 20850 + }, + { + "epoch": 6.087974831041715, + "grad_norm": 0.32903343439102173, + "learning_rate": 0.0005272550276887205, + "loss": 3.4273, + "step": 20900 + }, + { + "epoch": 6.102540200419483, + "grad_norm": 0.3103554844856262, + "learning_rate": 0.0005270801515593121, + "loss": 3.4361, + "step": 20950 + }, + { + "epoch": 6.11710556979725, + "grad_norm": 0.336482435464859, + "learning_rate": 0.0005269052754299037, + "loss": 3.4431, + "step": 21000 + }, + { + "epoch": 6.11710556979725, + "eval_accuracy": 0.36242612057010065, + "eval_loss": 3.6179239749908447, + "eval_runtime": 181.3924, + "eval_samples_per_second": 91.74, + "eval_steps_per_second": 5.739, + "step": 21000 + }, + { + "epoch": 6.1316709391750175, + "grad_norm": 0.3301815092563629, + "learning_rate": 0.0005267303993004954, + "loss": 3.44, + "step": 21050 + }, + { + "epoch": 6.146236308552785, + "grad_norm": 0.31713324785232544, + "learning_rate": 0.000526555523171087, + "loss": 3.4436, + "step": 21100 + }, + { + "epoch": 6.160801677930552, + "grad_norm": 0.3336808383464813, + "learning_rate": 0.0005263806470416788, + "loss": 3.4506, + "step": 21150 + }, + { + "epoch": 6.1753670473083195, + "grad_norm": 0.3340149521827698, + "learning_rate": 0.0005262057709122704, + "loss": 3.4424, + "step": 21200 + }, + { + "epoch": 6.189932416686087, + "grad_norm": 0.3524874150753021, + "learning_rate": 0.0005260308947828621, + "loss": 3.4413, + "step": 21250 + }, + { + "epoch": 6.204497786063855, + "grad_norm": 0.33669140934944153, + "learning_rate": 0.0005258560186534538, + "loss": 3.4488, + "step": 21300 + }, + { + "epoch": 6.219063155441622, + "grad_norm": 0.33581581711769104, + "learning_rate": 0.0005256811425240455, + "loss": 3.456, + "step": 21350 + }, + { + "epoch": 6.233628524819389, + "grad_norm": 0.3273789584636688, + "learning_rate": 0.0005255062663946371, + "loss": 3.445, + "step": 21400 + }, + { + "epoch": 6.248193894197157, + "grad_norm": 0.3533139228820801, + "learning_rate": 0.0005253313902652287, + "loss": 3.4585, + "step": 21450 + }, + { + "epoch": 6.2627592635749245, + "grad_norm": 0.35069921612739563, + "learning_rate": 0.0005251565141358204, + "loss": 3.4586, + "step": 21500 + }, + { + "epoch": 6.277324632952691, + "grad_norm": 0.3373342752456665, + "learning_rate": 0.000524981638006412, + "loss": 3.4618, + "step": 21550 + }, + { + "epoch": 6.291890002330459, + "grad_norm": 0.32910624146461487, + "learning_rate": 0.0005248067618770038, + "loss": 3.4705, + "step": 21600 + }, + { + "epoch": 6.306455371708227, + "grad_norm": 0.3465712368488312, + "learning_rate": 0.0005246318857475954, + "loss": 3.4618, + "step": 21650 + }, + { + "epoch": 6.321020741085994, + "grad_norm": 0.33147528767585754, + "learning_rate": 0.0005244570096181871, + "loss": 3.4512, + "step": 21700 + }, + { + "epoch": 6.335586110463761, + "grad_norm": 0.3352925181388855, + "learning_rate": 0.0005242821334887788, + "loss": 3.4722, + "step": 21750 + }, + { + "epoch": 6.350151479841529, + "grad_norm": 0.36607617139816284, + "learning_rate": 0.0005241072573593704, + "loss": 3.4672, + "step": 21800 + }, + { + "epoch": 6.364716849219296, + "grad_norm": 0.3554653823375702, + "learning_rate": 0.000523932381229962, + "loss": 3.4626, + "step": 21850 + }, + { + "epoch": 6.379282218597064, + "grad_norm": 0.33701369166374207, + "learning_rate": 0.0005237575051005537, + "loss": 3.4676, + "step": 21900 + }, + { + "epoch": 6.393847587974831, + "grad_norm": 0.33805081248283386, + "learning_rate": 0.0005235826289711454, + "loss": 3.4543, + "step": 21950 + }, + { + "epoch": 6.408412957352598, + "grad_norm": 0.335484117269516, + "learning_rate": 0.000523407752841737, + "loss": 3.4638, + "step": 22000 + }, + { + "epoch": 6.408412957352598, + "eval_accuracy": 0.3632213177765383, + "eval_loss": 3.6108696460723877, + "eval_runtime": 181.2596, + "eval_samples_per_second": 91.808, + "eval_steps_per_second": 5.743, + "step": 22000 + }, + { + "epoch": 6.422978326730366, + "grad_norm": 0.33982568979263306, + "learning_rate": 0.0005232328767123287, + "loss": 3.4719, + "step": 22050 + }, + { + "epoch": 6.437543696108134, + "grad_norm": 0.34843680262565613, + "learning_rate": 0.0005230580005829204, + "loss": 3.4806, + "step": 22100 + }, + { + "epoch": 6.4521090654859, + "grad_norm": 0.3565368056297302, + "learning_rate": 0.0005228831244535121, + "loss": 3.4761, + "step": 22150 + }, + { + "epoch": 6.466674434863668, + "grad_norm": 0.3219994306564331, + "learning_rate": 0.0005227082483241038, + "loss": 3.4619, + "step": 22200 + }, + { + "epoch": 6.481239804241436, + "grad_norm": 0.3399719297885895, + "learning_rate": 0.0005225333721946954, + "loss": 3.4699, + "step": 22250 + }, + { + "epoch": 6.495805173619203, + "grad_norm": 0.33617985248565674, + "learning_rate": 0.000522358496065287, + "loss": 3.4852, + "step": 22300 + }, + { + "epoch": 6.51037054299697, + "grad_norm": 0.33567848801612854, + "learning_rate": 0.0005221836199358787, + "loss": 3.4678, + "step": 22350 + }, + { + "epoch": 6.524935912374738, + "grad_norm": 0.3187302350997925, + "learning_rate": 0.0005220087438064703, + "loss": 3.4789, + "step": 22400 + }, + { + "epoch": 6.539501281752505, + "grad_norm": 0.3293958306312561, + "learning_rate": 0.000521833867677062, + "loss": 3.487, + "step": 22450 + }, + { + "epoch": 6.554066651130273, + "grad_norm": 0.32739055156707764, + "learning_rate": 0.0005216589915476537, + "loss": 3.4803, + "step": 22500 + }, + { + "epoch": 6.56863202050804, + "grad_norm": 0.3373546898365021, + "learning_rate": 0.0005214841154182454, + "loss": 3.4806, + "step": 22550 + }, + { + "epoch": 6.583197389885807, + "grad_norm": 0.30911925435066223, + "learning_rate": 0.0005213092392888371, + "loss": 3.479, + "step": 22600 + }, + { + "epoch": 6.597762759263575, + "grad_norm": 0.34881287813186646, + "learning_rate": 0.0005211343631594287, + "loss": 3.4737, + "step": 22650 + }, + { + "epoch": 6.612328128641343, + "grad_norm": 0.32742807269096375, + "learning_rate": 0.0005209594870300204, + "loss": 3.4909, + "step": 22700 + }, + { + "epoch": 6.626893498019109, + "grad_norm": 0.34914788603782654, + "learning_rate": 0.000520784610900612, + "loss": 3.4834, + "step": 22750 + }, + { + "epoch": 6.641458867396877, + "grad_norm": 0.34026578068733215, + "learning_rate": 0.0005206097347712037, + "loss": 3.4829, + "step": 22800 + }, + { + "epoch": 6.656024236774645, + "grad_norm": 0.33478906750679016, + "learning_rate": 0.0005204348586417953, + "loss": 3.4747, + "step": 22850 + }, + { + "epoch": 6.670589606152412, + "grad_norm": 0.31324368715286255, + "learning_rate": 0.000520259982512387, + "loss": 3.4695, + "step": 22900 + }, + { + "epoch": 6.685154975530179, + "grad_norm": 0.33750981092453003, + "learning_rate": 0.0005200851063829787, + "loss": 3.4852, + "step": 22950 + }, + { + "epoch": 6.699720344907947, + "grad_norm": 0.3517208695411682, + "learning_rate": 0.0005199102302535703, + "loss": 3.505, + "step": 23000 + }, + { + "epoch": 6.699720344907947, + "eval_accuracy": 0.36391443997925105, + "eval_loss": 3.6010148525238037, + "eval_runtime": 181.0814, + "eval_samples_per_second": 91.898, + "eval_steps_per_second": 5.749, + "step": 23000 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.32067185640335083, + "learning_rate": 0.0005197353541241621, + "loss": 3.4911, + "step": 23050 + }, + { + "epoch": 6.728851083663482, + "grad_norm": 0.34073543548583984, + "learning_rate": 0.0005195604779947537, + "loss": 3.4761, + "step": 23100 + }, + { + "epoch": 6.743416453041249, + "grad_norm": 0.3122085928916931, + "learning_rate": 0.0005193856018653454, + "loss": 3.484, + "step": 23150 + }, + { + "epoch": 6.7579818224190165, + "grad_norm": 0.33687078952789307, + "learning_rate": 0.000519210725735937, + "loss": 3.4858, + "step": 23200 + }, + { + "epoch": 6.772547191796784, + "grad_norm": 0.34494155645370483, + "learning_rate": 0.0005190358496065286, + "loss": 3.4851, + "step": 23250 + }, + { + "epoch": 6.787112561174552, + "grad_norm": 0.33593103289604187, + "learning_rate": 0.0005188609734771203, + "loss": 3.4894, + "step": 23300 + }, + { + "epoch": 6.8016779305523185, + "grad_norm": 0.3480103313922882, + "learning_rate": 0.000518686097347712, + "loss": 3.4892, + "step": 23350 + }, + { + "epoch": 6.816243299930086, + "grad_norm": 0.3221217691898346, + "learning_rate": 0.0005185112212183037, + "loss": 3.4947, + "step": 23400 + }, + { + "epoch": 6.830808669307854, + "grad_norm": 0.32625964283943176, + "learning_rate": 0.0005183363450888953, + "loss": 3.493, + "step": 23450 + }, + { + "epoch": 6.845374038685621, + "grad_norm": 0.34171169996261597, + "learning_rate": 0.000518161468959487, + "loss": 3.4883, + "step": 23500 + }, + { + "epoch": 6.859939408063388, + "grad_norm": 0.33914807438850403, + "learning_rate": 0.0005179865928300787, + "loss": 3.4836, + "step": 23550 + }, + { + "epoch": 6.874504777441156, + "grad_norm": 0.3401309549808502, + "learning_rate": 0.0005178117167006703, + "loss": 3.4926, + "step": 23600 + }, + { + "epoch": 6.8890701468189235, + "grad_norm": 0.3442193269729614, + "learning_rate": 0.000517636840571262, + "loss": 3.501, + "step": 23650 + }, + { + "epoch": 6.903635516196691, + "grad_norm": 0.3085883557796478, + "learning_rate": 0.0005174619644418536, + "loss": 3.4841, + "step": 23700 + }, + { + "epoch": 6.918200885574458, + "grad_norm": 0.3226356506347656, + "learning_rate": 0.0005172870883124453, + "loss": 3.4904, + "step": 23750 + }, + { + "epoch": 6.9327662549522255, + "grad_norm": 0.32244643568992615, + "learning_rate": 0.000517112212183037, + "loss": 3.489, + "step": 23800 + }, + { + "epoch": 6.947331624329993, + "grad_norm": 0.3091798722743988, + "learning_rate": 0.0005169373360536286, + "loss": 3.4939, + "step": 23850 + }, + { + "epoch": 6.961896993707761, + "grad_norm": 0.34266242384910583, + "learning_rate": 0.0005167624599242203, + "loss": 3.4886, + "step": 23900 + }, + { + "epoch": 6.976462363085528, + "grad_norm": 0.3311766982078552, + "learning_rate": 0.000516587583794812, + "loss": 3.4971, + "step": 23950 + }, + { + "epoch": 6.991027732463295, + "grad_norm": 0.32126250863075256, + "learning_rate": 0.0005164127076654037, + "loss": 3.4849, + "step": 24000 + }, + { + "epoch": 6.991027732463295, + "eval_accuracy": 0.36486815919608173, + "eval_loss": 3.5899109840393066, + "eval_runtime": 181.1825, + "eval_samples_per_second": 91.847, + "eval_steps_per_second": 5.746, + "step": 24000 + }, + { + "epoch": 7.005534840363552, + "grad_norm": 0.34321558475494385, + "learning_rate": 0.0005162378315359953, + "loss": 3.456, + "step": 24050 + }, + { + "epoch": 7.020100209741319, + "grad_norm": 0.32099732756614685, + "learning_rate": 0.0005160629554065869, + "loss": 3.3697, + "step": 24100 + }, + { + "epoch": 7.034665579119086, + "grad_norm": 0.3492303490638733, + "learning_rate": 0.0005158880792771786, + "loss": 3.3825, + "step": 24150 + }, + { + "epoch": 7.049230948496854, + "grad_norm": 0.3613590598106384, + "learning_rate": 0.0005157132031477703, + "loss": 3.3826, + "step": 24200 + }, + { + "epoch": 7.063796317874622, + "grad_norm": 0.3345809876918793, + "learning_rate": 0.000515538327018362, + "loss": 3.3882, + "step": 24250 + }, + { + "epoch": 7.0783616872523885, + "grad_norm": 0.34374889731407166, + "learning_rate": 0.0005153634508889536, + "loss": 3.3903, + "step": 24300 + }, + { + "epoch": 7.092927056630156, + "grad_norm": 0.34649938344955444, + "learning_rate": 0.0005151885747595453, + "loss": 3.3955, + "step": 24350 + }, + { + "epoch": 7.107492426007924, + "grad_norm": 0.32787710428237915, + "learning_rate": 0.000515013698630137, + "loss": 3.3989, + "step": 24400 + }, + { + "epoch": 7.122057795385691, + "grad_norm": 0.35148805379867554, + "learning_rate": 0.0005148388225007285, + "loss": 3.4054, + "step": 24450 + }, + { + "epoch": 7.136623164763458, + "grad_norm": 0.3529978096485138, + "learning_rate": 0.0005146639463713203, + "loss": 3.4034, + "step": 24500 + }, + { + "epoch": 7.151188534141226, + "grad_norm": 0.3426077961921692, + "learning_rate": 0.0005144890702419119, + "loss": 3.4015, + "step": 24550 + }, + { + "epoch": 7.165753903518993, + "grad_norm": 0.36019691824913025, + "learning_rate": 0.0005143141941125036, + "loss": 3.4042, + "step": 24600 + }, + { + "epoch": 7.180319272896761, + "grad_norm": 0.32822075486183167, + "learning_rate": 0.0005141393179830953, + "loss": 3.4107, + "step": 24650 + }, + { + "epoch": 7.194884642274528, + "grad_norm": 0.3263060450553894, + "learning_rate": 0.0005139644418536869, + "loss": 3.4203, + "step": 24700 + }, + { + "epoch": 7.2094500116522955, + "grad_norm": 0.3379085063934326, + "learning_rate": 0.0005137895657242786, + "loss": 3.4229, + "step": 24750 + }, + { + "epoch": 7.224015381030063, + "grad_norm": 0.3335205316543579, + "learning_rate": 0.0005136146895948703, + "loss": 3.4138, + "step": 24800 + }, + { + "epoch": 7.238580750407831, + "grad_norm": 0.3487148880958557, + "learning_rate": 0.000513439813465462, + "loss": 3.4128, + "step": 24850 + }, + { + "epoch": 7.2531461197855975, + "grad_norm": 0.3310547471046448, + "learning_rate": 0.0005132649373360535, + "loss": 3.4229, + "step": 24900 + }, + { + "epoch": 7.267711489163365, + "grad_norm": 0.34830957651138306, + "learning_rate": 0.0005130900612066452, + "loss": 3.4208, + "step": 24950 + }, + { + "epoch": 7.282276858541133, + "grad_norm": 0.32062774896621704, + "learning_rate": 0.0005129151850772369, + "loss": 3.4216, + "step": 25000 + }, + { + "epoch": 7.282276858541133, + "eval_accuracy": 0.3646655379617292, + "eval_loss": 3.5988800525665283, + "eval_runtime": 181.3926, + "eval_samples_per_second": 91.74, + "eval_steps_per_second": 5.739, + "step": 25000 + }, + { + "epoch": 7.2968422279189, + "grad_norm": 0.3289690613746643, + "learning_rate": 0.0005127403089478286, + "loss": 3.4188, + "step": 25050 + }, + { + "epoch": 7.311407597296667, + "grad_norm": 0.34852126240730286, + "learning_rate": 0.0005125654328184203, + "loss": 3.4199, + "step": 25100 + }, + { + "epoch": 7.325972966674435, + "grad_norm": 0.3791807293891907, + "learning_rate": 0.0005123905566890119, + "loss": 3.4376, + "step": 25150 + }, + { + "epoch": 7.3405383360522025, + "grad_norm": 0.33741068840026855, + "learning_rate": 0.0005122156805596036, + "loss": 3.4319, + "step": 25200 + }, + { + "epoch": 7.35510370542997, + "grad_norm": 0.3323241174221039, + "learning_rate": 0.0005120408044301953, + "loss": 3.4227, + "step": 25250 + }, + { + "epoch": 7.369669074807737, + "grad_norm": 0.35564154386520386, + "learning_rate": 0.0005118659283007868, + "loss": 3.4385, + "step": 25300 + }, + { + "epoch": 7.384234444185505, + "grad_norm": 0.34174442291259766, + "learning_rate": 0.0005116910521713785, + "loss": 3.4283, + "step": 25350 + }, + { + "epoch": 7.398799813563272, + "grad_norm": 0.3545416295528412, + "learning_rate": 0.0005115161760419702, + "loss": 3.4331, + "step": 25400 + }, + { + "epoch": 7.413365182941039, + "grad_norm": 0.3415866196155548, + "learning_rate": 0.0005113412999125619, + "loss": 3.4329, + "step": 25450 + }, + { + "epoch": 7.427930552318807, + "grad_norm": 0.32280662655830383, + "learning_rate": 0.0005111664237831536, + "loss": 3.4482, + "step": 25500 + }, + { + "epoch": 7.442495921696574, + "grad_norm": 0.34938618540763855, + "learning_rate": 0.0005109915476537452, + "loss": 3.4327, + "step": 25550 + }, + { + "epoch": 7.457061291074342, + "grad_norm": 0.34118571877479553, + "learning_rate": 0.0005108166715243369, + "loss": 3.4423, + "step": 25600 + }, + { + "epoch": 7.471626660452109, + "grad_norm": 0.3390498459339142, + "learning_rate": 0.0005106417953949286, + "loss": 3.4347, + "step": 25650 + }, + { + "epoch": 7.486192029829876, + "grad_norm": 0.33087974786758423, + "learning_rate": 0.0005104669192655203, + "loss": 3.4475, + "step": 25700 + }, + { + "epoch": 7.500757399207644, + "grad_norm": 0.32783931493759155, + "learning_rate": 0.0005102920431361118, + "loss": 3.4459, + "step": 25750 + }, + { + "epoch": 7.515322768585412, + "grad_norm": 0.3460298478603363, + "learning_rate": 0.0005101171670067035, + "loss": 3.4426, + "step": 25800 + }, + { + "epoch": 7.529888137963178, + "grad_norm": 0.3336711525917053, + "learning_rate": 0.0005099422908772952, + "loss": 3.4442, + "step": 25850 + }, + { + "epoch": 7.544453507340946, + "grad_norm": 0.33612728118896484, + "learning_rate": 0.0005097674147478868, + "loss": 3.449, + "step": 25900 + }, + { + "epoch": 7.559018876718714, + "grad_norm": 0.3177841603755951, + "learning_rate": 0.0005095925386184786, + "loss": 3.4447, + "step": 25950 + }, + { + "epoch": 7.573584246096481, + "grad_norm": 0.3445686995983124, + "learning_rate": 0.0005094176624890702, + "loss": 3.4423, + "step": 26000 + }, + { + "epoch": 7.573584246096481, + "eval_accuracy": 0.3656406600019215, + "eval_loss": 3.589552164077759, + "eval_runtime": 180.5358, + "eval_samples_per_second": 92.176, + "eval_steps_per_second": 5.766, + "step": 26000 + }, + { + "epoch": 7.588149615474248, + "grad_norm": 0.3337860107421875, + "learning_rate": 0.0005092427863596619, + "loss": 3.434, + "step": 26050 + }, + { + "epoch": 7.602714984852016, + "grad_norm": 0.35997340083122253, + "learning_rate": 0.0005090679102302536, + "loss": 3.4465, + "step": 26100 + }, + { + "epoch": 7.617280354229783, + "grad_norm": 0.32720959186553955, + "learning_rate": 0.0005088930341008451, + "loss": 3.4555, + "step": 26150 + }, + { + "epoch": 7.631845723607551, + "grad_norm": 0.34168654680252075, + "learning_rate": 0.0005087181579714368, + "loss": 3.4556, + "step": 26200 + }, + { + "epoch": 7.646411092985318, + "grad_norm": 0.332603394985199, + "learning_rate": 0.0005085432818420285, + "loss": 3.4463, + "step": 26250 + }, + { + "epoch": 7.660976462363085, + "grad_norm": 0.31847983598709106, + "learning_rate": 0.0005083684057126202, + "loss": 3.4566, + "step": 26300 + }, + { + "epoch": 7.675541831740853, + "grad_norm": 0.3408845067024231, + "learning_rate": 0.0005081935295832118, + "loss": 3.4529, + "step": 26350 + }, + { + "epoch": 7.690107201118621, + "grad_norm": 0.34977152943611145, + "learning_rate": 0.0005080186534538035, + "loss": 3.4471, + "step": 26400 + }, + { + "epoch": 7.704672570496387, + "grad_norm": 0.31974101066589355, + "learning_rate": 0.0005078437773243952, + "loss": 3.451, + "step": 26450 + }, + { + "epoch": 7.719237939874155, + "grad_norm": 0.3461310863494873, + "learning_rate": 0.0005076689011949869, + "loss": 3.4536, + "step": 26500 + }, + { + "epoch": 7.733803309251923, + "grad_norm": 0.3448072075843811, + "learning_rate": 0.0005074940250655786, + "loss": 3.4542, + "step": 26550 + }, + { + "epoch": 7.74836867862969, + "grad_norm": 0.3268807530403137, + "learning_rate": 0.0005073191489361701, + "loss": 3.4477, + "step": 26600 + }, + { + "epoch": 7.762934048007457, + "grad_norm": 0.36917299032211304, + "learning_rate": 0.0005071442728067618, + "loss": 3.461, + "step": 26650 + }, + { + "epoch": 7.777499417385225, + "grad_norm": 0.3446688950061798, + "learning_rate": 0.0005069693966773535, + "loss": 3.4474, + "step": 26700 + }, + { + "epoch": 7.792064786762992, + "grad_norm": 0.3360874056816101, + "learning_rate": 0.0005067945205479451, + "loss": 3.4546, + "step": 26750 + }, + { + "epoch": 7.80663015614076, + "grad_norm": 0.3311786651611328, + "learning_rate": 0.0005066196444185368, + "loss": 3.4458, + "step": 26800 + }, + { + "epoch": 7.821195525518527, + "grad_norm": 0.33738234639167786, + "learning_rate": 0.0005064447682891285, + "loss": 3.4496, + "step": 26850 + }, + { + "epoch": 7.8357608948962945, + "grad_norm": 0.33856943249702454, + "learning_rate": 0.0005062698921597202, + "loss": 3.4499, + "step": 26900 + }, + { + "epoch": 7.850326264274062, + "grad_norm": 0.37138065695762634, + "learning_rate": 0.0005060950160303119, + "loss": 3.4548, + "step": 26950 + }, + { + "epoch": 7.86489163365183, + "grad_norm": 0.3353155553340912, + "learning_rate": 0.0005059201399009035, + "loss": 3.4548, + "step": 27000 + }, + { + "epoch": 7.86489163365183, + "eval_accuracy": 0.3661931350796861, + "eval_loss": 3.581078052520752, + "eval_runtime": 180.3799, + "eval_samples_per_second": 92.255, + "eval_steps_per_second": 5.771, + "step": 27000 + }, + { + "epoch": 7.8794570030295965, + "grad_norm": 0.3436886966228485, + "learning_rate": 0.0005057452637714951, + "loss": 3.4584, + "step": 27050 + }, + { + "epoch": 7.894022372407364, + "grad_norm": 0.35296398401260376, + "learning_rate": 0.0005055703876420868, + "loss": 3.4657, + "step": 27100 + }, + { + "epoch": 7.908587741785132, + "grad_norm": 0.32724249362945557, + "learning_rate": 0.0005053955115126785, + "loss": 3.4646, + "step": 27150 + }, + { + "epoch": 7.923153111162899, + "grad_norm": 0.33248910307884216, + "learning_rate": 0.0005052206353832701, + "loss": 3.4631, + "step": 27200 + }, + { + "epoch": 7.937718480540666, + "grad_norm": 0.3228563368320465, + "learning_rate": 0.0005050457592538618, + "loss": 3.4495, + "step": 27250 + }, + { + "epoch": 7.952283849918434, + "grad_norm": 0.3430377244949341, + "learning_rate": 0.0005048708831244535, + "loss": 3.4564, + "step": 27300 + }, + { + "epoch": 7.9668492192962015, + "grad_norm": 0.3293624222278595, + "learning_rate": 0.0005046960069950451, + "loss": 3.4625, + "step": 27350 + }, + { + "epoch": 7.981414588673969, + "grad_norm": 0.33939313888549805, + "learning_rate": 0.0005045211308656369, + "loss": 3.4708, + "step": 27400 + }, + { + "epoch": 7.995979958051736, + "grad_norm": 0.33478549122810364, + "learning_rate": 0.0005043462547362284, + "loss": 3.4623, + "step": 27450 + }, + { + "epoch": 8.010487065951992, + "grad_norm": 0.34706467390060425, + "learning_rate": 0.0005041713786068201, + "loss": 3.3717, + "step": 27500 + }, + { + "epoch": 8.02505243532976, + "grad_norm": 0.3538230359554291, + "learning_rate": 0.0005039965024774118, + "loss": 3.3461, + "step": 27550 + }, + { + "epoch": 8.039617804707527, + "grad_norm": 0.3367486596107483, + "learning_rate": 0.0005038216263480034, + "loss": 3.3598, + "step": 27600 + }, + { + "epoch": 8.054183174085296, + "grad_norm": 0.3318694829940796, + "learning_rate": 0.0005036467502185951, + "loss": 3.3606, + "step": 27650 + }, + { + "epoch": 8.068748543463062, + "grad_norm": 0.370888352394104, + "learning_rate": 0.0005034718740891868, + "loss": 3.3628, + "step": 27700 + }, + { + "epoch": 8.08331391284083, + "grad_norm": 0.3453328311443329, + "learning_rate": 0.0005032969979597785, + "loss": 3.3619, + "step": 27750 + }, + { + "epoch": 8.097879282218598, + "grad_norm": 0.3351195156574249, + "learning_rate": 0.0005031221218303701, + "loss": 3.3658, + "step": 27800 + }, + { + "epoch": 8.112444651596364, + "grad_norm": 0.32804664969444275, + "learning_rate": 0.0005029472457009618, + "loss": 3.3629, + "step": 27850 + }, + { + "epoch": 8.127010020974131, + "grad_norm": 0.31457826495170593, + "learning_rate": 0.0005027723695715534, + "loss": 3.372, + "step": 27900 + }, + { + "epoch": 8.1415753903519, + "grad_norm": 0.3434246778488159, + "learning_rate": 0.0005025974934421451, + "loss": 3.3817, + "step": 27950 + }, + { + "epoch": 8.156140759729666, + "grad_norm": 0.31584593653678894, + "learning_rate": 0.0005024226173127368, + "loss": 3.3804, + "step": 28000 + }, + { + "epoch": 8.156140759729666, + "eval_accuracy": 0.36580259235230084, + "eval_loss": 3.5929579734802246, + "eval_runtime": 180.492, + "eval_samples_per_second": 92.198, + "eval_steps_per_second": 5.768, + "step": 28000 + }, + { + "epoch": 8.170706129107435, + "grad_norm": 0.34201693534851074, + "learning_rate": 0.0005022477411833284, + "loss": 3.3746, + "step": 28050 + }, + { + "epoch": 8.185271498485202, + "grad_norm": 0.33196499943733215, + "learning_rate": 0.0005020728650539201, + "loss": 3.3854, + "step": 28100 + }, + { + "epoch": 8.199836867862969, + "grad_norm": 0.3429182767868042, + "learning_rate": 0.0005018979889245118, + "loss": 3.3806, + "step": 28150 + }, + { + "epoch": 8.214402237240737, + "grad_norm": 0.3488587737083435, + "learning_rate": 0.0005017231127951034, + "loss": 3.3733, + "step": 28200 + }, + { + "epoch": 8.228967606618504, + "grad_norm": 0.3547254204750061, + "learning_rate": 0.0005015482366656951, + "loss": 3.384, + "step": 28250 + }, + { + "epoch": 8.24353297599627, + "grad_norm": 0.33073434233665466, + "learning_rate": 0.0005013733605362868, + "loss": 3.3974, + "step": 28300 + }, + { + "epoch": 8.258098345374039, + "grad_norm": 0.3444958031177521, + "learning_rate": 0.0005011984844068784, + "loss": 3.4023, + "step": 28350 + }, + { + "epoch": 8.272663714751806, + "grad_norm": 0.3397659361362457, + "learning_rate": 0.0005010236082774701, + "loss": 3.3907, + "step": 28400 + }, + { + "epoch": 8.287229084129574, + "grad_norm": 0.3528975248336792, + "learning_rate": 0.0005008487321480617, + "loss": 3.4025, + "step": 28450 + }, + { + "epoch": 8.301794453507341, + "grad_norm": 0.325632780790329, + "learning_rate": 0.0005006738560186534, + "loss": 3.4043, + "step": 28500 + }, + { + "epoch": 8.316359822885108, + "grad_norm": 0.3388330936431885, + "learning_rate": 0.0005004989798892451, + "loss": 3.4008, + "step": 28550 + }, + { + "epoch": 8.330925192262876, + "grad_norm": 0.33034101128578186, + "learning_rate": 0.0005003241037598368, + "loss": 3.4041, + "step": 28600 + }, + { + "epoch": 8.345490561640643, + "grad_norm": 0.3294675350189209, + "learning_rate": 0.0005001492276304284, + "loss": 3.3978, + "step": 28650 + }, + { + "epoch": 8.36005593101841, + "grad_norm": 0.3379242718219757, + "learning_rate": 0.0004999743515010201, + "loss": 3.3931, + "step": 28700 + }, + { + "epoch": 8.374621300396178, + "grad_norm": 0.3348020315170288, + "learning_rate": 0.0004997994753716117, + "loss": 3.4, + "step": 28750 + }, + { + "epoch": 8.389186669773945, + "grad_norm": 0.33885735273361206, + "learning_rate": 0.0004996245992422033, + "loss": 3.4037, + "step": 28800 + }, + { + "epoch": 8.403752039151712, + "grad_norm": 0.3465324342250824, + "learning_rate": 0.0004994497231127951, + "loss": 3.3976, + "step": 28850 + }, + { + "epoch": 8.41831740852948, + "grad_norm": 0.35643112659454346, + "learning_rate": 0.0004992748469833867, + "loss": 3.4031, + "step": 28900 + }, + { + "epoch": 8.432882777907247, + "grad_norm": 0.327167272567749, + "learning_rate": 0.0004990999708539784, + "loss": 3.4055, + "step": 28950 + }, + { + "epoch": 8.447448147285016, + "grad_norm": 0.3230350613594055, + "learning_rate": 0.0004989250947245701, + "loss": 3.4071, + "step": 29000 + }, + { + "epoch": 8.447448147285016, + "eval_accuracy": 0.3665124134611529, + "eval_loss": 3.5824544429779053, + "eval_runtime": 180.4921, + "eval_samples_per_second": 92.198, + "eval_steps_per_second": 5.768, + "step": 29000 + }, + { + "epoch": 8.462013516662783, + "grad_norm": 0.3105214536190033, + "learning_rate": 0.0004987502185951617, + "loss": 3.4164, + "step": 29050 + }, + { + "epoch": 8.47657888604055, + "grad_norm": 0.3333645164966583, + "learning_rate": 0.0004985753424657534, + "loss": 3.4057, + "step": 29100 + }, + { + "epoch": 8.491144255418318, + "grad_norm": 0.3227234482765198, + "learning_rate": 0.000498400466336345, + "loss": 3.4038, + "step": 29150 + }, + { + "epoch": 8.505709624796085, + "grad_norm": 0.37023699283599854, + "learning_rate": 0.0004982255902069367, + "loss": 3.4149, + "step": 29200 + }, + { + "epoch": 8.520274994173853, + "grad_norm": 0.3446352183818817, + "learning_rate": 0.0004980507140775283, + "loss": 3.4098, + "step": 29250 + }, + { + "epoch": 8.53484036355162, + "grad_norm": 0.3288004696369171, + "learning_rate": 0.0004978758379481201, + "loss": 3.4005, + "step": 29300 + }, + { + "epoch": 8.549405732929387, + "grad_norm": 0.314487099647522, + "learning_rate": 0.0004977009618187117, + "loss": 3.4191, + "step": 29350 + }, + { + "epoch": 8.563971102307155, + "grad_norm": 0.3452147841453552, + "learning_rate": 0.0004975260856893034, + "loss": 3.41, + "step": 29400 + }, + { + "epoch": 8.578536471684922, + "grad_norm": 0.31154942512512207, + "learning_rate": 0.0004973512095598951, + "loss": 3.4108, + "step": 29450 + }, + { + "epoch": 8.593101841062689, + "grad_norm": 0.3399568200111389, + "learning_rate": 0.0004971763334304867, + "loss": 3.4201, + "step": 29500 + }, + { + "epoch": 8.607667210440457, + "grad_norm": 0.34255409240722656, + "learning_rate": 0.0004970014573010784, + "loss": 3.4154, + "step": 29550 + }, + { + "epoch": 8.622232579818224, + "grad_norm": 0.3516691327095032, + "learning_rate": 0.00049682658117167, + "loss": 3.4224, + "step": 29600 + }, + { + "epoch": 8.63679794919599, + "grad_norm": 0.33777916431427, + "learning_rate": 0.0004966517050422616, + "loss": 3.4208, + "step": 29650 + }, + { + "epoch": 8.65136331857376, + "grad_norm": 0.3323017358779907, + "learning_rate": 0.0004964768289128533, + "loss": 3.4177, + "step": 29700 + }, + { + "epoch": 8.665928687951526, + "grad_norm": 0.3520144820213318, + "learning_rate": 0.000496301952783445, + "loss": 3.4251, + "step": 29750 + }, + { + "epoch": 8.680494057329295, + "grad_norm": 0.33961278200149536, + "learning_rate": 0.0004961270766540367, + "loss": 3.4323, + "step": 29800 + }, + { + "epoch": 8.695059426707061, + "grad_norm": 0.33390894532203674, + "learning_rate": 0.0004959522005246284, + "loss": 3.4244, + "step": 29850 + }, + { + "epoch": 8.709624796084828, + "grad_norm": 0.3280866742134094, + "learning_rate": 0.00049577732439522, + "loss": 3.4277, + "step": 29900 + }, + { + "epoch": 8.724190165462597, + "grad_norm": 0.34025970101356506, + "learning_rate": 0.0004956024482658117, + "loss": 3.4221, + "step": 29950 + }, + { + "epoch": 8.738755534840363, + "grad_norm": 0.3388609290122986, + "learning_rate": 0.0004954275721364034, + "loss": 3.4312, + "step": 30000 + }, + { + "epoch": 8.738755534840363, + "eval_accuracy": 0.3672837382876871, + "eval_loss": 3.5726044178009033, + "eval_runtime": 180.5709, + "eval_samples_per_second": 92.158, + "eval_steps_per_second": 5.765, + "step": 30000 + }, + { + "epoch": 8.753320904218132, + "grad_norm": 0.3268741965293884, + "learning_rate": 0.000495252696006995, + "loss": 3.4261, + "step": 30050 + }, + { + "epoch": 8.767886273595899, + "grad_norm": 0.34607651829719543, + "learning_rate": 0.0004950778198775866, + "loss": 3.4155, + "step": 30100 + }, + { + "epoch": 8.782451642973665, + "grad_norm": 0.31372374296188354, + "learning_rate": 0.0004949029437481783, + "loss": 3.4126, + "step": 30150 + }, + { + "epoch": 8.797017012351434, + "grad_norm": 0.345786452293396, + "learning_rate": 0.00049472806761877, + "loss": 3.4428, + "step": 30200 + }, + { + "epoch": 8.8115823817292, + "grad_norm": 0.3313784599304199, + "learning_rate": 0.0004945531914893616, + "loss": 3.424, + "step": 30250 + }, + { + "epoch": 8.826147751106967, + "grad_norm": 0.3862437307834625, + "learning_rate": 0.0004943783153599534, + "loss": 3.4353, + "step": 30300 + }, + { + "epoch": 8.840713120484736, + "grad_norm": 0.3465390205383301, + "learning_rate": 0.000494203439230545, + "loss": 3.4368, + "step": 30350 + }, + { + "epoch": 8.855278489862503, + "grad_norm": 0.3554335832595825, + "learning_rate": 0.0004940285631011367, + "loss": 3.4241, + "step": 30400 + }, + { + "epoch": 8.86984385924027, + "grad_norm": 0.3209091126918793, + "learning_rate": 0.0004938536869717284, + "loss": 3.4351, + "step": 30450 + }, + { + "epoch": 8.884409228618038, + "grad_norm": 0.33575254678726196, + "learning_rate": 0.0004936788108423199, + "loss": 3.4228, + "step": 30500 + }, + { + "epoch": 8.898974597995805, + "grad_norm": 0.3382629156112671, + "learning_rate": 0.0004935039347129116, + "loss": 3.4385, + "step": 30550 + }, + { + "epoch": 8.913539967373573, + "grad_norm": 0.31963130831718445, + "learning_rate": 0.0004933290585835033, + "loss": 3.4193, + "step": 30600 + }, + { + "epoch": 8.92810533675134, + "grad_norm": 0.3423698842525482, + "learning_rate": 0.000493154182454095, + "loss": 3.4429, + "step": 30650 + }, + { + "epoch": 8.942670706129107, + "grad_norm": 0.35074007511138916, + "learning_rate": 0.0004929793063246866, + "loss": 3.4345, + "step": 30700 + }, + { + "epoch": 8.957236075506875, + "grad_norm": 0.3303930461406708, + "learning_rate": 0.0004928044301952783, + "loss": 3.4338, + "step": 30750 + }, + { + "epoch": 8.971801444884642, + "grad_norm": 0.36244720220565796, + "learning_rate": 0.00049262955406587, + "loss": 3.4198, + "step": 30800 + }, + { + "epoch": 8.986366814262409, + "grad_norm": 0.3402937650680542, + "learning_rate": 0.0004924546779364617, + "loss": 3.422, + "step": 30850 + }, + { + "epoch": 9.000873922162667, + "grad_norm": 0.3200979232788086, + "learning_rate": 0.0004922798018070533, + "loss": 3.4227, + "step": 30900 + }, + { + "epoch": 9.015439291540433, + "grad_norm": 0.3527183532714844, + "learning_rate": 0.0004921049256776449, + "loss": 3.3389, + "step": 30950 + }, + { + "epoch": 9.0300046609182, + "grad_norm": 0.357515811920166, + "learning_rate": 0.0004919300495482366, + "loss": 3.3276, + "step": 31000 + }, + { + "epoch": 9.0300046609182, + "eval_accuracy": 0.36772073219764306, + "eval_loss": 3.577789783477783, + "eval_runtime": 180.5131, + "eval_samples_per_second": 92.187, + "eval_steps_per_second": 5.767, + "step": 31000 + }, + { + "epoch": 9.044570030295969, + "grad_norm": 0.3615468144416809, + "learning_rate": 0.0004917551734188283, + "loss": 3.329, + "step": 31050 + }, + { + "epoch": 9.059135399673735, + "grad_norm": 0.3507601022720337, + "learning_rate": 0.0004915802972894199, + "loss": 3.3342, + "step": 31100 + }, + { + "epoch": 9.073700769051504, + "grad_norm": 0.35503971576690674, + "learning_rate": 0.0004914054211600116, + "loss": 3.3349, + "step": 31150 + }, + { + "epoch": 9.08826613842927, + "grad_norm": 0.353206068277359, + "learning_rate": 0.0004912305450306033, + "loss": 3.3419, + "step": 31200 + }, + { + "epoch": 9.102831507807037, + "grad_norm": 0.359823077917099, + "learning_rate": 0.000491055668901195, + "loss": 3.352, + "step": 31250 + }, + { + "epoch": 9.117396877184806, + "grad_norm": 0.33940252661705017, + "learning_rate": 0.0004908807927717865, + "loss": 3.358, + "step": 31300 + }, + { + "epoch": 9.131962246562573, + "grad_norm": 0.3473168909549713, + "learning_rate": 0.0004907059166423783, + "loss": 3.3456, + "step": 31350 + }, + { + "epoch": 9.14652761594034, + "grad_norm": 0.34054091572761536, + "learning_rate": 0.0004905310405129699, + "loss": 3.3383, + "step": 31400 + }, + { + "epoch": 9.161092985318108, + "grad_norm": 0.3713337779045105, + "learning_rate": 0.0004903561643835616, + "loss": 3.3514, + "step": 31450 + }, + { + "epoch": 9.175658354695875, + "grad_norm": 0.3296445310115814, + "learning_rate": 0.0004901812882541533, + "loss": 3.3533, + "step": 31500 + }, + { + "epoch": 9.190223724073643, + "grad_norm": 0.34697720408439636, + "learning_rate": 0.0004900064121247449, + "loss": 3.3548, + "step": 31550 + }, + { + "epoch": 9.20478909345141, + "grad_norm": 0.3309634029865265, + "learning_rate": 0.0004898315359953366, + "loss": 3.35, + "step": 31600 + }, + { + "epoch": 9.219354462829177, + "grad_norm": 0.3397737145423889, + "learning_rate": 0.0004896566598659283, + "loss": 3.3565, + "step": 31650 + }, + { + "epoch": 9.233919832206945, + "grad_norm": 0.36492007970809937, + "learning_rate": 0.0004894817837365199, + "loss": 3.3646, + "step": 31700 + }, + { + "epoch": 9.248485201584712, + "grad_norm": 0.38047462701797485, + "learning_rate": 0.0004893069076071115, + "loss": 3.3694, + "step": 31750 + }, + { + "epoch": 9.263050570962479, + "grad_norm": 0.37059515714645386, + "learning_rate": 0.0004891320314777032, + "loss": 3.3674, + "step": 31800 + }, + { + "epoch": 9.277615940340247, + "grad_norm": 0.3410259187221527, + "learning_rate": 0.0004889571553482949, + "loss": 3.3716, + "step": 31850 + }, + { + "epoch": 9.292181309718014, + "grad_norm": 0.3509190082550049, + "learning_rate": 0.0004887822792188866, + "loss": 3.3652, + "step": 31900 + }, + { + "epoch": 9.306746679095783, + "grad_norm": 0.33213210105895996, + "learning_rate": 0.0004886074030894782, + "loss": 3.3606, + "step": 31950 + }, + { + "epoch": 9.32131204847355, + "grad_norm": 0.3355337381362915, + "learning_rate": 0.0004884325269600699, + "loss": 3.3689, + "step": 32000 + }, + { + "epoch": 9.32131204847355, + "eval_accuracy": 0.3674749525227755, + "eval_loss": 3.5782315731048584, + "eval_runtime": 180.5937, + "eval_samples_per_second": 92.146, + "eval_steps_per_second": 5.764, + "step": 32000 + }, + { + "epoch": 9.335877417851316, + "grad_norm": 0.34377825260162354, + "learning_rate": 0.0004882576508306615, + "loss": 3.3749, + "step": 32050 + }, + { + "epoch": 9.350442787229085, + "grad_norm": 0.3456684947013855, + "learning_rate": 0.00048808277470125327, + "loss": 3.3629, + "step": 32100 + }, + { + "epoch": 9.365008156606851, + "grad_norm": 0.33172163367271423, + "learning_rate": 0.0004879078985718449, + "loss": 3.3571, + "step": 32150 + }, + { + "epoch": 9.379573525984618, + "grad_norm": 0.3585987091064453, + "learning_rate": 0.0004877330224424366, + "loss": 3.3849, + "step": 32200 + }, + { + "epoch": 9.394138895362387, + "grad_norm": 0.3346097469329834, + "learning_rate": 0.00048755814631302823, + "loss": 3.383, + "step": 32250 + }, + { + "epoch": 9.408704264740154, + "grad_norm": 0.36908599734306335, + "learning_rate": 0.00048738327018361987, + "loss": 3.3743, + "step": 32300 + }, + { + "epoch": 9.423269634117922, + "grad_norm": 0.3361928462982178, + "learning_rate": 0.00048720839405421156, + "loss": 3.3991, + "step": 32350 + }, + { + "epoch": 9.437835003495689, + "grad_norm": 0.32366377115249634, + "learning_rate": 0.0004870335179248032, + "loss": 3.3874, + "step": 32400 + }, + { + "epoch": 9.452400372873456, + "grad_norm": 0.3365907073020935, + "learning_rate": 0.0004868586417953949, + "loss": 3.3823, + "step": 32450 + }, + { + "epoch": 9.466965742251224, + "grad_norm": 0.3624427914619446, + "learning_rate": 0.0004866837656659865, + "loss": 3.3778, + "step": 32500 + }, + { + "epoch": 9.48153111162899, + "grad_norm": 0.3516867756843567, + "learning_rate": 0.00048650888953657816, + "loss": 3.3825, + "step": 32550 + }, + { + "epoch": 9.496096481006758, + "grad_norm": 0.3406991958618164, + "learning_rate": 0.0004863340134071699, + "loss": 3.3884, + "step": 32600 + }, + { + "epoch": 9.510661850384526, + "grad_norm": 0.32491427659988403, + "learning_rate": 0.00048615913727776154, + "loss": 3.3927, + "step": 32650 + }, + { + "epoch": 9.525227219762293, + "grad_norm": 0.35882461071014404, + "learning_rate": 0.00048598426114835323, + "loss": 3.3866, + "step": 32700 + }, + { + "epoch": 9.53979258914006, + "grad_norm": 0.3702818751335144, + "learning_rate": 0.00048580938501894486, + "loss": 3.3808, + "step": 32750 + }, + { + "epoch": 9.554357958517828, + "grad_norm": 0.3388266861438751, + "learning_rate": 0.00048563450888953655, + "loss": 3.3844, + "step": 32800 + }, + { + "epoch": 9.568923327895595, + "grad_norm": 0.3394116461277008, + "learning_rate": 0.0004854596327601282, + "loss": 3.3815, + "step": 32850 + }, + { + "epoch": 9.583488697273363, + "grad_norm": 0.3533828854560852, + "learning_rate": 0.0004852847566307198, + "loss": 3.3932, + "step": 32900 + }, + { + "epoch": 9.59805406665113, + "grad_norm": 0.36314722895622253, + "learning_rate": 0.0004851098805013115, + "loss": 3.3978, + "step": 32950 + }, + { + "epoch": 9.612619436028897, + "grad_norm": 0.35144150257110596, + "learning_rate": 0.00048493500437190315, + "loss": 3.3928, + "step": 33000 + }, + { + "epoch": 9.612619436028897, + "eval_accuracy": 0.367995205767567, + "eval_loss": 3.5680465698242188, + "eval_runtime": 180.7083, + "eval_samples_per_second": 92.088, + "eval_steps_per_second": 5.761, + "step": 33000 + }, + { + "epoch": 9.627184805406666, + "grad_norm": 0.33490830659866333, + "learning_rate": 0.0004847601282424949, + "loss": 3.3954, + "step": 33050 + }, + { + "epoch": 9.641750174784432, + "grad_norm": 0.35083308815956116, + "learning_rate": 0.00048458525211308653, + "loss": 3.3905, + "step": 33100 + }, + { + "epoch": 9.6563155441622, + "grad_norm": 0.3497445285320282, + "learning_rate": 0.00048441037598367817, + "loss": 3.392, + "step": 33150 + }, + { + "epoch": 9.670880913539968, + "grad_norm": 0.37605422735214233, + "learning_rate": 0.00048423549985426986, + "loss": 3.3965, + "step": 33200 + }, + { + "epoch": 9.685446282917734, + "grad_norm": 0.3329659104347229, + "learning_rate": 0.0004840606237248615, + "loss": 3.3914, + "step": 33250 + }, + { + "epoch": 9.700011652295503, + "grad_norm": 0.3359730839729309, + "learning_rate": 0.0004838857475954532, + "loss": 3.3981, + "step": 33300 + }, + { + "epoch": 9.71457702167327, + "grad_norm": 0.3315553367137909, + "learning_rate": 0.0004837108714660448, + "loss": 3.3989, + "step": 33350 + }, + { + "epoch": 9.729142391051036, + "grad_norm": 0.3292211890220642, + "learning_rate": 0.0004835359953366365, + "loss": 3.4053, + "step": 33400 + }, + { + "epoch": 9.743707760428805, + "grad_norm": 0.3730834424495697, + "learning_rate": 0.00048336111920722815, + "loss": 3.4121, + "step": 33450 + }, + { + "epoch": 9.758273129806572, + "grad_norm": 0.3661917448043823, + "learning_rate": 0.0004831862430778198, + "loss": 3.4055, + "step": 33500 + }, + { + "epoch": 9.772838499184338, + "grad_norm": 0.35423949360847473, + "learning_rate": 0.00048301136694841153, + "loss": 3.3926, + "step": 33550 + }, + { + "epoch": 9.787403868562107, + "grad_norm": 0.3627997934818268, + "learning_rate": 0.00048283649081900317, + "loss": 3.4121, + "step": 33600 + }, + { + "epoch": 9.801969237939874, + "grad_norm": 0.34267759323120117, + "learning_rate": 0.00048266161468959486, + "loss": 3.3994, + "step": 33650 + }, + { + "epoch": 9.816534607317642, + "grad_norm": 0.33956974744796753, + "learning_rate": 0.0004824867385601865, + "loss": 3.403, + "step": 33700 + }, + { + "epoch": 9.831099976695409, + "grad_norm": 0.32388654351234436, + "learning_rate": 0.00048231186243077813, + "loss": 3.3997, + "step": 33750 + }, + { + "epoch": 9.845665346073176, + "grad_norm": 0.3442712724208832, + "learning_rate": 0.0004821369863013698, + "loss": 3.4033, + "step": 33800 + }, + { + "epoch": 9.860230715450944, + "grad_norm": 0.3338857889175415, + "learning_rate": 0.00048196211017196146, + "loss": 3.3966, + "step": 33850 + }, + { + "epoch": 9.874796084828711, + "grad_norm": 0.3293977677822113, + "learning_rate": 0.00048178723404255315, + "loss": 3.4135, + "step": 33900 + }, + { + "epoch": 9.88936145420648, + "grad_norm": 0.36276566982269287, + "learning_rate": 0.0004816123579131448, + "loss": 3.4042, + "step": 33950 + }, + { + "epoch": 9.903926823584246, + "grad_norm": 0.35830309987068176, + "learning_rate": 0.0004814374817837364, + "loss": 3.4054, + "step": 34000 + }, + { + "epoch": 9.903926823584246, + "eval_accuracy": 0.36873207440044753, + "eval_loss": 3.5603408813476562, + "eval_runtime": 180.5399, + "eval_samples_per_second": 92.174, + "eval_steps_per_second": 5.766, + "step": 34000 + }, + { + "epoch": 9.918492192962013, + "grad_norm": 0.3282095789909363, + "learning_rate": 0.00048126260565432816, + "loss": 3.4053, + "step": 34050 + }, + { + "epoch": 9.933057562339782, + "grad_norm": 0.32416263222694397, + "learning_rate": 0.0004810877295249198, + "loss": 3.407, + "step": 34100 + }, + { + "epoch": 9.947622931717548, + "grad_norm": 0.33595913648605347, + "learning_rate": 0.0004809128533955115, + "loss": 3.3987, + "step": 34150 + }, + { + "epoch": 9.962188301095315, + "grad_norm": 0.3410275876522064, + "learning_rate": 0.0004807379772661031, + "loss": 3.4105, + "step": 34200 + }, + { + "epoch": 9.976753670473084, + "grad_norm": 0.3720123767852783, + "learning_rate": 0.0004805631011366948, + "loss": 3.4018, + "step": 34250 + }, + { + "epoch": 9.99131903985085, + "grad_norm": 0.3672340512275696, + "learning_rate": 0.00048038822500728645, + "loss": 3.4149, + "step": 34300 + }, + { + "epoch": 10.005826147751106, + "grad_norm": 0.35301780700683594, + "learning_rate": 0.0004802133488778781, + "loss": 3.365, + "step": 34350 + }, + { + "epoch": 10.020391517128875, + "grad_norm": 0.3307633697986603, + "learning_rate": 0.0004800384727484698, + "loss": 3.2972, + "step": 34400 + }, + { + "epoch": 10.034956886506642, + "grad_norm": 0.33249130845069885, + "learning_rate": 0.0004798635966190614, + "loss": 3.3138, + "step": 34450 + }, + { + "epoch": 10.049522255884408, + "grad_norm": 0.3358910083770752, + "learning_rate": 0.00047968872048965316, + "loss": 3.3001, + "step": 34500 + }, + { + "epoch": 10.064087625262177, + "grad_norm": 0.36032402515411377, + "learning_rate": 0.0004795138443602448, + "loss": 3.3061, + "step": 34550 + }, + { + "epoch": 10.078652994639944, + "grad_norm": 0.3334386348724365, + "learning_rate": 0.00047933896823083643, + "loss": 3.3094, + "step": 34600 + }, + { + "epoch": 10.093218364017712, + "grad_norm": 0.3427768647670746, + "learning_rate": 0.0004791640921014281, + "loss": 3.3143, + "step": 34650 + }, + { + "epoch": 10.107783733395479, + "grad_norm": 0.34351709485054016, + "learning_rate": 0.00047898921597201976, + "loss": 3.3146, + "step": 34700 + }, + { + "epoch": 10.122349102773246, + "grad_norm": 0.3420124351978302, + "learning_rate": 0.00047881433984261145, + "loss": 3.3151, + "step": 34750 + }, + { + "epoch": 10.136914472151014, + "grad_norm": 0.35633090138435364, + "learning_rate": 0.0004786394637132031, + "loss": 3.3189, + "step": 34800 + }, + { + "epoch": 10.151479841528781, + "grad_norm": 0.33765673637390137, + "learning_rate": 0.0004784645875837948, + "loss": 3.3225, + "step": 34850 + }, + { + "epoch": 10.166045210906548, + "grad_norm": 0.32485634088516235, + "learning_rate": 0.0004782897114543864, + "loss": 3.3276, + "step": 34900 + }, + { + "epoch": 10.180610580284316, + "grad_norm": 0.3453110456466675, + "learning_rate": 0.00047811483532497805, + "loss": 3.3325, + "step": 34950 + }, + { + "epoch": 10.195175949662083, + "grad_norm": 0.33640891313552856, + "learning_rate": 0.0004779399591955698, + "loss": 3.3389, + "step": 35000 + }, + { + "epoch": 10.195175949662083, + "eval_accuracy": 0.3684539552946763, + "eval_loss": 3.574207305908203, + "eval_runtime": 180.7408, + "eval_samples_per_second": 92.071, + "eval_steps_per_second": 5.76, + "step": 35000 + }, + { + "epoch": 10.209741319039852, + "grad_norm": 0.3730228543281555, + "learning_rate": 0.00047776508306616143, + "loss": 3.3386, + "step": 35050 + }, + { + "epoch": 10.224306688417618, + "grad_norm": 0.337240993976593, + "learning_rate": 0.0004775902069367531, + "loss": 3.3323, + "step": 35100 + }, + { + "epoch": 10.238872057795385, + "grad_norm": 0.35810065269470215, + "learning_rate": 0.00047741533080734476, + "loss": 3.3288, + "step": 35150 + }, + { + "epoch": 10.253437427173154, + "grad_norm": 0.36488911509513855, + "learning_rate": 0.0004772404546779364, + "loss": 3.3393, + "step": 35200 + }, + { + "epoch": 10.26800279655092, + "grad_norm": 0.3996935784816742, + "learning_rate": 0.0004770655785485281, + "loss": 3.3444, + "step": 35250 + }, + { + "epoch": 10.282568165928687, + "grad_norm": 0.3564625382423401, + "learning_rate": 0.0004768907024191197, + "loss": 3.3532, + "step": 35300 + }, + { + "epoch": 10.297133535306456, + "grad_norm": 0.34120872616767883, + "learning_rate": 0.0004767158262897114, + "loss": 3.3494, + "step": 35350 + }, + { + "epoch": 10.311698904684222, + "grad_norm": 0.3489512801170349, + "learning_rate": 0.00047654095016030305, + "loss": 3.3401, + "step": 35400 + }, + { + "epoch": 10.326264274061991, + "grad_norm": 0.3304221034049988, + "learning_rate": 0.0004763660740308948, + "loss": 3.3385, + "step": 35450 + }, + { + "epoch": 10.340829643439758, + "grad_norm": 0.35581955313682556, + "learning_rate": 0.0004761911979014864, + "loss": 3.3412, + "step": 35500 + }, + { + "epoch": 10.355395012817525, + "grad_norm": 0.336819052696228, + "learning_rate": 0.00047601632177207806, + "loss": 3.353, + "step": 35550 + }, + { + "epoch": 10.369960382195293, + "grad_norm": 0.38106346130371094, + "learning_rate": 0.00047584144564266975, + "loss": 3.3524, + "step": 35600 + }, + { + "epoch": 10.38452575157306, + "grad_norm": 0.3523114025592804, + "learning_rate": 0.0004756665695132614, + "loss": 3.3468, + "step": 35650 + }, + { + "epoch": 10.399091120950827, + "grad_norm": 0.3667258620262146, + "learning_rate": 0.0004754916933838531, + "loss": 3.3502, + "step": 35700 + }, + { + "epoch": 10.413656490328595, + "grad_norm": 0.3516891896724701, + "learning_rate": 0.0004753168172544447, + "loss": 3.3623, + "step": 35750 + }, + { + "epoch": 10.428221859706362, + "grad_norm": 0.34636440873146057, + "learning_rate": 0.00047514194112503635, + "loss": 3.3703, + "step": 35800 + }, + { + "epoch": 10.44278722908413, + "grad_norm": 0.3442688584327698, + "learning_rate": 0.00047496706499562804, + "loss": 3.3575, + "step": 35850 + }, + { + "epoch": 10.457352598461897, + "grad_norm": 0.35010236501693726, + "learning_rate": 0.0004747921888662197, + "loss": 3.3677, + "step": 35900 + }, + { + "epoch": 10.471917967839664, + "grad_norm": 0.3223050534725189, + "learning_rate": 0.0004746173127368114, + "loss": 3.361, + "step": 35950 + }, + { + "epoch": 10.486483337217432, + "grad_norm": 0.3651178479194641, + "learning_rate": 0.00047444243660740306, + "loss": 3.3566, + "step": 36000 + }, + { + "epoch": 10.486483337217432, + "eval_accuracy": 0.36855473672116507, + "eval_loss": 3.566204071044922, + "eval_runtime": 180.6127, + "eval_samples_per_second": 92.136, + "eval_steps_per_second": 5.764, + "step": 36000 + }, + { + "epoch": 10.5010487065952, + "grad_norm": 0.34635260701179504, + "learning_rate": 0.0004742675604779947, + "loss": 3.3656, + "step": 36050 + }, + { + "epoch": 10.515614075972966, + "grad_norm": 0.3358325660228729, + "learning_rate": 0.0004740926843485864, + "loss": 3.3699, + "step": 36100 + }, + { + "epoch": 10.530179445350734, + "grad_norm": 0.3611391484737396, + "learning_rate": 0.000473917808219178, + "loss": 3.3752, + "step": 36150 + }, + { + "epoch": 10.544744814728501, + "grad_norm": 0.34786006808280945, + "learning_rate": 0.0004737429320897697, + "loss": 3.3712, + "step": 36200 + }, + { + "epoch": 10.55931018410627, + "grad_norm": 0.34254857897758484, + "learning_rate": 0.00047356805596036135, + "loss": 3.3698, + "step": 36250 + }, + { + "epoch": 10.573875553484037, + "grad_norm": 0.3432486951351166, + "learning_rate": 0.00047339317983095304, + "loss": 3.358, + "step": 36300 + }, + { + "epoch": 10.588440922861803, + "grad_norm": 0.3423634171485901, + "learning_rate": 0.0004732183037015447, + "loss": 3.3799, + "step": 36350 + }, + { + "epoch": 10.603006292239572, + "grad_norm": 0.3549594283103943, + "learning_rate": 0.0004730434275721363, + "loss": 3.3809, + "step": 36400 + }, + { + "epoch": 10.617571661617339, + "grad_norm": 0.36914047598838806, + "learning_rate": 0.00047286855144272806, + "loss": 3.3755, + "step": 36450 + }, + { + "epoch": 10.632137030995105, + "grad_norm": 0.34810906648635864, + "learning_rate": 0.0004726936753133197, + "loss": 3.3678, + "step": 36500 + }, + { + "epoch": 10.646702400372874, + "grad_norm": 0.36608827114105225, + "learning_rate": 0.0004725187991839114, + "loss": 3.3748, + "step": 36550 + }, + { + "epoch": 10.66126776975064, + "grad_norm": 0.3450092077255249, + "learning_rate": 0.000472343923054503, + "loss": 3.3695, + "step": 36600 + }, + { + "epoch": 10.675833139128407, + "grad_norm": 0.3681490421295166, + "learning_rate": 0.00047216904692509465, + "loss": 3.3782, + "step": 36650 + }, + { + "epoch": 10.690398508506176, + "grad_norm": 0.369987428188324, + "learning_rate": 0.00047199417079568634, + "loss": 3.3902, + "step": 36700 + }, + { + "epoch": 10.704963877883943, + "grad_norm": 0.3441363275051117, + "learning_rate": 0.000471819294666278, + "loss": 3.3744, + "step": 36750 + }, + { + "epoch": 10.719529247261711, + "grad_norm": 0.35835736989974976, + "learning_rate": 0.00047164441853686967, + "loss": 3.3766, + "step": 36800 + }, + { + "epoch": 10.734094616639478, + "grad_norm": 0.34251275658607483, + "learning_rate": 0.0004714695424074613, + "loss": 3.3909, + "step": 36850 + }, + { + "epoch": 10.748659986017245, + "grad_norm": 0.3410945236682892, + "learning_rate": 0.00047129466627805305, + "loss": 3.3767, + "step": 36900 + }, + { + "epoch": 10.763225355395013, + "grad_norm": 0.3362538516521454, + "learning_rate": 0.0004711197901486447, + "loss": 3.3841, + "step": 36950 + }, + { + "epoch": 10.77779072477278, + "grad_norm": 0.3534057140350342, + "learning_rate": 0.0004709449140192363, + "loss": 3.3865, + "step": 37000 + }, + { + "epoch": 10.77779072477278, + "eval_accuracy": 0.36938544850263144, + "eval_loss": 3.5582756996154785, + "eval_runtime": 180.5512, + "eval_samples_per_second": 92.168, + "eval_steps_per_second": 5.766, + "step": 37000 + }, + { + "epoch": 10.792356094150549, + "grad_norm": 0.3291257917881012, + "learning_rate": 0.000470770037889828, + "loss": 3.3841, + "step": 37050 + }, + { + "epoch": 10.806921463528315, + "grad_norm": 0.31563735008239746, + "learning_rate": 0.00047059516176041965, + "loss": 3.3827, + "step": 37100 + }, + { + "epoch": 10.821486832906082, + "grad_norm": 0.34236061573028564, + "learning_rate": 0.00047042028563101134, + "loss": 3.3738, + "step": 37150 + }, + { + "epoch": 10.83605220228385, + "grad_norm": 0.3518962860107422, + "learning_rate": 0.000470245409501603, + "loss": 3.3873, + "step": 37200 + }, + { + "epoch": 10.850617571661617, + "grad_norm": 0.34845709800720215, + "learning_rate": 0.0004700705333721946, + "loss": 3.3749, + "step": 37250 + }, + { + "epoch": 10.865182941039384, + "grad_norm": 0.3321661651134491, + "learning_rate": 0.0004698956572427863, + "loss": 3.3812, + "step": 37300 + }, + { + "epoch": 10.879748310417153, + "grad_norm": 0.35323044657707214, + "learning_rate": 0.00046972078111337794, + "loss": 3.3762, + "step": 37350 + }, + { + "epoch": 10.89431367979492, + "grad_norm": 0.358534574508667, + "learning_rate": 0.0004695459049839697, + "loss": 3.3799, + "step": 37400 + }, + { + "epoch": 10.908879049172686, + "grad_norm": 0.3211372494697571, + "learning_rate": 0.0004693710288545613, + "loss": 3.3873, + "step": 37450 + }, + { + "epoch": 10.923444418550455, + "grad_norm": 0.3522772490978241, + "learning_rate": 0.000469196152725153, + "loss": 3.393, + "step": 37500 + }, + { + "epoch": 10.938009787928221, + "grad_norm": 0.329208105802536, + "learning_rate": 0.00046902127659574465, + "loss": 3.3878, + "step": 37550 + }, + { + "epoch": 10.95257515730599, + "grad_norm": 0.3144879639148712, + "learning_rate": 0.0004688464004663363, + "loss": 3.3864, + "step": 37600 + }, + { + "epoch": 10.967140526683757, + "grad_norm": 0.351262629032135, + "learning_rate": 0.000468671524336928, + "loss": 3.3842, + "step": 37650 + }, + { + "epoch": 10.981705896061523, + "grad_norm": 0.335332989692688, + "learning_rate": 0.0004684966482075196, + "loss": 3.3909, + "step": 37700 + }, + { + "epoch": 10.996271265439292, + "grad_norm": 0.3745553493499756, + "learning_rate": 0.0004683217720781113, + "loss": 3.3869, + "step": 37750 + }, + { + "epoch": 11.010778373339548, + "grad_norm": 0.39662572741508484, + "learning_rate": 0.00046814689594870294, + "loss": 3.3025, + "step": 37800 + }, + { + "epoch": 11.025343742717315, + "grad_norm": 0.35523492097854614, + "learning_rate": 0.0004679720198192946, + "loss": 3.2731, + "step": 37850 + }, + { + "epoch": 11.039909112095083, + "grad_norm": 0.36821720004081726, + "learning_rate": 0.0004677971436898863, + "loss": 3.2698, + "step": 37900 + }, + { + "epoch": 11.05447448147285, + "grad_norm": 0.37665146589279175, + "learning_rate": 0.00046762226756047795, + "loss": 3.2943, + "step": 37950 + }, + { + "epoch": 11.069039850850617, + "grad_norm": 0.33821412920951843, + "learning_rate": 0.00046744739143106964, + "loss": 3.3071, + "step": 38000 + }, + { + "epoch": 11.069039850850617, + "eval_accuracy": 0.3689831459821903, + "eval_loss": 3.5702402591705322, + "eval_runtime": 180.66, + "eval_samples_per_second": 92.112, + "eval_steps_per_second": 5.762, + "step": 38000 + }, + { + "epoch": 11.083605220228385, + "grad_norm": 0.3681955635547638, + "learning_rate": 0.0004672725153016613, + "loss": 3.3036, + "step": 38050 + }, + { + "epoch": 11.098170589606152, + "grad_norm": 0.34856081008911133, + "learning_rate": 0.00046709763917225297, + "loss": 3.2974, + "step": 38100 + }, + { + "epoch": 11.11273595898392, + "grad_norm": 0.3701181411743164, + "learning_rate": 0.0004669227630428446, + "loss": 3.2957, + "step": 38150 + }, + { + "epoch": 11.127301328361687, + "grad_norm": 0.3402550220489502, + "learning_rate": 0.00046674788691343624, + "loss": 3.3014, + "step": 38200 + }, + { + "epoch": 11.141866697739454, + "grad_norm": 0.3537476658821106, + "learning_rate": 0.00046657301078402793, + "loss": 3.2916, + "step": 38250 + }, + { + "epoch": 11.156432067117223, + "grad_norm": 0.3368578553199768, + "learning_rate": 0.00046639813465461957, + "loss": 3.2984, + "step": 38300 + }, + { + "epoch": 11.17099743649499, + "grad_norm": 0.35655760765075684, + "learning_rate": 0.0004662232585252113, + "loss": 3.3049, + "step": 38350 + }, + { + "epoch": 11.185562805872756, + "grad_norm": 0.32713720202445984, + "learning_rate": 0.00046604838239580295, + "loss": 3.307, + "step": 38400 + }, + { + "epoch": 11.200128175250525, + "grad_norm": 0.3530442416667938, + "learning_rate": 0.0004658735062663946, + "loss": 3.3251, + "step": 38450 + }, + { + "epoch": 11.214693544628291, + "grad_norm": 0.3867241144180298, + "learning_rate": 0.0004656986301369863, + "loss": 3.3222, + "step": 38500 + }, + { + "epoch": 11.22925891400606, + "grad_norm": 0.3441658616065979, + "learning_rate": 0.0004655237540075779, + "loss": 3.3201, + "step": 38550 + }, + { + "epoch": 11.243824283383827, + "grad_norm": 0.3687216639518738, + "learning_rate": 0.0004653488778781696, + "loss": 3.3179, + "step": 38600 + }, + { + "epoch": 11.258389652761593, + "grad_norm": 0.3919098973274231, + "learning_rate": 0.00046517400174876124, + "loss": 3.3203, + "step": 38650 + }, + { + "epoch": 11.272955022139362, + "grad_norm": 0.3542415201663971, + "learning_rate": 0.0004649991256193529, + "loss": 3.3219, + "step": 38700 + }, + { + "epoch": 11.287520391517129, + "grad_norm": 0.33851155638694763, + "learning_rate": 0.00046482424948994457, + "loss": 3.3277, + "step": 38750 + }, + { + "epoch": 11.302085760894895, + "grad_norm": 0.3498404026031494, + "learning_rate": 0.0004646493733605362, + "loss": 3.3304, + "step": 38800 + }, + { + "epoch": 11.316651130272664, + "grad_norm": 0.3452480435371399, + "learning_rate": 0.00046447449723112795, + "loss": 3.3376, + "step": 38850 + }, + { + "epoch": 11.33121649965043, + "grad_norm": 0.3540496230125427, + "learning_rate": 0.0004642996211017196, + "loss": 3.3389, + "step": 38900 + }, + { + "epoch": 11.3457818690282, + "grad_norm": 0.3557718098163605, + "learning_rate": 0.0004641247449723113, + "loss": 3.3363, + "step": 38950 + }, + { + "epoch": 11.360347238405966, + "grad_norm": 0.3436686396598816, + "learning_rate": 0.0004639498688429029, + "loss": 3.3269, + "step": 39000 + }, + { + "epoch": 11.360347238405966, + "eval_accuracy": 0.3691524870021947, + "eval_loss": 3.5654149055480957, + "eval_runtime": 180.5254, + "eval_samples_per_second": 92.181, + "eval_steps_per_second": 5.767, + "step": 39000 + }, + { + "epoch": 11.374912607783733, + "grad_norm": 0.35435765981674194, + "learning_rate": 0.00046377499271349455, + "loss": 3.3498, + "step": 39050 + }, + { + "epoch": 11.389477977161501, + "grad_norm": 0.33486226201057434, + "learning_rate": 0.00046360011658408624, + "loss": 3.334, + "step": 39100 + }, + { + "epoch": 11.404043346539268, + "grad_norm": 0.3701058328151703, + "learning_rate": 0.00046342524045467787, + "loss": 3.3274, + "step": 39150 + }, + { + "epoch": 11.418608715917035, + "grad_norm": 0.3655923306941986, + "learning_rate": 0.00046325036432526956, + "loss": 3.3407, + "step": 39200 + }, + { + "epoch": 11.433174085294803, + "grad_norm": 0.3761111795902252, + "learning_rate": 0.0004630754881958612, + "loss": 3.3474, + "step": 39250 + }, + { + "epoch": 11.44773945467257, + "grad_norm": 0.3308749198913574, + "learning_rate": 0.00046290061206645284, + "loss": 3.329, + "step": 39300 + }, + { + "epoch": 11.462304824050339, + "grad_norm": 0.3537072241306305, + "learning_rate": 0.0004627257359370446, + "loss": 3.3509, + "step": 39350 + }, + { + "epoch": 11.476870193428105, + "grad_norm": 0.36065319180488586, + "learning_rate": 0.0004625508598076362, + "loss": 3.3393, + "step": 39400 + }, + { + "epoch": 11.491435562805872, + "grad_norm": 0.3387892246246338, + "learning_rate": 0.0004623759836782279, + "loss": 3.3344, + "step": 39450 + }, + { + "epoch": 11.50600093218364, + "grad_norm": 0.3416099548339844, + "learning_rate": 0.00046220110754881954, + "loss": 3.3415, + "step": 39500 + }, + { + "epoch": 11.520566301561407, + "grad_norm": 0.34926944971084595, + "learning_rate": 0.00046202623141941123, + "loss": 3.3434, + "step": 39550 + }, + { + "epoch": 11.535131670939174, + "grad_norm": 0.3529592752456665, + "learning_rate": 0.00046185135529000287, + "loss": 3.352, + "step": 39600 + }, + { + "epoch": 11.549697040316943, + "grad_norm": 0.32331353425979614, + "learning_rate": 0.0004616764791605945, + "loss": 3.3386, + "step": 39650 + }, + { + "epoch": 11.56426240969471, + "grad_norm": 0.32540374994277954, + "learning_rate": 0.0004615016030311862, + "loss": 3.3506, + "step": 39700 + }, + { + "epoch": 11.578827779072478, + "grad_norm": 0.36556684970855713, + "learning_rate": 0.00046132672690177783, + "loss": 3.3511, + "step": 39750 + }, + { + "epoch": 11.593393148450245, + "grad_norm": 0.37641826272010803, + "learning_rate": 0.0004611518507723696, + "loss": 3.3488, + "step": 39800 + }, + { + "epoch": 11.607958517828012, + "grad_norm": 0.33244505524635315, + "learning_rate": 0.0004609769746429612, + "loss": 3.3593, + "step": 39850 + }, + { + "epoch": 11.62252388720578, + "grad_norm": 0.3254227042198181, + "learning_rate": 0.00046080209851355285, + "loss": 3.3406, + "step": 39900 + }, + { + "epoch": 11.637089256583547, + "grad_norm": 0.35844728350639343, + "learning_rate": 0.00046062722238414454, + "loss": 3.351, + "step": 39950 + }, + { + "epoch": 11.651654625961314, + "grad_norm": 0.34296858310699463, + "learning_rate": 0.0004604523462547362, + "loss": 3.3528, + "step": 40000 + }, + { + "epoch": 11.651654625961314, + "eval_accuracy": 0.3700508175937323, + "eval_loss": 3.5568654537200928, + "eval_runtime": 182.4459, + "eval_samples_per_second": 91.211, + "eval_steps_per_second": 5.706, + "step": 40000 + } + ], + "logging_steps": 50, + "max_steps": 171650, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.36068395515904e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}