diff --git "a/checkpoint-30000/trainer_state.json" "b/checkpoint-30000/trainer_state.json" --- "a/checkpoint-30000/trainer_state.json" +++ "b/checkpoint-30000/trainer_state.json" @@ -1,7 +1,7 @@ { - "best_metric": 3.4922268390655518, - "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_8397/checkpoint-30000", - "epoch": 3.228931223764934, + "best_metric": 3.4946022033691406, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_high_2000_8397/checkpoint-30000", + "epoch": 3.234501347708895, "eval_steps": 1000, "global_step": 30000, "is_hyper_param_search": false, @@ -9,4478 +9,4478 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.005381552039608223, - "grad_norm": 1.3847299814224243, - "learning_rate": 0.0003, - "loss": 8.4655, + "epoch": 0.005390835579514825, + "grad_norm": 1.6020084619522095, + "learning_rate": 0.00028199999999999997, + "loss": 8.5757, "step": 50 }, { - "epoch": 0.010763104079216447, - "grad_norm": 1.0632425546646118, - "learning_rate": 0.0006, - "loss": 6.8249, + "epoch": 0.01078167115902965, + "grad_norm": 1.8397823572158813, + "learning_rate": 0.0005819999999999999, + "loss": 6.94, "step": 100 }, { - "epoch": 0.01614465611882467, - "grad_norm": 2.088347911834717, - "learning_rate": 0.0005996767589699385, - "loss": 6.4221, + "epoch": 0.016172506738544475, + "grad_norm": 2.0749616622924805, + "learning_rate": 0.0005996956287101997, + "loss": 6.498, "step": 150 }, { - "epoch": 0.021526208158432893, - "grad_norm": 1.0710954666137695, - "learning_rate": 0.0005993535179398771, - "loss": 6.1909, + "epoch": 0.0215633423180593, + "grad_norm": 2.3127713203430176, + "learning_rate": 0.0005993718294657311, + "loss": 6.2576, "step": 200 }, { - "epoch": 0.026907760198041114, - "grad_norm": 1.5236896276474, - "learning_rate": 0.0005990302769098158, - "loss": 6.0442, + "epoch": 0.026954177897574125, + "grad_norm": 0.8723729848861694, + "learning_rate": 0.0005990480302212627, + "loss": 6.1064, "step": 250 }, { - "epoch": 0.03228931223764934, - "grad_norm": 1.8247500658035278, - "learning_rate": 0.0005987070358797543, - "loss": 5.9513, + "epoch": 0.03234501347708895, + "grad_norm": 1.36876380443573, + "learning_rate": 0.0005987242309767944, + "loss": 6.0069, "step": 300 }, { - "epoch": 0.03767086427725756, - "grad_norm": 1.2985302209854126, - "learning_rate": 0.0005983837948496929, - "loss": 5.8643, + "epoch": 0.03773584905660377, + "grad_norm": 1.1116160154342651, + "learning_rate": 0.0005984004317323259, + "loss": 5.8938, "step": 350 }, { - "epoch": 0.04305241631686579, - "grad_norm": 1.256401538848877, - "learning_rate": 0.0005980605538196314, - "loss": 5.8018, + "epoch": 0.0431266846361186, + "grad_norm": 1.7246553897857666, + "learning_rate": 0.0005980766324878575, + "loss": 5.8482, "step": 400 }, { - "epoch": 0.048433968356474004, - "grad_norm": 0.8569295406341553, - "learning_rate": 0.0005977373127895701, - "loss": 5.7053, + "epoch": 0.04851752021563342, + "grad_norm": 1.5949625968933105, + "learning_rate": 0.000597752833243389, + "loss": 5.7717, "step": 450 }, { - "epoch": 0.05381552039608223, - "grad_norm": 0.860461413860321, - "learning_rate": 0.0005974140717595086, - "loss": 5.6413, + "epoch": 0.05390835579514825, + "grad_norm": 1.2913446426391602, + "learning_rate": 0.0005974290339989207, + "loss": 5.6782, "step": 500 }, { - "epoch": 0.05919707243569045, - "grad_norm": 1.6800360679626465, - "learning_rate": 0.0005970908307294472, - "loss": 5.5789, + "epoch": 0.05929919137466307, + "grad_norm": 1.3474897146224976, + "learning_rate": 0.0005971052347544522, + "loss": 5.6001, "step": 550 }, { - "epoch": 0.06457862447529868, - "grad_norm": 1.408766508102417, - "learning_rate": 0.0005967675896993858, - "loss": 5.4957, + "epoch": 0.0646900269541779, + "grad_norm": 1.230842113494873, + "learning_rate": 0.0005967814355099838, + "loss": 5.5233, "step": 600 }, { - "epoch": 0.0699601765149069, - "grad_norm": 1.263355016708374, - "learning_rate": 0.0005964443486693243, - "loss": 5.421, + "epoch": 0.07008086253369272, + "grad_norm": 1.3157235383987427, + "learning_rate": 0.0005964576362655153, + "loss": 5.4574, "step": 650 }, { - "epoch": 0.07534172855451512, - "grad_norm": 1.617833137512207, - "learning_rate": 0.000596121107639263, - "loss": 5.334, + "epoch": 0.07547169811320754, + "grad_norm": 1.1696827411651611, + "learning_rate": 0.0005961338370210469, + "loss": 5.4063, "step": 700 }, { - "epoch": 0.08072328059412334, - "grad_norm": 1.2163816690444946, - "learning_rate": 0.0005957978666092015, - "loss": 5.2925, + "epoch": 0.08086253369272237, + "grad_norm": 1.3056632280349731, + "learning_rate": 0.0005958100377765785, + "loss": 5.3331, "step": 750 }, { - "epoch": 0.08610483263373157, - "grad_norm": 1.4054591655731201, - "learning_rate": 0.0005954746255791401, - "loss": 5.222, + "epoch": 0.0862533692722372, + "grad_norm": 1.6241542100906372, + "learning_rate": 0.00059548623853211, + "loss": 5.2849, "step": 800 }, { - "epoch": 0.09148638467333979, - "grad_norm": 1.0674018859863281, - "learning_rate": 0.0005951513845490787, - "loss": 5.2071, + "epoch": 0.09164420485175202, + "grad_norm": 1.0650733709335327, + "learning_rate": 0.0005951624392876416, + "loss": 5.2456, "step": 850 }, { - "epoch": 0.09686793671294801, - "grad_norm": 1.4989951848983765, - "learning_rate": 0.0005948281435190174, - "loss": 5.1526, + "epoch": 0.09703504043126684, + "grad_norm": 1.5372745990753174, + "learning_rate": 0.0005948386400431732, + "loss": 5.191, "step": 900 }, { - "epoch": 0.10224948875255624, - "grad_norm": 1.028799057006836, - "learning_rate": 0.0005945049024889559, - "loss": 5.1098, + "epoch": 0.10242587601078167, + "grad_norm": 1.4312268495559692, + "learning_rate": 0.0005945148407987047, + "loss": 5.1659, "step": 950 }, { - "epoch": 0.10763104079216446, - "grad_norm": 0.955634355545044, - "learning_rate": 0.0005941816614588944, - "loss": 5.0633, + "epoch": 0.1078167115902965, + "grad_norm": 0.9435037970542908, + "learning_rate": 0.0005941910415542363, + "loss": 5.1266, "step": 1000 }, { - "epoch": 0.10763104079216446, - "eval_accuracy": 0.22915491630956988, - "eval_loss": 4.996466636657715, - "eval_runtime": 202.5064, - "eval_samples_per_second": 88.94, - "eval_steps_per_second": 5.56, + "epoch": 0.1078167115902965, + "eval_accuracy": 0.22553308094716198, + "eval_loss": 5.041633605957031, + "eval_runtime": 152.6311, + "eval_samples_per_second": 118.004, + "eval_steps_per_second": 7.377, "step": 1000 }, { - "epoch": 0.11301259283177269, - "grad_norm": 1.2216582298278809, - "learning_rate": 0.000593858420428833, - "loss": 5.0219, + "epoch": 0.11320754716981132, + "grad_norm": 1.088536024093628, + "learning_rate": 0.0005938672423097679, + "loss": 5.0725, "step": 1050 }, { - "epoch": 0.1183941448713809, - "grad_norm": 1.2919893264770508, - "learning_rate": 0.0005935351793987716, - "loss": 4.9962, + "epoch": 0.11859838274932614, + "grad_norm": 0.9872896075248718, + "learning_rate": 0.0005935434430652995, + "loss": 5.0443, "step": 1100 }, { - "epoch": 0.12377569691098914, - "grad_norm": 1.842529058456421, - "learning_rate": 0.0005932119383687103, - "loss": 4.9736, + "epoch": 0.12398921832884097, + "grad_norm": 0.9727984070777893, + "learning_rate": 0.000593219643820831, + "loss": 5.0164, "step": 1150 }, { - "epoch": 0.12915724895059735, - "grad_norm": 0.9416630268096924, - "learning_rate": 0.0005928886973386488, - "loss": 4.9687, + "epoch": 0.1293800539083558, + "grad_norm": 1.0641847848892212, + "learning_rate": 0.0005928958445763626, + "loss": 4.978, "step": 1200 }, { - "epoch": 0.13453880099020557, - "grad_norm": 1.340468406677246, - "learning_rate": 0.0005925654563085874, - "loss": 4.9251, + "epoch": 0.1347708894878706, + "grad_norm": 0.8890565633773804, + "learning_rate": 0.0005925720453318941, + "loss": 4.9292, "step": 1250 }, { - "epoch": 0.1399203530298138, - "grad_norm": 1.4057867527008057, - "learning_rate": 0.000592242215278526, - "loss": 4.8782, + "epoch": 0.14016172506738545, + "grad_norm": 1.0303993225097656, + "learning_rate": 0.0005922482460874258, + "loss": 4.9, "step": 1300 }, { - "epoch": 0.14530190506942203, - "grad_norm": 1.3462789058685303, - "learning_rate": 0.0005919189742484645, - "loss": 4.8431, + "epoch": 0.14555256064690028, + "grad_norm": 1.0149179697036743, + "learning_rate": 0.0005919244468429573, + "loss": 4.8846, "step": 1350 }, { - "epoch": 0.15068345710903025, - "grad_norm": 0.9291279911994934, - "learning_rate": 0.0005915957332184032, - "loss": 4.8316, + "epoch": 0.1509433962264151, + "grad_norm": 1.0036956071853638, + "learning_rate": 0.0005916006475984889, + "loss": 4.8728, "step": 1400 }, { - "epoch": 0.15606500914863847, - "grad_norm": 0.7789126634597778, - "learning_rate": 0.0005912724921883417, - "loss": 4.8028, + "epoch": 0.15633423180592992, + "grad_norm": 0.8696756958961487, + "learning_rate": 0.0005912768483540205, + "loss": 4.8206, "step": 1450 }, { - "epoch": 0.16144656118824668, - "grad_norm": 1.2111430168151855, - "learning_rate": 0.0005909492511582803, - "loss": 4.7837, + "epoch": 0.16172506738544473, + "grad_norm": 1.1622956991195679, + "learning_rate": 0.0005909530491095521, + "loss": 4.8359, "step": 1500 }, { - "epoch": 0.1668281132278549, - "grad_norm": 1.1703121662139893, - "learning_rate": 0.0005906260101282189, - "loss": 4.7456, + "epoch": 0.16711590296495957, + "grad_norm": 1.1141048669815063, + "learning_rate": 0.0005906292498650836, + "loss": 4.7878, "step": 1550 }, { - "epoch": 0.17220966526746315, - "grad_norm": 0.9755436182022095, - "learning_rate": 0.0005903027690981575, - "loss": 4.7303, + "epoch": 0.1725067385444744, + "grad_norm": 1.274352788925171, + "learning_rate": 0.0005903054506206151, + "loss": 4.7862, "step": 1600 }, { - "epoch": 0.17759121730707136, - "grad_norm": 0.9121830463409424, - "learning_rate": 0.000589979528068096, - "loss": 4.7437, + "epoch": 0.1778975741239892, + "grad_norm": 0.8880785703659058, + "learning_rate": 0.0005899816513761468, + "loss": 4.7445, "step": 1650 }, { - "epoch": 0.18297276934667958, - "grad_norm": 1.2337779998779297, - "learning_rate": 0.0005896562870380347, - "loss": 4.6991, + "epoch": 0.18328840970350405, + "grad_norm": 0.9446322321891785, + "learning_rate": 0.0005896578521316783, + "loss": 4.7346, "step": 1700 }, { - "epoch": 0.1883543213862878, - "grad_norm": 0.7923302054405212, - "learning_rate": 0.0005893330460079732, - "loss": 4.6819, + "epoch": 0.18867924528301888, + "grad_norm": 0.9463135600090027, + "learning_rate": 0.0005893340528872099, + "loss": 4.6952, "step": 1750 }, { - "epoch": 0.19373587342589602, - "grad_norm": 0.9414947032928467, - "learning_rate": 0.0005890098049779118, - "loss": 4.6401, + "epoch": 0.1940700808625337, + "grad_norm": 0.8582187294960022, + "learning_rate": 0.0005890102536427414, + "loss": 4.7029, "step": 1800 }, { - "epoch": 0.19911742546550426, - "grad_norm": 0.8575401306152344, - "learning_rate": 0.0005886865639478504, - "loss": 4.6503, + "epoch": 0.19946091644204852, + "grad_norm": 0.9058123826980591, + "learning_rate": 0.0005886864543982731, + "loss": 4.6735, "step": 1850 }, { - "epoch": 0.20449897750511248, - "grad_norm": 0.874113917350769, - "learning_rate": 0.0005883633229177889, - "loss": 4.6262, + "epoch": 0.20485175202156333, + "grad_norm": 0.9203746318817139, + "learning_rate": 0.0005883626551538046, + "loss": 4.6596, "step": 1900 }, { - "epoch": 0.2098805295447207, - "grad_norm": 0.7579442262649536, - "learning_rate": 0.0005880400818877276, - "loss": 4.5946, + "epoch": 0.21024258760107817, + "grad_norm": 1.0241466760635376, + "learning_rate": 0.0005880388559093362, + "loss": 4.6282, "step": 1950 }, { - "epoch": 0.2152620815843289, - "grad_norm": 0.8062272071838379, - "learning_rate": 0.0005877168408576662, - "loss": 4.579, + "epoch": 0.215633423180593, + "grad_norm": 1.1103599071502686, + "learning_rate": 0.0005877150566648677, + "loss": 4.6194, "step": 2000 }, { - "epoch": 0.2152620815843289, - "eval_accuracy": 0.2712283567521957, - "eval_loss": 4.50425910949707, - "eval_runtime": 211.1194, - "eval_samples_per_second": 85.312, - "eval_steps_per_second": 5.333, + "epoch": 0.215633423180593, + "eval_accuracy": 0.2670949835939572, + "eval_loss": 4.5360541343688965, + "eval_runtime": 152.732, + "eval_samples_per_second": 117.926, + "eval_steps_per_second": 7.372, "step": 2000 }, { - "epoch": 0.22064363362393713, - "grad_norm": 1.1572167873382568, - "learning_rate": 0.0005873935998276048, - "loss": 4.5564, + "epoch": 0.2210242587601078, + "grad_norm": 0.8629128336906433, + "learning_rate": 0.0005873912574203993, + "loss": 4.5954, "step": 2050 }, { - "epoch": 0.22602518566354537, - "grad_norm": 1.1494579315185547, - "learning_rate": 0.0005870703587975433, - "loss": 4.5425, + "epoch": 0.22641509433962265, + "grad_norm": 0.7517758011817932, + "learning_rate": 0.0005870674581759309, + "loss": 4.5774, "step": 2100 }, { - "epoch": 0.2314067377031536, - "grad_norm": 0.9105241298675537, - "learning_rate": 0.0005867471177674818, - "loss": 4.5083, + "epoch": 0.23180592991913745, + "grad_norm": 1.1525517702102661, + "learning_rate": 0.0005867436589314624, + "loss": 4.5567, "step": 2150 }, { - "epoch": 0.2367882897427618, - "grad_norm": 0.9526411890983582, - "learning_rate": 0.0005864238767374205, - "loss": 4.5128, + "epoch": 0.2371967654986523, + "grad_norm": 1.0665569305419922, + "learning_rate": 0.000586419859686994, + "loss": 4.5313, "step": 2200 }, { - "epoch": 0.24216984178237003, - "grad_norm": 0.7726457715034485, - "learning_rate": 0.0005861006357073591, - "loss": 4.4899, + "epoch": 0.24258760107816713, + "grad_norm": 0.8165591359138489, + "learning_rate": 0.0005860960604425256, + "loss": 4.5156, "step": 2250 }, { - "epoch": 0.24755139382197827, - "grad_norm": 1.0442702770233154, - "learning_rate": 0.0005857773946772977, - "loss": 4.4716, + "epoch": 0.24797843665768193, + "grad_norm": 1.0752588510513306, + "learning_rate": 0.0005857722611980571, + "loss": 4.526, "step": 2300 }, { - "epoch": 0.2529329458615865, - "grad_norm": 1.1448233127593994, - "learning_rate": 0.0005854541536472362, - "loss": 4.4607, + "epoch": 0.25336927223719674, + "grad_norm": 0.9010254740715027, + "learning_rate": 0.0005854484619535887, + "loss": 4.4768, "step": 2350 }, { - "epoch": 0.2583144979011947, - "grad_norm": 0.9515467286109924, - "learning_rate": 0.0005851309126171749, - "loss": 4.4544, + "epoch": 0.2587601078167116, + "grad_norm": 0.813308596611023, + "learning_rate": 0.0005851246627091202, + "loss": 4.486, "step": 2400 }, { - "epoch": 0.2636960499408029, - "grad_norm": 0.7929104566574097, - "learning_rate": 0.0005848076715871134, - "loss": 4.4392, + "epoch": 0.2641509433962264, + "grad_norm": 0.997901976108551, + "learning_rate": 0.0005848008634646519, + "loss": 4.4602, "step": 2450 }, { - "epoch": 0.26907760198041114, - "grad_norm": 1.1213116645812988, - "learning_rate": 0.000584484430557052, - "loss": 4.4407, + "epoch": 0.2695417789757412, + "grad_norm": 1.0612961053848267, + "learning_rate": 0.0005844770642201834, + "loss": 4.4441, "step": 2500 }, { - "epoch": 0.27445915402001936, - "grad_norm": 1.006108045578003, - "learning_rate": 0.0005841611895269906, - "loss": 4.3997, + "epoch": 0.2749326145552561, + "grad_norm": 0.7074645757675171, + "learning_rate": 0.000584153264975715, + "loss": 4.4268, "step": 2550 }, { - "epoch": 0.2798407060596276, - "grad_norm": 0.7756773829460144, - "learning_rate": 0.0005838379484969291, - "loss": 4.3982, + "epoch": 0.2803234501347709, + "grad_norm": 0.7893711924552917, + "learning_rate": 0.0005838294657312465, + "loss": 4.3983, "step": 2600 }, { - "epoch": 0.2852222580992358, - "grad_norm": 0.7442255020141602, - "learning_rate": 0.0005835147074668678, - "loss": 4.3751, + "epoch": 0.2857142857142857, + "grad_norm": 0.8888121843338013, + "learning_rate": 0.0005835056664867782, + "loss": 4.4127, "step": 2650 }, { - "epoch": 0.29060381013884407, - "grad_norm": 0.7645350694656372, - "learning_rate": 0.0005831914664368063, - "loss": 4.3873, + "epoch": 0.29110512129380056, + "grad_norm": 0.8462523221969604, + "learning_rate": 0.0005831818672423098, + "loss": 4.4129, "step": 2700 }, { - "epoch": 0.2959853621784523, - "grad_norm": 0.7028666734695435, - "learning_rate": 0.0005828682254067449, - "loss": 4.3659, + "epoch": 0.29649595687331537, + "grad_norm": 0.8949580192565918, + "learning_rate": 0.0005828580679978413, + "loss": 4.3751, "step": 2750 }, { - "epoch": 0.3013669142180605, - "grad_norm": 0.8504071831703186, - "learning_rate": 0.0005825449843766835, - "loss": 4.3617, + "epoch": 0.3018867924528302, + "grad_norm": 0.7590197324752808, + "learning_rate": 0.0005825342687533729, + "loss": 4.3503, "step": 2800 }, { - "epoch": 0.3067484662576687, - "grad_norm": 0.880102276802063, - "learning_rate": 0.0005822217433466221, - "loss": 4.3115, + "epoch": 0.30727762803234504, + "grad_norm": 0.6843862533569336, + "learning_rate": 0.0005822104695089044, + "loss": 4.3515, "step": 2850 }, { - "epoch": 0.31213001829727693, - "grad_norm": 0.7290977835655212, - "learning_rate": 0.0005818985023165607, - "loss": 4.3215, + "epoch": 0.31266846361185985, + "grad_norm": 0.8599426746368408, + "learning_rate": 0.000581886670264436, + "loss": 4.342, "step": 2900 }, { - "epoch": 0.31751157033688515, - "grad_norm": 0.8019910454750061, - "learning_rate": 0.0005815752612864992, - "loss": 4.3149, + "epoch": 0.31805929919137466, + "grad_norm": 0.893551766872406, + "learning_rate": 0.0005815628710199675, + "loss": 4.3359, "step": 2950 }, { - "epoch": 0.32289312237649337, - "grad_norm": 0.7739085555076599, - "learning_rate": 0.0005812520202564378, - "loss": 4.322, + "epoch": 0.32345013477088946, + "grad_norm": 0.8023635149002075, + "learning_rate": 0.0005812390717754992, + "loss": 4.3296, "step": 3000 }, { - "epoch": 0.32289312237649337, - "eval_accuracy": 0.2992396144952079, - "eval_loss": 4.226711750030518, - "eval_runtime": 199.2176, - "eval_samples_per_second": 90.409, - "eval_steps_per_second": 5.652, + "epoch": 0.32345013477088946, + "eval_accuracy": 0.29539221573769714, + "eval_loss": 4.2696380615234375, + "eval_runtime": 152.7589, + "eval_samples_per_second": 117.905, + "eval_steps_per_second": 7.371, "step": 3000 }, { - "epoch": 0.3282746744161016, - "grad_norm": 1.0869840383529663, - "learning_rate": 0.0005809287792263764, - "loss": 4.3072, + "epoch": 0.3288409703504043, + "grad_norm": 0.6987361907958984, + "learning_rate": 0.0005809152725310307, + "loss": 4.3208, "step": 3050 }, { - "epoch": 0.3336562264557098, - "grad_norm": 0.7337630391120911, - "learning_rate": 0.0005806055381963151, - "loss": 4.2759, + "epoch": 0.33423180592991913, + "grad_norm": 0.7668080925941467, + "learning_rate": 0.0005805914732865623, + "loss": 4.3246, "step": 3100 }, { - "epoch": 0.3390377784953181, - "grad_norm": 0.7336916923522949, - "learning_rate": 0.0005802822971662536, - "loss": 4.2838, + "epoch": 0.33962264150943394, + "grad_norm": 0.7270567417144775, + "learning_rate": 0.0005802676740420938, + "loss": 4.2823, "step": 3150 }, { - "epoch": 0.3444193305349263, - "grad_norm": 0.7901497483253479, - "learning_rate": 0.0005799590561361922, - "loss": 4.2757, + "epoch": 0.3450134770889488, + "grad_norm": 0.6801213622093201, + "learning_rate": 0.0005799438747976255, + "loss": 4.2811, "step": 3200 }, { - "epoch": 0.3498008825745345, - "grad_norm": 0.9568474888801575, - "learning_rate": 0.0005796358151061307, - "loss": 4.2574, + "epoch": 0.3504043126684636, + "grad_norm": 0.7790431976318359, + "learning_rate": 0.000579620075553157, + "loss": 4.28, "step": 3250 }, { - "epoch": 0.35518243461414273, - "grad_norm": 0.9219122529029846, - "learning_rate": 0.0005793125740760694, - "loss": 4.2423, + "epoch": 0.3557951482479784, + "grad_norm": 0.8592586517333984, + "learning_rate": 0.0005792962763086886, + "loss": 4.2634, "step": 3300 }, { - "epoch": 0.36056398665375095, - "grad_norm": 0.6552030444145203, - "learning_rate": 0.0005789893330460079, - "loss": 4.2339, + "epoch": 0.3611859838274933, + "grad_norm": 0.7063053846359253, + "learning_rate": 0.0005789724770642201, + "loss": 4.2784, "step": 3350 }, { - "epoch": 0.36594553869335916, - "grad_norm": 0.6318998336791992, - "learning_rate": 0.0005786660920159465, - "loss": 4.2253, + "epoch": 0.3665768194070081, + "grad_norm": 0.7619624733924866, + "learning_rate": 0.0005786486778197517, + "loss": 4.2507, "step": 3400 }, { - "epoch": 0.3713270907329674, - "grad_norm": 0.7610264420509338, - "learning_rate": 0.0005783428509858851, - "loss": 4.2369, + "epoch": 0.3719676549865229, + "grad_norm": 0.7230337858200073, + "learning_rate": 0.0005783248785752833, + "loss": 4.2259, "step": 3450 }, { - "epoch": 0.3767086427725756, - "grad_norm": 0.6418978571891785, - "learning_rate": 0.0005780196099558237, - "loss": 4.2239, + "epoch": 0.37735849056603776, + "grad_norm": 0.7475656270980835, + "learning_rate": 0.0005780010793308148, + "loss": 4.2353, "step": 3500 }, { - "epoch": 0.3820901948121838, - "grad_norm": 0.7791884541511536, - "learning_rate": 0.0005776963689257623, - "loss": 4.2253, + "epoch": 0.38274932614555257, + "grad_norm": 0.7857275009155273, + "learning_rate": 0.0005776772800863464, + "loss": 4.2317, "step": 3550 }, { - "epoch": 0.38747174685179203, - "grad_norm": 0.7014104127883911, - "learning_rate": 0.0005773731278957008, - "loss": 4.1888, + "epoch": 0.3881401617250674, + "grad_norm": 0.6113690733909607, + "learning_rate": 0.000577353480841878, + "loss": 4.2344, "step": 3600 }, { - "epoch": 0.3928532988914003, - "grad_norm": 0.7691745758056641, - "learning_rate": 0.0005770498868656394, - "loss": 4.2055, + "epoch": 0.3935309973045822, + "grad_norm": 0.8256933093070984, + "learning_rate": 0.0005770296815974095, + "loss": 4.2054, "step": 3650 }, { - "epoch": 0.3982348509310085, - "grad_norm": 0.7330165505409241, - "learning_rate": 0.000576726645835578, - "loss": 4.1917, + "epoch": 0.39892183288409705, + "grad_norm": 0.8064551949501038, + "learning_rate": 0.0005767058823529411, + "loss": 4.2078, "step": 3700 }, { - "epoch": 0.40361640297061674, - "grad_norm": 0.6409156918525696, - "learning_rate": 0.0005764034048055167, - "loss": 4.1739, + "epoch": 0.40431266846361186, + "grad_norm": 0.6000850200653076, + "learning_rate": 0.0005763820831084726, + "loss": 4.2117, "step": 3750 }, { - "epoch": 0.40899795501022496, - "grad_norm": 0.751600444316864, - "learning_rate": 0.0005760801637754552, - "loss": 4.1815, + "epoch": 0.40970350404312667, + "grad_norm": 0.6357919573783875, + "learning_rate": 0.0005760582838640043, + "loss": 4.205, "step": 3800 }, { - "epoch": 0.4143795070498332, - "grad_norm": 0.6341384649276733, - "learning_rate": 0.0005757569227453937, - "loss": 4.1752, + "epoch": 0.41509433962264153, + "grad_norm": 0.6840759515762329, + "learning_rate": 0.0005757344846195359, + "loss": 4.1844, "step": 3850 }, { - "epoch": 0.4197610590894414, - "grad_norm": 0.8716601729393005, - "learning_rate": 0.0005754336817153324, - "loss": 4.1772, + "epoch": 0.42048517520215634, + "grad_norm": 0.7748379111289978, + "learning_rate": 0.0005754106853750674, + "loss": 4.1815, "step": 3900 }, { - "epoch": 0.4251426111290496, - "grad_norm": 0.7268335223197937, - "learning_rate": 0.0005751104406852709, - "loss": 4.1579, + "epoch": 0.42587601078167114, + "grad_norm": 0.6725760698318481, + "learning_rate": 0.000575086886130599, + "loss": 4.1742, "step": 3950 }, { - "epoch": 0.4305241631686578, - "grad_norm": 0.7490660548210144, - "learning_rate": 0.0005747871996552096, - "loss": 4.1439, + "epoch": 0.431266846361186, + "grad_norm": 0.8388188481330872, + "learning_rate": 0.0005747630868861306, + "loss": 4.1751, "step": 4000 }, { - "epoch": 0.4305241631686578, - "eval_accuracy": 0.31218462820231296, - "eval_loss": 4.092499256134033, - "eval_runtime": 199.4467, - "eval_samples_per_second": 90.305, - "eval_steps_per_second": 5.646, + "epoch": 0.431266846361186, + "eval_accuracy": 0.31143709633414934, + "eval_loss": 4.099897384643555, + "eval_runtime": 152.799, + "eval_samples_per_second": 117.874, + "eval_steps_per_second": 7.369, "step": 4000 }, { - "epoch": 0.43590571520826604, - "grad_norm": 0.9424974918365479, - "learning_rate": 0.0005744639586251481, - "loss": 4.1592, + "epoch": 0.4366576819407008, + "grad_norm": 0.5372797250747681, + "learning_rate": 0.0005744392876416622, + "loss": 4.1637, "step": 4050 }, { - "epoch": 0.44128726724787426, - "grad_norm": 0.771391749382019, - "learning_rate": 0.0005741407175950867, - "loss": 4.1455, + "epoch": 0.4420485175202156, + "grad_norm": 0.7316073775291443, + "learning_rate": 0.0005741154883971936, + "loss": 4.1648, "step": 4100 }, { - "epoch": 0.44666881928748253, - "grad_norm": 0.7916195392608643, - "learning_rate": 0.0005738174765650253, - "loss": 4.1329, + "epoch": 0.4474393530997305, + "grad_norm": 0.6784662008285522, + "learning_rate": 0.0005737916891527253, + "loss": 4.1653, "step": 4150 }, { - "epoch": 0.45205037132709075, - "grad_norm": 0.7015873789787292, - "learning_rate": 0.0005734942355349638, - "loss": 4.1338, + "epoch": 0.4528301886792453, + "grad_norm": 0.6223421692848206, + "learning_rate": 0.0005734678899082568, + "loss": 4.1481, "step": 4200 }, { - "epoch": 0.45743192336669897, - "grad_norm": 0.62185138463974, - "learning_rate": 0.0005731709945049025, - "loss": 4.1517, + "epoch": 0.4582210242587601, + "grad_norm": 0.6360011100769043, + "learning_rate": 0.0005731440906637884, + "loss": 4.1347, "step": 4250 }, { - "epoch": 0.4628134754063072, - "grad_norm": 0.6499165296554565, - "learning_rate": 0.000572847753474841, - "loss": 4.1249, + "epoch": 0.4636118598382749, + "grad_norm": 0.9023606181144714, + "learning_rate": 0.0005728202914193199, + "loss": 4.1318, "step": 4300 }, { - "epoch": 0.4681950274459154, - "grad_norm": 0.8142028450965881, - "learning_rate": 0.0005725245124447796, - "loss": 4.1094, + "epoch": 0.46900269541778977, + "grad_norm": 0.5858056545257568, + "learning_rate": 0.0005724964921748516, + "loss": 4.1316, "step": 4350 }, { - "epoch": 0.4735765794855236, - "grad_norm": 0.686265230178833, - "learning_rate": 0.0005722012714147182, - "loss": 4.1156, + "epoch": 0.4743935309973046, + "grad_norm": 0.7229328751564026, + "learning_rate": 0.0005721726929303831, + "loss": 4.1268, "step": 4400 }, { - "epoch": 0.47895813152513184, - "grad_norm": 0.6861261129379272, - "learning_rate": 0.0005718780303846568, - "loss": 4.0911, + "epoch": 0.4797843665768194, + "grad_norm": 0.8891165256500244, + "learning_rate": 0.0005718488936859147, + "loss": 4.1138, "step": 4450 }, { - "epoch": 0.48433968356474005, - "grad_norm": 0.717899739742279, - "learning_rate": 0.0005715547893545953, - "loss": 4.0984, + "epoch": 0.48517520215633425, + "grad_norm": 0.6901257038116455, + "learning_rate": 0.0005715250944414462, + "loss": 4.1243, "step": 4500 }, { - "epoch": 0.48972123560434827, - "grad_norm": 0.635183572769165, - "learning_rate": 0.000571231548324534, - "loss": 4.0915, + "epoch": 0.49056603773584906, + "grad_norm": 0.6422321200370789, + "learning_rate": 0.0005712012951969778, + "loss": 4.1152, "step": 4550 }, { - "epoch": 0.49510278764395654, - "grad_norm": 0.6602552533149719, - "learning_rate": 0.0005709083072944725, - "loss": 4.0886, + "epoch": 0.49595687331536387, + "grad_norm": 0.7133573293685913, + "learning_rate": 0.0005708774959525094, + "loss": 4.0984, "step": 4600 }, { - "epoch": 0.5004843396835648, - "grad_norm": 0.7243141531944275, - "learning_rate": 0.0005705850662644111, - "loss": 4.0928, + "epoch": 0.5013477088948787, + "grad_norm": 0.625822126865387, + "learning_rate": 0.000570553696708041, + "loss": 4.0931, "step": 4650 }, { - "epoch": 0.505865891723173, - "grad_norm": 0.6310231685638428, - "learning_rate": 0.0005702618252343497, - "loss": 4.0986, + "epoch": 0.5067385444743935, + "grad_norm": 0.6901353001594543, + "learning_rate": 0.0005702298974635725, + "loss": 4.1064, "step": 4700 }, { - "epoch": 0.5112474437627812, - "grad_norm": 0.5868064761161804, - "learning_rate": 0.0005699385842042882, - "loss": 4.0687, + "epoch": 0.5121293800539084, + "grad_norm": 0.6352577805519104, + "learning_rate": 0.0005699060982191041, + "loss": 4.0958, "step": 4750 }, { - "epoch": 0.5166289958023894, - "grad_norm": 0.6389384269714355, - "learning_rate": 0.0005696153431742269, - "loss": 4.0789, + "epoch": 0.5175202156334232, + "grad_norm": 0.7718018293380737, + "learning_rate": 0.0005695822989746357, + "loss": 4.0929, "step": 4800 }, { - "epoch": 0.5220105478419976, - "grad_norm": 0.7776073217391968, - "learning_rate": 0.0005692921021441655, - "loss": 4.0977, + "epoch": 0.522911051212938, + "grad_norm": 0.6640039682388306, + "learning_rate": 0.0005692584997301672, + "loss": 4.0793, "step": 4850 }, { - "epoch": 0.5273920998816058, - "grad_norm": 0.6234896779060364, - "learning_rate": 0.0005689688611141041, - "loss": 4.0547, + "epoch": 0.5283018867924528, + "grad_norm": 0.5986649990081787, + "learning_rate": 0.0005689347004856988, + "loss": 4.0624, "step": 4900 }, { - "epoch": 0.5327736519212141, - "grad_norm": 0.8750380277633667, - "learning_rate": 0.0005686456200840426, - "loss": 4.0718, + "epoch": 0.5336927223719676, + "grad_norm": 0.6509894132614136, + "learning_rate": 0.0005686109012412304, + "loss": 4.08, "step": 4950 }, { - "epoch": 0.5381552039608223, - "grad_norm": 0.7700529098510742, - "learning_rate": 0.0005683223790539811, - "loss": 4.043, + "epoch": 0.5390835579514824, + "grad_norm": 0.5881455540657043, + "learning_rate": 0.000568287101996762, + "loss": 4.0618, "step": 5000 }, { - "epoch": 0.5381552039608223, - "eval_accuracy": 0.3214325100957547, - "eval_loss": 3.9926369190216064, - "eval_runtime": 225.5059, - "eval_samples_per_second": 79.869, - "eval_steps_per_second": 4.993, + "epoch": 0.5390835579514824, + "eval_accuracy": 0.3202667732623931, + "eval_loss": 4.000813007354736, + "eval_runtime": 152.9398, + "eval_samples_per_second": 117.765, + "eval_steps_per_second": 7.362, "step": 5000 }, { - "epoch": 0.5435367560004305, - "grad_norm": 0.7643857002258301, - "learning_rate": 0.0005679991380239198, - "loss": 4.0605, + "epoch": 0.5444743935309974, + "grad_norm": 0.6732229590415955, + "learning_rate": 0.0005679633027522935, + "loss": 4.068, "step": 5050 }, { - "epoch": 0.5489183080400387, - "grad_norm": 0.554442286491394, - "learning_rate": 0.0005676758969938584, - "loss": 4.0446, + "epoch": 0.5498652291105122, + "grad_norm": 0.6501848697662354, + "learning_rate": 0.000567639503507825, + "loss": 4.0611, "step": 5100 }, { - "epoch": 0.5542998600796469, - "grad_norm": 0.7216308116912842, - "learning_rate": 0.000567352655963797, - "loss": 4.0313, + "epoch": 0.555256064690027, + "grad_norm": 0.6546076536178589, + "learning_rate": 0.0005673157042633567, + "loss": 4.0601, "step": 5150 }, { - "epoch": 0.5596814121192552, - "grad_norm": 0.6057455539703369, - "learning_rate": 0.0005670294149337355, - "loss": 4.0337, + "epoch": 0.5606469002695418, + "grad_norm": 0.6216645836830139, + "learning_rate": 0.0005669919050188883, + "loss": 4.0788, "step": 5200 }, { - "epoch": 0.5650629641588634, - "grad_norm": 0.535860002040863, - "learning_rate": 0.0005667061739036742, - "loss": 4.0392, + "epoch": 0.5660377358490566, + "grad_norm": 0.6688370704650879, + "learning_rate": 0.0005666681057744198, + "loss": 4.0529, "step": 5250 }, { - "epoch": 0.5704445161984716, - "grad_norm": 0.647204577922821, - "learning_rate": 0.0005663829328736127, - "loss": 4.0421, + "epoch": 0.5714285714285714, + "grad_norm": 0.7299574613571167, + "learning_rate": 0.0005663443065299514, + "loss": 4.0448, "step": 5300 }, { - "epoch": 0.5758260682380799, - "grad_norm": 0.5126392841339111, - "learning_rate": 0.0005660596918435512, - "loss": 4.0318, + "epoch": 0.5768194070080862, + "grad_norm": 0.6612991094589233, + "learning_rate": 0.000566020507285483, + "loss": 4.0386, "step": 5350 }, { - "epoch": 0.5812076202776881, - "grad_norm": 0.6040687561035156, - "learning_rate": 0.0005657364508134899, - "loss": 4.0117, + "epoch": 0.5822102425876011, + "grad_norm": 0.5907670855522156, + "learning_rate": 0.0005656967080410146, + "loss": 4.0478, "step": 5400 }, { - "epoch": 0.5865891723172963, - "grad_norm": 0.6971921324729919, - "learning_rate": 0.0005654132097834284, - "loss": 4.025, + "epoch": 0.5876010781671159, + "grad_norm": 0.539881706237793, + "learning_rate": 0.000565372908796546, + "loss": 4.0481, "step": 5450 }, { - "epoch": 0.5919707243569046, - "grad_norm": 0.5343753099441528, - "learning_rate": 0.0005650899687533671, - "loss": 4.015, + "epoch": 0.5929919137466307, + "grad_norm": 0.6923815608024597, + "learning_rate": 0.0005650491095520777, + "loss": 4.028, "step": 5500 }, { - "epoch": 0.5973522763965128, - "grad_norm": 0.5600801706314087, - "learning_rate": 0.0005647667277233056, - "loss": 4.0087, + "epoch": 0.5983827493261455, + "grad_norm": 0.6604423522949219, + "learning_rate": 0.0005647253103076092, + "loss": 4.0191, "step": 5550 }, { - "epoch": 0.602733828436121, - "grad_norm": 0.6542683243751526, - "learning_rate": 0.0005644434866932442, - "loss": 3.9957, + "epoch": 0.6037735849056604, + "grad_norm": 0.6762202382087708, + "learning_rate": 0.0005644015110631408, + "loss": 4.038, "step": 5600 }, { - "epoch": 0.6081153804757292, - "grad_norm": 0.6249606609344482, - "learning_rate": 0.0005641202456631828, - "loss": 4.0136, + "epoch": 0.6091644204851752, + "grad_norm": 0.6797250509262085, + "learning_rate": 0.0005640777118186723, + "loss": 4.015, "step": 5650 }, { - "epoch": 0.6134969325153374, - "grad_norm": 0.6572969555854797, - "learning_rate": 0.0005637970046331214, - "loss": 4.0046, + "epoch": 0.6145552560646901, + "grad_norm": 0.5718669891357422, + "learning_rate": 0.000563753912574204, + "loss": 4.0142, "step": 5700 }, { - "epoch": 0.6188784845549457, - "grad_norm": 0.6926669478416443, - "learning_rate": 0.00056347376360306, - "loss": 3.9869, + "epoch": 0.6199460916442049, + "grad_norm": 0.6036492586135864, + "learning_rate": 0.0005634301133297355, + "loss": 3.9904, "step": 5750 }, { - "epoch": 0.6242600365945539, - "grad_norm": 0.6491366624832153, - "learning_rate": 0.0005631505225729985, - "loss": 4.0004, + "epoch": 0.6253369272237197, + "grad_norm": 0.6295326352119446, + "learning_rate": 0.0005631063140852671, + "loss": 3.9973, "step": 5800 }, { - "epoch": 0.6296415886341621, - "grad_norm": 0.6138956546783447, - "learning_rate": 0.0005628272815429371, - "loss": 3.9814, + "epoch": 0.6307277628032345, + "grad_norm": 0.6866003274917603, + "learning_rate": 0.0005627825148407986, + "loss": 4.0086, "step": 5850 }, { - "epoch": 0.6350231406737703, - "grad_norm": 0.6467788219451904, - "learning_rate": 0.0005625040405128757, - "loss": 3.9771, + "epoch": 0.6361185983827493, + "grad_norm": 0.6472943425178528, + "learning_rate": 0.0005624587155963302, + "loss": 4.0048, "step": 5900 }, { - "epoch": 0.6404046927133785, - "grad_norm": 0.6469590067863464, - "learning_rate": 0.0005621807994828143, - "loss": 3.9882, + "epoch": 0.6415094339622641, + "grad_norm": 0.6805757284164429, + "learning_rate": 0.0005621349163518618, + "loss": 3.9913, "step": 5950 }, { - "epoch": 0.6457862447529867, - "grad_norm": 0.7396846413612366, - "learning_rate": 0.0005618575584527529, - "loss": 3.9759, + "epoch": 0.6469002695417789, + "grad_norm": 0.6941430568695068, + "learning_rate": 0.0005618111171073934, + "loss": 4.0077, "step": 6000 }, { - "epoch": 0.6457862447529867, - "eval_accuracy": 0.32833664054615025, - "eval_loss": 3.919649839401245, - "eval_runtime": 200.1239, - "eval_samples_per_second": 89.999, - "eval_steps_per_second": 5.627, + "epoch": 0.6469002695417789, + "eval_accuracy": 0.3263130891634934, + "eval_loss": 3.9324727058410645, + "eval_runtime": 152.9066, + "eval_samples_per_second": 117.791, + "eval_steps_per_second": 7.364, "step": 6000 }, { - "epoch": 0.651167796792595, - "grad_norm": 0.7368170619010925, - "learning_rate": 0.0005615343174226915, + "epoch": 0.6522911051212938, + "grad_norm": 0.6350868344306946, + "learning_rate": 0.0005614873178629249, "loss": 3.9789, "step": 6050 }, { - "epoch": 0.6565493488322032, - "grad_norm": 0.593928873538971, - "learning_rate": 0.00056121107639263, - "loss": 3.962, + "epoch": 0.6576819407008087, + "grad_norm": 0.6860512495040894, + "learning_rate": 0.0005611635186184565, + "loss": 4.0102, "step": 6100 }, { - "epoch": 0.6619309008718114, - "grad_norm": 0.5571395754814148, - "learning_rate": 0.0005608878353625687, - "loss": 3.9732, + "epoch": 0.6630727762803235, + "grad_norm": 0.771823525428772, + "learning_rate": 0.0005608397193739882, + "loss": 3.9907, "step": 6150 }, { - "epoch": 0.6673124529114196, - "grad_norm": 0.6184373497962952, - "learning_rate": 0.0005605645943325072, - "loss": 3.9618, + "epoch": 0.6684636118598383, + "grad_norm": 0.6326267123222351, + "learning_rate": 0.0005605159201295196, + "loss": 3.9633, "step": 6200 }, { - "epoch": 0.6726940049510278, - "grad_norm": 0.6731172800064087, - "learning_rate": 0.0005602413533024458, - "loss": 3.9629, + "epoch": 0.6738544474393531, + "grad_norm": 0.5878480672836304, + "learning_rate": 0.0005601921208850511, + "loss": 3.9821, "step": 6250 }, { - "epoch": 0.6780755569906362, - "grad_norm": Infinity, - "learning_rate": 0.0005599245770929855, - "loss": 3.9661, + "epoch": 0.6792452830188679, + "grad_norm": 0.6786486506462097, + "learning_rate": 0.0005598683216405828, + "loss": 3.9856, "step": 6300 }, { - "epoch": 0.6834571090302444, - "grad_norm": 0.6674894690513611, - "learning_rate": 0.0005596013360629242, - "loss": 3.9598, + "epoch": 0.6846361185983828, + "grad_norm": 0.5655982494354248, + "learning_rate": 0.0005595445223961144, + "loss": 3.9732, "step": 6350 }, { - "epoch": 0.6888386610698526, - "grad_norm": 0.6270789504051208, - "learning_rate": 0.0005592780950328628, - "loss": 3.9443, + "epoch": 0.6900269541778976, + "grad_norm": 0.6204545497894287, + "learning_rate": 0.0005592207231516459, + "loss": 3.9792, "step": 6400 }, { - "epoch": 0.6942202131094608, - "grad_norm": 0.6163814067840576, - "learning_rate": 0.0005589548540028014, - "loss": 3.9506, + "epoch": 0.6954177897574124, + "grad_norm": 0.718676745891571, + "learning_rate": 0.0005588969239071775, + "loss": 3.9757, "step": 6450 }, { - "epoch": 0.699601765149069, - "grad_norm": 0.6060782670974731, - "learning_rate": 0.0005586316129727399, - "loss": 3.9645, + "epoch": 0.7008086253369272, + "grad_norm": 0.5987905263900757, + "learning_rate": 0.0005585731246627091, + "loss": 3.9713, "step": 6500 }, { - "epoch": 0.7049833171886772, - "grad_norm": 0.6246522068977356, - "learning_rate": 0.0005583083719426786, - "loss": 3.9614, + "epoch": 0.706199460916442, + "grad_norm": 0.6304261684417725, + "learning_rate": 0.0005582493254182407, + "loss": 3.9699, "step": 6550 }, { - "epoch": 0.7103648692282855, - "grad_norm": 0.603921115398407, - "learning_rate": 0.0005579851309126171, - "loss": 3.94, + "epoch": 0.7115902964959568, + "grad_norm": 0.5621398091316223, + "learning_rate": 0.0005579255261737722, + "loss": 3.9707, "step": 6600 }, { - "epoch": 0.7157464212678937, - "grad_norm": 0.5655505061149597, - "learning_rate": 0.0005576618898825558, - "loss": 3.9318, + "epoch": 0.7169811320754716, + "grad_norm": 0.637505829334259, + "learning_rate": 0.0005576017269293038, + "loss": 3.9813, "step": 6650 }, { - "epoch": 0.7211279733075019, - "grad_norm": 0.604542076587677, - "learning_rate": 0.0005573386488524943, - "loss": 3.9307, + "epoch": 0.7223719676549866, + "grad_norm": 0.6425164341926575, + "learning_rate": 0.0005572779276848353, + "loss": 3.9482, "step": 6700 }, { - "epoch": 0.7265095253471101, - "grad_norm": 0.600004255771637, - "learning_rate": 0.0005570154078224328, - "loss": 3.9442, + "epoch": 0.7277628032345014, + "grad_norm": 0.624649703502655, + "learning_rate": 0.000556954128440367, + "loss": 3.9741, "step": 6750 }, { - "epoch": 0.7318910773867183, - "grad_norm": 0.6960250735282898, - "learning_rate": 0.0005566921667923715, - "loss": 3.9425, + "epoch": 0.7331536388140162, + "grad_norm": 0.6176960468292236, + "learning_rate": 0.0005566303291958984, + "loss": 3.9436, "step": 6800 }, { - "epoch": 0.7372726294263265, - "grad_norm": 0.5619109869003296, - "learning_rate": 0.00055636892576231, - "loss": 3.9073, + "epoch": 0.738544474393531, + "grad_norm": 0.6119649410247803, + "learning_rate": 0.0005563065299514301, + "loss": 3.9442, "step": 6850 }, { - "epoch": 0.7426541814659348, - "grad_norm": 0.6036113500595093, - "learning_rate": 0.0005560456847322487, - "loss": 3.9305, + "epoch": 0.7439353099730458, + "grad_norm": 0.5820748805999756, + "learning_rate": 0.0005559827307069616, + "loss": 3.9463, "step": 6900 }, { - "epoch": 0.748035733505543, - "grad_norm": 0.6234127283096313, - "learning_rate": 0.0005557224437021872, - "loss": 3.9285, + "epoch": 0.7493261455525606, + "grad_norm": 0.6512102484703064, + "learning_rate": 0.0005556589314624932, + "loss": 3.9498, "step": 6950 }, { - "epoch": 0.7534172855451512, - "grad_norm": 0.6131523251533508, - "learning_rate": 0.0005553992026721258, - "loss": 3.922, + "epoch": 0.7547169811320755, + "grad_norm": 0.548623263835907, + "learning_rate": 0.0005553351322180247, + "loss": 3.9306, "step": 7000 }, { - "epoch": 0.7534172855451512, - "eval_accuracy": 0.3334209437785411, - "eval_loss": 3.861107349395752, - "eval_runtime": 205.1306, - "eval_samples_per_second": 87.803, - "eval_steps_per_second": 5.489, + "epoch": 0.7547169811320755, + "eval_accuracy": 0.3323316985782009, + "eval_loss": 3.8741683959960938, + "eval_runtime": 152.843, + "eval_samples_per_second": 117.84, + "eval_steps_per_second": 7.367, "step": 7000 }, { - "epoch": 0.7587988375847594, - "grad_norm": 0.6058487296104431, - "learning_rate": 0.0005550759616420644, - "loss": 3.9083, + "epoch": 0.7601078167115903, + "grad_norm": 0.6566927433013916, + "learning_rate": 0.0005550113329735564, + "loss": 3.9435, "step": 7050 }, { - "epoch": 0.7641803896243676, - "grad_norm": 0.624620258808136, - "learning_rate": 0.000554752720612003, - "loss": 3.9349, + "epoch": 0.7654986522911051, + "grad_norm": 0.8524190187454224, + "learning_rate": 0.0005546875337290879, + "loss": 3.9177, "step": 7100 }, { - "epoch": 0.7695619416639758, - "grad_norm": 0.5525732040405273, - "learning_rate": 0.0005544294795819415, - "loss": 3.9261, + "epoch": 0.77088948787062, + "grad_norm": 0.6614950299263, + "learning_rate": 0.0005543637344846195, + "loss": 3.9448, "step": 7150 }, { - "epoch": 0.7749434937035841, - "grad_norm": 0.5950748324394226, - "learning_rate": 0.0005541062385518801, - "loss": 3.9067, + "epoch": 0.7762803234501348, + "grad_norm": 0.5254852175712585, + "learning_rate": 0.000554039935240151, + "loss": 3.9164, "step": 7200 }, { - "epoch": 0.7803250457431924, - "grad_norm": 0.5052813291549683, - "learning_rate": 0.0005537829975218188, - "loss": 3.9004, + "epoch": 0.7816711590296496, + "grad_norm": 0.6216305494308472, + "learning_rate": 0.0005537161359956826, + "loss": 3.909, "step": 7250 }, { - "epoch": 0.7857065977828006, - "grad_norm": 0.5610913038253784, - "learning_rate": 0.0005534662213123586, - "loss": 3.8911, + "epoch": 0.7870619946091644, + "grad_norm": 0.6032219529151917, + "learning_rate": 0.0005533923367512143, + "loss": 3.9232, "step": 7300 }, { - "epoch": 0.7910881498224088, - "grad_norm": 0.5639758110046387, - "learning_rate": 0.0005531429802822971, - "loss": 3.931, + "epoch": 0.7924528301886793, + "grad_norm": 0.6744025349617004, + "learning_rate": 0.0005530685375067458, + "loss": 3.9224, "step": 7350 }, { - "epoch": 0.796469701862017, - "grad_norm": 0.6221727728843689, - "learning_rate": 0.0005528197392522357, - "loss": 3.9288, + "epoch": 0.7978436657681941, + "grad_norm": 0.568401038646698, + "learning_rate": 0.0005527447382622774, + "loss": 3.918, "step": 7400 }, { - "epoch": 0.8018512539016253, - "grad_norm": 0.5807225108146667, - "learning_rate": 0.0005524964982221743, - "loss": 3.9206, + "epoch": 0.8032345013477089, + "grad_norm": 0.5871582627296448, + "learning_rate": 0.0005524209390178089, + "loss": 3.8955, "step": 7450 }, { - "epoch": 0.8072328059412335, - "grad_norm": 0.6214258074760437, - "learning_rate": 0.0005521732571921129, - "loss": 3.9216, + "epoch": 0.8086253369272237, + "grad_norm": 0.641424834728241, + "learning_rate": 0.0005520971397733406, + "loss": 3.9128, "step": 7500 }, { - "epoch": 0.8126143579808417, - "grad_norm": 0.6088549494743347, - "learning_rate": 0.0005518500161620514, - "loss": 3.9102, + "epoch": 0.8140161725067385, + "grad_norm": 0.60320645570755, + "learning_rate": 0.000551773340528872, + "loss": 3.908, "step": 7550 }, { - "epoch": 0.8179959100204499, - "grad_norm": 0.615315318107605, - "learning_rate": 0.00055152677513199, - "loss": 3.8932, + "epoch": 0.8194070080862533, + "grad_norm": 0.6258028149604797, + "learning_rate": 0.0005514495412844036, + "loss": 3.9018, "step": 7600 }, { - "epoch": 0.8233774620600581, - "grad_norm": 0.5932325124740601, - "learning_rate": 0.0005512035341019286, - "loss": 3.9037, + "epoch": 0.8247978436657682, + "grad_norm": 0.5553128123283386, + "learning_rate": 0.0005511257420399352, + "loss": 3.9058, "step": 7650 }, { - "epoch": 0.8287590140996663, - "grad_norm": 0.632127583026886, - "learning_rate": 0.0005508802930718672, - "loss": 3.9074, + "epoch": 0.8301886792452831, + "grad_norm": 0.6213424801826477, + "learning_rate": 0.0005508019427954668, + "loss": 3.9005, "step": 7700 }, { - "epoch": 0.8341405661392746, - "grad_norm": 0.5906286239624023, - "learning_rate": 0.0005505570520418058, - "loss": 3.8969, + "epoch": 0.8355795148247979, + "grad_norm": 0.6496004462242126, + "learning_rate": 0.0005504781435509983, + "loss": 3.9025, "step": 7750 }, { - "epoch": 0.8395221181788828, - "grad_norm": 0.7419958114624023, - "learning_rate": 0.0005502338110117443, - "loss": 3.911, + "epoch": 0.8409703504043127, + "grad_norm": 0.6373844146728516, + "learning_rate": 0.0005501543443065299, + "loss": 3.8844, "step": 7800 }, { - "epoch": 0.844903670218491, - "grad_norm": 0.5959650874137878, - "learning_rate": 0.000549910569981683, - "loss": 3.885, + "epoch": 0.8463611859838275, + "grad_norm": 0.5957589745521545, + "learning_rate": 0.0005498305450620615, + "loss": 3.8902, "step": 7850 }, { - "epoch": 0.8502852222580992, - "grad_norm": 0.5370450615882874, - "learning_rate": 0.0005495873289516215, - "loss": 3.8883, + "epoch": 0.8517520215633423, + "grad_norm": 0.6516701579093933, + "learning_rate": 0.0005495067458175931, + "loss": 3.8876, "step": 7900 }, { - "epoch": 0.8556667742977074, - "grad_norm": 0.6142526865005493, - "learning_rate": 0.0005492640879215602, - "loss": 3.8891, + "epoch": 0.8571428571428571, + "grad_norm": 0.6374366283416748, + "learning_rate": 0.0005491829465731246, + "loss": 3.9173, "step": 7950 }, { - "epoch": 0.8610483263373157, - "grad_norm": 0.5250054001808167, - "learning_rate": 0.0005489408468914987, - "loss": 3.8731, + "epoch": 0.862533692722372, + "grad_norm": 0.5943326354026794, + "learning_rate": 0.0005488591473286562, + "loss": 3.8636, "step": 8000 }, { - "epoch": 0.8610483263373157, - "eval_accuracy": 0.3380041398923315, - "eval_loss": 3.814573049545288, - "eval_runtime": 218.7029, - "eval_samples_per_second": 82.354, - "eval_steps_per_second": 5.149, + "epoch": 0.862533692722372, + "eval_accuracy": 0.3369106572293666, + "eval_loss": 3.824968099594116, + "eval_runtime": 152.7242, + "eval_samples_per_second": 117.932, + "eval_steps_per_second": 7.373, "step": 8000 }, { - "epoch": 0.8664298783769239, - "grad_norm": 0.5592741370201111, - "learning_rate": 0.0005486176058614372, - "loss": 3.8661, + "epoch": 0.8679245283018868, + "grad_norm": 0.6996535062789917, + "learning_rate": 0.0005485353480841877, + "loss": 3.906, "step": 8050 }, { - "epoch": 0.8718114304165321, - "grad_norm": 0.558358371257782, - "learning_rate": 0.0005482943648313759, - "loss": 3.8665, + "epoch": 0.8733153638814016, + "grad_norm": 0.6234637498855591, + "learning_rate": 0.0005482115488397194, + "loss": 3.8831, "step": 8100 }, { - "epoch": 0.8771929824561403, - "grad_norm": 0.6065065860748291, - "learning_rate": 0.0005479711238013145, - "loss": 3.8815, + "epoch": 0.8787061994609164, + "grad_norm": 0.6020538806915283, + "learning_rate": 0.0005478877495952508, + "loss": 3.8864, "step": 8150 }, { - "epoch": 0.8825745344957485, - "grad_norm": 0.566906750202179, - "learning_rate": 0.0005476478827712531, - "loss": 3.8788, + "epoch": 0.8840970350404312, + "grad_norm": 0.5640770792961121, + "learning_rate": 0.0005475639503507825, + "loss": 3.8799, "step": 8200 }, { - "epoch": 0.8879560865353568, - "grad_norm": 0.5542231202125549, - "learning_rate": 0.0005473246417411916, - "loss": 3.8757, + "epoch": 0.889487870619946, + "grad_norm": 0.6602328419685364, + "learning_rate": 0.000547240151106314, + "loss": 3.8788, "step": 8250 }, { - "epoch": 0.8933376385749651, - "grad_norm": 0.5650736093521118, - "learning_rate": 0.0005470014007111302, - "loss": 3.867, + "epoch": 0.894878706199461, + "grad_norm": 0.6336698532104492, + "learning_rate": 0.0005469163518618456, + "loss": 3.8969, "step": 8300 }, { - "epoch": 0.8987191906145733, - "grad_norm": 0.6359168887138367, - "learning_rate": 0.0005466781596810688, - "loss": 3.8704, + "epoch": 0.9002695417789758, + "grad_norm": 0.562479555606842, + "learning_rate": 0.0005465925526173771, + "loss": 3.8686, "step": 8350 }, { - "epoch": 0.9041007426541815, - "grad_norm": 0.5609973669052124, - "learning_rate": 0.0005463549186510073, - "loss": 3.8704, + "epoch": 0.9056603773584906, + "grad_norm": 0.5576388835906982, + "learning_rate": 0.0005462687533729087, + "loss": 3.8729, "step": 8400 }, { - "epoch": 0.9094822946937897, - "grad_norm": 0.5966047048568726, - "learning_rate": 0.000546031677620946, - "loss": 3.8649, + "epoch": 0.9110512129380054, + "grad_norm": 0.5111304521560669, + "learning_rate": 0.0005459449541284403, + "loss": 3.8679, "step": 8450 }, { - "epoch": 0.9148638467333979, - "grad_norm": 0.555415689945221, - "learning_rate": 0.0005457084365908845, - "loss": 3.8775, + "epoch": 0.9164420485175202, + "grad_norm": 0.6154835224151611, + "learning_rate": 0.0005456211548839719, + "loss": 3.8749, "step": 8500 }, { - "epoch": 0.9202453987730062, - "grad_norm": 0.5719990134239197, - "learning_rate": 0.0005453851955608232, - "loss": 3.8601, + "epoch": 0.921832884097035, + "grad_norm": 0.5426989197731018, + "learning_rate": 0.0005452973556395034, + "loss": 3.8685, "step": 8550 }, { - "epoch": 0.9256269508126144, - "grad_norm": 0.5464230179786682, - "learning_rate": 0.0005450619545307617, - "loss": 3.8599, + "epoch": 0.9272237196765498, + "grad_norm": 0.5612382888793945, + "learning_rate": 0.000544973556395035, + "loss": 3.8747, "step": 8600 }, { - "epoch": 0.9310085028522226, - "grad_norm": 0.6069023013114929, - "learning_rate": 0.0005447387135007003, - "loss": 3.8584, + "epoch": 0.9326145552560647, + "grad_norm": 0.5568158626556396, + "learning_rate": 0.0005446497571505667, + "loss": 3.8547, "step": 8650 }, { - "epoch": 0.9363900548918308, - "grad_norm": 0.556610643863678, - "learning_rate": 0.0005444154724706389, - "loss": 3.8636, + "epoch": 0.9380053908355795, + "grad_norm": 0.6064712405204773, + "learning_rate": 0.0005443259579060982, + "loss": 3.8589, "step": 8700 }, { - "epoch": 0.941771606931439, - "grad_norm": 0.5988168120384216, - "learning_rate": 0.0005440922314405775, - "loss": 3.8366, + "epoch": 0.9433962264150944, + "grad_norm": 0.5729287266731262, + "learning_rate": 0.0005440021586616298, + "loss": 3.8548, "step": 8750 }, { - "epoch": 0.9471531589710472, - "grad_norm": 0.5634347200393677, - "learning_rate": 0.0005437689904105161, - "loss": 3.8604, + "epoch": 0.9487870619946092, + "grad_norm": 0.5489872694015503, + "learning_rate": 0.0005436783594171613, + "loss": 3.8599, "step": 8800 }, { - "epoch": 0.9525347110106555, - "grad_norm": 0.5349180102348328, - "learning_rate": 0.0005434457493804546, - "loss": 3.8395, + "epoch": 0.954177897574124, + "grad_norm": 0.6285766959190369, + "learning_rate": 0.0005433545601726929, + "loss": 3.8606, "step": 8850 }, { - "epoch": 0.9579162630502637, - "grad_norm": 0.5537461042404175, - "learning_rate": 0.0005431225083503932, - "loss": 3.8644, + "epoch": 0.9595687331536388, + "grad_norm": 0.566647469997406, + "learning_rate": 0.0005430307609282244, + "loss": 3.8472, "step": 8900 }, { - "epoch": 0.9632978150898719, - "grad_norm": 0.5711461901664734, - "learning_rate": 0.0005427992673203318, - "loss": 3.8352, + "epoch": 0.9649595687331537, + "grad_norm": 0.5826287269592285, + "learning_rate": 0.000542706961683756, + "loss": 3.8404, "step": 8950 }, { - "epoch": 0.9686793671294801, - "grad_norm": 0.6128414273262024, - "learning_rate": 0.0005424760262902704, - "loss": 3.8405, + "epoch": 0.9703504043126685, + "grad_norm": 0.6150389313697815, + "learning_rate": 0.0005423831624392876, + "loss": 3.8653, "step": 9000 }, { - "epoch": 0.9686793671294801, - "eval_accuracy": 0.3417073562677125, - "eval_loss": 3.779106378555298, - "eval_runtime": 210.7209, - "eval_samples_per_second": 85.473, - "eval_steps_per_second": 5.344, + "epoch": 0.9703504043126685, + "eval_accuracy": 0.3401243923451433, + "eval_loss": 3.788287878036499, + "eval_runtime": 153.2128, + "eval_samples_per_second": 117.555, + "eval_steps_per_second": 7.349, "step": 9000 }, { - "epoch": 0.9740609191690883, - "grad_norm": 0.6156434416770935, - "learning_rate": 0.000542152785260209, - "loss": 3.8333, + "epoch": 0.9757412398921833, + "grad_norm": 0.6907781958580017, + "learning_rate": 0.0005420593631948192, + "loss": 3.8789, "step": 9050 }, { - "epoch": 0.9794424712086965, - "grad_norm": 0.5652556419372559, - "learning_rate": 0.0005418295442301476, - "loss": 3.8327, + "epoch": 0.9811320754716981, + "grad_norm": 0.5165227055549622, + "learning_rate": 0.0005417355639503507, + "loss": 3.8372, "step": 9100 }, { - "epoch": 0.9848240232483048, - "grad_norm": 0.6443074941635132, - "learning_rate": 0.0005415063032000861, - "loss": 3.842, + "epoch": 0.9865229110512129, + "grad_norm": 0.563310980796814, + "learning_rate": 0.0005414117647058823, + "loss": 3.848, "step": 9150 }, { - "epoch": 0.9902055752879131, - "grad_norm": 0.632297158241272, - "learning_rate": 0.0005411830621700248, - "loss": 3.8312, + "epoch": 0.9919137466307277, + "grad_norm": 0.5229910016059875, + "learning_rate": 0.0005410879654614139, + "loss": 3.8388, "step": 9200 }, { - "epoch": 0.9955871273275213, - "grad_norm": 0.5863285064697266, - "learning_rate": 0.0005408598211399633, - "loss": 3.8298, + "epoch": 0.9973045822102425, + "grad_norm": 0.5532335638999939, + "learning_rate": 0.0005407641662169455, + "loss": 3.8559, "step": 9250 }, { - "epoch": 1.0009686793671295, - "grad_norm": 0.5393086671829224, - "learning_rate": 0.0005405365801099019, - "loss": 3.8069, + "epoch": 1.0026954177897573, + "grad_norm": 0.5877416729927063, + "learning_rate": 0.000540440366972477, + "loss": 3.8053, "step": 9300 }, { - "epoch": 1.0063502314067376, - "grad_norm": 0.5285312533378601, - "learning_rate": 0.0005402133390798405, - "loss": 3.7619, + "epoch": 1.0080862533692723, + "grad_norm": 0.5423709154129028, + "learning_rate": 0.0005401165677280086, + "loss": 3.7729, "step": 9350 }, { - "epoch": 1.011731783446346, - "grad_norm": 0.5921708941459656, - "learning_rate": 0.000539890098049779, - "loss": 3.7701, + "epoch": 1.013477088948787, + "grad_norm": 0.6410120725631714, + "learning_rate": 0.0005397927684835401, + "loss": 3.7655, "step": 9400 }, { - "epoch": 1.017113335485954, - "grad_norm": 0.5384525060653687, - "learning_rate": 0.0005395668570197177, - "loss": 3.7657, + "epoch": 1.0188679245283019, + "grad_norm": 0.5875224471092224, + "learning_rate": 0.0005394689692390718, + "loss": 3.7643, "step": 9450 }, { - "epoch": 1.0224948875255624, - "grad_norm": 0.5624929666519165, - "learning_rate": 0.0005392436159896562, - "loss": 3.7754, + "epoch": 1.0242587601078168, + "grad_norm": 0.5834726095199585, + "learning_rate": 0.0005391451699946032, + "loss": 3.7698, "step": 9500 }, { - "epoch": 1.0278764395651705, - "grad_norm": 0.5882208943367004, - "learning_rate": 0.0005389203749595948, - "loss": 3.7848, + "epoch": 1.0296495956873315, + "grad_norm": 0.5419567227363586, + "learning_rate": 0.0005388213707501349, + "loss": 3.7799, "step": 9550 }, { - "epoch": 1.0332579916047788, - "grad_norm": 0.5588924884796143, - "learning_rate": 0.0005385971339295334, - "loss": 3.7603, + "epoch": 1.0350404312668464, + "grad_norm": 0.6511868238449097, + "learning_rate": 0.0005384975715056664, + "loss": 3.7784, "step": 9600 }, { - "epoch": 1.0386395436443872, - "grad_norm": 0.5837873220443726, - "learning_rate": 0.000538273892899472, - "loss": 3.7785, + "epoch": 1.0404312668463611, + "grad_norm": 0.6302853226661682, + "learning_rate": 0.000538173772261198, + "loss": 3.7842, "step": 9650 }, { - "epoch": 1.0440210956839953, - "grad_norm": 0.5880899429321289, - "learning_rate": 0.0005379506518694106, - "loss": 3.7506, + "epoch": 1.045822102425876, + "grad_norm": 0.531711220741272, + "learning_rate": 0.0005378499730167295, + "loss": 3.7661, "step": 9700 }, { - "epoch": 1.0494026477236036, - "grad_norm": 0.5579642653465271, - "learning_rate": 0.0005376274108393491, - "loss": 3.7714, + "epoch": 1.0512129380053907, + "grad_norm": 0.5548348426818848, + "learning_rate": 0.0005375261737722611, + "loss": 3.7646, "step": 9750 }, { - "epoch": 1.0547841997632117, - "grad_norm": 0.576766312122345, - "learning_rate": 0.0005373041698092877, - "loss": 3.7653, + "epoch": 1.0566037735849056, + "grad_norm": 0.5238717794418335, + "learning_rate": 0.0005372023745277928, + "loss": 3.7914, "step": 9800 }, { - "epoch": 1.06016575180282, - "grad_norm": 0.5471925735473633, - "learning_rate": 0.0005369809287792263, - "loss": 3.7686, + "epoch": 1.0619946091644206, + "grad_norm": 0.5447802543640137, + "learning_rate": 0.0005368785752833243, + "loss": 3.7911, "step": 9850 }, { - "epoch": 1.0655473038424281, - "grad_norm": 0.59092116355896, - "learning_rate": 0.000536657687749165, - "loss": 3.7667, + "epoch": 1.0673854447439353, + "grad_norm": 0.5512890815734863, + "learning_rate": 0.0005365547760388559, + "loss": 3.7828, "step": 9900 }, { - "epoch": 1.0709288558820365, - "grad_norm": 0.638380765914917, - "learning_rate": 0.0005363344467191035, - "loss": 3.7722, + "epoch": 1.0727762803234502, + "grad_norm": 0.5664125680923462, + "learning_rate": 0.0005362309767943874, + "loss": 3.7949, "step": 9950 }, { - "epoch": 1.0763104079216446, - "grad_norm": 0.5594536066055298, - "learning_rate": 0.000536011205689042, - "loss": 3.7655, + "epoch": 1.0781671159029649, + "grad_norm": 0.6107265949249268, + "learning_rate": 0.0005359071775499191, + "loss": 3.7721, "step": 10000 }, { - "epoch": 1.0763104079216446, - "eval_accuracy": 0.3451049320696713, - "eval_loss": 3.7489354610443115, - "eval_runtime": 202.8767, - "eval_samples_per_second": 88.778, - "eval_steps_per_second": 5.55, + "epoch": 1.0781671159029649, + "eval_accuracy": 0.34332498046149446, + "eval_loss": 3.7572708129882812, + "eval_runtime": 152.6822, + "eval_samples_per_second": 117.964, + "eval_steps_per_second": 7.375, "step": 10000 }, { - "epoch": 1.081691959961253, - "grad_norm": 0.640766441822052, - "learning_rate": 0.0005356879646589807, - "loss": 3.7646, + "epoch": 1.0835579514824798, + "grad_norm": 0.5660226941108704, + "learning_rate": 0.0005355833783054506, + "loss": 3.7667, "step": 10050 }, { - "epoch": 1.087073512000861, - "grad_norm": 0.6311773657798767, - "learning_rate": 0.0005353647236289192, - "loss": 3.7643, + "epoch": 1.0889487870619945, + "grad_norm": 0.6355052590370178, + "learning_rate": 0.0005352595790609822, + "loss": 3.7712, "step": 10100 }, { - "epoch": 1.0924550640404693, - "grad_norm": 0.5654868483543396, - "learning_rate": 0.0005350414825988579, - "loss": 3.7592, + "epoch": 1.0943396226415094, + "grad_norm": 0.5896925330162048, + "learning_rate": 0.0005349357798165137, + "loss": 3.7638, "step": 10150 }, { - "epoch": 1.0978366160800774, - "grad_norm": 0.5341681838035583, - "learning_rate": 0.0005347182415687964, - "loss": 3.7609, + "epoch": 1.0997304582210243, + "grad_norm": 0.6454715728759766, + "learning_rate": 0.0005346119805720453, + "loss": 3.7765, "step": 10200 }, { - "epoch": 1.1032181681196858, - "grad_norm": 0.8984493613243103, - "learning_rate": 0.000534395000538735, - "loss": 3.7588, + "epoch": 1.105121293800539, + "grad_norm": 0.6419016122817993, + "learning_rate": 0.0005342881813275768, + "loss": 3.7669, "step": 10250 }, { - "epoch": 1.1085997201592939, - "grad_norm": 0.6130673885345459, - "learning_rate": 0.0005340717595086736, - "loss": 3.7725, + "epoch": 1.110512129380054, + "grad_norm": 0.6399803161621094, + "learning_rate": 0.0005339643820831084, + "loss": 3.7723, "step": 10300 }, { - "epoch": 1.1139812721989022, - "grad_norm": 0.5171802043914795, - "learning_rate": 0.0005337485184786122, - "loss": 3.7686, + "epoch": 1.1159029649595686, + "grad_norm": 0.5695080757141113, + "learning_rate": 0.00053364058283864, + "loss": 3.7676, "step": 10350 }, { - "epoch": 1.1193628242385103, - "grad_norm": 0.6430924534797668, - "learning_rate": 0.0005334252774485507, - "loss": 3.7493, + "epoch": 1.1212938005390836, + "grad_norm": 0.641523540019989, + "learning_rate": 0.0005333167835941716, + "loss": 3.774, "step": 10400 }, { - "epoch": 1.1247443762781186, - "grad_norm": 0.5376786589622498, - "learning_rate": 0.0005331020364184894, - "loss": 3.7686, + "epoch": 1.1266846361185983, + "grad_norm": 0.5272489786148071, + "learning_rate": 0.0005329929843497031, + "loss": 3.7699, "step": 10450 }, { - "epoch": 1.1301259283177267, - "grad_norm": 0.5798326730728149, - "learning_rate": 0.0005327787953884279, - "loss": 3.7654, + "epoch": 1.1320754716981132, + "grad_norm": 0.5674998760223389, + "learning_rate": 0.0005326691851052347, + "loss": 3.7715, "step": 10500 }, { - "epoch": 1.135507480357335, - "grad_norm": 0.5384038090705872, - "learning_rate": 0.0005324555543583665, - "loss": 3.7485, + "epoch": 1.137466307277628, + "grad_norm": 0.5550113320350647, + "learning_rate": 0.0005323453858607662, + "loss": 3.7748, "step": 10550 }, { - "epoch": 1.1408890323969434, - "grad_norm": 0.5550758838653564, - "learning_rate": 0.0005321323133283051, - "loss": 3.748, + "epoch": 1.1428571428571428, + "grad_norm": 0.5779372453689575, + "learning_rate": 0.0005320215866162979, + "loss": 3.7667, "step": 10600 }, { - "epoch": 1.1462705844365515, - "grad_norm": 0.5656107664108276, - "learning_rate": 0.0005318090722982436, - "loss": 3.7485, + "epoch": 1.1482479784366577, + "grad_norm": 0.5660243630409241, + "learning_rate": 0.0005316977873718294, + "loss": 3.7511, "step": 10650 }, { - "epoch": 1.1516521364761596, - "grad_norm": 0.5888227224349976, - "learning_rate": 0.0005314858312681823, - "loss": 3.7438, + "epoch": 1.1536388140161726, + "grad_norm": 0.5514928102493286, + "learning_rate": 0.000531373988127361, + "loss": 3.7664, "step": 10700 }, { - "epoch": 1.157033688515768, - "grad_norm": 0.5515899658203125, - "learning_rate": 0.0005311625902381209, - "loss": 3.7721, + "epoch": 1.1590296495956873, + "grad_norm": 0.5613722801208496, + "learning_rate": 0.0005310501888828925, + "loss": 3.7443, "step": 10750 }, { - "epoch": 1.1624152405553763, - "grad_norm": 0.6210424900054932, - "learning_rate": 0.0005308393492080595, - "loss": 3.7652, + "epoch": 1.1644204851752022, + "grad_norm": 0.5789129734039307, + "learning_rate": 0.0005307263896384242, + "loss": 3.7512, "step": 10800 }, { - "epoch": 1.1677967925949844, - "grad_norm": 0.5752713084220886, - "learning_rate": 0.000530516108177998, - "loss": 3.7486, + "epoch": 1.169811320754717, + "grad_norm": 0.5901221036911011, + "learning_rate": 0.0005304025903939556, + "loss": 3.7406, "step": 10850 }, { - "epoch": 1.1731783446345927, - "grad_norm": 0.6572223901748657, - "learning_rate": 0.0005301928671479365, - "loss": 3.7272, + "epoch": 1.1752021563342319, + "grad_norm": 0.6217564344406128, + "learning_rate": 0.0005300787911494873, + "loss": 3.7545, "step": 10900 }, { - "epoch": 1.1785598966742008, - "grad_norm": 0.6310757994651794, - "learning_rate": 0.0005298696261178752, - "loss": 3.7563, + "epoch": 1.1805929919137466, + "grad_norm": 0.5514366030693054, + "learning_rate": 0.0005297549919050189, + "loss": 3.7697, "step": 10950 }, { - "epoch": 1.1839414487138091, - "grad_norm": 0.5674658417701721, - "learning_rate": 0.0005295463850878138, - "loss": 3.7441, + "epoch": 1.1859838274932615, + "grad_norm": 0.5353732109069824, + "learning_rate": 0.0005294311926605504, + "loss": 3.7783, "step": 11000 }, { - "epoch": 1.1839414487138091, - "eval_accuracy": 0.34708784727228553, - "eval_loss": 3.7200334072113037, - "eval_runtime": 208.4996, - "eval_samples_per_second": 86.384, - "eval_steps_per_second": 5.4, + "epoch": 1.1859838274932615, + "eval_accuracy": 0.34627240734923787, + "eval_loss": 3.730870246887207, + "eval_runtime": 152.3855, + "eval_samples_per_second": 118.194, + "eval_steps_per_second": 7.389, "step": 11000 }, { - "epoch": 1.1893230007534172, - "grad_norm": 0.5541161298751831, - "learning_rate": 0.0005292231440577524, - "loss": 3.7499, + "epoch": 1.1913746630727764, + "grad_norm": 0.5589845776557922, + "learning_rate": 0.000529107393416082, + "loss": 3.7652, "step": 11050 }, { - "epoch": 1.1947045527930256, - "grad_norm": 0.5533831715583801, - "learning_rate": 0.0005288999030276909, - "loss": 3.7418, + "epoch": 1.196765498652291, + "grad_norm": 0.5585349798202515, + "learning_rate": 0.0005287835941716135, + "loss": 3.7521, "step": 11100 }, { - "epoch": 1.2000861048326337, - "grad_norm": 0.5780303478240967, - "learning_rate": 0.0005285766619976295, - "loss": 3.7409, + "epoch": 1.202156334231806, + "grad_norm": 0.5789923071861267, + "learning_rate": 0.0005284597949271452, + "loss": 3.7476, "step": 11150 }, { - "epoch": 1.205467656872242, - "grad_norm": 0.6292614340782166, - "learning_rate": 0.0005282534209675681, - "loss": 3.7435, + "epoch": 1.2075471698113207, + "grad_norm": 0.5433546900749207, + "learning_rate": 0.0005281359956826767, + "loss": 3.7709, "step": 11200 }, { - "epoch": 1.21084920891185, - "grad_norm": 0.5940732359886169, - "learning_rate": 0.0005279301799375066, - "loss": 3.7362, + "epoch": 1.2129380053908356, + "grad_norm": 0.613101065158844, + "learning_rate": 0.0005278121964382083, + "loss": 3.7594, "step": 11250 }, { - "epoch": 1.2162307609514584, - "grad_norm": 0.5676620006561279, - "learning_rate": 0.0005276134037280465, - "loss": 3.7241, + "epoch": 1.2183288409703503, + "grad_norm": 0.5600290298461914, + "learning_rate": 0.0005274883971937398, + "loss": 3.7442, "step": 11300 }, { - "epoch": 1.2216123129910665, - "grad_norm": 0.5294714570045471, - "learning_rate": 0.0005272901626979851, - "loss": 3.7192, + "epoch": 1.2237196765498652, + "grad_norm": 0.5522957444190979, + "learning_rate": 0.0005271645979492714, + "loss": 3.7389, "step": 11350 }, { - "epoch": 1.2269938650306749, - "grad_norm": 0.5521119832992554, - "learning_rate": 0.0005269669216679236, - "loss": 3.734, + "epoch": 1.2291105121293802, + "grad_norm": 0.7547903656959534, + "learning_rate": 0.000526840798704803, + "loss": 3.7529, "step": 11400 }, { - "epoch": 1.232375417070283, - "grad_norm": 0.7975606918334961, - "learning_rate": 0.0005266436806378623, - "loss": 3.7188, + "epoch": 1.2345013477088949, + "grad_norm": 0.5353315472602844, + "learning_rate": 0.0005265169994603346, + "loss": 3.7639, "step": 11450 }, { - "epoch": 1.2377569691098913, - "grad_norm": 0.5794736742973328, - "learning_rate": 0.0005263204396078008, - "loss": 3.7279, + "epoch": 1.2398921832884098, + "grad_norm": 0.524540364742279, + "learning_rate": 0.0005261932002158661, + "loss": 3.7537, "step": 11500 }, { - "epoch": 1.2431385211494996, - "grad_norm": 0.5361841917037964, - "learning_rate": 0.0005259971985777394, - "loss": 3.7432, + "epoch": 1.2452830188679245, + "grad_norm": 0.5606273412704468, + "learning_rate": 0.0005258694009713977, + "loss": 3.7457, "step": 11550 }, { - "epoch": 1.2485200731891077, - "grad_norm": 0.5248964428901672, - "learning_rate": 0.000525673957547678, - "loss": 3.7447, + "epoch": 1.2506738544474394, + "grad_norm": 0.5786051154136658, + "learning_rate": 0.0005255456017269292, + "loss": 3.761, "step": 11600 }, { - "epoch": 1.2539016252287158, - "grad_norm": 0.5553768873214722, - "learning_rate": 0.0005253507165176167, - "loss": 3.7477, + "epoch": 1.256064690026954, + "grad_norm": 0.5806413888931274, + "learning_rate": 0.0005252218024824608, + "loss": 3.7393, "step": 11650 }, { - "epoch": 1.2592831772683242, - "grad_norm": 0.5761224627494812, - "learning_rate": 0.0005250274754875552, - "loss": 3.7211, + "epoch": 1.261455525606469, + "grad_norm": 0.5003126263618469, + "learning_rate": 0.0005248980032379924, + "loss": 3.7527, "step": 11700 }, { - "epoch": 1.2646647293079325, - "grad_norm": 0.607130229473114, - "learning_rate": 0.0005247042344574938, - "loss": 3.7359, + "epoch": 1.266846361185984, + "grad_norm": 0.5108680725097656, + "learning_rate": 0.000524574203993524, + "loss": 3.7502, "step": 11750 }, { - "epoch": 1.2700462813475406, - "grad_norm": 0.5540531873703003, - "learning_rate": 0.0005243809934274323, - "loss": 3.7217, + "epoch": 1.2722371967654986, + "grad_norm": 0.6031525135040283, + "learning_rate": 0.0005242504047490555, + "loss": 3.7638, "step": 11800 }, { - "epoch": 1.275427833387149, - "grad_norm": 0.6098884344100952, - "learning_rate": 0.0005240577523973709, - "loss": 3.7431, + "epoch": 1.2776280323450135, + "grad_norm": 0.5650216937065125, + "learning_rate": 0.0005239266055045871, + "loss": 3.7471, "step": 11850 }, { - "epoch": 1.280809385426757, - "grad_norm": 0.5816884636878967, - "learning_rate": 0.0005237345113673095, - "loss": 3.7404, + "epoch": 1.2830188679245282, + "grad_norm": 0.600111186504364, + "learning_rate": 0.0005236028062601186, + "loss": 3.7432, "step": 11900 }, { - "epoch": 1.2861909374663654, - "grad_norm": 0.5769429802894592, - "learning_rate": 0.0005234112703372481, - "loss": 3.7404, + "epoch": 1.2884097035040432, + "grad_norm": 0.5846624374389648, + "learning_rate": 0.0005232790070156503, + "loss": 3.7259, "step": 11950 }, { - "epoch": 1.2915724895059735, - "grad_norm": 0.563298761844635, - "learning_rate": 0.0005230880293071867, - "loss": 3.7293, + "epoch": 1.2938005390835579, + "grad_norm": 0.5376623868942261, + "learning_rate": 0.0005229552077711818, + "loss": 3.7384, "step": 12000 }, { - "epoch": 1.2915724895059735, - "eval_accuracy": 0.34958697234490643, - "eval_loss": 3.6987245082855225, - "eval_runtime": 217.4267, - "eval_samples_per_second": 82.837, - "eval_steps_per_second": 5.179, + "epoch": 1.2938005390835579, + "eval_accuracy": 0.34856509193501123, + "eval_loss": 3.702909469604492, + "eval_runtime": 152.7981, + "eval_samples_per_second": 117.875, + "eval_steps_per_second": 7.369, "step": 12000 }, { - "epoch": 1.2969540415455818, - "grad_norm": 0.5788484811782837, - "learning_rate": 0.0005227647882771253, - "loss": 3.7336, + "epoch": 1.2991913746630728, + "grad_norm": 0.5912179350852966, + "learning_rate": 0.0005226314085267134, + "loss": 3.7548, "step": 12050 }, { - "epoch": 1.30233559358519, - "grad_norm": 0.6204023957252502, - "learning_rate": 0.0005224415472470639, - "loss": 3.7243, + "epoch": 1.3045822102425877, + "grad_norm": 0.7062541842460632, + "learning_rate": 0.0005223140852671344, + "loss": 3.7468, "step": 12100 }, { - "epoch": 1.3077171456247982, - "grad_norm": 0.5986481308937073, - "learning_rate": 0.0005221183062170024, - "loss": 3.7316, + "epoch": 1.3099730458221024, + "grad_norm": 0.5411067008972168, + "learning_rate": 0.0005219902860226659, + "loss": 3.7358, "step": 12150 }, { - "epoch": 1.3130986976644063, - "grad_norm": 0.6356789469718933, - "learning_rate": 0.0005217950651869409, - "loss": 3.7352, + "epoch": 1.3153638814016173, + "grad_norm": 0.5543079376220703, + "learning_rate": 0.0005216664867781975, + "loss": 3.7528, "step": 12200 }, { - "epoch": 1.3184802497040147, - "grad_norm": 0.555164098739624, - "learning_rate": 0.0005214718241568796, - "loss": 3.7149, + "epoch": 1.320754716981132, + "grad_norm": 0.626389741897583, + "learning_rate": 0.000521342687533729, + "loss": 3.7289, "step": 12250 }, { - "epoch": 1.3238618017436228, - "grad_norm": 0.6156308054924011, - "learning_rate": 0.0005211485831268182, - "loss": 3.6991, + "epoch": 1.326145552560647, + "grad_norm": 0.7137591242790222, + "learning_rate": 0.0005210188882892606, + "loss": 3.7343, "step": 12300 }, { - "epoch": 1.329243353783231, - "grad_norm": 0.5790920853614807, - "learning_rate": 0.0005208253420967568, - "loss": 3.7206, + "epoch": 1.3315363881401616, + "grad_norm": 0.5231083631515503, + "learning_rate": 0.0005206950890447922, + "loss": 3.743, "step": 12350 }, { - "epoch": 1.3346249058228392, - "grad_norm": 0.5524982810020447, - "learning_rate": 0.0005205021010666953, - "loss": 3.7344, + "epoch": 1.3369272237196765, + "grad_norm": 0.6331651210784912, + "learning_rate": 0.0005203712898003238, + "loss": 3.74, "step": 12400 }, { - "epoch": 1.3400064578624475, - "grad_norm": 0.5789136290550232, - "learning_rate": 0.0005201788600366339, - "loss": 3.714, + "epoch": 1.3423180592991915, + "grad_norm": 0.5775096416473389, + "learning_rate": 0.0005200474905558553, + "loss": 3.7299, "step": 12450 }, { - "epoch": 1.3453880099020559, - "grad_norm": 0.5845145583152771, - "learning_rate": 0.0005198556190065725, - "loss": 3.7083, + "epoch": 1.3477088948787062, + "grad_norm": 0.6246299147605896, + "learning_rate": 0.0005197236913113869, + "loss": 3.7473, "step": 12500 }, { - "epoch": 1.350769561941664, - "grad_norm": 0.6391981840133667, - "learning_rate": 0.0005195323779765112, - "loss": 3.7103, + "epoch": 1.353099730458221, + "grad_norm": 0.5844605565071106, + "learning_rate": 0.0005193998920669184, + "loss": 3.7204, "step": 12550 }, { - "epoch": 1.356151113981272, - "grad_norm": 0.543088972568512, - "learning_rate": 0.0005192091369464497, - "loss": 3.7256, + "epoch": 1.3584905660377358, + "grad_norm": 0.5848774909973145, + "learning_rate": 0.0005190760928224501, + "loss": 3.7216, "step": 12600 }, { - "epoch": 1.3615326660208804, - "grad_norm": 0.5750318169593811, - "learning_rate": 0.0005188858959163882, - "loss": 3.7289, + "epoch": 1.3638814016172507, + "grad_norm": 0.6270635724067688, + "learning_rate": 0.0005187522935779816, + "loss": 3.7121, "step": 12650 }, { - "epoch": 1.3669142180604887, - "grad_norm": 0.6135967969894409, - "learning_rate": 0.0005185626548863269, - "loss": 3.7014, + "epoch": 1.3692722371967654, + "grad_norm": 0.5674271583557129, + "learning_rate": 0.0005184284943335132, + "loss": 3.7243, "step": 12700 }, { - "epoch": 1.3722957701000968, - "grad_norm": 0.5833747982978821, - "learning_rate": 0.0005182394138562654, - "loss": 3.7292, + "epoch": 1.3746630727762803, + "grad_norm": 0.5642839074134827, + "learning_rate": 0.0005181046950890447, + "loss": 3.7156, "step": 12750 }, { - "epoch": 1.3776773221397052, - "grad_norm": 0.571729838848114, - "learning_rate": 0.0005179161728262041, - "loss": 3.7158, + "epoch": 1.3800539083557952, + "grad_norm": 0.6046538949012756, + "learning_rate": 0.0005177808958445764, + "loss": 3.7149, "step": 12800 }, { - "epoch": 1.3830588741793133, - "grad_norm": 0.6265289783477783, - "learning_rate": 0.0005175929317961426, - "loss": 3.7261, + "epoch": 1.38544474393531, + "grad_norm": 0.6427596807479858, + "learning_rate": 0.0005174570966001078, + "loss": 3.7265, "step": 12850 }, { - "epoch": 1.3884404262189216, - "grad_norm": 0.6207230091094971, - "learning_rate": 0.0005172696907660812, - "loss": 3.699, + "epoch": 1.3908355795148248, + "grad_norm": 0.5846236944198608, + "learning_rate": 0.0005171332973556395, + "loss": 3.7258, "step": 12900 }, { - "epoch": 1.3938219782585297, - "grad_norm": 0.5325709581375122, - "learning_rate": 0.0005169464497360198, - "loss": 3.7059, + "epoch": 1.3962264150943398, + "grad_norm": 0.5765063762664795, + "learning_rate": 0.000516809498111171, + "loss": 3.7341, "step": 12950 }, { - "epoch": 1.399203530298138, - "grad_norm": 0.5781683325767517, - "learning_rate": 0.0005166232087059583, - "loss": 3.7029, + "epoch": 1.4016172506738545, + "grad_norm": 0.5632540583610535, + "learning_rate": 0.0005164856988667026, + "loss": 3.7152, "step": 13000 }, { - "epoch": 1.399203530298138, - "eval_accuracy": 0.3522667871699628, - "eval_loss": 3.674454927444458, - "eval_runtime": 217.915, - "eval_samples_per_second": 82.651, - "eval_steps_per_second": 5.167, + "epoch": 1.4016172506738545, + "eval_accuracy": 0.351037923008781, + "eval_loss": 3.681976079940796, + "eval_runtime": 153.0001, + "eval_samples_per_second": 117.719, + "eval_steps_per_second": 7.359, "step": 13000 }, { - "epoch": 1.4045850823377461, - "grad_norm": 0.565986156463623, - "learning_rate": 0.0005162999676758969, - "loss": 3.7075, + "epoch": 1.4070080862533692, + "grad_norm": 0.584800660610199, + "learning_rate": 0.0005161618996222341, + "loss": 3.7137, "step": 13050 }, { - "epoch": 1.4099666343773545, - "grad_norm": 0.5653162002563477, - "learning_rate": 0.0005159767266458355, - "loss": 3.6983, + "epoch": 1.412398921832884, + "grad_norm": 0.5406800508499146, + "learning_rate": 0.0005158381003777657, + "loss": 3.7076, "step": 13100 }, { - "epoch": 1.4153481864169626, - "grad_norm": 0.5863538384437561, - "learning_rate": 0.0005156534856157741, - "loss": 3.7083, + "epoch": 1.417789757412399, + "grad_norm": 0.6078150272369385, + "learning_rate": 0.0005155143011332973, + "loss": 3.714, "step": 13150 }, { - "epoch": 1.420729738456571, - "grad_norm": 0.5486993789672852, - "learning_rate": 0.0005153302445857127, - "loss": 3.7251, + "epoch": 1.4231805929919137, + "grad_norm": 0.5497395396232605, + "learning_rate": 0.0005151905018888289, + "loss": 3.7119, "step": 13200 }, { - "epoch": 1.426111290496179, - "grad_norm": 0.6212597489356995, - "learning_rate": 0.0005150070035556513, - "loss": 3.7196, + "epoch": 1.4285714285714286, + "grad_norm": 0.5663301348686218, + "learning_rate": 0.0005148667026443604, + "loss": 3.7237, "step": 13250 }, { - "epoch": 1.4314928425357873, - "grad_norm": 0.6025354862213135, - "learning_rate": 0.0005146837625255898, - "loss": 3.7195, + "epoch": 1.4339622641509435, + "grad_norm": 0.6136855483055115, + "learning_rate": 0.000514542903399892, + "loss": 3.6982, "step": 13300 }, { - "epoch": 1.4368743945753955, - "grad_norm": 0.5640348792076111, - "learning_rate": 0.0005143669863161297, - "loss": 3.7035, + "epoch": 1.4393530997304582, + "grad_norm": 0.577499270439148, + "learning_rate": 0.0005142191041554237, + "loss": 3.7174, "step": 13350 }, { - "epoch": 1.4422559466150038, - "grad_norm": 0.5984680652618408, - "learning_rate": 0.0005140437452860683, - "loss": 3.7052, + "epoch": 1.444743935309973, + "grad_norm": 0.5561140775680542, + "learning_rate": 0.0005139017808958445, + "loss": 3.7323, "step": 13400 }, { - "epoch": 1.447637498654612, - "grad_norm": 0.5823872685432434, - "learning_rate": 0.0005137205042560069, - "loss": 3.695, + "epoch": 1.4501347708894878, + "grad_norm": 0.5748620629310608, + "learning_rate": 0.0005135779816513762, + "loss": 3.7341, "step": 13450 }, { - "epoch": 1.4530190506942202, - "grad_norm": 0.5364094972610474, - "learning_rate": 0.0005133972632259455, - "loss": 3.7199, + "epoch": 1.4555256064690028, + "grad_norm": 0.6695783734321594, + "learning_rate": 0.0005132541824069076, + "loss": 3.7254, "step": 13500 }, { - "epoch": 1.4584006027338283, - "grad_norm": 0.5685153603553772, - "learning_rate": 0.000513074022195884, - "loss": 3.6992, + "epoch": 1.4609164420485174, + "grad_norm": 0.5569912195205688, + "learning_rate": 0.0005129303831624393, + "loss": 3.7147, "step": 13550 }, { - "epoch": 1.4637821547734367, - "grad_norm": 0.5787658095359802, - "learning_rate": 0.0005127507811658226, - "loss": 3.6923, + "epoch": 1.4663072776280324, + "grad_norm": 0.5915151834487915, + "learning_rate": 0.0005126065839179708, + "loss": 3.73, "step": 13600 }, { - "epoch": 1.469163706813045, - "grad_norm": 0.5486370325088501, - "learning_rate": 0.0005124275401357612, - "loss": 3.7024, + "epoch": 1.4716981132075473, + "grad_norm": 0.5770183205604553, + "learning_rate": 0.0005122827846735024, + "loss": 3.7206, "step": 13650 }, { - "epoch": 1.474545258852653, - "grad_norm": 0.5473746061325073, - "learning_rate": 0.0005121042991056997, - "loss": 3.6833, + "epoch": 1.477088948787062, + "grad_norm": 0.6127224564552307, + "learning_rate": 0.0005119589854290339, + "loss": 3.7078, "step": 13700 }, { - "epoch": 1.4799268108922612, - "grad_norm": 0.5517615675926208, - "learning_rate": 0.0005117810580756384, - "loss": 3.6902, + "epoch": 1.482479784366577, + "grad_norm": 0.6317543983459473, + "learning_rate": 0.0005116351861845655, + "loss": 3.7085, "step": 13750 }, { - "epoch": 1.4853083629318695, - "grad_norm": 0.5971811413764954, - "learning_rate": 0.0005114578170455769, - "loss": 3.6845, + "epoch": 1.4878706199460916, + "grad_norm": 0.5654078125953674, + "learning_rate": 0.0005113113869400971, + "loss": 3.7115, "step": 13800 }, { - "epoch": 1.4906899149714778, - "grad_norm": 0.5672309398651123, - "learning_rate": 0.0005111345760155156, - "loss": 3.6854, + "epoch": 1.4932614555256065, + "grad_norm": 0.6050569415092468, + "learning_rate": 0.0005109875876956287, + "loss": 3.7161, "step": 13850 }, { - "epoch": 1.496071467011086, - "grad_norm": 0.5523454546928406, - "learning_rate": 0.0005108113349854541, - "loss": 3.6961, + "epoch": 1.4986522911051212, + "grad_norm": 0.7072622776031494, + "learning_rate": 0.0005106637884511602, + "loss": 3.6974, "step": 13900 }, { - "epoch": 1.501453019050694, - "grad_norm": 0.5691514611244202, - "learning_rate": 0.0005104880939553926, - "loss": 3.6803, + "epoch": 1.5040431266846361, + "grad_norm": 0.5389599800109863, + "learning_rate": 0.0005103399892066918, + "loss": 3.6989, "step": 13950 }, { - "epoch": 1.5068345710903024, - "grad_norm": 0.5720673203468323, - "learning_rate": 0.0005101648529253313, - "loss": 3.6962, + "epoch": 1.509433962264151, + "grad_norm": 0.5915652513504028, + "learning_rate": 0.0005100161899622234, + "loss": 3.7106, "step": 14000 }, { - "epoch": 1.5068345710903024, - "eval_accuracy": 0.35364048563060124, - "eval_loss": 3.655740261077881, - "eval_runtime": 204.9718, - "eval_samples_per_second": 87.871, - "eval_steps_per_second": 5.493, + "epoch": 1.509433962264151, + "eval_accuracy": 0.35301268924480916, + "eval_loss": 3.660759449005127, + "eval_runtime": 152.8889, + "eval_samples_per_second": 117.805, + "eval_steps_per_second": 7.365, "step": 14000 }, { - "epoch": 1.5122161231299107, - "grad_norm": 0.5242102742195129, - "learning_rate": 0.0005098480767158711, - "loss": 3.697, + "epoch": 1.5148247978436657, + "grad_norm": 0.5557901263237, + "learning_rate": 0.000509692390717755, + "loss": 3.7113, "step": 14050 }, { - "epoch": 1.5175976751695188, - "grad_norm": 0.5905733108520508, - "learning_rate": 0.0005095248356858097, - "loss": 3.6869, + "epoch": 1.5202156334231804, + "grad_norm": 0.6000087857246399, + "learning_rate": 0.0005093685914732865, + "loss": 3.7091, "step": 14100 }, { - "epoch": 1.5229792272091272, - "grad_norm": 0.6295709609985352, - "learning_rate": 0.0005092015946557483, - "loss": 3.6839, + "epoch": 1.5256064690026954, + "grad_norm": 0.5491834282875061, + "learning_rate": 0.0005090447922288181, + "loss": 3.7027, "step": 14150 }, { - "epoch": 1.5283607792487355, - "grad_norm": 0.6163989901542664, - "learning_rate": 0.0005088783536256868, - "loss": 3.6979, + "epoch": 1.5309973045822103, + "grad_norm": 0.5360293388366699, + "learning_rate": 0.0005087209929843496, + "loss": 3.7071, "step": 14200 }, { - "epoch": 1.5337423312883436, - "grad_norm": 0.5677395462989807, - "learning_rate": 0.0005085551125956255, - "loss": 3.698, + "epoch": 1.536388140161725, + "grad_norm": 0.6027365326881409, + "learning_rate": 0.0005083971937398812, + "loss": 3.7083, "step": 14250 }, { - "epoch": 1.5391238833279517, - "grad_norm": 0.5294774174690247, - "learning_rate": 0.000508231871565564, - "loss": 3.6858, + "epoch": 1.54177897574124, + "grad_norm": 0.5418972373008728, + "learning_rate": 0.0005080733944954127, + "loss": 3.7045, "step": 14300 }, { - "epoch": 1.54450543536756, - "grad_norm": 0.585757315158844, - "learning_rate": 0.0005079086305355026, - "loss": 3.7028, + "epoch": 1.5471698113207548, + "grad_norm": 0.5064155459403992, + "learning_rate": 0.0005077495952509444, + "loss": 3.6947, "step": 14350 }, { - "epoch": 1.5498869874071683, - "grad_norm": 0.5384769439697266, - "learning_rate": 0.0005075853895054412, - "loss": 3.686, + "epoch": 1.5525606469002695, + "grad_norm": 0.5617753267288208, + "learning_rate": 0.0005074257960064759, + "loss": 3.6987, "step": 14400 }, { - "epoch": 1.5552685394467765, - "grad_norm": 0.5806359052658081, - "learning_rate": 0.0005072621484753797, - "loss": 3.689, + "epoch": 1.5579514824797842, + "grad_norm": 0.5464807152748108, + "learning_rate": 0.0005071019967620075, + "loss": 3.7002, "step": 14450 }, { - "epoch": 1.5606500914863846, - "grad_norm": 0.5702711939811707, - "learning_rate": 0.0005069389074453184, - "loss": 3.675, + "epoch": 1.5633423180592994, + "grad_norm": 0.5438482165336609, + "learning_rate": 0.000506778197517539, + "loss": 3.6985, "step": 14500 }, { - "epoch": 1.566031643525993, - "grad_norm": 0.5366608500480652, - "learning_rate": 0.0005066156664152569, - "loss": 3.6914, + "epoch": 1.568733153638814, + "grad_norm": 0.5997867584228516, + "learning_rate": 0.0005064543982730707, + "loss": 3.6976, "step": 14550 }, { - "epoch": 1.5714131955656012, - "grad_norm": 0.60133296251297, - "learning_rate": 0.0005062924253851955, - "loss": 3.693, + "epoch": 1.5741239892183287, + "grad_norm": 0.5712060928344727, + "learning_rate": 0.0005061305990286023, + "loss": 3.6846, "step": 14600 }, { - "epoch": 1.5767947476052093, - "grad_norm": 0.5777215361595154, - "learning_rate": 0.0005059691843551341, - "loss": 3.6786, + "epoch": 1.5795148247978437, + "grad_norm": 0.5789122581481934, + "learning_rate": 0.0005058067997841338, + "loss": 3.689, "step": 14650 }, { - "epoch": 1.5821762996448174, - "grad_norm": 0.5946151614189148, - "learning_rate": 0.0005056459433250727, - "loss": 3.6849, + "epoch": 1.5849056603773586, + "grad_norm": 0.5818272233009338, + "learning_rate": 0.0005054830005396654, + "loss": 3.6941, "step": 14700 }, { - "epoch": 1.5875578516844258, - "grad_norm": 0.601273775100708, - "learning_rate": 0.0005053227022950113, - "loss": 3.6833, + "epoch": 1.5902964959568733, + "grad_norm": 0.595536470413208, + "learning_rate": 0.0005051592012951969, + "loss": 3.6728, "step": 14750 }, { - "epoch": 1.592939403724034, - "grad_norm": 0.5582096576690674, - "learning_rate": 0.0005049994612649499, - "loss": 3.681, + "epoch": 1.595687331536388, + "grad_norm": 0.5664342045783997, + "learning_rate": 0.0005048354020507286, + "loss": 3.666, "step": 14800 }, { - "epoch": 1.5983209557636422, - "grad_norm": 0.5676015615463257, - "learning_rate": 0.0005046762202348884, - "loss": 3.6892, + "epoch": 1.6010781671159031, + "grad_norm": 0.5805814862251282, + "learning_rate": 0.00050451160280626, + "loss": 3.6899, "step": 14850 }, { - "epoch": 1.6037025078032503, - "grad_norm": 0.5280758738517761, - "learning_rate": 0.000504352979204827, - "loss": 3.6736, + "epoch": 1.6064690026954178, + "grad_norm": 0.5888657569885254, + "learning_rate": 0.0005041878035617917, + "loss": 3.6958, "step": 14900 }, { - "epoch": 1.6090840598428586, - "grad_norm": 0.6497607827186584, - "learning_rate": 0.0005040297381747656, - "loss": 3.6761, + "epoch": 1.6118598382749325, + "grad_norm": 0.6133636832237244, + "learning_rate": 0.0005038640043173232, + "loss": 3.6823, "step": 14950 }, { - "epoch": 1.614465611882467, - "grad_norm": 0.6543067693710327, - "learning_rate": 0.0005037064971447042, - "loss": 3.6601, + "epoch": 1.6172506738544474, + "grad_norm": 0.5385803580284119, + "learning_rate": 0.0005035402050728548, + "loss": 3.673, "step": 15000 }, { - "epoch": 1.614465611882467, - "eval_accuracy": 0.3556327449815676, - "eval_loss": 3.6367475986480713, - "eval_runtime": 205.3772, - "eval_samples_per_second": 87.697, - "eval_steps_per_second": 5.483, + "epoch": 1.6172506738544474, + "eval_accuracy": 0.3548456634622395, + "eval_loss": 3.643002986907959, + "eval_runtime": 152.5762, + "eval_samples_per_second": 118.046, + "eval_steps_per_second": 7.38, "step": 15000 }, { - "epoch": 1.619847163922075, - "grad_norm": 0.5906841158866882, - "learning_rate": 0.0005033832561146428, - "loss": 3.6741, + "epoch": 1.6226415094339623, + "grad_norm": 0.5596092343330383, + "learning_rate": 0.0005032164058283863, + "loss": 3.6722, "step": 15050 }, { - "epoch": 1.6252287159616834, - "grad_norm": 0.5810515880584717, - "learning_rate": 0.0005030600150845813, - "loss": 3.6827, + "epoch": 1.628032345013477, + "grad_norm": 0.5818222761154175, + "learning_rate": 0.0005028926065839179, + "loss": 3.683, "step": 15100 }, { - "epoch": 1.6306102680012917, - "grad_norm": 0.6275368928909302, - "learning_rate": 0.00050273677405452, - "loss": 3.673, + "epoch": 1.633423180592992, + "grad_norm": 0.5584225058555603, + "learning_rate": 0.0005025688073394495, + "loss": 3.7002, "step": 15150 }, { - "epoch": 1.6359918200408998, - "grad_norm": 0.5321255922317505, - "learning_rate": 0.0005024135330244585, - "loss": 3.6789, + "epoch": 1.6388140161725069, + "grad_norm": 0.523932933807373, + "learning_rate": 0.0005022450080949811, + "loss": 3.6904, "step": 15200 }, { - "epoch": 1.641373372080508, - "grad_norm": 0.5455909371376038, - "learning_rate": 0.0005020902919943972, - "loss": 3.6691, + "epoch": 1.6442048517520216, + "grad_norm": 0.5471178293228149, + "learning_rate": 0.0005019212088505126, + "loss": 3.6822, "step": 15250 }, { - "epoch": 1.6467549241201163, - "grad_norm": 0.5684463977813721, - "learning_rate": 0.0005017670509643357, - "loss": 3.656, + "epoch": 1.6495956873315363, + "grad_norm": 0.5977591276168823, + "learning_rate": 0.0005015974096060442, + "loss": 3.6788, "step": 15300 }, { - "epoch": 1.6521364761597246, - "grad_norm": 0.5891856551170349, - "learning_rate": 0.0005014438099342743, - "loss": 3.666, + "epoch": 1.6549865229110512, + "grad_norm": 0.527600884437561, + "learning_rate": 0.0005012736103615758, + "loss": 3.6653, "step": 15350 }, { - "epoch": 1.6575180281993327, - "grad_norm": 0.5768228769302368, - "learning_rate": 0.0005011205689042129, - "loss": 3.6802, + "epoch": 1.6603773584905661, + "grad_norm": 0.5747079253196716, + "learning_rate": 0.0005009498111171074, + "loss": 3.6841, "step": 15400 }, { - "epoch": 1.6628995802389408, - "grad_norm": 0.5384355187416077, - "learning_rate": 0.0005007973278741514, - "loss": 3.6677, + "epoch": 1.6657681940700808, + "grad_norm": 0.6080580353736877, + "learning_rate": 0.0005006260118726389, + "loss": 3.6727, "step": 15450 }, { - "epoch": 1.6682811322785491, - "grad_norm": 0.5776270031929016, - "learning_rate": 0.00050047408684409, - "loss": 3.6913, + "epoch": 1.6711590296495957, + "grad_norm": 0.5732918977737427, + "learning_rate": 0.0005003022126281705, + "loss": 3.6717, "step": 15500 }, { - "epoch": 1.6736626843181575, - "grad_norm": 0.5457106232643127, - "learning_rate": 0.0005001508458140286, - "loss": 3.6747, + "epoch": 1.6765498652291106, + "grad_norm": 0.559468686580658, + "learning_rate": 0.000499978413383702, + "loss": 3.659, "step": 15550 }, { - "epoch": 1.6790442363577656, - "grad_norm": 0.5584999322891235, - "learning_rate": 0.0004998276047839673, - "loss": 3.6656, + "epoch": 1.6819407008086253, + "grad_norm": 0.6118157505989075, + "learning_rate": 0.0004996546141392336, + "loss": 3.6778, "step": 15600 }, { - "epoch": 1.6844257883973737, - "grad_norm": 0.5563948154449463, - "learning_rate": 0.0004995043637539058, - "loss": 3.6699, + "epoch": 1.68733153638814, + "grad_norm": 0.5858181118965149, + "learning_rate": 0.0004993308148947651, + "loss": 3.6833, "step": 15650 }, { - "epoch": 1.689807340436982, - "grad_norm": 0.5881028175354004, - "learning_rate": 0.0004991811227238443, - "loss": 3.6704, + "epoch": 1.692722371967655, + "grad_norm": 0.6019673943519592, + "learning_rate": 0.0004990070156502968, + "loss": 3.6808, "step": 15700 }, { - "epoch": 1.6951888924765903, - "grad_norm": 0.5460503101348877, - "learning_rate": 0.0004988578816937829, - "loss": 3.6895, + "epoch": 1.6981132075471699, + "grad_norm": 0.5684689879417419, + "learning_rate": 0.0004986832164058284, + "loss": 3.6617, "step": 15750 }, { - "epoch": 1.7005704445161984, - "grad_norm": 0.6377148032188416, - "learning_rate": 0.0004985346406637215, - "loss": 3.6985, + "epoch": 1.7035040431266846, + "grad_norm": 0.6181156635284424, + "learning_rate": 0.0004983594171613599, + "loss": 3.6723, "step": 15800 }, { - "epoch": 1.7059519965558065, - "grad_norm": 0.5566238760948181, - "learning_rate": 0.0004982113996336602, - "loss": 3.6537, + "epoch": 1.7088948787061995, + "grad_norm": 0.6059028506278992, + "learning_rate": 0.0004980356179168915, + "loss": 3.6885, "step": 15850 }, { - "epoch": 1.7113335485954149, - "grad_norm": 0.5709816813468933, - "learning_rate": 0.0004978881586035987, - "loss": 3.6434, + "epoch": 1.7142857142857144, + "grad_norm": 0.5463699698448181, + "learning_rate": 0.000497711818672423, + "loss": 3.6892, "step": 15900 }, { - "epoch": 1.7167151006350232, - "grad_norm": 0.5734738111495972, - "learning_rate": 0.0004975649175735373, - "loss": 3.6449, + "epoch": 1.719676549865229, + "grad_norm": 0.5700849890708923, + "learning_rate": 0.0004973880194279547, + "loss": 3.6756, "step": 15950 }, { - "epoch": 1.7220966526746313, - "grad_norm": 0.5572895407676697, - "learning_rate": 0.0004972416765434759, - "loss": 3.6579, + "epoch": 1.7250673854447438, + "grad_norm": 0.61159747838974, + "learning_rate": 0.0004970642201834862, + "loss": 3.6719, "step": 16000 }, { - "epoch": 1.7220966526746313, - "eval_accuracy": 0.35718387360800713, - "eval_loss": 3.6182830333709717, - "eval_runtime": 214.6803, - "eval_samples_per_second": 83.897, - "eval_steps_per_second": 5.245, + "epoch": 1.7250673854447438, + "eval_accuracy": 0.3567654513370335, + "eval_loss": 3.625068426132202, + "eval_runtime": 153.0219, + "eval_samples_per_second": 117.702, + "eval_steps_per_second": 7.358, "step": 16000 }, { - "epoch": 1.7274782047142396, - "grad_norm": 0.5495673418045044, - "learning_rate": 0.0004969184355134145, - "loss": 3.6629, + "epoch": 1.7304582210242587, + "grad_norm": 0.6022645831108093, + "learning_rate": 0.0004967404209390178, + "loss": 3.6581, "step": 16050 }, { - "epoch": 1.732859756753848, - "grad_norm": 0.6197149753570557, - "learning_rate": 0.0004965951944833531, - "loss": 3.6439, + "epoch": 1.7358490566037736, + "grad_norm": 0.5485215187072754, + "learning_rate": 0.0004964166216945493, + "loss": 3.6612, "step": 16100 }, { - "epoch": 1.738241308793456, - "grad_norm": 0.5787531137466431, - "learning_rate": 0.0004962719534532916, - "loss": 3.6703, + "epoch": 1.7412398921832883, + "grad_norm": 0.6632840633392334, + "learning_rate": 0.000496092822450081, + "loss": 3.6679, "step": 16150 }, { - "epoch": 1.7436228608330642, - "grad_norm": 0.5500142574310303, - "learning_rate": 0.0004959487124232302, - "loss": 3.656, + "epoch": 1.7466307277628033, + "grad_norm": 0.6232796907424927, + "learning_rate": 0.0004957690232056125, + "loss": 3.6763, "step": 16200 }, { - "epoch": 1.7490044128726725, - "grad_norm": 0.5271518230438232, - "learning_rate": 0.0004956254713931688, - "loss": 3.6594, + "epoch": 1.7520215633423182, + "grad_norm": 0.5928359031677246, + "learning_rate": 0.0004954452239611441, + "loss": 3.6582, "step": 16250 }, { - "epoch": 1.7543859649122808, - "grad_norm": 0.5560300350189209, - "learning_rate": 0.0004953022303631074, - "loss": 3.6622, + "epoch": 1.7574123989218329, + "grad_norm": 0.542195737361908, + "learning_rate": 0.0004951214247166756, + "loss": 3.6535, "step": 16300 }, { - "epoch": 1.759767516951889, - "grad_norm": 0.5545780062675476, - "learning_rate": 0.0004949789893330459, - "loss": 3.6657, + "epoch": 1.7628032345013476, + "grad_norm": 0.5729801058769226, + "learning_rate": 0.0004947976254722072, + "loss": 3.6752, "step": 16350 }, { - "epoch": 1.765149068991497, - "grad_norm": 0.5637168884277344, - "learning_rate": 0.0004946557483029846, - "loss": 3.6701, + "epoch": 1.7681940700808625, + "grad_norm": 0.5516073107719421, + "learning_rate": 0.0004944738262277387, + "loss": 3.6533, "step": 16400 }, { - "epoch": 1.7705306210311054, - "grad_norm": 0.6499923467636108, - "learning_rate": 0.0004943325072729231, - "loss": 3.6591, + "epoch": 1.7735849056603774, + "grad_norm": 0.5613517165184021, + "learning_rate": 0.0004941500269832703, + "loss": 3.6646, "step": 16450 }, { - "epoch": 1.7759121730707137, - "grad_norm": 0.5951055884361267, - "learning_rate": 0.0004940092662428617, - "loss": 3.6589, + "epoch": 1.778975741239892, + "grad_norm": 0.503390908241272, + "learning_rate": 0.0004938262277388019, + "loss": 3.6498, "step": 16500 }, { - "epoch": 1.7812937251103218, - "grad_norm": 0.5989054441452026, - "learning_rate": 0.0004936860252128003, - "loss": 3.6615, + "epoch": 1.784366576819407, + "grad_norm": 0.5427384972572327, + "learning_rate": 0.0004935024284943335, + "loss": 3.6562, "step": 16550 }, { - "epoch": 1.78667527714993, - "grad_norm": 0.6071681976318359, - "learning_rate": 0.0004933627841827388, - "loss": 3.6413, + "epoch": 1.789757412398922, + "grad_norm": 0.5183855295181274, + "learning_rate": 0.000493178629249865, + "loss": 3.6698, "step": 16600 }, { - "epoch": 1.7920568291895382, - "grad_norm": 0.5401762127876282, - "learning_rate": 0.0004930395431526775, - "loss": 3.6646, + "epoch": 1.7951482479784366, + "grad_norm": 0.5511346459388733, + "learning_rate": 0.0004928548300053966, + "loss": 3.6607, "step": 16650 }, { - "epoch": 1.7974383812291466, - "grad_norm": 0.6720755100250244, - "learning_rate": 0.0004927163021226161, - "loss": 3.6535, + "epoch": 1.8005390835579513, + "grad_norm": 0.6408042907714844, + "learning_rate": 0.0004925310307609282, + "loss": 3.669, "step": 16700 }, { - "epoch": 1.8028199332687547, - "grad_norm": 0.5372287034988403, - "learning_rate": 0.0004923930610925547, - "loss": 3.6387, + "epoch": 1.8059299191374663, + "grad_norm": 0.5517454743385315, + "learning_rate": 0.0004922072315164598, + "loss": 3.6657, "step": 16750 }, { - "epoch": 1.8082014853083628, - "grad_norm": 0.5767584443092346, - "learning_rate": 0.0004920698200624932, - "loss": 3.6509, + "epoch": 1.8113207547169812, + "grad_norm": 0.5822569727897644, + "learning_rate": 0.0004918834322719913, + "loss": 3.6735, "step": 16800 }, { - "epoch": 1.813583037347971, - "grad_norm": 0.619735598564148, - "learning_rate": 0.0004917465790324317, - "loss": 3.6593, + "epoch": 1.8167115902964959, + "grad_norm": 0.5561976432800293, + "learning_rate": 0.0004915596330275229, + "loss": 3.6814, "step": 16850 }, { - "epoch": 1.8189645893875794, - "grad_norm": 0.612182080745697, - "learning_rate": 0.0004914233380023704, - "loss": 3.669, + "epoch": 1.8221024258760108, + "grad_norm": 0.547584593296051, + "learning_rate": 0.0004912358337830544, + "loss": 3.6533, "step": 16900 }, { - "epoch": 1.8243461414271875, - "grad_norm": 0.5973613262176514, - "learning_rate": 0.0004911000969723089, - "loss": 3.6581, + "epoch": 1.8274932614555257, + "grad_norm": 0.5689830780029297, + "learning_rate": 0.000490912034538586, + "loss": 3.6651, "step": 16950 }, { - "epoch": 1.8297276934667959, - "grad_norm": 0.5377869606018066, - "learning_rate": 0.0004907768559422476, - "loss": 3.6582, + "epoch": 1.8328840970350404, + "grad_norm": 0.5999789237976074, + "learning_rate": 0.0004905882352941175, + "loss": 3.6449, "step": 17000 }, { - "epoch": 1.8297276934667959, - "eval_accuracy": 0.3589950085949867, - "eval_loss": 3.60500168800354, - "eval_runtime": 203.9944, - "eval_samples_per_second": 88.292, - "eval_steps_per_second": 5.52, + "epoch": 1.8328840970350404, + "eval_accuracy": 0.3580432092977318, + "eval_loss": 3.6107499599456787, + "eval_runtime": 152.6204, + "eval_samples_per_second": 118.012, + "eval_steps_per_second": 7.378, "step": 17000 }, { - "epoch": 1.8351092455064042, - "grad_norm": 0.5382079482078552, - "learning_rate": 0.0004904536149121861, - "loss": 3.6536, + "epoch": 1.838274932614555, + "grad_norm": 0.5623023509979248, + "learning_rate": 0.0004902644360496492, + "loss": 3.6507, "step": 17050 }, { - "epoch": 1.8404907975460123, - "grad_norm": 0.5734342336654663, - "learning_rate": 0.0004901303738821248, - "loss": 3.6325, + "epoch": 1.8436657681940702, + "grad_norm": 0.5425819754600525, + "learning_rate": 0.0004899406368051808, + "loss": 3.6409, "step": 17100 }, { - "epoch": 1.8458723495856204, - "grad_norm": 0.6593245267868042, - "learning_rate": 0.0004898071328520633, - "loss": 3.6733, + "epoch": 1.849056603773585, + "grad_norm": 0.5366511940956116, + "learning_rate": 0.0004896168375607123, + "loss": 3.6465, "step": 17150 }, { - "epoch": 1.8512539016252287, - "grad_norm": 0.5794579982757568, - "learning_rate": 0.0004894838918220019, - "loss": 3.6634, + "epoch": 1.8544474393530996, + "grad_norm": 0.5287717580795288, + "learning_rate": 0.0004892930383162439, + "loss": 3.6553, "step": 17200 }, { - "epoch": 1.856635453664837, - "grad_norm": 0.5953862071037292, - "learning_rate": 0.0004891606507919405, - "loss": 3.6544, + "epoch": 1.8598382749326146, + "grad_norm": 0.5218010544776917, + "learning_rate": 0.0004889692390717754, + "loss": 3.6619, "step": 17250 }, { - "epoch": 1.8620170057044452, - "grad_norm": 0.6200346946716309, - "learning_rate": 0.000488837409761879, - "loss": 3.6443, + "epoch": 1.8652291105121295, + "grad_norm": 0.5530878901481628, + "learning_rate": 0.0004886454398273071, + "loss": 3.6631, "step": 17300 }, { - "epoch": 1.8673985577440533, - "grad_norm": 0.585228681564331, - "learning_rate": 0.0004885141687318177, - "loss": 3.6478, + "epoch": 1.8706199460916442, + "grad_norm": 0.5743786096572876, + "learning_rate": 0.0004883216405828386, + "loss": 3.6628, "step": 17350 }, { - "epoch": 1.8727801097836616, - "grad_norm": 0.6394591927528381, - "learning_rate": 0.00048819092770175623, - "loss": 3.6416, + "epoch": 1.8760107816711589, + "grad_norm": 0.542460024356842, + "learning_rate": 0.00048799784133837017, + "loss": 3.6582, "step": 17400 }, { - "epoch": 1.87816166182327, - "grad_norm": 0.5823236107826233, - "learning_rate": 0.0004878676866716948, - "loss": 3.6572, + "epoch": 1.881401617250674, + "grad_norm": 0.6218643188476562, + "learning_rate": 0.0004876740420939017, + "loss": 3.6466, "step": 17450 }, { - "epoch": 1.883543213862878, - "grad_norm": 0.5914448499679565, - "learning_rate": 0.00048754444564163337, - "loss": 3.6479, + "epoch": 1.8867924528301887, + "grad_norm": 0.5412008166313171, + "learning_rate": 0.0004873567188343227, + "loss": 3.6474, "step": 17500 }, { - "epoch": 1.8889247659024861, - "grad_norm": 0.6287000775337219, - "learning_rate": 0.000487221204611572, - "loss": 3.6431, + "epoch": 1.8921832884097034, + "grad_norm": 0.5895251035690308, + "learning_rate": 0.0004870329195898542, + "loss": 3.645, "step": 17550 }, { - "epoch": 1.8943063179420945, - "grad_norm": 0.5507499575614929, - "learning_rate": 0.00048689796358151056, - "loss": 3.6444, + "epoch": 1.8975741239892183, + "grad_norm": 0.5757617950439453, + "learning_rate": 0.00048670912034538583, + "loss": 3.66, "step": 17600 }, { - "epoch": 1.8996878699817028, - "grad_norm": 0.5838987827301025, - "learning_rate": 0.00048657472255144915, - "loss": 3.649, + "epoch": 1.9029649595687332, + "grad_norm": 0.6187827587127686, + "learning_rate": 0.0004863853211009174, + "loss": 3.6504, "step": 17650 }, { - "epoch": 1.905069422021311, - "grad_norm": 0.6239995360374451, - "learning_rate": 0.00048625148152138775, - "loss": 3.6385, + "epoch": 1.908355795148248, + "grad_norm": 0.5335280299186707, + "learning_rate": 0.000486061521856449, + "loss": 3.644, "step": 17700 }, { - "epoch": 1.910450974060919, - "grad_norm": 0.5718323588371277, - "learning_rate": 0.0004859282404913263, - "loss": 3.6248, + "epoch": 1.9137466307277629, + "grad_norm": 0.5789933800697327, + "learning_rate": 0.00048573772261198054, + "loss": 3.6661, "step": 17750 }, { - "epoch": 1.9158325261005273, - "grad_norm": 0.5826128125190735, - "learning_rate": 0.0004856049994612649, - "loss": 3.6441, + "epoch": 1.9191374663072778, + "grad_norm": 0.6365672945976257, + "learning_rate": 0.00048541392336751214, + "loss": 3.6509, "step": 17800 }, { - "epoch": 1.9212140781401357, - "grad_norm": 0.6261082291603088, - "learning_rate": 0.00048528175843120353, - "loss": 3.6379, + "epoch": 1.9245283018867925, + "grad_norm": 0.5443440079689026, + "learning_rate": 0.0004850901241230437, + "loss": 3.6415, "step": 17850 }, { - "epoch": 1.9265956301797438, - "grad_norm": 0.5729628205299377, - "learning_rate": 0.0004849585174011421, - "loss": 3.6403, + "epoch": 1.9299191374663072, + "grad_norm": 0.5827429294586182, + "learning_rate": 0.0004847663248785753, + "loss": 3.655, "step": 17900 }, { - "epoch": 1.931977182219352, - "grad_norm": 0.6479345560073853, - "learning_rate": 0.00048464174119168193, - "loss": 3.6513, + "epoch": 1.935309973045822, + "grad_norm": 0.5369715094566345, + "learning_rate": 0.0004844425256341068, + "loss": 3.6345, "step": 17950 }, { - "epoch": 1.9373587342589604, - "grad_norm": 0.6019191145896912, - "learning_rate": 0.0004843185001616205, - "loss": 3.6427, + "epoch": 1.940700808625337, + "grad_norm": 0.5673846006393433, + "learning_rate": 0.00048411872638963834, + "loss": 3.6395, "step": 18000 }, { - "epoch": 1.9373587342589604, - "eval_accuracy": 0.3606186086975985, - "eval_loss": 3.589313268661499, - "eval_runtime": 204.8701, - "eval_samples_per_second": 87.914, - "eval_steps_per_second": 5.496, + "epoch": 1.940700808625337, + "eval_accuracy": 0.35913441025005266, + "eval_loss": 3.5955255031585693, + "eval_runtime": 152.9984, + "eval_samples_per_second": 117.72, + "eval_steps_per_second": 7.36, "step": 18000 }, { - "epoch": 1.9427402862985685, - "grad_norm": 0.5419387817382812, - "learning_rate": 0.00048399525913155907, - "loss": 3.6361, + "epoch": 1.9460916442048517, + "grad_norm": 0.5917626619338989, + "learning_rate": 0.00048379492714516995, + "loss": 3.6433, "step": 18050 }, { - "epoch": 1.9481218383381766, - "grad_norm": 0.6220253109931946, - "learning_rate": 0.0004836720181014976, - "loss": 3.6207, + "epoch": 1.9514824797843666, + "grad_norm": 0.5339709520339966, + "learning_rate": 0.0004834711279007015, + "loss": 3.6414, "step": 18100 }, { - "epoch": 1.953503390377785, - "grad_norm": 0.5842125415802002, - "learning_rate": 0.0004833487770714362, - "loss": 3.6405, + "epoch": 1.9568733153638815, + "grad_norm": 0.5372532606124878, + "learning_rate": 0.0004831473286562331, + "loss": 3.6388, "step": 18150 }, { - "epoch": 1.9588849424173933, - "grad_norm": 0.5908029675483704, - "learning_rate": 0.00048302553604137485, - "loss": 3.6445, + "epoch": 1.9622641509433962, + "grad_norm": 0.605369508266449, + "learning_rate": 0.00048282352941176465, + "loss": 3.6292, "step": 18200 }, { - "epoch": 1.9642664944570014, - "grad_norm": 0.5424453020095825, - "learning_rate": 0.0004827022950113134, - "loss": 3.6419, + "epoch": 1.967654986522911, + "grad_norm": 0.566500723361969, + "learning_rate": 0.00048249973016729626, + "loss": 3.6473, "step": 18250 }, { - "epoch": 1.9696480464966095, - "grad_norm": 0.6032727360725403, - "learning_rate": 0.000482379053981252, - "loss": 3.6507, + "epoch": 1.9730458221024259, + "grad_norm": 0.6016471982002258, + "learning_rate": 0.0004821759309228278, + "loss": 3.6583, "step": 18300 }, { - "epoch": 1.9750295985362178, - "grad_norm": 0.6096407771110535, - "learning_rate": 0.0004820558129511906, - "loss": 3.6364, + "epoch": 1.9784366576819408, + "grad_norm": 0.5543355345726013, + "learning_rate": 0.00048185213167835936, + "loss": 3.6431, "step": 18350 }, { - "epoch": 1.9804111505758262, - "grad_norm": 0.6347204446792603, - "learning_rate": 0.0004817325719211291, - "loss": 3.6451, + "epoch": 1.9838274932614555, + "grad_norm": 0.6136170029640198, + "learning_rate": 0.00048152833243389096, + "loss": 3.643, "step": 18400 }, { - "epoch": 1.9857927026154343, - "grad_norm": 0.5634959936141968, - "learning_rate": 0.0004814093308910677, - "loss": 3.641, + "epoch": 1.9892183288409704, + "grad_norm": 0.5570639371871948, + "learning_rate": 0.0004812045331894225, + "loss": 3.6528, "step": 18450 }, { - "epoch": 1.9911742546550424, - "grad_norm": 0.5860807299613953, - "learning_rate": 0.00048108608986100637, - "loss": 3.6336, + "epoch": 1.9946091644204853, + "grad_norm": 0.5984097719192505, + "learning_rate": 0.0004808807339449541, + "loss": 3.6418, "step": 18500 }, { - "epoch": 1.9965558066946507, - "grad_norm": 0.5338708162307739, - "learning_rate": 0.0004807628488309449, - "loss": 3.6211, + "epoch": 2.0, + "grad_norm": 1.259522795677185, + "learning_rate": 0.0004805569347004856, + "loss": 3.6473, "step": 18550 }, { - "epoch": 2.001937358734259, - "grad_norm": 0.5974145531654358, - "learning_rate": 0.0004804396078008835, - "loss": 3.5958, + "epoch": 2.0053908355795147, + "grad_norm": 0.5877414345741272, + "learning_rate": 0.0004802331354560173, + "loss": 3.5795, "step": 18600 }, { - "epoch": 2.007318910773867, - "grad_norm": 0.5562605261802673, - "learning_rate": 0.00048011636677082204, - "loss": 3.5582, + "epoch": 2.01078167115903, + "grad_norm": 0.5764709711074829, + "learning_rate": 0.00047990933621154877, + "loss": 3.5482, "step": 18650 }, { - "epoch": 2.0127004628134753, - "grad_norm": 0.6376757025718689, - "learning_rate": 0.00047979312574076064, - "loss": 3.5377, + "epoch": 2.0161725067385445, + "grad_norm": 0.5639136433601379, + "learning_rate": 0.0004795855369670804, + "loss": 3.5606, "step": 18700 }, { - "epoch": 2.018082014853084, - "grad_norm": 0.5702477097511292, - "learning_rate": 0.0004794698847106992, - "loss": 3.5431, + "epoch": 2.0215633423180592, + "grad_norm": 0.5635726451873779, + "learning_rate": 0.0004792617377226119, + "loss": 3.5408, "step": 18750 }, { - "epoch": 2.023463566892692, - "grad_norm": 0.5655612945556641, - "learning_rate": 0.0004791466436806378, - "loss": 3.5661, + "epoch": 2.026954177897574, + "grad_norm": 0.6025230884552002, + "learning_rate": 0.0004789379384781435, + "loss": 3.5591, "step": 18800 }, { - "epoch": 2.0288451189323, - "grad_norm": 0.5684214234352112, - "learning_rate": 0.0004788234026505764, - "loss": 3.5378, + "epoch": 2.032345013477089, + "grad_norm": 0.5429965257644653, + "learning_rate": 0.0004786141392336751, + "loss": 3.5674, "step": 18850 }, { - "epoch": 2.034226670971908, - "grad_norm": 0.595730185508728, - "learning_rate": 0.00047850016162051496, - "loss": 3.5512, + "epoch": 2.0377358490566038, + "grad_norm": 0.5635925531387329, + "learning_rate": 0.00047829033998920663, + "loss": 3.5557, "step": 18900 }, { - "epoch": 2.0396082230115167, - "grad_norm": 0.602304220199585, - "learning_rate": 0.00047817692059045356, - "loss": 3.5471, + "epoch": 2.0431266846361185, + "grad_norm": 0.5662844181060791, + "learning_rate": 0.00047796654074473824, + "loss": 3.5489, "step": 18950 }, { - "epoch": 2.044989775051125, - "grad_norm": 0.6218283176422119, - "learning_rate": 0.00047785367956039215, - "loss": 3.56, + "epoch": 2.0485175202156336, + "grad_norm": 0.5810090899467468, + "learning_rate": 0.0004776427415002698, + "loss": 3.5784, "step": 19000 }, { - "epoch": 2.044989775051125, - "eval_accuracy": 0.3616626542966078, - "eval_loss": 3.5785436630249023, - "eval_runtime": 202.2767, - "eval_samples_per_second": 89.041, - "eval_steps_per_second": 5.567, + "epoch": 2.0485175202156336, + "eval_accuracy": 0.3612909527673945, + "eval_loss": 3.5867395401000977, + "eval_runtime": 152.2957, + "eval_samples_per_second": 118.263, + "eval_steps_per_second": 7.394, "step": 19000 }, { - "epoch": 2.050371327090733, - "grad_norm": 0.5897380113601685, - "learning_rate": 0.00047753043853033075, - "loss": 3.5425, + "epoch": 2.0539083557951483, + "grad_norm": 0.5732460021972656, + "learning_rate": 0.0004773189422558014, + "loss": 3.5515, "step": 19050 }, { - "epoch": 2.055752879130341, - "grad_norm": 0.5932830572128296, - "learning_rate": 0.00047720719750026934, - "loss": 3.5699, + "epoch": 2.059299191374663, + "grad_norm": 0.5630287528038025, + "learning_rate": 0.00047699514301133294, + "loss": 3.5751, "step": 19100 }, { - "epoch": 2.0611344311699495, - "grad_norm": 0.5921191573143005, - "learning_rate": 0.00047688395647020793, - "loss": 3.5431, + "epoch": 2.0646900269541777, + "grad_norm": 0.5999619364738464, + "learning_rate": 0.00047667134376686455, + "loss": 3.569, "step": 19150 }, { - "epoch": 2.0665159832095576, - "grad_norm": 0.6052051186561584, - "learning_rate": 0.0004765607154401465, - "loss": 3.5734, + "epoch": 2.070080862533693, + "grad_norm": 0.6230016350746155, + "learning_rate": 0.0004763475445223961, + "loss": 3.5766, "step": 19200 }, { - "epoch": 2.0718975352491658, - "grad_norm": 0.562523603439331, - "learning_rate": 0.00047623747441008507, - "loss": 3.543, + "epoch": 2.0754716981132075, + "grad_norm": 0.6227099895477295, + "learning_rate": 0.0004760237452779276, + "loss": 3.5583, "step": 19250 }, { - "epoch": 2.0772790872887743, - "grad_norm": 0.546420156955719, - "learning_rate": 0.0004759142333800236, - "loss": 3.5441, + "epoch": 2.0808625336927222, + "grad_norm": 0.575489342212677, + "learning_rate": 0.0004756999460334592, + "loss": 3.5529, "step": 19300 }, { - "epoch": 2.0826606393283824, - "grad_norm": 0.590813159942627, - "learning_rate": 0.00047559099234996226, - "loss": 3.5624, + "epoch": 2.0862533692722374, + "grad_norm": 0.6161721348762512, + "learning_rate": 0.00047537614678899075, + "loss": 3.5567, "step": 19350 }, { - "epoch": 2.0880421913679905, - "grad_norm": 0.6105584502220154, - "learning_rate": 0.00047526775131990085, - "loss": 3.557, + "epoch": 2.091644204851752, + "grad_norm": 0.570197582244873, + "learning_rate": 0.00047505234754452235, + "loss": 3.551, "step": 19400 }, { - "epoch": 2.0934237434075986, - "grad_norm": 0.6170618534088135, - "learning_rate": 0.0004749445102898394, - "loss": 3.5373, + "epoch": 2.0970350404312668, + "grad_norm": 0.6006742715835571, + "learning_rate": 0.0004747285483000539, + "loss": 3.5608, "step": 19450 }, { - "epoch": 2.098805295447207, - "grad_norm": 0.5814986228942871, - "learning_rate": 0.000474621269259778, - "loss": 3.5693, + "epoch": 2.1024258760107815, + "grad_norm": 0.5746577978134155, + "learning_rate": 0.00047441122504047486, + "loss": 3.5538, "step": 19500 }, { - "epoch": 2.1041868474868153, - "grad_norm": 0.6113174557685852, - "learning_rate": 0.0004742980282297166, - "loss": 3.5589, + "epoch": 2.1078167115902966, + "grad_norm": 0.5501875281333923, + "learning_rate": 0.0004740874257960064, + "loss": 3.559, "step": 19550 }, { - "epoch": 2.1095683995264234, - "grad_norm": 0.6312413811683655, - "learning_rate": 0.0004739747871996551, - "loss": 3.5398, + "epoch": 2.1132075471698113, + "grad_norm": 0.5990867018699646, + "learning_rate": 0.000473763626551538, + "loss": 3.5566, "step": 19600 }, { - "epoch": 2.1149499515660315, - "grad_norm": 0.6352087259292603, - "learning_rate": 0.00047365154616959377, - "loss": 3.5541, + "epoch": 2.118598382749326, + "grad_norm": 0.5872001647949219, + "learning_rate": 0.0004734592552617377, + "loss": 3.5883, "step": 19650 }, { - "epoch": 2.12033150360564, - "grad_norm": 0.6081045269966125, - "learning_rate": 0.00047332830513953237, - "loss": 3.576, + "epoch": 2.123989218328841, + "grad_norm": 0.5909648537635803, + "learning_rate": 0.00047313545601726926, + "loss": 3.5488, "step": 19700 }, { - "epoch": 2.125713055645248, - "grad_norm": 0.5704367160797119, - "learning_rate": 0.0004730050641094709, - "loss": 3.5563, + "epoch": 2.129380053908356, + "grad_norm": 0.5571679472923279, + "learning_rate": 0.0004728116567728008, + "loss": 3.5699, "step": 19750 }, { - "epoch": 2.1310946076848563, - "grad_norm": 0.6367883682250977, - "learning_rate": 0.0004726818230794095, - "loss": 3.5358, + "epoch": 2.1347708894878705, + "grad_norm": 0.5732533931732178, + "learning_rate": 0.0004724878575283324, + "loss": 3.5616, "step": 19800 }, { - "epoch": 2.1364761597244644, - "grad_norm": 0.5543844699859619, - "learning_rate": 0.00047235858204934804, - "loss": 3.5561, + "epoch": 2.1401617250673857, + "grad_norm": 0.5497268438339233, + "learning_rate": 0.00047216405828386397, + "loss": 3.5549, "step": 19850 }, { - "epoch": 2.141857711764073, - "grad_norm": 0.6325891017913818, - "learning_rate": 0.0004720353410192867, - "loss": 3.5459, + "epoch": 2.1455525606469004, + "grad_norm": 0.5779978632926941, + "learning_rate": 0.0004718402590393956, + "loss": 3.5673, "step": 19900 }, { - "epoch": 2.147239263803681, - "grad_norm": 0.5876049995422363, - "learning_rate": 0.0004717120999892253, - "loss": 3.5568, + "epoch": 2.150943396226415, + "grad_norm": 0.5528773069381714, + "learning_rate": 0.00047151645979492707, + "loss": 3.5603, "step": 19950 }, { - "epoch": 2.152620815843289, - "grad_norm": 0.5623601675033569, - "learning_rate": 0.0004713888589591638, - "loss": 3.5527, + "epoch": 2.1563342318059298, + "grad_norm": 0.6277232766151428, + "learning_rate": 0.0004711926605504587, + "loss": 3.5816, "step": 20000 }, { - "epoch": 2.152620815843289, - "eval_accuracy": 0.3627479879929867, - "eval_loss": 3.5705673694610596, - "eval_runtime": 206.2591, - "eval_samples_per_second": 87.322, - "eval_steps_per_second": 5.459, + "epoch": 2.1563342318059298, + "eval_accuracy": 0.3615700820361899, + "eval_loss": 3.5775678157806396, + "eval_runtime": 152.8956, + "eval_samples_per_second": 117.799, + "eval_steps_per_second": 7.365, "step": 20000 }, { - "epoch": 2.1580023678828972, - "grad_norm": 0.5725038051605225, - "learning_rate": 0.0004710656179291024, - "loss": 3.5565, + "epoch": 2.161725067385445, + "grad_norm": 0.5952591896057129, + "learning_rate": 0.0004708688613059902, + "loss": 3.5851, "step": 20050 }, { - "epoch": 2.163383919922506, - "grad_norm": 0.6287389993667603, - "learning_rate": 0.000470742376899041, - "loss": 3.5528, + "epoch": 2.1671159029649596, + "grad_norm": 0.6090236306190491, + "learning_rate": 0.0004705450620615218, + "loss": 3.5848, "step": 20100 }, { - "epoch": 2.168765471962114, - "grad_norm": 0.5884525179862976, - "learning_rate": 0.00047041913586897956, - "loss": 3.5597, + "epoch": 2.1725067385444743, + "grad_norm": 0.5715079307556152, + "learning_rate": 0.0004702212628170534, + "loss": 3.5757, "step": 20150 }, { - "epoch": 2.174147024001722, - "grad_norm": 0.5764286518096924, - "learning_rate": 0.0004700958948389182, - "loss": 3.5426, + "epoch": 2.177897574123989, + "grad_norm": 0.6015267968177795, + "learning_rate": 0.00046989746357258493, + "loss": 3.5736, "step": 20200 }, { - "epoch": 2.1795285760413305, - "grad_norm": 0.6041747331619263, - "learning_rate": 0.0004697726538088568, - "loss": 3.5471, + "epoch": 2.183288409703504, + "grad_norm": 0.5924991369247437, + "learning_rate": 0.00046957366432811654, + "loss": 3.5751, "step": 20250 }, { - "epoch": 2.1849101280809387, - "grad_norm": 0.5907628536224365, - "learning_rate": 0.00046944941277879534, - "loss": 3.5453, + "epoch": 2.188679245283019, + "grad_norm": 0.5487517714500427, + "learning_rate": 0.0004692498650836481, + "loss": 3.5663, "step": 20300 }, { - "epoch": 2.1902916801205468, - "grad_norm": 0.5632272958755493, - "learning_rate": 0.00046912617174873394, - "loss": 3.5558, + "epoch": 2.1940700808625335, + "grad_norm": 0.5642894506454468, + "learning_rate": 0.0004689260658391797, + "loss": 3.5811, "step": 20350 }, { - "epoch": 2.195673232160155, - "grad_norm": 0.6226711869239807, - "learning_rate": 0.0004688029307186725, - "loss": 3.5865, + "epoch": 2.1994609164420487, + "grad_norm": 0.6076687574386597, + "learning_rate": 0.00046860226659471124, + "loss": 3.5823, "step": 20400 }, { - "epoch": 2.2010547841997634, - "grad_norm": 0.5935404896736145, - "learning_rate": 0.00046847968968861107, - "loss": 3.5594, + "epoch": 2.2048517520215634, + "grad_norm": 0.5700116157531738, + "learning_rate": 0.00046827846735024285, + "loss": 3.5785, "step": 20450 }, { - "epoch": 2.2064363362393715, - "grad_norm": 0.5879222750663757, - "learning_rate": 0.0004681564486585497, - "loss": 3.5637, + "epoch": 2.210242587601078, + "grad_norm": 0.5856590867042542, + "learning_rate": 0.0004679546681057744, + "loss": 3.5476, "step": 20500 }, { - "epoch": 2.2118178882789796, - "grad_norm": 0.5888842940330505, - "learning_rate": 0.00046783320762848826, - "loss": 3.5438, + "epoch": 2.215633423180593, + "grad_norm": 0.5908544659614563, + "learning_rate": 0.0004676308688613059, + "loss": 3.5791, "step": 20550 }, { - "epoch": 2.2171994403185877, - "grad_norm": 0.5803065299987793, - "learning_rate": 0.00046750996659842685, - "loss": 3.5429, + "epoch": 2.221024258760108, + "grad_norm": 0.6657243371009827, + "learning_rate": 0.00046730706961683755, + "loss": 3.5541, "step": 20600 }, { - "epoch": 2.2225809923581963, - "grad_norm": 0.5627387166023254, - "learning_rate": 0.00046718672556836545, - "loss": 3.5601, + "epoch": 2.2264150943396226, + "grad_norm": 0.5953009724617004, + "learning_rate": 0.00046698327037236905, + "loss": 3.5625, "step": 20650 }, { - "epoch": 2.2279625443978044, - "grad_norm": 0.5622759461402893, - "learning_rate": 0.000466863484538304, - "loss": 3.5676, + "epoch": 2.2318059299191373, + "grad_norm": 0.5696465969085693, + "learning_rate": 0.00046665947112790065, + "loss": 3.5584, "step": 20700 }, { - "epoch": 2.2333440964374125, - "grad_norm": 0.5685352087020874, - "learning_rate": 0.0004665402435082426, - "loss": 3.5572, + "epoch": 2.2371967654986524, + "grad_norm": 0.5952767729759216, + "learning_rate": 0.0004663356718834322, + "loss": 3.5659, "step": 20750 }, { - "epoch": 2.2387256484770206, - "grad_norm": 0.6102375388145447, - "learning_rate": 0.00046621700247818123, - "loss": 3.5546, + "epoch": 2.242587601078167, + "grad_norm": 0.5673699975013733, + "learning_rate": 0.0004660118726389638, + "loss": 3.5602, "step": 20800 }, { - "epoch": 2.244107200516629, - "grad_norm": 0.5915598273277283, - "learning_rate": 0.0004658937614481198, - "loss": 3.5519, + "epoch": 2.247978436657682, + "grad_norm": 0.5948861241340637, + "learning_rate": 0.00046568807339449536, + "loss": 3.5748, "step": 20850 }, { - "epoch": 2.2494887525562373, - "grad_norm": 0.5531293153762817, - "learning_rate": 0.00046557052041805837, - "loss": 3.5491, + "epoch": 2.2533692722371965, + "grad_norm": 0.5734835267066956, + "learning_rate": 0.00046536427415002696, + "loss": 3.568, "step": 20900 }, { - "epoch": 2.2548703045958454, - "grad_norm": 0.5677109956741333, - "learning_rate": 0.0004652472793879969, - "loss": 3.5562, + "epoch": 2.2587601078167117, + "grad_norm": 0.5433420538902283, + "learning_rate": 0.0004650404749055585, + "loss": 3.5583, "step": 20950 }, { - "epoch": 2.2602518566354535, - "grad_norm": 0.5943840742111206, - "learning_rate": 0.0004649240383579355, - "loss": 3.5487, + "epoch": 2.2641509433962264, + "grad_norm": 0.6004379391670227, + "learning_rate": 0.00046471667566109007, + "loss": 3.5716, "step": 21000 }, { - "epoch": 2.2602518566354535, - "eval_accuracy": 0.36394262649450687, - "eval_loss": 3.560176372528076, - "eval_runtime": 204.7535, - "eval_samples_per_second": 87.964, - "eval_steps_per_second": 5.499, + "epoch": 2.2641509433962264, + "eval_accuracy": 0.36321606463369144, + "eval_loss": 3.5651097297668457, + "eval_runtime": 152.3329, + "eval_samples_per_second": 118.234, + "eval_steps_per_second": 7.392, "step": 21000 }, { - "epoch": 2.265633408675062, - "grad_norm": 0.5741783380508423, - "learning_rate": 0.00046460079732787415, - "loss": 3.5606, + "epoch": 2.269541778975741, + "grad_norm": 0.5563739538192749, + "learning_rate": 0.00046439287641662167, + "loss": 3.5663, "step": 21050 }, { - "epoch": 2.27101496071467, - "grad_norm": 0.6168876886367798, - "learning_rate": 0.0004642775562978127, - "loss": 3.5662, + "epoch": 2.274932614555256, + "grad_norm": 0.6050722599029541, + "learning_rate": 0.0004640690771721532, + "loss": 3.562, "step": 21100 }, { - "epoch": 2.2763965127542782, - "grad_norm": 0.6492966413497925, - "learning_rate": 0.0004639543152677513, - "loss": 3.5546, + "epoch": 2.280323450134771, + "grad_norm": 0.5888524055480957, + "learning_rate": 0.0004637452779276848, + "loss": 3.5838, "step": 21150 }, { - "epoch": 2.281778064793887, - "grad_norm": 0.634781539440155, - "learning_rate": 0.0004636310742376899, - "loss": 3.5674, + "epoch": 2.2857142857142856, + "grad_norm": 0.6243483424186707, + "learning_rate": 0.0004634214786832164, + "loss": 3.5691, "step": 21200 }, { - "epoch": 2.287159616833495, - "grad_norm": 0.5752384066581726, - "learning_rate": 0.0004633078332076284, - "loss": 3.5561, + "epoch": 2.2911051212938007, + "grad_norm": 0.6089298725128174, + "learning_rate": 0.000463097679438748, + "loss": 3.5804, "step": 21250 }, { - "epoch": 2.292541168873103, - "grad_norm": 0.5751746296882629, - "learning_rate": 0.000462984592177567, - "loss": 3.5505, + "epoch": 2.2964959568733154, + "grad_norm": 0.5981297492980957, + "learning_rate": 0.0004627738801942795, + "loss": 3.5454, "step": 21300 }, { - "epoch": 2.297922720912711, - "grad_norm": 0.5733442902565002, - "learning_rate": 0.00046266135114750567, - "loss": 3.5489, + "epoch": 2.30188679245283, + "grad_norm": 0.7177810072898865, + "learning_rate": 0.0004624500809498111, + "loss": 3.5644, "step": 21350 }, { - "epoch": 2.303304272952319, - "grad_norm": 0.6262395977973938, - "learning_rate": 0.0004623381101174442, - "loss": 3.552, + "epoch": 2.3072776280323453, + "grad_norm": 0.5293326377868652, + "learning_rate": 0.00046212628170534263, + "loss": 3.559, "step": 21400 }, { - "epoch": 2.3086858249919278, - "grad_norm": 0.6084133982658386, - "learning_rate": 0.0004620148690873828, - "loss": 3.5515, + "epoch": 2.31266846361186, + "grad_norm": 0.5836008191108704, + "learning_rate": 0.0004618024824608742, + "loss": 3.5843, "step": 21450 }, { - "epoch": 2.314067377031536, - "grad_norm": 0.558005690574646, - "learning_rate": 0.00046169162805732134, - "loss": 3.5501, + "epoch": 2.3180592991913747, + "grad_norm": 0.5900083780288696, + "learning_rate": 0.0004614786832164058, + "loss": 3.5552, "step": 21500 }, { - "epoch": 2.319448929071144, - "grad_norm": 0.656355619430542, - "learning_rate": 0.00046136838702725994, - "loss": 3.5677, + "epoch": 2.3234501347708894, + "grad_norm": 0.5620343685150146, + "learning_rate": 0.00046115488397193734, + "loss": 3.569, "step": 21550 }, { - "epoch": 2.3248304811107525, - "grad_norm": 0.6125053763389587, - "learning_rate": 0.00046104514599719853, - "loss": 3.5496, + "epoch": 2.3288409703504045, + "grad_norm": 0.6022666096687317, + "learning_rate": 0.00046083108472746894, + "loss": 3.5624, "step": 21600 }, { - "epoch": 2.3302120331503606, - "grad_norm": 0.708257257938385, - "learning_rate": 0.0004607219049671371, - "loss": 3.594, + "epoch": 2.334231805929919, + "grad_norm": 0.7098419070243835, + "learning_rate": 0.0004605072854830005, + "loss": 3.563, "step": 21650 }, { - "epoch": 2.3355935851899687, - "grad_norm": 0.6216398477554321, - "learning_rate": 0.0004603986639370757, - "loss": 3.5749, + "epoch": 2.339622641509434, + "grad_norm": 0.6487218141555786, + "learning_rate": 0.0004601834862385321, + "loss": 3.5825, "step": 21700 }, { - "epoch": 2.340975137229577, - "grad_norm": 0.5760979056358337, - "learning_rate": 0.0004600754229070143, - "loss": 3.5546, + "epoch": 2.3450134770889486, + "grad_norm": 0.6314107179641724, + "learning_rate": 0.00045985968699406365, + "loss": 3.5701, "step": 21750 }, { - "epoch": 2.3463566892691854, - "grad_norm": 0.5885084867477417, - "learning_rate": 0.00045975218187695286, - "loss": 3.5654, + "epoch": 2.3504043126684637, + "grad_norm": 0.5793794989585876, + "learning_rate": 0.0004595358877495952, + "loss": 3.5527, "step": 21800 }, { - "epoch": 2.3517382413087935, - "grad_norm": 0.6174579858779907, - "learning_rate": 0.00045942894084689145, - "loss": 3.5543, + "epoch": 2.3557951482479784, + "grad_norm": 0.5867211222648621, + "learning_rate": 0.0004592120885051268, + "loss": 3.562, "step": 21850 }, { - "epoch": 2.3571197933484016, - "grad_norm": 0.6659450531005859, - "learning_rate": 0.0004591056998168301, - "loss": 3.553, + "epoch": 2.361185983827493, + "grad_norm": 0.5849786400794983, + "learning_rate": 0.0004588882892606583, + "loss": 3.555, "step": 21900 }, { - "epoch": 2.3625013453880097, - "grad_norm": 0.5789748430252075, - "learning_rate": 0.00045878245878676864, - "loss": 3.5611, + "epoch": 2.3665768194070083, + "grad_norm": 0.5810790657997131, + "learning_rate": 0.00045856449001618996, + "loss": 3.5578, "step": 21950 }, { - "epoch": 2.3678828974276183, - "grad_norm": 0.5881466865539551, - "learning_rate": 0.00045846568257730845, - "loss": 3.5664, + "epoch": 2.371967654986523, + "grad_norm": 0.5677943825721741, + "learning_rate": 0.00045824069077172146, + "loss": 3.5795, "step": 22000 }, { - "epoch": 2.3678828974276183, - "eval_accuracy": 0.36495016472321057, - "eval_loss": 3.5474905967712402, - "eval_runtime": 194.8012, - "eval_samples_per_second": 92.458, - "eval_steps_per_second": 5.78, + "epoch": 2.371967654986523, + "eval_accuracy": 0.3641010424049404, + "eval_loss": 3.556856393814087, + "eval_runtime": 152.9462, + "eval_samples_per_second": 117.76, + "eval_steps_per_second": 7.362, "step": 22000 }, { - "epoch": 2.3732644494672264, - "grad_norm": 0.6417264342308044, - "learning_rate": 0.00045814244154724704, - "loss": 3.5509, + "epoch": 2.3773584905660377, + "grad_norm": 0.5545417070388794, + "learning_rate": 0.00045791689152725306, + "loss": 3.5555, "step": 22050 }, { - "epoch": 2.3786460015068345, - "grad_norm": 0.6134101748466492, - "learning_rate": 0.00045781920051718563, - "loss": 3.5482, + "epoch": 2.382749326145553, + "grad_norm": 0.662285327911377, + "learning_rate": 0.0004575930922827846, + "loss": 3.5585, "step": 22100 }, { - "epoch": 2.384027553546443, - "grad_norm": 0.8174729943275452, - "learning_rate": 0.0004574959594871242, - "loss": 3.5538, + "epoch": 2.3881401617250675, + "grad_norm": 0.6141922473907471, + "learning_rate": 0.0004572692930383162, + "loss": 3.5602, "step": 22150 }, { - "epoch": 2.389409105586051, - "grad_norm": 0.6091681122779846, - "learning_rate": 0.00045717271845706277, - "loss": 3.5516, + "epoch": 2.393530997304582, + "grad_norm": 0.6011219024658203, + "learning_rate": 0.00045694549379384777, + "loss": 3.5705, "step": 22200 }, { - "epoch": 2.3947906576256592, - "grad_norm": 0.5839729905128479, - "learning_rate": 0.0004568494774270013, - "loss": 3.5545, + "epoch": 2.398921832884097, + "grad_norm": 0.5966909527778625, + "learning_rate": 0.0004566216945493793, + "loss": 3.575, "step": 22250 }, { - "epoch": 2.4001722096652673, - "grad_norm": 0.620452344417572, - "learning_rate": 0.00045652623639693996, - "loss": 3.5427, + "epoch": 2.404312668463612, + "grad_norm": 0.5801154971122742, + "learning_rate": 0.0004562978953049109, + "loss": 3.5767, "step": 22300 }, { - "epoch": 2.4055537617048754, - "grad_norm": 0.6439001560211182, - "learning_rate": 0.00045620299536687855, - "loss": 3.554, + "epoch": 2.4097035040431267, + "grad_norm": 0.5791796445846558, + "learning_rate": 0.0004559740960604425, + "loss": 3.558, "step": 22350 }, { - "epoch": 2.410935313744484, - "grad_norm": 0.6319786906242371, - "learning_rate": 0.0004558797543368171, - "loss": 3.5723, + "epoch": 2.4150943396226414, + "grad_norm": 0.6324838399887085, + "learning_rate": 0.0004556502968159741, + "loss": 3.5805, "step": 22400 }, { - "epoch": 2.416316865784092, - "grad_norm": 0.6055615544319153, - "learning_rate": 0.0004555565133067557, - "loss": 3.5568, + "epoch": 2.420485175202156, + "grad_norm": 0.589653491973877, + "learning_rate": 0.00045532649757150563, + "loss": 3.5696, "step": 22450 }, { - "epoch": 2.4216984178237, - "grad_norm": 0.6199345588684082, - "learning_rate": 0.0004552332722766943, - "loss": 3.549, + "epoch": 2.4258760107816713, + "grad_norm": 0.5994852781295776, + "learning_rate": 0.00045500269832703723, + "loss": 3.5624, "step": 22500 }, { - "epoch": 2.4270799698633088, - "grad_norm": 0.587843656539917, - "learning_rate": 0.0004549100312466328, - "loss": 3.5311, + "epoch": 2.431266846361186, + "grad_norm": 0.5957964062690735, + "learning_rate": 0.0004546788990825688, + "loss": 3.5596, "step": 22550 }, { - "epoch": 2.432461521902917, - "grad_norm": 0.5868039727210999, - "learning_rate": 0.0004545867902165715, - "loss": 3.5624, + "epoch": 2.4366576819407006, + "grad_norm": 0.5986453890800476, + "learning_rate": 0.0004543550998381004, + "loss": 3.56, "step": 22600 }, { - "epoch": 2.437843073942525, - "grad_norm": 0.5922572016716003, - "learning_rate": 0.00045426354918651007, - "loss": 3.5538, + "epoch": 2.442048517520216, + "grad_norm": 0.5685117244720459, + "learning_rate": 0.0004540313005936319, + "loss": 3.5473, "step": 22650 }, { - "epoch": 2.443224625982133, - "grad_norm": 0.6361838579177856, - "learning_rate": 0.0004539403081564486, - "loss": 3.5364, + "epoch": 2.4474393530997305, + "grad_norm": 0.6107739210128784, + "learning_rate": 0.00045370750134916344, + "loss": 3.5517, "step": 22700 }, { - "epoch": 2.4486061780217416, - "grad_norm": 0.5849791765213013, - "learning_rate": 0.0004536170671263872, - "loss": 3.5606, + "epoch": 2.452830188679245, + "grad_norm": 0.5900905132293701, + "learning_rate": 0.00045338370210469504, + "loss": 3.555, "step": 22750 }, { - "epoch": 2.4539877300613497, - "grad_norm": 0.6513404250144958, - "learning_rate": 0.00045329382609632574, - "loss": 3.535, + "epoch": 2.4582210242587603, + "grad_norm": 0.5761315822601318, + "learning_rate": 0.0004530599028602266, + "loss": 3.5514, "step": 22800 }, { - "epoch": 2.459369282100958, - "grad_norm": 0.5861430168151855, - "learning_rate": 0.0004529705850662644, - "loss": 3.5432, + "epoch": 2.463611859838275, + "grad_norm": 0.6041805744171143, + "learning_rate": 0.0004527361036157582, + "loss": 3.5594, "step": 22850 }, { - "epoch": 2.464750834140566, - "grad_norm": 0.6476746797561646, - "learning_rate": 0.000452647344036203, - "loss": 3.5454, + "epoch": 2.4690026954177897, + "grad_norm": 0.6323763132095337, + "learning_rate": 0.00045241230437128975, + "loss": 3.5682, "step": 22900 }, { - "epoch": 2.4701323861801745, - "grad_norm": 0.6115290522575378, - "learning_rate": 0.00045232410300614153, - "loss": 3.5439, + "epoch": 2.4743935309973044, + "grad_norm": 0.6081458926200867, + "learning_rate": 0.00045208850512682135, + "loss": 3.5714, "step": 22950 }, { - "epoch": 2.4755139382197826, - "grad_norm": 0.5903562903404236, - "learning_rate": 0.0004520008619760801, - "loss": 3.5464, + "epoch": 2.4797843665768196, + "grad_norm": 0.5616876482963562, + "learning_rate": 0.0004517647058823529, + "loss": 3.5451, "step": 23000 }, { - "epoch": 2.4755139382197826, - "eval_accuracy": 0.3655437354493411, - "eval_loss": 3.5417819023132324, - "eval_runtime": 206.588, - "eval_samples_per_second": 87.183, - "eval_steps_per_second": 5.45, + "epoch": 2.4797843665768196, + "eval_accuracy": 0.3651323756160755, + "eval_loss": 3.544189691543579, + "eval_runtime": 152.5592, + "eval_samples_per_second": 118.059, + "eval_steps_per_second": 7.381, "step": 23000 }, { - "epoch": 2.4808954902593907, - "grad_norm": 0.5942795276641846, - "learning_rate": 0.0004516776209460187, - "loss": 3.5293, + "epoch": 2.4851752021563343, + "grad_norm": 0.5997558832168579, + "learning_rate": 0.0004514409066378845, + "loss": 3.5474, "step": 23050 }, { - "epoch": 2.4862770422989993, - "grad_norm": 0.5559934973716736, - "learning_rate": 0.00045135437991595726, - "loss": 3.5569, + "epoch": 2.490566037735849, + "grad_norm": 0.5757705569267273, + "learning_rate": 0.00045111710739341606, + "loss": 3.5646, "step": 23100 }, { - "epoch": 2.4916585943386074, - "grad_norm": 0.6209527254104614, - "learning_rate": 0.0004510311388858959, - "loss": 3.5374, + "epoch": 2.4959568733153636, + "grad_norm": 0.5957797765731812, + "learning_rate": 0.0004507933081489476, + "loss": 3.5681, "step": 23150 }, { - "epoch": 2.4970401463782155, - "grad_norm": 0.5962895750999451, - "learning_rate": 0.0004507078978558345, - "loss": 3.5538, + "epoch": 2.501347708894879, + "grad_norm": 0.5993410348892212, + "learning_rate": 0.0004504695089044792, + "loss": 3.5585, "step": 23200 }, { - "epoch": 2.5024216984178236, - "grad_norm": 0.6051533222198486, - "learning_rate": 0.00045038465682577304, - "loss": 3.552, + "epoch": 2.5067385444743935, + "grad_norm": 0.5758466124534607, + "learning_rate": 0.0004501457096600107, + "loss": 3.5529, "step": 23250 }, { - "epoch": 2.5078032504574317, - "grad_norm": 0.5640018582344055, - "learning_rate": 0.00045006141579571164, - "loss": 3.5596, + "epoch": 2.512129380053908, + "grad_norm": 0.584588885307312, + "learning_rate": 0.00044982191041554237, + "loss": 3.5518, "step": 23300 }, { - "epoch": 2.5131848024970402, - "grad_norm": 0.6006429195404053, - "learning_rate": 0.0004497381747656502, - "loss": 3.5692, + "epoch": 2.5175202156334233, + "grad_norm": 0.6807150840759277, + "learning_rate": 0.00044949811117107386, + "loss": 3.5484, "step": 23350 }, { - "epoch": 2.5185663545366483, - "grad_norm": 0.6432710886001587, - "learning_rate": 0.00044941493373558877, - "loss": 3.5302, + "epoch": 2.522911051212938, + "grad_norm": 0.5969212055206299, + "learning_rate": 0.00044917431192660547, + "loss": 3.5403, "step": 23400 }, { - "epoch": 2.5239479065762565, - "grad_norm": 0.5585759282112122, - "learning_rate": 0.0004490916927055274, - "loss": 3.5559, + "epoch": 2.5283018867924527, + "grad_norm": 0.5543271899223328, + "learning_rate": 0.000448850512682137, + "loss": 3.5529, "step": 23450 }, { - "epoch": 2.529329458615865, - "grad_norm": 0.6355800032615662, - "learning_rate": 0.00044876845167546596, - "loss": 3.5332, + "epoch": 2.533692722371968, + "grad_norm": 0.5737673044204712, + "learning_rate": 0.0004485267134376686, + "loss": 3.5474, "step": 23500 }, { - "epoch": 2.534711010655473, - "grad_norm": 0.5865529179573059, - "learning_rate": 0.00044844521064540455, - "loss": 3.5558, + "epoch": 2.5390835579514826, + "grad_norm": 0.5664697885513306, + "learning_rate": 0.0004482029141932002, + "loss": 3.5604, "step": 23550 }, { - "epoch": 2.540092562695081, - "grad_norm": 0.6548492312431335, - "learning_rate": 0.00044812196961534315, - "loss": 3.5339, + "epoch": 2.5444743935309972, + "grad_norm": 0.5698988437652588, + "learning_rate": 0.0004478791149487317, + "loss": 3.5589, "step": 23600 }, { - "epoch": 2.5454741147346893, - "grad_norm": 0.6574897766113281, - "learning_rate": 0.0004477987285852817, - "loss": 3.5498, + "epoch": 2.5498652291105124, + "grad_norm": 0.6224414110183716, + "learning_rate": 0.00044755531570426333, + "loss": 3.5653, "step": 23650 }, { - "epoch": 2.550855666774298, - "grad_norm": 0.5814394950866699, - "learning_rate": 0.00044747548755522034, - "loss": 3.5552, + "epoch": 2.555256064690027, + "grad_norm": 0.585587203502655, + "learning_rate": 0.0004472315164597949, + "loss": 3.5702, "step": 23700 }, { - "epoch": 2.556237218813906, - "grad_norm": 0.5807026028633118, - "learning_rate": 0.00044715224652515893, - "loss": 3.5408, + "epoch": 2.560646900269542, + "grad_norm": 0.5666549205780029, + "learning_rate": 0.0004469077172153265, + "loss": 3.5777, "step": 23750 }, { - "epoch": 2.561618770853514, - "grad_norm": 0.6143361926078796, - "learning_rate": 0.0004468290054950975, - "loss": 3.5664, + "epoch": 2.5660377358490565, + "grad_norm": 0.7604691386222839, + "learning_rate": 0.00044658391797085804, + "loss": 3.5518, "step": 23800 }, { - "epoch": 2.567000322893122, - "grad_norm": 0.5572286248207092, - "learning_rate": 0.00044650576446503607, - "loss": 3.5545, + "epoch": 2.571428571428571, + "grad_norm": 0.608855128288269, + "learning_rate": 0.00044626011872638964, + "loss": 3.5655, "step": 23850 }, { - "epoch": 2.5723818749327307, - "grad_norm": 0.5634745955467224, - "learning_rate": 0.0004461825234349746, - "loss": 3.5302, + "epoch": 2.5768194070080863, + "grad_norm": 0.6249096989631653, + "learning_rate": 0.0004459363194819212, + "loss": 3.5501, "step": 23900 }, { - "epoch": 2.577763426972339, - "grad_norm": 0.5858882665634155, - "learning_rate": 0.00044586574722551447, - "loss": 3.5528, + "epoch": 2.582210242587601, + "grad_norm": 0.599327564239502, + "learning_rate": 0.0004456125202374527, + "loss": 3.5576, "step": 23950 }, { - "epoch": 2.583144979011947, - "grad_norm": 0.6945074200630188, - "learning_rate": 0.000445542506195453, - "loss": 3.5619, + "epoch": 2.5876010781671157, + "grad_norm": 0.6255910396575928, + "learning_rate": 0.0004452887209929843, + "loss": 3.5389, "step": 24000 }, { - "epoch": 2.583144979011947, - "eval_accuracy": 0.36655561979355733, - "eval_loss": 3.5325984954833984, - "eval_runtime": 195.3908, - "eval_samples_per_second": 92.179, - "eval_steps_per_second": 5.763, + "epoch": 2.5876010781671157, + "eval_accuracy": 0.36652791330716467, + "eval_loss": 3.534491777420044, + "eval_runtime": 152.998, + "eval_samples_per_second": 117.72, + "eval_steps_per_second": 7.36, "step": 24000 }, { - "epoch": 2.5885265310515555, - "grad_norm": 0.5496438145637512, - "learning_rate": 0.0004452192651653916, - "loss": 3.5522, + "epoch": 2.592991913746631, + "grad_norm": 0.5508790612220764, + "learning_rate": 0.00044496492174851584, + "loss": 3.5528, "step": 24050 }, { - "epoch": 2.5939080830911636, - "grad_norm": 0.5970087051391602, - "learning_rate": 0.00044489602413533025, - "loss": 3.5635, + "epoch": 2.5983827493261455, + "grad_norm": 0.7283141016960144, + "learning_rate": 0.00044464112250404745, + "loss": 3.5466, "step": 24100 }, { - "epoch": 2.5992896351307717, - "grad_norm": 0.6501139998435974, - "learning_rate": 0.0004445727831052688, - "loss": 3.533, + "epoch": 2.6037735849056602, + "grad_norm": 0.5894685983657837, + "learning_rate": 0.000444317323259579, + "loss": 3.5734, "step": 24150 }, { - "epoch": 2.60467118717038, - "grad_norm": 0.6015759110450745, - "learning_rate": 0.0004442495420752074, - "loss": 3.5451, + "epoch": 2.6091644204851754, + "grad_norm": 0.6347320079803467, + "learning_rate": 0.0004439935240151106, + "loss": 3.5372, "step": 24200 }, { - "epoch": 2.610052739209988, - "grad_norm": 0.5871186852455139, - "learning_rate": 0.00044392630104514593, - "loss": 3.5508, + "epoch": 2.61455525606469, + "grad_norm": 0.5959154367446899, + "learning_rate": 0.00044366972477064215, + "loss": 3.5464, "step": 24250 }, { - "epoch": 2.6154342912495965, - "grad_norm": 0.5801131725311279, - "learning_rate": 0.0004436030600150845, - "loss": 3.5321, + "epoch": 2.6199460916442048, + "grad_norm": 0.6151760220527649, + "learning_rate": 0.00044334592552617376, + "loss": 3.5566, "step": 24300 }, { - "epoch": 2.6208158432892046, - "grad_norm": 0.7078419327735901, - "learning_rate": 0.0004432798189850231, - "loss": 3.5526, + "epoch": 2.62533692722372, + "grad_norm": 0.5945620536804199, + "learning_rate": 0.0004430221262817053, + "loss": 3.5717, "step": 24350 }, { - "epoch": 2.6261973953288127, - "grad_norm": 0.6336981654167175, - "learning_rate": 0.0004429565779549617, - "loss": 3.553, + "epoch": 2.6307277628032346, + "grad_norm": 0.6333128809928894, + "learning_rate": 0.00044269832703723686, + "loss": 3.5343, "step": 24400 }, { - "epoch": 2.6315789473684212, - "grad_norm": 0.621759831905365, - "learning_rate": 0.0004426333369249003, - "loss": 3.5336, + "epoch": 2.6361185983827493, + "grad_norm": 0.5592067837715149, + "learning_rate": 0.00044237452779276846, + "loss": 3.5512, "step": 24450 }, { - "epoch": 2.6369604994080293, - "grad_norm": 0.5916746854782104, - "learning_rate": 0.0004423100958948389, - "loss": 3.5525, + "epoch": 2.641509433962264, + "grad_norm": 0.5705219507217407, + "learning_rate": 0.0004420507285483, + "loss": 3.5672, "step": 24500 }, { - "epoch": 2.6423420514476375, - "grad_norm": 0.6469952464103699, - "learning_rate": 0.00044198685486477744, - "loss": 3.5264, + "epoch": 2.6469002695417787, + "grad_norm": 0.597025454044342, + "learning_rate": 0.0004417269293038316, + "loss": 3.5344, "step": 24550 }, { - "epoch": 2.6477236034872456, - "grad_norm": 0.6205400824546814, - "learning_rate": 0.00044166361383471604, - "loss": 3.5513, + "epoch": 2.652291105121294, + "grad_norm": 0.5921151638031006, + "learning_rate": 0.0004414031300593631, + "loss": 3.54, "step": 24600 }, { - "epoch": 2.653105155526854, - "grad_norm": 0.6007969975471497, - "learning_rate": 0.0004413403728046547, - "loss": 3.5303, + "epoch": 2.6576819407008085, + "grad_norm": 0.5553827881813049, + "learning_rate": 0.0004410793308148948, + "loss": 3.5434, "step": 24650 }, { - "epoch": 2.658486707566462, - "grad_norm": 0.6586825847625732, - "learning_rate": 0.0004410171317745932, - "loss": 3.5627, + "epoch": 2.6630727762803232, + "grad_norm": 0.6193781495094299, + "learning_rate": 0.00044075553157042627, + "loss": 3.5474, "step": 24700 }, { - "epoch": 2.6638682596060703, - "grad_norm": 0.5926357507705688, - "learning_rate": 0.0004406938907445318, - "loss": 3.5663, + "epoch": 2.6684636118598384, + "grad_norm": 0.6883496642112732, + "learning_rate": 0.0004404317323259579, + "loss": 3.5466, "step": 24750 }, { - "epoch": 2.6692498116456784, - "grad_norm": 0.5873600840568542, - "learning_rate": 0.00044037064971447036, - "loss": 3.5241, + "epoch": 2.673854447439353, + "grad_norm": 0.5652745962142944, + "learning_rate": 0.0004401079330814894, + "loss": 3.5251, "step": 24800 }, { - "epoch": 2.674631363685287, - "grad_norm": 0.592735767364502, - "learning_rate": 0.00044004740868440896, - "loss": 3.5567, + "epoch": 2.6792452830188678, + "grad_norm": 0.545843780040741, + "learning_rate": 0.000439784133837021, + "loss": 3.5213, "step": 24850 }, { - "epoch": 2.680012915724895, - "grad_norm": 0.626936674118042, - "learning_rate": 0.00043972416765434755, - "loss": 3.5204, + "epoch": 2.684636118598383, + "grad_norm": 0.6006308197975159, + "learning_rate": 0.0004394603345925526, + "loss": 3.5627, "step": 24900 }, { - "epoch": 2.685394467764503, - "grad_norm": 0.5691058039665222, - "learning_rate": 0.00043940092662428615, - "loss": 3.5214, + "epoch": 2.6900269541778976, + "grad_norm": 0.6098518967628479, + "learning_rate": 0.00043913653534808413, + "loss": 3.5446, "step": 24950 }, { - "epoch": 2.6907760198041117, - "grad_norm": 0.6450164914131165, - "learning_rate": 0.00043907768559422474, - "loss": 3.5487, + "epoch": 2.6954177897574123, + "grad_norm": 0.5801330208778381, + "learning_rate": 0.00043881273610361574, + "loss": 3.5351, "step": 25000 }, { - "epoch": 2.6907760198041117, - "eval_accuracy": 0.36812326365894465, - "eval_loss": 3.521350145339966, - "eval_runtime": 197.2244, - "eval_samples_per_second": 91.322, - "eval_steps_per_second": 5.709, + "epoch": 2.6954177897574123, + "eval_accuracy": 0.36689037934091373, + "eval_loss": 3.5260772705078125, + "eval_runtime": 152.8924, + "eval_samples_per_second": 117.802, + "eval_steps_per_second": 7.365, "step": 25000 }, { - "epoch": 2.69615757184372, - "grad_norm": 0.6423863768577576, - "learning_rate": 0.00043875444456416334, - "loss": 3.5373, + "epoch": 2.7008086253369274, + "grad_norm": 0.595666229724884, + "learning_rate": 0.0004384889368591473, + "loss": 3.546, "step": 25050 }, { - "epoch": 2.701539123883328, - "grad_norm": 0.6400243043899536, - "learning_rate": 0.0004384312035341019, - "loss": 3.5428, + "epoch": 2.706199460916442, + "grad_norm": 0.5644325613975525, + "learning_rate": 0.0004381651376146789, + "loss": 3.5296, "step": 25100 }, { - "epoch": 2.706920675922936, - "grad_norm": 0.6252772212028503, - "learning_rate": 0.00043810796250404047, - "loss": 3.5469, + "epoch": 2.711590296495957, + "grad_norm": 0.587679386138916, + "learning_rate": 0.00043784133837021044, + "loss": 3.5217, "step": 25150 }, { - "epoch": 2.712302227962544, - "grad_norm": 0.5882490873336792, - "learning_rate": 0.000437784721473979, - "loss": 3.5376, + "epoch": 2.7169811320754715, + "grad_norm": 0.6090216040611267, + "learning_rate": 0.00043751753912574205, + "loss": 3.5267, "step": 25200 }, { - "epoch": 2.7176837800021527, - "grad_norm": 0.604751467704773, - "learning_rate": 0.00043746148044391766, - "loss": 3.5409, + "epoch": 2.7223719676549867, + "grad_norm": 0.6133639812469482, + "learning_rate": 0.0004371937398812736, + "loss": 3.5605, "step": 25250 }, { - "epoch": 2.723065332041761, - "grad_norm": 0.6292399168014526, - "learning_rate": 0.00043713823941385625, - "loss": 3.5313, + "epoch": 2.7277628032345014, + "grad_norm": 0.6062129735946655, + "learning_rate": 0.0004368699406368051, + "loss": 3.5186, "step": 25300 }, { - "epoch": 2.728446884081369, - "grad_norm": 0.6144165396690369, - "learning_rate": 0.0004368149983837948, - "loss": 3.5484, + "epoch": 2.733153638814016, + "grad_norm": 0.5763473510742188, + "learning_rate": 0.0004365461413923367, + "loss": 3.5433, "step": 25350 }, { - "epoch": 2.7338284361209775, - "grad_norm": 0.6086465716362, - "learning_rate": 0.0004364917573537334, - "loss": 3.5381, + "epoch": 2.7385444743935308, + "grad_norm": 0.6757517457008362, + "learning_rate": 0.00043622234214786825, + "loss": 3.5452, "step": 25400 }, { - "epoch": 2.7392099881605856, - "grad_norm": 0.6256065964698792, - "learning_rate": 0.00043616851632367193, - "loss": 3.557, + "epoch": 2.743935309973046, + "grad_norm": 0.5790356993675232, + "learning_rate": 0.00043589854290339985, + "loss": 3.5194, "step": 25450 }, { - "epoch": 2.7445915402001937, - "grad_norm": 0.6041662096977234, - "learning_rate": 0.0004358452752936106, - "loss": 3.5347, + "epoch": 2.7493261455525606, + "grad_norm": 0.6045161485671997, + "learning_rate": 0.0004355747436589314, + "loss": 3.5777, "step": 25500 }, { - "epoch": 2.749973092239802, - "grad_norm": 0.6548029780387878, - "learning_rate": 0.0004355220342635492, - "loss": 3.5374, + "epoch": 2.7547169811320753, + "grad_norm": 0.5605710744857788, + "learning_rate": 0.000435250944414463, + "loss": 3.5656, "step": 25550 }, { - "epoch": 2.7553546442794103, - "grad_norm": 0.6124310493469238, - "learning_rate": 0.00043519879323348777, - "loss": 3.529, + "epoch": 2.7601078167115904, + "grad_norm": 0.6743922233581543, + "learning_rate": 0.00043492714516999456, + "loss": 3.5443, "step": 25600 }, { - "epoch": 2.7607361963190185, - "grad_norm": 0.596762478351593, - "learning_rate": 0.0004348755522034263, - "loss": 3.5273, + "epoch": 2.765498652291105, + "grad_norm": 0.6153864860534668, + "learning_rate": 0.0004346033459255261, + "loss": 3.5308, "step": 25650 }, { - "epoch": 2.7661177483586266, - "grad_norm": 0.5779131650924683, - "learning_rate": 0.0004345523111733649, - "loss": 3.5448, + "epoch": 2.77088948787062, + "grad_norm": 0.5993860960006714, + "learning_rate": 0.0004342795466810577, + "loss": 3.5506, "step": 25700 }, { - "epoch": 2.7714993003982347, - "grad_norm": 0.6550736427307129, - "learning_rate": 0.00043422907014330344, - "loss": 3.5405, + "epoch": 2.776280323450135, + "grad_norm": 0.622205376625061, + "learning_rate": 0.00043395574743658927, + "loss": 3.5411, "step": 25750 }, { - "epoch": 2.776880852437843, - "grad_norm": 0.7348501682281494, - "learning_rate": 0.0004339058291132421, - "loss": 3.5329, + "epoch": 2.7816711590296497, + "grad_norm": 0.600141167640686, + "learning_rate": 0.00043363194819212087, + "loss": 3.5421, "step": 25800 }, { - "epoch": 2.7822624044774513, - "grad_norm": 0.6231963038444519, - "learning_rate": 0.0004335825880831807, - "loss": 3.5465, + "epoch": 2.7870619946091644, + "grad_norm": 0.605973482131958, + "learning_rate": 0.0004333081489476524, + "loss": 3.5291, "step": 25850 }, { - "epoch": 2.7876439565170594, - "grad_norm": 0.5558022856712341, - "learning_rate": 0.00043325934705311923, - "loss": 3.5381, + "epoch": 2.7924528301886795, + "grad_norm": 0.6000856757164001, + "learning_rate": 0.000432984349703184, + "loss": 3.5385, "step": 25900 }, { - "epoch": 2.793025508556668, - "grad_norm": 0.6114339232444763, - "learning_rate": 0.0004329361060230578, - "loss": 3.5492, + "epoch": 2.797843665768194, + "grad_norm": 0.5753637552261353, + "learning_rate": 0.0004326605504587155, + "loss": 3.5278, "step": 25950 }, { - "epoch": 2.798407060596276, - "grad_norm": 0.621716320514679, - "learning_rate": 0.00043261286499299636, - "loss": 3.5277, + "epoch": 2.803234501347709, + "grad_norm": 0.6122384071350098, + "learning_rate": 0.0004323367512142472, + "loss": 3.5397, "step": 26000 }, { - "epoch": 2.798407060596276, - "eval_accuracy": 0.36862752171129165, - "eval_loss": 3.51253342628479, - "eval_runtime": 212.7222, - "eval_samples_per_second": 84.669, - "eval_steps_per_second": 5.293, + "epoch": 2.803234501347709, + "eval_accuracy": 0.3679035675197838, + "eval_loss": 3.5164499282836914, + "eval_runtime": 152.7141, + "eval_samples_per_second": 117.939, + "eval_steps_per_second": 7.373, "step": 26000 }, { - "epoch": 2.803788612635884, - "grad_norm": 0.6198678016662598, - "learning_rate": 0.00043228962396293496, - "loss": 3.5241, + "epoch": 2.8086253369272236, + "grad_norm": 0.6394927501678467, + "learning_rate": 0.0004320129519697787, + "loss": 3.5372, "step": 26050 }, { - "epoch": 2.8091701646754923, - "grad_norm": 0.5953556299209595, - "learning_rate": 0.0004319663829328736, - "loss": 3.5257, + "epoch": 2.8140161725067383, + "grad_norm": 0.6251195073127747, + "learning_rate": 0.00043168915272531023, + "loss": 3.5121, "step": 26100 }, { - "epoch": 2.8145517167151004, - "grad_norm": 0.6188264489173889, - "learning_rate": 0.00043164314190281215, - "loss": 3.5268, + "epoch": 2.8194070080862534, + "grad_norm": 0.6234621405601501, + "learning_rate": 0.00043136535348084183, + "loss": 3.5139, "step": 26150 }, { - "epoch": 2.819933268754709, - "grad_norm": 0.5980226993560791, - "learning_rate": 0.00043131990087275074, - "loss": 3.5478, + "epoch": 2.824797843665768, + "grad_norm": 0.6286827325820923, + "learning_rate": 0.0004310415542363734, + "loss": 3.5389, "step": 26200 }, { - "epoch": 2.825314820794317, - "grad_norm": 0.6212344169616699, - "learning_rate": 0.00043099665984268934, - "loss": 3.5297, + "epoch": 2.830188679245283, + "grad_norm": 0.615374743938446, + "learning_rate": 0.000430717754991905, + "loss": 3.5499, "step": 26250 }, { - "epoch": 2.830696372833925, - "grad_norm": 0.5705240964889526, - "learning_rate": 0.0004306734188126279, - "loss": 3.5324, + "epoch": 2.835579514824798, + "grad_norm": 0.5852181911468506, + "learning_rate": 0.00043039395574743654, + "loss": 3.5664, "step": 26300 }, { - "epoch": 2.8360779248735337, - "grad_norm": 0.6547979712486267, - "learning_rate": 0.00043035017778256647, - "loss": 3.533, + "epoch": 2.8409703504043127, + "grad_norm": 0.7029324173927307, + "learning_rate": 0.00043007015650296814, + "loss": 3.5374, "step": 26350 }, { - "epoch": 2.841459476913142, - "grad_norm": 0.5784933567047119, - "learning_rate": 0.0004300269367525051, - "loss": 3.5297, + "epoch": 2.8463611859838274, + "grad_norm": 0.612174391746521, + "learning_rate": 0.0004297463572584997, + "loss": 3.5438, "step": 26400 }, { - "epoch": 2.84684102895275, - "grad_norm": 0.7146517038345337, - "learning_rate": 0.00042970369572244366, - "loss": 3.5382, + "epoch": 2.8517520215633425, + "grad_norm": 0.6446730494499207, + "learning_rate": 0.0004294225580140313, + "loss": 3.5538, "step": 26450 }, { - "epoch": 2.852222580992358, - "grad_norm": 0.6623178124427795, - "learning_rate": 0.00042938045469238226, - "loss": 3.5374, + "epoch": 2.857142857142857, + "grad_norm": 0.6072059273719788, + "learning_rate": 0.00042909875876956285, + "loss": 3.5238, "step": 26500 }, { - "epoch": 2.857604133031966, - "grad_norm": 0.5954160094261169, - "learning_rate": 0.0004290572136623208, - "loss": 3.5509, + "epoch": 2.862533692722372, + "grad_norm": 0.6181698441505432, + "learning_rate": 0.0004287749595250944, + "loss": 3.5385, "step": 26550 }, { - "epoch": 2.8629856850715747, - "grad_norm": 0.6484636068344116, - "learning_rate": 0.0004287339726322594, - "loss": 3.5399, + "epoch": 2.867924528301887, + "grad_norm": 0.6247893571853638, + "learning_rate": 0.000428451160280626, + "loss": 3.5481, "step": 26600 }, { - "epoch": 2.868367237111183, - "grad_norm": 0.6219885349273682, - "learning_rate": 0.00042841073160219804, - "loss": 3.5334, + "epoch": 2.8733153638814017, + "grad_norm": 0.612091064453125, + "learning_rate": 0.0004281273610361575, + "loss": 3.5525, "step": 26650 }, { - "epoch": 2.873748789150791, - "grad_norm": 0.6355128288269043, - "learning_rate": 0.0004280874905721366, - "loss": 3.5135, + "epoch": 2.8787061994609164, + "grad_norm": 0.6002130508422852, + "learning_rate": 0.0004278035617916891, + "loss": 3.5394, "step": 26700 }, { - "epoch": 2.8791303411903995, - "grad_norm": 0.5998795032501221, - "learning_rate": 0.0004277642495420752, - "loss": 3.5442, + "epoch": 2.884097035040431, + "grad_norm": 0.6247308850288391, + "learning_rate": 0.00042747976254722066, + "loss": 3.5506, "step": 26750 }, { - "epoch": 2.8845118932300076, - "grad_norm": 0.5889739394187927, - "learning_rate": 0.00042744100851201377, - "loss": 3.5133, + "epoch": 2.889487870619946, + "grad_norm": 0.5961256623268127, + "learning_rate": 0.00042715596330275226, + "loss": 3.5267, "step": 26800 }, { - "epoch": 2.8898934452696157, - "grad_norm": 0.6281293630599976, - "learning_rate": 0.0004271177674819523, - "loss": 3.5056, + "epoch": 2.894878706199461, + "grad_norm": 0.620616614818573, + "learning_rate": 0.0004268321640582838, + "loss": 3.5502, "step": 26850 }, { - "epoch": 2.895274997309224, - "grad_norm": 0.5909113883972168, - "learning_rate": 0.0004267945264518909, - "loss": 3.5157, + "epoch": 2.9002695417789757, + "grad_norm": 0.6439360976219177, + "learning_rate": 0.0004265083648138154, + "loss": 3.5179, "step": 26900 }, { - "epoch": 2.9006565493488323, - "grad_norm": 0.6297270059585571, - "learning_rate": 0.00042647128542182955, - "loss": 3.5354, + "epoch": 2.9056603773584904, + "grad_norm": 0.6051164865493774, + "learning_rate": 0.00042618456556934697, + "loss": 3.5367, "step": 26950 }, { - "epoch": 2.9060381013884404, - "grad_norm": 0.6224883794784546, - "learning_rate": 0.0004261480443917681, - "loss": 3.5346, + "epoch": 2.9110512129380055, + "grad_norm": 0.6090266704559326, + "learning_rate": 0.0004258607663248785, + "loss": 3.5532, "step": 27000 }, { - "epoch": 2.9060381013884404, - "eval_accuracy": 0.3694928333098462, - "eval_loss": 3.5037596225738525, - "eval_runtime": 206.2343, - "eval_samples_per_second": 87.333, - "eval_steps_per_second": 5.46, + "epoch": 2.9110512129380055, + "eval_accuracy": 0.36900107033959784, + "eval_loss": 3.5088884830474854, + "eval_runtime": 152.765, + "eval_samples_per_second": 117.9, + "eval_steps_per_second": 7.371, "step": 27000 }, { - "epoch": 2.9114196534280485, - "grad_norm": 0.6158651113510132, - "learning_rate": 0.00042583126818230795, - "loss": 3.5352, + "epoch": 2.91644204851752, + "grad_norm": 0.6238779425621033, + "learning_rate": 0.0004255369670804101, + "loss": 3.5368, "step": 27050 }, { - "epoch": 2.9168012054676566, - "grad_norm": 0.6259952187538147, - "learning_rate": 0.0004255080271522465, - "loss": 3.5156, + "epoch": 2.921832884097035, + "grad_norm": 0.6020278334617615, + "learning_rate": 0.0004252131678359417, + "loss": 3.541, "step": 27100 }, { - "epoch": 2.922182757507265, - "grad_norm": 0.5961571335792542, - "learning_rate": 0.0004251847861221851, + "epoch": 2.92722371967655, + "grad_norm": 0.689791738986969, + "learning_rate": 0.0004248893685914733, "loss": 3.5362, "step": 27150 }, { - "epoch": 2.9275643095468733, - "grad_norm": 0.62592613697052, - "learning_rate": 0.00042486154509212363, - "loss": 3.5094, + "epoch": 2.9326145552560647, + "grad_norm": 0.644339382648468, + "learning_rate": 0.00042456556934700483, + "loss": 3.5235, "step": 27200 }, { - "epoch": 2.9329458615864814, - "grad_norm": 0.5974050760269165, - "learning_rate": 0.0004245383040620622, - "loss": 3.5258, + "epoch": 2.9380053908355794, + "grad_norm": 0.6350017786026001, + "learning_rate": 0.00042424177010253643, + "loss": 3.5371, "step": 27250 }, { - "epoch": 2.93832741362609, - "grad_norm": 0.5732911229133606, - "learning_rate": 0.0004242150630320009, - "loss": 3.5374, + "epoch": 2.9433962264150946, + "grad_norm": 0.5984843969345093, + "learning_rate": 0.00042391797085806793, + "loss": 3.5414, "step": 27300 }, { - "epoch": 2.943708965665698, - "grad_norm": 0.6453855633735657, - "learning_rate": 0.0004238918220019394, - "loss": 3.527, + "epoch": 2.9487870619946093, + "grad_norm": 0.5899128913879395, + "learning_rate": 0.0004235941716135995, + "loss": 3.5269, "step": 27350 }, { - "epoch": 2.949090517705306, - "grad_norm": 0.5842043161392212, - "learning_rate": 0.000423568580971878, - "loss": 3.5214, + "epoch": 2.954177897574124, + "grad_norm": 0.6649240851402283, + "learning_rate": 0.0004232703723691311, + "loss": 3.5181, "step": 27400 }, { - "epoch": 2.9544720697449143, - "grad_norm": 0.6021690964698792, - "learning_rate": 0.00042324533994181655, - "loss": 3.5026, + "epoch": 2.9595687331536387, + "grad_norm": 0.5655061602592468, + "learning_rate": 0.00042294657312466264, + "loss": 3.5257, "step": 27450 }, { - "epoch": 2.9598536217845224, - "grad_norm": 0.6042524576187134, - "learning_rate": 0.00042292209891175514, - "loss": 3.5169, + "epoch": 2.964959568733154, + "grad_norm": 0.6041932702064514, + "learning_rate": 0.00042262277388019424, + "loss": 3.535, "step": 27500 }, { - "epoch": 2.965235173824131, - "grad_norm": 0.6471766829490662, - "learning_rate": 0.00042259885788169374, - "loss": 3.5289, + "epoch": 2.9703504043126685, + "grad_norm": 0.5469972491264343, + "learning_rate": 0.0004222989746357258, + "loss": 3.524, "step": 27550 }, { - "epoch": 2.970616725863739, - "grad_norm": 0.6328997611999512, - "learning_rate": 0.00042227561685163233, - "loss": 3.521, + "epoch": 2.975741239892183, + "grad_norm": 0.5674824714660645, + "learning_rate": 0.0004219751753912574, + "loss": 3.5333, "step": 27600 }, { - "epoch": 2.975998277903347, - "grad_norm": 0.6381612420082092, - "learning_rate": 0.00042195237582157093, - "loss": 3.5258, + "epoch": 2.981132075471698, + "grad_norm": 0.5904396772384644, + "learning_rate": 0.00042165785213167835, + "loss": 3.5118, "step": 27650 }, { - "epoch": 2.9813798299429557, - "grad_norm": 0.6035014986991882, - "learning_rate": 0.0004216291347915095, - "loss": 3.5295, + "epoch": 2.986522911051213, + "grad_norm": 0.6024655699729919, + "learning_rate": 0.0004213340528872099, + "loss": 3.5224, "step": 27700 }, { - "epoch": 2.986761381982564, - "grad_norm": 0.6743372678756714, - "learning_rate": 0.00042130589376144806, - "loss": 3.5261, + "epoch": 2.9919137466307277, + "grad_norm": 0.6454286575317383, + "learning_rate": 0.00042101025364274145, + "loss": 3.5298, "step": 27750 }, { - "epoch": 2.992142934022172, - "grad_norm": 0.6017929315567017, - "learning_rate": 0.00042098265273138666, - "loss": 3.5172, + "epoch": 2.9973045822102424, + "grad_norm": 0.6410738229751587, + "learning_rate": 0.00042068645439827305, + "loss": 3.5312, "step": 27800 }, { - "epoch": 2.9975244860617805, - "grad_norm": 0.6554933786392212, - "learning_rate": 0.0004206594117013252, - "loss": 3.5245, + "epoch": 3.0026954177897576, + "grad_norm": 0.6354919672012329, + "learning_rate": 0.0004203626551538046, + "loss": 3.4957, "step": 27850 }, { - "epoch": 3.0029060381013886, - "grad_norm": 0.6437206864356995, - "learning_rate": 0.00042033617067126385, - "loss": 3.4742, + "epoch": 3.0080862533692723, + "grad_norm": 0.6528053283691406, + "learning_rate": 0.0004200388559093362, + "loss": 3.4367, "step": 27900 }, { - "epoch": 3.0082875901409967, - "grad_norm": 0.6490093469619751, - "learning_rate": 0.00042001292964120244, - "loss": 3.4372, + "epoch": 3.013477088948787, + "grad_norm": 0.5901045799255371, + "learning_rate": 0.0004197150566648677, + "loss": 3.4398, "step": 27950 }, { - "epoch": 3.0136691421806048, - "grad_norm": 0.6507807374000549, - "learning_rate": 0.000419689688611141, - "loss": 3.4461, + "epoch": 3.018867924528302, + "grad_norm": 0.610563337802887, + "learning_rate": 0.00041939125742039936, + "loss": 3.4533, "step": 28000 }, { - "epoch": 3.0136691421806048, - "eval_accuracy": 0.3706709565724186, - "eval_loss": 3.497730016708374, - "eval_runtime": 215.1239, - "eval_samples_per_second": 83.724, - "eval_steps_per_second": 5.234, + "epoch": 3.018867924528302, + "eval_accuracy": 0.36950989181323307, + "eval_loss": 3.5048389434814453, + "eval_runtime": 151.8303, + "eval_samples_per_second": 118.626, + "eval_steps_per_second": 7.416, "step": 28000 }, { - "epoch": 3.0190506942202133, - "grad_norm": 0.6191187500953674, - "learning_rate": 0.0004193664475810796, - "loss": 3.4181, + "epoch": 3.024258760107817, + "grad_norm": 0.648098886013031, + "learning_rate": 0.00041906745817593086, + "loss": 3.4386, "step": 28050 }, { - "epoch": 3.0244322462598214, - "grad_norm": 0.62087082862854, - "learning_rate": 0.00041904320655101817, - "loss": 3.4181, + "epoch": 3.0296495956873315, + "grad_norm": 0.662765383720398, + "learning_rate": 0.00041874365893146247, + "loss": 3.4337, "step": 28100 }, { - "epoch": 3.0298137982994295, - "grad_norm": 0.6341911554336548, - "learning_rate": 0.0004187199655209567, - "loss": 3.4286, + "epoch": 3.035040431266846, + "grad_norm": 0.5984097719192505, + "learning_rate": 0.000418419859686994, + "loss": 3.4747, "step": 28150 }, { - "epoch": 3.0351953503390376, - "grad_norm": 0.6108666062355042, - "learning_rate": 0.00041839672449089536, - "loss": 3.4278, + "epoch": 3.0404312668463613, + "grad_norm": 0.593792200088501, + "learning_rate": 0.00041809606044252557, + "loss": 3.4557, "step": 28200 }, { - "epoch": 3.040576902378646, - "grad_norm": 0.5951593518257141, - "learning_rate": 0.00041807348346083395, - "loss": 3.441, + "epoch": 3.045822102425876, + "grad_norm": 0.619614839553833, + "learning_rate": 0.00041777226119805717, + "loss": 3.4661, "step": 28250 }, { - "epoch": 3.0459584544182543, - "grad_norm": 0.6405830383300781, - "learning_rate": 0.0004177502424307725, - "loss": 3.4542, + "epoch": 3.0512129380053907, + "grad_norm": 0.650425910949707, + "learning_rate": 0.0004174484619535887, + "loss": 3.4621, "step": 28300 }, { - "epoch": 3.0513400064578624, - "grad_norm": 0.6463958024978638, - "learning_rate": 0.0004174270014007111, - "loss": 3.4252, + "epoch": 3.056603773584906, + "grad_norm": 0.6080370545387268, + "learning_rate": 0.00041712466270912033, + "loss": 3.4525, "step": 28350 }, { - "epoch": 3.0567215584974705, - "grad_norm": 0.5980671048164368, - "learning_rate": 0.00041710376037064963, - "loss": 3.4458, + "epoch": 3.0619946091644206, + "grad_norm": 0.604832649230957, + "learning_rate": 0.0004168008634646519, + "loss": 3.4284, "step": 28400 }, { - "epoch": 3.062103110537079, - "grad_norm": 0.6921752691268921, - "learning_rate": 0.0004167805193405883, - "loss": 3.4406, + "epoch": 3.0673854447439353, + "grad_norm": 0.6373007893562317, + "learning_rate": 0.0004164770642201835, + "loss": 3.4731, "step": 28450 }, { - "epoch": 3.067484662576687, - "grad_norm": 0.6020042896270752, - "learning_rate": 0.0004164572783105269, - "loss": 3.4446, + "epoch": 3.07277628032345, + "grad_norm": 0.6558822989463806, + "learning_rate": 0.00041615326497571503, + "loss": 3.4663, "step": 28500 }, { - "epoch": 3.0728662146162953, - "grad_norm": 0.6640591025352478, - "learning_rate": 0.0004161340372804654, - "loss": 3.4635, + "epoch": 3.078167115902965, + "grad_norm": 0.6558163166046143, + "learning_rate": 0.0004158294657312466, + "loss": 3.4533, "step": 28550 }, { - "epoch": 3.0782477666559034, - "grad_norm": 0.6354085206985474, - "learning_rate": 0.000415810796250404, - "loss": 3.4555, + "epoch": 3.08355795148248, + "grad_norm": 0.6458030343055725, + "learning_rate": 0.0004155056664867782, + "loss": 3.4506, "step": 28600 }, { - "epoch": 3.083629318695512, - "grad_norm": 0.636901319026947, - "learning_rate": 0.0004154875552203426, - "loss": 3.4515, + "epoch": 3.0889487870619945, + "grad_norm": 0.665330708026886, + "learning_rate": 0.0004151818672423097, + "loss": 3.4485, "step": 28650 }, { - "epoch": 3.08901087073512, - "grad_norm": 0.6186822056770325, - "learning_rate": 0.00041516431419028114, - "loss": 3.4345, + "epoch": 3.0943396226415096, + "grad_norm": 0.614852249622345, + "learning_rate": 0.0004148580679978413, + "loss": 3.4589, "step": 28700 }, { - "epoch": 3.094392422774728, - "grad_norm": 0.6416160464286804, - "learning_rate": 0.0004148410731602198, - "loss": 3.45, + "epoch": 3.0997304582210243, + "grad_norm": 0.6223500370979309, + "learning_rate": 0.00041453426875337284, + "loss": 3.4486, "step": 28750 }, { - "epoch": 3.0997739748143363, - "grad_norm": 0.6019704341888428, - "learning_rate": 0.0004145178321301584, - "loss": 3.4452, + "epoch": 3.105121293800539, + "grad_norm": 0.5892756581306458, + "learning_rate": 0.00041421046950890445, + "loss": 3.4498, "step": 28800 }, { - "epoch": 3.105155526853945, - "grad_norm": 0.6140844225883484, - "learning_rate": 0.00041419459110009693, - "loss": 3.4474, + "epoch": 3.1105121293800537, + "grad_norm": 0.6175945401191711, + "learning_rate": 0.000413886670264436, + "loss": 3.4339, "step": 28850 }, { - "epoch": 3.110537078893553, - "grad_norm": 0.6584010720252991, - "learning_rate": 0.0004138713500700355, - "loss": 3.442, + "epoch": 3.115902964959569, + "grad_norm": 0.5722609758377075, + "learning_rate": 0.0004135628710199676, + "loss": 3.4505, "step": 28900 }, { - "epoch": 3.115918630933161, - "grad_norm": 0.6763238906860352, - "learning_rate": 0.00041354810903997406, - "loss": 3.4475, + "epoch": 3.1212938005390836, + "grad_norm": 0.6651545763015747, + "learning_rate": 0.00041323907177549915, + "loss": 3.4691, "step": 28950 }, { - "epoch": 3.121300182972769, - "grad_norm": 0.6160973906517029, - "learning_rate": 0.00041322486800991266, - "loss": 3.4491, + "epoch": 3.1266846361185983, + "grad_norm": 0.628146231174469, + "learning_rate": 0.0004129152725310307, + "loss": 3.4393, "step": 29000 }, { - "epoch": 3.121300182972769, - "eval_accuracy": 0.3712210661434233, - "eval_loss": 3.494593858718872, - "eval_runtime": 209.8562, - "eval_samples_per_second": 85.825, - "eval_steps_per_second": 5.366, + "epoch": 3.1266846361185983, + "eval_accuracy": 0.3704548459785556, + "eval_loss": 3.5026659965515137, + "eval_runtime": 151.9804, + "eval_samples_per_second": 118.509, + "eval_steps_per_second": 7.409, "step": 29000 }, { - "epoch": 3.1266817350123777, - "grad_norm": 0.6309645771980286, - "learning_rate": 0.0004129016269798513, - "loss": 3.452, + "epoch": 3.1320754716981134, + "grad_norm": 0.653403103351593, + "learning_rate": 0.0004125914732865623, + "loss": 3.4586, "step": 29050 }, { - "epoch": 3.132063287051986, - "grad_norm": 0.616537868976593, - "learning_rate": 0.00041257838594978985, - "loss": 3.4708, + "epoch": 3.137466307277628, + "grad_norm": 0.6372281908988953, + "learning_rate": 0.00041226767404209386, + "loss": 3.4495, "step": 29100 }, { - "epoch": 3.137444839091594, - "grad_norm": 0.6185223460197449, - "learning_rate": 0.00041225514491972844, - "loss": 3.455, + "epoch": 3.142857142857143, + "grad_norm": 0.6290000081062317, + "learning_rate": 0.00041194387479762546, + "loss": 3.4673, "step": 29150 }, { - "epoch": 3.1428263911312024, - "grad_norm": 0.6138353943824768, - "learning_rate": 0.00041193190388966704, - "loss": 3.4406, + "epoch": 3.1482479784366575, + "grad_norm": 0.570479691028595, + "learning_rate": 0.000411620075553157, + "loss": 3.4668, "step": 29200 }, { - "epoch": 3.1482079431708105, - "grad_norm": 0.608535647392273, - "learning_rate": 0.0004116086628596056, - "loss": 3.4461, + "epoch": 3.1536388140161726, + "grad_norm": 0.5886173248291016, + "learning_rate": 0.0004112962763086886, + "loss": 3.4534, "step": 29250 }, { - "epoch": 3.1535894952104186, - "grad_norm": 0.6112627387046814, - "learning_rate": 0.0004112854218295442, - "loss": 3.4395, + "epoch": 3.1590296495956873, + "grad_norm": 0.5947257280349731, + "learning_rate": 0.0004109724770642201, + "loss": 3.4598, "step": 29300 }, { - "epoch": 3.1589710472500268, - "grad_norm": 0.5768312811851501, - "learning_rate": 0.0004109621807994828, - "loss": 3.4302, + "epoch": 3.164420485175202, + "grad_norm": 0.6352401971817017, + "learning_rate": 0.00041064867781975177, + "loss": 3.4734, "step": 29350 }, { - "epoch": 3.1643525992896353, - "grad_norm": 0.6325687170028687, - "learning_rate": 0.00041063893976942136, - "loss": 3.4465, + "epoch": 3.169811320754717, + "grad_norm": 0.5621167421340942, + "learning_rate": 0.00041032487857528327, + "loss": 3.4683, "step": 29400 }, { - "epoch": 3.1697341513292434, - "grad_norm": 0.6342477202415466, - "learning_rate": 0.00041031569873935996, - "loss": 3.442, + "epoch": 3.175202156334232, + "grad_norm": 0.6027476787567139, + "learning_rate": 0.0004100010793308148, + "loss": 3.4779, "step": 29450 }, { - "epoch": 3.1751157033688515, - "grad_norm": 0.6336598992347717, - "learning_rate": 0.0004099924577092985, - "loss": 3.4462, + "epoch": 3.1805929919137466, + "grad_norm": 0.5749760270118713, + "learning_rate": 0.0004096772800863464, + "loss": 3.473, "step": 29500 }, { - "epoch": 3.1804972554084596, - "grad_norm": 0.607629656791687, - "learning_rate": 0.0004096692166792371, - "loss": 3.4299, + "epoch": 3.1859838274932613, + "grad_norm": 0.6280606389045715, + "learning_rate": 0.000409353480841878, + "loss": 3.4493, "step": 29550 }, { - "epoch": 3.185878807448068, - "grad_norm": 0.6477324366569519, - "learning_rate": 0.00040934597564917574, - "loss": 3.448, + "epoch": 3.1913746630727764, + "grad_norm": 0.6310157179832458, + "learning_rate": 0.0004090296815974096, + "loss": 3.4706, "step": 29600 }, { - "epoch": 3.1912603594876763, - "grad_norm": 0.6741542220115662, - "learning_rate": 0.0004090227346191143, - "loss": 3.4596, + "epoch": 3.196765498652291, + "grad_norm": 0.6024655699729919, + "learning_rate": 0.00040870588235294113, + "loss": 3.4574, "step": 29650 }, { - "epoch": 3.1966419115272844, - "grad_norm": 0.6519025564193726, - "learning_rate": 0.0004086994935890529, - "loss": 3.4715, + "epoch": 3.202156334231806, + "grad_norm": 0.6026850938796997, + "learning_rate": 0.00040838208310847273, + "loss": 3.4629, "step": 29700 }, { - "epoch": 3.2020234635668925, - "grad_norm": 0.6172560453414917, - "learning_rate": 0.00040837625255899147, - "loss": 3.4494, + "epoch": 3.207547169811321, + "grad_norm": 0.5791655778884888, + "learning_rate": 0.0004080582838640043, + "loss": 3.4699, "step": 29750 }, { - "epoch": 3.207405015606501, - "grad_norm": 0.6829916834831238, - "learning_rate": 0.00040805301152893, - "loss": 3.4398, + "epoch": 3.2129380053908356, + "grad_norm": 0.5974932312965393, + "learning_rate": 0.00040774096060442524, + "loss": 3.4527, "step": 29800 }, { - "epoch": 3.212786567646109, - "grad_norm": 0.6081319451332092, - "learning_rate": 0.0004077297704988686, - "loss": 3.446, + "epoch": 3.2183288409703503, + "grad_norm": 0.6695175766944885, + "learning_rate": 0.0004074171613599568, + "loss": 3.4755, "step": 29850 }, { - "epoch": 3.2181681196857173, - "grad_norm": 0.6863629817962646, - "learning_rate": 0.00040740652946880725, - "loss": 3.4471, + "epoch": 3.223719676549865, + "grad_norm": 0.5739556550979614, + "learning_rate": 0.0004070933621154884, + "loss": 3.4812, "step": 29900 }, { - "epoch": 3.2235496717253254, - "grad_norm": 0.6253100633621216, - "learning_rate": 0.0004070832884387458, - "loss": 3.4593, + "epoch": 3.22911051212938, + "grad_norm": 0.6452335119247437, + "learning_rate": 0.00040676956287101994, + "loss": 3.474, "step": 29950 }, { - "epoch": 3.228931223764934, - "grad_norm": 0.61070317029953, - "learning_rate": 0.0004067600474086844, - "loss": 3.4684, + "epoch": 3.234501347708895, + "grad_norm": 0.6547558903694153, + "learning_rate": 0.00040644576362655155, + "loss": 3.4587, "step": 30000 }, { - "epoch": 3.228931223764934, - "eval_accuracy": 0.3713381939564873, - "eval_loss": 3.4922268390655518, - "eval_runtime": 208.2583, - "eval_samples_per_second": 86.484, - "eval_steps_per_second": 5.407, + "epoch": 3.234501347708895, + "eval_accuracy": 0.37111730263556053, + "eval_loss": 3.4946022033691406, + "eval_runtime": 151.9, + "eval_samples_per_second": 118.571, + "eval_steps_per_second": 7.413, "step": 30000 } ], "logging_steps": 50, - "max_steps": 92910, + "max_steps": 92750, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, @@ -4496,7 +4496,7 @@ "attributes": {} } }, - "total_flos": 2.50830944206848e+17, + "total_flos": 2.508207538176e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null