diff --git "a/checkpoint-70000/trainer_state.json" "b/checkpoint-70000/trainer_state.json" --- "a/checkpoint-70000/trainer_state.json" +++ "b/checkpoint-70000/trainer_state.json" @@ -1,7 +1,7 @@ { - "best_metric": 3.3461689949035645, - "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_495/checkpoint-70000", - "epoch": 7.534172855451512, + "best_metric": 3.347477912902832, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_high_1000_495/checkpoint-70000", + "epoch": 7.547169811320755, "eval_steps": 1000, "global_step": 70000, "is_hyper_param_search": false, @@ -9,10438 +9,10438 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.005381552039608223, - "grad_norm": 12.074577331542969, + "epoch": 0.005390835579514825, + "grad_norm": 2.323624849319458, "learning_rate": 0.0003, - "loss": 8.6374, + "loss": 8.5206, "step": 50 }, { - "epoch": 0.010763104079216447, - "grad_norm": 1.3158553838729858, + "epoch": 0.01078167115902965, + "grad_norm": 2.057921886444092, "learning_rate": 0.0006, - "loss": 6.8912, + "loss": 6.8833, "step": 100 }, { - "epoch": 0.01614465611882467, - "grad_norm": 2.628305196762085, - "learning_rate": 0.0005996767589699385, - "loss": 6.4839, + "epoch": 0.016172506738544475, + "grad_norm": 1.1483490467071533, + "learning_rate": 0.0005996762007555315, + "loss": 6.4306, "step": 150 }, { - "epoch": 0.021526208158432893, - "grad_norm": 1.0599905252456665, - "learning_rate": 0.0005993535179398771, - "loss": 6.2264, + "epoch": 0.0215633423180593, + "grad_norm": 3.967722177505493, + "learning_rate": 0.000599352401511063, + "loss": 6.2321, "step": 200 }, { - "epoch": 0.026907760198041114, - "grad_norm": 1.638780951499939, - "learning_rate": 0.0005990302769098158, - "loss": 6.0419, + "epoch": 0.026954177897574125, + "grad_norm": 2.4125149250030518, + "learning_rate": 0.0005990286022665946, + "loss": 6.0666, "step": 250 }, { - "epoch": 0.03228931223764934, - "grad_norm": 1.9191187620162964, - "learning_rate": 0.0005987070358797543, - "loss": 5.955, + "epoch": 0.03234501347708895, + "grad_norm": 2.052870750427246, + "learning_rate": 0.0005987048030221263, + "loss": 5.9672, "step": 300 }, { - "epoch": 0.03767086427725756, - "grad_norm": 1.12886643409729, - "learning_rate": 0.0005983837948496929, - "loss": 5.8538, + "epoch": 0.03773584905660377, + "grad_norm": 1.2476660013198853, + "learning_rate": 0.0005983810037776578, + "loss": 5.8655, "step": 350 }, { - "epoch": 0.04305241631686579, - "grad_norm": 1.3254728317260742, - "learning_rate": 0.0005980605538196314, - "loss": 5.7663, + "epoch": 0.0431266846361186, + "grad_norm": 0.8398758172988892, + "learning_rate": 0.0005980572045331894, + "loss": 5.8111, "step": 400 }, { - "epoch": 0.048433968356474004, - "grad_norm": 2.3004322052001953, - "learning_rate": 0.0005977373127895701, - "loss": 5.6952, + "epoch": 0.04851752021563342, + "grad_norm": 1.2200261354446411, + "learning_rate": 0.0005977334052887209, + "loss": 5.736, "step": 450 }, { - "epoch": 0.05381552039608223, - "grad_norm": 1.495474934577942, - "learning_rate": 0.0005974140717595086, - "loss": 5.6341, + "epoch": 0.05390835579514825, + "grad_norm": 1.2435953617095947, + "learning_rate": 0.0005974096060442526, + "loss": 5.6756, "step": 500 }, { - "epoch": 0.05919707243569045, - "grad_norm": 0.9526841640472412, - "learning_rate": 0.0005970908307294472, - "loss": 5.5508, + "epoch": 0.05929919137466307, + "grad_norm": 2.038189649581909, + "learning_rate": 0.0005970858067997841, + "loss": 5.5615, "step": 550 }, { - "epoch": 0.06457862447529868, - "grad_norm": 1.1846691370010376, - "learning_rate": 0.0005967675896993858, - "loss": 5.4745, + "epoch": 0.0646900269541779, + "grad_norm": 1.168102741241455, + "learning_rate": 0.0005967620075553157, + "loss": 5.529, "step": 600 }, { - "epoch": 0.0699601765149069, - "grad_norm": 1.5218234062194824, - "learning_rate": 0.0005964443486693243, - "loss": 5.4167, + "epoch": 0.07008086253369272, + "grad_norm": 1.0181015729904175, + "learning_rate": 0.0005964382083108472, + "loss": 5.4185, "step": 650 }, { - "epoch": 0.07534172855451512, - "grad_norm": 1.037400484085083, - "learning_rate": 0.000596121107639263, - "loss": 5.3214, + "epoch": 0.07547169811320754, + "grad_norm": 1.2549965381622314, + "learning_rate": 0.0005961144090663788, + "loss": 5.4023, "step": 700 }, { - "epoch": 0.08072328059412334, - "grad_norm": 1.1252976655960083, - "learning_rate": 0.0005957978666092015, - "loss": 5.3002, + "epoch": 0.08086253369272237, + "grad_norm": 1.1174157857894897, + "learning_rate": 0.0005957906098219104, + "loss": 5.3263, "step": 750 }, { - "epoch": 0.08610483263373157, - "grad_norm": 0.9828529953956604, - "learning_rate": 0.0005954746255791401, - "loss": 5.224, + "epoch": 0.0862533692722372, + "grad_norm": 1.3886544704437256, + "learning_rate": 0.0005954668105774419, + "loss": 5.2671, "step": 800 }, { - "epoch": 0.09148638467333979, - "grad_norm": 1.4241634607315063, - "learning_rate": 0.0005951513845490787, - "loss": 5.2284, + "epoch": 0.09164420485175202, + "grad_norm": 0.8726224303245544, + "learning_rate": 0.0005951430113329735, + "loss": 5.2379, "step": 850 }, { - "epoch": 0.09686793671294801, - "grad_norm": 1.0950024127960205, - "learning_rate": 0.0005948281435190174, - "loss": 5.1559, + "epoch": 0.09703504043126684, + "grad_norm": 1.1303365230560303, + "learning_rate": 0.0005948192120885051, + "loss": 5.1541, "step": 900 }, { - "epoch": 0.10224948875255624, - "grad_norm": 0.8689674735069275, - "learning_rate": 0.0005945049024889559, - "loss": 5.1198, + "epoch": 0.10242587601078167, + "grad_norm": 1.1907507181167603, + "learning_rate": 0.0005944954128440366, + "loss": 5.1459, "step": 950 }, { - "epoch": 0.10763104079216446, - "grad_norm": 0.8567312955856323, - "learning_rate": 0.0005941816614588944, - "loss": 5.0633, + "epoch": 0.1078167115902965, + "grad_norm": 1.1421568393707275, + "learning_rate": 0.0005941716135995682, + "loss": 5.0905, "step": 1000 }, { - "epoch": 0.10763104079216446, - "eval_accuracy": 0.22941774764519313, - "eval_loss": 4.999941825866699, - "eval_runtime": 181.8712, - "eval_samples_per_second": 99.032, - "eval_steps_per_second": 6.191, + "epoch": 0.1078167115902965, + "eval_accuracy": 0.2273858299901745, + "eval_loss": 5.018712520599365, + "eval_runtime": 185.0397, + "eval_samples_per_second": 97.336, + "eval_steps_per_second": 6.085, "step": 1000 }, { - "epoch": 0.11301259283177269, - "grad_norm": 1.0926947593688965, - "learning_rate": 0.000593858420428833, - "loss": 5.0126, + "epoch": 0.11320754716981132, + "grad_norm": 1.1553248167037964, + "learning_rate": 0.0005938478143550997, + "loss": 5.0487, "step": 1050 }, { - "epoch": 0.1183941448713809, - "grad_norm": 1.079075813293457, - "learning_rate": 0.0005935351793987716, - "loss": 4.9581, + "epoch": 0.11859838274932614, + "grad_norm": 1.4468927383422852, + "learning_rate": 0.0005935240151106314, + "loss": 5.0104, "step": 1100 }, { - "epoch": 0.12377569691098914, - "grad_norm": 1.9798409938812256, - "learning_rate": 0.0005932119383687103, - "loss": 4.9724, + "epoch": 0.12398921832884097, + "grad_norm": 0.8825078010559082, + "learning_rate": 0.0005932002158661629, + "loss": 4.9857, "step": 1150 }, { - "epoch": 0.12915724895059735, - "grad_norm": 1.3321768045425415, - "learning_rate": 0.0005928886973386488, - "loss": 4.9741, + "epoch": 0.1293800539083558, + "grad_norm": 1.3887170553207397, + "learning_rate": 0.0005928764166216945, + "loss": 4.9682, "step": 1200 }, { - "epoch": 0.13453880099020557, - "grad_norm": 1.1671667098999023, - "learning_rate": 0.0005925654563085874, - "loss": 4.9051, + "epoch": 0.1347708894878706, + "grad_norm": 1.168694019317627, + "learning_rate": 0.000592552617377226, + "loss": 4.9421, "step": 1250 }, { - "epoch": 0.1399203530298138, - "grad_norm": 0.8295050263404846, - "learning_rate": 0.000592242215278526, - "loss": 4.9002, + "epoch": 0.14016172506738545, + "grad_norm": 0.8403277397155762, + "learning_rate": 0.0005922288181327577, + "loss": 4.8864, "step": 1300 }, { - "epoch": 0.14530190506942203, - "grad_norm": 1.0559158325195312, - "learning_rate": 0.0005919189742484645, - "loss": 4.8626, + "epoch": 0.14555256064690028, + "grad_norm": 1.0229427814483643, + "learning_rate": 0.0005919050188882893, + "loss": 4.8666, "step": 1350 }, { - "epoch": 0.15068345710903025, - "grad_norm": 1.2337747812271118, - "learning_rate": 0.0005915957332184032, - "loss": 4.851, + "epoch": 0.1509433962264151, + "grad_norm": 1.0546599626541138, + "learning_rate": 0.0005915812196438207, + "loss": 4.8429, "step": 1400 }, { - "epoch": 0.15606500914863847, - "grad_norm": 1.0464680194854736, - "learning_rate": 0.0005912724921883417, - "loss": 4.8242, + "epoch": 0.15633423180592992, + "grad_norm": 1.2618271112442017, + "learning_rate": 0.0005912574203993524, + "loss": 4.8244, "step": 1450 }, { - "epoch": 0.16144656118824668, - "grad_norm": 0.8018038868904114, - "learning_rate": 0.0005909492511582803, - "loss": 4.7894, + "epoch": 0.16172506738544473, + "grad_norm": 1.0748432874679565, + "learning_rate": 0.0005909336211548839, + "loss": 4.8066, "step": 1500 }, { - "epoch": 0.1668281132278549, - "grad_norm": 1.5508265495300293, - "learning_rate": 0.0005906260101282189, - "loss": 4.74, + "epoch": 0.16711590296495957, + "grad_norm": 0.9467751979827881, + "learning_rate": 0.0005906098219104155, + "loss": 4.7638, "step": 1550 }, { - "epoch": 0.17220966526746315, - "grad_norm": 0.9371482133865356, - "learning_rate": 0.0005903027690981575, - "loss": 4.7316, + "epoch": 0.1725067385444744, + "grad_norm": 0.9856839776039124, + "learning_rate": 0.000590286022665947, + "loss": 4.7827, "step": 1600 }, { - "epoch": 0.17759121730707136, - "grad_norm": 1.0305496454238892, - "learning_rate": 0.000589979528068096, - "loss": 4.7221, + "epoch": 0.1778975741239892, + "grad_norm": 0.9949484467506409, + "learning_rate": 0.0005899622234214787, + "loss": 4.7595, "step": 1650 }, { - "epoch": 0.18297276934667958, - "grad_norm": 0.90619957447052, - "learning_rate": 0.0005896562870380347, - "loss": 4.7063, + "epoch": 0.18328840970350405, + "grad_norm": 0.956685483455658, + "learning_rate": 0.0005896384241770102, + "loss": 4.6836, "step": 1700 }, { - "epoch": 0.1883543213862878, - "grad_norm": 1.0820300579071045, - "learning_rate": 0.0005893330460079732, - "loss": 4.668, + "epoch": 0.18867924528301888, + "grad_norm": 0.861179530620575, + "learning_rate": 0.0005893146249325418, + "loss": 4.698, "step": 1750 }, { - "epoch": 0.19373587342589602, - "grad_norm": 0.857718825340271, - "learning_rate": 0.0005890098049779118, - "loss": 4.6453, + "epoch": 0.1940700808625337, + "grad_norm": 0.8717144727706909, + "learning_rate": 0.0005889908256880733, + "loss": 4.6851, "step": 1800 }, { - "epoch": 0.19911742546550426, - "grad_norm": 0.9784050583839417, - "learning_rate": 0.0005886865639478504, - "loss": 4.6397, + "epoch": 0.19946091644204852, + "grad_norm": 1.0138026475906372, + "learning_rate": 0.0005886670264436049, + "loss": 4.6388, "step": 1850 }, { - "epoch": 0.20449897750511248, - "grad_norm": 0.9087166786193848, - "learning_rate": 0.0005883633229177889, - "loss": 4.6246, + "epoch": 0.20485175202156333, + "grad_norm": 1.0646897554397583, + "learning_rate": 0.0005883432271991365, + "loss": 4.6339, "step": 1900 }, { - "epoch": 0.2098805295447207, - "grad_norm": 0.9737820029258728, - "learning_rate": 0.0005880400818877276, - "loss": 4.6047, + "epoch": 0.21024258760107817, + "grad_norm": 0.9919822812080383, + "learning_rate": 0.0005880194279546681, + "loss": 4.5957, "step": 1950 }, { - "epoch": 0.2152620815843289, - "grad_norm": 0.8255292773246765, - "learning_rate": 0.0005877168408576662, - "loss": 4.5492, + "epoch": 0.215633423180593, + "grad_norm": 1.2434839010238647, + "learning_rate": 0.0005876956287101996, + "loss": 4.5775, "step": 2000 }, { - "epoch": 0.2152620815843289, - "eval_accuracy": 0.27245189692187455, - "eval_loss": 4.491321563720703, - "eval_runtime": 181.6962, - "eval_samples_per_second": 99.127, - "eval_steps_per_second": 6.197, + "epoch": 0.215633423180593, + "eval_accuracy": 0.27008391588484576, + "eval_loss": 4.515827178955078, + "eval_runtime": 184.8516, + "eval_samples_per_second": 97.435, + "eval_steps_per_second": 6.091, "step": 2000 }, { - "epoch": 0.22064363362393713, - "grad_norm": 0.7951588034629822, - "learning_rate": 0.0005873935998276048, - "loss": 4.5555, + "epoch": 0.2210242587601078, + "grad_norm": 0.9802135229110718, + "learning_rate": 0.0005873718294657312, + "loss": 4.5897, "step": 2050 }, { - "epoch": 0.22602518566354537, - "grad_norm": 0.8975157141685486, - "learning_rate": 0.0005870703587975433, - "loss": 4.5375, + "epoch": 0.22641509433962265, + "grad_norm": 1.04331636428833, + "learning_rate": 0.0005870480302212628, + "loss": 4.5522, "step": 2100 }, { - "epoch": 0.2314067377031536, - "grad_norm": 1.0026023387908936, - "learning_rate": 0.0005867471177674818, - "loss": 4.5323, + "epoch": 0.23180592991913745, + "grad_norm": 0.9205772280693054, + "learning_rate": 0.0005867242309767943, + "loss": 4.4983, "step": 2150 }, { - "epoch": 0.2367882897427618, - "grad_norm": 0.803642988204956, - "learning_rate": 0.0005864238767374205, - "loss": 4.4924, + "epoch": 0.2371967654986523, + "grad_norm": 1.154678225517273, + "learning_rate": 0.0005864004317323259, + "loss": 4.5365, "step": 2200 }, { - "epoch": 0.24216984178237003, - "grad_norm": 1.0086907148361206, - "learning_rate": 0.0005861006357073591, - "loss": 4.5072, + "epoch": 0.24258760107816713, + "grad_norm": 0.763646125793457, + "learning_rate": 0.0005860766324878575, + "loss": 4.4747, "step": 2250 }, { - "epoch": 0.24755139382197827, - "grad_norm": 0.8211397528648376, - "learning_rate": 0.0005857773946772977, - "loss": 4.4681, + "epoch": 0.24797843665768193, + "grad_norm": 0.7726097702980042, + "learning_rate": 0.000585752833243389, + "loss": 4.4536, "step": 2300 }, { - "epoch": 0.2529329458615865, - "grad_norm": 0.7603225111961365, - "learning_rate": 0.0005854541536472362, - "loss": 4.4691, + "epoch": 0.25336927223719674, + "grad_norm": 0.8371621966362, + "learning_rate": 0.0005854290339989206, + "loss": 4.4495, "step": 2350 }, { - "epoch": 0.2583144979011947, - "grad_norm": 0.8533746004104614, - "learning_rate": 0.0005851309126171749, - "loss": 4.4473, + "epoch": 0.2587601078167116, + "grad_norm": 0.9792793989181519, + "learning_rate": 0.0005851052347544521, + "loss": 4.4429, "step": 2400 }, { - "epoch": 0.2636960499408029, - "grad_norm": 1.300952434539795, - "learning_rate": 0.0005848076715871134, - "loss": 4.4076, + "epoch": 0.2641509433962264, + "grad_norm": 1.2388581037521362, + "learning_rate": 0.0005847814355099838, + "loss": 4.4476, "step": 2450 }, { - "epoch": 0.26907760198041114, - "grad_norm": 0.7179780006408691, - "learning_rate": 0.000584484430557052, - "loss": 4.4134, + "epoch": 0.2695417789757412, + "grad_norm": 0.6825560331344604, + "learning_rate": 0.0005844576362655154, + "loss": 4.4384, "step": 2500 }, { - "epoch": 0.27445915402001936, - "grad_norm": 0.8938673734664917, - "learning_rate": 0.0005841611895269906, - "loss": 4.4109, + "epoch": 0.2749326145552561, + "grad_norm": 0.7776582837104797, + "learning_rate": 0.0005841338370210469, + "loss": 4.4017, "step": 2550 }, { - "epoch": 0.2798407060596276, - "grad_norm": 0.8200742602348328, - "learning_rate": 0.0005838379484969291, - "loss": 4.4013, + "epoch": 0.2803234501347709, + "grad_norm": 1.007873773574829, + "learning_rate": 0.0005838100377765785, + "loss": 4.3936, "step": 2600 }, { - "epoch": 0.2852222580992358, - "grad_norm": 0.8515391945838928, - "learning_rate": 0.0005835147074668678, - "loss": 4.3763, + "epoch": 0.2857142857142857, + "grad_norm": 1.2725998163223267, + "learning_rate": 0.0005834862385321101, + "loss": 4.3894, "step": 2650 }, { - "epoch": 0.29060381013884407, - "grad_norm": 0.8918663859367371, - "learning_rate": 0.0005831914664368063, - "loss": 4.3774, + "epoch": 0.29110512129380056, + "grad_norm": 0.8046647310256958, + "learning_rate": 0.0005831624392876417, + "loss": 4.3826, "step": 2700 }, { - "epoch": 0.2959853621784523, - "grad_norm": 0.6826805472373962, - "learning_rate": 0.0005828682254067449, - "loss": 4.3631, + "epoch": 0.29649595687331537, + "grad_norm": 0.9733596444129944, + "learning_rate": 0.0005828386400431731, + "loss": 4.3404, "step": 2750 }, { - "epoch": 0.3013669142180605, - "grad_norm": 0.7818616628646851, - "learning_rate": 0.0005825449843766835, - "loss": 4.3313, + "epoch": 0.3018867924528302, + "grad_norm": 0.6732513308525085, + "learning_rate": 0.0005825148407987048, + "loss": 4.3807, "step": 2800 }, { - "epoch": 0.3067484662576687, - "grad_norm": 0.7908890843391418, - "learning_rate": 0.0005822217433466221, - "loss": 4.3407, + "epoch": 0.30727762803234504, + "grad_norm": 0.8311448097229004, + "learning_rate": 0.0005821910415542363, + "loss": 4.3505, "step": 2850 }, { - "epoch": 0.31213001829727693, - "grad_norm": 1.1532859802246094, - "learning_rate": 0.0005818985023165607, - "loss": 4.3188, + "epoch": 0.31266846361185985, + "grad_norm": 0.8010222911834717, + "learning_rate": 0.0005818672423097679, + "loss": 4.3291, "step": 2900 }, { - "epoch": 0.31751157033688515, - "grad_norm": 0.7944912910461426, - "learning_rate": 0.0005815752612864992, - "loss": 4.3062, + "epoch": 0.31805929919137466, + "grad_norm": 0.7362540364265442, + "learning_rate": 0.0005815434430652994, + "loss": 4.3354, "step": 2950 }, { - "epoch": 0.32289312237649337, - "grad_norm": 0.7332167029380798, - "learning_rate": 0.0005812520202564378, - "loss": 4.2863, + "epoch": 0.32345013477088946, + "grad_norm": 0.7173567414283752, + "learning_rate": 0.0005812196438208311, + "loss": 4.332, "step": 3000 }, { - "epoch": 0.32289312237649337, - "eval_accuracy": 0.29881217403454574, - "eval_loss": 4.233547687530518, - "eval_runtime": 181.7872, - "eval_samples_per_second": 99.077, - "eval_steps_per_second": 6.194, + "epoch": 0.32345013477088946, + "eval_accuracy": 0.29800922919359674, + "eval_loss": 4.234623908996582, + "eval_runtime": 184.9545, + "eval_samples_per_second": 97.381, + "eval_steps_per_second": 6.088, "step": 3000 }, { - "epoch": 0.3282746744161016, - "grad_norm": 0.6582856774330139, - "learning_rate": 0.0005809287792263764, - "loss": 4.3131, + "epoch": 0.3288409703504043, + "grad_norm": 0.7485026121139526, + "learning_rate": 0.0005808958445763626, + "loss": 4.3078, "step": 3050 }, { - "epoch": 0.3336562264557098, - "grad_norm": 0.7379726767539978, - "learning_rate": 0.0005806055381963151, - "loss": 4.2889, + "epoch": 0.33423180592991913, + "grad_norm": 0.7733100652694702, + "learning_rate": 0.0005805720453318942, + "loss": 4.3032, "step": 3100 }, { - "epoch": 0.3390377784953181, - "grad_norm": 0.7286725640296936, - "learning_rate": 0.0005802822971662536, - "loss": 4.287, + "epoch": 0.33962264150943394, + "grad_norm": 0.835577130317688, + "learning_rate": 0.0005802482460874257, + "loss": 4.2658, "step": 3150 }, { - "epoch": 0.3444193305349263, - "grad_norm": 0.7306691408157349, - "learning_rate": 0.0005799590561361922, - "loss": 4.2596, + "epoch": 0.3450134770889488, + "grad_norm": 0.8923593163490295, + "learning_rate": 0.0005799244468429573, + "loss": 4.2757, "step": 3200 }, { - "epoch": 0.3498008825745345, - "grad_norm": 0.6126540303230286, - "learning_rate": 0.0005796358151061307, - "loss": 4.2404, + "epoch": 0.3504043126684636, + "grad_norm": 0.7083461880683899, + "learning_rate": 0.0005796006475984889, + "loss": 4.2502, "step": 3250 }, { - "epoch": 0.35518243461414273, - "grad_norm": 0.7405567765235901, - "learning_rate": 0.0005793125740760694, - "loss": 4.253, + "epoch": 0.3557951482479784, + "grad_norm": 0.8044667840003967, + "learning_rate": 0.0005792768483540205, + "loss": 4.2714, "step": 3300 }, { - "epoch": 0.36056398665375095, - "grad_norm": 0.8209679126739502, - "learning_rate": 0.0005789893330460079, - "loss": 4.2517, + "epoch": 0.3611859838274933, + "grad_norm": 0.6673637628555298, + "learning_rate": 0.000578953049109552, + "loss": 4.2516, "step": 3350 }, { - "epoch": 0.36594553869335916, - "grad_norm": 0.7422078847885132, - "learning_rate": 0.0005786660920159465, - "loss": 4.2362, + "epoch": 0.3665768194070081, + "grad_norm": 0.8567188382148743, + "learning_rate": 0.0005786292498650836, + "loss": 4.244, "step": 3400 }, { - "epoch": 0.3713270907329674, - "grad_norm": 0.6978297233581543, - "learning_rate": 0.0005783428509858851, - "loss": 4.2293, + "epoch": 0.3719676549865229, + "grad_norm": 0.6368674635887146, + "learning_rate": 0.0005783054506206152, + "loss": 4.2306, "step": 3450 }, { - "epoch": 0.3767086427725756, - "grad_norm": 0.7036471962928772, - "learning_rate": 0.0005780196099558237, - "loss": 4.2389, + "epoch": 0.37735849056603776, + "grad_norm": 0.7097017765045166, + "learning_rate": 0.0005779816513761467, + "loss": 4.21, "step": 3500 }, { - "epoch": 0.3820901948121838, - "grad_norm": 0.8010238409042358, - "learning_rate": 0.0005776963689257623, - "loss": 4.2198, + "epoch": 0.38274932614555257, + "grad_norm": 0.6933664679527283, + "learning_rate": 0.0005776578521316782, + "loss": 4.2159, "step": 3550 }, { - "epoch": 0.38747174685179203, - "grad_norm": 0.6499230265617371, - "learning_rate": 0.0005773731278957008, - "loss": 4.1941, + "epoch": 0.3881401617250674, + "grad_norm": 0.7401413917541504, + "learning_rate": 0.0005773340528872099, + "loss": 4.2171, "step": 3600 }, { - "epoch": 0.3928532988914003, - "grad_norm": 0.7018994688987732, - "learning_rate": 0.0005770498868656394, - "loss": 4.2102, + "epoch": 0.3935309973045822, + "grad_norm": 0.899066686630249, + "learning_rate": 0.0005770102536427414, + "loss": 4.2236, "step": 3650 }, { - "epoch": 0.3982348509310085, - "grad_norm": 0.6573231816291809, - "learning_rate": 0.000576726645835578, - "loss": 4.1892, + "epoch": 0.39892183288409705, + "grad_norm": 0.8368923664093018, + "learning_rate": 0.000576686454398273, + "loss": 4.1929, "step": 3700 }, { - "epoch": 0.40361640297061674, - "grad_norm": 0.6301137804985046, - "learning_rate": 0.0005764034048055167, - "loss": 4.1782, + "epoch": 0.40431266846361186, + "grad_norm": 0.7426706552505493, + "learning_rate": 0.0005763626551538045, + "loss": 4.1921, "step": 3750 }, { - "epoch": 0.40899795501022496, - "grad_norm": 0.6420815587043762, - "learning_rate": 0.0005760801637754552, - "loss": 4.1881, + "epoch": 0.40970350404312667, + "grad_norm": 0.8189597725868225, + "learning_rate": 0.0005760388559093362, + "loss": 4.2154, "step": 3800 }, { - "epoch": 0.4143795070498332, - "grad_norm": 0.7286609411239624, - "learning_rate": 0.0005757569227453937, - "loss": 4.1681, + "epoch": 0.41509433962264153, + "grad_norm": 0.664476752281189, + "learning_rate": 0.0005757150566648678, + "loss": 4.1881, "step": 3850 }, { - "epoch": 0.4197610590894414, - "grad_norm": 0.7999758720397949, - "learning_rate": 0.0005754336817153324, - "loss": 4.1663, + "epoch": 0.42048517520215634, + "grad_norm": 0.8473009467124939, + "learning_rate": 0.0005753912574203993, + "loss": 4.1562, "step": 3900 }, { - "epoch": 0.4251426111290496, - "grad_norm": 0.681491494178772, - "learning_rate": 0.0005751104406852709, - "loss": 4.1526, + "epoch": 0.42587601078167114, + "grad_norm": 0.7176007628440857, + "learning_rate": 0.0005750674581759309, + "loss": 4.158, "step": 3950 }, { - "epoch": 0.4305241631686578, - "grad_norm": 0.8898637890815735, - "learning_rate": 0.0005747871996552096, - "loss": 4.1472, + "epoch": 0.431266846361186, + "grad_norm": 0.7438318729400635, + "learning_rate": 0.0005747436589314624, + "loss": 4.1601, "step": 4000 }, { - "epoch": 0.4305241631686578, - "eval_accuracy": 0.31332352777238437, - "eval_loss": 4.088411808013916, - "eval_runtime": 182.0558, - "eval_samples_per_second": 98.931, - "eval_steps_per_second": 6.185, + "epoch": 0.431266846361186, + "eval_accuracy": 0.31239595806911213, + "eval_loss": 4.091654300689697, + "eval_runtime": 185.0599, + "eval_samples_per_second": 97.325, + "eval_steps_per_second": 6.085, "step": 4000 }, { - "epoch": 0.43590571520826604, - "grad_norm": 1.0454559326171875, - "learning_rate": 0.0005744639586251481, - "loss": 4.174, + "epoch": 0.4366576819407008, + "grad_norm": 0.6847988963127136, + "learning_rate": 0.0005744198596869941, + "loss": 4.1601, "step": 4050 }, { - "epoch": 0.44128726724787426, - "grad_norm": 0.7238677144050598, - "learning_rate": 0.0005741407175950867, - "loss": 4.1417, + "epoch": 0.4420485175202156, + "grad_norm": 0.6240386962890625, + "learning_rate": 0.0005740960604425255, + "loss": 4.1664, "step": 4100 }, { - "epoch": 0.44666881928748253, - "grad_norm": 0.6636929512023926, - "learning_rate": 0.0005738174765650253, - "loss": 4.14, + "epoch": 0.4474393530997305, + "grad_norm": 0.5988531708717346, + "learning_rate": 0.0005737722611980572, + "loss": 4.153, "step": 4150 }, { - "epoch": 0.45205037132709075, - "grad_norm": 0.553901195526123, - "learning_rate": 0.0005734942355349638, - "loss": 4.1244, + "epoch": 0.4528301886792453, + "grad_norm": 0.6246315240859985, + "learning_rate": 0.0005734484619535887, + "loss": 4.1408, "step": 4200 }, { - "epoch": 0.45743192336669897, - "grad_norm": 0.7060267925262451, - "learning_rate": 0.0005731709945049025, - "loss": 4.1363, + "epoch": 0.4582210242587601, + "grad_norm": 0.6362719535827637, + "learning_rate": 0.0005731246627091203, + "loss": 4.1336, "step": 4250 }, { - "epoch": 0.4628134754063072, - "grad_norm": 0.6566117405891418, - "learning_rate": 0.000572847753474841, - "loss": 4.1328, + "epoch": 0.4636118598382749, + "grad_norm": 0.7302852272987366, + "learning_rate": 0.0005728008634646518, + "loss": 4.1261, "step": 4300 }, { - "epoch": 0.4681950274459154, - "grad_norm": 0.663583517074585, - "learning_rate": 0.0005725245124447796, - "loss": 4.1281, + "epoch": 0.46900269541778977, + "grad_norm": 0.7277068495750427, + "learning_rate": 0.0005724770642201835, + "loss": 4.1332, "step": 4350 }, { - "epoch": 0.4735765794855236, - "grad_norm": 0.797214925289154, - "learning_rate": 0.0005722012714147182, - "loss": 4.1199, + "epoch": 0.4743935309973046, + "grad_norm": 0.8166965246200562, + "learning_rate": 0.000572153264975715, + "loss": 4.1182, "step": 4400 }, { - "epoch": 0.47895813152513184, - "grad_norm": 0.6216398477554321, - "learning_rate": 0.0005718780303846568, - "loss": 4.1097, + "epoch": 0.4797843665768194, + "grad_norm": 0.772174596786499, + "learning_rate": 0.0005718294657312466, + "loss": 4.128, "step": 4450 }, { - "epoch": 0.48433968356474005, - "grad_norm": 0.7291539907455444, - "learning_rate": 0.0005715547893545953, - "loss": 4.0946, + "epoch": 0.48517520215633425, + "grad_norm": 0.6778306365013123, + "learning_rate": 0.0005715056664867781, + "loss": 4.1057, "step": 4500 }, { - "epoch": 0.48972123560434827, - "grad_norm": 0.6444728374481201, - "learning_rate": 0.000571231548324534, - "loss": 4.0871, + "epoch": 0.49056603773584906, + "grad_norm": 0.5974618792533875, + "learning_rate": 0.0005711818672423097, + "loss": 4.0954, "step": 4550 }, { - "epoch": 0.49510278764395654, - "grad_norm": 0.6845434308052063, - "learning_rate": 0.0005709083072944725, - "loss": 4.0885, + "epoch": 0.49595687331536387, + "grad_norm": 0.7339206337928772, + "learning_rate": 0.0005708580679978413, + "loss": 4.1006, "step": 4600 }, { - "epoch": 0.5004843396835648, - "grad_norm": 0.7555699944496155, - "learning_rate": 0.0005705850662644111, - "loss": 4.0955, + "epoch": 0.5013477088948787, + "grad_norm": 0.732438862323761, + "learning_rate": 0.0005705342687533729, + "loss": 4.0836, "step": 4650 }, { - "epoch": 0.505865891723173, - "grad_norm": 0.7021520733833313, - "learning_rate": 0.0005702618252343497, - "loss": 4.0961, + "epoch": 0.5067385444743935, + "grad_norm": 0.7650737166404724, + "learning_rate": 0.0005702104695089044, + "loss": 4.0887, "step": 4700 }, { - "epoch": 0.5112474437627812, - "grad_norm": 0.5818867087364197, - "learning_rate": 0.0005699385842042882, - "loss": 4.0799, + "epoch": 0.5121293800539084, + "grad_norm": 0.5491770505905151, + "learning_rate": 0.000569886670264436, + "loss": 4.0701, "step": 4750 }, { - "epoch": 0.5166289958023894, - "grad_norm": 0.665334939956665, - "learning_rate": 0.0005696153431742269, - "loss": 4.0524, + "epoch": 0.5175202156334232, + "grad_norm": 0.590347170829773, + "learning_rate": 0.0005695628710199675, + "loss": 4.0851, "step": 4800 }, { - "epoch": 0.5220105478419976, - "grad_norm": 0.6017621755599976, - "learning_rate": 0.0005692921021441655, - "loss": 4.0778, + "epoch": 0.522911051212938, + "grad_norm": 0.6122878789901733, + "learning_rate": 0.0005692390717754991, + "loss": 4.0779, "step": 4850 }, { - "epoch": 0.5273920998816058, - "grad_norm": 0.6869152188301086, - "learning_rate": 0.0005689688611141041, - "loss": 4.0557, + "epoch": 0.5283018867924528, + "grad_norm": 0.6329747438430786, + "learning_rate": 0.0005689152725310306, + "loss": 4.0857, "step": 4900 }, { - "epoch": 0.5327736519212141, - "grad_norm": 0.7289915680885315, - "learning_rate": 0.0005686456200840426, - "loss": 4.0789, + "epoch": 0.5336927223719676, + "grad_norm": 0.7208688259124756, + "learning_rate": 0.0005685914732865623, + "loss": 4.0632, "step": 4950 }, { - "epoch": 0.5381552039608223, - "grad_norm": 0.6597164869308472, - "learning_rate": 0.0005683223790539811, - "loss": 4.062, + "epoch": 0.5390835579514824, + "grad_norm": 0.5675130486488342, + "learning_rate": 0.0005682676740420939, + "loss": 4.0517, "step": 5000 }, { - "epoch": 0.5381552039608223, - "eval_accuracy": 0.32235986249325127, - "eval_loss": 3.9826269149780273, - "eval_runtime": 181.7213, - "eval_samples_per_second": 99.113, - "eval_steps_per_second": 6.196, + "epoch": 0.5390835579514824, + "eval_accuracy": 0.32126485868985694, + "eval_loss": 3.9956471920013428, + "eval_runtime": 184.8848, + "eval_samples_per_second": 97.417, + "eval_steps_per_second": 6.09, "step": 5000 }, { - "epoch": 0.5435367560004305, - "grad_norm": 0.6261274218559265, - "learning_rate": 0.0005679991380239198, - "loss": 4.0553, + "epoch": 0.5444743935309974, + "grad_norm": 0.646916925907135, + "learning_rate": 0.0005679438747976254, + "loss": 4.0447, "step": 5050 }, { - "epoch": 0.5489183080400387, - "grad_norm": 0.649811863899231, - "learning_rate": 0.0005676823618144596, - "loss": 4.0688, + "epoch": 0.5498652291105122, + "grad_norm": 0.6901677250862122, + "learning_rate": 0.000567620075553157, + "loss": 4.0497, "step": 5100 }, { - "epoch": 0.5542998600796469, - "grad_norm": 0.6599764823913574, - "learning_rate": 0.0005673591207843981, - "loss": 4.0458, + "epoch": 0.555256064690027, + "grad_norm": 0.6170982718467712, + "learning_rate": 0.0005672962763086886, + "loss": 4.0538, "step": 5150 }, { - "epoch": 0.5596814121192552, - "grad_norm": 0.5525273680686951, - "learning_rate": 0.0005670358797543368, - "loss": 4.032, + "epoch": 0.5606469002695418, + "grad_norm": 0.6177210807800293, + "learning_rate": 0.0005669724770642202, + "loss": 4.0316, "step": 5200 }, { - "epoch": 0.5650629641588634, - "grad_norm": 0.5733092427253723, - "learning_rate": 0.0005667126387242753, - "loss": 4.0628, + "epoch": 0.5660377358490566, + "grad_norm": 0.6208630204200745, + "learning_rate": 0.0005666486778197517, + "loss": 4.0317, "step": 5250 }, { - "epoch": 0.5704445161984716, - "grad_norm": 0.6523923277854919, - "learning_rate": 0.000566389397694214, - "loss": 4.0207, + "epoch": 0.5714285714285714, + "grad_norm": 0.6907792091369629, + "learning_rate": 0.0005663248785752833, + "loss": 4.0154, "step": 5300 }, { - "epoch": 0.5758260682380799, - "grad_norm": 0.5884820222854614, - "learning_rate": 0.0005660661566641525, - "loss": 4.0289, + "epoch": 0.5768194070080862, + "grad_norm": 0.5993092656135559, + "learning_rate": 0.0005660010793308148, + "loss": 4.0339, "step": 5350 }, { - "epoch": 0.5812076202776881, - "grad_norm": 0.6992023587226868, - "learning_rate": 0.0005657429156340911, - "loss": 4.0149, + "epoch": 0.5822102425876011, + "grad_norm": 0.7210185527801514, + "learning_rate": 0.0005656772800863465, + "loss": 4.024, "step": 5400 }, { - "epoch": 0.5865891723172963, - "grad_norm": 0.6001935601234436, - "learning_rate": 0.0005654196746040297, - "loss": 4.0226, + "epoch": 0.5876010781671159, + "grad_norm": 0.6388777494430542, + "learning_rate": 0.0005653534808418779, + "loss": 4.0306, "step": 5450 }, { - "epoch": 0.5919707243569046, - "grad_norm": 0.7079454660415649, - "learning_rate": 0.0005650964335739684, - "loss": 4.0292, + "epoch": 0.5929919137466307, + "grad_norm": 0.5874868035316467, + "learning_rate": 0.0005650296815974096, + "loss": 4.0322, "step": 5500 }, { - "epoch": 0.5973522763965128, - "grad_norm": 0.6667420864105225, - "learning_rate": 0.0005647731925439069, - "loss": 3.9974, + "epoch": 0.5983827493261455, + "grad_norm": 0.6064094305038452, + "learning_rate": 0.0005647058823529411, + "loss": 4.0365, "step": 5550 }, { - "epoch": 0.602733828436121, - "grad_norm": 0.589589536190033, - "learning_rate": 0.0005644499515138454, - "loss": 4.0047, + "epoch": 0.6037735849056604, + "grad_norm": 0.6970310211181641, + "learning_rate": 0.0005643820831084727, + "loss": 4.0156, "step": 5600 }, { - "epoch": 0.6081153804757292, - "grad_norm": 0.6412240266799927, - "learning_rate": 0.000564126710483784, - "loss": 3.9956, + "epoch": 0.6091644204851752, + "grad_norm": 0.5707041025161743, + "learning_rate": 0.0005640582838640042, + "loss": 4.0067, "step": 5650 }, { - "epoch": 0.6134969325153374, - "grad_norm": 0.6452463269233704, - "learning_rate": 0.0005638034694537226, - "loss": 3.9969, + "epoch": 0.6145552560646901, + "grad_norm": 0.6243141293525696, + "learning_rate": 0.0005637344846195358, + "loss": 4.0111, "step": 5700 }, { - "epoch": 0.6188784845549457, - "grad_norm": 0.634573221206665, - "learning_rate": 0.0005634802284236612, - "loss": 3.9971, + "epoch": 0.6199460916442049, + "grad_norm": 0.6075974106788635, + "learning_rate": 0.0005634106853750674, + "loss": 4.0213, "step": 5750 }, { - "epoch": 0.6242600365945539, - "grad_norm": 0.6738696694374084, - "learning_rate": 0.0005631569873935998, - "loss": 3.9907, + "epoch": 0.6253369272237197, + "grad_norm": 0.5607590675354004, + "learning_rate": 0.000563086886130599, + "loss": 4.0105, "step": 5800 }, { - "epoch": 0.6296415886341621, - "grad_norm": 0.6989587545394897, - "learning_rate": 0.0005628337463635384, - "loss": 4.0001, + "epoch": 0.6307277628032345, + "grad_norm": 0.6650785803794861, + "learning_rate": 0.0005627630868861305, + "loss": 4.023, "step": 5850 }, { - "epoch": 0.6350231406737703, - "grad_norm": 0.7382339239120483, - "learning_rate": 0.0005625105053334769, - "loss": 3.9866, + "epoch": 0.6361185983827493, + "grad_norm": 0.6571787595748901, + "learning_rate": 0.0005624392876416621, + "loss": 3.9933, "step": 5900 }, { - "epoch": 0.6404046927133785, - "grad_norm": 0.6265085339546204, - "learning_rate": 0.0005621872643034155, - "loss": 3.965, + "epoch": 0.6415094339622641, + "grad_norm": 0.637128472328186, + "learning_rate": 0.0005621154883971937, + "loss": 3.9854, "step": 5950 }, { - "epoch": 0.6457862447529867, - "grad_norm": 0.675687313079834, - "learning_rate": 0.0005618640232733541, - "loss": 3.9919, + "epoch": 0.6469002695417789, + "grad_norm": 0.6387634873390198, + "learning_rate": 0.0005617916891527253, + "loss": 3.9951, "step": 6000 }, { - "epoch": 0.6457862447529867, - "eval_accuracy": 0.3284569192929609, - "eval_loss": 3.910811424255371, - "eval_runtime": 181.5773, - "eval_samples_per_second": 99.192, - "eval_steps_per_second": 6.201, + "epoch": 0.6469002695417789, + "eval_accuracy": 0.32733703397825703, + "eval_loss": 3.921706199645996, + "eval_runtime": 184.8387, + "eval_samples_per_second": 97.442, + "eval_steps_per_second": 6.092, "step": 6000 }, { - "epoch": 0.651167796792595, - "grad_norm": 0.6309311389923096, - "learning_rate": 0.0005615407822432927, - "loss": 3.9811, + "epoch": 0.6522911051212938, + "grad_norm": 0.653874933719635, + "learning_rate": 0.0005614678899082568, + "loss": 3.9682, "step": 6050 }, { - "epoch": 0.6565493488322032, - "grad_norm": 0.5584713220596313, - "learning_rate": 0.0005612175412132313, - "loss": 3.9619, + "epoch": 0.6576819407008087, + "grad_norm": 0.7536391615867615, + "learning_rate": 0.0005611440906637884, + "loss": 4.0014, "step": 6100 }, { - "epoch": 0.6619309008718114, - "grad_norm": 0.5973617434501648, - "learning_rate": 0.0005608943001831699, - "loss": 3.9528, + "epoch": 0.6630727762803235, + "grad_norm": 0.6264755129814148, + "learning_rate": 0.00056082029141932, + "loss": 3.9809, "step": 6150 }, { - "epoch": 0.6673124529114196, - "grad_norm": 0.6784116625785828, - "learning_rate": 0.0005605710591531085, - "loss": 3.9584, + "epoch": 0.6684636118598383, + "grad_norm": 0.6374775767326355, + "learning_rate": 0.0005604964921748515, + "loss": 3.9595, "step": 6200 }, { - "epoch": 0.6726940049510278, - "grad_norm": 0.5735004544258118, - "learning_rate": 0.000560247818123047, - "loss": 3.9522, + "epoch": 0.6738544474393531, + "grad_norm": 0.6206533312797546, + "learning_rate": 0.000560172692930383, + "loss": 3.9782, "step": 6250 }, { - "epoch": 0.6780755569906362, - "grad_norm": 0.6804719567298889, - "learning_rate": 0.0005599245770929855, - "loss": 3.9563, + "epoch": 0.6792452830188679, + "grad_norm": 0.612671434879303, + "learning_rate": 0.0005598488936859147, + "loss": 3.9957, "step": 6300 }, { - "epoch": 0.6834571090302444, - "grad_norm": 0.6392138004302979, - "learning_rate": 0.0005596013360629242, - "loss": 3.9444, + "epoch": 0.6846361185983828, + "grad_norm": 0.6080566048622131, + "learning_rate": 0.0005595250944414463, + "loss": 3.9573, "step": 6350 }, { - "epoch": 0.6888386610698526, - "grad_norm": 0.5223947763442993, - "learning_rate": 0.0005592780950328628, - "loss": 3.9685, + "epoch": 0.6900269541778976, + "grad_norm": Infinity, + "learning_rate": 0.0005592077711818672, + "loss": 3.9618, "step": 6400 }, { - "epoch": 0.6942202131094608, - "grad_norm": 0.6110231280326843, - "learning_rate": 0.0005589548540028014, - "loss": 3.9501, + "epoch": 0.6954177897574124, + "grad_norm": 0.6666589975357056, + "learning_rate": 0.0005588839719373988, + "loss": 3.9455, "step": 6450 }, { - "epoch": 0.699601765149069, - "grad_norm": 0.6495262980461121, - "learning_rate": 0.0005586316129727399, - "loss": 3.9733, + "epoch": 0.7008086253369272, + "grad_norm": 0.6202605962753296, + "learning_rate": 0.0005585601726929303, + "loss": 3.9596, "step": 6500 }, { - "epoch": 0.7049833171886772, - "grad_norm": 0.5392972230911255, - "learning_rate": 0.0005583083719426786, - "loss": 3.9475, + "epoch": 0.706199460916442, + "grad_norm": 0.570480465888977, + "learning_rate": 0.0005582363734484619, + "loss": 3.9847, "step": 6550 }, { - "epoch": 0.7103648692282855, - "grad_norm": 0.5373910665512085, - "learning_rate": 0.0005579851309126171, - "loss": 3.9475, + "epoch": 0.7115902964959568, + "grad_norm": 0.579180121421814, + "learning_rate": 0.0005579125742039935, + "loss": 3.9406, "step": 6600 }, { - "epoch": 0.7157464212678937, - "grad_norm": 0.5609496235847473, - "learning_rate": 0.0005576618898825558, - "loss": 3.9292, + "epoch": 0.7169811320754716, + "grad_norm": 0.5900434255599976, + "learning_rate": 0.0005575887749595251, + "loss": 3.9683, "step": 6650 }, { - "epoch": 0.7211279733075019, - "grad_norm": 0.5721986293792725, - "learning_rate": 0.0005573386488524943, - "loss": 3.9432, + "epoch": 0.7223719676549866, + "grad_norm": 0.6138500571250916, + "learning_rate": 0.0005572649757150566, + "loss": 3.9438, "step": 6700 }, { - "epoch": 0.7265095253471101, - "grad_norm": 0.5666886568069458, - "learning_rate": 0.0005570154078224328, - "loss": 3.9348, + "epoch": 0.7277628032345014, + "grad_norm": 0.5448847413063049, + "learning_rate": 0.0005569411764705882, + "loss": 3.9507, "step": 6750 }, { - "epoch": 0.7318910773867183, - "grad_norm": 0.5143133997917175, - "learning_rate": 0.0005566921667923715, - "loss": 3.9322, + "epoch": 0.7331536388140162, + "grad_norm": 0.6583116054534912, + "learning_rate": 0.0005566173772261198, + "loss": 3.9383, "step": 6800 }, { - "epoch": 0.7372726294263265, - "grad_norm": 0.5764801502227783, - "learning_rate": 0.00055636892576231, - "loss": 3.9196, + "epoch": 0.738544474393531, + "grad_norm": 0.6227813959121704, + "learning_rate": 0.0005562935779816513, + "loss": 3.9187, "step": 6850 }, { - "epoch": 0.7426541814659348, - "grad_norm": 0.6438068151473999, - "learning_rate": 0.0005560456847322487, - "loss": 3.9207, + "epoch": 0.7439353099730458, + "grad_norm": 0.6529786586761475, + "learning_rate": 0.0005559697787371828, + "loss": 3.9458, "step": 6900 }, { - "epoch": 0.748035733505543, - "grad_norm": 0.547141432762146, - "learning_rate": 0.0005557224437021872, - "loss": 3.9211, + "epoch": 0.7493261455525606, + "grad_norm": 0.6059437990188599, + "learning_rate": 0.0005556459794927145, + "loss": 3.9425, "step": 6950 }, { - "epoch": 0.7534172855451512, - "grad_norm": 0.5400163531303406, - "learning_rate": 0.0005553992026721258, - "loss": 3.912, + "epoch": 0.7547169811320755, + "grad_norm": 0.6971544623374939, + "learning_rate": 0.000555322180248246, + "loss": 3.9281, "step": 7000 }, { - "epoch": 0.7534172855451512, - "eval_accuracy": 0.3342374702304669, - "eval_loss": 3.858619451522827, - "eval_runtime": 181.854, - "eval_samples_per_second": 99.041, - "eval_steps_per_second": 6.192, + "epoch": 0.7547169811320755, + "eval_accuracy": 0.3333070755521115, + "eval_loss": 3.862715482711792, + "eval_runtime": 184.7265, + "eval_samples_per_second": 97.501, + "eval_steps_per_second": 6.095, "step": 7000 }, { - "epoch": 0.7587988375847594, - "grad_norm": 0.6468452215194702, - "learning_rate": 0.0005550759616420644, - "loss": 3.9213, + "epoch": 0.7601078167115903, + "grad_norm": 0.5811669826507568, + "learning_rate": 0.0005549983810037776, + "loss": 3.9215, "step": 7050 }, { - "epoch": 0.7641803896243676, - "grad_norm": 0.8306963443756104, - "learning_rate": 0.000554752720612003, - "loss": 3.9261, + "epoch": 0.7654986522911051, + "grad_norm": 0.5781192183494568, + "learning_rate": 0.0005546745817593091, + "loss": 3.9333, "step": 7100 }, { - "epoch": 0.7695619416639758, - "grad_norm": 0.5487331748008728, - "learning_rate": 0.0005544294795819415, - "loss": 3.9015, + "epoch": 0.77088948787062, + "grad_norm": 0.5684405565261841, + "learning_rate": 0.0005543507825148408, + "loss": 3.9335, "step": 7150 }, { - "epoch": 0.7749434937035841, - "grad_norm": 0.5448126196861267, - "learning_rate": 0.0005541062385518801, - "loss": 3.9063, + "epoch": 0.7762803234501348, + "grad_norm": 0.6275970339775085, + "learning_rate": 0.0005540269832703723, + "loss": 3.9334, "step": 7200 }, { - "epoch": 0.7803250457431924, - "grad_norm": 0.5569677948951721, - "learning_rate": 0.0005537829975218188, - "loss": 3.9022, + "epoch": 0.7816711590296496, + "grad_norm": 0.5931147933006287, + "learning_rate": 0.0005537031840259039, + "loss": 3.8893, "step": 7250 }, { - "epoch": 0.7857065977828006, - "grad_norm": 0.5910586714744568, - "learning_rate": 0.0005534597564917573, - "loss": 3.9111, + "epoch": 0.7870619946091644, + "grad_norm": 0.6141475439071655, + "learning_rate": 0.0005533793847814354, + "loss": 3.9264, "step": 7300 }, { - "epoch": 0.7910881498224088, - "grad_norm": 0.75867760181427, - "learning_rate": 0.0005531365154616959, - "loss": 3.9149, + "epoch": 0.7924528301886793, + "grad_norm": 0.6149185299873352, + "learning_rate": 0.000553055585536967, + "loss": 3.9141, "step": 7350 }, { - "epoch": 0.796469701862017, - "grad_norm": 0.5806201696395874, - "learning_rate": 0.0005528132744316344, - "loss": 3.8975, + "epoch": 0.7978436657681941, + "grad_norm": 0.608295202255249, + "learning_rate": 0.0005527317862924987, + "loss": 3.9136, "step": 7400 }, { - "epoch": 0.8018512539016253, - "grad_norm": 0.5979334115982056, - "learning_rate": 0.0005524900334015731, - "loss": 3.9194, + "epoch": 0.8032345013477089, + "grad_norm": 0.5281786918640137, + "learning_rate": 0.0005524079870480301, + "loss": 3.9039, "step": 7450 }, { - "epoch": 0.8072328059412335, - "grad_norm": 0.6362331509590149, - "learning_rate": 0.0005521667923715117, - "loss": 3.8981, + "epoch": 0.8086253369272237, + "grad_norm": 0.6112433671951294, + "learning_rate": 0.0005520841878035618, + "loss": 3.8906, "step": 7500 }, { - "epoch": 0.8126143579808417, - "grad_norm": 0.5838714838027954, - "learning_rate": 0.0005518435513414502, - "loss": 3.8961, + "epoch": 0.8140161725067385, + "grad_norm": 0.5515127182006836, + "learning_rate": 0.0005517603885590933, + "loss": 3.9185, "step": 7550 }, { - "epoch": 0.8179959100204499, - "grad_norm": 0.7832590937614441, - "learning_rate": 0.0005515203103113888, - "loss": 3.8931, + "epoch": 0.8194070080862533, + "grad_norm": 0.5626883506774902, + "learning_rate": 0.0005514365893146249, + "loss": 3.9207, "step": 7600 }, { - "epoch": 0.8233774620600581, - "grad_norm": 0.5615019798278809, - "learning_rate": 0.0005511970692813274, - "loss": 3.8981, + "epoch": 0.8247978436657682, + "grad_norm": 0.5652742981910706, + "learning_rate": 0.0005511127900701564, + "loss": 3.8936, "step": 7650 }, { - "epoch": 0.8287590140996663, - "grad_norm": 0.6099148392677307, - "learning_rate": 0.000550873828251266, - "loss": 3.8941, + "epoch": 0.8301886792452831, + "grad_norm": 0.5642729997634888, + "learning_rate": 0.000550788990825688, + "loss": 3.8981, "step": 7700 }, { - "epoch": 0.8341405661392746, - "grad_norm": 0.5140008330345154, - "learning_rate": 0.0005505505872212045, - "loss": 3.8796, + "epoch": 0.8355795148247979, + "grad_norm": 0.5795193910598755, + "learning_rate": 0.0005504651915812196, + "loss": 3.8818, "step": 7750 }, { - "epoch": 0.8395221181788828, - "grad_norm": 0.6559896469116211, - "learning_rate": 0.0005502273461911432, - "loss": 3.8723, + "epoch": 0.8409703504043127, + "grad_norm": 0.609893798828125, + "learning_rate": 0.0005501413923367512, + "loss": 3.9107, "step": 7800 }, { - "epoch": 0.844903670218491, - "grad_norm": 0.557579517364502, - "learning_rate": 0.0005499041051610817, - "loss": 3.8736, + "epoch": 0.8463611859838275, + "grad_norm": 0.6346420049667358, + "learning_rate": 0.0005498175930922827, + "loss": 3.8884, "step": 7850 }, { - "epoch": 0.8502852222580992, - "grad_norm": 0.6086620688438416, - "learning_rate": 0.0005495808641310204, - "loss": 3.9014, + "epoch": 0.8517520215633423, + "grad_norm": 0.5454068779945374, + "learning_rate": 0.0005494937938478143, + "loss": 3.8879, "step": 7900 }, { - "epoch": 0.8556667742977074, - "grad_norm": 0.6662868857383728, - "learning_rate": 0.0005492576231009589, - "loss": 3.8747, + "epoch": 0.8571428571428571, + "grad_norm": 0.5878623723983765, + "learning_rate": 0.0005491699946033459, + "loss": 3.8863, "step": 7950 }, { - "epoch": 0.8610483263373157, - "grad_norm": 1.007802963256836, - "learning_rate": 0.0005489343820708974, - "loss": 3.862, + "epoch": 0.862533692722372, + "grad_norm": 0.6634072661399841, + "learning_rate": 0.0005488461953588775, + "loss": 3.8745, "step": 8000 }, { - "epoch": 0.8610483263373157, - "eval_accuracy": 0.33844711771595115, - "eval_loss": 3.811176061630249, - "eval_runtime": 181.8668, - "eval_samples_per_second": 99.034, - "eval_steps_per_second": 6.191, + "epoch": 0.862533692722372, + "eval_accuracy": 0.33734624665661483, + "eval_loss": 3.8194918632507324, + "eval_runtime": 184.5541, + "eval_samples_per_second": 97.592, + "eval_steps_per_second": 6.101, "step": 8000 }, { - "epoch": 0.8664298783769239, - "grad_norm": 0.5262889266014099, - "learning_rate": 0.0005486111410408361, - "loss": 3.8806, + "epoch": 0.8679245283018868, + "grad_norm": 0.5450937747955322, + "learning_rate": 0.000548522396114409, + "loss": 3.8827, "step": 8050 }, { - "epoch": 0.8718114304165321, - "grad_norm": 0.5448309183120728, - "learning_rate": 0.0005482879000107746, - "loss": 3.8702, + "epoch": 0.8733153638814016, + "grad_norm": 0.6128168702125549, + "learning_rate": 0.0005481985968699406, + "loss": 3.8677, "step": 8100 }, { - "epoch": 0.8771929824561403, - "grad_norm": 0.5762999057769775, - "learning_rate": 0.0005479646589807133, - "loss": 3.8735, + "epoch": 0.8787061994609164, + "grad_norm": 0.5559018850326538, + "learning_rate": 0.0005478747976254721, + "loss": 3.8809, "step": 8150 }, { - "epoch": 0.8825745344957485, - "grad_norm": 1.141692876815796, - "learning_rate": 0.0005476414179506518, - "loss": 3.8806, + "epoch": 0.8840970350404312, + "grad_norm": 0.5544788241386414, + "learning_rate": 0.0005475509983810037, + "loss": 3.8704, "step": 8200 }, { - "epoch": 0.8879560865353568, - "grad_norm": 0.5628105998039246, - "learning_rate": 0.0005473181769205904, - "loss": 3.8622, + "epoch": 0.889487870619946, + "grad_norm": 0.582495391368866, + "learning_rate": 0.0005472271991365352, + "loss": 3.8559, "step": 8250 }, { - "epoch": 0.8933376385749651, - "grad_norm": 0.5854573249816895, - "learning_rate": 0.000546994935890529, - "loss": 3.8763, + "epoch": 0.894878706199461, + "grad_norm": 0.5992187857627869, + "learning_rate": 0.0005469033998920669, + "loss": 3.8817, "step": 8300 }, { - "epoch": 0.8987191906145733, - "grad_norm": 0.5357927083969116, - "learning_rate": 0.0005466716948604677, - "loss": 3.8505, + "epoch": 0.9002695417789758, + "grad_norm": 0.6100888252258301, + "learning_rate": 0.0005465796006475984, + "loss": 3.8633, "step": 8350 }, { - "epoch": 0.9041007426541815, - "grad_norm": 0.6378769874572754, - "learning_rate": 0.0005463484538304062, - "loss": 3.845, + "epoch": 0.9056603773584906, + "grad_norm": 0.5837623476982117, + "learning_rate": 0.00054625580140313, + "loss": 3.8758, "step": 8400 }, { - "epoch": 0.9094822946937897, - "grad_norm": 0.6226728558540344, - "learning_rate": 0.0005460252128003447, - "loss": 3.8583, + "epoch": 0.9110512129380054, + "grad_norm": 0.6226052641868591, + "learning_rate": 0.0005459320021586615, + "loss": 3.886, "step": 8450 }, { - "epoch": 0.9148638467333979, - "grad_norm": 0.5731797218322754, - "learning_rate": 0.0005457019717702833, - "loss": 3.8498, + "epoch": 0.9164420485175202, + "grad_norm": 0.5699930787086487, + "learning_rate": 0.0005456082029141932, + "loss": 3.8766, "step": 8500 }, { - "epoch": 0.9202453987730062, - "grad_norm": 0.6198984384536743, - "learning_rate": 0.0005453787307402219, - "loss": 3.8389, + "epoch": 0.921832884097035, + "grad_norm": 0.5467044711112976, + "learning_rate": 0.0005452844036697248, + "loss": 3.8585, "step": 8550 }, { - "epoch": 0.9256269508126144, - "grad_norm": 0.600885272026062, - "learning_rate": 0.0005450554897101605, - "loss": 3.8547, + "epoch": 0.9272237196765498, + "grad_norm": 0.6719435453414917, + "learning_rate": 0.0005449670804101457, + "loss": 3.8533, "step": 8600 }, { - "epoch": 0.9310085028522226, - "grad_norm": 0.5399230122566223, - "learning_rate": 0.0005447322486800991, - "loss": 3.8578, + "epoch": 0.9326145552560647, + "grad_norm": 0.5468204617500305, + "learning_rate": 0.0005446432811656773, + "loss": 3.8461, "step": 8650 }, { - "epoch": 0.9363900548918308, - "grad_norm": 0.5042197108268738, - "learning_rate": 0.0005444090076500377, - "loss": 3.8613, + "epoch": 0.9380053908355795, + "grad_norm": 0.5554046630859375, + "learning_rate": 0.0005443194819212088, + "loss": 3.8516, "step": 8700 }, { - "epoch": 0.941771606931439, - "grad_norm": 0.5343637466430664, - "learning_rate": 0.0005440857666199763, - "loss": 3.8458, + "epoch": 0.9433962264150944, + "grad_norm": 0.5696746706962585, + "learning_rate": 0.0005439956826767404, + "loss": 3.8491, "step": 8750 }, { - "epoch": 0.9471531589710472, - "grad_norm": 0.5491181015968323, - "learning_rate": 0.0005437625255899148, - "loss": 3.8335, + "epoch": 0.9487870619946092, + "grad_norm": 0.5562222003936768, + "learning_rate": 0.000543671883432272, + "loss": 3.8421, "step": 8800 }, { - "epoch": 0.9525347110106555, - "grad_norm": 0.5667814612388611, - "learning_rate": 0.0005434392845598534, - "loss": 3.845, + "epoch": 0.954177897574124, + "grad_norm": 0.6517609357833862, + "learning_rate": 0.0005433480841878035, + "loss": 3.8343, "step": 8850 }, { - "epoch": 0.9579162630502637, - "grad_norm": 0.5124309659004211, - "learning_rate": 0.000543116043529792, - "loss": 3.8512, + "epoch": 0.9595687331536388, + "grad_norm": 0.570240318775177, + "learning_rate": 0.000543024284943335, + "loss": 3.8533, "step": 8900 }, { - "epoch": 0.9632978150898719, - "grad_norm": 0.5880436301231384, - "learning_rate": 0.0005427928024997306, - "loss": 3.8386, + "epoch": 0.9649595687331537, + "grad_norm": 0.5864043831825256, + "learning_rate": 0.0005427004856988667, + "loss": 3.8559, "step": 8950 }, { - "epoch": 0.9686793671294801, - "grad_norm": 0.5593850016593933, - "learning_rate": 0.0005424695614696692, - "loss": 3.8266, + "epoch": 0.9703504043126685, + "grad_norm": 0.5611904859542847, + "learning_rate": 0.0005423766864543982, + "loss": 3.8429, "step": 9000 }, { - "epoch": 0.9686793671294801, - "eval_accuracy": 0.34156187005092886, - "eval_loss": 3.7744014263153076, - "eval_runtime": 179.4219, - "eval_samples_per_second": 100.384, - "eval_steps_per_second": 6.276, + "epoch": 0.9703504043126685, + "eval_accuracy": 0.3413870475544354, + "eval_loss": 3.780839681625366, + "eval_runtime": 185.0401, + "eval_samples_per_second": 97.336, + "eval_steps_per_second": 6.085, "step": 9000 }, { - "epoch": 0.9740609191690883, - "grad_norm": 0.6109694838523865, - "learning_rate": 0.0005421463204396078, - "loss": 3.8173, + "epoch": 0.9757412398921833, + "grad_norm": 0.578873336315155, + "learning_rate": 0.0005420528872099298, + "loss": 3.8385, "step": 9050 }, { - "epoch": 0.9794424712086965, - "grad_norm": 0.5747610330581665, - "learning_rate": 0.0005418295442301476, - "loss": 3.8562, + "epoch": 0.9811320754716981, + "grad_norm": 0.657522439956665, + "learning_rate": 0.0005417290879654613, + "loss": 3.8524, "step": 9100 }, { - "epoch": 0.9848240232483048, - "grad_norm": 0.536498486995697, - "learning_rate": 0.0005415063032000861, - "loss": 3.8285, + "epoch": 0.9865229110512129, + "grad_norm": 0.5951496958732605, + "learning_rate": 0.000541405288720993, + "loss": 3.8276, "step": 9150 }, { - "epoch": 0.9902055752879131, - "grad_norm": 0.5872951745986938, - "learning_rate": 0.0005411830621700248, - "loss": 3.8311, + "epoch": 0.9919137466307277, + "grad_norm": 0.6185961961746216, + "learning_rate": 0.0005410814894765245, + "loss": 3.8227, "step": 9200 }, { - "epoch": 0.9955871273275213, - "grad_norm": 0.6314157843589783, - "learning_rate": 0.0005408598211399633, - "loss": 3.8067, + "epoch": 0.9973045822102425, + "grad_norm": 0.6109092831611633, + "learning_rate": 0.0005407576902320561, + "loss": 3.835, "step": 9250 }, { - "epoch": 1.0009686793671295, - "grad_norm": 0.5174910426139832, - "learning_rate": 0.0005405365801099019, - "loss": 3.8005, + "epoch": 1.0026954177897573, + "grad_norm": 0.5460496544837952, + "learning_rate": 0.0005404338909875876, + "loss": 3.8096, "step": 9300 }, { - "epoch": 1.0063502314067376, - "grad_norm": 0.5861645936965942, - "learning_rate": 0.0005402133390798405, - "loss": 3.7532, + "epoch": 1.0080862533692723, + "grad_norm": 0.5667276978492737, + "learning_rate": 0.0005401100917431192, + "loss": 3.7744, "step": 9350 }, { - "epoch": 1.011731783446346, - "grad_norm": 0.6189785003662109, - "learning_rate": 0.000539890098049779, - "loss": 3.7568, + "epoch": 1.013477088948787, + "grad_norm": 0.5935086607933044, + "learning_rate": 0.0005397862924986508, + "loss": 3.7692, "step": 9400 }, { - "epoch": 1.017113335485954, - "grad_norm": 0.5914862751960754, - "learning_rate": 0.0005395668570197177, - "loss": 3.7533, + "epoch": 1.0188679245283019, + "grad_norm": 0.6849828958511353, + "learning_rate": 0.0005394624932541824, + "loss": 3.7707, "step": 9450 }, { - "epoch": 1.0224948875255624, - "grad_norm": 0.618517279624939, - "learning_rate": 0.0005392436159896562, - "loss": 3.7495, + "epoch": 1.0242587601078168, + "grad_norm": 0.5856293439865112, + "learning_rate": 0.0005391386940097139, + "loss": 3.7689, "step": 9500 }, { - "epoch": 1.0278764395651705, - "grad_norm": 0.5718392133712769, - "learning_rate": 0.0005389203749595948, - "loss": 3.7624, + "epoch": 1.0296495956873315, + "grad_norm": 0.5455779433250427, + "learning_rate": 0.0005388148947652455, + "loss": 3.7681, "step": 9550 }, { - "epoch": 1.0332579916047788, - "grad_norm": 0.5652843713760376, - "learning_rate": 0.0005385971339295334, - "loss": 3.7637, + "epoch": 1.0350404312668464, + "grad_norm": 0.6295051574707031, + "learning_rate": 0.000538491095520777, + "loss": 3.7576, "step": 9600 }, { - "epoch": 1.0386395436443872, - "grad_norm": 0.6264984011650085, - "learning_rate": 0.000538273892899472, - "loss": 3.7699, + "epoch": 1.0404312668463611, + "grad_norm": 0.6791478395462036, + "learning_rate": 0.0005381672962763086, + "loss": 3.796, "step": 9650 }, { - "epoch": 1.0440210956839953, - "grad_norm": 0.5926131010055542, - "learning_rate": 0.0005379506518694106, - "loss": 3.7665, + "epoch": 1.045822102425876, + "grad_norm": 0.6339579820632935, + "learning_rate": 0.0005378434970318403, + "loss": 3.7826, "step": 9700 }, { - "epoch": 1.0494026477236036, - "grad_norm": 0.539849579334259, - "learning_rate": 0.0005376274108393491, - "loss": 3.7516, + "epoch": 1.0512129380053907, + "grad_norm": 0.5362462401390076, + "learning_rate": 0.0005375196977873718, + "loss": 3.7799, "step": 9750 }, { - "epoch": 1.0547841997632117, - "grad_norm": 0.626375138759613, - "learning_rate": 0.0005373041698092877, - "loss": 3.7572, + "epoch": 1.0566037735849056, + "grad_norm": 0.5385799407958984, + "learning_rate": 0.0005371958985429034, + "loss": 3.7875, "step": 9800 }, { - "epoch": 1.06016575180282, - "grad_norm": 0.5491912364959717, - "learning_rate": 0.0005369809287792263, - "loss": 3.7701, + "epoch": 1.0619946091644206, + "grad_norm": 0.5548085570335388, + "learning_rate": 0.0005368720992984349, + "loss": 3.7837, "step": 9850 }, { - "epoch": 1.0655473038424281, - "grad_norm": 0.5011469721794128, - "learning_rate": 0.000536657687749165, - "loss": 3.7628, + "epoch": 1.0673854447439353, + "grad_norm": 0.5549419522285461, + "learning_rate": 0.0005365483000539665, + "loss": 3.7745, "step": 9900 }, { - "epoch": 1.0709288558820365, - "grad_norm": 0.6251816153526306, - "learning_rate": 0.0005363344467191035, - "loss": 3.7487, + "epoch": 1.0727762803234502, + "grad_norm": 0.5701829195022583, + "learning_rate": 0.0005362245008094981, + "loss": 3.7694, "step": 9950 }, { - "epoch": 1.0763104079216446, - "grad_norm": 0.702565610408783, - "learning_rate": 0.000536011205689042, - "loss": 3.7541, + "epoch": 1.0781671159029649, + "grad_norm": 0.5939721465110779, + "learning_rate": 0.0005359007015650297, + "loss": 3.7625, "step": 10000 }, { - "epoch": 1.0763104079216446, - "eval_accuracy": 0.34536037500892314, - "eval_loss": 3.741765022277832, - "eval_runtime": 180.285, - "eval_samples_per_second": 99.903, - "eval_steps_per_second": 6.246, + "epoch": 1.0781671159029649, + "eval_accuracy": 0.34421343512515346, + "eval_loss": 3.7506351470947266, + "eval_runtime": 184.7523, + "eval_samples_per_second": 97.487, + "eval_steps_per_second": 6.095, "step": 10000 }, { - "epoch": 1.081691959961253, - "grad_norm": 0.5714436769485474, - "learning_rate": 0.0005356879646589807, - "loss": 3.7614, + "epoch": 1.0835579514824798, + "grad_norm": 0.5285921692848206, + "learning_rate": 0.0005355769023205612, + "loss": 3.7661, "step": 10050 }, { - "epoch": 1.087073512000861, - "grad_norm": 0.5856513977050781, - "learning_rate": 0.0005353647236289192, - "loss": 3.7646, + "epoch": 1.0889487870619945, + "grad_norm": 0.5900604128837585, + "learning_rate": 0.0005352531030760928, + "loss": 3.7748, "step": 10100 }, { - "epoch": 1.0924550640404693, - "grad_norm": 0.5325464606285095, - "learning_rate": 0.0005350414825988579, - "loss": 3.77, + "epoch": 1.0943396226415094, + "grad_norm": 0.5793938040733337, + "learning_rate": 0.0005349293038316244, + "loss": 3.7681, "step": 10150 }, { - "epoch": 1.0978366160800774, - "grad_norm": 0.6802165508270264, - "learning_rate": 0.0005347182415687964, - "loss": 3.745, + "epoch": 1.0997304582210243, + "grad_norm": 0.5640652179718018, + "learning_rate": 0.0005346055045871559, + "loss": 3.7681, "step": 10200 }, { - "epoch": 1.1032181681196858, - "grad_norm": 0.5991120934486389, - "learning_rate": 0.000534395000538735, - "loss": 3.7629, + "epoch": 1.105121293800539, + "grad_norm": 0.5162309408187866, + "learning_rate": 0.0005342817053426874, + "loss": 3.7755, "step": 10250 }, { - "epoch": 1.1085997201592939, - "grad_norm": 0.5548239350318909, - "learning_rate": 0.0005340717595086736, - "loss": 3.7561, + "epoch": 1.110512129380054, + "grad_norm": 0.5436460971832275, + "learning_rate": 0.0005339579060982191, + "loss": 3.763, "step": 10300 }, { - "epoch": 1.1139812721989022, - "grad_norm": 0.5725270509719849, - "learning_rate": 0.0005337485184786122, - "loss": 3.7547, + "epoch": 1.1159029649595686, + "grad_norm": 0.637514591217041, + "learning_rate": 0.0005336341068537506, + "loss": 3.75, "step": 10350 }, { - "epoch": 1.1193628242385103, - "grad_norm": 0.6069482564926147, - "learning_rate": 0.0005334252774485507, - "loss": 3.755, + "epoch": 1.1212938005390836, + "grad_norm": 0.5975980758666992, + "learning_rate": 0.0005333103076092822, + "loss": 3.7669, "step": 10400 }, { - "epoch": 1.1247443762781186, - "grad_norm": 0.5790407657623291, - "learning_rate": 0.0005331020364184894, - "loss": 3.742, + "epoch": 1.1266846361185983, + "grad_norm": 0.5235165953636169, + "learning_rate": 0.0005329865083648137, + "loss": 3.7541, "step": 10450 }, { - "epoch": 1.1301259283177267, - "grad_norm": 0.5913881659507751, - "learning_rate": 0.0005327787953884279, - "loss": 3.7511, + "epoch": 1.1320754716981132, + "grad_norm": 0.6354934573173523, + "learning_rate": 0.0005326627091203454, + "loss": 3.7428, "step": 10500 }, { - "epoch": 1.135507480357335, - "grad_norm": 0.6094677448272705, - "learning_rate": 0.0005324555543583665, - "loss": 3.7278, + "epoch": 1.137466307277628, + "grad_norm": 0.5056789517402649, + "learning_rate": 0.0005323389098758769, + "loss": 3.7647, "step": 10550 }, { - "epoch": 1.1408890323969434, - "grad_norm": 0.5710995197296143, - "learning_rate": 0.0005321323133283051, - "loss": 3.7383, + "epoch": 1.1428571428571428, + "grad_norm": 0.5910845398902893, + "learning_rate": 0.0005320151106314085, + "loss": 3.7758, "step": 10600 }, { - "epoch": 1.1462705844365515, - "grad_norm": 0.532470703125, - "learning_rate": 0.0005318090722982436, - "loss": 3.7373, + "epoch": 1.1482479784366577, + "grad_norm": 0.6386284232139587, + "learning_rate": 0.00053169131138694, + "loss": 3.7546, "step": 10650 }, { - "epoch": 1.1516521364761596, - "grad_norm": 0.5976846218109131, - "learning_rate": 0.0005314858312681823, - "loss": 3.7609, + "epoch": 1.1536388140161726, + "grad_norm": 0.5644304752349854, + "learning_rate": 0.0005313675121424716, + "loss": 3.751, "step": 10700 }, { - "epoch": 1.157033688515768, - "grad_norm": 0.5880893468856812, - "learning_rate": 0.0005311625902381209, - "loss": 3.7448, + "epoch": 1.1590296495956873, + "grad_norm": 0.5989267826080322, + "learning_rate": 0.0005310437128980032, + "loss": 3.7387, "step": 10750 }, { - "epoch": 1.1624152405553763, - "grad_norm": 0.6322693228721619, - "learning_rate": 0.0005308393492080595, - "loss": 3.7477, + "epoch": 1.1644204851752022, + "grad_norm": 0.545608639717102, + "learning_rate": 0.0005307199136535348, + "loss": 3.7279, "step": 10800 }, { - "epoch": 1.1677967925949844, - "grad_norm": 0.5472283363342285, - "learning_rate": 0.000530516108177998, - "loss": 3.737, + "epoch": 1.169811320754717, + "grad_norm": 0.5512583255767822, + "learning_rate": 0.0005303961144090663, + "loss": 3.7474, "step": 10850 }, { - "epoch": 1.1731783446345927, - "grad_norm": 0.5412709712982178, - "learning_rate": 0.0005301928671479365, - "loss": 3.7412, + "epoch": 1.1752021563342319, + "grad_norm": 0.5729238390922546, + "learning_rate": 0.0005300723151645979, + "loss": 3.7492, "step": 10900 }, { - "epoch": 1.1785598966742008, - "grad_norm": 0.5746583342552185, - "learning_rate": 0.0005298696261178752, - "loss": 3.7427, + "epoch": 1.1805929919137466, + "grad_norm": 0.596947431564331, + "learning_rate": 0.0005297485159201295, + "loss": 3.7606, "step": 10950 }, { - "epoch": 1.1839414487138091, - "grad_norm": 0.543925940990448, - "learning_rate": 0.0005295463850878138, - "loss": 3.7349, + "epoch": 1.1859838274932615, + "grad_norm": 0.6165409088134766, + "learning_rate": 0.000529424716675661, + "loss": 3.7757, "step": 11000 }, { - "epoch": 1.1839414487138091, - "eval_accuracy": 0.34741369728284116, - "eval_loss": 3.7178292274475098, - "eval_runtime": 179.9682, - "eval_samples_per_second": 100.079, - "eval_steps_per_second": 6.257, + "epoch": 1.1859838274932615, + "eval_accuracy": 0.3462492642841334, + "eval_loss": 3.726261854171753, + "eval_runtime": 184.6494, + "eval_samples_per_second": 97.542, + "eval_steps_per_second": 6.098, "step": 11000 }, { - "epoch": 1.1893230007534172, - "grad_norm": 0.6048632860183716, - "learning_rate": 0.0005292231440577524, - "loss": 3.7374, + "epoch": 1.1913746630727764, + "grad_norm": 0.5490753054618835, + "learning_rate": 0.0005291009174311926, + "loss": 3.7638, "step": 11050 }, { - "epoch": 1.1947045527930256, - "grad_norm": 0.5460487008094788, - "learning_rate": 0.0005288999030276909, - "loss": 3.7464, + "epoch": 1.196765498652291, + "grad_norm": 0.5522347092628479, + "learning_rate": 0.0005287771181867242, + "loss": 3.7567, "step": 11100 }, { - "epoch": 1.2000861048326337, - "grad_norm": 0.6168481111526489, - "learning_rate": 0.0005285766619976295, - "loss": 3.7319, + "epoch": 1.202156334231806, + "grad_norm": 0.5174296498298645, + "learning_rate": 0.0005284597949271452, + "loss": 3.7552, "step": 11150 }, { - "epoch": 1.205467656872242, - "grad_norm": 0.5460136532783508, - "learning_rate": 0.0005282534209675681, - "loss": 3.7478, + "epoch": 1.2075471698113207, + "grad_norm": 0.5238677263259888, + "learning_rate": 0.0005281359956826767, + "loss": 3.743, "step": 11200 }, { - "epoch": 1.21084920891185, - "grad_norm": 0.5822840332984924, - "learning_rate": 0.0005279301799375066, - "loss": 3.7287, + "epoch": 1.2129380053908356, + "grad_norm": 0.5870748162269592, + "learning_rate": 0.0005278121964382083, + "loss": 3.7358, "step": 11250 }, { - "epoch": 1.2162307609514584, - "grad_norm": 0.528600811958313, - "learning_rate": 0.0005276069389074453, - "loss": 3.7461, + "epoch": 1.2183288409703503, + "grad_norm": 0.6238572597503662, + "learning_rate": 0.0005274883971937398, + "loss": 3.7525, "step": 11300 }, { - "epoch": 1.2216123129910665, - "grad_norm": 0.5433237552642822, - "learning_rate": 0.0005272836978773838, - "loss": 3.738, + "epoch": 1.2237196765498652, + "grad_norm": 0.6007615923881531, + "learning_rate": 0.0005271645979492714, + "loss": 3.7364, "step": 11350 }, { - "epoch": 1.2269938650306749, - "grad_norm": 0.5144934058189392, - "learning_rate": 0.0005269604568473225, - "loss": 3.6953, + "epoch": 1.2291105121293802, + "grad_norm": 0.5366285443305969, + "learning_rate": 0.000526840798704803, + "loss": 3.7556, "step": 11400 }, { - "epoch": 1.232375417070283, - "grad_norm": 0.5954382419586182, - "learning_rate": 0.000526637215817261, - "loss": 3.7262, + "epoch": 1.2345013477088949, + "grad_norm": 0.5180173516273499, + "learning_rate": 0.0005265169994603346, + "loss": 3.7442, "step": 11450 }, { - "epoch": 1.2377569691098913, - "grad_norm": 0.6082680225372314, - "learning_rate": 0.0005263139747871996, - "loss": 3.7328, + "epoch": 1.2398921832884098, + "grad_norm": 0.5628926157951355, + "learning_rate": 0.0005261932002158661, + "loss": 3.7362, "step": 11500 }, { - "epoch": 1.2431385211494996, - "grad_norm": 0.510694682598114, - "learning_rate": 0.0005259907337571381, - "loss": 3.7532, + "epoch": 1.2452830188679245, + "grad_norm": 0.5104094743728638, + "learning_rate": 0.0005258694009713977, + "loss": 3.7472, "step": 11550 }, { - "epoch": 1.2485200731891077, - "grad_norm": 0.5158873796463013, - "learning_rate": 0.0005256674927270768, - "loss": 3.7257, + "epoch": 1.2506738544474394, + "grad_norm": 0.5553884506225586, + "learning_rate": 0.0005255456017269292, + "loss": 3.7208, "step": 11600 }, { - "epoch": 1.2539016252287158, - "grad_norm": 0.559540331363678, - "learning_rate": 0.0005253442516970154, - "loss": 3.7084, + "epoch": 1.256064690026954, + "grad_norm": 0.5858808755874634, + "learning_rate": 0.0005252218024824608, + "loss": 3.7538, "step": 11650 }, { - "epoch": 1.2592831772683242, - "grad_norm": 0.6382985711097717, - "learning_rate": 0.000525021010666954, - "loss": 3.7259, + "epoch": 1.261455525606469, + "grad_norm": 0.694400429725647, + "learning_rate": 0.0005248980032379924, + "loss": 3.7502, "step": 11700 }, { - "epoch": 1.2646647293079325, - "grad_norm": 0.6098189353942871, - "learning_rate": 0.0005246977696368925, - "loss": 3.7354, + "epoch": 1.266846361185984, + "grad_norm": 0.552725613117218, + "learning_rate": 0.000524574203993524, + "loss": 3.7423, "step": 11750 }, { - "epoch": 1.2700462813475406, - "grad_norm": 0.5487571358680725, - "learning_rate": 0.0005243745286068311, - "loss": 3.7322, + "epoch": 1.2722371967654986, + "grad_norm": 0.5437906980514526, + "learning_rate": 0.0005242504047490555, + "loss": 3.7346, "step": 11800 }, { - "epoch": 1.275427833387149, - "grad_norm": 0.5637610554695129, - "learning_rate": 0.0005240577523973709, - "loss": 3.7314, + "epoch": 1.2776280323450135, + "grad_norm": 0.6244624257087708, + "learning_rate": 0.0005239266055045871, + "loss": 3.7441, "step": 11850 }, { - "epoch": 1.280809385426757, - "grad_norm": 0.5707816481590271, - "learning_rate": 0.0005237345113673095, - "loss": 3.7303, + "epoch": 1.2830188679245282, + "grad_norm": 0.5783806443214417, + "learning_rate": 0.0005236028062601186, + "loss": 3.7325, "step": 11900 }, { - "epoch": 1.2861909374663654, - "grad_norm": 0.5442221164703369, - "learning_rate": 0.0005234112703372481, - "loss": 3.7523, + "epoch": 1.2884097035040432, + "grad_norm": 0.5712251663208008, + "learning_rate": 0.0005232790070156503, + "loss": 3.7446, "step": 11950 }, { - "epoch": 1.2915724895059735, - "grad_norm": 0.7692139148712158, - "learning_rate": 0.0005230880293071867, - "loss": 3.7312, + "epoch": 1.2938005390835579, + "grad_norm": 0.637886643409729, + "learning_rate": 0.0005229552077711818, + "loss": 3.7387, "step": 12000 }, { - "epoch": 1.2915724895059735, - "eval_accuracy": 0.3501756536910853, - "eval_loss": 3.6950345039367676, - "eval_runtime": 182.9419, - "eval_samples_per_second": 98.452, - "eval_steps_per_second": 6.155, + "epoch": 1.2938005390835579, + "eval_accuracy": 0.3485640054061331, + "eval_loss": 3.701038122177124, + "eval_runtime": 184.7994, + "eval_samples_per_second": 97.462, + "eval_steps_per_second": 6.093, "step": 12000 }, { - "epoch": 1.2969540415455818, - "grad_norm": 0.5720347762107849, - "learning_rate": 0.0005227647882771253, - "loss": 3.7174, + "epoch": 1.2991913746630728, + "grad_norm": 0.539336621761322, + "learning_rate": 0.0005226314085267134, + "loss": 3.7356, "step": 12050 }, { - "epoch": 1.30233559358519, - "grad_norm": 0.5530905723571777, - "learning_rate": 0.0005224415472470639, - "loss": 3.7105, + "epoch": 1.3045822102425877, + "grad_norm": 0.5483951568603516, + "learning_rate": 0.000522307609282245, + "loss": 3.7429, "step": 12100 }, { - "epoch": 1.3077171456247982, - "grad_norm": 0.5424105525016785, - "learning_rate": 0.0005221183062170024, - "loss": 3.7247, + "epoch": 1.3099730458221024, + "grad_norm": 0.5724830627441406, + "learning_rate": 0.0005219838100377766, + "loss": 3.7593, "step": 12150 }, { - "epoch": 1.3130986976644063, - "grad_norm": 0.5885405540466309, - "learning_rate": 0.0005217950651869409, - "loss": 3.7127, + "epoch": 1.3153638814016173, + "grad_norm": 0.5383995771408081, + "learning_rate": 0.000521660010793308, + "loss": 3.7294, "step": 12200 }, { - "epoch": 1.3184802497040147, - "grad_norm": 0.5528365969657898, - "learning_rate": 0.0005214718241568796, - "loss": 3.7347, + "epoch": 1.320754716981132, + "grad_norm": 0.546576738357544, + "learning_rate": 0.0005213362115488396, + "loss": 3.736, "step": 12250 }, { - "epoch": 1.3238618017436228, - "grad_norm": 0.5199754238128662, - "learning_rate": 0.0005211485831268182, - "loss": 3.7388, + "epoch": 1.326145552560647, + "grad_norm": 0.570923388004303, + "learning_rate": 0.0005210124123043713, + "loss": 3.7464, "step": 12300 }, { - "epoch": 1.329243353783231, - "grad_norm": 0.581535279750824, - "learning_rate": 0.0005208253420967568, - "loss": 3.7209, + "epoch": 1.3315363881401616, + "grad_norm": 0.579967200756073, + "learning_rate": 0.0005206886130599028, + "loss": 3.7331, "step": 12350 }, { - "epoch": 1.3346249058228392, - "grad_norm": 0.614345371723175, - "learning_rate": 0.0005205021010666953, - "loss": 3.7146, + "epoch": 1.3369272237196765, + "grad_norm": 0.6479465961456299, + "learning_rate": 0.0005203648138154344, + "loss": 3.722, "step": 12400 }, { - "epoch": 1.3400064578624475, - "grad_norm": 0.6020660400390625, - "learning_rate": 0.0005201788600366339, - "loss": 3.7209, + "epoch": 1.3423180592991915, + "grad_norm": 0.552336573600769, + "learning_rate": 0.0005200410145709659, + "loss": 3.7365, "step": 12450 }, { - "epoch": 1.3453880099020559, - "grad_norm": 0.5743170976638794, - "learning_rate": 0.0005198556190065725, - "loss": 3.7123, + "epoch": 1.3477088948787062, + "grad_norm": 0.7343564629554749, + "learning_rate": 0.0005197172153264976, + "loss": 3.7177, "step": 12500 }, { - "epoch": 1.350769561941664, - "grad_norm": 0.5332421660423279, - "learning_rate": 0.0005195323779765112, - "loss": 3.7184, + "epoch": 1.353099730458221, + "grad_norm": 0.5686933398246765, + "learning_rate": 0.0005193934160820291, + "loss": 3.7304, "step": 12550 }, { - "epoch": 1.356151113981272, - "grad_norm": 0.5720219612121582, - "learning_rate": 0.0005192091369464497, - "loss": 3.7093, + "epoch": 1.3584905660377358, + "grad_norm": 0.6135874390602112, + "learning_rate": 0.0005190696168375607, + "loss": 3.7254, "step": 12600 }, { - "epoch": 1.3615326660208804, - "grad_norm": 0.5706057548522949, - "learning_rate": 0.0005188858959163882, - "loss": 3.7122, + "epoch": 1.3638814016172507, + "grad_norm": 0.5901519060134888, + "learning_rate": 0.0005187458175930922, + "loss": 3.7145, "step": 12650 }, { - "epoch": 1.3669142180604887, - "grad_norm": 0.5560119152069092, - "learning_rate": 0.0005185626548863269, - "loss": 3.7149, + "epoch": 1.3692722371967654, + "grad_norm": 0.5513310432434082, + "learning_rate": 0.0005184220183486238, + "loss": 3.7122, "step": 12700 }, { - "epoch": 1.3722957701000968, - "grad_norm": 0.6042450666427612, - "learning_rate": 0.0005182394138562654, - "loss": 3.7158, + "epoch": 1.3746630727762803, + "grad_norm": 0.6472775936126709, + "learning_rate": 0.0005180982191041554, + "loss": 3.7296, "step": 12750 }, { - "epoch": 1.3776773221397052, - "grad_norm": 0.6797989010810852, - "learning_rate": 0.0005179161728262041, - "loss": 3.7213, + "epoch": 1.3800539083557952, + "grad_norm": 0.5848823189735413, + "learning_rate": 0.000517774419859687, + "loss": 3.7099, "step": 12800 }, { - "epoch": 1.3830588741793133, - "grad_norm": 0.6140666007995605, - "learning_rate": 0.0005175929317961426, - "loss": 3.7106, + "epoch": 1.38544474393531, + "grad_norm": 0.6691808104515076, + "learning_rate": 0.0005174506206152185, + "loss": 3.7238, "step": 12850 }, { - "epoch": 1.3884404262189216, - "grad_norm": 0.5921536684036255, - "learning_rate": 0.0005172696907660812, - "loss": 3.7036, + "epoch": 1.3908355795148248, + "grad_norm": 0.5118029117584229, + "learning_rate": 0.0005171268213707501, + "loss": 3.7223, "step": 12900 }, { - "epoch": 1.3938219782585297, - "grad_norm": 0.547536313533783, - "learning_rate": 0.0005169464497360198, - "loss": 3.7023, + "epoch": 1.3962264150943398, + "grad_norm": 0.5811614394187927, + "learning_rate": 0.0005168030221262816, + "loss": 3.7143, "step": 12950 }, { - "epoch": 1.399203530298138, - "grad_norm": 0.5648983716964722, - "learning_rate": 0.0005166232087059583, - "loss": 3.7127, + "epoch": 1.4016172506738545, + "grad_norm": 0.5549213886260986, + "learning_rate": 0.0005164792228818132, + "loss": 3.7098, "step": 13000 }, { - "epoch": 1.399203530298138, - "eval_accuracy": 0.35228004282227615, - "eval_loss": 3.673454761505127, - "eval_runtime": 182.9372, - "eval_samples_per_second": 98.455, - "eval_steps_per_second": 6.155, + "epoch": 1.4016172506738545, + "eval_accuracy": 0.35118536497754527, + "eval_loss": 3.678708076477051, + "eval_runtime": 184.9082, + "eval_samples_per_second": 97.405, + "eval_steps_per_second": 6.09, "step": 13000 }, { - "epoch": 1.4045850823377461, - "grad_norm": 0.5507751107215881, - "learning_rate": 0.0005162999676758969, - "loss": 3.7025, + "epoch": 1.4070080862533692, + "grad_norm": 0.5535794496536255, + "learning_rate": 0.0005161554236373448, + "loss": 3.6872, "step": 13050 }, { - "epoch": 1.4099666343773545, - "grad_norm": 0.5451176166534424, - "learning_rate": 0.0005159767266458355, - "loss": 3.7267, + "epoch": 1.412398921832884, + "grad_norm": 0.5400480031967163, + "learning_rate": 0.0005158316243928764, + "loss": 3.7089, "step": 13100 }, { - "epoch": 1.4153481864169626, - "grad_norm": 0.6242395043373108, - "learning_rate": 0.0005156534856157741, - "loss": 3.6888, + "epoch": 1.417789757412399, + "grad_norm": 0.5891815423965454, + "learning_rate": 0.0005155078251484079, + "loss": 3.7138, "step": 13150 }, { - "epoch": 1.420729738456571, - "grad_norm": 0.5988582372665405, - "learning_rate": 0.0005153302445857127, - "loss": 3.7165, + "epoch": 1.4231805929919137, + "grad_norm": 0.5472397208213806, + "learning_rate": 0.0005151840259039395, + "loss": 3.6966, "step": 13200 }, { - "epoch": 1.426111290496179, - "grad_norm": 0.5528572201728821, - "learning_rate": 0.0005150070035556513, - "loss": 3.6995, + "epoch": 1.4285714285714286, + "grad_norm": 0.5532459020614624, + "learning_rate": 0.000514860226659471, + "loss": 3.7152, "step": 13250 }, { - "epoch": 1.4314928425357873, - "grad_norm": 0.581158459186554, - "learning_rate": 0.0005146837625255898, - "loss": 3.7006, + "epoch": 1.4339622641509435, + "grad_norm": 0.5864548683166504, + "learning_rate": 0.0005145364274150027, + "loss": 3.7092, "step": 13300 }, { - "epoch": 1.4368743945753955, - "grad_norm": 0.5531030297279358, - "learning_rate": 0.0005143605214955285, - "loss": 3.6895, + "epoch": 1.4393530997304582, + "grad_norm": 0.5949729681015015, + "learning_rate": 0.0005142126281705343, + "loss": 3.7166, "step": 13350 }, { - "epoch": 1.4422559466150038, - "grad_norm": 0.5417513847351074, - "learning_rate": 0.0005140372804654671, - "loss": 3.6885, + "epoch": 1.444743935309973, + "grad_norm": 0.5387771129608154, + "learning_rate": 0.0005138888289260658, + "loss": 3.7041, "step": 13400 }, { - "epoch": 1.447637498654612, - "grad_norm": 0.5422365665435791, - "learning_rate": 0.0005137140394354056, - "loss": 3.7015, + "epoch": 1.4501347708894878, + "grad_norm": 0.5066041350364685, + "learning_rate": 0.0005135650296815974, + "loss": 3.7209, "step": 13450 }, { - "epoch": 1.4530190506942202, - "grad_norm": 0.5661604404449463, - "learning_rate": 0.0005133907984053442, - "loss": 3.6965, + "epoch": 1.4555256064690028, + "grad_norm": 0.6268380284309387, + "learning_rate": 0.0005132412304371289, + "loss": 3.7135, "step": 13500 }, { - "epoch": 1.4584006027338283, - "grad_norm": 0.581283688545227, - "learning_rate": 0.0005130675573752827, - "loss": 3.7015, + "epoch": 1.4609164420485174, + "grad_norm": 0.5601277351379395, + "learning_rate": 0.0005129174311926605, + "loss": 3.6909, "step": 13550 }, { - "epoch": 1.4637821547734367, - "grad_norm": 0.5587770938873291, - "learning_rate": 0.0005127443163452214, - "loss": 3.7007, + "epoch": 1.4663072776280324, + "grad_norm": 0.552325963973999, + "learning_rate": 0.0005126001079330814, + "loss": 3.7188, "step": 13600 }, { - "epoch": 1.469163706813045, - "grad_norm": 0.5291188955307007, - "learning_rate": 0.00051242107531516, - "loss": 3.6855, + "epoch": 1.4716981132075473, + "grad_norm": 0.48801907896995544, + "learning_rate": 0.000512276308688613, + "loss": 3.7012, "step": 13650 }, { - "epoch": 1.474545258852653, - "grad_norm": 0.5866516828536987, - "learning_rate": 0.0005120978342850986, - "loss": 3.6982, + "epoch": 1.477088948787062, + "grad_norm": 0.5625745058059692, + "learning_rate": 0.0005119525094441446, + "loss": 3.7185, "step": 13700 }, { - "epoch": 1.4799268108922612, - "grad_norm": 0.6090840697288513, - "learning_rate": 0.0005117745932550371, - "loss": 3.7063, + "epoch": 1.482479784366577, + "grad_norm": 0.5748574733734131, + "learning_rate": 0.0005116287101996762, + "loss": 3.6912, "step": 13750 }, { - "epoch": 1.4853083629318695, - "grad_norm": 0.5599469542503357, - "learning_rate": 0.0005114513522249758, - "loss": 3.6826, + "epoch": 1.4878706199460916, + "grad_norm": 0.590937614440918, + "learning_rate": 0.0005113049109552077, + "loss": 3.7156, "step": 13800 }, { - "epoch": 1.4906899149714778, - "grad_norm": 0.5345983505249023, - "learning_rate": 0.0005111281111949143, - "loss": 3.7033, + "epoch": 1.4932614555256065, + "grad_norm": 0.5972180962562561, + "learning_rate": 0.0005109811117107393, + "loss": 3.7, "step": 13850 }, { - "epoch": 1.496071467011086, - "grad_norm": 0.582324206829071, - "learning_rate": 0.0005108048701648528, - "loss": 3.706, + "epoch": 1.4986522911051212, + "grad_norm": 0.5326563715934753, + "learning_rate": 0.0005106573124662708, + "loss": 3.7097, "step": 13900 }, { - "epoch": 1.501453019050694, - "grad_norm": 0.5937202572822571, - "learning_rate": 0.0005104880939553926, - "loss": 3.6788, + "epoch": 1.5040431266846361, + "grad_norm": 0.5362458825111389, + "learning_rate": 0.0005103335132218025, + "loss": 3.699, "step": 13950 }, { - "epoch": 1.5068345710903024, - "grad_norm": 0.5526854395866394, - "learning_rate": 0.0005101648529253313, - "loss": 3.6974, + "epoch": 1.509433962264151, + "grad_norm": 0.5459834337234497, + "learning_rate": 0.000510009713977334, + "loss": 3.6989, "step": 14000 }, { - "epoch": 1.5068345710903024, - "eval_accuracy": 0.3537209974204718, - "eval_loss": 3.6525895595550537, - "eval_runtime": 183.0004, - "eval_samples_per_second": 98.421, - "eval_steps_per_second": 6.153, + "epoch": 1.509433962264151, + "eval_accuracy": 0.353066689730053, + "eval_loss": 3.6584391593933105, + "eval_runtime": 184.8467, + "eval_samples_per_second": 97.437, + "eval_steps_per_second": 6.092, "step": 14000 }, { - "epoch": 1.5122161231299107, - "grad_norm": 0.530371904373169, - "learning_rate": 0.0005098416118952699, - "loss": 3.7083, + "epoch": 1.5148247978436657, + "grad_norm": 0.5656163096427917, + "learning_rate": 0.0005096859147328656, + "loss": 3.6969, "step": 14050 }, { - "epoch": 1.5175976751695188, - "grad_norm": 0.5713452100753784, - "learning_rate": 0.0005095183708652085, - "loss": 3.681, + "epoch": 1.5202156334231804, + "grad_norm": 0.5515555739402771, + "learning_rate": 0.0005093621154883971, + "loss": 3.6973, "step": 14100 }, { - "epoch": 1.5229792272091272, - "grad_norm": 0.5218368768692017, - "learning_rate": 0.000509195129835147, - "loss": 3.6881, + "epoch": 1.5256064690026954, + "grad_norm": 0.5614264607429504, + "learning_rate": 0.0005090383162439288, + "loss": 3.6884, "step": 14150 }, { - "epoch": 1.5283607792487355, - "grad_norm": 0.5329176783561707, - "learning_rate": 0.0005088718888050856, - "loss": 3.6926, + "epoch": 1.5309973045822103, + "grad_norm": 0.6352477669715881, + "learning_rate": 0.0005087145169994602, + "loss": 3.6887, "step": 14200 }, { - "epoch": 1.5337423312883436, - "grad_norm": 0.5380780100822449, - "learning_rate": 0.0005085486477750242, - "loss": 3.705, + "epoch": 1.536388140161725, + "grad_norm": 0.542805552482605, + "learning_rate": 0.0005083907177549918, + "loss": 3.7049, "step": 14250 }, { - "epoch": 1.5391238833279517, - "grad_norm": 0.5449959635734558, - "learning_rate": 0.0005082254067449629, - "loss": 3.6826, + "epoch": 1.54177897574124, + "grad_norm": 0.6061757206916809, + "learning_rate": 0.0005080669185105234, + "loss": 3.7037, "step": 14300 }, { - "epoch": 1.54450543536756, - "grad_norm": 0.6050571203231812, - "learning_rate": 0.0005079021657149014, - "loss": 3.6679, + "epoch": 1.5471698113207548, + "grad_norm": 0.616715669631958, + "learning_rate": 0.000507743119266055, + "loss": 3.6958, "step": 14350 }, { - "epoch": 1.5498869874071683, - "grad_norm": 0.5612210631370544, - "learning_rate": 0.0005075789246848399, - "loss": 3.6984, + "epoch": 1.5525606469002695, + "grad_norm": 0.571155846118927, + "learning_rate": 0.0005074193200215865, + "loss": 3.7111, "step": 14400 }, { - "epoch": 1.5552685394467765, - "grad_norm": 0.5346029996871948, - "learning_rate": 0.0005072556836547785, - "loss": 3.6831, + "epoch": 1.5579514824797842, + "grad_norm": 0.586685299873352, + "learning_rate": 0.0005070955207771181, + "loss": 3.7035, "step": 14450 }, { - "epoch": 1.5606500914863846, - "grad_norm": 0.5634347200393677, - "learning_rate": 0.0005069324426247171, - "loss": 3.6772, + "epoch": 1.5633423180592994, + "grad_norm": 0.6216734051704407, + "learning_rate": 0.0005067717215326498, + "loss": 3.6841, "step": 14500 }, { - "epoch": 1.566031643525993, - "grad_norm": 0.5833031535148621, - "learning_rate": 0.0005066092015946557, - "loss": 3.6852, + "epoch": 1.568733153638814, + "grad_norm": 0.5405490398406982, + "learning_rate": 0.0005064479222881813, + "loss": 3.6997, "step": 14550 }, { - "epoch": 1.5714131955656012, - "grad_norm": 0.5364494919776917, - "learning_rate": 0.0005062859605645943, - "loss": 3.6904, + "epoch": 1.5741239892183287, + "grad_norm": 0.5977158546447754, + "learning_rate": 0.0005061241230437129, + "loss": 3.6862, "step": 14600 }, { - "epoch": 1.5767947476052093, - "grad_norm": 0.592128336429596, - "learning_rate": 0.0005059627195345329, - "loss": 3.6875, + "epoch": 1.5795148247978437, + "grad_norm": 0.5780313014984131, + "learning_rate": 0.0005058003237992444, + "loss": 3.6827, "step": 14650 }, { - "epoch": 1.5821762996448174, - "grad_norm": 0.5206913352012634, - "learning_rate": 0.0005056394785044715, - "loss": 3.6847, + "epoch": 1.5849056603773586, + "grad_norm": 0.5581985116004944, + "learning_rate": 0.000505476524554776, + "loss": 3.6985, "step": 14700 }, { - "epoch": 1.5875578516844258, - "grad_norm": 0.5297030806541443, - "learning_rate": 0.00050531623747441, - "loss": 3.677, + "epoch": 1.5902964959568733, + "grad_norm": 0.5698649883270264, + "learning_rate": 0.0005051527253103076, + "loss": 3.6979, "step": 14750 }, { - "epoch": 1.592939403724034, - "grad_norm": 0.6701345443725586, - "learning_rate": 0.0005049929964443486, - "loss": 3.6901, + "epoch": 1.595687331536388, + "grad_norm": 0.5590009093284607, + "learning_rate": 0.0005048289260658392, + "loss": 3.6692, "step": 14800 }, { - "epoch": 1.5983209557636422, - "grad_norm": 0.5313422083854675, - "learning_rate": 0.0005046697554142871, - "loss": 3.6859, + "epoch": 1.6010781671159031, + "grad_norm": 0.5426591038703918, + "learning_rate": 0.0005045051268213707, + "loss": 3.6912, "step": 14850 }, { - "epoch": 1.6037025078032503, - "grad_norm": 0.5316382646560669, - "learning_rate": 0.0005043465143842258, - "loss": 3.6802, + "epoch": 1.6064690026954178, + "grad_norm": 0.6016737818717957, + "learning_rate": 0.0005041813275769023, + "loss": 3.698, "step": 14900 }, { - "epoch": 1.6090840598428586, - "grad_norm": 0.6596531271934509, - "learning_rate": 0.0005040232733541644, - "loss": 3.6445, + "epoch": 1.6118598382749325, + "grad_norm": 0.5573039650917053, + "learning_rate": 0.0005038575283324338, + "loss": 3.6874, "step": 14950 }, { - "epoch": 1.614465611882467, - "grad_norm": 0.6033098101615906, - "learning_rate": 0.000503700032324103, - "loss": 3.6788, + "epoch": 1.6172506738544474, + "grad_norm": 0.5686517953872681, + "learning_rate": 0.0005035337290879654, + "loss": 3.6871, "step": 15000 }, { - "epoch": 1.614465611882467, - "eval_accuracy": 0.35575758714966643, - "eval_loss": 3.6347877979278564, - "eval_runtime": 183.0426, - "eval_samples_per_second": 98.398, - "eval_steps_per_second": 6.152, + "epoch": 1.6172506738544474, + "eval_accuracy": 0.35506025291567306, + "eval_loss": 3.6408228874206543, + "eval_runtime": 184.8073, + "eval_samples_per_second": 97.458, + "eval_steps_per_second": 6.093, "step": 15000 }, { - "epoch": 1.619847163922075, - "grad_norm": 0.5396302342414856, - "learning_rate": 0.0005033767912940415, - "loss": 3.663, + "epoch": 1.6226415094339623, + "grad_norm": 0.5335454344749451, + "learning_rate": 0.000503209929843497, + "loss": 3.678, "step": 15050 }, { - "epoch": 1.6252287159616834, - "grad_norm": 0.6209707856178284, - "learning_rate": 0.0005030535502639802, - "loss": 3.6696, + "epoch": 1.628032345013477, + "grad_norm": 0.5367480516433716, + "learning_rate": 0.0005028861305990286, + "loss": 3.6716, "step": 15100 }, { - "epoch": 1.6306102680012917, - "grad_norm": 0.5924604535102844, - "learning_rate": 0.0005027303092339187, - "loss": 3.6585, + "epoch": 1.633423180592992, + "grad_norm": 0.5422275066375732, + "learning_rate": 0.0005025623313545601, + "loss": 3.6707, "step": 15150 }, { - "epoch": 1.6359918200408998, - "grad_norm": 0.5263779163360596, - "learning_rate": 0.0005024070682038573, - "loss": 3.6734, + "epoch": 1.6388140161725069, + "grad_norm": 0.5667644739151001, + "learning_rate": 0.0005022385321100917, + "loss": 3.6751, "step": 15200 }, { - "epoch": 1.641373372080508, - "grad_norm": 0.5559638142585754, - "learning_rate": 0.0005020838271737959, - "loss": 3.6798, + "epoch": 1.6442048517520216, + "grad_norm": 0.6019392609596252, + "learning_rate": 0.0005019147328656232, + "loss": 3.6686, "step": 15250 }, { - "epoch": 1.6467549241201163, - "grad_norm": 0.5919946432113647, - "learning_rate": 0.0005017605861437344, - "loss": 3.6702, + "epoch": 1.6495956873315363, + "grad_norm": 0.566340982913971, + "learning_rate": 0.0005015909336211549, + "loss": 3.674, "step": 15300 }, { - "epoch": 1.6521364761597246, - "grad_norm": 0.4996275007724762, - "learning_rate": 0.0005014373451136731, - "loss": 3.673, + "epoch": 1.6549865229110512, + "grad_norm": 0.5398353338241577, + "learning_rate": 0.0005012671343766864, + "loss": 3.6976, "step": 15350 }, { - "epoch": 1.6575180281993327, - "grad_norm": 0.5767028331756592, - "learning_rate": 0.0005011141040836116, - "loss": 3.6692, + "epoch": 1.6603773584905661, + "grad_norm": 0.5944126844406128, + "learning_rate": 0.000500943335132218, + "loss": 3.6878, "step": 15400 }, { - "epoch": 1.6628995802389408, - "grad_norm": 0.5547932386398315, - "learning_rate": 0.0005007908630535503, - "loss": 3.665, + "epoch": 1.6657681940700808, + "grad_norm": 0.544411301612854, + "learning_rate": 0.0005006195358877495, + "loss": 3.6535, "step": 15450 }, { - "epoch": 1.6682811322785491, - "grad_norm": 0.5808051228523254, - "learning_rate": 0.0005004676220234888, - "loss": 3.6697, + "epoch": 1.6711590296495957, + "grad_norm": 0.6025227904319763, + "learning_rate": 0.0005002957366432812, + "loss": 3.6718, "step": 15500 }, { - "epoch": 1.6736626843181575, - "grad_norm": 0.6720907688140869, - "learning_rate": 0.0005001443809934273, - "loss": 3.666, + "epoch": 1.6765498652291106, + "grad_norm": 0.5519585013389587, + "learning_rate": 0.0004999719373988127, + "loss": 3.6706, "step": 15550 }, { - "epoch": 1.6790442363577656, - "grad_norm": 0.5560029149055481, - "learning_rate": 0.000499821139963366, - "loss": 3.66, + "epoch": 1.6819407008086253, + "grad_norm": 0.5432153344154358, + "learning_rate": 0.0004996481381543442, + "loss": 3.6583, "step": 15600 }, { - "epoch": 1.6844257883973737, - "grad_norm": 0.5708025693893433, - "learning_rate": 0.0004994978989333045, - "loss": 3.6729, + "epoch": 1.68733153638814, + "grad_norm": 0.569072961807251, + "learning_rate": 0.0004993243389098758, + "loss": 3.6736, "step": 15650 }, { - "epoch": 1.689807340436982, - "grad_norm": 0.6271914839744568, - "learning_rate": 0.0004991746579032431, - "loss": 3.6725, + "epoch": 1.692722371967655, + "grad_norm": 0.5824419260025024, + "learning_rate": 0.0004990070156502968, + "loss": 3.6625, "step": 15700 }, { - "epoch": 1.6951888924765903, - "grad_norm": 0.5649659037590027, - "learning_rate": 0.0004988514168731817, - "loss": 3.6506, + "epoch": 1.6981132075471699, + "grad_norm": 0.5825706720352173, + "learning_rate": 0.0004986832164058284, + "loss": 3.6896, "step": 15750 }, { - "epoch": 1.7005704445161984, - "grad_norm": 0.5502942204475403, - "learning_rate": 0.0004985281758431204, - "loss": 3.68, + "epoch": 1.7035040431266846, + "grad_norm": 0.5170381665229797, + "learning_rate": 0.0004983594171613599, + "loss": 3.6834, "step": 15800 }, { - "epoch": 1.7059519965558065, - "grad_norm": 0.6230396628379822, - "learning_rate": 0.0004982049348130589, - "loss": 3.6764, + "epoch": 1.7088948787061995, + "grad_norm": 0.5656709671020508, + "learning_rate": 0.0004980356179168915, + "loss": 3.6781, "step": 15850 }, { - "epoch": 1.7113335485954149, - "grad_norm": 0.5738170742988586, - "learning_rate": 0.0004978816937829975, - "loss": 3.6683, + "epoch": 1.7142857142857144, + "grad_norm": 0.5774093270301819, + "learning_rate": 0.000497711818672423, + "loss": 3.679, "step": 15900 }, { - "epoch": 1.7167151006350232, - "grad_norm": 0.5674835443496704, - "learning_rate": 0.000497558452752936, - "loss": 3.6641, + "epoch": 1.719676549865229, + "grad_norm": 0.5535004734992981, + "learning_rate": 0.0004973880194279547, + "loss": 3.664, "step": 15950 }, { - "epoch": 1.7220966526746313, - "grad_norm": 0.5687111616134644, - "learning_rate": 0.0004972416765434759, - "loss": 3.653, + "epoch": 1.7250673854447438, + "grad_norm": 0.6128283143043518, + "learning_rate": 0.0004970642201834862, + "loss": 3.6625, "step": 16000 }, { - "epoch": 1.7220966526746313, - "eval_accuracy": 0.35766868279343533, - "eval_loss": 3.6169068813323975, - "eval_runtime": 182.8519, - "eval_samples_per_second": 98.5, - "eval_steps_per_second": 6.158, + "epoch": 1.7250673854447438, + "eval_accuracy": 0.3567382881150799, + "eval_loss": 3.622256278991699, + "eval_runtime": 185.2729, + "eval_samples_per_second": 97.213, + "eval_steps_per_second": 6.078, "step": 16000 }, { - "epoch": 1.7274782047142396, - "grad_norm": 0.5726053714752197, - "learning_rate": 0.0004969184355134145, - "loss": 3.6704, + "epoch": 1.7304582210242587, + "grad_norm": 0.5429639220237732, + "learning_rate": 0.0004967404209390178, + "loss": 3.6633, "step": 16050 }, { - "epoch": 1.732859756753848, - "grad_norm": 0.5218805074691772, - "learning_rate": 0.0004965951944833531, - "loss": 3.6687, + "epoch": 1.7358490566037736, + "grad_norm": 0.5845425128936768, + "learning_rate": 0.0004964166216945493, + "loss": 3.6505, "step": 16100 }, { - "epoch": 1.738241308793456, - "grad_norm": 0.5829153060913086, - "learning_rate": 0.0004962719534532916, - "loss": 3.6701, + "epoch": 1.7412398921832883, + "grad_norm": 0.519659698009491, + "learning_rate": 0.000496092822450081, + "loss": 3.6709, "step": 16150 }, { - "epoch": 1.7436228608330642, - "grad_norm": 0.6053098440170288, - "learning_rate": 0.0004959487124232302, - "loss": 3.6489, + "epoch": 1.7466307277628033, + "grad_norm": 0.5584535002708435, + "learning_rate": 0.0004957690232056125, + "loss": 3.6862, "step": 16200 }, { - "epoch": 1.7490044128726725, - "grad_norm": 0.5329374670982361, - "learning_rate": 0.0004956254713931688, - "loss": 3.6494, + "epoch": 1.7520215633423182, + "grad_norm": 0.5469822883605957, + "learning_rate": 0.0004954452239611441, + "loss": 3.6593, "step": 16250 }, { - "epoch": 1.7543859649122808, - "grad_norm": 0.5787749886512756, - "learning_rate": 0.0004953022303631074, - "loss": 3.6523, + "epoch": 1.7574123989218329, + "grad_norm": 0.5633559823036194, + "learning_rate": 0.0004951214247166756, + "loss": 3.6798, "step": 16300 }, { - "epoch": 1.759767516951889, - "grad_norm": 0.5446376800537109, - "learning_rate": 0.0004949789893330459, - "loss": 3.6551, + "epoch": 1.7628032345013476, + "grad_norm": 0.593555748462677, + "learning_rate": 0.0004947976254722072, + "loss": 3.6635, "step": 16350 }, { - "epoch": 1.765149068991497, - "grad_norm": 0.6261696815490723, - "learning_rate": 0.0004946557483029846, - "loss": 3.6529, + "epoch": 1.7681940700808625, + "grad_norm": 0.5291847586631775, + "learning_rate": 0.0004944738262277387, + "loss": 3.6539, "step": 16400 }, { - "epoch": 1.7705306210311054, - "grad_norm": 0.5923592448234558, - "learning_rate": 0.0004943325072729231, - "loss": 3.6742, + "epoch": 1.7735849056603774, + "grad_norm": 0.5443708300590515, + "learning_rate": 0.0004941500269832703, + "loss": 3.66, "step": 16450 }, { - "epoch": 1.7759121730707137, - "grad_norm": 0.5514686703681946, - "learning_rate": 0.0004940092662428617, - "loss": 3.6655, + "epoch": 1.778975741239892, + "grad_norm": 0.5486942529678345, + "learning_rate": 0.0004938262277388019, + "loss": 3.6489, "step": 16500 }, { - "epoch": 1.7812937251103218, - "grad_norm": 0.5481582283973694, - "learning_rate": 0.0004936860252128003, - "loss": 3.6613, + "epoch": 1.784366576819407, + "grad_norm": 0.6374784708023071, + "learning_rate": 0.0004935024284943335, + "loss": 3.676, "step": 16550 }, { - "epoch": 1.78667527714993, - "grad_norm": 0.5708692073822021, - "learning_rate": 0.0004933627841827388, - "loss": 3.6644, + "epoch": 1.789757412398922, + "grad_norm": 0.6883034706115723, + "learning_rate": 0.000493178629249865, + "loss": 3.6612, "step": 16600 }, { - "epoch": 1.7920568291895382, - "grad_norm": 0.5547276139259338, - "learning_rate": 0.0004930395431526775, - "loss": 3.6443, + "epoch": 1.7951482479784366, + "grad_norm": 0.5932786464691162, + "learning_rate": 0.0004928548300053966, + "loss": 3.6656, "step": 16650 }, { - "epoch": 1.7974383812291466, - "grad_norm": 0.60475754737854, - "learning_rate": 0.0004927163021226161, - "loss": 3.666, + "epoch": 1.8005390835579513, + "grad_norm": 0.5369443297386169, + "learning_rate": 0.0004925310307609282, + "loss": 3.6662, "step": 16700 }, { - "epoch": 1.8028199332687547, - "grad_norm": 0.568871796131134, - "learning_rate": 0.0004923930610925547, - "loss": 3.6573, + "epoch": 1.8059299191374663, + "grad_norm": 0.5238076448440552, + "learning_rate": 0.0004922072315164598, + "loss": 3.6535, "step": 16750 }, { - "epoch": 1.8082014853083628, - "grad_norm": 0.5564674139022827, - "learning_rate": 0.0004920698200624932, - "loss": 3.6515, + "epoch": 1.8113207547169812, + "grad_norm": 0.500116765499115, + "learning_rate": 0.0004918834322719913, + "loss": 3.6552, "step": 16800 }, { - "epoch": 1.813583037347971, - "grad_norm": 0.5341878533363342, - "learning_rate": 0.0004917465790324317, - "loss": 3.6652, + "epoch": 1.8167115902964959, + "grad_norm": 0.580346941947937, + "learning_rate": 0.0004915596330275229, + "loss": 3.6617, "step": 16850 }, { - "epoch": 1.8189645893875794, - "grad_norm": 0.58695387840271, - "learning_rate": 0.0004914233380023704, - "loss": 3.6546, + "epoch": 1.8221024258760108, + "grad_norm": 0.5278408527374268, + "learning_rate": 0.0004912423097679439, + "loss": 3.6465, "step": 16900 }, { - "epoch": 1.8243461414271875, - "grad_norm": 0.5584468245506287, - "learning_rate": 0.0004911000969723089, - "loss": 3.631, + "epoch": 1.8274932614555257, + "grad_norm": 0.5278293490409851, + "learning_rate": 0.0004909185105234754, + "loss": 3.6773, "step": 16950 }, { - "epoch": 1.8297276934667959, - "grad_norm": 0.5749080777168274, - "learning_rate": 0.0004907768559422476, - "loss": 3.6484, + "epoch": 1.8328840970350404, + "grad_norm": 0.5422157645225525, + "learning_rate": 0.000490594711279007, + "loss": 3.6619, "step": 17000 }, { - "epoch": 1.8297276934667959, - "eval_accuracy": 0.35881909956961505, - "eval_loss": 3.601506233215332, - "eval_runtime": 182.9699, - "eval_samples_per_second": 98.437, - "eval_steps_per_second": 6.154, + "epoch": 1.8328840970350404, + "eval_accuracy": 0.35810253377447854, + "eval_loss": 3.610804796218872, + "eval_runtime": 184.8314, + "eval_samples_per_second": 97.446, + "eval_steps_per_second": 6.092, "step": 17000 }, { - "epoch": 1.8351092455064042, - "grad_norm": 0.5957604050636292, - "learning_rate": 0.0004904536149121861, - "loss": 3.6388, + "epoch": 1.838274932614555, + "grad_norm": 0.531470775604248, + "learning_rate": 0.0004902709120345385, + "loss": 3.6441, "step": 17050 }, { - "epoch": 1.8404907975460123, - "grad_norm": 0.5906072854995728, - "learning_rate": 0.0004901303738821248, - "loss": 3.6523, + "epoch": 1.8436657681940702, + "grad_norm": 0.5433018803596497, + "learning_rate": 0.00048994711279007, + "loss": 3.6581, "step": 17100 }, { - "epoch": 1.8458723495856204, - "grad_norm": 0.6116198301315308, - "learning_rate": 0.0004898071328520633, - "loss": 3.6424, + "epoch": 1.849056603773585, + "grad_norm": 0.530763566493988, + "learning_rate": 0.0004896233135456017, + "loss": 3.6545, "step": 17150 }, { - "epoch": 1.8512539016252287, - "grad_norm": 0.5727099776268005, - "learning_rate": 0.0004894838918220019, - "loss": 3.6625, + "epoch": 1.8544474393530996, + "grad_norm": 0.5608590245246887, + "learning_rate": 0.0004892995143011333, + "loss": 3.6389, "step": 17200 }, { - "epoch": 1.856635453664837, - "grad_norm": 0.5920504331588745, - "learning_rate": 0.0004891606507919405, - "loss": 3.6366, + "epoch": 1.8598382749326146, + "grad_norm": 0.6097438335418701, + "learning_rate": 0.0004889757150566648, + "loss": 3.6605, "step": 17250 }, { - "epoch": 1.8620170057044452, - "grad_norm": 0.6015959978103638, - "learning_rate": 0.000488837409761879, - "loss": 3.6506, + "epoch": 1.8652291105121295, + "grad_norm": 0.6334707736968994, + "learning_rate": 0.0004886519158121964, + "loss": 3.6499, "step": 17300 }, { - "epoch": 1.8673985577440533, - "grad_norm": 0.5476948022842407, - "learning_rate": 0.0004885141687318177, - "loss": 3.654, + "epoch": 1.8706199460916442, + "grad_norm": 0.6429612040519714, + "learning_rate": 0.000488328116567728, + "loss": 3.6542, "step": 17350 }, { - "epoch": 1.8727801097836616, - "grad_norm": 0.5550304651260376, - "learning_rate": 0.00048819092770175623, - "loss": 3.6382, + "epoch": 1.8760107816711589, + "grad_norm": 0.6209458708763123, + "learning_rate": 0.0004880043173232595, + "loss": 3.6473, "step": 17400 }, { - "epoch": 1.87816166182327, - "grad_norm": 0.5779985189437866, - "learning_rate": 0.0004878676866716948, - "loss": 3.6562, + "epoch": 1.881401617250674, + "grad_norm": 0.5378085374832153, + "learning_rate": 0.0004876805180787911, + "loss": 3.6667, "step": 17450 }, { - "epoch": 1.883543213862878, - "grad_norm": 0.5878141522407532, - "learning_rate": 0.00048754444564163337, - "loss": 3.6424, + "epoch": 1.8867924528301887, + "grad_norm": 0.5677717924118042, + "learning_rate": 0.0004873567188343227, + "loss": 3.6589, "step": 17500 }, { - "epoch": 1.8889247659024861, - "grad_norm": 0.5703890323638916, - "learning_rate": 0.000487221204611572, - "loss": 3.6491, + "epoch": 1.8921832884097034, + "grad_norm": 0.5877004861831665, + "learning_rate": 0.0004870329195898542, + "loss": 3.6427, "step": 17550 }, { - "epoch": 1.8943063179420945, - "grad_norm": 0.5712037086486816, - "learning_rate": 0.00048689796358151056, - "loss": 3.6276, + "epoch": 1.8975741239892183, + "grad_norm": 0.5844641923904419, + "learning_rate": 0.00048670912034538583, + "loss": 3.6495, "step": 17600 }, { - "epoch": 1.8996878699817028, - "grad_norm": 0.5311342477798462, - "learning_rate": 0.00048657472255144915, - "loss": 3.6438, + "epoch": 1.9029649595687332, + "grad_norm": 0.6069989204406738, + "learning_rate": 0.0004863853211009174, + "loss": 3.6266, "step": 17650 }, { - "epoch": 1.905069422021311, - "grad_norm": 0.5727416276931763, - "learning_rate": 0.00048625148152138775, - "loss": 3.6264, + "epoch": 1.908355795148248, + "grad_norm": 0.5461699962615967, + "learning_rate": 0.000486061521856449, + "loss": 3.6356, "step": 17700 }, { - "epoch": 1.910450974060919, - "grad_norm": 0.6044231653213501, - "learning_rate": 0.0004859282404913263, - "loss": 3.6242, + "epoch": 1.9137466307277629, + "grad_norm": 0.6169970035552979, + "learning_rate": 0.00048573772261198054, + "loss": 3.6391, "step": 17750 }, { - "epoch": 1.9158325261005273, - "grad_norm": 0.5727190971374512, - "learning_rate": 0.0004856049994612649, - "loss": 3.6462, + "epoch": 1.9191374663072778, + "grad_norm": 0.5463764071464539, + "learning_rate": 0.00048541392336751214, + "loss": 3.658, "step": 17800 }, { - "epoch": 1.9212140781401357, - "grad_norm": 0.5734631419181824, - "learning_rate": 0.00048528175843120353, - "loss": 3.6472, + "epoch": 1.9245283018867925, + "grad_norm": 0.5543783903121948, + "learning_rate": 0.0004850901241230437, + "loss": 3.6572, "step": 17850 }, { - "epoch": 1.9265956301797438, - "grad_norm": 0.5388041734695435, - "learning_rate": 0.0004849585174011421, - "loss": 3.6489, + "epoch": 1.9299191374663072, + "grad_norm": 0.5729919672012329, + "learning_rate": 0.0004847663248785753, + "loss": 3.6619, "step": 17900 }, { - "epoch": 1.931977182219352, - "grad_norm": 0.5606278777122498, - "learning_rate": 0.00048463527637108067, - "loss": 3.6294, + "epoch": 1.935309973045822, + "grad_norm": 0.5639930963516235, + "learning_rate": 0.0004844425256341068, + "loss": 3.6524, "step": 17950 }, { - "epoch": 1.9373587342589604, - "grad_norm": 0.5878045558929443, - "learning_rate": 0.0004843120353410192, - "loss": 3.6472, + "epoch": 1.940700808625337, + "grad_norm": 0.5668043494224548, + "learning_rate": 0.00048411872638963834, + "loss": 3.6364, "step": 18000 }, { - "epoch": 1.9373587342589604, - "eval_accuracy": 0.36079995036736084, - "eval_loss": 3.5877466201782227, - "eval_runtime": 182.9553, - "eval_samples_per_second": 98.445, - "eval_steps_per_second": 6.155, + "epoch": 1.940700808625337, + "eval_accuracy": 0.3596066157004944, + "eval_loss": 3.5932109355926514, + "eval_runtime": 185.1478, + "eval_samples_per_second": 97.279, + "eval_steps_per_second": 6.082, "step": 18000 }, { - "epoch": 1.9427402862985685, - "grad_norm": 0.5944440960884094, - "learning_rate": 0.00048399525913155907, - "loss": 3.6349, + "epoch": 1.9460916442048517, + "grad_norm": 0.6137953996658325, + "learning_rate": 0.00048379492714516995, + "loss": 3.6263, "step": 18050 }, { - "epoch": 1.9481218383381766, - "grad_norm": 0.6114388704299927, - "learning_rate": 0.0004836720181014976, - "loss": 3.6319, + "epoch": 1.9514824797843666, + "grad_norm": 0.603476881980896, + "learning_rate": 0.0004834711279007015, + "loss": 3.6348, "step": 18100 }, { - "epoch": 1.953503390377785, - "grad_norm": 0.5699650645256042, - "learning_rate": 0.0004833487770714362, - "loss": 3.6309, + "epoch": 1.9568733153638815, + "grad_norm": 0.5930576920509338, + "learning_rate": 0.0004831473286562331, + "loss": 3.6418, "step": 18150 }, { - "epoch": 1.9588849424173933, - "grad_norm": 0.5740841031074524, - "learning_rate": 0.00048302553604137485, - "loss": 3.6278, + "epoch": 1.9622641509433962, + "grad_norm": 0.5806794762611389, + "learning_rate": 0.00048282352941176465, + "loss": 3.6427, "step": 18200 }, { - "epoch": 1.9642664944570014, - "grad_norm": 0.6211947798728943, - "learning_rate": 0.0004827022950113134, - "loss": 3.6346, + "epoch": 1.967654986522911, + "grad_norm": 0.5229198932647705, + "learning_rate": 0.00048249973016729626, + "loss": 3.6336, "step": 18250 }, { - "epoch": 1.9696480464966095, - "grad_norm": 0.5401277542114258, - "learning_rate": 0.000482379053981252, - "loss": 3.6603, + "epoch": 1.9730458221024259, + "grad_norm": 0.5625728964805603, + "learning_rate": 0.0004821759309228278, + "loss": 3.6622, "step": 18300 }, { - "epoch": 1.9750295985362178, - "grad_norm": 0.5557017922401428, - "learning_rate": 0.0004820558129511906, - "loss": 3.629, + "epoch": 1.9784366576819408, + "grad_norm": 0.577113151550293, + "learning_rate": 0.00048185213167835936, + "loss": 3.6386, "step": 18350 }, { - "epoch": 1.9804111505758262, - "grad_norm": 0.592074453830719, - "learning_rate": 0.0004817325719211291, - "loss": 3.6235, + "epoch": 1.9838274932614555, + "grad_norm": 0.5825108289718628, + "learning_rate": 0.00048152833243389096, + "loss": 3.6421, "step": 18400 }, { - "epoch": 1.9857927026154343, - "grad_norm": 0.600532591342926, - "learning_rate": 0.0004814093308910677, - "loss": 3.6253, + "epoch": 1.9892183288409704, + "grad_norm": 0.6418988108634949, + "learning_rate": 0.0004812045331894225, + "loss": 3.6495, "step": 18450 }, { - "epoch": 1.9911742546550424, - "grad_norm": 0.4943503141403198, - "learning_rate": 0.00048108608986100637, - "loss": 3.6225, + "epoch": 1.9946091644204853, + "grad_norm": 0.6308207511901855, + "learning_rate": 0.0004808807339449541, + "loss": 3.6288, "step": 18500 }, { - "epoch": 1.9965558066946507, - "grad_norm": 0.5403441190719604, - "learning_rate": 0.0004807628488309449, - "loss": 3.6333, + "epoch": 2.0, + "grad_norm": 1.1038312911987305, + "learning_rate": 0.0004805569347004856, + "loss": 3.6314, "step": 18550 }, { - "epoch": 2.001937358734259, - "grad_norm": 0.5593751668930054, - "learning_rate": 0.0004804396078008835, - "loss": 3.5902, + "epoch": 2.0053908355795147, + "grad_norm": 0.5954450368881226, + "learning_rate": 0.0004802331354560173, + "loss": 3.5285, "step": 18600 }, { - "epoch": 2.007318910773867, - "grad_norm": 0.5596261024475098, - "learning_rate": 0.00048011636677082204, - "loss": 3.5438, + "epoch": 2.01078167115903, + "grad_norm": 0.5570014119148254, + "learning_rate": 0.00047990933621154877, + "loss": 3.5394, "step": 18650 }, { - "epoch": 2.0127004628134753, - "grad_norm": 0.5620816946029663, - "learning_rate": 0.00047979312574076064, - "loss": 3.5367, + "epoch": 2.0161725067385445, + "grad_norm": 0.6168981194496155, + "learning_rate": 0.0004795855369670804, + "loss": 3.5386, "step": 18700 }, { - "epoch": 2.018082014853084, - "grad_norm": 0.6553834080696106, - "learning_rate": 0.0004794698847106992, - "loss": 3.5336, + "epoch": 2.0215633423180592, + "grad_norm": 0.5821147561073303, + "learning_rate": 0.0004792617377226119, + "loss": 3.5445, "step": 18750 }, { - "epoch": 2.023463566892692, - "grad_norm": 0.5942727327346802, - "learning_rate": 0.0004791466436806378, - "loss": 3.5532, + "epoch": 2.026954177897574, + "grad_norm": 0.6035953164100647, + "learning_rate": 0.0004789379384781435, + "loss": 3.566, "step": 18800 }, { - "epoch": 2.0288451189323, - "grad_norm": 0.563507616519928, - "learning_rate": 0.0004788234026505764, - "loss": 3.565, + "epoch": 2.032345013477089, + "grad_norm": 0.547264039516449, + "learning_rate": 0.0004786141392336751, + "loss": 3.563, "step": 18850 }, { - "epoch": 2.034226670971908, - "grad_norm": 0.5451712608337402, - "learning_rate": 0.00047850016162051496, - "loss": 3.55, + "epoch": 2.0377358490566038, + "grad_norm": 0.5701025724411011, + "learning_rate": 0.00047829033998920663, + "loss": 3.5472, "step": 18900 }, { - "epoch": 2.0396082230115167, - "grad_norm": 0.6260777711868286, - "learning_rate": 0.00047817692059045356, - "loss": 3.5395, + "epoch": 2.0431266846361185, + "grad_norm": 0.5930619239807129, + "learning_rate": 0.00047796654074473824, + "loss": 3.5514, "step": 18950 }, { - "epoch": 2.044989775051125, - "grad_norm": 0.6237614750862122, - "learning_rate": 0.00047785367956039215, - "loss": 3.5601, + "epoch": 2.0485175202156336, + "grad_norm": 0.5958006978034973, + "learning_rate": 0.0004776427415002698, + "loss": 3.5822, "step": 19000 }, { - "epoch": 2.044989775051125, - "eval_accuracy": 0.36222699739591624, - "eval_loss": 3.5780997276306152, - "eval_runtime": 182.9631, - "eval_samples_per_second": 98.441, - "eval_steps_per_second": 6.154, + "epoch": 2.0485175202156336, + "eval_accuracy": 0.3610058475897693, + "eval_loss": 3.584773540496826, + "eval_runtime": 184.7103, + "eval_samples_per_second": 97.509, + "eval_steps_per_second": 6.096, "step": 19000 }, { - "epoch": 2.050371327090733, - "grad_norm": 0.5645211935043335, - "learning_rate": 0.00047753043853033075, - "loss": 3.5712, + "epoch": 2.0539083557951483, + "grad_norm": 0.5540789365768433, + "learning_rate": 0.0004773189422558014, + "loss": 3.5571, "step": 19050 }, { - "epoch": 2.055752879130341, - "grad_norm": 0.550080418586731, - "learning_rate": 0.00047720719750026934, + "epoch": 2.059299191374663, + "grad_norm": 0.6005356907844543, + "learning_rate": 0.00047699514301133294, "loss": 3.5519, "step": 19100 }, { - "epoch": 2.0611344311699495, - "grad_norm": 0.5775645971298218, - "learning_rate": 0.00047688395647020793, - "loss": 3.5345, + "epoch": 2.0646900269541777, + "grad_norm": 0.6260417103767395, + "learning_rate": 0.00047667134376686455, + "loss": 3.5664, "step": 19150 }, { - "epoch": 2.0665159832095576, - "grad_norm": 0.6265087127685547, - "learning_rate": 0.0004765607154401465, - "loss": 3.5604, + "epoch": 2.070080862533693, + "grad_norm": 0.6222960948944092, + "learning_rate": 0.0004763475445223961, + "loss": 3.5777, "step": 19200 }, { - "epoch": 2.0718975352491658, - "grad_norm": 0.645406186580658, - "learning_rate": 0.00047623747441008507, - "loss": 3.5486, + "epoch": 2.0754716981132075, + "grad_norm": 0.5388205647468567, + "learning_rate": 0.0004760237452779276, + "loss": 3.5605, "step": 19250 }, { - "epoch": 2.0772790872887743, - "grad_norm": 0.5847898721694946, - "learning_rate": 0.0004759142333800236, - "loss": 3.5457, + "epoch": 2.0808625336927222, + "grad_norm": 0.6028093099594116, + "learning_rate": 0.0004756999460334592, + "loss": 3.5498, "step": 19300 }, { - "epoch": 2.0826606393283824, - "grad_norm": 0.5650346875190735, - "learning_rate": 0.00047559099234996226, - "loss": 3.5555, + "epoch": 2.0862533692722374, + "grad_norm": 0.646430492401123, + "learning_rate": 0.00047537614678899075, + "loss": 3.5661, "step": 19350 }, { - "epoch": 2.0880421913679905, - "grad_norm": 0.599783182144165, - "learning_rate": 0.00047526775131990085, - "loss": 3.5459, + "epoch": 2.091644204851752, + "grad_norm": 0.5180575251579285, + "learning_rate": 0.00047505234754452235, + "loss": 3.5445, "step": 19400 }, { - "epoch": 2.0934237434075986, - "grad_norm": 0.5775169730186462, - "learning_rate": 0.0004749445102898394, - "loss": 3.5612, + "epoch": 2.0970350404312668, + "grad_norm": 0.5926779508590698, + "learning_rate": 0.0004747285483000539, + "loss": 3.5611, "step": 19450 }, { - "epoch": 2.098805295447207, - "grad_norm": 0.6146634817123413, - "learning_rate": 0.000474621269259778, - "loss": 3.5666, + "epoch": 2.1024258760107815, + "grad_norm": 0.5802668929100037, + "learning_rate": 0.0004744047490555855, + "loss": 3.5625, "step": 19500 }, { - "epoch": 2.1041868474868153, - "grad_norm": 0.6270310878753662, - "learning_rate": 0.0004742980282297166, - "loss": 3.5408, + "epoch": 2.1078167115902966, + "grad_norm": 0.6308825016021729, + "learning_rate": 0.00047408094981111706, + "loss": 3.5505, "step": 19550 }, { - "epoch": 2.1095683995264234, - "grad_norm": 0.517265796661377, - "learning_rate": 0.0004739747871996551, - "loss": 3.5576, + "epoch": 2.1132075471698113, + "grad_norm": 0.5490583181381226, + "learning_rate": 0.00047375715056664866, + "loss": 3.5526, "step": 19600 }, { - "epoch": 2.1149499515660315, - "grad_norm": 0.6480297446250916, - "learning_rate": 0.00047365154616959377, - "loss": 3.542, + "epoch": 2.118598382749326, + "grad_norm": 0.5867042541503906, + "learning_rate": 0.0004734333513221802, + "loss": 3.5601, "step": 19650 }, { - "epoch": 2.12033150360564, - "grad_norm": 0.5682874321937561, - "learning_rate": 0.00047332830513953237, - "loss": 3.5669, + "epoch": 2.123989218328841, + "grad_norm": 0.5740089416503906, + "learning_rate": 0.00047310955207771177, + "loss": 3.5641, "step": 19700 }, { - "epoch": 2.125713055645248, - "grad_norm": 0.6224113702774048, - "learning_rate": 0.0004730050641094709, - "loss": 3.5485, + "epoch": 2.129380053908356, + "grad_norm": 0.5596190690994263, + "learning_rate": 0.00047278575283324337, + "loss": 3.5679, "step": 19750 }, { - "epoch": 2.1310946076848563, - "grad_norm": 0.554499089717865, - "learning_rate": 0.0004726818230794095, - "loss": 3.5338, + "epoch": 2.1347708894878705, + "grad_norm": 0.6093752980232239, + "learning_rate": 0.0004724619535887749, + "loss": 3.5522, "step": 19800 }, { - "epoch": 2.1364761597244644, - "grad_norm": 0.6027452349662781, - "learning_rate": 0.0004723650468699493, - "loss": 3.5481, + "epoch": 2.1401617250673857, + "grad_norm": 0.581826388835907, + "learning_rate": 0.0004721381543443065, + "loss": 3.5798, "step": 19850 }, { - "epoch": 2.141857711764073, - "grad_norm": 0.6047434210777283, - "learning_rate": 0.0004720418058398879, - "loss": 3.5465, + "epoch": 2.1455525606469004, + "grad_norm": 0.5723430514335632, + "learning_rate": 0.000471814355099838, + "loss": 3.5818, "step": 19900 }, { - "epoch": 2.147239263803681, - "grad_norm": 0.5787104964256287, - "learning_rate": 0.00047171856480982644, - "loss": 3.5562, + "epoch": 2.150943396226415, + "grad_norm": 0.6087849736213684, + "learning_rate": 0.0004714905558553697, + "loss": 3.5759, "step": 19950 }, { - "epoch": 2.152620815843289, - "grad_norm": 0.5642232298851013, - "learning_rate": 0.0004713953237797651, - "loss": 3.5694, + "epoch": 2.1563342318059298, + "grad_norm": 0.5303805470466614, + "learning_rate": 0.0004711667566109012, + "loss": 3.5592, "step": 20000 }, { - "epoch": 2.152620815843289, - "eval_accuracy": 0.36314848253747084, - "eval_loss": 3.5687355995178223, - "eval_runtime": 179.7649, - "eval_samples_per_second": 100.192, - "eval_steps_per_second": 6.264, + "epoch": 2.1563342318059298, + "eval_accuracy": 0.3620970485420901, + "eval_loss": 3.578047037124634, + "eval_runtime": 185.1715, + "eval_samples_per_second": 97.267, + "eval_steps_per_second": 6.081, "step": 20000 }, { - "epoch": 2.1580023678828972, - "grad_norm": 0.6349261999130249, - "learning_rate": 0.0004710720827497037, - "loss": 3.5747, + "epoch": 2.161725067385445, + "grad_norm": 0.6045820116996765, + "learning_rate": 0.00047084295736643273, + "loss": 3.5599, "step": 20050 }, { - "epoch": 2.163383919922506, - "grad_norm": 0.5958331823348999, - "learning_rate": 0.00047074884171964223, - "loss": 3.5547, + "epoch": 2.1671159029649596, + "grad_norm": 0.6460957527160645, + "learning_rate": 0.00047051915812196433, + "loss": 3.5676, "step": 20100 }, { - "epoch": 2.168765471962114, - "grad_norm": 0.6347177028656006, - "learning_rate": 0.0004704256006895808, - "loss": 3.561, + "epoch": 2.1725067385444743, + "grad_norm": 0.6051047444343567, + "learning_rate": 0.0004701953588774959, + "loss": 3.5624, "step": 20150 }, { - "epoch": 2.174147024001722, - "grad_norm": 0.6787843108177185, - "learning_rate": 0.00047010235965951936, - "loss": 3.5584, + "epoch": 2.177897574123989, + "grad_norm": 0.5932462215423584, + "learning_rate": 0.0004698715596330275, + "loss": 3.5778, "step": 20200 }, { - "epoch": 2.1795285760413305, - "grad_norm": 0.5568081140518188, - "learning_rate": 0.00046977911862945796, - "loss": 3.5667, + "epoch": 2.183288409703504, + "grad_norm": 0.5538517236709595, + "learning_rate": 0.00046954776038855904, + "loss": 3.5628, "step": 20250 }, { - "epoch": 2.1849101280809387, - "grad_norm": 0.6003414988517761, - "learning_rate": 0.0004694558775993966, - "loss": 3.5515, + "epoch": 2.188679245283019, + "grad_norm": 0.5807755589485168, + "learning_rate": 0.00046922396114409064, + "loss": 3.5691, "step": 20300 }, { - "epoch": 2.1902916801205468, - "grad_norm": 0.5988909602165222, - "learning_rate": 0.00046913263656933515, - "loss": 3.5563, + "epoch": 2.1940700808625335, + "grad_norm": 0.5965390801429749, + "learning_rate": 0.0004689001618996222, + "loss": 3.55, "step": 20350 }, { - "epoch": 2.195673232160155, - "grad_norm": 0.6274018287658691, - "learning_rate": 0.00046880939553927374, - "loss": 3.55, + "epoch": 2.1994609164420487, + "grad_norm": 0.5739687085151672, + "learning_rate": 0.0004685763626551538, + "loss": 3.5578, "step": 20400 }, { - "epoch": 2.2010547841997634, - "grad_norm": 0.5922993421554565, - "learning_rate": 0.00046848615450921234, - "loss": 3.5385, + "epoch": 2.2048517520215634, + "grad_norm": 0.5837062001228333, + "learning_rate": 0.00046825256341068535, + "loss": 3.5594, "step": 20450 }, { - "epoch": 2.2064363362393715, - "grad_norm": 0.6406957507133484, - "learning_rate": 0.0004681629134791509, - "loss": 3.5292, + "epoch": 2.210242587601078, + "grad_norm": 0.5664048790931702, + "learning_rate": 0.0004679287641662169, + "loss": 3.5722, "step": 20500 }, { - "epoch": 2.2118178882789796, - "grad_norm": 0.5569520592689514, - "learning_rate": 0.00046783967244908947, - "loss": 3.5393, + "epoch": 2.215633423180593, + "grad_norm": 0.5342917442321777, + "learning_rate": 0.0004676049649217485, + "loss": 3.5671, "step": 20550 }, { - "epoch": 2.2171994403185877, - "grad_norm": 0.5793364644050598, - "learning_rate": 0.0004675164314190281, - "loss": 3.5537, + "epoch": 2.221024258760108, + "grad_norm": 0.5454385876655579, + "learning_rate": 0.00046728116567728, + "loss": 3.56, "step": 20600 }, { - "epoch": 2.2225809923581963, - "grad_norm": 0.5888959765434265, - "learning_rate": 0.00046719319038896666, - "loss": 3.5403, + "epoch": 2.2264150943396226, + "grad_norm": 0.5746397972106934, + "learning_rate": 0.0004669573664328116, + "loss": 3.5615, "step": 20650 }, { - "epoch": 2.2279625443978044, - "grad_norm": 0.5922018885612488, - "learning_rate": 0.00046686994935890526, - "loss": 3.547, + "epoch": 2.2318059299191373, + "grad_norm": 0.6558805108070374, + "learning_rate": 0.00046663356718834316, + "loss": 3.5595, "step": 20700 }, { - "epoch": 2.2333440964374125, - "grad_norm": 0.6159331798553467, - "learning_rate": 0.0004665467083288438, - "loss": 3.5509, + "epoch": 2.2371967654986524, + "grad_norm": 0.5640301704406738, + "learning_rate": 0.00046630976794387476, + "loss": 3.5584, "step": 20750 }, { - "epoch": 2.2387256484770206, - "grad_norm": 0.6433561444282532, - "learning_rate": 0.0004662234672987824, - "loss": 3.5546, + "epoch": 2.242587601078167, + "grad_norm": 0.6599795818328857, + "learning_rate": 0.0004659859686994063, + "loss": 3.5644, "step": 20800 }, { - "epoch": 2.244107200516629, - "grad_norm": 0.5780196189880371, - "learning_rate": 0.00046590022626872104, - "loss": 3.5505, + "epoch": 2.247978436657682, + "grad_norm": 0.5787530541419983, + "learning_rate": 0.0004656621694549379, + "loss": 3.5593, "step": 20850 }, { - "epoch": 2.2494887525562373, - "grad_norm": 0.6302474141120911, - "learning_rate": 0.0004655769852386596, - "loss": 3.5528, + "epoch": 2.2533692722371965, + "grad_norm": 0.5842300653457642, + "learning_rate": 0.00046533837021046947, + "loss": 3.5535, "step": 20900 }, { - "epoch": 2.2548703045958454, - "grad_norm": 0.6163661479949951, - "learning_rate": 0.0004652537442085982, - "loss": 3.5506, + "epoch": 2.2587601078167117, + "grad_norm": 0.5889685750007629, + "learning_rate": 0.0004650210469508904, + "loss": 3.5653, "step": 20950 }, { - "epoch": 2.2602518566354535, - "grad_norm": 0.5856783390045166, - "learning_rate": 0.00046493050317853677, - "loss": 3.5485, + "epoch": 2.2641509433962264, + "grad_norm": 0.5827546715736389, + "learning_rate": 0.00046469724770642197, + "loss": 3.5472, "step": 21000 }, { - "epoch": 2.2602518566354535, - "eval_accuracy": 0.364305092528256, - "eval_loss": 3.5584065914154053, - "eval_runtime": 179.7572, - "eval_samples_per_second": 100.196, - "eval_steps_per_second": 6.264, + "epoch": 2.2641509433962264, + "eval_accuracy": 0.3631977022956508, + "eval_loss": 3.566005229949951, + "eval_runtime": 184.999, + "eval_samples_per_second": 97.357, + "eval_steps_per_second": 6.087, "step": 21000 }, { - "epoch": 2.265633408675062, - "grad_norm": 0.5490982532501221, - "learning_rate": 0.0004646072621484753, - "loss": 3.5765, + "epoch": 2.269541778975741, + "grad_norm": 0.5650562047958374, + "learning_rate": 0.0004643734484619536, + "loss": 3.5607, "step": 21050 }, { - "epoch": 2.27101496071467, - "grad_norm": 0.6259151697158813, - "learning_rate": 0.0004642840211184139, - "loss": 3.5557, + "epoch": 2.274932614555256, + "grad_norm": 0.5835627913475037, + "learning_rate": 0.0004640496492174851, + "loss": 3.5659, "step": 21100 }, { - "epoch": 2.2763965127542782, - "grad_norm": 0.6125593185424805, - "learning_rate": 0.00046396078008835255, - "loss": 3.562, + "epoch": 2.280323450134771, + "grad_norm": 0.621948778629303, + "learning_rate": 0.00046372584997301673, + "loss": 3.5788, "step": 21150 }, { - "epoch": 2.281778064793887, - "grad_norm": 0.596523642539978, - "learning_rate": 0.0004636375390582911, - "loss": 3.544, + "epoch": 2.2857142857142856, + "grad_norm": 0.5712414383888245, + "learning_rate": 0.0004634020507285483, + "loss": 3.5315, "step": 21200 }, { - "epoch": 2.287159616833495, - "grad_norm": 0.6259929537773132, - "learning_rate": 0.0004633142980282297, - "loss": 3.5504, + "epoch": 2.2911051212938007, + "grad_norm": 0.5780783295631409, + "learning_rate": 0.0004630782514840798, + "loss": 3.5586, "step": 21250 }, { - "epoch": 2.292541168873103, - "grad_norm": 0.5609055161476135, - "learning_rate": 0.00046299105699816823, - "loss": 3.5574, + "epoch": 2.2964959568733154, + "grad_norm": 0.5562280416488647, + "learning_rate": 0.0004627544522396114, + "loss": 3.559, "step": 21300 }, { - "epoch": 2.297922720912711, - "grad_norm": 0.6183776259422302, - "learning_rate": 0.0004626678159681068, - "loss": 3.5536, + "epoch": 2.30188679245283, + "grad_norm": 0.5436495542526245, + "learning_rate": 0.00046243065299514293, + "loss": 3.5524, "step": 21350 }, { - "epoch": 2.303304272952319, - "grad_norm": 0.531791090965271, - "learning_rate": 0.0004623445749380454, - "loss": 3.5414, + "epoch": 2.3072776280323453, + "grad_norm": 0.5707684755325317, + "learning_rate": 0.00046210685375067454, + "loss": 3.5663, "step": 21400 }, { - "epoch": 2.3086858249919278, - "grad_norm": 0.558052122592926, - "learning_rate": 0.000462021333907984, - "loss": 3.538, + "epoch": 2.31266846361186, + "grad_norm": 0.5568186044692993, + "learning_rate": 0.0004617830545062061, + "loss": 3.5645, "step": 21450 }, { - "epoch": 2.314067377031536, - "grad_norm": 0.5366657972335815, - "learning_rate": 0.0004616980928779226, - "loss": 3.5508, + "epoch": 2.3180592991913747, + "grad_norm": 0.5847433805465698, + "learning_rate": 0.0004614592552617377, + "loss": 3.5663, "step": 21500 }, { - "epoch": 2.319448929071144, - "grad_norm": 0.5605040192604065, - "learning_rate": 0.0004613748518478612, - "loss": 3.5527, + "epoch": 2.3234501347708894, + "grad_norm": 0.551858127117157, + "learning_rate": 0.00046113545601726924, + "loss": 3.5486, "step": 21550 }, { - "epoch": 2.3248304811107525, - "grad_norm": 0.5950835347175598, - "learning_rate": 0.00046105161081779974, - "loss": 3.553, + "epoch": 2.3288409703504045, + "grad_norm": 0.5552904605865479, + "learning_rate": 0.00046081165677280085, + "loss": 3.5698, "step": 21600 }, { - "epoch": 2.3302120331503606, - "grad_norm": 0.5673235058784485, - "learning_rate": 0.00046072836978773834, - "loss": 3.5484, + "epoch": 2.334231805929919, + "grad_norm": 0.5644253492355347, + "learning_rate": 0.0004604878575283324, + "loss": 3.564, "step": 21650 }, { - "epoch": 2.3355935851899687, - "grad_norm": 0.5962451100349426, - "learning_rate": 0.000460405128757677, - "loss": 3.5665, + "epoch": 2.339622641509434, + "grad_norm": 0.5640168786048889, + "learning_rate": 0.00046016405828386395, + "loss": 3.5443, "step": 21700 }, { - "epoch": 2.340975137229577, - "grad_norm": 0.5817105770111084, - "learning_rate": 0.0004600818877276155, - "loss": 3.5456, + "epoch": 2.3450134770889486, + "grad_norm": 0.5596201419830322, + "learning_rate": 0.00045984025903939555, + "loss": 3.5594, "step": 21750 }, { - "epoch": 2.3463566892691854, - "grad_norm": 0.5566921234130859, - "learning_rate": 0.0004597586466975541, - "loss": 3.5419, + "epoch": 2.3504043126684637, + "grad_norm": 0.6457816958427429, + "learning_rate": 0.0004595164597949271, + "loss": 3.5811, "step": 21800 }, { - "epoch": 2.3517382413087935, - "grad_norm": 0.5889357924461365, - "learning_rate": 0.00045943540566749266, - "loss": 3.553, + "epoch": 2.3557951482479784, + "grad_norm": 0.610068142414093, + "learning_rate": 0.0004591926605504587, + "loss": 3.5691, "step": 21850 }, { - "epoch": 2.3571197933484016, - "grad_norm": 0.5869264006614685, - "learning_rate": 0.00045911216463743126, - "loss": 3.549, + "epoch": 2.361185983827493, + "grad_norm": 0.6661171317100525, + "learning_rate": 0.0004588688613059902, + "loss": 3.5453, "step": 21900 }, { - "epoch": 2.3625013453880097, - "grad_norm": 0.5697734951972961, - "learning_rate": 0.00045878892360736985, - "loss": 3.5583, + "epoch": 2.3665768194070083, + "grad_norm": 0.5725359916687012, + "learning_rate": 0.00045854506206152186, + "loss": 3.5598, "step": 21950 }, { - "epoch": 2.3678828974276183, - "grad_norm": 0.6061686873435974, - "learning_rate": 0.00045846568257730845, - "loss": 3.5613, + "epoch": 2.371967654986523, + "grad_norm": 0.5913756489753723, + "learning_rate": 0.00045822126281705336, + "loss": 3.5526, "step": 22000 }, { - "epoch": 2.3678828974276183, - "eval_accuracy": 0.36498678074640406, - "eval_loss": 3.547973871231079, - "eval_runtime": 180.0761, - "eval_samples_per_second": 100.019, - "eval_steps_per_second": 6.253, + "epoch": 2.371967654986523, + "eval_accuracy": 0.36443677982828715, + "eval_loss": 3.5550854206085205, + "eval_runtime": 184.9317, + "eval_samples_per_second": 97.393, + "eval_steps_per_second": 6.089, "step": 22000 }, { - "epoch": 2.3732644494672264, - "grad_norm": 0.5746018290519714, - "learning_rate": 0.00045814244154724704, - "loss": 3.5496, + "epoch": 2.3773584905660377, + "grad_norm": 0.6312258839607239, + "learning_rate": 0.00045789746357258497, + "loss": 3.5527, "step": 22050 }, { - "epoch": 2.3786460015068345, - "grad_norm": 0.6272563338279724, - "learning_rate": 0.00045781920051718563, - "loss": 3.5636, + "epoch": 2.382749326145553, + "grad_norm": 0.5353261828422546, + "learning_rate": 0.0004575736643281165, + "loss": 3.5654, "step": 22100 }, { - "epoch": 2.384027553546443, - "grad_norm": 0.5796659588813782, - "learning_rate": 0.0004574959594871242, - "loss": 3.5412, + "epoch": 2.3881401617250675, + "grad_norm": 0.5589489340782166, + "learning_rate": 0.00045724986508364807, + "loss": 3.5687, "step": 22150 }, { - "epoch": 2.389409105586051, - "grad_norm": 0.5960429906845093, - "learning_rate": 0.00045717271845706277, - "loss": 3.5478, + "epoch": 2.393530997304582, + "grad_norm": 0.6058165431022644, + "learning_rate": 0.00045692606583917967, + "loss": 3.5535, "step": 22200 }, { - "epoch": 2.3947906576256592, - "grad_norm": 0.6166843175888062, - "learning_rate": 0.0004568494774270013, - "loss": 3.5584, + "epoch": 2.398921832884097, + "grad_norm": 0.5829679369926453, + "learning_rate": 0.0004566087425796006, + "loss": 3.5606, "step": 22250 }, { - "epoch": 2.4001722096652673, - "grad_norm": 0.5762814879417419, - "learning_rate": 0.00045652623639693996, - "loss": 3.537, + "epoch": 2.404312668463612, + "grad_norm": 0.6008812189102173, + "learning_rate": 0.0004562849433351322, + "loss": 3.5468, "step": 22300 }, { - "epoch": 2.4055537617048754, - "grad_norm": 0.5935600399971008, - "learning_rate": 0.00045620299536687855, - "loss": 3.5506, + "epoch": 2.4097035040431267, + "grad_norm": 0.5908893346786499, + "learning_rate": 0.0004559611440906638, + "loss": 3.5721, "step": 22350 }, { - "epoch": 2.410935313744484, - "grad_norm": 0.5715270042419434, - "learning_rate": 0.0004558797543368171, - "loss": 3.5349, + "epoch": 2.4150943396226414, + "grad_norm": 0.5803592205047607, + "learning_rate": 0.00045563734484619533, + "loss": 3.5554, "step": 22400 }, { - "epoch": 2.416316865784092, - "grad_norm": 0.5746564269065857, - "learning_rate": 0.0004555565133067557, - "loss": 3.5393, + "epoch": 2.420485175202156, + "grad_norm": 0.5852487087249756, + "learning_rate": 0.0004553135456017269, + "loss": 3.5566, "step": 22450 }, { - "epoch": 2.4216984178237, - "grad_norm": 0.6002961993217468, - "learning_rate": 0.0004552332722766943, - "loss": 3.5703, + "epoch": 2.4258760107816713, + "grad_norm": 0.618068516254425, + "learning_rate": 0.0004549897463572585, + "loss": 3.566, "step": 22500 }, { - "epoch": 2.4270799698633088, - "grad_norm": 0.6364290118217468, - "learning_rate": 0.0004549100312466328, - "loss": 3.535, + "epoch": 2.431266846361186, + "grad_norm": 0.5683373808860779, + "learning_rate": 0.00045466594711279, + "loss": 3.5557, "step": 22550 }, { - "epoch": 2.432461521902917, - "grad_norm": 0.5654012560844421, - "learning_rate": 0.0004545867902165715, - "loss": 3.5494, + "epoch": 2.4366576819407006, + "grad_norm": 0.5734935998916626, + "learning_rate": 0.00045434214786832164, + "loss": 3.5868, "step": 22600 }, { - "epoch": 2.437843073942525, - "grad_norm": 0.6237375736236572, - "learning_rate": 0.00045426354918651007, - "loss": 3.5464, + "epoch": 2.442048517520216, + "grad_norm": 0.5584690570831299, + "learning_rate": 0.00045401834862385314, + "loss": 3.5469, "step": 22650 }, { - "epoch": 2.443224625982133, - "grad_norm": 0.5933444499969482, - "learning_rate": 0.0004539467729770499, - "loss": 3.5417, + "epoch": 2.4474393530997305, + "grad_norm": 0.5913657546043396, + "learning_rate": 0.00045369454937938474, + "loss": 3.5375, "step": 22700 }, { - "epoch": 2.4486061780217416, - "grad_norm": 0.5668746829032898, - "learning_rate": 0.0004536235319469884, - "loss": 3.5352, + "epoch": 2.452830188679245, + "grad_norm": 0.5663593411445618, + "learning_rate": 0.0004533707501349163, + "loss": 3.5422, "step": 22750 }, { - "epoch": 2.4539877300613497, - "grad_norm": 0.6203579902648926, - "learning_rate": 0.000453300290916927, - "loss": 3.574, + "epoch": 2.4582210242587603, + "grad_norm": 0.5713441371917725, + "learning_rate": 0.0004530469508904479, + "loss": 3.5542, "step": 22800 }, { - "epoch": 2.459369282100958, - "grad_norm": 0.5573844909667969, - "learning_rate": 0.0004529770498868656, - "loss": 3.5529, + "epoch": 2.463611859838275, + "grad_norm": 0.5829697847366333, + "learning_rate": 0.00045272315164597945, + "loss": 3.5379, "step": 22850 }, { - "epoch": 2.464750834140566, - "grad_norm": 0.6508901715278625, - "learning_rate": 0.00045265380885680414, - "loss": 3.531, + "epoch": 2.4690026954177897, + "grad_norm": 0.5785667300224304, + "learning_rate": 0.000452399352401511, + "loss": 3.5559, "step": 22900 }, { - "epoch": 2.4701323861801745, - "grad_norm": 0.5782315135002136, - "learning_rate": 0.0004523305678267428, - "loss": 3.5689, + "epoch": 2.4743935309973044, + "grad_norm": 0.6283608675003052, + "learning_rate": 0.0004520755531570426, + "loss": 3.5512, "step": 22950 }, { - "epoch": 2.4755139382197826, - "grad_norm": 0.6106559038162231, - "learning_rate": 0.0004520073267966814, - "loss": 3.5521, + "epoch": 2.4797843665768196, + "grad_norm": 0.5529918670654297, + "learning_rate": 0.00045175175391257415, + "loss": 3.544, "step": 23000 }, { - "epoch": 2.4755139382197826, - "eval_accuracy": 0.3663403784227969, - "eval_loss": 3.5380020141601562, - "eval_runtime": 179.612, - "eval_samples_per_second": 100.277, - "eval_steps_per_second": 6.269, + "epoch": 2.4797843665768196, + "eval_accuracy": 0.3652747108991124, + "eval_loss": 3.542670726776123, + "eval_runtime": 184.9689, + "eval_samples_per_second": 97.373, + "eval_steps_per_second": 6.088, "step": 23000 }, { - "epoch": 2.4808954902593907, - "grad_norm": 0.5985417366027832, - "learning_rate": 0.00045168408576661993, - "loss": 3.5309, + "epoch": 2.4851752021563343, + "grad_norm": 0.5712370872497559, + "learning_rate": 0.00045142795466810576, + "loss": 3.5568, "step": 23050 }, { - "epoch": 2.4862770422989993, - "grad_norm": 0.5568486452102661, - "learning_rate": 0.0004513608447365585, - "loss": 3.5504, + "epoch": 2.490566037735849, + "grad_norm": 0.589131236076355, + "learning_rate": 0.0004511041554236373, + "loss": 3.5592, "step": 23100 }, { - "epoch": 2.4916585943386074, - "grad_norm": 0.607467770576477, - "learning_rate": 0.00045103760370649706, - "loss": 3.5585, + "epoch": 2.4959568733153636, + "grad_norm": 0.5965885519981384, + "learning_rate": 0.0004507803561791689, + "loss": 3.5562, "step": 23150 }, { - "epoch": 2.4970401463782155, - "grad_norm": 0.6122623085975647, - "learning_rate": 0.00045071436267643566, - "loss": 3.55, + "epoch": 2.501347708894879, + "grad_norm": 0.591253936290741, + "learning_rate": 0.00045045655693470046, + "loss": 3.5516, "step": 23200 }, { - "epoch": 2.5024216984178236, - "grad_norm": 0.6299822330474854, - "learning_rate": 0.0004503911216463743, - "loss": 3.5361, + "epoch": 2.5067385444743935, + "grad_norm": 0.575657308101654, + "learning_rate": 0.00045013275769023207, + "loss": 3.5523, "step": 23250 }, { - "epoch": 2.5078032504574317, - "grad_norm": 0.5975040793418884, - "learning_rate": 0.00045006788061631285, - "loss": 3.5452, + "epoch": 2.512129380053908, + "grad_norm": 0.549932599067688, + "learning_rate": 0.00044980895844576356, + "loss": 3.5368, "step": 23300 }, { - "epoch": 2.5131848024970402, - "grad_norm": 0.6091236472129822, - "learning_rate": 0.00044974463958625144, - "loss": 3.5612, + "epoch": 2.5175202156334233, + "grad_norm": 0.5937458276748657, + "learning_rate": 0.0004494851592012951, + "loss": 3.544, "step": 23350 }, { - "epoch": 2.5185663545366483, - "grad_norm": 0.5954560041427612, - "learning_rate": 0.00044942139855619004, - "loss": 3.5283, + "epoch": 2.522911051212938, + "grad_norm": 0.576596736907959, + "learning_rate": 0.0004491613599568267, + "loss": 3.5412, "step": 23400 }, { - "epoch": 2.5239479065762565, - "grad_norm": 0.5859009623527527, - "learning_rate": 0.0004490981575261286, - "loss": 3.537, + "epoch": 2.5283018867924527, + "grad_norm": 0.5818652510643005, + "learning_rate": 0.00044883756071235827, + "loss": 3.5493, "step": 23450 }, { - "epoch": 2.529329458615865, - "grad_norm": 0.5438385009765625, - "learning_rate": 0.0004487749164960672, - "loss": 3.5416, + "epoch": 2.533692722371968, + "grad_norm": 0.5370622873306274, + "learning_rate": 0.0004485137614678899, + "loss": 3.5539, "step": 23500 }, { - "epoch": 2.534711010655473, - "grad_norm": 0.5849915146827698, - "learning_rate": 0.0004484516754660058, - "loss": 3.544, + "epoch": 2.5390835579514826, + "grad_norm": 0.5701831579208374, + "learning_rate": 0.0004481899622234214, + "loss": 3.5573, "step": 23550 }, { - "epoch": 2.540092562695081, - "grad_norm": 0.6110696792602539, - "learning_rate": 0.00044812843443594436, - "loss": 3.5456, + "epoch": 2.5444743935309972, + "grad_norm": 0.6162128448486328, + "learning_rate": 0.00044786616297895303, + "loss": 3.558, "step": 23600 }, { - "epoch": 2.5454741147346893, - "grad_norm": 0.6262888312339783, - "learning_rate": 0.00044780519340588296, - "loss": 3.5357, + "epoch": 2.5498652291105124, + "grad_norm": 0.547835648059845, + "learning_rate": 0.0004475423637344846, + "loss": 3.5503, "step": 23650 }, { - "epoch": 2.550855666774298, - "grad_norm": 0.6227126717567444, - "learning_rate": 0.0004474819523758215, - "loss": 3.5471, + "epoch": 2.555256064690027, + "grad_norm": 0.5706479549407959, + "learning_rate": 0.00044721856449001613, + "loss": 3.5553, "step": 23700 }, { - "epoch": 2.556237218813906, - "grad_norm": 0.607038140296936, - "learning_rate": 0.0004471587113457601, - "loss": 3.5478, + "epoch": 2.560646900269542, + "grad_norm": 0.5650632381439209, + "learning_rate": 0.00044689476524554774, + "loss": 3.5479, "step": 23750 }, { - "epoch": 2.561618770853514, - "grad_norm": 0.5638023614883423, - "learning_rate": 0.00044683547031569874, - "loss": 3.5658, + "epoch": 2.5660377358490565, + "grad_norm": 0.5929921865463257, + "learning_rate": 0.0004465709660010793, + "loss": 3.5634, "step": 23800 }, { - "epoch": 2.567000322893122, - "grad_norm": 0.6163343787193298, - "learning_rate": 0.0004465122292856373, - "loss": 3.5602, + "epoch": 2.571428571428571, + "grad_norm": 0.6945008039474487, + "learning_rate": 0.0004462471667566109, + "loss": 3.5678, "step": 23850 }, { - "epoch": 2.5723818749327307, - "grad_norm": 0.5720522999763489, - "learning_rate": 0.0004461889882555759, - "loss": 3.5562, + "epoch": 2.5768194070080863, + "grad_norm": 0.5972580313682556, + "learning_rate": 0.00044592336751214244, + "loss": 3.5669, "step": 23900 }, { - "epoch": 2.577763426972339, - "grad_norm": 0.636634111404419, - "learning_rate": 0.00044586574722551447, - "loss": 3.5685, + "epoch": 2.582210242587601, + "grad_norm": 0.6179361343383789, + "learning_rate": 0.00044559956826767405, + "loss": 3.5411, "step": 23950 }, { - "epoch": 2.583144979011947, - "grad_norm": 0.5891180634498596, - "learning_rate": 0.000445542506195453, - "loss": 3.5455, + "epoch": 2.5876010781671157, + "grad_norm": 0.538170337677002, + "learning_rate": 0.00044527576902320554, + "loss": 3.5367, "step": 24000 }, { - "epoch": 2.583144979011947, - "eval_accuracy": 0.36704879525134726, - "eval_loss": 3.526322364807129, - "eval_runtime": 179.7285, - "eval_samples_per_second": 100.212, - "eval_steps_per_second": 6.265, + "epoch": 2.5876010781671157, + "eval_accuracy": 0.3663797107681857, + "eval_loss": 3.533842086791992, + "eval_runtime": 184.8514, + "eval_samples_per_second": 97.435, + "eval_steps_per_second": 6.091, "step": 24000 }, { - "epoch": 2.5885265310515555, - "grad_norm": 0.6043258905410767, - "learning_rate": 0.0004452192651653916, - "loss": 3.5164, + "epoch": 2.592991913746631, + "grad_norm": 0.6353852152824402, + "learning_rate": 0.00044495196977873715, + "loss": 3.5368, "step": 24050 }, { - "epoch": 2.5939080830911636, - "grad_norm": 0.6400256752967834, - "learning_rate": 0.00044489602413533025, - "loss": 3.5409, + "epoch": 2.5983827493261455, + "grad_norm": 0.5333035588264465, + "learning_rate": 0.0004446281705342687, + "loss": 3.5384, "step": 24100 }, { - "epoch": 2.5992896351307717, - "grad_norm": 0.6273722648620605, - "learning_rate": 0.0004445727831052688, - "loss": 3.5391, + "epoch": 2.6037735849056602, + "grad_norm": 0.5871497988700867, + "learning_rate": 0.00044430437128980025, + "loss": 3.5439, "step": 24150 }, { - "epoch": 2.60467118717038, - "grad_norm": 0.6210452318191528, - "learning_rate": 0.0004442495420752074, - "loss": 3.5436, + "epoch": 2.6091644204851754, + "grad_norm": 0.6557198166847229, + "learning_rate": 0.00044398057204533185, + "loss": 3.5563, "step": 24200 }, { - "epoch": 2.610052739209988, - "grad_norm": 0.5660796761512756, - "learning_rate": 0.00044392630104514593, - "loss": 3.5256, + "epoch": 2.61455525606469, + "grad_norm": 0.5965690016746521, + "learning_rate": 0.0004436567728008634, + "loss": 3.5611, "step": 24250 }, { - "epoch": 2.6154342912495965, - "grad_norm": 0.5474342107772827, - "learning_rate": 0.0004436030600150845, - "loss": 3.526, + "epoch": 2.6199460916442048, + "grad_norm": 0.5756832957267761, + "learning_rate": 0.000443332973556395, + "loss": 3.5472, "step": 24300 }, { - "epoch": 2.6208158432892046, - "grad_norm": 0.5688294768333435, - "learning_rate": 0.0004432798189850231, - "loss": 3.5341, + "epoch": 2.62533692722372, + "grad_norm": 0.5462123155593872, + "learning_rate": 0.00044300917431192656, + "loss": 3.5405, "step": 24350 }, { - "epoch": 2.6261973953288127, - "grad_norm": 0.5909520387649536, - "learning_rate": 0.0004429565779549617, - "loss": 3.5394, + "epoch": 2.6307277628032346, + "grad_norm": 0.56703120470047, + "learning_rate": 0.00044268537506745816, + "loss": 3.5526, "step": 24400 }, { - "epoch": 2.6315789473684212, - "grad_norm": 0.5881052613258362, - "learning_rate": 0.0004426333369249003, - "loss": 3.544, + "epoch": 2.6361185983827493, + "grad_norm": 0.5480033755302429, + "learning_rate": 0.0004423615758229897, + "loss": 3.5252, "step": 24450 }, { - "epoch": 2.6369604994080293, - "grad_norm": 0.5401953458786011, - "learning_rate": 0.0004423100958948389, - "loss": 3.5518, + "epoch": 2.641509433962264, + "grad_norm": 0.5893705487251282, + "learning_rate": 0.0004420377765785213, + "loss": 3.5317, "step": 24500 }, { - "epoch": 2.6423420514476375, - "grad_norm": 0.6648808121681213, - "learning_rate": 0.00044198685486477744, - "loss": 3.5483, + "epoch": 2.6469002695417787, + "grad_norm": 0.6192256212234497, + "learning_rate": 0.00044171397733405287, + "loss": 3.5497, "step": 24550 }, { - "epoch": 2.6477236034872456, - "grad_norm": 0.5619612336158752, - "learning_rate": 0.00044166361383471604, - "loss": 3.5388, + "epoch": 2.652291105121294, + "grad_norm": 0.5576728582382202, + "learning_rate": 0.00044139017808958437, + "loss": 3.5294, "step": 24600 }, { - "epoch": 2.653105155526854, - "grad_norm": 0.5457707643508911, - "learning_rate": 0.0004413403728046547, - "loss": 3.5491, + "epoch": 2.6576819407008085, + "grad_norm": 0.5619601607322693, + "learning_rate": 0.00044106637884511597, + "loss": 3.5434, "step": 24650 }, { - "epoch": 2.658486707566462, - "grad_norm": 0.6164374947547913, - "learning_rate": 0.0004410171317745932, - "loss": 3.5383, + "epoch": 2.6630727762803232, + "grad_norm": 0.5698447227478027, + "learning_rate": 0.0004407425796006475, + "loss": 3.5462, "step": 24700 }, { - "epoch": 2.6638682596060703, - "grad_norm": 0.5915449261665344, - "learning_rate": 0.0004406938907445318, - "loss": 3.5412, + "epoch": 2.6684636118598384, + "grad_norm": 0.585787832736969, + "learning_rate": 0.00044041878035617913, + "loss": 3.5486, "step": 24750 }, { - "epoch": 2.6692498116456784, - "grad_norm": 0.6016857028007507, - "learning_rate": 0.00044037064971447036, - "loss": 3.5451, + "epoch": 2.673854447439353, + "grad_norm": 0.5916891694068909, + "learning_rate": 0.0004400949811117107, + "loss": 3.5477, "step": 24800 }, { - "epoch": 2.674631363685287, - "grad_norm": 0.5480591058731079, - "learning_rate": 0.00044004740868440896, - "loss": 3.538, + "epoch": 2.6792452830188678, + "grad_norm": 0.5576286315917969, + "learning_rate": 0.0004397711818672423, + "loss": 3.5487, "step": 24850 }, { - "epoch": 2.680012915724895, - "grad_norm": 0.5656848549842834, - "learning_rate": 0.00043972416765434755, - "loss": 3.5566, + "epoch": 2.684636118598383, + "grad_norm": 0.5873832702636719, + "learning_rate": 0.00043944738262277383, + "loss": 3.5629, "step": 24900 }, { - "epoch": 2.685394467764503, - "grad_norm": 0.6277399063110352, - "learning_rate": 0.00043940092662428615, - "loss": 3.5456, + "epoch": 2.6900269541778976, + "grad_norm": 0.5807034969329834, + "learning_rate": 0.00043912358337830544, + "loss": 3.5528, "step": 24950 }, { - "epoch": 2.6907760198041117, - "grad_norm": 0.6056678295135498, - "learning_rate": 0.00043907768559422474, - "loss": 3.5473, + "epoch": 2.6954177897574123, + "grad_norm": 0.5750570297241211, + "learning_rate": 0.000438799784133837, + "loss": 3.5471, "step": 25000 }, { - "epoch": 2.6907760198041117, - "eval_accuracy": 0.36839630836602244, - "eval_loss": 3.5200185775756836, - "eval_runtime": 179.6896, - "eval_samples_per_second": 100.234, - "eval_steps_per_second": 6.266, + "epoch": 2.6954177897574123, + "eval_accuracy": 0.3674458129034214, + "eval_loss": 3.522408962249756, + "eval_runtime": 184.8934, + "eval_samples_per_second": 97.413, + "eval_steps_per_second": 6.09, "step": 25000 }, { - "epoch": 2.69615757184372, - "grad_norm": 0.635338544845581, - "learning_rate": 0.00043875444456416334, - "loss": 3.5616, + "epoch": 2.7008086253369274, + "grad_norm": 0.5960842967033386, + "learning_rate": 0.00043847598488936854, + "loss": 3.5387, "step": 25050 }, { - "epoch": 2.701539123883328, - "grad_norm": 0.6054742932319641, - "learning_rate": 0.0004384312035341019, - "loss": 3.5651, + "epoch": 2.706199460916442, + "grad_norm": 0.5355721712112427, + "learning_rate": 0.00043815218564490014, + "loss": 3.5478, "step": 25100 }, { - "epoch": 2.706920675922936, - "grad_norm": 0.6213764548301697, - "learning_rate": 0.00043810796250404047, - "loss": 3.5406, + "epoch": 2.711590296495957, + "grad_norm": 0.5989283323287964, + "learning_rate": 0.0004378283864004317, + "loss": 3.5244, "step": 25150 }, { - "epoch": 2.712302227962544, - "grad_norm": 0.5552936792373657, - "learning_rate": 0.000437784721473979, - "loss": 3.5186, + "epoch": 2.7169811320754715, + "grad_norm": 0.6410152912139893, + "learning_rate": 0.0004375045871559633, + "loss": 3.546, "step": 25200 }, { - "epoch": 2.7176837800021527, - "grad_norm": 0.588412344455719, - "learning_rate": 0.00043746148044391766, - "loss": 3.5416, + "epoch": 2.7223719676549867, + "grad_norm": 0.632588267326355, + "learning_rate": 0.00043718078791149485, + "loss": 3.5491, "step": 25250 }, { - "epoch": 2.723065332041761, - "grad_norm": 0.5414818525314331, - "learning_rate": 0.00043713823941385625, - "loss": 3.5289, + "epoch": 2.7277628032345014, + "grad_norm": 0.5347442030906677, + "learning_rate": 0.00043685698866702645, + "loss": 3.5518, "step": 25300 }, { - "epoch": 2.728446884081369, - "grad_norm": 0.6171042919158936, - "learning_rate": 0.0004368149983837948, - "loss": 3.548, + "epoch": 2.733153638814016, + "grad_norm": 0.5966982245445251, + "learning_rate": 0.00043653318942255795, + "loss": 3.5438, "step": 25350 }, { - "epoch": 2.7338284361209775, - "grad_norm": 0.6250415444374084, - "learning_rate": 0.0004364917573537334, - "loss": 3.5393, + "epoch": 2.7385444743935308, + "grad_norm": 0.6366229057312012, + "learning_rate": 0.00043620939017808956, + "loss": 3.5336, "step": 25400 }, { - "epoch": 2.7392099881605856, - "grad_norm": 0.6534850597381592, - "learning_rate": 0.00043616851632367193, - "loss": 3.5351, + "epoch": 2.743935309973046, + "grad_norm": 0.598671019077301, + "learning_rate": 0.0004358855909336211, + "loss": 3.5338, "step": 25450 }, { - "epoch": 2.7445915402001937, - "grad_norm": 0.6056131720542908, - "learning_rate": 0.0004358452752936106, - "loss": 3.5331, + "epoch": 2.7493261455525606, + "grad_norm": 0.5955828428268433, + "learning_rate": 0.00043556179168915266, + "loss": 3.5497, "step": 25500 }, { - "epoch": 2.749973092239802, - "grad_norm": 0.6097458004951477, - "learning_rate": 0.0004355220342635492, - "loss": 3.5229, + "epoch": 2.7547169811320753, + "grad_norm": 0.5636226534843445, + "learning_rate": 0.00043523799244468426, + "loss": 3.5418, "step": 25550 }, { - "epoch": 2.7553546442794103, - "grad_norm": 0.5821205377578735, - "learning_rate": 0.00043519879323348777, - "loss": 3.5547, + "epoch": 2.7601078167115904, + "grad_norm": 0.5939311981201172, + "learning_rate": 0.0004349141932002158, + "loss": 3.5449, "step": 25600 }, { - "epoch": 2.7607361963190185, - "grad_norm": 0.572923481464386, - "learning_rate": 0.0004348755522034263, - "loss": 3.5459, + "epoch": 2.765498652291105, + "grad_norm": 0.5545902848243713, + "learning_rate": 0.0004345903939557474, + "loss": 3.537, "step": 25650 }, { - "epoch": 2.7661177483586266, - "grad_norm": 0.5959768295288086, - "learning_rate": 0.0004345523111733649, - "loss": 3.53, + "epoch": 2.77088948787062, + "grad_norm": 0.6240540742874146, + "learning_rate": 0.00043426659471127897, + "loss": 3.5434, "step": 25700 }, { - "epoch": 2.7714993003982347, - "grad_norm": 0.6123167872428894, - "learning_rate": 0.00043422907014330344, - "loss": 3.5332, + "epoch": 2.776280323450135, + "grad_norm": 0.5980251431465149, + "learning_rate": 0.00043394279546681057, + "loss": 3.53, "step": 25750 }, { - "epoch": 2.776880852437843, - "grad_norm": 0.5567452311515808, - "learning_rate": 0.0004339058291132421, - "loss": 3.5329, + "epoch": 2.7816711590296497, + "grad_norm": 0.5463680028915405, + "learning_rate": 0.0004336189962223421, + "loss": 3.5552, "step": 25800 }, { - "epoch": 2.7822624044774513, - "grad_norm": 0.6061723232269287, - "learning_rate": 0.0004335825880831807, - "loss": 3.5429, + "epoch": 2.7870619946091644, + "grad_norm": 0.5872852802276611, + "learning_rate": 0.0004332951969778737, + "loss": 3.5262, "step": 25850 }, { - "epoch": 2.7876439565170594, - "grad_norm": 0.5593557357788086, - "learning_rate": 0.00043325934705311923, - "loss": 3.5072, + "epoch": 2.7924528301886795, + "grad_norm": 0.5788702368736267, + "learning_rate": 0.0004329713977334053, + "loss": 3.538, "step": 25900 }, { - "epoch": 2.793025508556668, - "grad_norm": 0.5608957409858704, - "learning_rate": 0.0004329361060230578, - "loss": 3.5159, + "epoch": 2.797843665768194, + "grad_norm": 0.6034548282623291, + "learning_rate": 0.0004326475984889368, + "loss": 3.5353, "step": 25950 }, { - "epoch": 2.798407060596276, - "grad_norm": 0.5900284051895142, - "learning_rate": 0.00043261286499299636, - "loss": 3.5301, + "epoch": 2.803234501347709, + "grad_norm": 0.5133172869682312, + "learning_rate": 0.0004323237992444684, + "loss": 3.551, "step": 26000 }, { - "epoch": 2.798407060596276, - "eval_accuracy": 0.36886427635383945, - "eval_loss": 3.5095598697662354, - "eval_runtime": 180.5366, - "eval_samples_per_second": 99.764, - "eval_steps_per_second": 6.237, + "epoch": 2.803234501347709, + "eval_accuracy": 0.36823169924098353, + "eval_loss": 3.515547037124634, + "eval_runtime": 184.9718, + "eval_samples_per_second": 97.372, + "eval_steps_per_second": 6.087, "step": 26000 }, { - "epoch": 2.803788612635884, - "grad_norm": 0.5952501893043518, - "learning_rate": 0.00043228962396293496, - "loss": 3.5283, + "epoch": 2.8086253369272236, + "grad_norm": 0.6219244003295898, + "learning_rate": 0.00043199999999999993, + "loss": 3.556, "step": 26050 }, { - "epoch": 2.8091701646754923, - "grad_norm": 0.5934816598892212, - "learning_rate": 0.00043197284775347476, - "loss": 3.5287, + "epoch": 2.8140161725067383, + "grad_norm": 0.5854507684707642, + "learning_rate": 0.00043167620075553153, + "loss": 3.5406, "step": 26100 }, { - "epoch": 2.8145517167151004, - "grad_norm": 0.6665016412734985, - "learning_rate": 0.00043164960672341336, - "loss": 3.5178, + "epoch": 2.8194070080862534, + "grad_norm": 0.5761322379112244, + "learning_rate": 0.0004313524015110631, + "loss": 3.5364, "step": 26150 }, { - "epoch": 2.819933268754709, - "grad_norm": 0.5784537196159363, - "learning_rate": 0.000431326365693352, - "loss": 3.5368, + "epoch": 2.824797843665768, + "grad_norm": 0.5397259593009949, + "learning_rate": 0.0004310286022665947, + "loss": 3.5472, "step": 26200 }, { - "epoch": 2.825314820794317, - "grad_norm": 0.5916054248809814, - "learning_rate": 0.00043100312466329055, - "loss": 3.5361, + "epoch": 2.830188679245283, + "grad_norm": 0.5752435326576233, + "learning_rate": 0.0004307112790070156, + "loss": 3.5301, "step": 26250 }, { - "epoch": 2.830696372833925, - "grad_norm": 0.5684788227081299, - "learning_rate": 0.00043067988363322914, - "loss": 3.5204, + "epoch": 2.835579514824798, + "grad_norm": 0.6607756018638611, + "learning_rate": 0.0004303874797625472, + "loss": 3.5288, "step": 26300 }, { - "epoch": 2.8360779248735337, - "grad_norm": 0.5924267172813416, - "learning_rate": 0.00043035664260316774, - "loss": 3.5382, + "epoch": 2.8409703504043127, + "grad_norm": 0.5861449837684631, + "learning_rate": 0.00043006368051807874, + "loss": 3.5494, "step": 26350 }, { - "epoch": 2.841459476913142, - "grad_norm": 0.6063621640205383, - "learning_rate": 0.0004300334015731063, - "loss": 3.5343, + "epoch": 2.8463611859838274, + "grad_norm": 0.5686273574829102, + "learning_rate": 0.00042973988127361035, + "loss": 3.5248, "step": 26400 }, { - "epoch": 2.84684102895275, - "grad_norm": 0.6259608268737793, - "learning_rate": 0.0004297101605430449, - "loss": 3.5212, + "epoch": 2.8517520215633425, + "grad_norm": 0.5789076685905457, + "learning_rate": 0.0004294160820291419, + "loss": 3.5457, "step": 26450 }, { - "epoch": 2.852222580992358, - "grad_norm": 0.5912953019142151, - "learning_rate": 0.0004293869195129835, - "loss": 3.5333, + "epoch": 2.857142857142857, + "grad_norm": 0.589009165763855, + "learning_rate": 0.0004290922827846735, + "loss": 3.5533, "step": 26500 }, { - "epoch": 2.857604133031966, - "grad_norm": 0.6470860838890076, - "learning_rate": 0.00042906367848292206, - "loss": 3.5356, + "epoch": 2.862533692722372, + "grad_norm": 0.5996458530426025, + "learning_rate": 0.00042876848354020505, + "loss": 3.5342, "step": 26550 }, { - "epoch": 2.8629856850715747, - "grad_norm": 0.6130536198616028, - "learning_rate": 0.00042874043745286066, - "loss": 3.5259, + "epoch": 2.867924528301887, + "grad_norm": 0.5394443869590759, + "learning_rate": 0.00042844468429573655, + "loss": 3.5353, "step": 26600 }, { - "epoch": 2.868367237111183, - "grad_norm": 0.5800825357437134, - "learning_rate": 0.0004284171964227992, - "loss": 3.5425, + "epoch": 2.8733153638814017, + "grad_norm": 0.5539592504501343, + "learning_rate": 0.00042812088505126815, + "loss": 3.5198, "step": 26650 }, { - "epoch": 2.873748789150791, - "grad_norm": 0.5847500562667847, - "learning_rate": 0.0004280939553927378, - "loss": 3.5086, + "epoch": 2.8787061994609164, + "grad_norm": 0.5455945134162903, + "learning_rate": 0.0004277970858067997, + "loss": 3.561, "step": 26700 }, { - "epoch": 2.8791303411903995, - "grad_norm": 0.6748270988464355, - "learning_rate": 0.00042777071436267644, - "loss": 3.5213, + "epoch": 2.884097035040431, + "grad_norm": 0.6022780537605286, + "learning_rate": 0.0004274732865623313, + "loss": 3.5265, "step": 26750 }, { - "epoch": 2.8845118932300076, - "grad_norm": 0.576877772808075, - "learning_rate": 0.000427447473332615, - "loss": 3.5173, + "epoch": 2.889487870619946, + "grad_norm": 0.5307306051254272, + "learning_rate": 0.00042714948731786286, + "loss": 3.5104, "step": 26800 }, { - "epoch": 2.8898934452696157, - "grad_norm": 0.5994119644165039, - "learning_rate": 0.0004271242323025536, - "loss": 3.5184, + "epoch": 2.894878706199461, + "grad_norm": 0.6074073910713196, + "learning_rate": 0.00042682568807339447, + "loss": 3.5143, "step": 26850 }, { - "epoch": 2.895274997309224, - "grad_norm": 0.5870147943496704, - "learning_rate": 0.00042680099127249217, - "loss": 3.5404, + "epoch": 2.9002695417789757, + "grad_norm": 0.5870118141174316, + "learning_rate": 0.000426501888828926, + "loss": 3.5332, "step": 26900 }, { - "epoch": 2.9006565493488323, - "grad_norm": 0.5605146884918213, - "learning_rate": 0.0004264777502424307, - "loss": 3.5338, + "epoch": 2.9056603773584904, + "grad_norm": 0.5871788859367371, + "learning_rate": 0.0004261780895844576, + "loss": 3.5547, "step": 26950 }, { - "epoch": 2.9060381013884404, - "grad_norm": 0.6184528470039368, - "learning_rate": 0.0004261545092123693, - "loss": 3.5226, + "epoch": 2.9110512129380055, + "grad_norm": 0.5941997766494751, + "learning_rate": 0.00042585429033998917, + "loss": 3.5355, "step": 27000 }, { - "epoch": 2.9060381013884404, - "eval_accuracy": 0.3700053489816671, - "eval_loss": 3.502581834793091, - "eval_runtime": 180.637, - "eval_samples_per_second": 99.708, - "eval_steps_per_second": 6.233, + "epoch": 2.9110512129380055, + "eval_accuracy": 0.3690959243106599, + "eval_loss": 3.506704807281494, + "eval_runtime": 184.548, + "eval_samples_per_second": 97.595, + "eval_steps_per_second": 6.101, "step": 27000 }, { - "epoch": 2.9114196534280485, - "grad_norm": 0.6496872305870056, - "learning_rate": 0.00042583126818230795, - "loss": 3.5242, + "epoch": 2.91644204851752, + "grad_norm": 0.5948578119277954, + "learning_rate": 0.0004255304910955207, + "loss": 3.522, "step": 27050 }, { - "epoch": 2.9168012054676566, - "grad_norm": 0.6035163402557373, - "learning_rate": 0.0004255080271522465, - "loss": 3.5265, + "epoch": 2.921832884097035, + "grad_norm": 0.6063885688781738, + "learning_rate": 0.0004252066918510523, + "loss": 3.5315, "step": 27100 }, { - "epoch": 2.922182757507265, - "grad_norm": 0.5869918465614319, - "learning_rate": 0.0004251847861221851, - "loss": 3.5208, + "epoch": 2.92722371967655, + "grad_norm": 0.5720027685165405, + "learning_rate": 0.0004248828926065839, + "loss": 3.5281, "step": 27150 }, { - "epoch": 2.9275643095468733, - "grad_norm": 0.6070534586906433, - "learning_rate": 0.00042486154509212363, - "loss": 3.4932, + "epoch": 2.9326145552560647, + "grad_norm": 0.6208416819572449, + "learning_rate": 0.0004245590933621155, + "loss": 3.5356, "step": 27200 }, { - "epoch": 2.9329458615864814, - "grad_norm": 0.6247556805610657, - "learning_rate": 0.0004245383040620622, - "loss": 3.5168, + "epoch": 2.9380053908355794, + "grad_norm": 0.6040082573890686, + "learning_rate": 0.00042423529411764703, + "loss": 3.5406, "step": 27250 }, { - "epoch": 2.93832741362609, - "grad_norm": 0.606638491153717, - "learning_rate": 0.0004242150630320009, - "loss": 3.5379, + "epoch": 2.9433962264150946, + "grad_norm": 0.6055092811584473, + "learning_rate": 0.00042391149487317864, + "loss": 3.5071, "step": 27300 }, { - "epoch": 2.943708965665698, - "grad_norm": 0.5896838307380676, - "learning_rate": 0.0004238918220019394, - "loss": 3.5439, + "epoch": 2.9487870619946093, + "grad_norm": 0.5801677703857422, + "learning_rate": 0.00042358769562871013, + "loss": 3.5266, "step": 27350 }, { - "epoch": 2.949090517705306, - "grad_norm": 0.7161862850189209, - "learning_rate": 0.000423568580971878, - "loss": 3.524, + "epoch": 2.954177897574124, + "grad_norm": 0.6085225939750671, + "learning_rate": 0.00042326389638424174, + "loss": 3.5207, "step": 27400 }, { - "epoch": 2.9544720697449143, - "grad_norm": 0.5472960472106934, - "learning_rate": 0.00042324533994181655, - "loss": 3.5082, + "epoch": 2.9595687331536387, + "grad_norm": 0.6469091176986694, + "learning_rate": 0.0004229400971397733, + "loss": 3.5144, "step": 27450 }, { - "epoch": 2.9598536217845224, - "grad_norm": 0.6270456910133362, - "learning_rate": 0.00042292209891175514, - "loss": 3.5133, + "epoch": 2.964959568733154, + "grad_norm": 0.5324587821960449, + "learning_rate": 0.00042261629789530484, + "loss": 3.5401, "step": 27500 }, { - "epoch": 2.965235173824131, - "grad_norm": 0.6163716912269592, - "learning_rate": 0.00042259885788169374, - "loss": 3.5185, + "epoch": 2.9703504043126685, + "grad_norm": 0.5728937387466431, + "learning_rate": 0.00042229249865083644, + "loss": 3.5412, "step": 27550 }, { - "epoch": 2.970616725863739, - "grad_norm": 0.5813859105110168, - "learning_rate": 0.00042227561685163233, - "loss": 3.5219, + "epoch": 2.975741239892183, + "grad_norm": 0.5513703227043152, + "learning_rate": 0.000421968699406368, + "loss": 3.5235, "step": 27600 }, { - "epoch": 2.975998277903347, - "grad_norm": 0.6137823462486267, - "learning_rate": 0.00042195237582157093, - "loss": 3.5238, + "epoch": 2.981132075471698, + "grad_norm": 0.6015611290931702, + "learning_rate": 0.0004216449001618996, + "loss": 3.5281, "step": 27650 }, { - "epoch": 2.9813798299429557, - "grad_norm": 0.573917031288147, - "learning_rate": 0.0004216291347915095, - "loss": 3.531, + "epoch": 2.986522911051213, + "grad_norm": 0.6475524306297302, + "learning_rate": 0.00042132110091743115, + "loss": 3.5199, "step": 27700 }, { - "epoch": 2.986761381982564, - "grad_norm": 0.6336318254470825, - "learning_rate": 0.00042130589376144806, - "loss": 3.5142, + "epoch": 2.9919137466307277, + "grad_norm": 0.5860631465911865, + "learning_rate": 0.00042099730167296275, + "loss": 3.5175, "step": 27750 }, { - "epoch": 2.992142934022172, - "grad_norm": 0.5717305541038513, - "learning_rate": 0.00042098265273138666, - "loss": 3.5073, + "epoch": 2.9973045822102424, + "grad_norm": 0.6016190052032471, + "learning_rate": 0.0004206735024284943, + "loss": 3.5275, "step": 27800 }, { - "epoch": 2.9975244860617805, - "grad_norm": 0.6153519749641418, - "learning_rate": 0.0004206594117013252, - "loss": 3.5313, + "epoch": 3.0026954177897576, + "grad_norm": 0.5374354124069214, + "learning_rate": 0.0004203497031840259, + "loss": 3.4869, "step": 27850 }, { - "epoch": 3.0029060381013886, - "grad_norm": 0.5966550707817078, - "learning_rate": 0.00042033617067126385, - "loss": 3.4862, + "epoch": 3.0080862533692723, + "grad_norm": 0.6027006506919861, + "learning_rate": 0.00042002590393955746, + "loss": 3.4352, "step": 27900 }, { - "epoch": 3.0082875901409967, - "grad_norm": 0.5811104774475098, - "learning_rate": 0.00042001292964120244, - "loss": 3.4131, + "epoch": 3.013477088948787, + "grad_norm": 0.6323424577713013, + "learning_rate": 0.00041970210469508896, + "loss": 3.4338, "step": 27950 }, { - "epoch": 3.0136691421806048, - "grad_norm": 0.5833897590637207, - "learning_rate": 0.000419689688611141, - "loss": 3.4297, + "epoch": 3.018867924528302, + "grad_norm": 0.5759109258651733, + "learning_rate": 0.00041937830545062056, + "loss": 3.4341, "step": 28000 }, { - "epoch": 3.0136691421806048, - "eval_accuracy": 0.3707112667937978, - "eval_loss": 3.4965574741363525, - "eval_runtime": 180.5598, - "eval_samples_per_second": 99.751, - "eval_steps_per_second": 6.236, + "epoch": 3.018867924528302, + "eval_accuracy": 0.36992527180334783, + "eval_loss": 3.500993251800537, + "eval_runtime": 185.1029, + "eval_samples_per_second": 97.303, + "eval_steps_per_second": 6.083, "step": 28000 }, { - "epoch": 3.0190506942202133, - "grad_norm": 0.6142577528953552, - "learning_rate": 0.0004193664475810796, - "loss": 3.4263, + "epoch": 3.024258760107817, + "grad_norm": 0.5716021060943604, + "learning_rate": 0.0004190545062061521, + "loss": 3.4337, "step": 28050 }, { - "epoch": 3.0244322462598214, - "grad_norm": 0.60226970911026, - "learning_rate": 0.00041904320655101817, - "loss": 3.4323, + "epoch": 3.0296495956873315, + "grad_norm": 0.6018630266189575, + "learning_rate": 0.0004187307069616837, + "loss": 3.4388, "step": 28100 }, { - "epoch": 3.0298137982994295, - "grad_norm": 0.6177240014076233, - "learning_rate": 0.0004187199655209567, - "loss": 3.4357, + "epoch": 3.035040431266846, + "grad_norm": 0.6063191890716553, + "learning_rate": 0.00041840690771721527, + "loss": 3.423, "step": 28150 }, { - "epoch": 3.0351953503390376, - "grad_norm": 0.5931535363197327, - "learning_rate": 0.00041839672449089536, - "loss": 3.4589, + "epoch": 3.0404312668463613, + "grad_norm": 0.5545841455459595, + "learning_rate": 0.00041808310847274687, + "loss": 3.4414, "step": 28200 }, { - "epoch": 3.040576902378646, - "grad_norm": 0.608650803565979, - "learning_rate": 0.00041807348346083395, - "loss": 3.4432, + "epoch": 3.045822102425876, + "grad_norm": 0.5774965882301331, + "learning_rate": 0.0004177593092282784, + "loss": 3.4324, "step": 28250 }, { - "epoch": 3.0459584544182543, - "grad_norm": 0.6467320322990417, - "learning_rate": 0.0004177502424307725, - "loss": 3.4356, + "epoch": 3.0512129380053907, + "grad_norm": 0.5643296837806702, + "learning_rate": 0.0004174419859686994, + "loss": 3.455, "step": 28300 }, { - "epoch": 3.0513400064578624, - "grad_norm": 0.6208910346031189, - "learning_rate": 0.0004174270014007111, - "loss": 3.4335, + "epoch": 3.056603773584906, + "grad_norm": 0.6419238448143005, + "learning_rate": 0.0004171181867242309, + "loss": 3.4396, "step": 28350 }, { - "epoch": 3.0567215584974705, - "grad_norm": 0.6012397408485413, - "learning_rate": 0.00041710376037064963, - "loss": 3.4368, + "epoch": 3.0619946091644206, + "grad_norm": 0.5539114475250244, + "learning_rate": 0.00041679438747976253, + "loss": 3.4607, "step": 28400 }, { - "epoch": 3.062103110537079, - "grad_norm": 0.5883954763412476, - "learning_rate": 0.0004167805193405883, - "loss": 3.4407, + "epoch": 3.0673854447439353, + "grad_norm": 0.575046956539154, + "learning_rate": 0.0004164705882352941, + "loss": 3.4423, "step": 28450 }, { - "epoch": 3.067484662576687, - "grad_norm": 0.5667783617973328, - "learning_rate": 0.0004164572783105269, - "loss": 3.4358, + "epoch": 3.07277628032345, + "grad_norm": 0.5910366177558899, + "learning_rate": 0.0004161467889908257, + "loss": 3.4337, "step": 28500 }, { - "epoch": 3.0728662146162953, - "grad_norm": 0.6078146696090698, - "learning_rate": 0.0004161340372804654, - "loss": 3.4422, + "epoch": 3.078167115902965, + "grad_norm": 0.6324418187141418, + "learning_rate": 0.00041582298974635724, + "loss": 3.4346, "step": 28550 }, { - "epoch": 3.0782477666559034, - "grad_norm": 0.6011075377464294, - "learning_rate": 0.000415810796250404, - "loss": 3.4444, + "epoch": 3.08355795148248, + "grad_norm": 0.5757888555526733, + "learning_rate": 0.00041549919050188884, + "loss": 3.4546, "step": 28600 }, { - "epoch": 3.083629318695512, - "grad_norm": 0.6752050518989563, - "learning_rate": 0.0004154875552203426, - "loss": 3.4557, + "epoch": 3.0889487870619945, + "grad_norm": 0.6831218600273132, + "learning_rate": 0.00041517539125742034, + "loss": 3.4394, "step": 28650 }, { - "epoch": 3.08901087073512, - "grad_norm": 0.6102807521820068, - "learning_rate": 0.00041516431419028114, - "loss": 3.4272, + "epoch": 3.0943396226415096, + "grad_norm": 0.575617253780365, + "learning_rate": 0.0004148515920129519, + "loss": 3.4627, "step": 28700 }, { - "epoch": 3.094392422774728, - "grad_norm": 0.6542812585830688, - "learning_rate": 0.0004148410731602198, - "loss": 3.4495, + "epoch": 3.0997304582210243, + "grad_norm": 0.5726577043533325, + "learning_rate": 0.0004145277927684835, + "loss": 3.4267, "step": 28750 }, { - "epoch": 3.0997739748143363, - "grad_norm": 0.6202298998832703, - "learning_rate": 0.0004145178321301584, - "loss": 3.4493, + "epoch": 3.105121293800539, + "grad_norm": 0.5732120871543884, + "learning_rate": 0.00041420399352401504, + "loss": 3.4551, "step": 28800 }, { - "epoch": 3.105155526853945, - "grad_norm": 0.6084877848625183, - "learning_rate": 0.00041419459110009693, - "loss": 3.4437, + "epoch": 3.1105121293800537, + "grad_norm": 0.6304787397384644, + "learning_rate": 0.00041388019427954665, + "loss": 3.4438, "step": 28850 }, { - "epoch": 3.110537078893553, - "grad_norm": 0.8142428398132324, - "learning_rate": 0.0004138713500700355, - "loss": 3.4456, + "epoch": 3.115902964959569, + "grad_norm": 0.5989034175872803, + "learning_rate": 0.0004135563950350782, + "loss": 3.4592, "step": 28900 }, { - "epoch": 3.115918630933161, - "grad_norm": 0.6456471681594849, - "learning_rate": 0.00041354810903997406, - "loss": 3.4468, + "epoch": 3.1212938005390836, + "grad_norm": 0.5832788348197937, + "learning_rate": 0.0004132325957906098, + "loss": 3.4562, "step": 28950 }, { - "epoch": 3.121300182972769, - "grad_norm": 0.6340752840042114, - "learning_rate": 0.00041322486800991266, - "loss": 3.4224, + "epoch": 3.1266846361185983, + "grad_norm": 0.592212975025177, + "learning_rate": 0.00041290879654614135, + "loss": 3.4521, "step": 29000 }, { - "epoch": 3.121300182972769, - "eval_accuracy": 0.37119194716948906, - "eval_loss": 3.4933841228485107, - "eval_runtime": 180.305, - "eval_samples_per_second": 99.892, - "eval_steps_per_second": 6.245, + "epoch": 3.1266846361185983, + "eval_accuracy": 0.37072582628076495, + "eval_loss": 3.496122121810913, + "eval_runtime": 184.6302, + "eval_samples_per_second": 97.552, + "eval_steps_per_second": 6.099, "step": 29000 }, { - "epoch": 3.1266817350123777, - "grad_norm": 0.6109960675239563, - "learning_rate": 0.0004129016269798513, - "loss": 3.4461, + "epoch": 3.1320754716981134, + "grad_norm": 0.5908344388008118, + "learning_rate": 0.00041258499730167296, + "loss": 3.4568, "step": 29050 }, { - "epoch": 3.132063287051986, - "grad_norm": 0.5882677435874939, - "learning_rate": 0.00041257838594978985, - "loss": 3.4344, + "epoch": 3.137466307277628, + "grad_norm": 0.5698777437210083, + "learning_rate": 0.0004122611980572045, + "loss": 3.4543, "step": 29100 }, { - "epoch": 3.137444839091594, - "grad_norm": 0.5873978734016418, - "learning_rate": 0.00041225514491972844, - "loss": 3.4492, + "epoch": 3.142857142857143, + "grad_norm": 0.5800032615661621, + "learning_rate": 0.00041193739881273606, + "loss": 3.46, "step": 29150 }, { - "epoch": 3.1428263911312024, - "grad_norm": 0.6426523923873901, - "learning_rate": 0.00041193190388966704, - "loss": 3.4381, + "epoch": 3.1482479784366575, + "grad_norm": 0.583019495010376, + "learning_rate": 0.00041161359956826766, + "loss": 3.455, "step": 29200 }, { - "epoch": 3.1482079431708105, - "grad_norm": 0.5868753790855408, - "learning_rate": 0.0004116086628596056, - "loss": 3.4539, + "epoch": 3.1536388140161726, + "grad_norm": 0.6571466326713562, + "learning_rate": 0.0004112898003237992, + "loss": 3.473, "step": 29250 }, { - "epoch": 3.1535894952104186, - "grad_norm": 0.6552289128303528, - "learning_rate": 0.0004112854218295442, - "loss": 3.4576, + "epoch": 3.1590296495956873, + "grad_norm": 0.6302173137664795, + "learning_rate": 0.0004109660010793308, + "loss": 3.4594, "step": 29300 }, { - "epoch": 3.1589710472500268, - "grad_norm": 0.6329394578933716, - "learning_rate": 0.0004109621807994828, - "loss": 3.4514, + "epoch": 3.164420485175202, + "grad_norm": 0.6568088531494141, + "learning_rate": 0.0004106422018348623, + "loss": 3.453, "step": 29350 }, { - "epoch": 3.1643525992896353, - "grad_norm": 0.6486338376998901, - "learning_rate": 0.00041063893976942136, - "loss": 3.4588, + "epoch": 3.169811320754717, + "grad_norm": 0.6203985810279846, + "learning_rate": 0.0004103184025903939, + "loss": 3.449, "step": 29400 }, { - "epoch": 3.1697341513292434, - "grad_norm": 0.5754224061965942, - "learning_rate": 0.00041031569873935996, - "loss": 3.4392, + "epoch": 3.175202156334232, + "grad_norm": 0.6170281767845154, + "learning_rate": 0.00040999460334592547, + "loss": 3.4603, "step": 29450 }, { - "epoch": 3.1751157033688515, - "grad_norm": 0.6380184888839722, - "learning_rate": 0.0004099924577092985, - "loss": 3.4547, + "epoch": 3.1805929919137466, + "grad_norm": 0.5898356437683105, + "learning_rate": 0.000409670804101457, + "loss": 3.4562, "step": 29500 }, { - "epoch": 3.1804972554084596, - "grad_norm": 0.6305249333381653, - "learning_rate": 0.0004096692166792371, - "loss": 3.4568, + "epoch": 3.1859838274932613, + "grad_norm": 0.670875608921051, + "learning_rate": 0.00040934700485698863, + "loss": 3.4667, "step": 29550 }, { - "epoch": 3.185878807448068, - "grad_norm": 0.6137855648994446, - "learning_rate": 0.00040934597564917574, + "epoch": 3.1913746630727764, + "grad_norm": 0.6004409790039062, + "learning_rate": 0.0004090232056125202, "loss": 3.4612, "step": 29600 }, { - "epoch": 3.1912603594876763, - "grad_norm": 0.5795332789421082, - "learning_rate": 0.0004090227346191143, - "loss": 3.4387, + "epoch": 3.196765498652291, + "grad_norm": 0.6314196586608887, + "learning_rate": 0.0004086994063680518, + "loss": 3.4508, "step": 29650 }, { - "epoch": 3.1966419115272844, - "grad_norm": 0.6044164299964905, - "learning_rate": 0.0004086994935890529, - "loss": 3.4696, + "epoch": 3.202156334231806, + "grad_norm": 0.5917602181434631, + "learning_rate": 0.00040837560712358333, + "loss": 3.4829, "step": 29700 }, { - "epoch": 3.2020234635668925, - "grad_norm": 0.60833340883255, - "learning_rate": 0.00040837625255899147, - "loss": 3.4597, + "epoch": 3.207547169811321, + "grad_norm": 0.6115114688873291, + "learning_rate": 0.00040805180787911494, + "loss": 3.4527, "step": 29750 }, { - "epoch": 3.207405015606501, - "grad_norm": 0.6233122944831848, - "learning_rate": 0.00040805301152893, - "loss": 3.4374, + "epoch": 3.2129380053908356, + "grad_norm": 0.6146586537361145, + "learning_rate": 0.0004077280086346465, + "loss": 3.4625, "step": 29800 }, { - "epoch": 3.212786567646109, - "grad_norm": 0.6239629983901978, - "learning_rate": 0.0004077297704988686, - "loss": 3.4452, + "epoch": 3.2183288409703503, + "grad_norm": 0.5785256624221802, + "learning_rate": 0.0004074042093901781, + "loss": 3.4397, "step": 29850 }, { - "epoch": 3.2181681196857173, - "grad_norm": 0.6396247148513794, - "learning_rate": 0.00040740652946880725, - "loss": 3.4325, + "epoch": 3.223719676549865, + "grad_norm": 0.5921712517738342, + "learning_rate": 0.00040708041014570964, + "loss": 3.4511, "step": 29900 }, { - "epoch": 3.2235496717253254, - "grad_norm": 0.58219975233078, - "learning_rate": 0.0004070832884387458, - "loss": 3.466, + "epoch": 3.22911051212938, + "grad_norm": 0.5875865817070007, + "learning_rate": 0.00040675661090124114, + "loss": 3.464, "step": 29950 }, { - "epoch": 3.228931223764934, - "grad_norm": 0.6302553415298462, - "learning_rate": 0.0004067600474086844, - "loss": 3.456, + "epoch": 3.234501347708895, + "grad_norm": 0.5804892182350159, + "learning_rate": 0.0004064328116567728, + "loss": 3.4666, "step": 30000 }, { - "epoch": 3.228931223764934, - "eval_accuracy": 0.3716267760265226, - "eval_loss": 3.489248275756836, - "eval_runtime": 180.7912, - "eval_samples_per_second": 99.623, - "eval_steps_per_second": 6.228, + "epoch": 3.234501347708895, + "eval_accuracy": 0.37127311087668646, + "eval_loss": 3.491586923599243, + "eval_runtime": 185.2775, + "eval_samples_per_second": 97.211, + "eval_steps_per_second": 6.077, "step": 30000 }, { - "epoch": 3.234312775804542, - "grad_norm": 0.6658568382263184, - "learning_rate": 0.00040643680637862293, - "loss": 3.4516, + "epoch": 3.2398921832884096, + "grad_norm": 0.6126497983932495, + "learning_rate": 0.0004061090124123043, + "loss": 3.4724, "step": 30050 }, { - "epoch": 3.23969432784415, - "grad_norm": 0.6323238611221313, - "learning_rate": 0.0004061135653485615, - "loss": 3.457, + "epoch": 3.2452830188679247, + "grad_norm": 0.5532984733581543, + "learning_rate": 0.0004057852131678359, + "loss": 3.4516, "step": 30100 }, { - "epoch": 3.2450758798837587, - "grad_norm": 0.5995866060256958, - "learning_rate": 0.00040579678913910133, - "loss": 3.4326, + "epoch": 3.2506738544474394, + "grad_norm": 0.6137168407440186, + "learning_rate": 0.00040546141392336745, + "loss": 3.4561, "step": 30150 }, { - "epoch": 3.250457431923367, - "grad_norm": 0.6197038888931274, - "learning_rate": 0.0004054735481090399, - "loss": 3.4274, + "epoch": 3.256064690026954, + "grad_norm": 0.6394794583320618, + "learning_rate": 0.00040513761467889906, + "loss": 3.4598, "step": 30200 }, { - "epoch": 3.255838983962975, - "grad_norm": 0.5652536749839783, - "learning_rate": 0.0004051503070789786, - "loss": 3.4548, + "epoch": 3.2614555256064692, + "grad_norm": 0.600420355796814, + "learning_rate": 0.0004048138154344306, + "loss": 3.4413, "step": 30250 }, { - "epoch": 3.261220536002583, - "grad_norm": 0.6363932490348816, - "learning_rate": 0.0004048270660489171, - "loss": 3.4593, + "epoch": 3.266846361185984, + "grad_norm": 0.6034658551216125, + "learning_rate": 0.0004044900161899622, + "loss": 3.4709, "step": 30300 }, { - "epoch": 3.2666020880421915, - "grad_norm": 0.6436594128608704, - "learning_rate": 0.0004045038250188557, - "loss": 3.4508, + "epoch": 3.2722371967654986, + "grad_norm": 0.5927817225456238, + "learning_rate": 0.0004041726929303831, + "loss": 3.4559, "step": 30350 }, { - "epoch": 3.2719836400817996, - "grad_norm": 0.627250075340271, - "learning_rate": 0.00040418058398879425, - "loss": 3.4444, + "epoch": 3.2776280323450133, + "grad_norm": 0.6020606756210327, + "learning_rate": 0.0004038488936859147, + "loss": 3.4567, "step": 30400 }, { - "epoch": 3.2773651921214078, - "grad_norm": 0.659887433052063, - "learning_rate": 0.00040385734295873284, - "loss": 3.4687, + "epoch": 3.2830188679245285, + "grad_norm": 0.6315540671348572, + "learning_rate": 0.00040352509444144626, + "loss": 3.4581, "step": 30450 }, { - "epoch": 3.282746744161016, - "grad_norm": 0.5804165005683899, - "learning_rate": 0.00040353410192867144, - "loss": 3.4656, + "epoch": 3.288409703504043, + "grad_norm": 0.6471781730651855, + "learning_rate": 0.00040320129519697787, + "loss": 3.4872, "step": 30500 }, { - "epoch": 3.2881282962006244, - "grad_norm": 0.6306347846984863, - "learning_rate": 0.00040321086089861003, - "loss": 3.4417, + "epoch": 3.293800539083558, + "grad_norm": 0.6469458937644958, + "learning_rate": 0.0004028774959525094, + "loss": 3.4568, "step": 30550 }, { - "epoch": 3.2935098482402325, - "grad_norm": 0.6350857019424438, - "learning_rate": 0.00040288761986854863, - "loss": 3.4682, + "epoch": 3.2991913746630726, + "grad_norm": 0.6129242181777954, + "learning_rate": 0.000402553696708041, + "loss": 3.4642, "step": 30600 }, { - "epoch": 3.2988914002798406, - "grad_norm": 0.7076759338378906, - "learning_rate": 0.0004025643788384872, - "loss": 3.4531, + "epoch": 3.3045822102425877, + "grad_norm": 0.6130914092063904, + "learning_rate": 0.0004022298974635726, + "loss": 3.4538, "step": 30650 }, { - "epoch": 3.304272952319449, - "grad_norm": 0.6139525175094604, - "learning_rate": 0.00040224113780842576, - "loss": 3.4659, + "epoch": 3.3099730458221024, + "grad_norm": 0.6233851313591003, + "learning_rate": 0.00040190609821910407, + "loss": 3.4567, "step": 30700 }, { - "epoch": 3.3096545043590573, - "grad_norm": 0.6305450201034546, - "learning_rate": 0.00040191789677836436, - "loss": 3.4443, + "epoch": 3.315363881401617, + "grad_norm": 0.6533472537994385, + "learning_rate": 0.0004015822989746357, + "loss": 3.445, "step": 30750 }, { - "epoch": 3.3150360563986654, - "grad_norm": 0.5848320722579956, - "learning_rate": 0.0004015946557483029, - "loss": 3.453, + "epoch": 3.3207547169811322, + "grad_norm": 0.6035603284835815, + "learning_rate": 0.0004012584997301672, + "loss": 3.4614, "step": 30800 }, { - "epoch": 3.3204176084382735, - "grad_norm": 0.6427562236785889, - "learning_rate": 0.00040127141471824155, - "loss": 3.4731, + "epoch": 3.326145552560647, + "grad_norm": 0.6911303997039795, + "learning_rate": 0.00040093470048569883, + "loss": 3.4716, "step": 30850 }, { - "epoch": 3.3257991604778816, - "grad_norm": 0.6468438506126404, - "learning_rate": 0.00040094817368818014, - "loss": 3.4487, + "epoch": 3.3315363881401616, + "grad_norm": 0.6303085088729858, + "learning_rate": 0.0004006109012412304, + "loss": 3.4612, "step": 30900 }, { - "epoch": 3.33118071251749, - "grad_norm": 0.6019175052642822, - "learning_rate": 0.0004006249326581187, - "loss": 3.4629, + "epoch": 3.3369272237196768, + "grad_norm": 0.6187211871147156, + "learning_rate": 0.000400287101996762, + "loss": 3.4651, "step": 30950 }, { - "epoch": 3.3365622645570983, - "grad_norm": 0.6581705212593079, - "learning_rate": 0.0004003016916280573, - "loss": 3.4661, + "epoch": 3.3423180592991915, + "grad_norm": 0.5940481424331665, + "learning_rate": 0.00039996330275229354, + "loss": 3.4547, "step": 31000 }, { - "epoch": 3.3365622645570983, - "eval_accuracy": 0.37244656206508286, - "eval_loss": 3.4832379817962646, - "eval_runtime": 180.6125, - "eval_samples_per_second": 99.722, - "eval_steps_per_second": 6.234, + "epoch": 3.3423180592991915, + "eval_accuracy": 0.37221404488515986, + "eval_loss": 3.4880588054656982, + "eval_runtime": 184.5498, + "eval_samples_per_second": 97.594, + "eval_steps_per_second": 6.101, "step": 31000 }, { - "epoch": 3.3419438165967064, - "grad_norm": 0.65335613489151, - "learning_rate": 0.00039997845059799587, - "loss": 3.4359, + "epoch": 3.347708894878706, + "grad_norm": 0.567207396030426, + "learning_rate": 0.00039963950350782514, + "loss": 3.4654, "step": 31050 }, { - "epoch": 3.347325368636315, - "grad_norm": 0.6507302522659302, - "learning_rate": 0.00039965520956793447, - "loss": 3.4525, + "epoch": 3.353099730458221, + "grad_norm": 0.6383653879165649, + "learning_rate": 0.0003993157042633567, + "loss": 3.4536, "step": 31100 }, { - "epoch": 3.352706920675923, - "grad_norm": 0.5894781351089478, - "learning_rate": 0.00039933196853787306, - "loss": 3.4516, + "epoch": 3.358490566037736, + "grad_norm": 0.6113681197166443, + "learning_rate": 0.00039899190501888824, + "loss": 3.4568, "step": 31150 }, { - "epoch": 3.358088472715531, - "grad_norm": 0.6025640964508057, - "learning_rate": 0.00039900872750781166, - "loss": 3.4648, + "epoch": 3.3638814016172507, + "grad_norm": 0.5768980979919434, + "learning_rate": 0.00039866810577441985, + "loss": 3.4645, "step": 31200 }, { - "epoch": 3.3634700247551392, - "grad_norm": 0.6366141438484192, - "learning_rate": 0.0003986854864777502, - "loss": 3.4504, + "epoch": 3.3692722371967654, + "grad_norm": 0.5896947979927063, + "learning_rate": 0.0003983443065299514, + "loss": 3.4555, "step": 31250 }, { - "epoch": 3.368851576794748, - "grad_norm": 0.6077277064323425, - "learning_rate": 0.0003983622454476888, - "loss": 3.4439, + "epoch": 3.37466307277628, + "grad_norm": 0.6593387126922607, + "learning_rate": 0.000398020507285483, + "loss": 3.4606, "step": 31300 }, { - "epoch": 3.374233128834356, - "grad_norm": 0.6532594561576843, - "learning_rate": 0.00039803900441762733, - "loss": 3.4624, + "epoch": 3.3800539083557952, + "grad_norm": 0.6088037490844727, + "learning_rate": 0.0003976967080410145, + "loss": 3.4654, "step": 31350 }, { - "epoch": 3.379614680873964, - "grad_norm": 0.6048536896705627, - "learning_rate": 0.000397715763387566, - "loss": 3.4589, + "epoch": 3.38544474393531, + "grad_norm": 0.585678219795227, + "learning_rate": 0.0003973729087965461, + "loss": 3.4782, "step": 31400 }, { - "epoch": 3.384996232913572, - "grad_norm": 0.6351385712623596, - "learning_rate": 0.0003973925223575046, - "loss": 3.4395, + "epoch": 3.3908355795148246, + "grad_norm": 0.6365036368370056, + "learning_rate": 0.00039704910955207765, + "loss": 3.4588, "step": 31450 }, { - "epoch": 3.3903777849531807, - "grad_norm": 0.6504836082458496, - "learning_rate": 0.0003970692813274431, - "loss": 3.4588, + "epoch": 3.3962264150943398, + "grad_norm": 0.5940556526184082, + "learning_rate": 0.00039672531030760926, + "loss": 3.4557, "step": 31500 }, { - "epoch": 3.3957593369927888, - "grad_norm": 0.6237932443618774, - "learning_rate": 0.0003967460402973817, - "loss": 3.4532, + "epoch": 3.4016172506738545, + "grad_norm": 0.6067618727684021, + "learning_rate": 0.0003964015110631408, + "loss": 3.4426, "step": 31550 }, { - "epoch": 3.401140889032397, - "grad_norm": 0.5831241011619568, - "learning_rate": 0.0003964227992673203, - "loss": 3.447, + "epoch": 3.407008086253369, + "grad_norm": 0.6328786015510559, + "learning_rate": 0.00039607771181867236, + "loss": 3.4521, "step": 31600 }, { - "epoch": 3.4065224410720054, - "grad_norm": 0.6177220344543457, - "learning_rate": 0.00039609955823725884, - "loss": 3.4438, + "epoch": 3.4123989218328843, + "grad_norm": 0.5680471658706665, + "learning_rate": 0.00039575391257420397, + "loss": 3.4677, "step": 31650 }, { - "epoch": 3.4119039931116135, - "grad_norm": 0.6046972870826721, - "learning_rate": 0.0003957763172071975, - "loss": 3.4752, + "epoch": 3.417789757412399, + "grad_norm": 0.6207356452941895, + "learning_rate": 0.0003954301133297355, + "loss": 3.4535, "step": 31700 }, { - "epoch": 3.4172855451512216, - "grad_norm": 0.6035352349281311, - "learning_rate": 0.0003954530761771361, - "loss": 3.4588, + "epoch": 3.4231805929919137, + "grad_norm": 0.6631540656089783, + "learning_rate": 0.0003951063140852671, + "loss": 3.4616, "step": 31750 }, { - "epoch": 3.4226670971908297, - "grad_norm": 0.5911862850189209, - "learning_rate": 0.00039512983514707463, - "loss": 3.4505, + "epoch": 3.4285714285714284, + "grad_norm": 0.6220523118972778, + "learning_rate": 0.00039478251484079867, + "loss": 3.4612, "step": 31800 }, { - "epoch": 3.428048649230438, - "grad_norm": 0.5918540358543396, - "learning_rate": 0.0003948065941170132, - "loss": 3.4555, + "epoch": 3.4339622641509435, + "grad_norm": 0.5873520374298096, + "learning_rate": 0.0003944587155963303, + "loss": 3.4504, "step": 31850 }, { - "epoch": 3.4334302012700464, - "grad_norm": 0.6266008019447327, - "learning_rate": 0.00039448335308695176, - "loss": 3.4768, + "epoch": 3.439353099730458, + "grad_norm": 0.5860127806663513, + "learning_rate": 0.0003941349163518618, + "loss": 3.4518, "step": 31900 }, { - "epoch": 3.4388117533096545, - "grad_norm": 0.6207110285758972, - "learning_rate": 0.00039416011205689036, - "loss": 3.4568, + "epoch": 3.444743935309973, + "grad_norm": 0.5693365931510925, + "learning_rate": 0.00039381111710739343, + "loss": 3.4594, "step": 31950 }, { - "epoch": 3.4441933053492626, - "grad_norm": 0.6404582858085632, - "learning_rate": 0.000393836871026829, - "loss": 3.4577, + "epoch": 3.450134770889488, + "grad_norm": 0.6177678108215332, + "learning_rate": 0.000393487317862925, + "loss": 3.4656, "step": 32000 }, { - "epoch": 3.4441933053492626, - "eval_accuracy": 0.37327199805380945, - "eval_loss": 3.4748642444610596, - "eval_runtime": 180.6631, - "eval_samples_per_second": 99.694, - "eval_steps_per_second": 6.233, + "epoch": 3.450134770889488, + "eval_accuracy": 0.37241483542184106, + "eval_loss": 3.479771375656128, + "eval_runtime": 184.9587, + "eval_samples_per_second": 97.378, + "eval_steps_per_second": 6.088, "step": 32000 }, { - "epoch": 3.449574857388871, - "grad_norm": 0.6518675088882446, - "learning_rate": 0.00039351362999676755, - "loss": 3.4445, + "epoch": 3.4555256064690028, + "grad_norm": 0.5787439346313477, + "learning_rate": 0.0003931635186184565, + "loss": 3.4585, "step": 32050 }, { - "epoch": 3.4549564094284793, - "grad_norm": 0.6538469195365906, - "learning_rate": 0.00039319038896670614, - "loss": 3.4755, + "epoch": 3.4609164420485174, + "grad_norm": 0.5935786962509155, + "learning_rate": 0.0003928397193739881, + "loss": 3.4594, "step": 32100 }, { - "epoch": 3.4603379614680874, - "grad_norm": 0.5812484622001648, - "learning_rate": 0.00039287361275724595, - "loss": 3.4689, + "epoch": 3.466307277628032, + "grad_norm": 0.5919837355613708, + "learning_rate": 0.00039251592012951963, + "loss": 3.4575, "step": 32150 }, { - "epoch": 3.4657195135076955, - "grad_norm": 0.6309418678283691, - "learning_rate": 0.00039255037172718454, - "loss": 3.4447, + "epoch": 3.4716981132075473, + "grad_norm": 0.6029905080795288, + "learning_rate": 0.00039219212088505124, + "loss": 3.4581, "step": 32200 }, { - "epoch": 3.471101065547304, - "grad_norm": 0.6192950010299683, - "learning_rate": 0.0003922271306971231, - "loss": 3.4599, + "epoch": 3.477088948787062, + "grad_norm": 0.6316198706626892, + "learning_rate": 0.0003918683216405828, + "loss": 3.4671, "step": 32250 }, { - "epoch": 3.476482617586912, - "grad_norm": 0.6055722832679749, - "learning_rate": 0.0003919038896670617, - "loss": 3.462, + "epoch": 3.4824797843665767, + "grad_norm": 0.6068309545516968, + "learning_rate": 0.0003915445223961144, + "loss": 3.4777, "step": 32300 }, { - "epoch": 3.4818641696265202, - "grad_norm": 0.6457564234733582, - "learning_rate": 0.00039158064863700033, - "loss": 3.4393, + "epoch": 3.487870619946092, + "grad_norm": 0.6286913752555847, + "learning_rate": 0.00039122072315164594, + "loss": 3.4624, "step": 32350 }, { - "epoch": 3.4872457216661283, - "grad_norm": 0.6618093848228455, - "learning_rate": 0.00039125740760693887, - "loss": 3.4606, + "epoch": 3.4932614555256065, + "grad_norm": 0.5759497880935669, + "learning_rate": 0.0003909033998920669, + "loss": 3.4627, "step": 32400 }, { - "epoch": 3.492627273705737, - "grad_norm": 0.5980201363563538, - "learning_rate": 0.00039093416657687746, - "loss": 3.4392, + "epoch": 3.498652291105121, + "grad_norm": 0.669968843460083, + "learning_rate": 0.00039057960064759845, + "loss": 3.443, "step": 32450 }, { - "epoch": 3.498008825745345, - "grad_norm": 0.6752046346664429, - "learning_rate": 0.00039061092554681606, - "loss": 3.4676, + "epoch": 3.5040431266846364, + "grad_norm": 0.6047779321670532, + "learning_rate": 0.0003902622773880194, + "loss": 3.4675, "step": 32500 }, { - "epoch": 3.503390377784953, - "grad_norm": 0.6271814703941345, - "learning_rate": 0.0003902876845167546, - "loss": 3.4844, + "epoch": 3.509433962264151, + "grad_norm": 0.6554362773895264, + "learning_rate": 0.000389938478143551, + "loss": 3.4455, "step": 32550 }, { - "epoch": 3.5087719298245617, - "grad_norm": 0.6477372050285339, - "learning_rate": 0.0003899644434866932, - "loss": 3.457, + "epoch": 3.5148247978436657, + "grad_norm": 0.5947167277336121, + "learning_rate": 0.00038961467889908255, + "loss": 3.4678, "step": 32600 }, { - "epoch": 3.5141534818641698, - "grad_norm": 0.613669753074646, - "learning_rate": 0.00038964120245663184, - "loss": 3.4533, + "epoch": 3.5202156334231804, + "grad_norm": 0.6193543672561646, + "learning_rate": 0.00038929087965461405, + "loss": 3.4529, "step": 32650 }, { - "epoch": 3.519535033903778, - "grad_norm": 0.5857597589492798, - "learning_rate": 0.0003893179614265704, - "loss": 3.4571, + "epoch": 3.525606469002695, + "grad_norm": 0.5975248217582703, + "learning_rate": 0.00038896708041014566, + "loss": 3.4469, "step": 32700 }, { - "epoch": 3.524916585943386, - "grad_norm": 0.6374267339706421, - "learning_rate": 0.000388994720396509, - "loss": 3.455, + "epoch": 3.5309973045822103, + "grad_norm": 0.6135221123695374, + "learning_rate": 0.0003886432811656772, + "loss": 3.4602, "step": 32750 }, { - "epoch": 3.530298137982994, - "grad_norm": 0.6730107665061951, - "learning_rate": 0.0003886714793664475, - "loss": 3.4661, + "epoch": 3.536388140161725, + "grad_norm": 0.5899997353553772, + "learning_rate": 0.0003883194819212088, + "loss": 3.4645, "step": 32800 }, { - "epoch": 3.5356796900226026, - "grad_norm": 0.6586828827857971, - "learning_rate": 0.0003883482383363861, - "loss": 3.4606, + "epoch": 3.5417789757412397, + "grad_norm": 0.5811609625816345, + "learning_rate": 0.00038799568267674036, + "loss": 3.451, "step": 32850 }, { - "epoch": 3.5410612420622107, - "grad_norm": 0.6340796947479248, - "learning_rate": 0.00038802499730632476, - "loss": 3.4729, + "epoch": 3.547169811320755, + "grad_norm": 0.558104932308197, + "learning_rate": 0.00038767188343227197, + "loss": 3.4529, "step": 32900 }, { - "epoch": 3.546442794101819, - "grad_norm": 0.6165903806686401, - "learning_rate": 0.0003877017562762633, - "loss": 3.4466, + "epoch": 3.5525606469002695, + "grad_norm": 0.5764511823654175, + "learning_rate": 0.0003873480841878035, + "loss": 3.474, "step": 32950 }, { - "epoch": 3.5518243461414274, - "grad_norm": 0.6035377383232117, - "learning_rate": 0.0003873785152462019, - "loss": 3.4654, + "epoch": 3.557951482479784, + "grad_norm": 0.6954588890075684, + "learning_rate": 0.0003870242849433351, + "loss": 3.4516, "step": 33000 }, { - "epoch": 3.5518243461414274, - "eval_accuracy": 0.3737836445025279, - "eval_loss": 3.466870069503784, - "eval_runtime": 180.8094, - "eval_samples_per_second": 99.613, - "eval_steps_per_second": 6.228, + "epoch": 3.557951482479784, + "eval_accuracy": 0.373256460690852, + "eval_loss": 3.4739084243774414, + "eval_runtime": 184.6218, + "eval_samples_per_second": 97.556, + "eval_steps_per_second": 6.099, "step": 33000 }, { - "epoch": 3.5572058981810355, - "grad_norm": 0.6327840089797974, - "learning_rate": 0.0003870552742161405, - "loss": 3.4539, + "epoch": 3.5633423180592994, + "grad_norm": 0.5901345014572144, + "learning_rate": 0.00038670048569886667, + "loss": 3.4542, "step": 33050 }, { - "epoch": 3.5625874502206436, - "grad_norm": 0.6125268936157227, - "learning_rate": 0.00038673203318607903, - "loss": 3.4392, + "epoch": 3.568733153638814, + "grad_norm": 0.5970635414123535, + "learning_rate": 0.0003863766864543982, + "loss": 3.4758, "step": 33100 }, { - "epoch": 3.5679690022602517, - "grad_norm": 0.6121278405189514, - "learning_rate": 0.0003864087921560176, - "loss": 3.452, + "epoch": 3.5741239892183287, + "grad_norm": 0.6120544672012329, + "learning_rate": 0.00038605288720992983, + "loss": 3.4519, "step": 33150 }, { - "epoch": 3.57335055429986, - "grad_norm": 0.5984029769897461, - "learning_rate": 0.0003860855511259563, - "loss": 3.4638, + "epoch": 3.579514824797844, + "grad_norm": 0.6588565707206726, + "learning_rate": 0.0003857290879654614, + "loss": 3.468, "step": 33200 }, { - "epoch": 3.5787321063394684, - "grad_norm": 0.6518922448158264, - "learning_rate": 0.0003857623100958948, - "loss": 3.4674, + "epoch": 3.5849056603773586, + "grad_norm": 0.5929945707321167, + "learning_rate": 0.000385405288720993, + "loss": 3.4663, "step": 33250 }, { - "epoch": 3.5841136583790765, - "grad_norm": 0.6262392401695251, - "learning_rate": 0.0003854390690658334, - "loss": 3.4391, + "epoch": 3.5902964959568733, + "grad_norm": 0.6324113607406616, + "learning_rate": 0.00038508148947652453, + "loss": 3.4891, "step": 33300 }, { - "epoch": 3.5894952104186846, - "grad_norm": 0.5916134715080261, - "learning_rate": 0.00038511582803577195, - "loss": 3.4595, + "epoch": 3.595687331536388, + "grad_norm": 0.6006209254264832, + "learning_rate": 0.00038475769023205614, + "loss": 3.4537, "step": 33350 }, { - "epoch": 3.594876762458293, - "grad_norm": 0.6136584877967834, - "learning_rate": 0.00038479258700571054, - "loss": 3.4431, + "epoch": 3.601078167115903, + "grad_norm": 0.5798439383506775, + "learning_rate": 0.00038443389098758763, + "loss": 3.4653, "step": 33400 }, { - "epoch": 3.6002583144979012, - "grad_norm": 0.6699656248092651, - "learning_rate": 0.00038446934597564914, - "loss": 3.4583, + "epoch": 3.606469002695418, + "grad_norm": 0.5905429124832153, + "learning_rate": 0.00038411009174311924, + "loss": 3.4769, "step": 33450 }, { - "epoch": 3.6056398665375093, - "grad_norm": 0.5867238640785217, - "learning_rate": 0.00038414610494558773, - "loss": 3.4409, + "epoch": 3.6118598382749325, + "grad_norm": 0.6151285767555237, + "learning_rate": 0.0003837862924986508, + "loss": 3.4692, "step": 33500 }, { - "epoch": 3.611021418577118, - "grad_norm": 0.6047042012214661, - "learning_rate": 0.00038382286391552633, - "loss": 3.4538, + "epoch": 3.617250673854447, + "grad_norm": 0.6235437393188477, + "learning_rate": 0.00038346249325418234, + "loss": 3.4569, "step": 33550 }, { - "epoch": 3.616402970616726, - "grad_norm": 0.6440869569778442, - "learning_rate": 0.0003834996228854649, - "loss": 3.4487, + "epoch": 3.6226415094339623, + "grad_norm": 0.5967689156532288, + "learning_rate": 0.00038313869400971395, + "loss": 3.4543, "step": 33600 }, { - "epoch": 3.621784522656334, - "grad_norm": 0.5783991813659668, - "learning_rate": 0.00038317638185540346, - "loss": 3.4664, + "epoch": 3.628032345013477, + "grad_norm": 0.6635581851005554, + "learning_rate": 0.0003828148947652455, + "loss": 3.4467, "step": 33650 }, { - "epoch": 3.627166074695942, - "grad_norm": 0.611685574054718, - "learning_rate": 0.00038285314082534206, - "loss": 3.4513, + "epoch": 3.6334231805929917, + "grad_norm": 0.6080300807952881, + "learning_rate": 0.0003824910955207771, + "loss": 3.4451, "step": 33700 }, { - "epoch": 3.6325476267355503, - "grad_norm": 0.6536784172058105, - "learning_rate": 0.0003825298997952806, - "loss": 3.4523, + "epoch": 3.638814016172507, + "grad_norm": 0.6412973999977112, + "learning_rate": 0.00038216729627630865, + "loss": 3.4645, "step": 33750 }, { - "epoch": 3.637929178775159, - "grad_norm": 0.557302713394165, - "learning_rate": 0.00038220665876521925, - "loss": 3.4496, + "epoch": 3.6442048517520216, + "grad_norm": 0.5894738435745239, + "learning_rate": 0.00038184349703184026, + "loss": 3.4445, "step": 33800 }, { - "epoch": 3.643310730814767, - "grad_norm": 0.6502556204795837, - "learning_rate": 0.00038188341773515784, - "loss": 3.4551, + "epoch": 3.6495956873315363, + "grad_norm": 0.6351161599159241, + "learning_rate": 0.0003815196977873718, + "loss": 3.4575, "step": 33850 }, { - "epoch": 3.648692282854375, - "grad_norm": 0.6656308770179749, - "learning_rate": 0.0003815601767050964, - "loss": 3.4679, + "epoch": 3.6549865229110514, + "grad_norm": 0.6070971488952637, + "learning_rate": 0.0003811958985429034, + "loss": 3.4581, "step": 33900 }, { - "epoch": 3.6540738348939836, - "grad_norm": 0.6328909993171692, - "learning_rate": 0.000381236935675035, - "loss": 3.4338, + "epoch": 3.660377358490566, + "grad_norm": 0.6513911485671997, + "learning_rate": 0.00038087209929843496, + "loss": 3.4564, "step": 33950 }, { - "epoch": 3.6594553869335917, - "grad_norm": 0.5742416977882385, - "learning_rate": 0.0003809136946449735, - "loss": 3.4522, + "epoch": 3.665768194070081, + "grad_norm": 0.6223207712173462, + "learning_rate": 0.00038054830005396646, + "loss": 3.4754, "step": 34000 }, { - "epoch": 3.6594553869335917, - "eval_accuracy": 0.3745204197347979, - "eval_loss": 3.4621152877807617, - "eval_runtime": 180.7271, - "eval_samples_per_second": 99.659, - "eval_steps_per_second": 6.23, + "epoch": 3.665768194070081, + "eval_accuracy": 0.37367835985423564, + "eval_loss": 3.467810869216919, + "eval_runtime": 184.9879, + "eval_samples_per_second": 97.363, + "eval_steps_per_second": 6.087, "step": 34000 }, { - "epoch": 3.6648369389732, - "grad_norm": 0.6359716653823853, - "learning_rate": 0.00038059045361491217, - "loss": 3.4588, + "epoch": 3.671159029649596, + "grad_norm": 0.6494529247283936, + "learning_rate": 0.00038022450080949806, + "loss": 3.4553, "step": 34050 }, { - "epoch": 3.670218491012808, - "grad_norm": 0.5943437814712524, - "learning_rate": 0.00038026721258485076, - "loss": 3.4557, + "epoch": 3.6765498652291106, + "grad_norm": 0.5826172828674316, + "learning_rate": 0.0003799007015650296, + "loss": 3.4634, "step": 34100 }, { - "epoch": 3.675600043052416, - "grad_norm": 0.6035296320915222, - "learning_rate": 0.0003799439715547893, - "loss": 3.4482, + "epoch": 3.6819407008086253, + "grad_norm": 0.6290008425712585, + "learning_rate": 0.0003795769023205612, + "loss": 3.47, "step": 34150 }, { - "epoch": 3.6809815950920246, - "grad_norm": 0.6305634379386902, - "learning_rate": 0.00037962719534532916, - "loss": 3.4752, + "epoch": 3.68733153638814, + "grad_norm": 0.5976738333702087, + "learning_rate": 0.00037925310307609277, + "loss": 3.4675, "step": 34200 }, { - "epoch": 3.6863631471316327, - "grad_norm": 0.5965917706489563, - "learning_rate": 0.0003793039543152677, - "loss": 3.4266, + "epoch": 3.6927223719676547, + "grad_norm": 0.6034207940101624, + "learning_rate": 0.0003789293038316244, + "loss": 3.4609, "step": 34250 }, { - "epoch": 3.691744699171241, - "grad_norm": 0.6232327818870544, - "learning_rate": 0.0003789807132852063, - "loss": 3.4366, + "epoch": 3.69811320754717, + "grad_norm": 0.6554673314094543, + "learning_rate": 0.0003786055045871559, + "loss": 3.4695, "step": 34300 }, { - "epoch": 3.6971262512108494, - "grad_norm": 0.6077519655227661, - "learning_rate": 0.0003786574722551449, - "loss": 3.4553, + "epoch": 3.7035040431266846, + "grad_norm": 0.6599771976470947, + "learning_rate": 0.0003782817053426875, + "loss": 3.4774, "step": 34350 }, { - "epoch": 3.7025078032504575, - "grad_norm": 0.6324222087860107, - "learning_rate": 0.00037833423122508343, - "loss": 3.4602, + "epoch": 3.7088948787061993, + "grad_norm": 0.576423704624176, + "learning_rate": 0.0003779579060982191, + "loss": 3.4506, "step": 34400 }, { - "epoch": 3.7078893552900656, - "grad_norm": 0.6138796210289001, - "learning_rate": 0.0003780109901950221, - "loss": 3.4462, + "epoch": 3.7142857142857144, + "grad_norm": 0.6085602641105652, + "learning_rate": 0.00037763410685375063, + "loss": 3.4371, "step": 34450 }, { - "epoch": 3.713270907329674, - "grad_norm": 0.6130177974700928, - "learning_rate": 0.0003776877491649607, - "loss": 3.4534, + "epoch": 3.719676549865229, + "grad_norm": 0.6494543552398682, + "learning_rate": 0.00037731030760928223, + "loss": 3.4561, "step": 34500 }, { - "epoch": 3.7186524593692822, - "grad_norm": 0.585455596446991, - "learning_rate": 0.0003773645081348992, - "loss": 3.4594, + "epoch": 3.725067385444744, + "grad_norm": 0.5890125036239624, + "learning_rate": 0.0003769865083648138, + "loss": 3.4745, "step": 34550 }, { - "epoch": 3.7240340114088903, - "grad_norm": 0.6451506614685059, - "learning_rate": 0.0003770412671048378, - "loss": 3.4623, + "epoch": 3.730458221024259, + "grad_norm": 0.6737868785858154, + "learning_rate": 0.0003766627091203454, + "loss": 3.4653, "step": 34600 }, { - "epoch": 3.7294155634484984, - "grad_norm": 0.6334204077720642, - "learning_rate": 0.00037671802607477635, - "loss": 3.4552, + "epoch": 3.7358490566037736, + "grad_norm": 0.6290357708930969, + "learning_rate": 0.00037633890987587694, + "loss": 3.4407, "step": 34650 }, { - "epoch": 3.7347971154881066, - "grad_norm": 0.6382574439048767, - "learning_rate": 0.000376394785044715, - "loss": 3.4504, + "epoch": 3.7412398921832883, + "grad_norm": 0.5991079807281494, + "learning_rate": 0.00037601511063140855, + "loss": 3.4806, "step": 34700 }, { - "epoch": 3.740178667527715, - "grad_norm": 0.6397031545639038, - "learning_rate": 0.0003760715440146536, - "loss": 3.4533, + "epoch": 3.7466307277628035, + "grad_norm": 0.6049849987030029, + "learning_rate": 0.00037569131138694004, + "loss": 3.4436, "step": 34750 }, { - "epoch": 3.745560219567323, - "grad_norm": 0.644877016544342, - "learning_rate": 0.00037574830298459214, - "loss": 3.4596, + "epoch": 3.752021563342318, + "grad_norm": 0.6415731906890869, + "learning_rate": 0.0003753675121424716, + "loss": 3.4751, "step": 34800 }, { - "epoch": 3.7509417716069313, - "grad_norm": 0.6290867924690247, - "learning_rate": 0.00037542506195453073, - "loss": 3.4531, + "epoch": 3.757412398921833, + "grad_norm": 0.602078378200531, + "learning_rate": 0.0003750437128980032, + "loss": 3.4876, "step": 34850 }, { - "epoch": 3.75632332364654, - "grad_norm": 0.6182779669761658, - "learning_rate": 0.0003751018209244693, - "loss": 3.4488, + "epoch": 3.7628032345013476, + "grad_norm": 0.5774201154708862, + "learning_rate": 0.00037471991365353475, + "loss": 3.4721, "step": 34900 }, { - "epoch": 3.761704875686148, - "grad_norm": 0.6078924536705017, - "learning_rate": 0.00037477857989440787, - "loss": 3.4599, + "epoch": 3.7681940700808623, + "grad_norm": 0.5578243136405945, + "learning_rate": 0.00037439611440906635, + "loss": 3.4525, "step": 34950 }, { - "epoch": 3.767086427725756, - "grad_norm": 0.6303339004516602, - "learning_rate": 0.0003744553388643465, - "loss": 3.4511, + "epoch": 3.7735849056603774, + "grad_norm": 0.5971388220787048, + "learning_rate": 0.0003740723151645979, + "loss": 3.4543, "step": 35000 }, { - "epoch": 3.767086427725756, - "eval_accuracy": 0.37501337788681216, - "eval_loss": 3.45632266998291, - "eval_runtime": 180.682, - "eval_samples_per_second": 99.683, - "eval_steps_per_second": 6.232, + "epoch": 3.7735849056603774, + "eval_accuracy": 0.37426954021683423, + "eval_loss": 3.4600491523742676, + "eval_runtime": 184.946, + "eval_samples_per_second": 97.385, + "eval_steps_per_second": 6.088, "step": 35000 }, { - "epoch": 3.772467979765364, - "grad_norm": 0.6259037852287292, - "learning_rate": 0.0003741320978342851, - "loss": 3.4522, + "epoch": 3.778975741239892, + "grad_norm": 0.6439304947853088, + "learning_rate": 0.0003737485159201295, + "loss": 3.4757, "step": 35050 }, { - "epoch": 3.7778495318049723, - "grad_norm": 0.6218352913856506, - "learning_rate": 0.00037380885680422365, - "loss": 3.452, + "epoch": 3.784366576819407, + "grad_norm": 0.6615363955497742, + "learning_rate": 0.00037342471667566106, + "loss": 3.4424, "step": 35100 }, { - "epoch": 3.783231083844581, - "grad_norm": 0.6394650936126709, - "learning_rate": 0.00037348561577416224, - "loss": 3.4425, + "epoch": 3.789757412398922, + "grad_norm": 0.5871990323066711, + "learning_rate": 0.00037310091743119266, + "loss": 3.4652, "step": 35150 }, { - "epoch": 3.788612635884189, - "grad_norm": 0.6433684825897217, - "learning_rate": 0.0003731623747441008, - "loss": 3.4432, + "epoch": 3.7951482479784366, + "grad_norm": 0.6038589477539062, + "learning_rate": 0.0003727771181867242, + "loss": 3.4471, "step": 35200 }, { - "epoch": 3.793994187923797, - "grad_norm": 0.5973502397537231, - "learning_rate": 0.0003728391337140394, - "loss": 3.4626, + "epoch": 3.8005390835579513, + "grad_norm": 0.6315929889678955, + "learning_rate": 0.00037245331894225576, + "loss": 3.4738, "step": 35250 }, { - "epoch": 3.7993757399634056, - "grad_norm": 0.6366912126541138, - "learning_rate": 0.00037251589268397803, - "loss": 3.4434, + "epoch": 3.8059299191374665, + "grad_norm": 0.6901034116744995, + "learning_rate": 0.00037212951969778737, + "loss": 3.4629, "step": 35300 }, { - "epoch": 3.8047572920030137, - "grad_norm": 0.6271637082099915, - "learning_rate": 0.00037219265165391657, - "loss": 3.4478, + "epoch": 3.811320754716981, + "grad_norm": 0.6126003265380859, + "learning_rate": 0.00037180572045331887, + "loss": 3.4642, "step": 35350 }, { - "epoch": 3.810138844042622, - "grad_norm": 0.6246579885482788, - "learning_rate": 0.00037186941062385516, - "loss": 3.4664, + "epoch": 3.816711590296496, + "grad_norm": 0.6850118637084961, + "learning_rate": 0.00037148192120885047, + "loss": 3.4828, "step": 35400 }, { - "epoch": 3.8155203960822304, - "grad_norm": 0.6329902410507202, - "learning_rate": 0.0003715461695937937, - "loss": 3.4509, + "epoch": 3.822102425876011, + "grad_norm": 0.6038691401481628, + "learning_rate": 0.000371158121964382, + "loss": 3.4385, "step": 35450 }, { - "epoch": 3.8209019481218385, - "grad_norm": 0.6158545613288879, - "learning_rate": 0.0003712229285637323, - "loss": 3.4538, + "epoch": 3.8274932614555257, + "grad_norm": 0.6459229588508606, + "learning_rate": 0.0003708343227199136, + "loss": 3.4753, "step": 35500 }, { - "epoch": 3.8262835001614466, - "grad_norm": 0.6100180149078369, - "learning_rate": 0.0003708996875336709, - "loss": 3.4539, + "epoch": 3.8328840970350404, + "grad_norm": 0.6884293556213379, + "learning_rate": 0.0003705105234754452, + "loss": 3.4546, "step": 35550 }, { - "epoch": 3.8316650522010547, - "grad_norm": 0.6152170896530151, - "learning_rate": 0.0003705764465036095, - "loss": 3.4498, + "epoch": 3.838274932614555, + "grad_norm": 0.598310649394989, + "learning_rate": 0.0003701867242309768, + "loss": 3.4446, "step": 35600 }, { - "epoch": 3.837046604240663, - "grad_norm": 0.6338374018669128, - "learning_rate": 0.0003702532054735481, - "loss": 3.4563, + "epoch": 3.8436657681940702, + "grad_norm": 0.6089202761650085, + "learning_rate": 0.00036986292498650833, + "loss": 3.4598, "step": 35650 }, { - "epoch": 3.8424281562802713, - "grad_norm": 0.6253648996353149, - "learning_rate": 0.0003699299644434867, - "loss": 3.452, + "epoch": 3.849056603773585, + "grad_norm": 0.6563847661018372, + "learning_rate": 0.0003695391257420399, + "loss": 3.4544, "step": 35700 }, { - "epoch": 3.8478097083198795, - "grad_norm": 0.6302428245544434, - "learning_rate": 0.0003696067234134252, - "loss": 3.456, + "epoch": 3.8544474393530996, + "grad_norm": 0.6375155448913574, + "learning_rate": 0.0003692153264975715, + "loss": 3.4363, "step": 35750 }, { - "epoch": 3.8531912603594876, - "grad_norm": 0.6266033053398132, - "learning_rate": 0.0003692834823833638, - "loss": 3.4385, + "epoch": 3.8598382749326143, + "grad_norm": 0.6118590831756592, + "learning_rate": 0.00036889152725310304, + "loss": 3.4561, "step": 35800 }, { - "epoch": 3.858572812399096, - "grad_norm": 0.613212525844574, - "learning_rate": 0.00036896024135330246, - "loss": 3.4627, + "epoch": 3.8652291105121295, + "grad_norm": 0.637126624584198, + "learning_rate": 0.00036856772800863464, + "loss": 3.4628, "step": 35850 }, { - "epoch": 3.863954364438704, - "grad_norm": 0.6388803720474243, - "learning_rate": 0.000368637000323241, - "loss": 3.4467, + "epoch": 3.870619946091644, + "grad_norm": 0.5772865414619446, + "learning_rate": 0.0003682439287641662, + "loss": 3.4519, "step": 35900 }, { - "epoch": 3.8693359164783123, - "grad_norm": 0.6624432802200317, - "learning_rate": 0.0003683137592931796, - "loss": 3.4508, + "epoch": 3.876010781671159, + "grad_norm": 0.5758655667304993, + "learning_rate": 0.0003679201295196978, + "loss": 3.4571, "step": 35950 }, { - "epoch": 3.8747174685179204, - "grad_norm": 0.6262862086296082, - "learning_rate": 0.00036799051826311814, - "loss": 3.4455, + "epoch": 3.881401617250674, + "grad_norm": 0.6438195109367371, + "learning_rate": 0.00036759633027522935, + "loss": 3.4385, "step": 36000 }, { - "epoch": 3.8747174685179204, - "eval_accuracy": 0.37557652580435463, - "eval_loss": 3.453572988510132, - "eval_runtime": 180.586, - "eval_samples_per_second": 99.736, - "eval_steps_per_second": 6.235, + "epoch": 3.881401617250674, + "eval_accuracy": 0.375193850333472, + "eval_loss": 3.4544591903686523, + "eval_runtime": 184.7169, + "eval_samples_per_second": 97.506, + "eval_steps_per_second": 6.096, "step": 36000 }, { - "epoch": 3.8800990205575285, - "grad_norm": 0.6518728137016296, - "learning_rate": 0.00036766727723305673, - "loss": 3.4398, + "epoch": 3.8867924528301887, + "grad_norm": 0.6010862588882446, + "learning_rate": 0.00036727253103076084, + "loss": 3.4421, "step": 36050 }, { - "epoch": 3.885480572597137, - "grad_norm": 0.6276541352272034, - "learning_rate": 0.0003673440362029953, - "loss": 3.4572, + "epoch": 3.8921832884097034, + "grad_norm": 0.5899602770805359, + "learning_rate": 0.00036694873178629245, + "loss": 3.4652, "step": 36100 }, { - "epoch": 3.890862124636745, - "grad_norm": 0.6711607575416565, - "learning_rate": 0.0003670207951729339, - "loss": 3.4699, + "epoch": 3.8975741239892185, + "grad_norm": 0.6755630373954773, + "learning_rate": 0.000366624932541824, + "loss": 3.4556, "step": 36150 }, { - "epoch": 3.8962436766763533, - "grad_norm": 0.610508382320404, - "learning_rate": 0.0003667040189634737, - "loss": 3.4665, + "epoch": 3.9029649595687332, + "grad_norm": 0.5775014758110046, + "learning_rate": 0.0003663011332973556, + "loss": 3.4361, "step": 36200 }, { - "epoch": 3.901625228715962, - "grad_norm": 0.6061813235282898, - "learning_rate": 0.0003663807779334123, - "loss": 3.4368, + "epoch": 3.908355795148248, + "grad_norm": 0.6255040168762207, + "learning_rate": 0.00036597733405288715, + "loss": 3.4413, "step": 36250 }, { - "epoch": 3.90700678075557, - "grad_norm": 0.6153860092163086, - "learning_rate": 0.0003660575369033509, - "loss": 3.4408, + "epoch": 3.913746630727763, + "grad_norm": 0.6343650817871094, + "learning_rate": 0.00036565353480841876, + "loss": 3.4543, "step": 36300 }, { - "epoch": 3.912388332795178, - "grad_norm": 0.6401193141937256, - "learning_rate": 0.0003657342958732895, - "loss": 3.4316, + "epoch": 3.9191374663072778, + "grad_norm": 0.6278274059295654, + "learning_rate": 0.0003653297355639503, + "loss": 3.445, "step": 36350 }, { - "epoch": 3.9177698848347866, - "grad_norm": 0.6334563493728638, - "learning_rate": 0.00036541105484322805, - "loss": 3.4479, + "epoch": 3.9245283018867925, + "grad_norm": 0.6233466267585754, + "learning_rate": 0.0003650059363194819, + "loss": 3.4475, "step": 36400 }, { - "epoch": 3.9231514368743947, - "grad_norm": 0.6796964406967163, - "learning_rate": 0.00036508781381316665, - "loss": 3.4225, + "epoch": 3.929919137466307, + "grad_norm": 0.6265336275100708, + "learning_rate": 0.00036468213707501347, + "loss": 3.4557, "step": 36450 }, { - "epoch": 3.928532988914003, - "grad_norm": 0.5866906046867371, - "learning_rate": 0.0003647645727831053, - "loss": 3.4297, + "epoch": 3.935309973045822, + "grad_norm": 0.6003140211105347, + "learning_rate": 0.0003643648138154344, + "loss": 3.4605, "step": 36500 }, { - "epoch": 3.933914540953611, - "grad_norm": 0.6415225267410278, - "learning_rate": 0.00036444133175304384, - "loss": 3.4563, + "epoch": 3.940700808625337, + "grad_norm": 0.6145981550216675, + "learning_rate": 0.00036404101457096597, + "loss": 3.449, "step": 36550 }, { - "epoch": 3.939296092993219, - "grad_norm": 0.6429531574249268, - "learning_rate": 0.00036411809072298243, - "loss": 3.4365, + "epoch": 3.9460916442048517, + "grad_norm": 0.6301774978637695, + "learning_rate": 0.00036371721532649757, + "loss": 3.4603, "step": 36600 }, { - "epoch": 3.9446776450328276, - "grad_norm": 0.590459942817688, - "learning_rate": 0.00036379484969292097, - "loss": 3.4546, + "epoch": 3.9514824797843664, + "grad_norm": 0.5825753211975098, + "learning_rate": 0.0003633934160820291, + "loss": 3.4527, "step": 36650 }, { - "epoch": 3.9500591970724357, - "grad_norm": 0.5942658185958862, - "learning_rate": 0.00036347160866285956, - "loss": 3.438, + "epoch": 3.9568733153638815, + "grad_norm": 0.6060313582420349, + "learning_rate": 0.00036306961683756073, + "loss": 3.453, "step": 36700 }, { - "epoch": 3.955440749112044, - "grad_norm": 0.6288406848907471, - "learning_rate": 0.0003631483676327981, - "loss": 3.4517, + "epoch": 3.9622641509433962, + "grad_norm": 0.6096614003181458, + "learning_rate": 0.0003627458175930922, + "loss": 3.4412, "step": 36750 }, { - "epoch": 3.9608223011516523, - "grad_norm": 0.6701431274414062, - "learning_rate": 0.00036282512660273675, - "loss": 3.4485, + "epoch": 3.967654986522911, + "grad_norm": 0.6492642760276794, + "learning_rate": 0.0003624220183486238, + "loss": 3.4409, "step": 36800 }, { - "epoch": 3.9662038531912605, - "grad_norm": 0.7019712328910828, - "learning_rate": 0.00036250188557267535, - "loss": 3.4408, + "epoch": 3.973045822102426, + "grad_norm": 0.6882068514823914, + "learning_rate": 0.0003620982191041554, + "loss": 3.4385, "step": 36850 }, { - "epoch": 3.9715854052308686, - "grad_norm": 0.6082743406295776, - "learning_rate": 0.0003621786445426139, - "loss": 3.4446, + "epoch": 3.9784366576819408, + "grad_norm": 0.6335549354553223, + "learning_rate": 0.00036177441985968693, + "loss": 3.4543, "step": 36900 }, { - "epoch": 3.9769669572704767, - "grad_norm": 0.6317737102508545, - "learning_rate": 0.0003618554035125525, - "loss": 3.4349, + "epoch": 3.9838274932614555, + "grad_norm": 0.6579206585884094, + "learning_rate": 0.00036145062061521854, + "loss": 3.4656, "step": 36950 }, { - "epoch": 3.9823485093100848, - "grad_norm": 0.619764506816864, - "learning_rate": 0.0003615321624824911, - "loss": 3.4402, + "epoch": 3.9892183288409706, + "grad_norm": 0.6292389631271362, + "learning_rate": 0.0003611268213707501, + "loss": 3.4609, "step": 37000 }, { - "epoch": 3.9823485093100848, - "eval_accuracy": 0.3760329765860633, - "eval_loss": 3.4432144165039062, - "eval_runtime": 180.6742, - "eval_samples_per_second": 99.688, - "eval_steps_per_second": 6.232, + "epoch": 3.9892183288409706, + "eval_accuracy": 0.3756760518495927, + "eval_loss": 3.4502644538879395, + "eval_runtime": 185.0208, + "eval_samples_per_second": 97.346, + "eval_steps_per_second": 6.086, "step": 37000 }, { - "epoch": 3.9877300613496933, - "grad_norm": 0.649409830570221, - "learning_rate": 0.0003612089214524296, - "loss": 3.4344, + "epoch": 3.9946091644204853, + "grad_norm": 0.6191860437393188, + "learning_rate": 0.0003608030221262817, + "loss": 3.4559, "step": 37050 }, { - "epoch": 3.9931116133893014, - "grad_norm": 0.6841443777084351, - "learning_rate": 0.00036088568042236827, - "loss": 3.4393, + "epoch": 4.0, + "grad_norm": 1.2047762870788574, + "learning_rate": 0.00036047922288181324, + "loss": 3.4466, "step": 37100 }, { - "epoch": 3.9984931654289095, - "grad_norm": 0.6146060824394226, - "learning_rate": 0.00036056243939230686, - "loss": 3.4337, + "epoch": 4.005390835579515, + "grad_norm": 0.6499625444412231, + "learning_rate": 0.00036015542363734485, + "loss": 3.3602, "step": 37150 }, { - "epoch": 4.003874717468518, - "grad_norm": 0.6463866829872131, - "learning_rate": 0.0003602391983622454, - "loss": 3.3701, + "epoch": 4.010781671159029, + "grad_norm": 0.6357666850090027, + "learning_rate": 0.0003598316243928764, + "loss": 3.3603, "step": 37200 }, { - "epoch": 4.009256269508126, - "grad_norm": 0.6479421257972717, - "learning_rate": 0.000359915957332184, - "loss": 3.3524, + "epoch": 4.0161725067385445, + "grad_norm": 0.6046599745750427, + "learning_rate": 0.00035950782514840795, + "loss": 3.3552, "step": 37250 }, { - "epoch": 4.014637821547734, - "grad_norm": 0.7297543883323669, - "learning_rate": 0.00035959271630212254, - "loss": 3.3483, + "epoch": 4.02156334231806, + "grad_norm": 0.7040229439735413, + "learning_rate": 0.00035918402590393955, + "loss": 3.3379, "step": 37300 }, { - "epoch": 4.020019373587343, - "grad_norm": 0.6528753638267517, - "learning_rate": 0.00035926947527206113, - "loss": 3.3525, + "epoch": 4.026954177897574, + "grad_norm": 0.6770185828208923, + "learning_rate": 0.00035886022665947105, + "loss": 3.3644, "step": 37350 }, { - "epoch": 4.0254009256269505, - "grad_norm": 0.6073748469352722, - "learning_rate": 0.0003589462342419998, - "loss": 3.3627, + "epoch": 4.032345013477089, + "grad_norm": 0.6390303373336792, + "learning_rate": 0.0003585364274150027, + "loss": 3.3712, "step": 37400 }, { - "epoch": 4.030782477666559, - "grad_norm": 0.6415776610374451, - "learning_rate": 0.0003586229932119383, - "loss": 3.3607, + "epoch": 4.037735849056604, + "grad_norm": 0.6191239953041077, + "learning_rate": 0.0003582126281705342, + "loss": 3.3588, "step": 37450 }, { - "epoch": 4.036164029706168, - "grad_norm": 0.629884660243988, - "learning_rate": 0.0003582997521818769, - "loss": 3.3572, + "epoch": 4.0431266846361185, + "grad_norm": 0.6024324893951416, + "learning_rate": 0.0003578888289260658, + "loss": 3.3723, "step": 37500 }, { - "epoch": 4.041545581745775, - "grad_norm": 0.6344863772392273, - "learning_rate": 0.0003579765111518155, - "loss": 3.3406, + "epoch": 4.048517520215634, + "grad_norm": 0.6768052577972412, + "learning_rate": 0.00035756502968159736, + "loss": 3.3643, "step": 37550 }, { - "epoch": 4.046927133785384, - "grad_norm": 0.5791500806808472, - "learning_rate": 0.00035765327012175405, - "loss": 3.3334, + "epoch": 4.053908355795148, + "grad_norm": 0.646030068397522, + "learning_rate": 0.00035724123043712896, + "loss": 3.3596, "step": 37600 }, { - "epoch": 4.0523086858249915, - "grad_norm": 0.6291230320930481, - "learning_rate": 0.0003573300290916927, - "loss": 3.364, + "epoch": 4.059299191374663, + "grad_norm": 0.7075605392456055, + "learning_rate": 0.0003569174311926605, + "loss": 3.3739, "step": 37650 }, { - "epoch": 4.0576902378646, - "grad_norm": 0.6606059670448303, - "learning_rate": 0.0003570067880616313, - "loss": 3.3702, + "epoch": 4.064690026954178, + "grad_norm": 0.6510331034660339, + "learning_rate": 0.00035659363194819206, + "loss": 3.3767, "step": 37700 }, { - "epoch": 4.063071789904209, - "grad_norm": 0.66637122631073, - "learning_rate": 0.00035668354703156984, - "loss": 3.375, + "epoch": 4.070080862533692, + "grad_norm": 0.633736252784729, + "learning_rate": 0.00035626983270372367, + "loss": 3.355, "step": 37750 }, { - "epoch": 4.068453341943816, - "grad_norm": 0.6079049110412598, - "learning_rate": 0.00035636030600150843, - "loss": 3.3491, + "epoch": 4.0754716981132075, + "grad_norm": 0.6637200713157654, + "learning_rate": 0.0003559460334592552, + "loss": 3.3565, "step": 37800 }, { - "epoch": 4.073834893983425, - "grad_norm": 0.6581602692604065, - "learning_rate": 0.00035603706497144697, - "loss": 3.3601, + "epoch": 4.080862533692723, + "grad_norm": 0.6610650420188904, + "learning_rate": 0.0003556222342147868, + "loss": 3.3665, "step": 37850 }, { - "epoch": 4.079216446023033, - "grad_norm": 0.5903530716896057, - "learning_rate": 0.00035571382394138557, - "loss": 3.3685, + "epoch": 4.086253369272237, + "grad_norm": 0.5748628377914429, + "learning_rate": 0.0003552984349703184, + "loss": 3.3764, "step": 37900 }, { - "epoch": 4.084597998062641, - "grad_norm": 0.6376979947090149, - "learning_rate": 0.0003553905829113242, - "loss": 3.3583, + "epoch": 4.091644204851752, + "grad_norm": 0.6597402691841125, + "learning_rate": 0.00035497463572585, + "loss": 3.3859, "step": 37950 }, { - "epoch": 4.08997955010225, - "grad_norm": 0.6378331184387207, - "learning_rate": 0.00035506734188126275, - "loss": 3.3602, + "epoch": 4.097035040431267, + "grad_norm": 0.6575984954833984, + "learning_rate": 0.00035465083648138153, + "loss": 3.3725, "step": 38000 }, { - "epoch": 4.08997955010225, - "eval_accuracy": 0.37666750945089983, - "eval_loss": 3.4465579986572266, - "eval_runtime": 180.5241, - "eval_samples_per_second": 99.771, - "eval_steps_per_second": 6.237, + "epoch": 4.097035040431267, + "eval_accuracy": 0.37657613237224785, + "eval_loss": 3.45074462890625, + "eval_runtime": 185.1239, + "eval_samples_per_second": 97.292, + "eval_steps_per_second": 6.082, "step": 38000 }, { - "epoch": 4.095361102141858, - "grad_norm": 0.6897552013397217, - "learning_rate": 0.00035474410085120135, - "loss": 3.3881, + "epoch": 4.1024258760107815, + "grad_norm": 0.6551222205162048, + "learning_rate": 0.00035432703723691314, + "loss": 3.3677, "step": 38050 }, { - "epoch": 4.100742654181466, - "grad_norm": 0.624983549118042, - "learning_rate": 0.00035442085982113994, - "loss": 3.384, + "epoch": 4.107816711590297, + "grad_norm": 0.6057955026626587, + "learning_rate": 0.00035400323799244463, + "loss": 3.3761, "step": 38100 }, { - "epoch": 4.106124206221074, - "grad_norm": 0.6599211692810059, - "learning_rate": 0.0003540976187910785, - "loss": 3.3793, + "epoch": 4.113207547169812, + "grad_norm": 0.5905152559280396, + "learning_rate": 0.0003536794387479762, + "loss": 3.3678, "step": 38150 }, { - "epoch": 4.111505758260682, - "grad_norm": Infinity, - "learning_rate": 0.0003537808425816183, - "loss": 3.3838, + "epoch": 4.118598382749326, + "grad_norm": 0.621264636516571, + "learning_rate": 0.0003533556395035078, + "loss": 3.3929, "step": 38200 }, { - "epoch": 4.1168873103002905, - "grad_norm": 0.6250761151313782, - "learning_rate": 0.0003534576015515569, - "loss": 3.372, + "epoch": 4.123989218328841, + "grad_norm": 0.6266987919807434, + "learning_rate": 0.00035303184025903934, + "loss": 3.3833, "step": 38250 }, { - "epoch": 4.122268862339899, - "grad_norm": 0.6939108967781067, - "learning_rate": 0.00035313436052149553, - "loss": 3.3702, + "epoch": 4.129380053908355, + "grad_norm": 0.6679258346557617, + "learning_rate": 0.00035270804101457094, + "loss": 3.386, "step": 38300 }, { - "epoch": 4.127650414379507, - "grad_norm": 0.6383840441703796, - "learning_rate": 0.0003528111194914341, - "loss": 3.3634, + "epoch": 4.1347708894878705, + "grad_norm": 0.6121634244918823, + "learning_rate": 0.0003523842417701025, + "loss": 3.3758, "step": 38350 }, { - "epoch": 4.133031966419115, - "grad_norm": 0.631736159324646, - "learning_rate": 0.00035248787846137267, - "loss": 3.3815, + "epoch": 4.140161725067386, + "grad_norm": 0.6435196995735168, + "learning_rate": 0.0003520604425256341, + "loss": 3.3692, "step": 38400 }, { - "epoch": 4.138413518458724, - "grad_norm": 0.9963732957839966, - "learning_rate": 0.00035216463743131126, - "loss": 3.3576, + "epoch": 4.1455525606469, + "grad_norm": 0.6009247303009033, + "learning_rate": 0.00035173664328116565, + "loss": 3.3782, "step": 38450 }, { - "epoch": 4.1437950704983315, - "grad_norm": 0.6733307838439941, - "learning_rate": 0.0003518413964012498, - "loss": 3.3926, + "epoch": 4.150943396226415, + "grad_norm": 0.630068302154541, + "learning_rate": 0.00035141284403669725, + "loss": 3.3858, "step": 38500 }, { - "epoch": 4.14917662253794, - "grad_norm": 0.5962156653404236, - "learning_rate": 0.0003515181553711884, - "loss": 3.3779, + "epoch": 4.15633423180593, + "grad_norm": Infinity, + "learning_rate": 0.00035109552077711815, + "loss": 3.3752, "step": 38550 }, { - "epoch": 4.154558174577549, - "grad_norm": 0.6685757637023926, - "learning_rate": 0.00035119491434112705, - "loss": 3.3795, + "epoch": 4.1617250673854445, + "grad_norm": 0.6584694981575012, + "learning_rate": 0.00035077172153264976, + "loss": 3.3719, "step": 38600 }, { - "epoch": 4.159939726617156, - "grad_norm": 0.6577488780021667, - "learning_rate": 0.0003508716733110656, - "loss": 3.3678, + "epoch": 4.16711590296496, + "grad_norm": 0.680530309677124, + "learning_rate": 0.0003504479222881813, + "loss": 3.3995, "step": 38650 }, { - "epoch": 4.165321278656765, - "grad_norm": 0.7001060247421265, - "learning_rate": 0.0003505484322810042, - "loss": 3.3803, + "epoch": 4.172506738544475, + "grad_norm": 0.662716269493103, + "learning_rate": 0.0003501241230437129, + "loss": 3.3773, "step": 38700 }, { - "epoch": 4.1707028306963725, - "grad_norm": 0.6563752889633179, - "learning_rate": 0.0003502251912509427, - "loss": 3.3882, + "epoch": 4.177897574123989, + "grad_norm": 0.6013767123222351, + "learning_rate": 0.0003498003237992444, + "loss": 3.3717, "step": 38750 }, { - "epoch": 4.176084382735981, - "grad_norm": 0.6249417066574097, - "learning_rate": 0.0003499019502208813, - "loss": 3.3791, + "epoch": 4.183288409703504, + "grad_norm": 0.6527413725852966, + "learning_rate": 0.000349476524554776, + "loss": 3.392, "step": 38800 }, { - "epoch": 4.18146593477559, - "grad_norm": 0.6289679408073425, - "learning_rate": 0.0003495787091908199, - "loss": 3.3752, + "epoch": 4.188679245283019, + "grad_norm": 0.6448991298675537, + "learning_rate": 0.00034915272531030756, + "loss": 3.395, "step": 38850 }, { - "epoch": 4.186847486815197, - "grad_norm": 0.6969509720802307, - "learning_rate": 0.0003492554681607585, - "loss": 3.3764, + "epoch": 4.1940700808625335, + "grad_norm": 0.6825259923934937, + "learning_rate": 0.0003488289260658391, + "loss": 3.3939, "step": 38900 }, { - "epoch": 4.192229038854806, - "grad_norm": 0.663133442401886, - "learning_rate": 0.0003489322271306971, - "loss": 3.3928, + "epoch": 4.199460916442049, + "grad_norm": 0.6313694715499878, + "learning_rate": 0.0003485051268213707, + "loss": 3.387, "step": 38950 }, { - "epoch": 4.197610590894414, - "grad_norm": 0.7347210645675659, - "learning_rate": 0.0003486089861006357, - "loss": 3.3807, + "epoch": 4.204851752021563, + "grad_norm": 0.6302531361579895, + "learning_rate": 0.00034818132757690227, + "loss": 3.3828, "step": 39000 }, { - "epoch": 4.197610590894414, - "eval_accuracy": 0.37702878030288295, - "eval_loss": 3.4417600631713867, - "eval_runtime": 180.8436, - "eval_samples_per_second": 99.594, - "eval_steps_per_second": 6.226, + "epoch": 4.204851752021563, + "eval_accuracy": 0.37683353106348033, + "eval_loss": 3.4443628787994385, + "eval_runtime": 186.1059, + "eval_samples_per_second": 96.778, + "eval_steps_per_second": 6.05, "step": 39000 }, { - "epoch": 4.202992142934022, - "grad_norm": 0.72121262550354, - "learning_rate": 0.00034828574507057424, - "loss": 3.3713, + "epoch": 4.210242587601078, + "grad_norm": 0.6770233511924744, + "learning_rate": 0.0003478575283324339, + "loss": 3.3811, "step": 39050 }, { - "epoch": 4.208373694973631, - "grad_norm": 0.6440616250038147, - "learning_rate": 0.00034796250404051283, - "loss": 3.3781, + "epoch": 4.215633423180593, + "grad_norm": 0.6239271759986877, + "learning_rate": 0.0003475337290879654, + "loss": 3.3813, "step": 39100 }, { - "epoch": 4.213755247013238, - "grad_norm": 0.6587533950805664, - "learning_rate": 0.00034763926301045137, - "loss": 3.3838, + "epoch": 4.2210242587601075, + "grad_norm": 0.6964449286460876, + "learning_rate": 0.00034720992984349703, + "loss": 3.379, "step": 39150 }, { - "epoch": 4.219136799052847, - "grad_norm": 0.6770883798599243, - "learning_rate": 0.00034731602198039, - "loss": 3.3788, + "epoch": 4.226415094339623, + "grad_norm": 0.6273817420005798, + "learning_rate": 0.0003468861305990286, + "loss": 3.3769, "step": 39200 }, { - "epoch": 4.224518351092455, - "grad_norm": 0.6280528903007507, - "learning_rate": 0.0003469927809503286, - "loss": 3.3621, + "epoch": 4.231805929919138, + "grad_norm": 0.660798966884613, + "learning_rate": 0.0003465623313545602, + "loss": 3.3748, "step": 39250 }, { - "epoch": 4.229899903132063, - "grad_norm": 0.7106229662895203, - "learning_rate": 0.00034666953992026716, - "loss": 3.3894, + "epoch": 4.237196765498652, + "grad_norm": 0.6665384769439697, + "learning_rate": 0.00034623853211009173, + "loss": 3.3878, "step": 39300 }, { - "epoch": 4.2352814551716715, - "grad_norm": 0.6384456157684326, - "learning_rate": 0.00034634629889020575, - "loss": 3.3686, + "epoch": 4.242587601078167, + "grad_norm": 0.6520787477493286, + "learning_rate": 0.00034591473286562323, + "loss": 3.3853, "step": 39350 }, { - "epoch": 4.24066300721128, - "grad_norm": 0.6792171001434326, - "learning_rate": 0.00034602305786014435, - "loss": 3.3783, + "epoch": 4.247978436657682, + "grad_norm": 0.6408469080924988, + "learning_rate": 0.0003455909336211549, + "loss": 3.3927, "step": 39400 }, { - "epoch": 4.246044559250888, - "grad_norm": 0.6978849768638611, - "learning_rate": 0.00034569981683008294, - "loss": 3.3816, + "epoch": 4.2533692722371965, + "grad_norm": 0.638053297996521, + "learning_rate": 0.0003452671343766864, + "loss": 3.3805, "step": 39450 }, { - "epoch": 4.251426111290496, - "grad_norm": 0.6810808181762695, - "learning_rate": 0.00034537657580002154, - "loss": 3.3828, + "epoch": 4.258760107816712, + "grad_norm": 0.654602587223053, + "learning_rate": 0.000344943335132218, + "loss": 3.3743, "step": 39500 }, { - "epoch": 4.256807663330104, - "grad_norm": 0.6624694466590881, - "learning_rate": 0.00034505333476996013, - "loss": 3.3848, + "epoch": 4.264150943396227, + "grad_norm": 0.7286903858184814, + "learning_rate": 0.00034461953588774954, + "loss": 3.3771, "step": 39550 }, { - "epoch": 4.2621892153697125, - "grad_norm": 0.6291442513465881, - "learning_rate": 0.00034473009373989867, - "loss": 3.3908, + "epoch": 4.269541778975741, + "grad_norm": 0.672970712184906, + "learning_rate": 0.00034429573664328115, + "loss": 3.3774, "step": 39600 }, { - "epoch": 4.267570767409321, - "grad_norm": 0.6254659295082092, - "learning_rate": 0.00034440685270983727, - "loss": 3.3732, + "epoch": 4.274932614555256, + "grad_norm": 0.6813864707946777, + "learning_rate": 0.0003439719373988127, + "loss": 3.4044, "step": 39650 }, { - "epoch": 4.272952319448929, - "grad_norm": 0.6378159523010254, - "learning_rate": 0.0003440836116797758, - "loss": 3.3713, + "epoch": 4.280323450134771, + "grad_norm": 0.6745753288269043, + "learning_rate": 0.00034364813815434425, + "loss": 3.3876, "step": 39700 }, { - "epoch": 4.278333871488537, - "grad_norm": 0.6168226003646851, - "learning_rate": 0.00034376037064971445, - "loss": 3.3812, + "epoch": 4.285714285714286, + "grad_norm": 0.6817641854286194, + "learning_rate": 0.00034332433890987585, + "loss": 3.3838, "step": 39750 }, { - "epoch": 4.283715423528146, - "grad_norm": 0.6415433287620544, - "learning_rate": 0.00034343712961965305, - "loss": 3.3846, + "epoch": 4.291105121293801, + "grad_norm": 0.6120652556419373, + "learning_rate": 0.0003430005396654074, + "loss": 3.3974, "step": 39800 }, { - "epoch": 4.2890969755677535, - "grad_norm": 0.6528810262680054, - "learning_rate": 0.0003431138885895916, - "loss": 3.377, + "epoch": 4.296495956873315, + "grad_norm": 0.6508928537368774, + "learning_rate": 0.000342676740420939, + "loss": 3.4013, "step": 39850 }, { - "epoch": 4.294478527607362, - "grad_norm": 0.6433749794960022, - "learning_rate": 0.0003427906475595302, - "loss": 3.3652, + "epoch": 4.30188679245283, + "grad_norm": 0.6188628077507019, + "learning_rate": 0.00034235294117647056, + "loss": 3.392, "step": 39900 }, { - "epoch": 4.299860079646971, - "grad_norm": 0.6395750641822815, - "learning_rate": 0.0003424674065294688, - "loss": 3.381, + "epoch": 4.307277628032345, + "grad_norm": 0.6334621906280518, + "learning_rate": 0.00034202914193200216, + "loss": 3.3925, "step": 39950 }, { - "epoch": 4.305241631686578, - "grad_norm": 0.6627628207206726, - "learning_rate": 0.0003421441654994073, - "loss": 3.3776, + "epoch": 4.3126684636118595, + "grad_norm": 0.6367536187171936, + "learning_rate": 0.0003417053426875337, + "loss": 3.3856, "step": 40000 }, { - "epoch": 4.305241631686578, - "eval_accuracy": 0.37774751915577576, - "eval_loss": 3.4375598430633545, - "eval_runtime": 180.6158, - "eval_samples_per_second": 99.72, - "eval_steps_per_second": 6.234, + "epoch": 4.3126684636118595, + "eval_accuracy": 0.37727640023421216, + "eval_loss": 3.440788507461548, + "eval_runtime": 187.6321, + "eval_samples_per_second": 95.991, + "eval_steps_per_second": 6.001, "step": 40000 }, { - "epoch": 4.310623183726187, - "grad_norm": 0.6349745392799377, - "learning_rate": 0.00034182092446934597, - "loss": 3.3783, + "epoch": 4.318059299191375, + "grad_norm": 0.6316375732421875, + "learning_rate": 0.0003413815434430653, + "loss": 3.3784, "step": 40050 }, { - "epoch": 4.3160047357657945, - "grad_norm": 0.6404194831848145, - "learning_rate": 0.00034149768343928456, - "loss": 3.3909, + "epoch": 4.32345013477089, + "grad_norm": 0.6225030422210693, + "learning_rate": 0.0003410577441985968, + "loss": 3.406, "step": 40100 }, { - "epoch": 4.321386287805403, - "grad_norm": 0.6430397033691406, - "learning_rate": 0.0003411744424092231, - "loss": 3.3771, + "epoch": 4.328840970350404, + "grad_norm": 0.6359233260154724, + "learning_rate": 0.00034073394495412837, + "loss": 3.4153, "step": 40150 }, { - "epoch": 4.326767839845012, - "grad_norm": 0.660280704498291, - "learning_rate": 0.0003408512013791617, - "loss": 3.3799, + "epoch": 4.334231805929919, + "grad_norm": 0.6805363297462463, + "learning_rate": 0.00034041014570965997, + "loss": 3.383, "step": 40200 }, { - "epoch": 4.332149391884619, - "grad_norm": 0.6253730654716492, - "learning_rate": 0.0003405344251697015, - "loss": 3.3712, + "epoch": 4.339622641509434, + "grad_norm": 0.665928840637207, + "learning_rate": 0.0003400863464651915, + "loss": 3.3884, "step": 40250 }, { - "epoch": 4.337530943924228, - "grad_norm": 0.6870384216308594, - "learning_rate": 0.0003402111841396401, - "loss": 3.381, + "epoch": 4.345013477088949, + "grad_norm": 0.6551325917243958, + "learning_rate": 0.0003397625472207231, + "loss": 3.3945, "step": 40300 }, { - "epoch": 4.342912495963836, - "grad_norm": 0.665998637676239, - "learning_rate": 0.00033988794310957864, - "loss": 3.3787, + "epoch": 4.350404312668464, + "grad_norm": 0.6828498840332031, + "learning_rate": 0.0003394387479762547, + "loss": 3.3882, "step": 40350 }, { - "epoch": 4.348294048003444, - "grad_norm": 0.6889680027961731, - "learning_rate": 0.0003395647020795173, - "loss": 3.3795, + "epoch": 4.355795148247978, + "grad_norm": 0.6560102701187134, + "learning_rate": 0.0003391149487317863, + "loss": 3.4021, "step": 40400 }, { - "epoch": 4.3536756000430525, - "grad_norm": 0.7430781722068787, - "learning_rate": 0.0003392414610494559, - "loss": 3.3779, + "epoch": 4.361185983827493, + "grad_norm": 0.6644808650016785, + "learning_rate": 0.00033879114948731783, + "loss": 3.3724, "step": 40450 }, { - "epoch": 4.359057152082661, - "grad_norm": 0.629690945148468, - "learning_rate": 0.0003389182200193944, - "loss": 3.3751, + "epoch": 4.366576819407008, + "grad_norm": 0.6699093580245972, + "learning_rate": 0.00033846735024284944, + "loss": 3.3788, "step": 40500 }, { - "epoch": 4.364438704122269, - "grad_norm": 0.6800459027290344, - "learning_rate": 0.000338594978989333, - "loss": 3.3752, + "epoch": 4.3719676549865225, + "grad_norm": 0.677771270275116, + "learning_rate": 0.000338143550998381, + "loss": 3.4007, "step": 40550 }, { - "epoch": 4.369820256161877, - "grad_norm": 0.6480296850204468, - "learning_rate": 0.00033827173795927156, - "loss": 3.3933, + "epoch": 4.377358490566038, + "grad_norm": 0.665482759475708, + "learning_rate": 0.00033782622773880194, + "loss": 3.386, "step": 40600 }, { - "epoch": 4.375201808201485, - "grad_norm": 0.6850379705429077, - "learning_rate": 0.00033794849692921015, - "loss": 3.3732, + "epoch": 4.382749326145553, + "grad_norm": 0.6514596343040466, + "learning_rate": 0.0003375024284943335, + "loss": 3.3772, "step": 40650 }, { - "epoch": 4.3805833602410935, - "grad_norm": 0.6677266359329224, - "learning_rate": 0.0003376252558991488, - "loss": 3.3851, + "epoch": 4.388140161725067, + "grad_norm": 0.608086347579956, + "learning_rate": 0.0003371786292498651, + "loss": 3.4051, "step": 40700 }, { - "epoch": 4.385964912280702, - "grad_norm": 0.6740007996559143, - "learning_rate": 0.00033730201486908734, - "loss": 3.3816, + "epoch": 4.393530997304582, + "grad_norm": 0.6772755980491638, + "learning_rate": 0.0003368548300053966, + "loss": 3.4013, "step": 40750 }, { - "epoch": 4.39134646432031, - "grad_norm": 0.6995536684989929, - "learning_rate": 0.00033697877383902594, - "loss": 3.3898, + "epoch": 4.398921832884097, + "grad_norm": 0.6337803602218628, + "learning_rate": 0.0003365310307609282, + "loss": 3.3755, "step": 40800 }, { - "epoch": 4.396728016359918, - "grad_norm": 0.6835097074508667, - "learning_rate": 0.00033665553280896453, - "loss": 3.3867, + "epoch": 4.404312668463612, + "grad_norm": 0.6308083534240723, + "learning_rate": 0.00033620723151645975, + "loss": 3.3956, "step": 40850 }, { - "epoch": 4.402109568399527, - "grad_norm": 0.6530295014381409, - "learning_rate": 0.00033633229177890307, - "loss": 3.3737, + "epoch": 4.409703504043127, + "grad_norm": 0.6387740969657898, + "learning_rate": 0.0003358834322719913, + "loss": 3.3838, "step": 40900 }, { - "epoch": 4.4074911204391345, - "grad_norm": 0.6370556354522705, - "learning_rate": 0.00033600905074884167, - "loss": 3.3841, + "epoch": 4.415094339622642, + "grad_norm": 0.6903748512268066, + "learning_rate": 0.0003355596330275229, + "loss": 3.3916, "step": 40950 }, { - "epoch": 4.412872672478743, - "grad_norm": 0.6709283590316772, - "learning_rate": 0.0003356858097187803, - "loss": 3.3871, + "epoch": 4.420485175202156, + "grad_norm": 0.6282992959022522, + "learning_rate": 0.00033523583378305445, + "loss": 3.4027, "step": 41000 }, { - "epoch": 4.412872672478743, - "eval_accuracy": 0.37784802307700416, - "eval_loss": 3.4330832958221436, - "eval_runtime": 180.4989, - "eval_samples_per_second": 99.785, - "eval_steps_per_second": 6.238, + "epoch": 4.420485175202156, + "eval_accuracy": 0.3776030108149825, + "eval_loss": 3.4350337982177734, + "eval_runtime": 188.6899, + "eval_samples_per_second": 95.453, + "eval_steps_per_second": 5.967, "step": 41000 }, { - "epoch": 4.418254224518351, - "grad_norm": 0.6717859506607056, - "learning_rate": 0.00033536256868871886, - "loss": 3.3859, + "epoch": 4.425876010781671, + "grad_norm": 0.636599063873291, + "learning_rate": 0.00033491203453858606, + "loss": 3.4016, "step": 41050 }, { - "epoch": 4.423635776557959, - "grad_norm": 0.6793646216392517, - "learning_rate": 0.00033503932765865745, - "loss": 3.3893, + "epoch": 4.431266846361186, + "grad_norm": 0.6283525824546814, + "learning_rate": 0.0003345882352941176, + "loss": 3.3849, "step": 41100 }, { - "epoch": 4.429017328597568, - "grad_norm": 0.6531360149383545, - "learning_rate": 0.000334716086628596, - "loss": 3.4012, + "epoch": 4.436657681940701, + "grad_norm": 0.6235990524291992, + "learning_rate": 0.0003342644360496492, + "loss": 3.3655, "step": 41150 }, { - "epoch": 4.4343988806371755, - "grad_norm": 0.655354917049408, - "learning_rate": 0.0003343928455985346, - "loss": 3.3705, + "epoch": 4.442048517520216, + "grad_norm": 0.6474363803863525, + "learning_rate": 0.00033394063680518076, + "loss": 3.3996, "step": 41200 }, { - "epoch": 4.439780432676784, - "grad_norm": 0.6870447993278503, - "learning_rate": 0.00033406960456847324, - "loss": 3.3836, + "epoch": 4.44743935309973, + "grad_norm": 0.7136996984481812, + "learning_rate": 0.00033361683756071237, + "loss": 3.3883, "step": 41250 }, { - "epoch": 4.445161984716393, - "grad_norm": 0.6646298170089722, - "learning_rate": 0.0003337463635384118, - "loss": 3.3703, + "epoch": 4.452830188679245, + "grad_norm": 0.6439403295516968, + "learning_rate": 0.0003332930383162439, + "loss": 3.3851, "step": 41300 }, { - "epoch": 4.450543536756, - "grad_norm": 0.7067282795906067, - "learning_rate": 0.00033342312250835037, - "loss": 3.3779, + "epoch": 4.45822102425876, + "grad_norm": 0.6484495997428894, + "learning_rate": 0.0003329692390717754, + "loss": 3.3777, "step": 41350 }, { - "epoch": 4.455925088795609, - "grad_norm": 0.6601843237876892, - "learning_rate": 0.00033309988147828896, - "loss": 3.3896, + "epoch": 4.463611859838275, + "grad_norm": 0.6435943245887756, + "learning_rate": 0.0003326454398273071, + "loss": 3.3884, "step": 41400 }, { - "epoch": 4.461306640835216, - "grad_norm": 0.6824533343315125, - "learning_rate": 0.0003327766404482275, - "loss": 3.3798, + "epoch": 4.46900269541779, + "grad_norm": 0.7229622006416321, + "learning_rate": 0.00033232164058283857, + "loss": 3.3741, "step": 41450 }, { - "epoch": 4.466688192874825, - "grad_norm": 0.6678142547607422, - "learning_rate": 0.0003324533994181661, - "loss": 3.385, + "epoch": 4.474393530997305, + "grad_norm": 0.615696132183075, + "learning_rate": 0.0003319978413383702, + "loss": 3.3795, "step": 41500 }, { - "epoch": 4.4720697449144335, - "grad_norm": 0.6664413213729858, - "learning_rate": 0.00033213015838810475, - "loss": 3.4065, + "epoch": 4.479784366576819, + "grad_norm": 0.682884931564331, + "learning_rate": 0.0003316805180787911, + "loss": 3.4081, "step": 41550 }, { - "epoch": 4.477451296954041, - "grad_norm": 0.6379727721214294, - "learning_rate": 0.0003318069173580433, - "loss": 3.3982, + "epoch": 4.485175202156334, + "grad_norm": 0.6571813225746155, + "learning_rate": 0.0003313567188343227, + "loss": 3.3991, "step": 41600 }, { - "epoch": 4.48283284899365, - "grad_norm": 0.7035315036773682, - "learning_rate": 0.0003314836763279819, - "loss": 3.3971, + "epoch": 4.490566037735849, + "grad_norm": 0.6233645081520081, + "learning_rate": 0.00033103291958985423, + "loss": 3.4019, "step": 41650 }, { - "epoch": 4.488214401033258, - "grad_norm": 0.6721235513687134, - "learning_rate": 0.0003311604352979204, - "loss": 3.3991, + "epoch": 4.495956873315364, + "grad_norm": 0.6672859787940979, + "learning_rate": 0.00033070912034538583, + "loss": 3.4047, "step": 41700 }, { - "epoch": 4.493595953072866, - "grad_norm": 0.6236030459403992, - "learning_rate": 0.000330837194267859, - "loss": 3.3777, + "epoch": 4.501347708894879, + "grad_norm": 0.7237746119499207, + "learning_rate": 0.0003303853211009174, + "loss": 3.3885, "step": 41750 }, { - "epoch": 4.4989775051124745, - "grad_norm": 0.6594476103782654, - "learning_rate": 0.0003305139532377976, - "loss": 3.392, + "epoch": 4.506738544474393, + "grad_norm": 0.6526361107826233, + "learning_rate": 0.000330061521856449, + "loss": 3.4006, "step": 41800 }, { - "epoch": 4.504359057152083, - "grad_norm": 0.6631010174751282, - "learning_rate": 0.0003301907122077362, - "loss": 3.3953, + "epoch": 4.512129380053908, + "grad_norm": 0.6548643708229065, + "learning_rate": 0.00032973772261198054, + "loss": 3.3766, "step": 41850 }, { - "epoch": 4.509740609191691, - "grad_norm": 0.6775970458984375, - "learning_rate": 0.0003298674711776748, - "loss": 3.3798, + "epoch": 4.517520215633423, + "grad_norm": 0.62809157371521, + "learning_rate": 0.00032941392336751214, + "loss": 3.4004, "step": 41900 }, { - "epoch": 4.515122161231299, - "grad_norm": 0.6755871772766113, - "learning_rate": 0.0003295442301476134, - "loss": 3.376, + "epoch": 4.5229110512129385, + "grad_norm": 0.7281858921051025, + "learning_rate": 0.0003290901241230437, + "loss": 3.4013, "step": 41950 }, { - "epoch": 4.520503713270907, - "grad_norm": 0.660190761089325, - "learning_rate": 0.00032922098911755194, - "loss": 3.3824, + "epoch": 4.528301886792453, + "grad_norm": 0.6370959281921387, + "learning_rate": 0.0003287663248785753, + "loss": 3.4027, "step": 42000 }, { - "epoch": 4.520503713270907, - "eval_accuracy": 0.37855622259977895, - "eval_loss": 3.42791485786438, - "eval_runtime": 180.7657, - "eval_samples_per_second": 99.637, - "eval_steps_per_second": 6.229, + "epoch": 4.528301886792453, + "eval_accuracy": 0.3783059949991422, + "eval_loss": 3.430859088897705, + "eval_runtime": 184.5354, + "eval_samples_per_second": 97.602, + "eval_steps_per_second": 6.102, "step": 42000 }, { - "epoch": 4.5258852653105155, - "grad_norm": 0.6533682942390442, - "learning_rate": 0.00032889774808749053, - "loss": 3.3913, + "epoch": 4.533692722371968, + "grad_norm": 0.6753497123718262, + "learning_rate": 0.00032844252563410685, + "loss": 3.3765, "step": 42050 }, { - "epoch": 4.531266817350124, - "grad_norm": 0.6714156270027161, - "learning_rate": 0.0003285745070574292, - "loss": 3.3873, + "epoch": 4.539083557951482, + "grad_norm": 0.6597613096237183, + "learning_rate": 0.00032811872638963834, + "loss": 3.3869, "step": 42100 }, { - "epoch": 4.536648369389732, - "grad_norm": 0.6406194567680359, - "learning_rate": 0.0003282512660273677, - "loss": 3.3876, + "epoch": 4.544474393530997, + "grad_norm": 0.6980563402175903, + "learning_rate": 0.00032779492714516995, + "loss": 3.3917, "step": 42150 }, { - "epoch": 4.54202992142934, - "grad_norm": 0.672727108001709, - "learning_rate": 0.0003279280249973063, - "loss": 3.3871, + "epoch": 4.549865229110512, + "grad_norm": 0.6639866232872009, + "learning_rate": 0.0003274711279007015, + "loss": 3.4029, "step": 42200 }, { - "epoch": 4.547411473468949, - "grad_norm": 0.6561397314071655, - "learning_rate": 0.00032760478396724486, - "loss": 3.3847, + "epoch": 4.555256064690027, + "grad_norm": 0.6949554681777954, + "learning_rate": 0.0003271473286562331, + "loss": 3.3972, "step": 42250 }, { - "epoch": 4.5527930255085565, - "grad_norm": 0.7014278173446655, - "learning_rate": 0.0003272880077577847, - "loss": 3.4011, + "epoch": 4.560646900269542, + "grad_norm": 0.6414914131164551, + "learning_rate": 0.00032682352941176466, + "loss": 3.3903, "step": 42300 }, { - "epoch": 4.558174577548165, - "grad_norm": 0.6329679489135742, - "learning_rate": 0.00032696476672772326, - "loss": 3.3918, + "epoch": 4.566037735849057, + "grad_norm": 0.6200968623161316, + "learning_rate": 0.00032649973016729626, + "loss": 3.4121, "step": 42350 }, { - "epoch": 4.563556129587774, - "grad_norm": 0.6573483943939209, - "learning_rate": 0.00032664152569766185, - "loss": 3.4036, + "epoch": 4.571428571428571, + "grad_norm": 0.6413687467575073, + "learning_rate": 0.0003261759309228278, + "loss": 3.3893, "step": 42400 }, { - "epoch": 4.568937681627381, - "grad_norm": 0.6461696624755859, - "learning_rate": 0.0003263182846676004, - "loss": 3.3831, + "epoch": 4.576819407008086, + "grad_norm": 0.6251882910728455, + "learning_rate": 0.0003258521316783594, + "loss": 3.4008, "step": 42450 }, { - "epoch": 4.57431923366699, - "grad_norm": 0.6436484456062317, - "learning_rate": 0.00032599504363753904, - "loss": 3.4091, + "epoch": 4.5822102425876015, + "grad_norm": 0.7066326141357422, + "learning_rate": 0.00032552833243389097, + "loss": 3.3838, "step": 42500 }, { - "epoch": 4.579700785706597, - "grad_norm": 0.683167040348053, - "learning_rate": 0.00032567180260747764, - "loss": 3.3936, + "epoch": 4.587601078167116, + "grad_norm": 0.7187201380729675, + "learning_rate": 0.0003252045331894225, + "loss": 3.3958, "step": 42550 }, { - "epoch": 4.585082337746206, - "grad_norm": 0.6897136569023132, - "learning_rate": 0.0003253485615774162, - "loss": 3.3839, + "epoch": 4.592991913746631, + "grad_norm": 0.7195764183998108, + "learning_rate": 0.0003248807339449541, + "loss": 3.3767, "step": 42600 }, { - "epoch": 4.5904638897858145, - "grad_norm": 0.7163823246955872, - "learning_rate": 0.00032502532054735477, - "loss": 3.3933, + "epoch": 4.598382749326145, + "grad_norm": 0.6276376247406006, + "learning_rate": 0.00032455693470048567, + "loss": 3.4029, "step": 42650 }, { - "epoch": 4.595845441825422, - "grad_norm": 0.6382591724395752, - "learning_rate": 0.00032470207951729337, - "loss": 3.3879, + "epoch": 4.60377358490566, + "grad_norm": 0.628013551235199, + "learning_rate": 0.0003242331354560173, + "loss": 3.3959, "step": 42700 }, { - "epoch": 4.601226993865031, - "grad_norm": 0.6253649592399597, - "learning_rate": 0.0003243788384872319, - "loss": 3.3937, + "epoch": 4.609164420485175, + "grad_norm": 0.6547573804855347, + "learning_rate": 0.0003239093362115488, + "loss": 3.387, "step": 42750 }, { - "epoch": 4.606608545904638, - "grad_norm": 0.6715789437294006, - "learning_rate": 0.00032405559745717056, - "loss": 3.3917, + "epoch": 4.6145552560646905, + "grad_norm": 0.6338829398155212, + "learning_rate": 0.00032358553696708043, + "loss": 3.389, "step": 42800 }, { - "epoch": 4.611990097944247, - "grad_norm": 0.7270500659942627, - "learning_rate": 0.00032373235642710915, - "loss": 3.3983, + "epoch": 4.619946091644205, + "grad_norm": 0.6385948061943054, + "learning_rate": 0.00032326173772261193, + "loss": 3.3959, "step": 42850 }, { - "epoch": 4.6173716499838555, - "grad_norm": 0.6829226613044739, - "learning_rate": 0.0003234091153970477, - "loss": 3.4023, + "epoch": 4.62533692722372, + "grad_norm": 0.7105038166046143, + "learning_rate": 0.00032293793847814353, + "loss": 3.4078, "step": 42900 }, { - "epoch": 4.622753202023463, - "grad_norm": 0.7018654942512512, - "learning_rate": 0.0003230858743669863, - "loss": 3.3912, + "epoch": 4.630727762803234, + "grad_norm": 0.644783616065979, + "learning_rate": 0.0003226141392336751, + "loss": 3.417, "step": 42950 }, { - "epoch": 4.628134754063072, - "grad_norm": 0.6380162239074707, - "learning_rate": 0.0003227626333369248, - "loss": 3.3943, + "epoch": 4.636118598382749, + "grad_norm": 0.6668684482574463, + "learning_rate": 0.00032229033998920663, + "loss": 3.3913, "step": 43000 }, { - "epoch": 4.628134754063072, - "eval_accuracy": 0.37904103178520715, - "eval_loss": 3.421799421310425, - "eval_runtime": 180.2956, - "eval_samples_per_second": 99.897, - "eval_steps_per_second": 6.245, + "epoch": 4.636118598382749, + "eval_accuracy": 0.3783892231112081, + "eval_loss": 3.4260003566741943, + "eval_runtime": 184.7846, + "eval_samples_per_second": 97.47, + "eval_steps_per_second": 6.094, "step": 43000 }, { - "epoch": 4.63351630610268, - "grad_norm": 0.6637802124023438, - "learning_rate": 0.0003224393923068635, - "loss": 3.3806, + "epoch": 4.6415094339622645, + "grad_norm": 0.6323847770690918, + "learning_rate": 0.00032196654074473824, + "loss": 3.3957, "step": 43050 }, { - "epoch": 4.638897858142288, - "grad_norm": 0.6968106031417847, - "learning_rate": 0.00032211615127680207, - "loss": 3.3677, + "epoch": 4.646900269541779, + "grad_norm": 0.6507082581520081, + "learning_rate": 0.0003216427415002698, + "loss": 3.3885, "step": 43100 }, { - "epoch": 4.6442794101818965, - "grad_norm": 0.6918413043022156, - "learning_rate": 0.0003217929102467406, - "loss": 3.3913, + "epoch": 4.652291105121294, + "grad_norm": 0.6631735563278198, + "learning_rate": 0.0003213189422558014, + "loss": 3.3978, "step": 43150 }, { - "epoch": 4.649660962221505, - "grad_norm": 0.637776255607605, - "learning_rate": 0.0003214696692166792, - "loss": 3.3909, + "epoch": 4.657681940700809, + "grad_norm": 0.7026782631874084, + "learning_rate": 0.00032099514301133295, + "loss": 3.3962, "step": 43200 }, { - "epoch": 4.655042514261113, - "grad_norm": 0.666828453540802, - "learning_rate": 0.0003211464281866178, - "loss": 3.3732, + "epoch": 4.663072776280323, + "grad_norm": 0.6135462522506714, + "learning_rate": 0.00032067134376686455, + "loss": 3.3845, "step": 43250 }, { - "epoch": 4.660424066300721, - "grad_norm": 0.6439411640167236, - "learning_rate": 0.00032082318715655634, - "loss": 3.3776, + "epoch": 4.668463611859838, + "grad_norm": 0.6497666239738464, + "learning_rate": 0.0003203475445223961, + "loss": 3.3851, "step": 43300 }, { - "epoch": 4.665805618340329, - "grad_norm": 0.6902055740356445, - "learning_rate": 0.000320499946126495, - "loss": 3.3948, + "epoch": 4.6738544474393535, + "grad_norm": 0.6379224061965942, + "learning_rate": 0.0003200237452779277, + "loss": 3.3819, "step": 43350 }, { - "epoch": 4.6711871703799375, - "grad_norm": 0.7745203375816345, - "learning_rate": 0.0003201767050964336, - "loss": 3.3795, + "epoch": 4.679245283018868, + "grad_norm": 0.6403704881668091, + "learning_rate": 0.00031969994603345926, + "loss": 3.4085, "step": 43400 }, { - "epoch": 4.676568722419546, - "grad_norm": 0.671711266040802, - "learning_rate": 0.0003198534640663721, - "loss": 3.3798, + "epoch": 4.684636118598383, + "grad_norm": 0.6845625638961792, + "learning_rate": 0.00031937614678899075, + "loss": 3.4026, "step": 43450 }, { - "epoch": 4.681950274459154, - "grad_norm": 0.6714461445808411, - "learning_rate": 0.00031953668785691193, - "loss": 3.3852, + "epoch": 4.690026954177897, + "grad_norm": 0.6924819350242615, + "learning_rate": 0.00031905234754452236, + "loss": 3.4098, "step": 43500 }, { - "epoch": 4.687331826498762, - "grad_norm": 0.6817046999931335, - "learning_rate": 0.0003192134468268505, - "loss": 3.3894, + "epoch": 4.695417789757412, + "grad_norm": 0.6407290101051331, + "learning_rate": 0.0003187285483000539, + "loss": 3.3758, "step": 43550 }, { - "epoch": 4.692713378538371, - "grad_norm": 0.6941414475440979, - "learning_rate": 0.0003188902057967891, - "loss": 3.4095, + "epoch": 4.7008086253369274, + "grad_norm": 0.6609850525856018, + "learning_rate": 0.0003184047490555855, + "loss": 3.3884, "step": 43600 }, { - "epoch": 4.6980949305779784, - "grad_norm": 0.7007185220718384, - "learning_rate": 0.00031856696476672766, - "loss": 3.3747, + "epoch": 4.706199460916442, + "grad_norm": 0.6443186402320862, + "learning_rate": 0.00031808094981111706, + "loss": 3.4078, "step": 43650 }, { - "epoch": 4.703476482617587, - "grad_norm": 0.7390422821044922, - "learning_rate": 0.0003182437237366663, - "loss": 3.3933, + "epoch": 4.711590296495957, + "grad_norm": 0.6441913843154907, + "learning_rate": 0.00031775715056664867, + "loss": 3.3966, "step": 43700 }, { - "epoch": 4.7088580346571955, - "grad_norm": 0.6899697780609131, - "learning_rate": 0.0003179204827066049, - "loss": 3.3846, + "epoch": 4.716981132075472, + "grad_norm": 0.6135480999946594, + "learning_rate": 0.0003174333513221802, + "loss": 3.3789, "step": 43750 }, { - "epoch": 4.714239586696803, - "grad_norm": 0.6393377184867859, - "learning_rate": 0.00031759724167654344, - "loss": 3.3877, + "epoch": 4.722371967654986, + "grad_norm": 0.6646343469619751, + "learning_rate": 0.00031710955207771177, + "loss": 3.3885, "step": 43800 }, { - "epoch": 4.719621138736412, - "grad_norm": 0.6704504489898682, - "learning_rate": 0.00031727400064648204, - "loss": 3.3905, + "epoch": 4.727762803234501, + "grad_norm": 0.6587278842926025, + "learning_rate": 0.0003167857528332434, + "loss": 3.4045, "step": 43850 }, { - "epoch": 4.725002690776019, - "grad_norm": 0.6651193499565125, - "learning_rate": 0.0003169507596164206, - "loss": 3.3773, + "epoch": 4.7331536388140165, + "grad_norm": 0.6713588833808899, + "learning_rate": 0.0003164619535887749, + "loss": 3.3826, "step": 43900 }, { - "epoch": 4.730384242815628, - "grad_norm": 0.6523035168647766, - "learning_rate": 0.0003166275185863592, - "loss": 3.4, + "epoch": 4.738544474393531, + "grad_norm": 0.6753560900688171, + "learning_rate": 0.00031613815434430653, + "loss": 3.3928, "step": 43950 }, { - "epoch": 4.7357657948552365, - "grad_norm": 0.6419060230255127, - "learning_rate": 0.0003163042775562978, - "loss": 3.3728, + "epoch": 4.743935309973046, + "grad_norm": 0.6197715401649475, + "learning_rate": 0.0003158143550998381, + "loss": 3.3965, "step": 44000 }, { - "epoch": 4.7357657948552365, - "eval_accuracy": 0.37933276478898903, - "eval_loss": 3.417551040649414, - "eval_runtime": 180.8101, - "eval_samples_per_second": 99.613, - "eval_steps_per_second": 6.228, + "epoch": 4.743935309973046, + "eval_accuracy": 0.3794112121739911, + "eval_loss": 3.420830726623535, + "eval_runtime": 185.017, + "eval_samples_per_second": 97.348, + "eval_steps_per_second": 6.086, "step": 44000 }, { - "epoch": 4.741147346894844, - "grad_norm": 0.6831957697868347, - "learning_rate": 0.00031598103652623636, - "loss": 3.3938, + "epoch": 4.74932614555256, + "grad_norm": 0.6172311305999756, + "learning_rate": 0.0003154905558553697, + "loss": 3.4111, "step": 44050 }, { - "epoch": 4.746528898934453, - "grad_norm": 0.6325632929801941, - "learning_rate": 0.00031565779549617496, - "loss": 3.4015, + "epoch": 4.754716981132075, + "grad_norm": 0.7061458230018616, + "learning_rate": 0.0003151667566109012, + "loss": 3.3835, "step": 44100 }, { - "epoch": 4.751910450974061, - "grad_norm": 0.687362790107727, - "learning_rate": 0.00031533455446611355, - "loss": 3.3946, + "epoch": 4.7601078167115904, + "grad_norm": 0.6917078495025635, + "learning_rate": 0.00031484295736643284, + "loss": 3.3896, "step": 44150 }, { - "epoch": 4.757292003013669, - "grad_norm": 0.6923133134841919, - "learning_rate": 0.0003150113134360521, - "loss": 3.3823, + "epoch": 4.765498652291106, + "grad_norm": 0.6253639459609985, + "learning_rate": 0.00031451915812196434, + "loss": 3.3832, "step": 44200 }, { - "epoch": 4.7626735550532775, - "grad_norm": 0.709951639175415, - "learning_rate": 0.0003146880724059907, - "loss": 3.3947, + "epoch": 4.77088948787062, + "grad_norm": 0.6521839499473572, + "learning_rate": 0.0003142018348623853, + "loss": 3.4013, "step": 44250 }, { - "epoch": 4.768055107092886, - "grad_norm": 0.6787405610084534, - "learning_rate": 0.00031436483137592934, - "loss": 3.3734, + "epoch": 4.776280323450135, + "grad_norm": 0.6730448603630066, + "learning_rate": 0.00031387803561791684, + "loss": 3.3867, "step": 44300 }, { - "epoch": 4.773436659132494, - "grad_norm": 0.7011154890060425, - "learning_rate": 0.0003140415903458679, - "loss": 3.3932, + "epoch": 4.781671159029649, + "grad_norm": 0.6574965119361877, + "learning_rate": 0.00031355423637344844, + "loss": 3.3834, "step": 44350 }, { - "epoch": 4.778818211172102, - "grad_norm": 0.7104355096817017, - "learning_rate": 0.00031371834931580647, - "loss": 3.3876, + "epoch": 4.787061994609164, + "grad_norm": 0.6981621980667114, + "learning_rate": 0.00031323043712898, + "loss": 3.3903, "step": 44400 }, { - "epoch": 4.78419976321171, - "grad_norm": 0.65087890625, - "learning_rate": 0.000313395108285745, - "loss": 3.3826, + "epoch": 4.7924528301886795, + "grad_norm": 0.6450235843658447, + "learning_rate": 0.0003129066378845116, + "loss": 3.4046, "step": 44450 }, { - "epoch": 4.7895813152513185, - "grad_norm": 0.7193938493728638, - "learning_rate": 0.0003130718672556836, - "loss": 3.3899, + "epoch": 4.797843665768194, + "grad_norm": 0.6787946820259094, + "learning_rate": 0.00031258283864004315, + "loss": 3.3993, "step": 44500 }, { - "epoch": 4.794962867290927, - "grad_norm": 0.6938375234603882, - "learning_rate": 0.0003127486262256222, - "loss": 3.367, + "epoch": 4.803234501347709, + "grad_norm": 0.6712046265602112, + "learning_rate": 0.0003122590393955747, + "loss": 3.3929, "step": 44550 }, { - "epoch": 4.800344419330535, - "grad_norm": 0.6607935428619385, - "learning_rate": 0.0003124253851955608, - "loss": 3.3828, + "epoch": 4.808625336927224, + "grad_norm": 0.6471379399299622, + "learning_rate": 0.0003119352401511063, + "loss": 3.4003, "step": 44600 }, { - "epoch": 4.805725971370143, - "grad_norm": 0.7184892296791077, - "learning_rate": 0.0003121021441654994, - "loss": 3.4025, + "epoch": 4.814016172506738, + "grad_norm": 0.665787935256958, + "learning_rate": 0.00031161144090663786, + "loss": 3.3974, "step": 44650 }, { - "epoch": 4.811107523409751, - "grad_norm": 0.6583475470542908, - "learning_rate": 0.000311778903135438, - "loss": 3.4065, + "epoch": 4.819407008086253, + "grad_norm": 0.6562718152999878, + "learning_rate": 0.00031128764166216946, + "loss": 3.3865, "step": 44700 }, { - "epoch": 4.8164890754493594, - "grad_norm": 0.746714174747467, - "learning_rate": 0.0003114556621053765, - "loss": 3.4003, + "epoch": 4.824797843665769, + "grad_norm": 0.665611743927002, + "learning_rate": 0.00031096384241770096, + "loss": 3.403, "step": 44750 }, { - "epoch": 4.821870627488968, - "grad_norm": 0.667801558971405, - "learning_rate": 0.0003111324210753151, - "loss": 3.3777, + "epoch": 4.830188679245283, + "grad_norm": 0.6378157734870911, + "learning_rate": 0.0003106400431732326, + "loss": 3.4124, "step": 44800 }, { - "epoch": 4.827252179528576, - "grad_norm": 0.6681658029556274, - "learning_rate": 0.00031080918004525377, - "loss": 3.3846, + "epoch": 4.835579514824798, + "grad_norm": 0.6545462608337402, + "learning_rate": 0.0003103162439287641, + "loss": 3.4008, "step": 44850 }, { - "epoch": 4.832633731568184, - "grad_norm": 0.6859903931617737, - "learning_rate": 0.0003104859390151923, - "loss": 3.3843, + "epoch": 4.840970350404312, + "grad_norm": 0.6839264035224915, + "learning_rate": 0.0003099924446842957, + "loss": 3.4062, "step": 44900 }, { - "epoch": 4.838015283607793, - "grad_norm": 0.6311453580856323, - "learning_rate": 0.0003101626979851309, - "loss": 3.3754, + "epoch": 4.846361185983827, + "grad_norm": 0.6348554491996765, + "learning_rate": 0.00030966864543982727, + "loss": 3.4098, "step": 44950 }, { - "epoch": 4.8433968356474, - "grad_norm": 0.6764549016952515, - "learning_rate": 0.00030983945695506945, - "loss": 3.3897, + "epoch": 4.8517520215633425, + "grad_norm": 0.6661449670791626, + "learning_rate": 0.0003093448461953588, + "loss": 3.4006, "step": 45000 }, { - "epoch": 4.8433968356474, - "eval_accuracy": 0.3799441545887211, - "eval_loss": 3.4145073890686035, - "eval_runtime": 180.8088, - "eval_samples_per_second": 99.614, - "eval_steps_per_second": 6.228, + "epoch": 4.8517520215633425, + "eval_accuracy": 0.3796366669162061, + "eval_loss": 3.4154839515686035, + "eval_runtime": 184.8867, + "eval_samples_per_second": 97.416, + "eval_steps_per_second": 6.09, "step": 45000 }, { - "epoch": 4.848778387687009, - "grad_norm": 0.7071652412414551, - "learning_rate": 0.00030951621592500804, - "loss": 3.3874, + "epoch": 4.857142857142857, + "grad_norm": 0.7101733684539795, + "learning_rate": 0.0003090210469508904, + "loss": 3.3889, "step": 45050 }, { - "epoch": 4.8541599397266175, - "grad_norm": 0.6975886821746826, - "learning_rate": 0.00030919297489494663, - "loss": 3.381, + "epoch": 4.862533692722372, + "grad_norm": 0.6474313139915466, + "learning_rate": 0.00030869724770642197, + "loss": 3.394, "step": 45100 }, { - "epoch": 4.859541491766225, - "grad_norm": 0.6888333559036255, - "learning_rate": 0.00030886973386488523, - "loss": 3.3775, + "epoch": 4.867924528301887, + "grad_norm": 0.6456648707389832, + "learning_rate": 0.0003083734484619536, + "loss": 3.3986, "step": 45150 }, { - "epoch": 4.864923043805834, - "grad_norm": 0.6672408580780029, - "learning_rate": 0.0003085464928348238, - "loss": 3.4037, + "epoch": 4.873315363881401, + "grad_norm": 0.7081680297851562, + "learning_rate": 0.00030804964921748513, + "loss": 3.3801, "step": 45200 }, { - "epoch": 4.870304595845441, - "grad_norm": 0.710537850856781, - "learning_rate": 0.0003082232518047624, - "loss": 3.391, + "epoch": 4.878706199460916, + "grad_norm": 0.652946412563324, + "learning_rate": 0.00030772584997301673, + "loss": 3.3928, "step": 45250 }, { - "epoch": 4.87568614788505, - "grad_norm": 0.6636947393417358, - "learning_rate": 0.00030790001077470096, - "loss": 3.3911, + "epoch": 4.884097035040432, + "grad_norm": 0.6638479828834534, + "learning_rate": 0.0003074020507285483, + "loss": 3.3753, "step": 45300 }, { - "epoch": 4.8810676999246585, - "grad_norm": 0.6961594223976135, - "learning_rate": 0.00030757676974463955, - "loss": 3.3829, + "epoch": 4.889487870619946, + "grad_norm": 0.6589642763137817, + "learning_rate": 0.0003070782514840799, + "loss": 3.3901, "step": 45350 }, { - "epoch": 4.886449251964266, - "grad_norm": 0.8008875846862793, - "learning_rate": 0.0003072535287145781, - "loss": 3.3904, + "epoch": 4.894878706199461, + "grad_norm": 0.6825507283210754, + "learning_rate": 0.00030675445223961144, + "loss": 3.3957, "step": 45400 }, { - "epoch": 4.891830804003875, - "grad_norm": 0.6139016151428223, - "learning_rate": 0.00030693028768451674, - "loss": 3.3811, + "epoch": 4.900269541778976, + "grad_norm": 0.6379856467247009, + "learning_rate": 0.00030643065299514294, + "loss": 3.3954, "step": 45450 }, { - "epoch": 4.897212356043483, - "grad_norm": 0.6992993354797363, - "learning_rate": 0.00030660704665445534, - "loss": 3.3865, + "epoch": 4.90566037735849, + "grad_norm": 0.6712074279785156, + "learning_rate": 0.00030610685375067454, + "loss": 3.3868, "step": 45500 }, { - "epoch": 4.902593908083091, - "grad_norm": 0.6530194282531738, - "learning_rate": 0.0003062838056243939, - "loss": 3.3902, + "epoch": 4.9110512129380055, + "grad_norm": 0.6530936360359192, + "learning_rate": 0.0003057830545062061, + "loss": 3.3737, "step": 45550 }, { - "epoch": 4.9079754601226995, - "grad_norm": 0.6725596189498901, - "learning_rate": 0.00030596056459433247, - "loss": 3.3907, + "epoch": 4.916442048517521, + "grad_norm": 0.6042382717132568, + "learning_rate": 0.0003054592552617377, + "loss": 3.3815, "step": 45600 }, { - "epoch": 4.913357012162308, - "grad_norm": 0.670534610748291, - "learning_rate": 0.00030563732356427107, - "loss": 3.3845, + "epoch": 4.921832884097035, + "grad_norm": 0.6475586891174316, + "learning_rate": 0.00030513545601726925, + "loss": 3.3859, "step": 45650 }, { - "epoch": 4.918738564201916, - "grad_norm": 0.6930390000343323, - "learning_rate": 0.00030531408253420966, - "loss": 3.3919, + "epoch": 4.92722371967655, + "grad_norm": 0.624848484992981, + "learning_rate": 0.00030481165677280085, + "loss": 3.3982, "step": 45700 }, { - "epoch": 4.924120116241524, - "grad_norm": 0.7007985711097717, - "learning_rate": 0.00030499084150414826, - "loss": 3.375, + "epoch": 4.932614555256064, + "grad_norm": 0.6341684460639954, + "learning_rate": 0.0003044878575283324, + "loss": 3.3939, "step": 45750 }, { - "epoch": 4.929501668281132, - "grad_norm": 0.6509608030319214, - "learning_rate": 0.00030466760047408685, - "loss": 3.3936, + "epoch": 4.938005390835579, + "grad_norm": 0.6945657134056091, + "learning_rate": 0.000304164058283864, + "loss": 3.3828, "step": 45800 }, { - "epoch": 4.9348832203207404, - "grad_norm": 0.7178775668144226, - "learning_rate": 0.0003043443594440254, - "loss": 3.3867, + "epoch": 4.943396226415095, + "grad_norm": 0.658644437789917, + "learning_rate": 0.00030384025903939556, + "loss": 3.3859, "step": 45850 }, { - "epoch": 4.940264772360349, - "grad_norm": 0.7314358949661255, - "learning_rate": 0.000304021118413964, - "loss": 3.3853, + "epoch": 4.948787061994609, + "grad_norm": 0.6409682035446167, + "learning_rate": 0.0003035164597949271, + "loss": 3.3902, "step": 45900 }, { - "epoch": 4.945646324399957, - "grad_norm": 0.7417964339256287, - "learning_rate": 0.0003036978773839025, - "loss": 3.3955, + "epoch": 4.954177897574124, + "grad_norm": 0.6514752507209778, + "learning_rate": 0.0003031926605504587, + "loss": 3.3919, "step": 45950 }, { - "epoch": 4.951027876439565, - "grad_norm": 0.6962642669677734, - "learning_rate": 0.0003033746363538412, - "loss": 3.3879, + "epoch": 4.959568733153639, + "grad_norm": 0.6511775851249695, + "learning_rate": 0.00030286886130599026, + "loss": 3.3816, "step": 46000 }, { - "epoch": 4.951027876439565, - "eval_accuracy": 0.3806199755509272, - "eval_loss": 3.4076614379882812, - "eval_runtime": 180.463, - "eval_samples_per_second": 99.804, - "eval_steps_per_second": 6.24, + "epoch": 4.959568733153639, + "eval_accuracy": 0.380506433283161, + "eval_loss": 3.412381649017334, + "eval_runtime": 184.9517, + "eval_samples_per_second": 97.382, + "eval_steps_per_second": 6.088, "step": 46000 }, { - "epoch": 4.956409428479174, - "grad_norm": 0.6814647912979126, - "learning_rate": 0.00030305139532377977, - "loss": 3.3941, + "epoch": 4.964959568733153, + "grad_norm": 0.6196075081825256, + "learning_rate": 0.00030254506206152187, + "loss": 3.3994, "step": 46050 }, { - "epoch": 4.961790980518781, - "grad_norm": 0.6750015020370483, - "learning_rate": 0.0003027281542937183, - "loss": 3.3918, + "epoch": 4.9703504043126685, + "grad_norm": 0.6395175457000732, + "learning_rate": 0.00030222126281705336, + "loss": 3.382, "step": 46100 }, { - "epoch": 4.96717253255839, - "grad_norm": 0.6892281174659729, - "learning_rate": 0.0003024049132636569, - "loss": 3.3945, + "epoch": 4.975741239892184, + "grad_norm": 0.7017073631286621, + "learning_rate": 0.000301897463572585, + "loss": 3.3978, "step": 46150 }, { - "epoch": 4.9725540845979985, - "grad_norm": 0.6996626257896423, - "learning_rate": 0.0003020816722335955, - "loss": 3.3739, + "epoch": 4.981132075471698, + "grad_norm": 0.7031179070472717, + "learning_rate": 0.0003015736643281165, + "loss": 3.3877, "step": 46200 }, { - "epoch": 4.977935636637606, - "grad_norm": 0.6532090902328491, - "learning_rate": 0.00030175843120353404, - "loss": 3.4021, + "epoch": 4.986522911051213, + "grad_norm": 0.628025233745575, + "learning_rate": 0.00030124986508364807, + "loss": 3.3984, "step": 46250 }, { - "epoch": 4.983317188677215, - "grad_norm": 0.6986241936683655, - "learning_rate": 0.0003014351901734727, - "loss": 3.3769, + "epoch": 4.991913746630727, + "grad_norm": 0.6933519840240479, + "learning_rate": 0.0003009260658391797, + "loss": 3.4003, "step": 46300 }, { - "epoch": 4.988698740716822, - "grad_norm": 0.6778721213340759, - "learning_rate": 0.0003011119491434113, - "loss": 3.3806, + "epoch": 4.997304582210242, + "grad_norm": 0.6547604203224182, + "learning_rate": 0.0003006022665947112, + "loss": 3.3829, "step": 46350 }, { - "epoch": 4.994080292756431, - "grad_norm": 0.7227446436882019, - "learning_rate": 0.0003007887081133498, - "loss": 3.386, + "epoch": 5.002695417789758, + "grad_norm": 0.6624903678894043, + "learning_rate": 0.00030027846735024283, + "loss": 3.3527, "step": 46400 }, { - "epoch": 4.9994618447960395, - "grad_norm": 0.6771180629730225, - "learning_rate": 0.0003004654670832884, - "loss": 3.3777, + "epoch": 5.008086253369272, + "grad_norm": 0.6124040484428406, + "learning_rate": 0.00029995466810577443, + "loss": 3.2893, "step": 46450 }, { - "epoch": 5.004843396835647, - "grad_norm": 0.6935259699821472, - "learning_rate": 0.00030014222605322696, - "loss": 3.3098, + "epoch": 5.013477088948787, + "grad_norm": 0.6685596704483032, + "learning_rate": 0.00029963086886130593, + "loss": 3.2916, "step": 46500 }, { - "epoch": 5.010224948875256, - "grad_norm": 0.7119801640510559, - "learning_rate": 0.00029981898502316555, - "loss": 3.3013, + "epoch": 5.018867924528302, + "grad_norm": 0.6791301369667053, + "learning_rate": 0.00029930706961683754, + "loss": 3.2882, "step": 46550 }, { - "epoch": 5.015606500914864, - "grad_norm": 0.7470939755439758, - "learning_rate": 0.00029949574399310415, - "loss": 3.2987, + "epoch": 5.024258760107816, + "grad_norm": 0.6711662411689758, + "learning_rate": 0.0002989832703723691, + "loss": 3.2997, "step": 46600 }, { - "epoch": 5.020988052954472, - "grad_norm": 0.7172041535377502, - "learning_rate": 0.00029917250296304274, - "loss": 3.2962, + "epoch": 5.0296495956873315, + "grad_norm": 0.6637666821479797, + "learning_rate": 0.0002986594711279007, + "loss": 3.296, "step": 46650 }, { - "epoch": 5.0263696049940805, - "grad_norm": 0.708893895149231, - "learning_rate": 0.00029884926193298134, - "loss": 3.2944, + "epoch": 5.035040431266847, + "grad_norm": 0.6611721515655518, + "learning_rate": 0.00029833567188343224, + "loss": 3.3144, "step": 46700 }, { - "epoch": 5.031751157033688, - "grad_norm": 0.6856112480163574, - "learning_rate": 0.0002985260209029199, - "loss": 3.2975, + "epoch": 5.040431266846361, + "grad_norm": 0.6716077923774719, + "learning_rate": 0.00029801187263896385, + "loss": 3.3006, "step": 46750 }, { - "epoch": 5.037132709073297, - "grad_norm": 0.6789200305938721, - "learning_rate": 0.00029820277987285853, - "loss": 3.2961, + "epoch": 5.045822102425876, + "grad_norm": 0.6382834315299988, + "learning_rate": 0.0002976880733944954, + "loss": 3.3187, "step": 46800 }, { - "epoch": 5.042514261112905, - "grad_norm": 0.6745328307151794, - "learning_rate": 0.00029787953884279707, - "loss": 3.294, + "epoch": 5.051212938005391, + "grad_norm": 0.6736823916435242, + "learning_rate": 0.00029736427415002695, + "loss": 3.2997, "step": 46850 }, { - "epoch": 5.047895813152513, - "grad_norm": 0.695334792137146, - "learning_rate": 0.00029756276263333693, - "loss": 3.3119, + "epoch": 5.056603773584905, + "grad_norm": 0.6748251914978027, + "learning_rate": 0.0002970404749055585, + "loss": 3.3102, "step": 46900 }, { - "epoch": 5.0532773651921215, - "grad_norm": 0.6939895153045654, - "learning_rate": 0.00029723952160327547, - "loss": 3.2977, + "epoch": 5.061994609164421, + "grad_norm": 0.6604644060134888, + "learning_rate": 0.0002967166756610901, + "loss": 3.3199, "step": 46950 }, { - "epoch": 5.05865891723173, - "grad_norm": 0.7092863917350769, - "learning_rate": 0.00029691628057321406, - "loss": 3.3093, + "epoch": 5.067385444743936, + "grad_norm": 0.6938402652740479, + "learning_rate": 0.00029639287641662165, + "loss": 3.308, "step": 47000 }, { - "epoch": 5.05865891723173, - "eval_accuracy": 0.3807579647184516, - "eval_loss": 3.4131321907043457, - "eval_runtime": 180.5823, - "eval_samples_per_second": 99.738, - "eval_steps_per_second": 6.235, + "epoch": 5.067385444743936, + "eval_accuracy": 0.38030357834161144, + "eval_loss": 3.4177451133728027, + "eval_runtime": 184.7954, + "eval_samples_per_second": 97.465, + "eval_steps_per_second": 6.093, "step": 47000 }, { - "epoch": 5.064040469271338, - "grad_norm": 0.7017537355422974, - "learning_rate": 0.00029659303954315266, - "loss": 3.3113, + "epoch": 5.07277628032345, + "grad_norm": 0.6623622179031372, + "learning_rate": 0.00029606907717215326, + "loss": 3.3111, "step": 47050 }, { - "epoch": 5.069422021310946, - "grad_norm": 0.7066820859909058, - "learning_rate": 0.00029626979851309125, - "loss": 3.3025, + "epoch": 5.078167115902965, + "grad_norm": 0.6709842085838318, + "learning_rate": 0.0002957452779276848, + "loss": 3.3137, "step": 47100 }, { - "epoch": 5.074803573350554, - "grad_norm": 0.7007570862770081, - "learning_rate": 0.00029594655748302985, - "loss": 3.3077, + "epoch": 5.083557951482479, + "grad_norm": 0.7319429516792297, + "learning_rate": 0.00029542147868321636, + "loss": 3.3017, "step": 47150 }, { - "epoch": 5.080185125390162, - "grad_norm": 0.7130218744277954, - "learning_rate": 0.0002956233164529684, - "loss": 3.2919, + "epoch": 5.0889487870619945, + "grad_norm": 0.6315300464630127, + "learning_rate": 0.00029509767943874796, + "loss": 3.3109, "step": 47200 }, { - "epoch": 5.085566677429771, - "grad_norm": 0.7107332944869995, - "learning_rate": 0.000295300075422907, - "loss": 3.3093, + "epoch": 5.09433962264151, + "grad_norm": 0.6517070531845093, + "learning_rate": 0.0002947738801942795, + "loss": 3.3021, "step": 47250 }, { - "epoch": 5.090948229469379, - "grad_norm": 0.6778996586799622, - "learning_rate": 0.0002949768343928456, - "loss": 3.3089, + "epoch": 5.099730458221024, + "grad_norm": 0.6561506986618042, + "learning_rate": 0.0002944500809498111, + "loss": 3.3161, "step": 47300 }, { - "epoch": 5.096329781508987, - "grad_norm": 0.7302454113960266, - "learning_rate": 0.00029465359336278417, - "loss": 3.321, + "epoch": 5.105121293800539, + "grad_norm": 0.6593621969223022, + "learning_rate": 0.00029412628170534267, + "loss": 3.3206, "step": 47350 }, { - "epoch": 5.101711333548596, - "grad_norm": 0.7293109893798828, - "learning_rate": 0.0002943303523327227, - "loss": 3.3307, + "epoch": 5.110512129380054, + "grad_norm": 0.6380136013031006, + "learning_rate": 0.0002938024824608742, + "loss": 3.3299, "step": 47400 }, { - "epoch": 5.107092885588203, - "grad_norm": 0.7125155329704285, - "learning_rate": 0.00029400711130266136, - "loss": 3.3088, + "epoch": 5.115902964959568, + "grad_norm": 0.703005313873291, + "learning_rate": 0.00029347868321640577, + "loss": 3.3193, "step": 47450 }, { - "epoch": 5.112474437627812, - "grad_norm": 0.7028940916061401, - "learning_rate": 0.0002936838702725999, - "loss": 3.3038, + "epoch": 5.121293800539084, + "grad_norm": 0.6386775374412537, + "learning_rate": 0.0002931548839719374, + "loss": 3.3235, "step": 47500 }, { - "epoch": 5.1178559896674205, - "grad_norm": 0.6821138858795166, - "learning_rate": 0.0002933606292425385, - "loss": 3.3197, + "epoch": 5.126684636118599, + "grad_norm": 0.7004210352897644, + "learning_rate": 0.0002928310847274689, + "loss": 3.32, "step": 47550 }, { - "epoch": 5.123237541707028, - "grad_norm": 0.6831609010696411, - "learning_rate": 0.0002930373882124771, - "loss": 3.3003, + "epoch": 5.132075471698113, + "grad_norm": 0.7443287968635559, + "learning_rate": 0.00029250728548300053, + "loss": 3.3218, "step": 47600 }, { - "epoch": 5.128619093746637, - "grad_norm": 0.6890237331390381, - "learning_rate": 0.0002927141471824157, - "loss": 3.3091, + "epoch": 5.137466307277628, + "grad_norm": 0.6982861161231995, + "learning_rate": 0.0002921834862385321, + "loss": 3.3123, "step": 47650 }, { - "epoch": 5.134000645786244, - "grad_norm": 0.7032915949821472, - "learning_rate": 0.0002923909061523542, - "loss": 3.3137, + "epoch": 5.142857142857143, + "grad_norm": 0.7046205997467041, + "learning_rate": 0.0002918596869940637, + "loss": 3.3081, "step": 47700 }, { - "epoch": 5.139382197825853, - "grad_norm": 0.6753126382827759, - "learning_rate": 0.0002920676651222928, - "loss": 3.3034, + "epoch": 5.1482479784366575, + "grad_norm": 0.6518100500106812, + "learning_rate": 0.0002915358877495952, + "loss": 3.3084, "step": 47750 }, { - "epoch": 5.1447637498654615, - "grad_norm": 0.7659389972686768, - "learning_rate": 0.0002917444240922314, - "loss": 3.3154, + "epoch": 5.153638814016173, + "grad_norm": 0.6555050015449524, + "learning_rate": 0.0002912120885051268, + "loss": 3.3268, "step": 47800 }, { - "epoch": 5.150145301905069, - "grad_norm": 0.688945472240448, - "learning_rate": 0.00029142118306216996, - "loss": 3.3108, + "epoch": 5.159029649595688, + "grad_norm": 0.6799789667129517, + "learning_rate": 0.00029088828926065834, + "loss": 3.3014, "step": 47850 }, { - "epoch": 5.155526853944678, - "grad_norm": 0.6884469389915466, - "learning_rate": 0.0002910979420321086, - "loss": 3.3065, + "epoch": 5.164420485175202, + "grad_norm": 0.6917016506195068, + "learning_rate": 0.00029056449001618994, + "loss": 3.3357, "step": 47900 }, { - "epoch": 5.160908405984286, - "grad_norm": 0.6957643032073975, - "learning_rate": 0.00029077470100204715, - "loss": 3.31, + "epoch": 5.169811320754717, + "grad_norm": 0.6385092735290527, + "learning_rate": 0.0002902406907717215, + "loss": 3.3204, "step": 47950 }, { - "epoch": 5.166289958023894, - "grad_norm": 0.7388163805007935, - "learning_rate": 0.00029045145997198574, - "loss": 3.2902, + "epoch": 5.175202156334231, + "grad_norm": 0.6348479986190796, + "learning_rate": 0.0002899168915272531, + "loss": 3.3136, "step": 48000 }, { - "epoch": 5.166289958023894, - "eval_accuracy": 0.3809007346130398, - "eval_loss": 3.4100382328033447, - "eval_runtime": 180.4717, - "eval_samples_per_second": 99.8, - "eval_steps_per_second": 6.239, + "epoch": 5.175202156334231, + "eval_accuracy": 0.3807514455451827, + "eval_loss": 3.412294387817383, + "eval_runtime": 184.954, + "eval_samples_per_second": 97.381, + "eval_steps_per_second": 6.088, "step": 48000 }, { - "epoch": 5.1716715100635025, - "grad_norm": 0.7176282405853271, - "learning_rate": 0.00029012821894192433, - "loss": 3.315, + "epoch": 5.180592991913747, + "grad_norm": 0.7083115577697754, + "learning_rate": 0.00028959309228278465, + "loss": 3.3247, "step": 48050 }, { - "epoch": 5.17705306210311, - "grad_norm": 0.6990656852722168, - "learning_rate": 0.00028980497791186293, - "loss": 3.3085, + "epoch": 5.185983827493262, + "grad_norm": 0.7016012668609619, + "learning_rate": 0.00028926929303831625, + "loss": 3.314, "step": 48100 }, { - "epoch": 5.182434614142719, - "grad_norm": 0.7440829277038574, - "learning_rate": 0.0002894817368818015, - "loss": 3.3026, + "epoch": 5.191374663072776, + "grad_norm": 0.6694549322128296, + "learning_rate": 0.0002889454937938478, + "loss": 3.3267, "step": 48150 }, { - "epoch": 5.187816166182327, - "grad_norm": 0.7810512781143188, - "learning_rate": 0.00028915849585174006, - "loss": 3.3067, + "epoch": 5.196765498652291, + "grad_norm": 0.6478196978569031, + "learning_rate": 0.00028862169454937935, + "loss": 3.3285, "step": 48200 }, { - "epoch": 5.193197718221935, - "grad_norm": 0.7514436841011047, - "learning_rate": 0.00028883525482167866, - "loss": 3.3155, + "epoch": 5.202156334231806, + "grad_norm": 0.7021739482879639, + "learning_rate": 0.0002883043712898003, + "loss": 3.3231, "step": 48250 }, { - "epoch": 5.198579270261543, - "grad_norm": 0.7587404251098633, - "learning_rate": 0.00028851201379161725, - "loss": 3.326, + "epoch": 5.2075471698113205, + "grad_norm": 0.6745837330818176, + "learning_rate": 0.00028798057204533186, + "loss": 3.3216, "step": 48300 }, { - "epoch": 5.203960822301152, - "grad_norm": 0.728809654712677, - "learning_rate": 0.00028818877276155585, - "loss": 3.3046, + "epoch": 5.212938005390836, + "grad_norm": 0.6576200723648071, + "learning_rate": 0.00028765677280086346, + "loss": 3.3242, "step": 48350 }, { - "epoch": 5.20934237434076, - "grad_norm": 0.6925315260887146, - "learning_rate": 0.0002878655317314944, - "loss": 3.3156, + "epoch": 5.218328840970351, + "grad_norm": 0.6276342272758484, + "learning_rate": 0.000287332973556395, + "loss": 3.3337, "step": 48400 }, { - "epoch": 5.214723926380368, - "grad_norm": 0.761146605014801, - "learning_rate": 0.00028754229070143304, - "loss": 3.3213, + "epoch": 5.223719676549865, + "grad_norm": 0.7644480466842651, + "learning_rate": 0.0002870091743119266, + "loss": 3.3271, "step": 48450 }, { - "epoch": 5.220105478419977, - "grad_norm": 0.7084577679634094, - "learning_rate": 0.0002872190496713716, - "loss": 3.3169, + "epoch": 5.22911051212938, + "grad_norm": 0.6814048886299133, + "learning_rate": 0.00028668537506745817, + "loss": 3.3137, "step": 48500 }, { - "epoch": 5.225487030459584, - "grad_norm": 0.6968151926994324, - "learning_rate": 0.0002868958086413102, - "loss": 3.307, + "epoch": 5.234501347708895, + "grad_norm": 0.6768558025360107, + "learning_rate": 0.0002863615758229897, + "loss": 3.3236, "step": 48550 }, { - "epoch": 5.230868582499193, - "grad_norm": 0.6933405995368958, - "learning_rate": 0.00028657256761124877, - "loss": 3.3176, + "epoch": 5.2398921832884096, + "grad_norm": 0.6881909966468811, + "learning_rate": 0.00028603777657852127, + "loss": 3.3468, "step": 48600 }, { - "epoch": 5.236250134538801, - "grad_norm": 0.7592040300369263, - "learning_rate": 0.00028624932658118736, - "loss": 3.3159, + "epoch": 5.245283018867925, + "grad_norm": 0.649848461151123, + "learning_rate": 0.0002857139773340529, + "loss": 3.3271, "step": 48650 }, { - "epoch": 5.241631686578409, - "grad_norm": 0.6854032874107361, - "learning_rate": 0.0002859260855511259, - "loss": 3.3186, + "epoch": 5.250673854447439, + "grad_norm": 0.6775457262992859, + "learning_rate": 0.0002853966540744738, + "loss": 3.3305, "step": 48700 }, { - "epoch": 5.247013238618018, - "grad_norm": 0.7083324193954468, - "learning_rate": 0.0002856028445210645, - "loss": 3.316, + "epoch": 5.256064690026954, + "grad_norm": 0.6716384887695312, + "learning_rate": 0.0002850728548300054, + "loss": 3.3351, "step": 48750 }, { - "epoch": 5.252394790657625, - "grad_norm": 0.7232556939125061, - "learning_rate": 0.0002852796034910031, - "loss": 3.3182, + "epoch": 5.261455525606469, + "grad_norm": 0.6535606384277344, + "learning_rate": 0.000284749055585537, + "loss": 3.3204, "step": 48800 }, { - "epoch": 5.257776342697234, - "grad_norm": 0.6980643272399902, - "learning_rate": 0.0002849563624609417, - "loss": 3.318, + "epoch": 5.2668463611859835, + "grad_norm": 0.6380178928375244, + "learning_rate": 0.0002844252563410685, + "loss": 3.317, "step": 48850 }, { - "epoch": 5.2631578947368425, - "grad_norm": 0.6944864988327026, - "learning_rate": 0.0002846331214308803, - "loss": 3.3329, + "epoch": 5.272237196765499, + "grad_norm": 0.7210144996643066, + "learning_rate": 0.0002841014570966001, + "loss": 3.3309, "step": 48900 }, { - "epoch": 5.26853944677645, - "grad_norm": 0.749697744846344, - "learning_rate": 0.0002843098804008188, - "loss": 3.34, + "epoch": 5.277628032345014, + "grad_norm": 0.6670833826065063, + "learning_rate": 0.00028377765785213163, + "loss": 3.3307, "step": 48950 }, { - "epoch": 5.273920998816059, - "grad_norm": 0.706950843334198, - "learning_rate": 0.00028398663937075747, - "loss": 3.3097, + "epoch": 5.283018867924528, + "grad_norm": 0.7309926748275757, + "learning_rate": 0.00028345385860766324, + "loss": 3.3356, "step": 49000 }, { - "epoch": 5.273920998816059, - "eval_accuracy": 0.3812864523647812, - "eval_loss": 3.405521869659424, - "eval_runtime": 181.1668, - "eval_samples_per_second": 99.417, - "eval_steps_per_second": 6.215, + "epoch": 5.283018867924528, + "eval_accuracy": 0.38106197549855647, + "eval_loss": 3.4104466438293457, + "eval_runtime": 185.3603, + "eval_samples_per_second": 97.168, + "eval_steps_per_second": 6.075, "step": 49000 }, { - "epoch": 5.279302550855666, - "grad_norm": 0.7286232709884644, - "learning_rate": 0.000283663398340696, - "loss": 3.3282, + "epoch": 5.288409703504043, + "grad_norm": 0.718997061252594, + "learning_rate": 0.0002831300593631948, + "loss": 3.3343, "step": 49050 }, { - "epoch": 5.284684102895275, - "grad_norm": 0.7539182305335999, - "learning_rate": 0.0002833401573106346, - "loss": 3.3236, + "epoch": 5.293800539083558, + "grad_norm": 0.723936140537262, + "learning_rate": 0.0002828062601187264, + "loss": 3.3476, "step": 49100 }, { - "epoch": 5.2900656549348835, - "grad_norm": 0.6944497227668762, - "learning_rate": 0.0002830169162805732, - "loss": 3.333, + "epoch": 5.2991913746630726, + "grad_norm": 0.7135783433914185, + "learning_rate": 0.00028248246087425794, + "loss": 3.3404, "step": 49150 }, { - "epoch": 5.295447206974491, - "grad_norm": 0.684943675994873, - "learning_rate": 0.0002826936752505118, - "loss": 3.3295, + "epoch": 5.304582210242588, + "grad_norm": 0.7199476361274719, + "learning_rate": 0.0002821586616297895, + "loss": 3.3478, "step": 49200 }, { - "epoch": 5.3008287590141, - "grad_norm": 0.762440025806427, - "learning_rate": 0.00028237043422045034, - "loss": 3.3233, + "epoch": 5.309973045822103, + "grad_norm": 0.6716611981391907, + "learning_rate": 0.0002818348623853211, + "loss": 3.3271, "step": 49250 }, { - "epoch": 5.306210311053708, - "grad_norm": 0.7036476135253906, - "learning_rate": 0.00028205365801099014, - "loss": 3.3298, + "epoch": 5.315363881401617, + "grad_norm": 0.6387839317321777, + "learning_rate": 0.00028151106314085265, + "loss": 3.3323, "step": 49300 }, { - "epoch": 5.311591863093316, - "grad_norm": 0.7563344836235046, - "learning_rate": 0.00028173041698092874, - "loss": 3.3374, + "epoch": 5.320754716981132, + "grad_norm": 0.6785222291946411, + "learning_rate": 0.0002811872638963842, + "loss": 3.3317, "step": 49350 }, { - "epoch": 5.316973415132924, - "grad_norm": 0.7168316841125488, - "learning_rate": 0.00028140717595086733, - "loss": 3.3358, + "epoch": 5.3261455525606465, + "grad_norm": 0.6540645956993103, + "learning_rate": 0.0002808634646519158, + "loss": 3.3245, "step": 49400 }, { - "epoch": 5.322354967172533, - "grad_norm": 0.7628238201141357, - "learning_rate": 0.0002810839349208059, - "loss": 3.3199, + "epoch": 5.331536388140162, + "grad_norm": 0.641674280166626, + "learning_rate": 0.00028053966540744736, + "loss": 3.3081, "step": 49450 }, { - "epoch": 5.327736519212141, - "grad_norm": 0.7322455644607544, - "learning_rate": 0.00028076069389074447, - "loss": 3.327, + "epoch": 5.336927223719677, + "grad_norm": 0.6906507611274719, + "learning_rate": 0.0002802158661629789, + "loss": 3.3377, "step": 49500 }, { - "epoch": 5.333118071251749, - "grad_norm": 0.6967852115631104, - "learning_rate": 0.0002804374528606831, - "loss": 3.3419, + "epoch": 5.342318059299191, + "grad_norm": 0.660807192325592, + "learning_rate": 0.0002798920669185105, + "loss": 3.335, "step": 49550 }, { - "epoch": 5.338499623291357, - "grad_norm": 0.7441530227661133, - "learning_rate": 0.00028011421183062166, - "loss": 3.3185, + "epoch": 5.347708894878706, + "grad_norm": 0.6709676384925842, + "learning_rate": 0.00027956826767404206, + "loss": 3.3269, "step": 49600 }, { - "epoch": 5.343881175330965, - "grad_norm": 0.7443904280662537, - "learning_rate": 0.00027979097080056025, - "loss": 3.3199, + "epoch": 5.353099730458221, + "grad_norm": 0.6719909310340881, + "learning_rate": 0.00027924446842957367, + "loss": 3.3308, "step": 49650 }, { - "epoch": 5.349262727370574, - "grad_norm": 0.7386976480484009, - "learning_rate": 0.00027946772977049885, - "loss": 3.318, + "epoch": 5.3584905660377355, + "grad_norm": 0.6856489777565002, + "learning_rate": 0.0002789206691851052, + "loss": 3.3159, "step": 49700 }, { - "epoch": 5.354644279410182, - "grad_norm": 0.7359290719032288, - "learning_rate": 0.00027914448874043744, - "loss": 3.3406, + "epoch": 5.363881401617251, + "grad_norm": 0.6632635593414307, + "learning_rate": 0.00027859686994063677, + "loss": 3.3536, "step": 49750 }, { - "epoch": 5.36002583144979, - "grad_norm": 0.730819046497345, - "learning_rate": 0.00027882124771037603, - "loss": 3.3092, + "epoch": 5.369272237196766, + "grad_norm": 0.6541867852210999, + "learning_rate": 0.0002782730706961683, + "loss": 3.3294, "step": 49800 }, { - "epoch": 5.365407383489399, - "grad_norm": 0.7585179209709167, - "learning_rate": 0.0002784980066803146, - "loss": 3.326, + "epoch": 5.37466307277628, + "grad_norm": 0.717612087726593, + "learning_rate": 0.0002779492714516999, + "loss": 3.3391, "step": 49850 }, { - "epoch": 5.370788935529006, - "grad_norm": 0.7162631750106812, - "learning_rate": 0.00027817476565025317, - "loss": 3.3382, + "epoch": 5.380053908355795, + "grad_norm": 0.7243929505348206, + "learning_rate": 0.00027762547220723147, + "loss": 3.3327, "step": 49900 }, { - "epoch": 5.376170487568615, - "grad_norm": 0.7122279405593872, - "learning_rate": 0.00027785152462019176, - "loss": 3.3342, + "epoch": 5.38544474393531, + "grad_norm": 0.6813120245933533, + "learning_rate": 0.0002773016729627631, + "loss": 3.3458, "step": 49950 }, { - "epoch": 5.3815520396082235, - "grad_norm": 0.7170236706733704, - "learning_rate": 0.00027752828359013036, - "loss": 3.3242, + "epoch": 5.390835579514825, + "grad_norm": 0.6332939863204956, + "learning_rate": 0.00027697787371829463, + "loss": 3.3386, "step": 50000 }, { - "epoch": 5.3815520396082235, - "eval_accuracy": 0.38197922317748634, - "eval_loss": 3.400938034057617, - "eval_runtime": 180.5748, - "eval_samples_per_second": 99.743, - "eval_steps_per_second": 6.236, + "epoch": 5.390835579514825, + "eval_accuracy": 0.38111423753759527, + "eval_loss": 3.4083776473999023, + "eval_runtime": 185.12, + "eval_samples_per_second": 97.294, + "eval_steps_per_second": 6.083, "step": 50000 }, { - "epoch": 5.386933591647831, - "grad_norm": 0.6691011786460876, - "learning_rate": 0.0002772050425600689, - "loss": 3.3201, + "epoch": 5.39622641509434, + "grad_norm": 0.6733863353729248, + "learning_rate": 0.00027665407447382623, + "loss": 3.3213, "step": 50050 }, { - "epoch": 5.39231514368744, - "grad_norm": 0.7036200761795044, - "learning_rate": 0.00027688180153000755, - "loss": 3.3223, + "epoch": 5.401617250673855, + "grad_norm": 0.740658164024353, + "learning_rate": 0.0002763302752293578, + "loss": 3.3451, "step": 50100 }, { - "epoch": 5.397696695727047, - "grad_norm": 0.6968643069267273, - "learning_rate": 0.0002765585604999461, - "loss": 3.34, + "epoch": 5.407008086253369, + "grad_norm": 0.6690346002578735, + "learning_rate": 0.00027600647598488933, + "loss": 3.3262, "step": 50150 }, { - "epoch": 5.403078247766656, - "grad_norm": 0.7583166360855103, - "learning_rate": 0.0002762353194698847, - "loss": 3.3211, + "epoch": 5.412398921832884, + "grad_norm": 0.6874278783798218, + "learning_rate": 0.0002756826767404209, + "loss": 3.3288, "step": 50200 }, { - "epoch": 5.4084597998062645, - "grad_norm": 0.7078987956047058, - "learning_rate": 0.0002759120784398233, - "loss": 3.3302, + "epoch": 5.4177897574123985, + "grad_norm": 0.7052708268165588, + "learning_rate": 0.0002753588774959525, + "loss": 3.3515, "step": 50250 }, { - "epoch": 5.413841351845872, - "grad_norm": 0.764525294303894, - "learning_rate": 0.00027558883740976187, - "loss": 3.344, + "epoch": 5.423180592991914, + "grad_norm": 0.6739440560340881, + "learning_rate": 0.00027503507825148404, + "loss": 3.3148, "step": 50300 }, { - "epoch": 5.419222903885481, - "grad_norm": 0.7484478950500488, - "learning_rate": 0.0002752655963797004, - "loss": 3.3345, + "epoch": 5.428571428571429, + "grad_norm": 0.690232515335083, + "learning_rate": 0.00027471127900701564, + "loss": 3.3306, "step": 50350 }, { - "epoch": 5.424604455925088, - "grad_norm": 0.7260310053825378, - "learning_rate": 0.000274942355349639, - "loss": 3.3208, + "epoch": 5.433962264150943, + "grad_norm": 0.7004985213279724, + "learning_rate": 0.0002743874797625472, + "loss": 3.346, "step": 50400 }, { - "epoch": 5.429986007964697, - "grad_norm": 0.6879717707633972, - "learning_rate": 0.0002746191143195776, - "loss": 3.3432, + "epoch": 5.439353099730458, + "grad_norm": 0.720639169216156, + "learning_rate": 0.0002740636805180788, + "loss": 3.3301, "step": 50450 }, { - "epoch": 5.435367560004305, - "grad_norm": 0.7341870665550232, - "learning_rate": 0.0002742958732895162, - "loss": 3.3261, + "epoch": 5.444743935309973, + "grad_norm": 0.6907075047492981, + "learning_rate": 0.00027373988127361035, + "loss": 3.3429, "step": 50500 }, { - "epoch": 5.440749112043913, - "grad_norm": 0.70168536901474, - "learning_rate": 0.0002739726322594548, - "loss": 3.3284, + "epoch": 5.450134770889488, + "grad_norm": 0.7512150406837463, + "learning_rate": 0.0002734160820291419, + "loss": 3.3379, "step": 50550 }, { - "epoch": 5.446130664083522, - "grad_norm": 0.745499312877655, - "learning_rate": 0.00027364939122939333, - "loss": 3.3374, + "epoch": 5.455525606469003, + "grad_norm": 0.6673445701599121, + "learning_rate": 0.00027309228278467345, + "loss": 3.3268, "step": 50600 }, { - "epoch": 5.45151221612313, - "grad_norm": 0.7081572413444519, - "learning_rate": 0.0002733261501993319, - "loss": 3.3199, + "epoch": 5.460916442048518, + "grad_norm": 0.6765791773796082, + "learning_rate": 0.00027276848354020506, + "loss": 3.3387, "step": 50650 }, { - "epoch": 5.456893768162738, - "grad_norm": 0.7260119915008545, - "learning_rate": 0.0002730029091692705, - "loss": 3.3515, + "epoch": 5.466307277628032, + "grad_norm": 0.7130911350250244, + "learning_rate": 0.0002724446842957366, + "loss": 3.3543, "step": 50700 }, { - "epoch": 5.462275320202346, - "grad_norm": 0.7278617024421692, - "learning_rate": 0.0002726796681392091, - "loss": 3.331, + "epoch": 5.471698113207547, + "grad_norm": 0.6755728125572205, + "learning_rate": 0.0002721208850512682, + "loss": 3.3594, "step": 50750 }, { - "epoch": 5.467656872241955, - "grad_norm": 0.7048456072807312, - "learning_rate": 0.0002723564271091477, - "loss": 3.3106, + "epoch": 5.4770889487870615, + "grad_norm": 0.7097396850585938, + "learning_rate": 0.00027179708580679976, + "loss": 3.3366, "step": 50800 }, { - "epoch": 5.473038424281563, - "grad_norm": 0.7255328893661499, - "learning_rate": 0.0002720331860790863, - "loss": 3.3247, + "epoch": 5.482479784366577, + "grad_norm": 0.6999021768569946, + "learning_rate": 0.0002714732865623313, + "loss": 3.3389, "step": 50850 }, { - "epoch": 5.478419976321171, - "grad_norm": 0.7401384115219116, - "learning_rate": 0.00027170994504902485, - "loss": 3.3402, + "epoch": 5.487870619946092, + "grad_norm": 0.6593881249427795, + "learning_rate": 0.0002711494873178629, + "loss": 3.3518, "step": 50900 }, { - "epoch": 5.483801528360779, - "grad_norm": 0.786148190498352, - "learning_rate": 0.00027138670401896344, - "loss": 3.3437, + "epoch": 5.493261455525606, + "grad_norm": 0.6644842028617859, + "learning_rate": 0.00027082568807339447, + "loss": 3.3399, "step": 50950 }, { - "epoch": 5.489183080400387, - "grad_norm": 0.7473777532577515, - "learning_rate": 0.00027106346298890204, - "loss": 3.3597, + "epoch": 5.498652291105121, + "grad_norm": 0.6714942455291748, + "learning_rate": 0.000270501888828926, + "loss": 3.3408, "step": 51000 }, { - "epoch": 5.489183080400387, - "eval_accuracy": 0.38263559527277363, - "eval_loss": 3.3964664936065674, - "eval_runtime": 180.3564, - "eval_samples_per_second": 99.863, - "eval_steps_per_second": 6.243, + "epoch": 5.498652291105121, + "eval_accuracy": 0.38187643754561384, + "eval_loss": 3.401001453399658, + "eval_runtime": 184.9621, + "eval_samples_per_second": 97.377, + "eval_steps_per_second": 6.088, "step": 51000 }, { - "epoch": 5.494564632439996, - "grad_norm": 0.7068842649459839, - "learning_rate": 0.00027074022195884063, - "loss": 3.3147, + "epoch": 5.504043126684636, + "grad_norm": 0.6829885840415955, + "learning_rate": 0.0002701780895844576, + "loss": 3.3329, "step": 51050 }, { - "epoch": 5.499946184479604, - "grad_norm": 0.7448179125785828, - "learning_rate": 0.0002704169809287792, - "loss": 3.3439, + "epoch": 5.509433962264151, + "grad_norm": 0.6931278705596924, + "learning_rate": 0.0002698542903399892, + "loss": 3.3251, "step": 51100 }, { - "epoch": 5.505327736519212, - "grad_norm": 0.7172561287879944, - "learning_rate": 0.00027009373989871776, - "loss": 3.3419, + "epoch": 5.514824797843666, + "grad_norm": 0.7629396319389343, + "learning_rate": 0.0002695304910955207, + "loss": 3.3209, "step": 51150 }, { - "epoch": 5.510709288558821, - "grad_norm": 0.7486968040466309, - "learning_rate": 0.00026977049886865636, - "loss": 3.3319, + "epoch": 5.520215633423181, + "grad_norm": 0.6978450417518616, + "learning_rate": 0.00026920669185105233, + "loss": 3.3236, "step": 51200 }, { - "epoch": 5.516090840598428, - "grad_norm": 0.7076429128646851, - "learning_rate": 0.00026944725783859495, - "loss": 3.3312, + "epoch": 5.525606469002695, + "grad_norm": 0.6813058853149414, + "learning_rate": 0.0002688893685914733, + "loss": 3.3521, "step": 51250 }, { - "epoch": 5.521472392638037, - "grad_norm": 0.7071561813354492, - "learning_rate": 0.00026912401680853355, - "loss": 3.3152, + "epoch": 5.53099730458221, + "grad_norm": 0.709402859210968, + "learning_rate": 0.00026856556934700483, + "loss": 3.3436, "step": 51300 }, { - "epoch": 5.5268539446776455, - "grad_norm": 0.7227540612220764, - "learning_rate": 0.0002688007757784721, - "loss": 3.3266, + "epoch": 5.536388140161725, + "grad_norm": 0.745469868183136, + "learning_rate": 0.0002682417701025364, + "loss": 3.3448, "step": 51350 }, { - "epoch": 5.532235496717253, - "grad_norm": 0.7778589129447937, - "learning_rate": 0.00026847753474841074, - "loss": 3.345, + "epoch": 5.54177897574124, + "grad_norm": 0.6713014841079712, + "learning_rate": 0.000267917970858068, + "loss": 3.3228, "step": 51400 }, { - "epoch": 5.537617048756862, - "grad_norm": 0.7239121794700623, - "learning_rate": 0.0002681542937183493, - "loss": 3.3364, + "epoch": 5.547169811320755, + "grad_norm": 0.6785526871681213, + "learning_rate": 0.00026759417161359954, + "loss": 3.3257, "step": 51450 }, { - "epoch": 5.542998600796469, - "grad_norm": 0.7021813988685608, - "learning_rate": 0.0002678310526882879, - "loss": 3.3266, + "epoch": 5.55256064690027, + "grad_norm": 0.7397943735122681, + "learning_rate": 0.0002672703723691311, + "loss": 3.3662, "step": 51500 }, { - "epoch": 5.548380152836078, - "grad_norm": 0.7057538628578186, - "learning_rate": 0.00026750781165822647, - "loss": 3.315, + "epoch": 5.557951482479784, + "grad_norm": 0.6826764941215515, + "learning_rate": 0.0002669465731246627, + "loss": 3.3281, "step": 51550 }, { - "epoch": 5.553761704875686, - "grad_norm": 0.7174096703529358, - "learning_rate": 0.00026718457062816506, - "loss": 3.3249, + "epoch": 5.563342318059299, + "grad_norm": 0.6693432927131653, + "learning_rate": 0.00026662277388019424, + "loss": 3.3424, "step": 51600 }, { - "epoch": 5.559143256915294, - "grad_norm": 0.7189500331878662, - "learning_rate": 0.00026686132959810366, - "loss": 3.3414, + "epoch": 5.568733153638814, + "grad_norm": 0.6833118200302124, + "learning_rate": 0.00026629897463572585, + "loss": 3.3281, "step": 51650 }, { - "epoch": 5.564524808954903, - "grad_norm": 0.7119573354721069, - "learning_rate": 0.0002665380885680422, - "loss": 3.3625, + "epoch": 5.574123989218329, + "grad_norm": 0.6914713382720947, + "learning_rate": 0.0002659751753912574, + "loss": 3.3337, "step": 51700 }, { - "epoch": 5.569906360994511, - "grad_norm": 0.7115148901939392, - "learning_rate": 0.0002662148475379808, - "loss": 3.3203, + "epoch": 5.579514824797844, + "grad_norm": 0.6685629487037659, + "learning_rate": 0.00026565137614678895, + "loss": 3.3468, "step": 51750 }, { - "epoch": 5.575287913034119, - "grad_norm": 0.7708024382591248, - "learning_rate": 0.0002658916065079194, - "loss": 3.3362, + "epoch": 5.584905660377358, + "grad_norm": 0.6789836287498474, + "learning_rate": 0.0002653275769023205, + "loss": 3.3202, "step": 51800 }, { - "epoch": 5.580669465073727, - "grad_norm": 0.7303184866905212, - "learning_rate": 0.000265568365477858, - "loss": 3.334, + "epoch": 5.590296495956873, + "grad_norm": 0.6970906853675842, + "learning_rate": 0.0002650037776578521, + "loss": 3.3166, "step": 51850 }, { - "epoch": 5.586051017113336, - "grad_norm": 0.7622580528259277, - "learning_rate": 0.0002652451244477965, - "loss": 3.32, + "epoch": 5.595687331536388, + "grad_norm": 0.6891434788703918, + "learning_rate": 0.00026467997841338366, + "loss": 3.3685, "step": 51900 }, { - "epoch": 5.591432569152944, - "grad_norm": 0.7121456265449524, - "learning_rate": 0.00026492188341773517, - "loss": 3.316, + "epoch": 5.601078167115903, + "grad_norm": 0.7000083923339844, + "learning_rate": 0.00026435617916891526, + "loss": 3.3327, "step": 51950 }, { - "epoch": 5.596814121192552, - "grad_norm": 0.7205502390861511, - "learning_rate": 0.0002645986423876737, - "loss": 3.3013, + "epoch": 5.606469002695418, + "grad_norm": 0.6774043440818787, + "learning_rate": 0.0002640323799244468, + "loss": 3.3484, "step": 52000 }, { - "epoch": 5.596814121192552, - "eval_accuracy": 0.38279140351389956, - "eval_loss": 3.3938612937927246, - "eval_runtime": 180.5356, - "eval_samples_per_second": 99.764, - "eval_steps_per_second": 6.237, + "epoch": 5.606469002695418, + "eval_accuracy": 0.3823833032672684, + "eval_loss": 3.3957502841949463, + "eval_runtime": 184.6922, + "eval_samples_per_second": 97.519, + "eval_steps_per_second": 6.097, "step": 52000 }, { - "epoch": 5.60219567323216, - "grad_norm": 0.7592937350273132, - "learning_rate": 0.0002642754013576123, - "loss": 3.3312, + "epoch": 5.611859838274933, + "grad_norm": 0.6880715489387512, + "learning_rate": 0.0002637085806799784, + "loss": 3.3226, "step": 52050 }, { - "epoch": 5.607577225271768, - "grad_norm": 0.7347540855407715, - "learning_rate": 0.0002639521603275509, - "loss": 3.3274, + "epoch": 5.617250673854447, + "grad_norm": 0.6606606245040894, + "learning_rate": 0.00026338478143550997, + "loss": 3.3464, "step": 52100 }, { - "epoch": 5.612958777311377, - "grad_norm": 0.717491626739502, - "learning_rate": 0.0002636289192974895, - "loss": 3.3496, + "epoch": 5.622641509433962, + "grad_norm": 0.6744948029518127, + "learning_rate": 0.00026306098219104157, + "loss": 3.3165, "step": 52150 }, { - "epoch": 5.618340329350985, - "grad_norm": 0.7288353443145752, - "learning_rate": 0.00026330567826742804, - "loss": 3.3103, + "epoch": 5.628032345013477, + "grad_norm": 0.7021511793136597, + "learning_rate": 0.00026273718294657307, + "loss": 3.3176, "step": 52200 }, { - "epoch": 5.623721881390593, - "grad_norm": 0.6995804309844971, - "learning_rate": 0.00026298243723736663, - "loss": 3.3326, + "epoch": 5.633423180592992, + "grad_norm": 0.7162783741950989, + "learning_rate": 0.00026241338370210467, + "loss": 3.337, "step": 52250 }, { - "epoch": 5.629103433430201, - "grad_norm": 0.7089316844940186, - "learning_rate": 0.0002626591962073052, - "loss": 3.3313, + "epoch": 5.638814016172507, + "grad_norm": 0.6945198774337769, + "learning_rate": 0.0002620895844576362, + "loss": 3.3495, "step": 52300 }, { - "epoch": 5.634484985469809, - "grad_norm": 0.7350379824638367, - "learning_rate": 0.0002623359551772438, - "loss": 3.3358, + "epoch": 5.644204851752022, + "grad_norm": 0.6781046390533447, + "learning_rate": 0.00026176578521316783, + "loss": 3.3215, "step": 52350 }, { - "epoch": 5.639866537509418, - "grad_norm": 0.7242736220359802, - "learning_rate": 0.0002620127141471824, - "loss": 3.3154, + "epoch": 5.649595687331536, + "grad_norm": 0.6661491394042969, + "learning_rate": 0.0002614419859686994, + "loss": 3.318, "step": 52400 }, { - "epoch": 5.645248089549026, - "grad_norm": 0.7510601282119751, - "learning_rate": 0.00026168947311712095, - "loss": 3.3391, + "epoch": 5.654986522911051, + "grad_norm": 0.8015744090080261, + "learning_rate": 0.000261118186724231, + "loss": 3.327, "step": 52450 }, { - "epoch": 5.650629641588634, - "grad_norm": 0.7219720482826233, - "learning_rate": 0.00026136623208705955, - "loss": 3.3203, + "epoch": 5.660377358490566, + "grad_norm": 0.7487977147102356, + "learning_rate": 0.00026079438747976253, + "loss": 3.3416, "step": 52500 }, { - "epoch": 5.656011193628243, - "grad_norm": 0.7292102575302124, - "learning_rate": 0.00026104299105699814, - "loss": 3.3382, + "epoch": 5.665768194070081, + "grad_norm": 0.674368679523468, + "learning_rate": 0.0002604705882352941, + "loss": 3.3369, "step": 52550 }, { - "epoch": 5.66139274566785, - "grad_norm": 0.7652955651283264, - "learning_rate": 0.00026071975002693674, - "loss": 3.3266, + "epoch": 5.671159029649596, + "grad_norm": 0.750124454498291, + "learning_rate": 0.00026014678899082563, + "loss": 3.3364, "step": 52600 }, { - "epoch": 5.666774297707459, - "grad_norm": 0.7515937685966492, - "learning_rate": 0.00026039650899687533, - "loss": 3.3376, + "epoch": 5.67654986522911, + "grad_norm": 0.6911941170692444, + "learning_rate": 0.00025982298974635724, + "loss": 3.3505, "step": 52650 }, { - "epoch": 5.672155849747067, - "grad_norm": 0.7084229588508606, - "learning_rate": 0.00026007326796681393, - "loss": 3.3164, + "epoch": 5.681940700808625, + "grad_norm": 0.7061330080032349, + "learning_rate": 0.0002594991905018888, + "loss": 3.3523, "step": 52700 }, { - "epoch": 5.677537401786675, - "grad_norm": 0.717211127281189, - "learning_rate": 0.00025975002693675247, - "loss": 3.3411, + "epoch": 5.6873315363881405, + "grad_norm": 0.7390830516815186, + "learning_rate": 0.0002591753912574204, + "loss": 3.3449, "step": 52750 }, { - "epoch": 5.682918953826284, - "grad_norm": 0.7143423557281494, - "learning_rate": 0.00025942678590669106, - "loss": 3.3249, + "epoch": 5.692722371967655, + "grad_norm": 0.7006047368049622, + "learning_rate": 0.00025885159201295195, + "loss": 3.3225, "step": 52800 }, { - "epoch": 5.688300505865891, - "grad_norm": 0.7390002012252808, - "learning_rate": 0.00025910354487662966, - "loss": 3.3317, + "epoch": 5.69811320754717, + "grad_norm": 0.7471202611923218, + "learning_rate": 0.0002585277927684835, + "loss": 3.3285, "step": 52850 }, { - "epoch": 5.6936820579055, - "grad_norm": 0.7596791386604309, - "learning_rate": 0.00025878676866716946, - "loss": 3.3377, + "epoch": 5.703504043126685, + "grad_norm": 0.6955723762512207, + "learning_rate": 0.0002582039935240151, + "loss": 3.3498, "step": 52900 }, { - "epoch": 5.699063609945108, - "grad_norm": 0.7708361744880676, - "learning_rate": 0.00025846352763710806, - "loss": 3.3309, + "epoch": 5.708894878706199, + "grad_norm": 0.6657953262329102, + "learning_rate": 0.00025788019427954665, + "loss": 3.3469, "step": 52950 }, { - "epoch": 5.704445161984716, - "grad_norm": 0.7448739409446716, - "learning_rate": 0.0002581402866070466, - "loss": 3.3461, + "epoch": 5.714285714285714, + "grad_norm": 0.6830325126647949, + "learning_rate": 0.00025755639503507826, + "loss": 3.339, "step": 53000 }, { - "epoch": 5.704445161984716, - "eval_accuracy": 0.382959815490012, - "eval_loss": 3.3894808292388916, - "eval_runtime": 180.4953, - "eval_samples_per_second": 99.787, - "eval_steps_per_second": 6.238, + "epoch": 5.714285714285714, + "eval_accuracy": 0.38255432291268837, + "eval_loss": 3.392019748687744, + "eval_runtime": 185.0125, + "eval_samples_per_second": 97.35, + "eval_steps_per_second": 6.086, "step": 53000 }, { - "epoch": 5.709826714024325, - "grad_norm": 0.7527272701263428, - "learning_rate": 0.00025781704557698525, - "loss": 3.3118, + "epoch": 5.719676549865229, + "grad_norm": 0.6657763123512268, + "learning_rate": 0.0002572325957906098, + "loss": 3.3222, "step": 53050 }, { - "epoch": 5.715208266063933, - "grad_norm": 0.7586866021156311, - "learning_rate": 0.0002574938045469238, - "loss": 3.323, + "epoch": 5.725067385444744, + "grad_norm": 0.7333123683929443, + "learning_rate": 0.00025690879654614136, + "loss": 3.3471, "step": 53100 }, { - "epoch": 5.720589818103541, - "grad_norm": 0.7504985928535461, - "learning_rate": 0.0002571705635168624, - "loss": 3.3293, + "epoch": 5.730458221024259, + "grad_norm": 0.7092816829681396, + "learning_rate": 0.0002565849973016729, + "loss": 3.3468, "step": 53150 }, { - "epoch": 5.725971370143149, - "grad_norm": 0.7618194222450256, - "learning_rate": 0.000256847322486801, - "loss": 3.344, + "epoch": 5.735849056603773, + "grad_norm": 0.7320940494537354, + "learning_rate": 0.0002562611980572045, + "loss": 3.3424, "step": 53200 }, { - "epoch": 5.731352922182758, - "grad_norm": 0.7982940673828125, - "learning_rate": 0.0002565240814567396, - "loss": 3.3382, + "epoch": 5.741239892183288, + "grad_norm": 0.7620314359664917, + "learning_rate": 0.00025593739881273606, + "loss": 3.3349, "step": 53250 }, { - "epoch": 5.736734474222366, - "grad_norm": 0.7086600661277771, - "learning_rate": 0.0002562008404266781, - "loss": 3.3341, + "epoch": 5.7466307277628035, + "grad_norm": 0.6692413091659546, + "learning_rate": 0.00025561359956826767, + "loss": 3.3568, "step": 53300 }, { - "epoch": 5.742116026261974, - "grad_norm": 0.71695476770401, - "learning_rate": 0.0002558775993966167, - "loss": 3.3213, + "epoch": 5.752021563342318, + "grad_norm": 0.6936851739883423, + "learning_rate": 0.0002552898003237992, + "loss": 3.3201, "step": 53350 }, { - "epoch": 5.747497578301582, - "grad_norm": 0.7411425709724426, - "learning_rate": 0.0002555543583665553, - "loss": 3.3474, + "epoch": 5.757412398921833, + "grad_norm": 0.6667423844337463, + "learning_rate": 0.0002549660010793308, + "loss": 3.3396, "step": 53400 }, { - "epoch": 5.75287913034119, - "grad_norm": 0.7171765565872192, - "learning_rate": 0.0002552311173364939, - "loss": 3.3428, + "epoch": 5.762803234501348, + "grad_norm": 0.7328035831451416, + "learning_rate": 0.0002546422018348624, + "loss": 3.329, "step": 53450 }, { - "epoch": 5.758260682380799, - "grad_norm": 0.7351555228233337, - "learning_rate": 0.0002549078763064325, - "loss": 3.3225, + "epoch": 5.768194070080862, + "grad_norm": 0.6744652390480042, + "learning_rate": 0.0002543184025903939, + "loss": 3.3415, "step": 53500 }, { - "epoch": 5.763642234420407, - "grad_norm": 0.7267298102378845, - "learning_rate": 0.00025458463527637103, - "loss": 3.3269, + "epoch": 5.773584905660377, + "grad_norm": 0.6693131327629089, + "learning_rate": 0.0002539946033459255, + "loss": 3.3265, "step": 53550 }, { - "epoch": 5.769023786460015, - "grad_norm": 0.6977094411849976, - "learning_rate": 0.0002542613942463097, - "loss": 3.3547, + "epoch": 5.7789757412398925, + "grad_norm": 0.7442718148231506, + "learning_rate": 0.0002536708041014571, + "loss": 3.322, "step": 53600 }, { - "epoch": 5.774405338499624, - "grad_norm": 0.7931075096130371, - "learning_rate": 0.0002539381532162482, - "loss": 3.3261, + "epoch": 5.784366576819407, + "grad_norm": 0.6674184203147888, + "learning_rate": 0.00025334700485698863, + "loss": 3.3204, "step": 53650 }, { - "epoch": 5.779786890539231, - "grad_norm": 0.7646996378898621, - "learning_rate": 0.0002536149121861868, - "loss": 3.3383, + "epoch": 5.789757412398922, + "grad_norm": 0.712354838848114, + "learning_rate": 0.00025302320561252023, + "loss": 3.3311, "step": 53700 }, { - "epoch": 5.78516844257884, - "grad_norm": 0.7829656004905701, - "learning_rate": 0.0002532916711561254, - "loss": 3.3332, + "epoch": 5.795148247978437, + "grad_norm": 0.710060179233551, + "learning_rate": 0.0002526994063680518, + "loss": 3.3383, "step": 53750 }, { - "epoch": 5.790549994618448, - "grad_norm": 0.7650371789932251, - "learning_rate": 0.000252968430126064, - "loss": 3.3414, + "epoch": 5.800539083557951, + "grad_norm": 0.7189279198646545, + "learning_rate": 0.0002523756071235834, + "loss": 3.3457, "step": 53800 }, { - "epoch": 5.795931546658056, - "grad_norm": 0.7446619272232056, - "learning_rate": 0.00025264518909600255, - "loss": 3.3277, + "epoch": 5.8059299191374665, + "grad_norm": 0.6352289915084839, + "learning_rate": 0.00025205180787911494, + "loss": 3.3283, "step": 53850 }, { - "epoch": 5.801313098697665, - "grad_norm": 0.7219211459159851, - "learning_rate": 0.00025232194806594114, - "loss": 3.3408, + "epoch": 5.811320754716981, + "grad_norm": 0.7231967449188232, + "learning_rate": 0.0002517280086346465, + "loss": 3.3339, "step": 53900 }, { - "epoch": 5.806694650737272, - "grad_norm": 0.7577422857284546, - "learning_rate": 0.00025199870703587974, - "loss": 3.345, + "epoch": 5.816711590296496, + "grad_norm": 0.6729211807250977, + "learning_rate": 0.00025140420939017804, + "loss": 3.3329, "step": 53950 }, { - "epoch": 5.812076202776881, - "grad_norm": 0.7506514191627502, - "learning_rate": 0.00025167546600581833, - "loss": 3.3376, + "epoch": 5.822102425876011, + "grad_norm": 0.6906619071960449, + "learning_rate": 0.00025108041014570965, + "loss": 3.3305, "step": 54000 }, { - "epoch": 5.812076202776881, - "eval_accuracy": 0.3836133626102161, - "eval_loss": 3.3827064037323, - "eval_runtime": 180.6271, - "eval_samples_per_second": 99.714, - "eval_steps_per_second": 6.234, + "epoch": 5.822102425876011, + "eval_accuracy": 0.3831925499757107, + "eval_loss": 3.38727068901062, + "eval_runtime": 184.8974, + "eval_samples_per_second": 97.411, + "eval_steps_per_second": 6.09, "step": 54000 }, { - "epoch": 5.817457754816489, - "grad_norm": 0.7082645297050476, - "learning_rate": 0.0002513522249757569, - "loss": 3.3294, + "epoch": 5.827493261455525, + "grad_norm": 0.706639289855957, + "learning_rate": 0.0002507566109012412, + "loss": 3.3496, "step": 54050 }, { - "epoch": 5.822839306856097, - "grad_norm": 0.8100736141204834, - "learning_rate": 0.00025102898394569547, - "loss": 3.3329, + "epoch": 5.83288409703504, + "grad_norm": 0.7261019945144653, + "learning_rate": 0.0002504328116567728, + "loss": 3.3316, "step": 54100 }, { - "epoch": 5.828220858895706, - "grad_norm": 0.7404306530952454, - "learning_rate": 0.00025070574291563406, - "loss": 3.3335, + "epoch": 5.8382749326145555, + "grad_norm": 0.681576669216156, + "learning_rate": 0.00025010901241230435, + "loss": 3.3525, "step": 54150 }, { - "epoch": 5.833602410935313, - "grad_norm": 0.7143645286560059, - "learning_rate": 0.00025038250188557265, - "loss": 3.3362, + "epoch": 5.84366576819407, + "grad_norm": 0.6800199151039124, + "learning_rate": 0.0002497852131678359, + "loss": 3.3567, "step": 54200 }, { - "epoch": 5.838983962974922, - "grad_norm": 0.7716860175132751, - "learning_rate": 0.00025005926085551125, - "loss": 3.3455, + "epoch": 5.849056603773585, + "grad_norm": 0.6690753698348999, + "learning_rate": 0.0002494614139233675, + "loss": 3.3331, "step": 54250 }, { - "epoch": 5.84436551501453, - "grad_norm": 0.769177258014679, - "learning_rate": 0.0002497360198254498, - "loss": 3.3358, + "epoch": 5.8544474393531, + "grad_norm": 0.7050347924232483, + "learning_rate": 0.00024913761467889906, + "loss": 3.3425, "step": 54300 }, { - "epoch": 5.849747067054138, - "grad_norm": 0.7687222957611084, - "learning_rate": 0.00024941277879538844, - "loss": 3.3366, + "epoch": 5.859838274932614, + "grad_norm": 0.7527475357055664, + "learning_rate": 0.0002488138154344306, + "loss": 3.3453, "step": 54350 }, { - "epoch": 5.855128619093747, - "grad_norm": 0.7368986010551453, - "learning_rate": 0.000249089537765327, - "loss": 3.332, + "epoch": 5.8652291105121295, + "grad_norm": 0.6619974970817566, + "learning_rate": 0.0002484900161899622, + "loss": 3.3404, "step": 54400 }, { - "epoch": 5.860510171133355, - "grad_norm": 0.7428856492042542, - "learning_rate": 0.0002487662967352656, - "loss": 3.3448, + "epoch": 5.870619946091644, + "grad_norm": 0.7044657468795776, + "learning_rate": 0.00024816621694549376, + "loss": 3.3278, "step": 54450 }, { - "epoch": 5.865891723172963, - "grad_norm": 0.7492393255233765, - "learning_rate": 0.00024844305570520417, - "loss": 3.3396, + "epoch": 5.876010781671159, + "grad_norm": 0.72688227891922, + "learning_rate": 0.00024784241770102537, + "loss": 3.3391, "step": 54500 }, { - "epoch": 5.871273275212571, - "grad_norm": 0.8640391826629639, - "learning_rate": 0.00024811981467514276, - "loss": 3.3256, + "epoch": 5.881401617250674, + "grad_norm": 0.7019941806793213, + "learning_rate": 0.0002475186184565569, + "loss": 3.3388, "step": 54550 }, { - "epoch": 5.87665482725218, - "grad_norm": 0.7381857633590698, - "learning_rate": 0.00024779657364508136, - "loss": 3.3202, + "epoch": 5.886792452830189, + "grad_norm": 0.7095934748649597, + "learning_rate": 0.00024719481921208847, + "loss": 3.3388, "step": 54600 }, { - "epoch": 5.882036379291788, - "grad_norm": 0.7371327877044678, - "learning_rate": 0.0002474733326150199, - "loss": 3.3164, + "epoch": 5.892183288409703, + "grad_norm": 0.7058985829353333, + "learning_rate": 0.0002468710199676201, + "loss": 3.3453, "step": 54650 }, { - "epoch": 5.887417931331396, - "grad_norm": 0.7317863702774048, - "learning_rate": 0.0002471500915849585, - "loss": 3.314, + "epoch": 5.8975741239892185, + "grad_norm": 0.7246441841125488, + "learning_rate": 0.0002465472207231516, + "loss": 3.3331, "step": 54700 }, { - "epoch": 5.892799483371004, - "grad_norm": 0.7320008277893066, - "learning_rate": 0.0002468268505548971, - "loss": 3.3218, + "epoch": 5.902964959568733, + "grad_norm": 0.7580798268318176, + "learning_rate": 0.0002462234214786832, + "loss": 3.3339, "step": 54750 }, { - "epoch": 5.898181035410612, - "grad_norm": 0.7335460186004639, - "learning_rate": 0.0002465036095248357, - "loss": 3.339, + "epoch": 5.908355795148248, + "grad_norm": 0.7288414835929871, + "learning_rate": 0.0002458996222342148, + "loss": 3.3325, "step": 54800 }, { - "epoch": 5.903562587450221, - "grad_norm": 0.7588884234428406, - "learning_rate": 0.0002461803684947742, - "loss": 3.3418, + "epoch": 5.913746630727763, + "grad_norm": 0.6888887882232666, + "learning_rate": 0.00024557582298974633, + "loss": 3.3351, "step": 54850 }, { - "epoch": 5.9089441394898286, - "grad_norm": 0.7557958364486694, - "learning_rate": 0.0002458571274647128, - "loss": 3.3497, + "epoch": 5.919137466307277, + "grad_norm": 0.7192179560661316, + "learning_rate": 0.0002452520237452779, + "loss": 3.3365, "step": 54900 }, { - "epoch": 5.914325691529437, - "grad_norm": 0.7636759281158447, - "learning_rate": 0.0002455338864346514, - "loss": 3.3302, + "epoch": 5.9245283018867925, + "grad_norm": 0.7986792325973511, + "learning_rate": 0.0002449282245008095, + "loss": 3.3433, "step": 54950 }, { - "epoch": 5.919707243569046, - "grad_norm": 0.7415563464164734, - "learning_rate": 0.00024521064540459, - "loss": 3.3454, + "epoch": 5.929919137466308, + "grad_norm": 0.6919881105422974, + "learning_rate": 0.00024460442525634104, + "loss": 3.3495, "step": 55000 }, { - "epoch": 5.919707243569046, - "eval_accuracy": 0.38400386108902135, - "eval_loss": 3.3798177242279053, - "eval_runtime": 180.4085, - "eval_samples_per_second": 99.835, - "eval_steps_per_second": 6.241, + "epoch": 5.929919137466308, + "eval_accuracy": 0.38388206120178137, + "eval_loss": 3.383330821990967, + "eval_runtime": 185.021, + "eval_samples_per_second": 97.346, + "eval_steps_per_second": 6.086, "step": 55000 }, { - "epoch": 5.925088795608653, - "grad_norm": 0.7452832460403442, - "learning_rate": 0.0002448874043745286, - "loss": 3.3509, + "epoch": 5.935309973045822, + "grad_norm": 0.7238466739654541, + "learning_rate": 0.00024428062601187264, + "loss": 3.3563, "step": 55050 }, { - "epoch": 5.930470347648262, - "grad_norm": 0.8210157752037048, - "learning_rate": 0.00024456416334446714, - "loss": 3.3223, + "epoch": 5.940700808625337, + "grad_norm": 0.675534188747406, + "learning_rate": 0.0002439568267674042, + "loss": 3.3441, "step": 55100 }, { - "epoch": 5.93585189968787, - "grad_norm": 0.734848141670227, - "learning_rate": 0.000244247387135007, - "loss": 3.3249, + "epoch": 5.946091644204852, + "grad_norm": 0.7394225001335144, + "learning_rate": 0.00024363302752293574, + "loss": 3.3394, "step": 55150 }, { - "epoch": 5.941233451727478, - "grad_norm": 0.8083829283714294, - "learning_rate": 0.00024392414610494557, - "loss": 3.329, + "epoch": 5.951482479784366, + "grad_norm": 0.7124001383781433, + "learning_rate": 0.00024330922827846732, + "loss": 3.3482, "step": 55200 }, { - "epoch": 5.946615003767087, - "grad_norm": 0.7373672723770142, - "learning_rate": 0.00024360090507488414, - "loss": 3.339, + "epoch": 5.9568733153638815, + "grad_norm": 0.7209517359733582, + "learning_rate": 0.00024299190501888827, + "loss": 3.3532, "step": 55250 }, { - "epoch": 5.951996555806694, - "grad_norm": 0.7348290681838989, - "learning_rate": 0.00024327766404482273, - "loss": 3.3362, + "epoch": 5.962264150943396, + "grad_norm": 0.6774404644966125, + "learning_rate": 0.00024266810577441985, + "loss": 3.3182, "step": 55300 }, { - "epoch": 5.957378107846303, - "grad_norm": 0.7888813018798828, - "learning_rate": 0.0002429544230147613, - "loss": 3.3275, + "epoch": 5.967654986522911, + "grad_norm": 0.7080289125442505, + "learning_rate": 0.00024234430652995143, + "loss": 3.3309, "step": 55350 }, { - "epoch": 5.962759659885911, - "grad_norm": 0.7368828058242798, - "learning_rate": 0.00024263118198469992, - "loss": 3.3428, + "epoch": 5.973045822102426, + "grad_norm": 0.6646098494529724, + "learning_rate": 0.00024202050728548298, + "loss": 3.3311, "step": 55400 }, { - "epoch": 5.968141211925519, - "grad_norm": 0.726843535900116, - "learning_rate": 0.0002423079409546385, - "loss": 3.357, + "epoch": 5.97843665768194, + "grad_norm": 0.7524878978729248, + "learning_rate": 0.00024169670804101456, + "loss": 3.3239, "step": 55450 }, { - "epoch": 5.973522763965128, - "grad_norm": 0.7297669053077698, - "learning_rate": 0.00024198469992457706, - "loss": 3.3171, + "epoch": 5.9838274932614555, + "grad_norm": 0.7035167813301086, + "learning_rate": 0.0002413729087965461, + "loss": 3.3441, "step": 55500 }, { - "epoch": 5.978904316004736, - "grad_norm": 0.7472332119941711, - "learning_rate": 0.00024166145889451568, - "loss": 3.3452, + "epoch": 5.989218328840971, + "grad_norm": 0.7106602191925049, + "learning_rate": 0.00024104910955207768, + "loss": 3.3366, "step": 55550 }, { - "epoch": 5.984285868044344, - "grad_norm": 0.7742068767547607, - "learning_rate": 0.00024133821786445425, - "loss": 3.35, + "epoch": 5.994609164420485, + "grad_norm": 0.6742231249809265, + "learning_rate": 0.00024072531030760926, + "loss": 3.3182, "step": 55600 }, { - "epoch": 5.989667420083952, - "grad_norm": 0.7965028882026672, - "learning_rate": 0.0002410149768343928, - "loss": 3.3454, + "epoch": 6.0, + "grad_norm": Infinity, + "learning_rate": 0.00024040798704803021, + "loss": 3.3324, "step": 55650 }, { - "epoch": 5.995048972123561, - "grad_norm": 0.7538635730743408, - "learning_rate": 0.0002406917358043314, - "loss": 3.3523, + "epoch": 6.005390835579515, + "grad_norm": 0.730053186416626, + "learning_rate": 0.00024008418780356176, + "loss": 3.2365, "step": 55700 }, { - "epoch": 6.000430524163169, - "grad_norm": 0.7462757229804993, - "learning_rate": 0.00024036849477427, - "loss": 3.3303, + "epoch": 6.010781671159029, + "grad_norm": 0.6938073635101318, + "learning_rate": 0.00023976038855909334, + "loss": 3.2432, "step": 55750 }, { - "epoch": 6.005812076202777, - "grad_norm": 0.7339770793914795, - "learning_rate": 0.00024004525374420857, - "loss": 3.2492, + "epoch": 6.0161725067385445, + "grad_norm": 0.6961255669593811, + "learning_rate": 0.00023943658931462492, + "loss": 3.256, "step": 55800 }, { - "epoch": 6.011193628242385, - "grad_norm": 0.7494232654571533, - "learning_rate": 0.00023972201271414716, - "loss": 3.2566, + "epoch": 6.02156334231806, + "grad_norm": 0.7340931296348572, + "learning_rate": 0.00023911279007015647, + "loss": 3.2447, "step": 55850 }, { - "epoch": 6.016575180281993, - "grad_norm": 0.7096470594406128, - "learning_rate": 0.00023939877168408573, - "loss": 3.2373, + "epoch": 6.026954177897574, + "grad_norm": 0.7116497159004211, + "learning_rate": 0.00023878899082568805, + "loss": 3.2531, "step": 55900 }, { - "epoch": 6.021956732321602, - "grad_norm": 0.7256503701210022, - "learning_rate": 0.00023907553065402433, - "loss": 3.2391, + "epoch": 6.032345013477089, + "grad_norm": 0.6797800064086914, + "learning_rate": 0.00023846519158121963, + "loss": 3.2467, "step": 55950 }, { - "epoch": 6.0273382843612096, - "grad_norm": 0.7260343432426453, - "learning_rate": 0.00023875228962396292, - "loss": 3.2501, + "epoch": 6.037735849056604, + "grad_norm": 0.7329632043838501, + "learning_rate": 0.0002381413923367512, + "loss": 3.2488, "step": 56000 }, { - "epoch": 6.0273382843612096, - "eval_accuracy": 0.3840860026722091, - "eval_loss": 3.3830971717834473, - "eval_runtime": 180.8499, - "eval_samples_per_second": 99.591, - "eval_steps_per_second": 6.226, + "epoch": 6.037735849056604, + "eval_accuracy": 0.3840788315816134, + "eval_loss": 3.386367082595825, + "eval_runtime": 184.7957, + "eval_samples_per_second": 97.464, + "eval_steps_per_second": 6.093, "step": 56000 }, { - "epoch": 6.032719836400818, - "grad_norm": 0.8156185746192932, - "learning_rate": 0.0002384290485939015, - "loss": 3.2535, + "epoch": 6.0431266846361185, + "grad_norm": 0.7919038534164429, + "learning_rate": 0.00023781759309228275, + "loss": 3.2497, "step": 56050 }, { - "epoch": 6.038101388440427, - "grad_norm": 0.7497657537460327, - "learning_rate": 0.00023810580756384006, - "loss": 3.2479, + "epoch": 6.048517520215634, + "grad_norm": 0.6978245377540588, + "learning_rate": 0.00023749379384781433, + "loss": 3.268, "step": 56100 }, { - "epoch": 6.043482940480034, - "grad_norm": 0.7156153321266174, - "learning_rate": 0.00023778256653377868, - "loss": 3.2614, + "epoch": 6.053908355795148, + "grad_norm": 0.670793354511261, + "learning_rate": 0.0002371699946033459, + "loss": 3.2439, "step": 56150 }, { - "epoch": 6.048864492519643, - "grad_norm": 0.7490081787109375, - "learning_rate": 0.00023745932550371725, - "loss": 3.2544, + "epoch": 6.059299191374663, + "grad_norm": 0.6458380222320557, + "learning_rate": 0.0002368461953588775, + "loss": 3.2512, "step": 56200 }, { - "epoch": 6.0542460445592505, - "grad_norm": 0.7716811299324036, - "learning_rate": 0.00023713608447365584, - "loss": 3.2517, + "epoch": 6.064690026954178, + "grad_norm": 0.6844601631164551, + "learning_rate": 0.00023652239611440904, + "loss": 3.2673, "step": 56250 }, { - "epoch": 6.059627596598859, - "grad_norm": 0.805608868598938, - "learning_rate": 0.00023681284344359444, - "loss": 3.2538, + "epoch": 6.070080862533692, + "grad_norm": 0.7111284732818604, + "learning_rate": 0.00023619859686994062, + "loss": 3.2516, "step": 56300 }, { - "epoch": 6.065009148638468, - "grad_norm": 0.8110917210578918, - "learning_rate": 0.000236489602413533, - "loss": 3.246, + "epoch": 6.0754716981132075, + "grad_norm": 0.7226814031600952, + "learning_rate": 0.00023587479762547217, + "loss": 3.2614, "step": 56350 }, { - "epoch": 6.070390700678075, - "grad_norm": 0.8059680461883545, - "learning_rate": 0.0002361663613834716, - "loss": 3.2608, + "epoch": 6.080862533692723, + "grad_norm": 0.708491861820221, + "learning_rate": 0.00023555099838100374, + "loss": 3.2659, "step": 56400 }, { - "epoch": 6.075772252717684, - "grad_norm": 0.7581197619438171, - "learning_rate": 0.00023584312035341017, - "loss": 3.2681, + "epoch": 6.086253369272237, + "grad_norm": 0.7179819941520691, + "learning_rate": 0.00023522719913653532, + "loss": 3.2567, "step": 56450 }, { - "epoch": 6.081153804757292, - "grad_norm": 0.7696651816368103, - "learning_rate": 0.00023551987932334876, - "loss": 3.2486, + "epoch": 6.091644204851752, + "grad_norm": 0.7065309286117554, + "learning_rate": 0.0002349033998920669, + "loss": 3.2648, "step": 56500 }, { - "epoch": 6.0865353567969, - "grad_norm": 0.7502686977386475, - "learning_rate": 0.00023519663829328735, - "loss": 3.2639, + "epoch": 6.097035040431267, + "grad_norm": 0.6956079602241516, + "learning_rate": 0.00023457960064759848, + "loss": 3.2849, "step": 56550 }, { - "epoch": 6.091916908836509, - "grad_norm": 0.7801194190979004, - "learning_rate": 0.00023487339726322592, - "loss": 3.2652, + "epoch": 6.1024258760107815, + "grad_norm": 0.7147029638290405, + "learning_rate": 0.00023425580140313005, + "loss": 3.2829, "step": 56600 }, { - "epoch": 6.097298460876116, - "grad_norm": 0.7480630874633789, - "learning_rate": 0.0002345501562331645, - "loss": 3.2495, + "epoch": 6.107816711590297, + "grad_norm": 0.7096705436706543, + "learning_rate": 0.00023393200215866163, + "loss": 3.2555, "step": 56650 }, { - "epoch": 6.102680012915725, - "grad_norm": 0.7701714038848877, - "learning_rate": 0.0002342269152031031, - "loss": 3.27, + "epoch": 6.113207547169812, + "grad_norm": 0.7230135202407837, + "learning_rate": 0.00023360820291419316, + "loss": 3.2777, "step": 56700 }, { - "epoch": 6.108061564955333, - "grad_norm": 0.7400213479995728, - "learning_rate": 0.00023390367417304168, - "loss": 3.2617, + "epoch": 6.118598382749326, + "grad_norm": 0.6688002347946167, + "learning_rate": 0.00023328440366972473, + "loss": 3.2658, "step": 56750 }, { - "epoch": 6.113443116994941, - "grad_norm": 0.7020822763442993, - "learning_rate": 0.00023358043314298025, - "loss": 3.2603, + "epoch": 6.123989218328841, + "grad_norm": 0.7019734382629395, + "learning_rate": 0.0002329606044252563, + "loss": 3.258, "step": 56800 }, { - "epoch": 6.11882466903455, - "grad_norm": 0.7655128240585327, - "learning_rate": 0.00023325719211291887, - "loss": 3.2421, + "epoch": 6.129380053908355, + "grad_norm": 0.7122552990913391, + "learning_rate": 0.0002326368051807879, + "loss": 3.2585, "step": 56850 }, { - "epoch": 6.124206221074158, - "grad_norm": 0.7835574150085449, - "learning_rate": 0.00023293395108285744, - "loss": 3.2615, + "epoch": 6.1347708894878705, + "grad_norm": 0.7427650690078735, + "learning_rate": 0.00023231300593631947, + "loss": 3.2579, "step": 56900 }, { - "epoch": 6.129587773113766, - "grad_norm": 0.7708025574684143, - "learning_rate": 0.000232610710052796, - "loss": 3.2681, + "epoch": 6.140161725067386, + "grad_norm": 0.7284014225006104, + "learning_rate": 0.00023198920669185104, + "loss": 3.2703, "step": 56950 }, { - "epoch": 6.134969325153374, - "grad_norm": 0.7812603116035461, - "learning_rate": 0.0002322874690227346, - "loss": 3.2706, + "epoch": 6.1455525606469, + "grad_norm": 0.7284674048423767, + "learning_rate": 0.00023166540744738262, + "loss": 3.2657, "step": 57000 }, { - "epoch": 6.134969325153374, - "eval_accuracy": 0.3842589780696098, - "eval_loss": 3.382967472076416, - "eval_runtime": 180.3604, - "eval_samples_per_second": 99.861, - "eval_steps_per_second": 6.243, + "epoch": 6.1455525606469, + "eval_accuracy": 0.3841316368850912, + "eval_loss": 3.384326457977295, + "eval_runtime": 184.9132, + "eval_samples_per_second": 97.402, + "eval_steps_per_second": 6.089, "step": 57000 }, { - "epoch": 6.140350877192983, - "grad_norm": 0.7508556842803955, - "learning_rate": 0.0002319642279926732, - "loss": 3.2406, + "epoch": 6.150943396226415, + "grad_norm": 0.7034928798675537, + "learning_rate": 0.0002313416082029142, + "loss": 3.2762, "step": 57050 }, { - "epoch": 6.1457324292325906, - "grad_norm": 0.7822660207748413, - "learning_rate": 0.00023164098696261176, - "loss": 3.2669, + "epoch": 6.15633423180593, + "grad_norm": 0.7338201999664307, + "learning_rate": 0.00023101780895844572, + "loss": 3.2585, "step": 57100 }, { - "epoch": 6.151113981272199, - "grad_norm": 0.7589568495750427, - "learning_rate": 0.00023131774593255036, - "loss": 3.2541, + "epoch": 6.1617250673854445, + "grad_norm": 0.7155392169952393, + "learning_rate": 0.0002306940097139773, + "loss": 3.27, "step": 57150 }, { - "epoch": 6.156495533311807, - "grad_norm": 0.8089141845703125, - "learning_rate": 0.00023099450490248892, - "loss": 3.2655, + "epoch": 6.16711590296496, + "grad_norm": 0.7334120869636536, + "learning_rate": 0.00023037021046950888, + "loss": 3.2631, "step": 57200 }, { - "epoch": 6.161877085351415, - "grad_norm": 0.7813968658447266, - "learning_rate": 0.00023067126387242754, - "loss": 3.2779, + "epoch": 6.172506738544475, + "grad_norm": 0.7121962904930115, + "learning_rate": 0.00023004641122504046, + "loss": 3.2653, "step": 57250 }, { - "epoch": 6.167258637391024, - "grad_norm": 0.7517101168632507, - "learning_rate": 0.0002303480228423661, - "loss": 3.261, + "epoch": 6.177897574123989, + "grad_norm": 0.7687353491783142, + "learning_rate": 0.00022972261198057203, + "loss": 3.2666, "step": 57300 }, { - "epoch": 6.1726401894306315, - "grad_norm": 0.7565500140190125, - "learning_rate": 0.00023002478181230468, - "loss": 3.2513, + "epoch": 6.183288409703504, + "grad_norm": 0.779589831829071, + "learning_rate": 0.0002293988127361036, + "loss": 3.2713, "step": 57350 }, { - "epoch": 6.17802174147024, - "grad_norm": 0.722113311290741, - "learning_rate": 0.00022970154078224327, - "loss": 3.2755, + "epoch": 6.188679245283019, + "grad_norm": 0.7740386128425598, + "learning_rate": 0.00022907501349163516, + "loss": 3.2798, "step": 57400 }, { - "epoch": 6.183403293509849, - "grad_norm": 0.805997371673584, - "learning_rate": 0.00022937829975218187, - "loss": 3.2674, + "epoch": 6.1940700808625335, + "grad_norm": 0.8117671608924866, + "learning_rate": 0.00022875121424716674, + "loss": 3.2933, "step": 57450 }, { - "epoch": 6.188784845549456, - "grad_norm": 0.7368705868721008, - "learning_rate": 0.00022905505872212044, - "loss": 3.2627, + "epoch": 6.199460916442049, + "grad_norm": 0.7242795825004578, + "learning_rate": 0.00022842741500269832, + "loss": 3.2469, "step": 57500 }, { - "epoch": 6.194166397589065, - "grad_norm": 0.7440856099128723, - "learning_rate": 0.00022873181769205903, - "loss": 3.2576, + "epoch": 6.204851752021563, + "grad_norm": 0.759215772151947, + "learning_rate": 0.00022810361575822987, + "loss": 3.2977, "step": 57550 }, { - "epoch": 6.1995479496286725, - "grad_norm": 0.8116312623023987, - "learning_rate": 0.0002284085766619976, - "loss": 3.2743, + "epoch": 6.210242587601078, + "grad_norm": 0.7306986451148987, + "learning_rate": 0.00022777981651376145, + "loss": 3.2674, "step": 57600 }, { - "epoch": 6.204929501668281, - "grad_norm": 0.7270071506500244, - "learning_rate": 0.0002280853356319362, - "loss": 3.2859, + "epoch": 6.215633423180593, + "grad_norm": 0.7148694396018982, + "learning_rate": 0.00022745601726929302, + "loss": 3.2705, "step": 57650 }, { - "epoch": 6.21031105370789, - "grad_norm": 0.7267986536026001, - "learning_rate": 0.0002277620946018748, - "loss": 3.2578, + "epoch": 6.2210242587601075, + "grad_norm": 0.7374268770217896, + "learning_rate": 0.00022713221802482457, + "loss": 3.29, "step": 57700 }, { - "epoch": 6.215692605747497, - "grad_norm": 0.7756433486938477, - "learning_rate": 0.00022743885357181336, - "loss": 3.2718, + "epoch": 6.226415094339623, + "grad_norm": 0.7002312541007996, + "learning_rate": 0.00022681489476524553, + "loss": 3.2757, "step": 57750 }, { - "epoch": 6.221074157787106, - "grad_norm": 0.7905489206314087, - "learning_rate": 0.00022711561254175192, - "loss": 3.2755, + "epoch": 6.231805929919138, + "grad_norm": 0.77752286195755, + "learning_rate": 0.0002264910955207771, + "loss": 3.28, "step": 57800 }, { - "epoch": 6.226455709826714, - "grad_norm": 0.8063260912895203, - "learning_rate": 0.00022679237151169054, - "loss": 3.2735, + "epoch": 6.237196765498652, + "grad_norm": 0.7070091366767883, + "learning_rate": 0.00022616729627630868, + "loss": 3.2798, "step": 57850 }, { - "epoch": 6.231837261866322, - "grad_norm": 0.7980824708938599, - "learning_rate": 0.0002264691304816291, - "loss": 3.2635, + "epoch": 6.242587601078167, + "grad_norm": 0.7434335350990295, + "learning_rate": 0.00022584349703184023, + "loss": 3.2808, "step": 57900 }, { - "epoch": 6.237218813905931, - "grad_norm": 0.7567322254180908, - "learning_rate": 0.00022614588945156768, - "loss": 3.2738, + "epoch": 6.247978436657682, + "grad_norm": 0.7469635009765625, + "learning_rate": 0.0002255196977873718, + "loss": 3.263, "step": 57950 }, { - "epoch": 6.242600365945538, - "grad_norm": 0.7856761813163757, - "learning_rate": 0.0002258226484215063, - "loss": 3.28, + "epoch": 6.2533692722371965, + "grad_norm": 0.7343301773071289, + "learning_rate": 0.0002251958985429034, + "loss": 3.2838, "step": 58000 }, { - "epoch": 6.242600365945538, - "eval_accuracy": 0.3846668610104653, - "eval_loss": 3.3798773288726807, - "eval_runtime": 180.5316, - "eval_samples_per_second": 99.766, - "eval_steps_per_second": 6.237, + "epoch": 6.2533692722371965, + "eval_accuracy": 0.38464078431738985, + "eval_loss": 3.380790948867798, + "eval_runtime": 185.1083, + "eval_samples_per_second": 97.3, + "eval_steps_per_second": 6.083, "step": 58000 }, { - "epoch": 6.247981917985147, - "grad_norm": 0.7729969024658203, - "learning_rate": 0.00022549940739144487, - "loss": 3.2857, + "epoch": 6.258760107816712, + "grad_norm": 0.7176439762115479, + "learning_rate": 0.00022487209929843494, + "loss": 3.2724, "step": 58050 }, { - "epoch": 6.253363470024755, - "grad_norm": 0.8097330927848816, - "learning_rate": 0.00022517616636138344, - "loss": 3.2717, + "epoch": 6.264150943396227, + "grad_norm": 0.7113427519798279, + "learning_rate": 0.00022454830005396652, + "loss": 3.278, "step": 58100 }, { - "epoch": 6.258745022064363, - "grad_norm": 0.8649580478668213, - "learning_rate": 0.00022485292533132203, - "loss": 3.2832, + "epoch": 6.269541778975741, + "grad_norm": 0.7005300521850586, + "learning_rate": 0.0002242245008094981, + "loss": 3.2739, "step": 58150 }, { - "epoch": 6.264126574103972, - "grad_norm": 0.771740734577179, - "learning_rate": 0.00022452968430126063, - "loss": 3.2734, + "epoch": 6.274932614555256, + "grad_norm": 0.719237744808197, + "learning_rate": 0.00022390070156502967, + "loss": 3.2707, "step": 58200 }, { - "epoch": 6.26950812614358, - "grad_norm": 0.7663628458976746, - "learning_rate": 0.00022420644327119922, - "loss": 3.2736, + "epoch": 6.280323450134771, + "grad_norm": 0.808284342288971, + "learning_rate": 0.00022357690232056125, + "loss": 3.2804, "step": 58250 }, { - "epoch": 6.274889678183188, - "grad_norm": 0.7669972777366638, - "learning_rate": 0.0002238832022411378, - "loss": 3.2712, + "epoch": 6.285714285714286, + "grad_norm": 0.7207859754562378, + "learning_rate": 0.0002232531030760928, + "loss": 3.2658, "step": 58300 }, { - "epoch": 6.280271230222796, - "grad_norm": 0.7995150089263916, - "learning_rate": 0.00022355996121107636, - "loss": 3.2764, + "epoch": 6.291105121293801, + "grad_norm": 0.7794989943504333, + "learning_rate": 0.00022292930383162435, + "loss": 3.2928, "step": 58350 }, { - "epoch": 6.285652782262405, - "grad_norm": 0.7507680654525757, - "learning_rate": 0.00022323672018101498, - "loss": 3.2882, + "epoch": 6.296495956873315, + "grad_norm": 0.7986652851104736, + "learning_rate": 0.00022260550458715593, + "loss": 3.2741, "step": 58400 }, { - "epoch": 6.2910343343020125, - "grad_norm": 0.8180110454559326, - "learning_rate": 0.00022291347915095355, - "loss": 3.2764, + "epoch": 6.30188679245283, + "grad_norm": 0.7763763666152954, + "learning_rate": 0.0002222817053426875, + "loss": 3.2745, "step": 58450 }, { - "epoch": 6.296415886341621, - "grad_norm": 0.7749719619750977, - "learning_rate": 0.0002225902381208921, - "loss": 3.272, + "epoch": 6.307277628032345, + "grad_norm": 0.7197876572608948, + "learning_rate": 0.00022195790609821908, + "loss": 3.2668, "step": 58500 }, { - "epoch": 6.301797438381229, - "grad_norm": 0.7502884864807129, - "learning_rate": 0.00022226699709083073, - "loss": 3.2762, + "epoch": 6.3126684636118595, + "grad_norm": 0.7718214392662048, + "learning_rate": 0.00022163410685375066, + "loss": 3.2773, "step": 58550 }, { - "epoch": 6.307178990420837, - "grad_norm": 0.8392398357391357, - "learning_rate": 0.00022195022088137051, - "loss": 3.2646, + "epoch": 6.318059299191375, + "grad_norm": 0.7384246587753296, + "learning_rate": 0.00022131030760928224, + "loss": 3.2932, "step": 58600 }, { - "epoch": 6.312560542460446, - "grad_norm": 0.7521066665649414, - "learning_rate": 0.0002216269798513091, - "loss": 3.2673, + "epoch": 6.32345013477089, + "grad_norm": 0.7198066115379333, + "learning_rate": 0.00022098650836481382, + "loss": 3.2703, "step": 58650 }, { - "epoch": 6.3179420945000535, - "grad_norm": 0.7701011300086975, - "learning_rate": 0.00022130373882124768, - "loss": 3.2645, + "epoch": 6.328840970350404, + "grad_norm": 0.7592787742614746, + "learning_rate": 0.0002206627091203454, + "loss": 3.2831, "step": 58700 }, { - "epoch": 6.323323646539662, - "grad_norm": 0.8215457797050476, - "learning_rate": 0.00022098049779118627, - "loss": 3.2603, + "epoch": 6.334231805929919, + "grad_norm": 0.7230469584465027, + "learning_rate": 0.00022033890987587692, + "loss": 3.2884, "step": 58750 }, { - "epoch": 6.328705198579271, - "grad_norm": 0.7722597122192383, - "learning_rate": 0.00022065725676112487, - "loss": 3.2791, + "epoch": 6.339622641509434, + "grad_norm": 0.7920795679092407, + "learning_rate": 0.0002200151106314085, + "loss": 3.2864, "step": 58800 }, { - "epoch": 6.334086750618878, - "grad_norm": 0.7506366968154907, - "learning_rate": 0.00022033401573106343, - "loss": 3.2869, + "epoch": 6.345013477088949, + "grad_norm": 0.7017115950584412, + "learning_rate": 0.00021969131138694007, + "loss": 3.2721, "step": 58850 }, { - "epoch": 6.339468302658487, - "grad_norm": 0.7620330452919006, - "learning_rate": 0.000220010774701002, + "epoch": 6.350404312668464, + "grad_norm": 0.7787772417068481, + "learning_rate": 0.00021936751214247165, "loss": 3.274, "step": 58900 }, { - "epoch": 6.344849854698095, - "grad_norm": 0.7641044855117798, - "learning_rate": 0.00021968753367094062, - "loss": 3.2703, + "epoch": 6.355795148247978, + "grad_norm": 0.7387892603874207, + "learning_rate": 0.00021904371289800323, + "loss": 3.2744, "step": 58950 }, { - "epoch": 6.350231406737703, - "grad_norm": 0.7357539534568787, - "learning_rate": 0.0002193642926408792, - "loss": 3.277, + "epoch": 6.361185983827493, + "grad_norm": 0.7931119203567505, + "learning_rate": 0.0002187199136535348, + "loss": 3.2684, "step": 59000 }, { - "epoch": 6.350231406737703, - "eval_accuracy": 0.3850675728607251, - "eval_loss": 3.3782007694244385, - "eval_runtime": 180.5172, - "eval_samples_per_second": 99.774, - "eval_steps_per_second": 6.238, + "epoch": 6.361185983827493, + "eval_accuracy": 0.38483874987898786, + "eval_loss": 3.378488302230835, + "eval_runtime": 184.6031, + "eval_samples_per_second": 97.566, + "eval_steps_per_second": 6.1, "step": 59000 }, { - "epoch": 6.355612958777312, - "grad_norm": 0.793239951133728, - "learning_rate": 0.00021904105161081778, - "loss": 3.2845, + "epoch": 6.366576819407008, + "grad_norm": 0.7148496508598328, + "learning_rate": 0.00021839611440906638, + "loss": 3.2747, "step": 59050 }, { - "epoch": 6.360994510816919, - "grad_norm": 0.7626572251319885, - "learning_rate": 0.00021871781058075638, - "loss": 3.2751, + "epoch": 6.3719676549865225, + "grad_norm": 0.733859658241272, + "learning_rate": 0.00021807231516459793, + "loss": 3.2728, "step": 59100 }, { - "epoch": 6.366376062856528, - "grad_norm": 0.7418571710586548, - "learning_rate": 0.00021839456955069495, - "loss": 3.285, + "epoch": 6.377358490566038, + "grad_norm": 0.7047247886657715, + "learning_rate": 0.00021774851592012948, + "loss": 3.2855, "step": 59150 }, { - "epoch": 6.371757614896136, - "grad_norm": 0.7725799679756165, - "learning_rate": 0.00021807132852063354, - "loss": 3.2829, + "epoch": 6.382749326145553, + "grad_norm": 0.7953727841377258, + "learning_rate": 0.00021742471667566106, + "loss": 3.2883, "step": 59200 }, { - "epoch": 6.377139166935744, - "grad_norm": 0.731188952922821, - "learning_rate": 0.0002177480874905721, - "loss": 3.2764, + "epoch": 6.388140161725067, + "grad_norm": 0.7541425228118896, + "learning_rate": 0.00021710091743119264, + "loss": 3.2769, "step": 59250 }, { - "epoch": 6.382520718975353, - "grad_norm": 0.7765207290649414, - "learning_rate": 0.0002174248464605107, - "loss": 3.2717, + "epoch": 6.393530997304582, + "grad_norm": 0.7310684323310852, + "learning_rate": 0.00021677711818672422, + "loss": 3.298, "step": 59300 }, { - "epoch": 6.387902271014961, - "grad_norm": 0.7506345510482788, - "learning_rate": 0.0002171016054304493, - "loss": 3.2679, + "epoch": 6.398921832884097, + "grad_norm": 0.7032855749130249, + "learning_rate": 0.0002164533189422558, + "loss": 3.2742, "step": 59350 }, { - "epoch": 6.393283823054569, - "grad_norm": 0.7452055215835571, - "learning_rate": 0.00021677836440038787, - "loss": 3.2577, + "epoch": 6.404312668463612, + "grad_norm": 0.7056180834770203, + "learning_rate": 0.00021612951969778734, + "loss": 3.274, "step": 59400 }, { - "epoch": 6.398665375094177, - "grad_norm": 0.8073831796646118, - "learning_rate": 0.00021645512337032643, - "loss": 3.2606, + "epoch": 6.409703504043127, + "grad_norm": 0.7354710698127747, + "learning_rate": 0.00021580572045331892, + "loss": 3.2877, "step": 59450 }, { - "epoch": 6.404046927133785, - "grad_norm": 0.7789980173110962, - "learning_rate": 0.00021613188234026506, - "loss": 3.2712, + "epoch": 6.415094339622642, + "grad_norm": 0.7771357893943787, + "learning_rate": 0.0002154819212088505, + "loss": 3.2757, "step": 59500 }, { - "epoch": 6.4094284791733935, - "grad_norm": 0.8396959900856018, - "learning_rate": 0.00021580864131020362, - "loss": 3.2768, + "epoch": 6.420485175202156, + "grad_norm": 1.259022831916809, + "learning_rate": 0.00021515812196438208, + "loss": 3.2676, "step": 59550 }, { - "epoch": 6.414810031213002, - "grad_norm": 0.797767162322998, - "learning_rate": 0.0002154854002801422, - "loss": 3.2813, + "epoch": 6.425876010781671, + "grad_norm": 0.7444080710411072, + "learning_rate": 0.00021483432271991363, + "loss": 3.2774, "step": 59600 }, { - "epoch": 6.42019158325261, - "grad_norm": 0.802899956703186, - "learning_rate": 0.0002151621592500808, - "loss": 3.2889, + "epoch": 6.431266846361186, + "grad_norm": 0.7366527915000916, + "learning_rate": 0.0002145105234754452, + "loss": 3.284, "step": 59650 }, { - "epoch": 6.425573135292218, - "grad_norm": 0.8054869771003723, - "learning_rate": 0.00021483891822001938, - "loss": 3.2716, + "epoch": 6.436657681940701, + "grad_norm": 0.7495415806770325, + "learning_rate": 0.00021418672423097676, + "loss": 3.2778, "step": 59700 }, { - "epoch": 6.430954687331827, - "grad_norm": 0.7798833250999451, - "learning_rate": 0.00021451567718995795, - "loss": 3.2781, + "epoch": 6.442048517520216, + "grad_norm": 0.7302351593971252, + "learning_rate": 0.00021386292498650833, + "loss": 3.2894, "step": 59750 }, { - "epoch": 6.4363362393714345, - "grad_norm": 0.7423691153526306, - "learning_rate": 0.00021419243615989654, - "loss": 3.2763, + "epoch": 6.44743935309973, + "grad_norm": 0.7209936380386353, + "learning_rate": 0.0002135391257420399, + "loss": 3.2789, "step": 59800 }, { - "epoch": 6.441717791411043, - "grad_norm": 0.8051731586456299, - "learning_rate": 0.00021386919512983514, - "loss": 3.2666, + "epoch": 6.452830188679245, + "grad_norm": 0.7588741779327393, + "learning_rate": 0.0002132153264975715, + "loss": 3.2869, "step": 59850 }, { - "epoch": 6.447099343450651, - "grad_norm": 0.8098742365837097, - "learning_rate": 0.0002135459540997737, - "loss": 3.2656, + "epoch": 6.45822102425876, + "grad_norm": 0.7343195080757141, + "learning_rate": 0.00021289152725310307, + "loss": 3.2743, "step": 59900 }, { - "epoch": 6.452480895490259, - "grad_norm": 0.8203098177909851, - "learning_rate": 0.0002132227130697123, - "loss": 3.2729, + "epoch": 6.463611859838275, + "grad_norm": 0.6876043081283569, + "learning_rate": 0.00021256772800863464, + "loss": 3.2879, "step": 59950 }, { - "epoch": 6.457862447529868, - "grad_norm": 0.736182689666748, - "learning_rate": 0.00021289947203965087, - "loss": 3.2727, + "epoch": 6.46900269541779, + "grad_norm": 0.7846079468727112, + "learning_rate": 0.0002122439287641662, + "loss": 3.2841, "step": 60000 }, { - "epoch": 6.457862447529868, - "eval_accuracy": 0.385509246849691, - "eval_loss": 3.372286558151245, - "eval_runtime": 180.8685, - "eval_samples_per_second": 99.581, - "eval_steps_per_second": 6.226, + "epoch": 6.46900269541779, + "eval_accuracy": 0.3849802159389223, + "eval_loss": 3.3735952377319336, + "eval_runtime": 184.7204, + "eval_samples_per_second": 97.504, + "eval_steps_per_second": 6.096, "step": 60000 }, { - "epoch": 6.4632439995694755, - "grad_norm": 0.7694886326789856, - "learning_rate": 0.0002125762310095895, - "loss": 3.2795, + "epoch": 6.474393530997305, + "grad_norm": 0.7180082201957703, + "learning_rate": 0.00021192012951969775, + "loss": 3.2929, "step": 60050 }, { - "epoch": 6.468625551609084, - "grad_norm": 0.7625947594642639, - "learning_rate": 0.00021225298997952806, - "loss": 3.2678, + "epoch": 6.479784366576819, + "grad_norm": 0.7589936852455139, + "learning_rate": 0.00021159633027522932, + "loss": 3.2898, "step": 60100 }, { - "epoch": 6.474007103648693, - "grad_norm": 0.7633724808692932, - "learning_rate": 0.00021192974894946662, - "loss": 3.2773, + "epoch": 6.485175202156334, + "grad_norm": 0.748837947845459, + "learning_rate": 0.0002112725310307609, + "loss": 3.2811, "step": 60150 }, { - "epoch": 6.4793886556883, - "grad_norm": 0.7859768271446228, - "learning_rate": 0.00021160650791940524, - "loss": 3.2929, + "epoch": 6.490566037735849, + "grad_norm": 0.8100166916847229, + "learning_rate": 0.00021094873178629248, + "loss": 3.3062, "step": 60200 }, { - "epoch": 6.484770207727909, - "grad_norm": 0.7822881937026978, - "learning_rate": 0.0002112832668893438, - "loss": 3.2923, + "epoch": 6.495956873315364, + "grad_norm": 0.6930977702140808, + "learning_rate": 0.00021062493254182406, + "loss": 3.279, "step": 60250 }, { - "epoch": 6.490151759767517, - "grad_norm": 0.7258760929107666, - "learning_rate": 0.00021096002585928238, - "loss": 3.2784, + "epoch": 6.501347708894879, + "grad_norm": 0.7058539986610413, + "learning_rate": 0.00021030113329735563, + "loss": 3.2849, "step": 60300 }, { - "epoch": 6.495533311807125, - "grad_norm": 0.7965183854103088, - "learning_rate": 0.00021063678482922097, - "loss": 3.2617, + "epoch": 6.506738544474393, + "grad_norm": 0.7700808644294739, + "learning_rate": 0.0002099773340528872, + "loss": 3.2861, "step": 60350 }, { - "epoch": 6.500914863846734, - "grad_norm": 0.8430307507514954, - "learning_rate": 0.00021031354379915957, - "loss": 3.2637, + "epoch": 6.512129380053908, + "grad_norm": 0.7218989729881287, + "learning_rate": 0.0002096535348084188, + "loss": 3.2817, "step": 60400 }, { - "epoch": 6.506296415886341, - "grad_norm": 0.7958040833473206, - "learning_rate": 0.00020999030276909814, - "loss": 3.2915, + "epoch": 6.517520215633423, + "grad_norm": 0.7468814253807068, + "learning_rate": 0.0002093297355639503, + "loss": 3.2966, "step": 60450 }, { - "epoch": 6.51167796792595, - "grad_norm": 0.7817371487617493, - "learning_rate": 0.00020966706173903673, - "loss": 3.2911, + "epoch": 6.5229110512129385, + "grad_norm": 0.7302692532539368, + "learning_rate": 0.0002090059363194819, + "loss": 3.2844, "step": 60500 }, { - "epoch": 6.517059519965558, - "grad_norm": 0.7723913192749023, - "learning_rate": 0.0002093438207089753, - "loss": 3.2695, + "epoch": 6.528301886792453, + "grad_norm": 0.7506006956100464, + "learning_rate": 0.00020868213707501347, + "loss": 3.3064, "step": 60550 }, { - "epoch": 6.522441072005166, - "grad_norm": 0.7523823380470276, - "learning_rate": 0.00020902057967891387, - "loss": 3.2835, + "epoch": 6.533692722371968, + "grad_norm": 0.7327467203140259, + "learning_rate": 0.00020835833783054505, + "loss": 3.2984, "step": 60600 }, { - "epoch": 6.5278226240447745, - "grad_norm": 0.7971324920654297, - "learning_rate": 0.0002086973386488525, - "loss": 3.2922, + "epoch": 6.539083557951482, + "grad_norm": 0.7747229933738708, + "learning_rate": 0.00020803453858607662, + "loss": 3.2889, "step": 60650 }, { - "epoch": 6.533204176084383, - "grad_norm": 0.8252760767936707, - "learning_rate": 0.00020837409761879106, - "loss": 3.2898, + "epoch": 6.544474393530997, + "grad_norm": 0.7802739143371582, + "learning_rate": 0.0002077107393416082, + "loss": 3.3025, "step": 60700 }, { - "epoch": 6.538585728123991, - "grad_norm": 0.7614521980285645, - "learning_rate": 0.00020805085658872962, - "loss": 3.2906, + "epoch": 6.549865229110512, + "grad_norm": 0.7712839245796204, + "learning_rate": 0.00020738694009713975, + "loss": 3.29, "step": 60750 }, { - "epoch": 6.543967280163599, - "grad_norm": 0.8246181011199951, - "learning_rate": 0.00020772761555866825, - "loss": 3.2725, + "epoch": 6.555256064690027, + "grad_norm": 0.72076416015625, + "learning_rate": 0.00020706314085267133, + "loss": 3.2813, "step": 60800 }, { - "epoch": 6.549348832203208, - "grad_norm": 0.7531991004943848, - "learning_rate": 0.0002074043745286068, - "loss": 3.274, + "epoch": 6.560646900269542, + "grad_norm": 0.7788540720939636, + "learning_rate": 0.00020673934160820288, + "loss": 3.2969, "step": 60850 }, { - "epoch": 6.5547303842428155, - "grad_norm": 0.8107429146766663, - "learning_rate": 0.00020708113349854538, - "loss": 3.2757, + "epoch": 6.566037735849057, + "grad_norm": 0.7517972588539124, + "learning_rate": 0.00020641554236373446, + "loss": 3.2807, "step": 60900 }, { - "epoch": 6.560111936282424, - "grad_norm": 0.8336337208747864, - "learning_rate": 0.00020675789246848397, - "loss": 3.2756, + "epoch": 6.571428571428571, + "grad_norm": 0.7544095516204834, + "learning_rate": 0.00020609174311926604, + "loss": 3.2833, "step": 60950 }, { - "epoch": 6.565493488322032, - "grad_norm": 0.8028927445411682, - "learning_rate": 0.00020643465143842257, - "loss": 3.2838, + "epoch": 6.576819407008086, + "grad_norm": 0.739867091178894, + "learning_rate": 0.0002057679438747976, + "loss": 3.2765, "step": 61000 }, { - "epoch": 6.565493488322032, - "eval_accuracy": 0.3859593957639064, - "eval_loss": 3.3682973384857178, - "eval_runtime": 180.3806, - "eval_samples_per_second": 99.85, - "eval_steps_per_second": 6.242, + "epoch": 6.576819407008086, + "eval_accuracy": 0.3854455762574317, + "eval_loss": 3.370424270629883, + "eval_runtime": 184.81, + "eval_samples_per_second": 97.457, + "eval_steps_per_second": 6.093, "step": 61000 }, { - "epoch": 6.57087504036164, - "grad_norm": 0.8147567510604858, - "learning_rate": 0.00020611141040836116, - "loss": 3.2949, + "epoch": 6.5822102425876015, + "grad_norm": 0.7352069616317749, + "learning_rate": 0.00020544414463032916, + "loss": 3.2759, "step": 61050 }, { - "epoch": 6.576256592401249, - "grad_norm": 0.8347201347351074, - "learning_rate": 0.00020578816937829973, - "loss": 3.2929, + "epoch": 6.587601078167116, + "grad_norm": 0.7170955538749695, + "learning_rate": 0.00020512034538586074, + "loss": 3.2656, "step": 61100 }, { - "epoch": 6.5816381444408565, - "grad_norm": 0.8149718046188354, - "learning_rate": 0.0002054649283482383, - "loss": 3.2933, + "epoch": 6.592991913746631, + "grad_norm": 0.7378682494163513, + "learning_rate": 0.00020479654614139232, + "loss": 3.28, "step": 61150 }, { - "epoch": 6.587019696480465, - "grad_norm": 0.7641071677207947, - "learning_rate": 0.00020514168731817692, - "loss": 3.2689, + "epoch": 6.598382749326145, + "grad_norm": 0.785605788230896, + "learning_rate": 0.0002044727468969239, + "loss": 3.2807, "step": 61200 }, { - "epoch": 6.592401248520073, - "grad_norm": 0.8402169942855835, - "learning_rate": 0.0002048184462881155, - "loss": 3.2796, + "epoch": 6.60377358490566, + "grad_norm": 0.7256307005882263, + "learning_rate": 0.00020414894765245547, + "loss": 3.2822, "step": 61250 }, { - "epoch": 6.597782800559681, - "grad_norm": 0.7858189940452576, - "learning_rate": 0.00020449520525805406, - "loss": 3.29, + "epoch": 6.609164420485175, + "grad_norm": 0.7596465945243835, + "learning_rate": 0.00020382514840798702, + "loss": 3.2893, "step": 61300 }, { - "epoch": 6.60316435259929, - "grad_norm": 0.7707937359809875, - "learning_rate": 0.00020417196422799268, - "loss": 3.2823, + "epoch": 6.6145552560646905, + "grad_norm": 0.7317668199539185, + "learning_rate": 0.0002035013491635186, + "loss": 3.2854, "step": 61350 }, { - "epoch": 6.608545904638898, - "grad_norm": 0.7980074882507324, - "learning_rate": 0.00020384872319793125, - "loss": 3.2965, + "epoch": 6.619946091644205, + "grad_norm": 0.8078965544700623, + "learning_rate": 0.00020317754991905015, + "loss": 3.2705, "step": 61400 }, { - "epoch": 6.613927456678506, - "grad_norm": 0.771574854850769, - "learning_rate": 0.0002035254821678698, - "loss": 3.2852, + "epoch": 6.62533692722372, + "grad_norm": 0.7488526105880737, + "learning_rate": 0.00020285375067458173, + "loss": 3.3001, "step": 61450 }, { - "epoch": 6.619309008718115, - "grad_norm": 0.7880848050117493, - "learning_rate": 0.0002032022411378084, - "loss": 3.2822, + "epoch": 6.630727762803234, + "grad_norm": 0.738555908203125, + "learning_rate": 0.0002025299514301133, + "loss": 3.2768, "step": 61500 }, { - "epoch": 6.624690560757722, - "grad_norm": 0.7877942323684692, - "learning_rate": 0.000202879000107747, - "loss": 3.264, + "epoch": 6.636118598382749, + "grad_norm": 0.7701089978218079, + "learning_rate": 0.00020220615218564489, + "loss": 3.2937, "step": 61550 }, { - "epoch": 6.630072112797331, - "grad_norm": 0.7295520901679993, - "learning_rate": 0.00020255575907768557, - "loss": 3.2691, + "epoch": 6.6415094339622645, + "grad_norm": 0.6935857534408569, + "learning_rate": 0.00020188235294117646, + "loss": 3.2764, "step": 61600 }, { - "epoch": 6.635453664836939, - "grad_norm": 0.8145275712013245, - "learning_rate": 0.00020223251804762416, - "loss": 3.2715, + "epoch": 6.646900269541779, + "grad_norm": 0.740175187587738, + "learning_rate": 0.00020155855369670804, + "loss": 3.275, "step": 61650 }, { - "epoch": 6.640835216876547, - "grad_norm": 0.8078879714012146, - "learning_rate": 0.00020190927701756273, - "loss": 3.2752, + "epoch": 6.652291105121294, + "grad_norm": 0.778984010219574, + "learning_rate": 0.00020123475445223956, + "loss": 3.3037, "step": 61700 }, { - "epoch": 6.6462167689161555, - "grad_norm": 1.0516327619552612, - "learning_rate": 0.00020158603598750133, - "loss": 3.2702, + "epoch": 6.657681940700809, + "grad_norm": 0.7185824513435364, + "learning_rate": 0.00020091743119266052, + "loss": 3.2928, "step": 61750 }, { - "epoch": 6.651598320955763, - "grad_norm": 0.8313896059989929, - "learning_rate": 0.00020126279495743992, - "loss": 3.2767, + "epoch": 6.663072776280323, + "grad_norm": 0.7591161727905273, + "learning_rate": 0.0002005936319481921, + "loss": 3.2821, "step": 61800 }, { - "epoch": 6.656979872995372, - "grad_norm": 0.812926709651947, - "learning_rate": 0.0002009395539273785, - "loss": 3.2784, + "epoch": 6.668463611859838, + "grad_norm": 0.7795547246932983, + "learning_rate": 0.00020026983270372367, + "loss": 3.2903, "step": 61850 }, { - "epoch": 6.66236142503498, - "grad_norm": 0.7894045114517212, - "learning_rate": 0.00020061631289731706, - "loss": 3.2954, + "epoch": 6.6738544474393535, + "grad_norm": 0.7105275392532349, + "learning_rate": 0.00019994603345925525, + "loss": 3.2844, "step": 61900 }, { - "epoch": 6.667742977074588, - "grad_norm": 0.7732223272323608, - "learning_rate": 0.00020029307186725568, - "loss": 3.2946, + "epoch": 6.679245283018868, + "grad_norm": 0.7548275589942932, + "learning_rate": 0.00019962223421478683, + "loss": 3.2831, "step": 61950 }, { - "epoch": 6.6731245291141965, - "grad_norm": 0.7833398580551147, - "learning_rate": 0.00019996983083719425, - "loss": 3.2802, + "epoch": 6.684636118598383, + "grad_norm": 0.7622220516204834, + "learning_rate": 0.0001992984349703184, + "loss": 3.2973, "step": 62000 }, { - "epoch": 6.6731245291141965, - "eval_accuracy": 0.38636662678743505, - "eval_loss": 3.3642148971557617, - "eval_runtime": 180.6181, - "eval_samples_per_second": 99.719, - "eval_steps_per_second": 6.234, + "epoch": 6.684636118598383, + "eval_accuracy": 0.3861206366494231, + "eval_loss": 3.364516258239746, + "eval_runtime": 185.0556, + "eval_samples_per_second": 97.328, + "eval_steps_per_second": 6.085, "step": 62000 }, { - "epoch": 6.678506081153805, - "grad_norm": 0.7945963740348816, - "learning_rate": 0.00019964658980713284, - "loss": 3.306, + "epoch": 6.690026954177897, + "grad_norm": 0.7730100750923157, + "learning_rate": 0.00019897463572584993, + "loss": 3.2831, "step": 62050 }, { - "epoch": 6.683887633193413, - "grad_norm": 0.7843768000602722, - "learning_rate": 0.00019932334877707144, - "loss": 3.2765, + "epoch": 6.695417789757412, + "grad_norm": 0.7311808466911316, + "learning_rate": 0.0001986508364813815, + "loss": 3.2819, "step": 62100 }, { - "epoch": 6.689269185233021, - "grad_norm": 0.8199366927146912, - "learning_rate": 0.00019900657256761124, - "loss": 3.2743, + "epoch": 6.7008086253369274, + "grad_norm": 0.749595046043396, + "learning_rate": 0.00019832703723691308, + "loss": 3.2862, "step": 62150 }, { - "epoch": 6.69465073727263, - "grad_norm": 0.7921463251113892, - "learning_rate": 0.0001986833315375498, - "loss": 3.284, + "epoch": 6.706199460916442, + "grad_norm": 0.7545285224914551, + "learning_rate": 0.00019800323799244466, + "loss": 3.2901, "step": 62200 }, { - "epoch": 6.7000322893122375, - "grad_norm": 0.7658042311668396, - "learning_rate": 0.00019836009050748838, - "loss": 3.2779, + "epoch": 6.711590296495957, + "grad_norm": 0.7023864984512329, + "learning_rate": 0.00019767943874797624, + "loss": 3.2911, "step": 62250 }, { - "epoch": 6.705413841351846, - "grad_norm": 0.7855052947998047, - "learning_rate": 0.000198036849477427, - "loss": 3.2939, + "epoch": 6.716981132075472, + "grad_norm": 0.7340058088302612, + "learning_rate": 0.00019735563950350782, + "loss": 3.2823, "step": 62300 }, { - "epoch": 6.710795393391454, - "grad_norm": 0.778864860534668, - "learning_rate": 0.00019771360844736557, - "loss": 3.2865, + "epoch": 6.722371967654986, + "grad_norm": 0.7416219115257263, + "learning_rate": 0.0001970318402590394, + "loss": 3.286, "step": 62350 }, { - "epoch": 6.716176945431062, - "grad_norm": 0.7535867691040039, - "learning_rate": 0.00019739036741730413, - "loss": 3.281, + "epoch": 6.727762803234501, + "grad_norm": 0.7272533774375916, + "learning_rate": 0.00019670804101457097, + "loss": 3.2881, "step": 62400 }, { - "epoch": 6.721558497470671, - "grad_norm": 0.8026406168937683, - "learning_rate": 0.00019706712638724276, - "loss": 3.2898, + "epoch": 6.7331536388140165, + "grad_norm": 0.729224681854248, + "learning_rate": 0.00019638424177010252, + "loss": 3.2863, "step": 62450 }, { - "epoch": 6.7269400495102785, - "grad_norm": 0.826941192150116, - "learning_rate": 0.00019674388535718132, - "loss": 3.2813, + "epoch": 6.738544474393531, + "grad_norm": 0.7703849673271179, + "learning_rate": 0.00019606044252563407, + "loss": 3.2826, "step": 62500 }, { - "epoch": 6.732321601549887, - "grad_norm": 0.8191684484481812, - "learning_rate": 0.0001964206443271199, - "loss": 3.279, + "epoch": 6.743935309973046, + "grad_norm": 0.7224081754684448, + "learning_rate": 0.00019573664328116565, + "loss": 3.2981, "step": 62550 }, { - "epoch": 6.737703153589496, - "grad_norm": 0.7996468544006348, - "learning_rate": 0.00019609740329705849, - "loss": 3.2748, + "epoch": 6.74932614555256, + "grad_norm": 0.7727299332618713, + "learning_rate": 0.00019541284403669723, + "loss": 3.2744, "step": 62600 }, { - "epoch": 6.743084705629103, - "grad_norm": 0.8293940424919128, - "learning_rate": 0.00019577416226699708, - "loss": 3.2804, + "epoch": 6.754716981132075, + "grad_norm": 0.7562722563743591, + "learning_rate": 0.0001950890447922288, + "loss": 3.2836, "step": 62650 }, { - "epoch": 6.748466257668712, - "grad_norm": 0.7844554781913757, - "learning_rate": 0.00019545092123693565, - "loss": 3.2701, + "epoch": 6.7601078167115904, + "grad_norm": 0.7537541389465332, + "learning_rate": 0.00019476524554776038, + "loss": 3.2814, "step": 62700 }, { - "epoch": 6.75384780970832, - "grad_norm": 0.7604687809944153, - "learning_rate": 0.00019512768020687424, - "loss": 3.2818, + "epoch": 6.765498652291106, + "grad_norm": 0.7441864013671875, + "learning_rate": 0.00019444144630329193, + "loss": 3.2798, "step": 62750 }, { - "epoch": 6.759229361747928, - "grad_norm": 0.8484272956848145, - "learning_rate": 0.0001948044391768128, - "loss": 3.2771, + "epoch": 6.77088948787062, + "grad_norm": 0.7457358241081238, + "learning_rate": 0.0001941176470588235, + "loss": 3.281, "step": 62800 }, { - "epoch": 6.7646109137875365, - "grad_norm": 0.7642599940299988, - "learning_rate": 0.00019448119814675143, - "loss": 3.2808, + "epoch": 6.776280323450135, + "grad_norm": 0.7565246224403381, + "learning_rate": 0.0001937938478143551, + "loss": 3.2755, "step": 62850 }, { - "epoch": 6.769992465827144, - "grad_norm": 0.7840580344200134, - "learning_rate": 0.00019415795711669, - "loss": 3.2957, + "epoch": 6.781671159029649, + "grad_norm": 0.7899838089942932, + "learning_rate": 0.00019347652455477602, + "loss": 3.3014, "step": 62900 }, { - "epoch": 6.775374017866753, - "grad_norm": 0.8181833028793335, - "learning_rate": 0.00019383471608662857, - "loss": 3.2691, + "epoch": 6.787061994609164, + "grad_norm": 0.8076132535934448, + "learning_rate": 0.0001931527253103076, + "loss": 3.2825, "step": 62950 }, { - "epoch": 6.780755569906361, - "grad_norm": 0.8215237259864807, - "learning_rate": 0.0001935114750565672, - "loss": 3.2696, + "epoch": 6.7924528301886795, + "grad_norm": 0.7913707494735718, + "learning_rate": 0.00019282892606583917, + "loss": 3.272, "step": 63000 }, { - "epoch": 6.780755569906361, - "eval_accuracy": 0.38650168232698845, - "eval_loss": 3.359816312789917, - "eval_runtime": 180.6996, - "eval_samples_per_second": 99.674, - "eval_steps_per_second": 6.231, + "epoch": 6.7924528301886795, + "eval_accuracy": 0.38650135636832506, + "eval_loss": 3.3610804080963135, + "eval_runtime": 184.6468, + "eval_samples_per_second": 97.543, + "eval_steps_per_second": 6.098, "step": 63000 }, { - "epoch": 6.786137121945969, - "grad_norm": 0.7587153911590576, - "learning_rate": 0.00019318823402650576, - "loss": 3.2695, + "epoch": 6.797843665768194, + "grad_norm": 0.7771248817443848, + "learning_rate": 0.00019250512682137075, + "loss": 3.2959, "step": 63050 }, { - "epoch": 6.7915186739855775, - "grad_norm": 0.7708876729011536, - "learning_rate": 0.00019286499299644432, - "loss": 3.2752, + "epoch": 6.803234501347709, + "grad_norm": 0.7436697483062744, + "learning_rate": 0.0001921813275769023, + "loss": 3.2888, "step": 63100 }, { - "epoch": 6.796900226025185, - "grad_norm": 0.8366973400115967, - "learning_rate": 0.00019254175196638292, - "loss": 3.293, + "epoch": 6.808625336927224, + "grad_norm": 0.7928493022918701, + "learning_rate": 0.00019185752833243388, + "loss": 3.2828, "step": 63150 }, { - "epoch": 6.802281778064794, - "grad_norm": 0.8045485019683838, - "learning_rate": 0.0001922185109363215, - "loss": 3.2756, + "epoch": 6.814016172506738, + "grad_norm": 0.7770071029663086, + "learning_rate": 0.00019153372908796545, + "loss": 3.3013, "step": 63200 }, { - "epoch": 6.807663330104402, - "grad_norm": 0.7623797059059143, - "learning_rate": 0.00019189526990626008, - "loss": 3.2853, + "epoch": 6.819407008086253, + "grad_norm": 1.0295462608337402, + "learning_rate": 0.000191209929843497, + "loss": 3.2822, "step": 63250 }, { - "epoch": 6.813044882144011, - "grad_norm": 0.7878138422966003, - "learning_rate": 0.00019157202887619867, - "loss": 3.2875, + "epoch": 6.824797843665769, + "grad_norm": 0.7837042808532715, + "learning_rate": 0.00019088613059902858, + "loss": 3.2862, "step": 63300 }, { - "epoch": 6.8184264341836185, - "grad_norm": 0.7722912430763245, - "learning_rate": 0.00019124878784613724, - "loss": 3.2808, + "epoch": 6.830188679245283, + "grad_norm": 0.7821605205535889, + "learning_rate": 0.00019056233135456016, + "loss": 3.2704, "step": 63350 }, { - "epoch": 6.823807986223227, - "grad_norm": 0.7824761271476746, - "learning_rate": 0.00019092554681607584, - "loss": 3.2744, + "epoch": 6.835579514824798, + "grad_norm": 0.7281560301780701, + "learning_rate": 0.0001902385321100917, + "loss": 3.2843, "step": 63400 }, { - "epoch": 6.829189538262835, - "grad_norm": 0.8032478094100952, - "learning_rate": 0.00019060230578601443, - "loss": 3.2716, + "epoch": 6.840970350404312, + "grad_norm": 0.7526901960372925, + "learning_rate": 0.0001899147328656233, + "loss": 3.2753, "step": 63450 }, { - "epoch": 6.834571090302443, - "grad_norm": 0.7636817097663879, - "learning_rate": 0.000190279064755953, - "loss": 3.2809, + "epoch": 6.846361185983827, + "grad_norm": 0.7778500914573669, + "learning_rate": 0.00018959093362115487, + "loss": 3.3013, "step": 63500 }, { - "epoch": 6.839952642342052, - "grad_norm": 0.8288126587867737, - "learning_rate": 0.00018995582372589157, - "loss": 3.2843, + "epoch": 6.8517520215633425, + "grad_norm": 0.7681808471679688, + "learning_rate": 0.00018926713437668644, + "loss": 3.2797, "step": 63550 }, { - "epoch": 6.8453341943816595, - "grad_norm": 0.7764488458633423, - "learning_rate": 0.0001896325826958302, - "loss": 3.2944, + "epoch": 6.857142857142857, + "grad_norm": 0.7437977194786072, + "learning_rate": 0.00018894333513221802, + "loss": 3.3045, "step": 63600 }, { - "epoch": 6.850715746421268, - "grad_norm": 0.8440558910369873, - "learning_rate": 0.00018930934166576876, - "loss": 3.282, + "epoch": 6.862533692722372, + "grad_norm": 0.7671492695808411, + "learning_rate": 0.00018861953588774957, + "loss": 3.2904, "step": 63650 }, { - "epoch": 6.856097298460876, - "grad_norm": 0.8410781025886536, - "learning_rate": 0.00018898610063570732, - "loss": 3.2914, + "epoch": 6.867924528301887, + "grad_norm": 0.843818187713623, + "learning_rate": 0.00018829573664328115, + "loss": 3.2881, "step": 63700 }, { - "epoch": 6.861478850500484, - "grad_norm": 0.7732300758361816, - "learning_rate": 0.00018866285960564595, - "loss": 3.2623, + "epoch": 6.873315363881401, + "grad_norm": 0.7092686295509338, + "learning_rate": 0.0001879719373988127, + "loss": 3.3074, "step": 63750 }, { - "epoch": 6.866860402540093, - "grad_norm": 0.7925925850868225, - "learning_rate": 0.0001883396185755845, - "loss": 3.2792, + "epoch": 6.878706199460916, + "grad_norm": 0.7547516226768494, + "learning_rate": 0.00018764813815434428, + "loss": 3.2979, "step": 63800 }, { - "epoch": 6.8722419545797, - "grad_norm": 0.7949431538581848, - "learning_rate": 0.0001880163775455231, - "loss": 3.2922, + "epoch": 6.884097035040432, + "grad_norm": 0.7473245859146118, + "learning_rate": 0.00018732433890987586, + "loss": 3.2871, "step": 63850 }, { - "epoch": 6.877623506619309, - "grad_norm": 0.7886269092559814, - "learning_rate": 0.00018769313651546168, - "loss": 3.28, + "epoch": 6.889487870619946, + "grad_norm": 0.7430441379547119, + "learning_rate": 0.00018700053966540743, + "loss": 3.282, "step": 63900 }, { - "epoch": 6.8830050586589175, - "grad_norm": 0.8459096550941467, - "learning_rate": 0.00018736989548540027, - "loss": 3.2812, + "epoch": 6.894878706199461, + "grad_norm": 0.7878106832504272, + "learning_rate": 0.000186676740420939, + "loss": 3.2906, "step": 63950 }, { - "epoch": 6.888386610698525, - "grad_norm": 0.798220694065094, - "learning_rate": 0.00018704665445533886, - "loss": 3.2895, + "epoch": 6.900269541778976, + "grad_norm": 0.7469275593757629, + "learning_rate": 0.0001863529411764706, + "loss": 3.275, "step": 64000 }, { - "epoch": 6.888386610698525, - "eval_accuracy": 0.3871914108588348, - "eval_loss": 3.3550844192504883, - "eval_runtime": 180.8293, - "eval_samples_per_second": 99.602, - "eval_steps_per_second": 6.227, + "epoch": 6.900269541778976, + "eval_accuracy": 0.38682112181716305, + "eval_loss": 3.3576762676239014, + "eval_runtime": 184.785, + "eval_samples_per_second": 97.47, + "eval_steps_per_second": 6.094, "step": 64000 }, { - "epoch": 6.893768162738134, - "grad_norm": 0.788962185382843, - "learning_rate": 0.00018672341342527743, - "loss": 3.2834, + "epoch": 6.90566037735849, + "grad_norm": 0.7187466621398926, + "learning_rate": 0.00018602914193200217, + "loss": 3.2847, "step": 64050 }, { - "epoch": 6.899149714777742, - "grad_norm": 0.7914458513259888, - "learning_rate": 0.000186400172395216, - "loss": 3.2871, + "epoch": 6.9110512129380055, + "grad_norm": 0.7366333603858948, + "learning_rate": 0.0001857053426875337, + "loss": 3.2846, "step": 64100 }, { - "epoch": 6.90453126681735, - "grad_norm": 0.8202913403511047, - "learning_rate": 0.00018607693136515462, - "loss": 3.2844, + "epoch": 6.916442048517521, + "grad_norm": 0.7928003072738647, + "learning_rate": 0.00018538154344306527, + "loss": 3.2964, "step": 64150 }, { - "epoch": 6.9099128188569585, - "grad_norm": 0.8197557330131531, - "learning_rate": 0.0001857536903350932, - "loss": 3.2812, + "epoch": 6.921832884097035, + "grad_norm": 0.7863099575042725, + "learning_rate": 0.00018505774419859684, + "loss": 3.2825, "step": 64200 }, { - "epoch": 6.915294370896566, - "grad_norm": 0.8909661769866943, - "learning_rate": 0.00018543044930503176, - "loss": 3.2823, + "epoch": 6.92722371967655, + "grad_norm": 0.7603685855865479, + "learning_rate": 0.00018473394495412842, + "loss": 3.3005, "step": 64250 }, { - "epoch": 6.920675922936175, - "grad_norm": 0.8012292385101318, - "learning_rate": 0.00018510720827497035, - "loss": 3.2831, + "epoch": 6.932614555256064, + "grad_norm": 0.7410401701927185, + "learning_rate": 0.00018441014570966, + "loss": 3.2867, "step": 64300 }, { - "epoch": 6.926057474975783, - "grad_norm": 0.8419049978256226, - "learning_rate": 0.00018478396724490895, - "loss": 3.2819, + "epoch": 6.938005390835579, + "grad_norm": 0.7407622933387756, + "learning_rate": 0.00018408634646519158, + "loss": 3.2811, "step": 64350 }, { - "epoch": 6.931439027015391, - "grad_norm": 0.8665248155593872, - "learning_rate": 0.0001844607262148475, - "loss": 3.2778, + "epoch": 6.943396226415095, + "grad_norm": 0.7628289461135864, + "learning_rate": 0.00018376254722072316, + "loss": 3.2923, "step": 64400 }, { - "epoch": 6.9368205790549995, - "grad_norm": 0.8091945052146912, - "learning_rate": 0.0001841374851847861, - "loss": 3.2969, + "epoch": 6.948787061994609, + "grad_norm": 0.747056245803833, + "learning_rate": 0.0001834387479762547, + "loss": 3.2672, "step": 64450 }, { - "epoch": 6.942202131094608, - "grad_norm": 0.7917461395263672, - "learning_rate": 0.00018381424415472468, - "loss": 3.2777, + "epoch": 6.954177897574124, + "grad_norm": 0.7753750681877136, + "learning_rate": 0.00018311494873178628, + "loss": 3.2983, "step": 64500 }, { - "epoch": 6.947583683134216, - "grad_norm": 0.8483121395111084, - "learning_rate": 0.00018349100312466327, - "loss": 3.2774, + "epoch": 6.959568733153639, + "grad_norm": 0.7228161692619324, + "learning_rate": 0.00018279114948731783, + "loss": 3.2901, "step": 64550 }, { - "epoch": 6.952965235173824, - "grad_norm": 0.8231863379478455, - "learning_rate": 0.00018316776209460186, - "loss": 3.2817, + "epoch": 6.964959568733153, + "grad_norm": 0.736755907535553, + "learning_rate": 0.0001824673502428494, + "loss": 3.2753, "step": 64600 }, { - "epoch": 6.958346787213433, - "grad_norm": 0.7838043570518494, - "learning_rate": 0.00018284452106454043, - "loss": 3.2871, + "epoch": 6.9703504043126685, + "grad_norm": 0.7366510033607483, + "learning_rate": 0.000182143550998381, + "loss": 3.2889, "step": 64650 }, { - "epoch": 6.9637283392530405, - "grad_norm": 0.8054148554801941, - "learning_rate": 0.000182521280034479, - "loss": 3.2853, + "epoch": 6.975741239892184, + "grad_norm": 0.7212927937507629, + "learning_rate": 0.00018181975175391257, + "loss": 3.2938, "step": 64700 }, { - "epoch": 6.969109891292649, - "grad_norm": 0.8586058020591736, - "learning_rate": 0.00018219803900441762, - "loss": 3.2725, + "epoch": 6.981132075471698, + "grad_norm": 0.8152526021003723, + "learning_rate": 0.00018149595250944414, + "loss": 3.2776, "step": 64750 }, { - "epoch": 6.974491443332257, - "grad_norm": 0.8905388712882996, - "learning_rate": 0.0001818747979743562, - "loss": 3.2941, + "epoch": 6.986522911051213, + "grad_norm": 0.8053279519081116, + "learning_rate": 0.0001811721532649757, + "loss": 3.2878, "step": 64800 }, { - "epoch": 6.979872995371865, - "grad_norm": 0.8023561835289001, - "learning_rate": 0.00018155155694429478, - "loss": 3.2772, + "epoch": 6.991913746630727, + "grad_norm": 0.7522391676902771, + "learning_rate": 0.00018084835402050727, + "loss": 3.2619, "step": 64850 }, { - "epoch": 6.985254547411474, - "grad_norm": 0.8267327547073364, - "learning_rate": 0.00018122831591423338, - "loss": 3.3077, + "epoch": 6.997304582210242, + "grad_norm": 0.741847574710846, + "learning_rate": 0.00018052455477603885, + "loss": 3.3074, "step": 64900 }, { - "epoch": 6.990636099451081, - "grad_norm": 0.8111370801925659, - "learning_rate": 0.00018090507488417195, - "loss": 3.2822, + "epoch": 7.002695417789758, + "grad_norm": 0.754662036895752, + "learning_rate": 0.0001802007555315704, + "loss": 3.2549, "step": 64950 }, { - "epoch": 6.99601765149069, - "grad_norm": 0.7818148732185364, - "learning_rate": 0.00018058829867471175, - "loss": 3.2834, + "epoch": 7.008086253369272, + "grad_norm": 0.7811526656150818, + "learning_rate": 0.00017987695628710198, + "loss": 3.206, "step": 65000 }, { - "epoch": 6.99601765149069, - "eval_accuracy": 0.38743957405460305, - "eval_loss": 3.350639581680298, - "eval_runtime": 180.6606, - "eval_samples_per_second": 99.695, - "eval_steps_per_second": 6.233, + "epoch": 7.008086253369272, + "eval_accuracy": 0.38698475306621166, + "eval_loss": 3.359518051147461, + "eval_runtime": 184.6567, + "eval_samples_per_second": 97.538, + "eval_steps_per_second": 6.098, "step": 65000 }, { - "epoch": 7.0013992035302985, - "grad_norm": 0.8154913783073425, - "learning_rate": 0.00018026505764465035, - "loss": 3.2506, + "epoch": 7.013477088948787, + "grad_norm": 0.7693818807601929, + "learning_rate": 0.00017955315704263356, + "loss": 3.2067, "step": 65050 }, { - "epoch": 7.006780755569906, - "grad_norm": 0.776719868183136, - "learning_rate": 0.00017994181661458894, - "loss": 3.1834, + "epoch": 7.018867924528302, + "grad_norm": 0.7586972117424011, + "learning_rate": 0.00017923583378305448, + "loss": 3.2073, "step": 65100 }, { - "epoch": 7.012162307609515, - "grad_norm": 0.7705662846565247, - "learning_rate": 0.0001796185755845275, - "loss": 3.2001, + "epoch": 7.024258760107816, + "grad_norm": 0.8107732534408569, + "learning_rate": 0.00017891203453858606, + "loss": 3.2127, "step": 65150 }, { - "epoch": 7.017543859649122, - "grad_norm": 0.8426011800765991, - "learning_rate": 0.00017929533455446608, - "loss": 3.1784, + "epoch": 7.0296495956873315, + "grad_norm": 0.7354645729064941, + "learning_rate": 0.00017858823529411764, + "loss": 3.2202, "step": 65200 }, { - "epoch": 7.022925411688731, - "grad_norm": 0.8200896978378296, - "learning_rate": 0.0001789720935244047, - "loss": 3.1967, + "epoch": 7.035040431266847, + "grad_norm": 0.7854849100112915, + "learning_rate": 0.00017826443604964921, + "loss": 3.1969, "step": 65250 }, { - "epoch": 7.0283069637283395, - "grad_norm": 0.8123102188110352, - "learning_rate": 0.00017864885249434327, - "loss": 3.201, + "epoch": 7.040431266846361, + "grad_norm": 0.7866175174713135, + "learning_rate": 0.00017794063680518077, + "loss": 3.2094, "step": 65300 }, { - "epoch": 7.033688515767947, - "grad_norm": 0.8369794487953186, - "learning_rate": 0.00017832561146428183, - "loss": 3.1928, + "epoch": 7.045822102425876, + "grad_norm": 0.744255006313324, + "learning_rate": 0.00017761683756071234, + "loss": 3.2052, "step": 65350 }, { - "epoch": 7.039070067807556, - "grad_norm": 0.7779251933097839, - "learning_rate": 0.00017800237043422046, - "loss": 3.1874, + "epoch": 7.051212938005391, + "grad_norm": 0.7620016932487488, + "learning_rate": 0.00017729303831624392, + "loss": 3.2066, "step": 65400 }, { - "epoch": 7.044451619847164, - "grad_norm": 0.8082521557807922, - "learning_rate": 0.00017767912940415902, - "loss": 3.1967, + "epoch": 7.056603773584905, + "grad_norm": 0.757378101348877, + "learning_rate": 0.00017696923907177547, + "loss": 3.1946, "step": 65450 }, { - "epoch": 7.049833171886772, - "grad_norm": 0.8156419992446899, - "learning_rate": 0.0001773558883740976, - "loss": 3.1956, + "epoch": 7.061994609164421, + "grad_norm": 0.7628493309020996, + "learning_rate": 0.00017664543982730705, + "loss": 3.2097, "step": 65500 }, { - "epoch": 7.0552147239263805, - "grad_norm": 0.8252975344657898, - "learning_rate": 0.00017703264734403619, - "loss": 3.2095, + "epoch": 7.067385444743936, + "grad_norm": 0.7638753056526184, + "learning_rate": 0.00017632164058283863, + "loss": 3.1945, "step": 65550 }, { - "epoch": 7.060596275965988, - "grad_norm": 0.7841750383377075, - "learning_rate": 0.00017670940631397475, - "loss": 3.1895, + "epoch": 7.07277628032345, + "grad_norm": 0.7687901258468628, + "learning_rate": 0.0001759978413383702, + "loss": 3.1878, "step": 65600 }, { - "epoch": 7.065977828005597, - "grad_norm": 0.8294128179550171, - "learning_rate": 0.00017638616528391337, - "loss": 3.212, + "epoch": 7.078167115902965, + "grad_norm": 0.7791463136672974, + "learning_rate": 0.00017567404209390178, + "loss": 3.2067, "step": 65650 }, { - "epoch": 7.071359380045205, - "grad_norm": 0.8671795129776001, - "learning_rate": 0.00017606292425385194, - "loss": 3.2094, + "epoch": 7.083557951482479, + "grad_norm": 0.7624858617782593, + "learning_rate": 0.00017535024284943333, + "loss": 3.2304, "step": 65700 }, { - "epoch": 7.076740932084813, - "grad_norm": 0.7949682474136353, - "learning_rate": 0.0001757396832237905, - "loss": 3.2189, + "epoch": 7.0889487870619945, + "grad_norm": 0.7333658933639526, + "learning_rate": 0.00017502644360496488, + "loss": 3.2159, "step": 65750 }, { - "epoch": 7.0821224841244215, - "grad_norm": 0.8465704917907715, - "learning_rate": 0.00017541644219372913, - "loss": 3.2139, + "epoch": 7.09433962264151, + "grad_norm": 0.7534026503562927, + "learning_rate": 0.00017470264436049646, + "loss": 3.2068, "step": 65800 }, { - "epoch": 7.08750403616403, - "grad_norm": 0.8210230469703674, - "learning_rate": 0.0001750932011636677, - "loss": 3.2113, + "epoch": 7.099730458221024, + "grad_norm": 0.7843572497367859, + "learning_rate": 0.00017437884511602804, + "loss": 3.2185, "step": 65850 }, { - "epoch": 7.092885588203638, - "grad_norm": 0.8009158372879028, - "learning_rate": 0.00017476996013360627, - "loss": 3.2013, + "epoch": 7.105121293800539, + "grad_norm": 0.7714036107063293, + "learning_rate": 0.00017405504587155962, + "loss": 3.2306, "step": 65900 }, { - "epoch": 7.098267140243246, - "grad_norm": 0.8548141717910767, - "learning_rate": 0.00017444671910354486, - "loss": 3.2163, + "epoch": 7.110512129380054, + "grad_norm": 0.7692075967788696, + "learning_rate": 0.0001737312466270912, + "loss": 3.2265, "step": 65950 }, { - "epoch": 7.103648692282855, - "grad_norm": 0.8232263922691345, - "learning_rate": 0.00017412347807348346, - "loss": 3.2163, + "epoch": 7.115902964959568, + "grad_norm": 0.7614142298698425, + "learning_rate": 0.00017340744738262277, + "loss": 3.2005, "step": 66000 }, { - "epoch": 7.103648692282855, - "eval_accuracy": 0.3872667073100902, - "eval_loss": 3.3580427169799805, - "eval_runtime": 180.5751, - "eval_samples_per_second": 99.742, - "eval_steps_per_second": 6.236, + "epoch": 7.115902964959568, + "eval_accuracy": 0.38687142810422115, + "eval_loss": 3.361088275909424, + "eval_runtime": 184.9846, + "eval_samples_per_second": 97.365, + "eval_steps_per_second": 6.087, "step": 66000 }, { - "epoch": 7.109030244322462, - "grad_norm": 0.7871417999267578, - "learning_rate": 0.00017380023704342202, - "loss": 3.217, + "epoch": 7.121293800539084, + "grad_norm": 0.7808488011360168, + "learning_rate": 0.00017308364813815435, + "loss": 3.2523, "step": 66050 }, { - "epoch": 7.114411796362071, - "grad_norm": 0.8225372433662415, - "learning_rate": 0.00017347699601336062, - "loss": 3.2348, + "epoch": 7.126684636118599, + "grad_norm": 0.7770368456840515, + "learning_rate": 0.00017275984889368593, + "loss": 3.2059, "step": 66100 }, { - "epoch": 7.119793348401679, - "grad_norm": 0.8037530779838562, - "learning_rate": 0.00017315375498329919, - "loss": 3.2072, + "epoch": 7.132075471698113, + "grad_norm": 0.8269479274749756, + "learning_rate": 0.00017243604964921745, + "loss": 3.2103, "step": 66150 }, { - "epoch": 7.125174900441287, - "grad_norm": 1.7943295240402222, - "learning_rate": 0.00017283051395323778, - "loss": 3.2096, + "epoch": 7.137466307277628, + "grad_norm": 0.8035179972648621, + "learning_rate": 0.00017211225040474903, + "loss": 3.2091, "step": 66200 }, { - "epoch": 7.130556452480896, - "grad_norm": 0.8043410181999207, - "learning_rate": 0.00017250727292317638, - "loss": 3.2008, + "epoch": 7.142857142857143, + "grad_norm": 0.8429258465766907, + "learning_rate": 0.0001717884511602806, + "loss": 3.2233, "step": 66250 }, { - "epoch": 7.135938004520503, - "grad_norm": 0.8701448440551758, - "learning_rate": 0.00017218403189311494, - "loss": 3.2097, + "epoch": 7.1482479784366575, + "grad_norm": 0.7574069499969482, + "learning_rate": 0.00017146465191581218, + "loss": 3.2282, "step": 66300 }, { - "epoch": 7.141319556560112, - "grad_norm": 0.8197288513183594, - "learning_rate": 0.0001718607908630535, - "loss": 3.1995, + "epoch": 7.153638814016173, + "grad_norm": 0.7994669675827026, + "learning_rate": 0.00017114085267134376, + "loss": 3.2247, "step": 66350 }, { - "epoch": 7.1467011085997205, - "grad_norm": 0.8066554665565491, - "learning_rate": 0.00017153754983299213, - "loss": 3.2064, + "epoch": 7.159029649595688, + "grad_norm": 0.7484458684921265, + "learning_rate": 0.00017081705342687534, + "loss": 3.2324, "step": 66400 }, { - "epoch": 7.152082660639328, - "grad_norm": 0.7715657353401184, - "learning_rate": 0.0001712143088029307, - "loss": 3.2109, + "epoch": 7.164420485175202, + "grad_norm": 0.7799018025398254, + "learning_rate": 0.0001704932541824069, + "loss": 3.2306, "step": 66450 }, { - "epoch": 7.157464212678937, - "grad_norm": 0.8049308061599731, - "learning_rate": 0.00017089106777286927, - "loss": 3.2056, + "epoch": 7.169811320754717, + "grad_norm": 0.7528384327888489, + "learning_rate": 0.00017016945493793847, + "loss": 3.2158, "step": 66500 }, { - "epoch": 7.162845764718545, - "grad_norm": 0.8044945001602173, - "learning_rate": 0.0001705678267428079, - "loss": 3.2161, + "epoch": 7.175202156334231, + "grad_norm": 0.7923641204833984, + "learning_rate": 0.00016984565569347002, + "loss": 3.2163, "step": 66550 }, { - "epoch": 7.168227316758153, - "grad_norm": 0.8120958805084229, - "learning_rate": 0.00017024458571274646, - "loss": 3.2157, + "epoch": 7.180592991913747, + "grad_norm": 0.7415347099304199, + "learning_rate": 0.0001695218564490016, + "loss": 3.2183, "step": 66600 }, { - "epoch": 7.1736088687977615, - "grad_norm": 0.8076058030128479, - "learning_rate": 0.00016992134468268505, - "loss": 3.2389, + "epoch": 7.185983827493262, + "grad_norm": 0.7724004983901978, + "learning_rate": 0.00016919805720453317, + "loss": 3.2205, "step": 66650 }, { - "epoch": 7.178990420837369, - "grad_norm": 0.8148431181907654, - "learning_rate": 0.00016959810365262362, - "loss": 3.2187, + "epoch": 7.191374663072776, + "grad_norm": 0.813212513923645, + "learning_rate": 0.00016887425796006475, + "loss": 3.2178, "step": 66700 }, { - "epoch": 7.184371972876978, - "grad_norm": 0.8567858338356018, - "learning_rate": 0.0001692748626225622, - "loss": 3.2188, + "epoch": 7.196765498652291, + "grad_norm": 0.7927876114845276, + "learning_rate": 0.00016855045871559633, + "loss": 3.2103, "step": 66750 }, { - "epoch": 7.189753524916586, - "grad_norm": 0.8196995854377747, - "learning_rate": 0.0001689516215925008, - "loss": 3.2203, + "epoch": 7.202156334231806, + "grad_norm": 0.8068609833717346, + "learning_rate": 0.00016822665947112788, + "loss": 3.2318, "step": 66800 }, { - "epoch": 7.195135076956194, - "grad_norm": 0.8134042620658875, - "learning_rate": 0.00016862838056243938, - "loss": 3.2023, + "epoch": 7.2075471698113205, + "grad_norm": 0.8054190278053284, + "learning_rate": 0.00016790286022665946, + "loss": 3.2371, "step": 66850 }, { - "epoch": 7.2005166289958025, - "grad_norm": 0.8222635984420776, - "learning_rate": 0.00016830513953237794, - "loss": 3.2136, + "epoch": 7.212938005390836, + "grad_norm": 0.7957893013954163, + "learning_rate": 0.00016757906098219103, + "loss": 3.2164, "step": 66900 }, { - "epoch": 7.205898181035411, - "grad_norm": 0.8523157835006714, - "learning_rate": 0.00016798189850231657, - "loss": 3.2292, + "epoch": 7.218328840970351, + "grad_norm": 0.7549981474876404, + "learning_rate": 0.0001672552617377226, + "loss": 3.2372, "step": 66950 }, { - "epoch": 7.211279733075019, - "grad_norm": 0.8117284178733826, - "learning_rate": 0.00016765865747225513, - "loss": 3.2243, + "epoch": 7.223719676549865, + "grad_norm": 0.731869101524353, + "learning_rate": 0.00016693146249325416, + "loss": 3.2249, "step": 67000 }, { - "epoch": 7.211279733075019, - "eval_accuracy": 0.38720662226312885, - "eval_loss": 3.3582334518432617, - "eval_runtime": 180.7195, - "eval_samples_per_second": 99.663, - "eval_steps_per_second": 6.231, + "epoch": 7.223719676549865, + "eval_accuracy": 0.38759386115529965, + "eval_loss": 3.36012864112854, + "eval_runtime": 184.9495, + "eval_samples_per_second": 97.383, + "eval_steps_per_second": 6.088, "step": 67000 }, { - "epoch": 7.216661285114627, - "grad_norm": 0.849255383014679, - "learning_rate": 0.0001673354164421937, - "loss": 3.2088, + "epoch": 7.22911051212938, + "grad_norm": 0.9114392399787903, + "learning_rate": 0.00016660766324878574, + "loss": 3.2494, "step": 67050 }, { - "epoch": 7.222042837154235, - "grad_norm": 0.8112828135490417, - "learning_rate": 0.00016701217541213232, - "loss": 3.2221, + "epoch": 7.234501347708895, + "grad_norm": 0.7874780893325806, + "learning_rate": 0.0001662838640043173, + "loss": 3.2246, "step": 67100 }, { - "epoch": 7.2274243891938434, - "grad_norm": 0.8703665733337402, - "learning_rate": 0.0001666889343820709, - "loss": 3.2114, + "epoch": 7.2398921832884096, + "grad_norm": 0.833976686000824, + "learning_rate": 0.00016596006475984887, + "loss": 3.2383, "step": 67150 }, { - "epoch": 7.232805941233452, - "grad_norm": 0.7933388352394104, - "learning_rate": 0.00016636569335200946, + "epoch": 7.245283018867925, + "grad_norm": 0.8088477849960327, + "learning_rate": 0.00016563626551538045, "loss": 3.2238, "step": 67200 }, { - "epoch": 7.23818749327306, - "grad_norm": 0.8398449420928955, - "learning_rate": 0.00016604245232194805, - "loss": 3.2391, + "epoch": 7.250673854447439, + "grad_norm": 0.7701541781425476, + "learning_rate": 0.00016531246627091202, + "loss": 3.2332, "step": 67250 }, { - "epoch": 7.243569045312668, - "grad_norm": 0.8231996297836304, - "learning_rate": 0.00016571921129188665, - "loss": 3.2372, + "epoch": 7.256064690026954, + "grad_norm": 0.7686187624931335, + "learning_rate": 0.0001649886670264436, + "loss": 3.2307, "step": 67300 }, { - "epoch": 7.248950597352277, - "grad_norm": 0.8363311886787415, - "learning_rate": 0.00016539597026182521, - "loss": 3.2078, + "epoch": 7.261455525606469, + "grad_norm": 0.7967571020126343, + "learning_rate": 0.00016466486778197518, + "loss": 3.2253, "step": 67350 }, { - "epoch": 7.254332149391884, - "grad_norm": 0.8070518970489502, - "learning_rate": 0.0001650727292317638, - "loss": 3.2155, + "epoch": 7.2668463611859835, + "grad_norm": 0.8138226866722107, + "learning_rate": 0.0001643410685375067, + "loss": 3.2298, "step": 67400 }, { - "epoch": 7.259713701431493, - "grad_norm": 0.8043374419212341, - "learning_rate": 0.00016474948820170238, - "loss": 3.229, + "epoch": 7.272237196765499, + "grad_norm": 0.7992923855781555, + "learning_rate": 0.00016401726929303828, + "loss": 3.2251, "step": 67450 }, { - "epoch": 7.265095253471101, - "grad_norm": 0.8224925398826599, - "learning_rate": 0.00016442624717164094, - "loss": 3.2222, + "epoch": 7.277628032345014, + "grad_norm": 0.7716299891471863, + "learning_rate": 0.00016369347004856986, + "loss": 3.2362, "step": 67500 }, { - "epoch": 7.270476805510709, - "grad_norm": 0.8394501209259033, - "learning_rate": 0.00016410300614157957, - "loss": 3.2113, + "epoch": 7.283018867924528, + "grad_norm": 0.8016715049743652, + "learning_rate": 0.00016336967080410143, + "loss": 3.2404, "step": 67550 }, { - "epoch": 7.275858357550318, - "grad_norm": 0.8679465055465698, - "learning_rate": 0.00016377976511151813, - "loss": 3.2274, + "epoch": 7.288409703504043, + "grad_norm": 0.7413058876991272, + "learning_rate": 0.000163045871559633, + "loss": 3.2129, "step": 67600 }, { - "epoch": 7.281239909589925, - "grad_norm": 0.8839657306671143, - "learning_rate": 0.00016345652408145675, - "loss": 3.2073, + "epoch": 7.293800539083558, + "grad_norm": 0.7324450612068176, + "learning_rate": 0.0001627220723151646, + "loss": 3.2346, "step": 67650 }, { - "epoch": 7.286621461629534, - "grad_norm": 0.9003349542617798, - "learning_rate": 0.00016313328305139532, - "loss": 3.2276, + "epoch": 7.2991913746630726, + "grad_norm": 0.7739547491073608, + "learning_rate": 0.00016239827307069617, + "loss": 3.2217, "step": 67700 }, { - "epoch": 7.2920030136691425, - "grad_norm": 0.8355953097343445, - "learning_rate": 0.0001628100420213339, - "loss": 3.229, + "epoch": 7.304582210242588, + "grad_norm": 0.7586296796798706, + "learning_rate": 0.00016207447382622775, + "loss": 3.2313, "step": 67750 }, { - "epoch": 7.29738456570875, - "grad_norm": 0.8046534657478333, - "learning_rate": 0.00016248680099127248, - "loss": 3.2386, + "epoch": 7.309973045822103, + "grad_norm": 0.7944618463516235, + "learning_rate": 0.00016175067458175932, + "loss": 3.2225, "step": 67800 }, { - "epoch": 7.302766117748359, - "grad_norm": 0.8797202110290527, - "learning_rate": 0.00016216355996121105, - "loss": 3.2217, + "epoch": 7.315363881401617, + "grad_norm": 0.7308440208435059, + "learning_rate": 0.00016142687533729085, + "loss": 3.2313, "step": 67850 }, { - "epoch": 7.308147669787967, - "grad_norm": 0.9474117755889893, - "learning_rate": 0.00016184031893114965, - "loss": 3.2168, + "epoch": 7.320754716981132, + "grad_norm": 0.7774273157119751, + "learning_rate": 0.00016110307609282242, + "loss": 3.2173, "step": 67900 }, { - "epoch": 7.313529221827575, - "grad_norm": 0.8707678318023682, - "learning_rate": 0.00016151707790108824, - "loss": 3.2345, + "epoch": 7.3261455525606465, + "grad_norm": 0.7571539878845215, + "learning_rate": 0.000160779276848354, + "loss": 3.22, "step": 67950 }, { - "epoch": 7.3189107738671835, - "grad_norm": 0.8038205504417419, - "learning_rate": 0.0001611938368710268, - "loss": 3.222, + "epoch": 7.331536388140162, + "grad_norm": 0.8019688725471497, + "learning_rate": 0.00016045547760388558, + "loss": 3.2262, "step": 68000 }, { - "epoch": 7.3189107738671835, - "eval_accuracy": 0.387976102014631, - "eval_loss": 3.352640390396118, - "eval_runtime": 180.5346, - "eval_samples_per_second": 99.765, - "eval_steps_per_second": 6.237, + "epoch": 7.331536388140162, + "eval_accuracy": 0.38772761286019924, + "eval_loss": 3.3541133403778076, + "eval_runtime": 184.6341, + "eval_samples_per_second": 97.55, + "eval_steps_per_second": 6.099, "step": 68000 }, { - "epoch": 7.324292325906791, - "grad_norm": 0.8763945698738098, - "learning_rate": 0.00016087059584096538, - "loss": 3.2126, + "epoch": 7.336927223719677, + "grad_norm": 0.8166679739952087, + "learning_rate": 0.00016013167835941716, + "loss": 3.2332, "step": 68050 }, { - "epoch": 7.3296738779464, - "grad_norm": 0.8443440198898315, - "learning_rate": 0.000160547354810904, - "loss": 3.2391, + "epoch": 7.342318059299191, + "grad_norm": 0.7608354687690735, + "learning_rate": 0.00015980787911494873, + "loss": 3.2595, "step": 68100 }, { - "epoch": 7.335055429986008, - "grad_norm": 0.8131680488586426, - "learning_rate": 0.00016023057860144378, - "loss": 3.2449, + "epoch": 7.347708894878706, + "grad_norm": 0.8633112907409668, + "learning_rate": 0.00015948407987048029, + "loss": 3.2232, "step": 68150 }, { - "epoch": 7.340436982025616, - "grad_norm": 0.8589426279067993, - "learning_rate": 0.0001599073375713824, - "loss": 3.2291, + "epoch": 7.353099730458221, + "grad_norm": 0.7595582008361816, + "learning_rate": 0.00015916028062601186, + "loss": 3.221, "step": 68200 }, { - "epoch": 7.3458185340652244, - "grad_norm": 0.8745540380477905, - "learning_rate": 0.00015958409654132097, - "loss": 3.2265, + "epoch": 7.3584905660377355, + "grad_norm": 0.8150188326835632, + "learning_rate": 0.00015883648138154344, + "loss": 3.25, "step": 68250 }, { - "epoch": 7.351200086104833, - "grad_norm": 0.838469386100769, - "learning_rate": 0.00015926085551125953, - "loss": 3.2466, + "epoch": 7.363881401617251, + "grad_norm": 0.829832136631012, + "learning_rate": 0.000158512682137075, + "loss": 3.2272, "step": 68300 }, { - "epoch": 7.356581638144441, - "grad_norm": 0.8351563811302185, - "learning_rate": 0.00015893761448119813, - "loss": 3.2189, + "epoch": 7.369272237196766, + "grad_norm": 0.7878278493881226, + "learning_rate": 0.00015818888289260657, + "loss": 3.2386, "step": 68350 }, { - "epoch": 7.361963190184049, - "grad_norm": 0.8260790705680847, - "learning_rate": 0.00015861437345113672, - "loss": 3.2165, + "epoch": 7.37466307277628, + "grad_norm": 0.7915233373641968, + "learning_rate": 0.00015786508364813815, + "loss": 3.2403, "step": 68400 }, { - "epoch": 7.367344742223658, - "grad_norm": 0.8453501462936401, - "learning_rate": 0.00015829759724167653, - "loss": 3.2279, + "epoch": 7.380053908355795, + "grad_norm": 0.7962988615036011, + "learning_rate": 0.0001575412844036697, + "loss": 3.2236, "step": 68450 }, { - "epoch": 7.372726294263265, - "grad_norm": 0.8862749934196472, - "learning_rate": 0.0001579743562116151, - "loss": 3.2315, + "epoch": 7.38544474393531, + "grad_norm": 0.7690643072128296, + "learning_rate": 0.00015722396114409065, + "loss": 3.2106, "step": 68500 }, { - "epoch": 7.378107846302874, - "grad_norm": 0.7972759008407593, - "learning_rate": 0.00015765111518155372, - "loss": 3.2317, + "epoch": 7.390835579514825, + "grad_norm": 0.7795893549919128, + "learning_rate": 0.00015690016189962223, + "loss": 3.2353, "step": 68550 }, { - "epoch": 7.383489398342482, - "grad_norm": 0.8577588796615601, - "learning_rate": 0.0001573278741514923, - "loss": 3.2332, + "epoch": 7.39622641509434, + "grad_norm": 0.8003530502319336, + "learning_rate": 0.00015657636265515378, + "loss": 3.2169, "step": 68600 }, { - "epoch": 7.38887095038209, - "grad_norm": 0.8090537190437317, - "learning_rate": 0.00015700463312143085, - "loss": 3.2207, + "epoch": 7.401617250673855, + "grad_norm": 0.8012276291847229, + "learning_rate": 0.00015625256341068536, + "loss": 3.2393, "step": 68650 }, { - "epoch": 7.394252502421699, - "grad_norm": 0.8941483497619629, - "learning_rate": 0.00015668139209136945, - "loss": 3.2325, + "epoch": 7.407008086253369, + "grad_norm": 0.768429160118103, + "learning_rate": 0.00015592876416621693, + "loss": 3.2154, "step": 68700 }, { - "epoch": 7.399634054461306, - "grad_norm": 0.8350239396095276, - "learning_rate": 0.00015635815106130804, - "loss": 3.2347, + "epoch": 7.412398921832884, + "grad_norm": 0.8270565271377563, + "learning_rate": 0.0001556049649217485, + "loss": 3.2256, "step": 68750 }, { - "epoch": 7.405015606500915, - "grad_norm": 0.834892749786377, - "learning_rate": 0.0001560349100312466, - "loss": 3.2314, + "epoch": 7.4177897574123985, + "grad_norm": 0.7837132215499878, + "learning_rate": 0.00015528116567728006, + "loss": 3.2431, "step": 68800 }, { - "epoch": 7.4103971585405235, - "grad_norm": 0.8487468957901001, - "learning_rate": 0.0001557116690011852, - "loss": 3.2407, + "epoch": 7.423180592991914, + "grad_norm": 0.7685801386833191, + "learning_rate": 0.00015495736643281164, + "loss": 3.2363, "step": 68850 }, { - "epoch": 7.415778710580131, - "grad_norm": 0.8758741021156311, - "learning_rate": 0.00015538842797112377, - "loss": 3.2153, + "epoch": 7.428571428571429, + "grad_norm": 0.8626431822776794, + "learning_rate": 0.00015463356718834322, + "loss": 3.2411, "step": 68900 }, { - "epoch": 7.42116026261974, - "grad_norm": 0.879677951335907, - "learning_rate": 0.00015506518694106237, - "loss": 3.2249, + "epoch": 7.433962264150943, + "grad_norm": 0.7880833148956299, + "learning_rate": 0.0001543097679438748, + "loss": 3.2314, "step": 68950 }, { - "epoch": 7.426541814659347, - "grad_norm": 0.8545063138008118, - "learning_rate": 0.00015474194591100096, - "loss": 3.2349, + "epoch": 7.439353099730458, + "grad_norm": 0.7299560308456421, + "learning_rate": 0.00015398596869940637, + "loss": 3.2216, "step": 69000 }, { - "epoch": 7.426541814659347, - "eval_accuracy": 0.3882562091594167, - "eval_loss": 3.3482072353363037, - "eval_runtime": 180.8861, - "eval_samples_per_second": 99.571, - "eval_steps_per_second": 6.225, + "epoch": 7.439353099730458, + "eval_accuracy": 0.38783952533464816, + "eval_loss": 3.351468324661255, + "eval_runtime": 185.0392, + "eval_samples_per_second": 97.336, + "eval_steps_per_second": 6.085, "step": 69000 }, { - "epoch": 7.431923366698956, - "grad_norm": 0.8772327303886414, - "learning_rate": 0.00015441870488093953, - "loss": 3.2284, + "epoch": 7.444743935309973, + "grad_norm": 0.7486545443534851, + "learning_rate": 0.00015366216945493792, + "loss": 3.2187, "step": 69050 }, { - "epoch": 7.4373049187385645, - "grad_norm": 0.9024936556816101, - "learning_rate": 0.0001540954638508781, - "loss": 3.218, + "epoch": 7.450134770889488, + "grad_norm": 0.7718955278396606, + "learning_rate": 0.00015333837021046947, + "loss": 3.2249, "step": 69100 }, { - "epoch": 7.442686470778172, - "grad_norm": 0.8099319338798523, - "learning_rate": 0.00015377222282081672, - "loss": 3.2183, + "epoch": 7.455525606469003, + "grad_norm": 0.860206127166748, + "learning_rate": 0.00015301457096600105, + "loss": 3.2364, "step": 69150 }, { - "epoch": 7.448068022817781, - "grad_norm": 0.793427050113678, - "learning_rate": 0.0001534489817907553, - "loss": 3.2307, + "epoch": 7.460916442048518, + "grad_norm": 0.928758978843689, + "learning_rate": 0.00015269077172153263, + "loss": 3.2494, "step": 69200 }, { - "epoch": 7.453449574857389, - "grad_norm": 0.8657283186912537, - "learning_rate": 0.00015312574076069388, - "loss": 3.2321, + "epoch": 7.466307277628032, + "grad_norm": 0.8024472594261169, + "learning_rate": 0.0001523669724770642, + "loss": 3.227, "step": 69250 }, { - "epoch": 7.458831126896997, - "grad_norm": 0.8640968799591064, - "learning_rate": 0.00015280249973063248, - "loss": 3.2242, + "epoch": 7.471698113207547, + "grad_norm": 0.8709169030189514, + "learning_rate": 0.00015204317323259578, + "loss": 3.256, "step": 69300 }, { - "epoch": 7.4642126789366054, - "grad_norm": 0.8989497423171997, - "learning_rate": 0.00015247925870057104, - "loss": 3.2395, + "epoch": 7.4770889487870615, + "grad_norm": 0.8366961479187012, + "learning_rate": 0.00015171937398812736, + "loss": 3.2388, "step": 69350 }, { - "epoch": 7.469594230976213, - "grad_norm": 0.8708150386810303, - "learning_rate": 0.00015215601767050964, - "loss": 3.2317, + "epoch": 7.482479784366577, + "grad_norm": 0.8059794306755066, + "learning_rate": 0.00015139557474365894, + "loss": 3.2311, "step": 69400 }, { - "epoch": 7.474975783015822, - "grad_norm": 0.8274281024932861, - "learning_rate": 0.0001518327766404482, - "loss": 3.2348, + "epoch": 7.487870619946092, + "grad_norm": 0.7974139451980591, + "learning_rate": 0.00015107177549919046, + "loss": 3.2324, "step": 69450 }, { - "epoch": 7.48035733505543, - "grad_norm": 0.841109573841095, - "learning_rate": 0.0001515095356103868, - "loss": 3.2263, + "epoch": 7.493261455525606, + "grad_norm": 0.8359848856925964, + "learning_rate": 0.00015074797625472204, + "loss": 3.2453, "step": 69500 }, { - "epoch": 7.485738887095038, - "grad_norm": 0.8123463988304138, - "learning_rate": 0.0001511862945803254, - "loss": 3.2116, + "epoch": 7.498652291105121, + "grad_norm": 0.7960279583930969, + "learning_rate": 0.00015042417701025362, + "loss": 3.2229, "step": 69550 }, { - "epoch": 7.491120439134646, - "grad_norm": 0.8069713115692139, - "learning_rate": 0.00015086305355026396, - "loss": 3.201, + "epoch": 7.504043126684636, + "grad_norm": 0.826728105545044, + "learning_rate": 0.0001501003777657852, + "loss": 3.2342, "step": 69600 }, { - "epoch": 7.496501991174255, - "grad_norm": 0.8293700218200684, - "learning_rate": 0.00015053981252020253, - "loss": 3.2439, + "epoch": 7.509433962264151, + "grad_norm": 0.8121812343597412, + "learning_rate": 0.00014977657852131677, + "loss": 3.249, "step": 69650 }, { - "epoch": 7.501883543213863, - "grad_norm": 0.8899844884872437, - "learning_rate": 0.00015021657149014115, - "loss": 3.2287, + "epoch": 7.514824797843666, + "grad_norm": 0.78941410779953, + "learning_rate": 0.00014945277927684835, + "loss": 3.2428, "step": 69700 }, { - "epoch": 7.507265095253471, - "grad_norm": 0.8449057936668396, - "learning_rate": 0.00014989333046007972, - "loss": 3.2416, + "epoch": 7.520215633423181, + "grad_norm": 0.7744514346122742, + "learning_rate": 0.00014912898003237993, + "loss": 3.2616, "step": 69750 }, { - "epoch": 7.51264664729308, - "grad_norm": 0.8326901197433472, - "learning_rate": 0.00014957008943001832, - "loss": 3.2332, + "epoch": 7.525606469002695, + "grad_norm": 0.8503783941268921, + "learning_rate": 0.00014880518078791148, + "loss": 3.2482, "step": 69800 }, { - "epoch": 7.518028199332687, - "grad_norm": 0.8048696517944336, - "learning_rate": 0.00014924684839995688, - "loss": 3.2153, + "epoch": 7.53099730458221, + "grad_norm": 0.7720679044723511, + "learning_rate": 0.00014848138154344306, + "loss": 3.2262, "step": 69850 }, { - "epoch": 7.523409751372296, - "grad_norm": 0.8521336913108826, - "learning_rate": 0.00014892360736989548, - "loss": 3.2298, + "epoch": 7.536388140161725, + "grad_norm": 0.7836202383041382, + "learning_rate": 0.00014815758229897463, + "loss": 3.216, "step": 69900 }, { - "epoch": 7.528791303411904, - "grad_norm": 0.8536204695701599, - "learning_rate": 0.00014860036633983407, - "loss": 3.2237, + "epoch": 7.54177897574124, + "grad_norm": 0.8227617144584656, + "learning_rate": 0.0001478337830545062, + "loss": 3.2513, "step": 69950 }, { - "epoch": 7.534172855451512, - "grad_norm": 0.8090736865997314, - "learning_rate": 0.00014827712530977264, - "loss": 3.2364, + "epoch": 7.547169811320755, + "grad_norm": 0.8092595934867859, + "learning_rate": 0.00014750998381003776, + "loss": 3.2305, "step": 70000 }, { - "epoch": 7.534172855451512, - "eval_accuracy": 0.38876535659171535, - "eval_loss": 3.3461689949035645, - "eval_runtime": 180.6754, - "eval_samples_per_second": 99.687, - "eval_steps_per_second": 6.232, + "epoch": 7.547169811320755, + "eval_accuracy": 0.38827544072055986, + "eval_loss": 3.347477912902832, + "eval_runtime": 184.9341, + "eval_samples_per_second": 97.391, + "eval_steps_per_second": 6.089, "step": 70000 } ], "logging_steps": 50, - "max_steps": 92910, + "max_steps": 92750, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, @@ -10456,7 +10456,7 @@ "attributes": {} } }, - "total_flos": 5.85279519326208e+17, + "total_flos": 5.852484255744e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null