diff --git "a/checkpoint-80000/trainer_state.json" "b/checkpoint-80000/trainer_state.json" --- "a/checkpoint-80000/trainer_state.json" +++ "b/checkpoint-80000/trainer_state.json" @@ -1,7 +1,7 @@ { - "best_metric": 3.323802947998047, - "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_634/checkpoint-80000", - "epoch": 8.610483263373157, + "best_metric": 3.3251895904541016, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_low_100_634/checkpoint-80000", + "epoch": 8.625336927223719, "eval_steps": 1000, "global_step": 80000, "is_hyper_param_search": false, @@ -9,11928 +9,11928 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.005381552039608223, - "grad_norm": 2.7091472148895264, + "epoch": 0.005390835579514825, + "grad_norm": 1.4323930740356445, "learning_rate": 0.0003, - "loss": 8.4865, + "loss": 8.6259, "step": 50 }, { - "epoch": 0.010763104079216447, - "grad_norm": 1.967639446258545, + "epoch": 0.01078167115902965, + "grad_norm": 1.0894039869308472, "learning_rate": 0.0006, - "loss": 6.9044, + "loss": 6.9226, "step": 100 }, { - "epoch": 0.01614465611882467, - "grad_norm": 2.5172040462493896, - "learning_rate": 0.0005996767589699385, - "loss": 6.4803, + "epoch": 0.016172506738544475, + "grad_norm": 1.6132558584213257, + "learning_rate": 0.0005996762007555315, + "loss": 6.4978, "step": 150 }, { - "epoch": 0.021526208158432893, - "grad_norm": 1.3199275732040405, - "learning_rate": 0.0005993535179398771, - "loss": 6.2354, + "epoch": 0.0215633423180593, + "grad_norm": 1.2535423040390015, + "learning_rate": 0.000599352401511063, + "loss": 6.2339, "step": 200 }, { - "epoch": 0.026907760198041114, - "grad_norm": 1.2347278594970703, - "learning_rate": 0.0005990302769098158, - "loss": 6.0754, + "epoch": 0.026954177897574125, + "grad_norm": 1.3019688129425049, + "learning_rate": 0.0005990286022665946, + "loss": 6.0869, "step": 250 }, { - "epoch": 0.03228931223764934, - "grad_norm": 1.4885425567626953, - "learning_rate": 0.0005987070358797543, - "loss": 5.9329, + "epoch": 0.03234501347708895, + "grad_norm": 1.3941422700881958, + "learning_rate": 0.0005987048030221263, + "loss": 5.9835, "step": 300 }, { - "epoch": 0.03767086427725756, - "grad_norm": 1.6232192516326904, - "learning_rate": 0.0005983837948496929, - "loss": 5.8614, + "epoch": 0.03773584905660377, + "grad_norm": 2.2118101119995117, + "learning_rate": 0.0005983810037776578, + "loss": 5.8779, "step": 350 }, { - "epoch": 0.04305241631686579, - "grad_norm": 1.598788857460022, - "learning_rate": 0.0005980605538196314, - "loss": 5.7913, + "epoch": 0.0431266846361186, + "grad_norm": 0.8065666556358337, + "learning_rate": 0.0005980572045331894, + "loss": 5.7809, "step": 400 }, { - "epoch": 0.048433968356474004, - "grad_norm": 2.481018543243408, - "learning_rate": 0.0005977373127895701, - "loss": 5.7337, + "epoch": 0.04851752021563342, + "grad_norm": 1.9112606048583984, + "learning_rate": 0.0005977334052887209, + "loss": 5.7161, "step": 450 }, { - "epoch": 0.05381552039608223, - "grad_norm": 1.4101097583770752, - "learning_rate": 0.0005974140717595086, - "loss": 5.6182, + "epoch": 0.05390835579514825, + "grad_norm": 1.0509525537490845, + "learning_rate": 0.0005974096060442526, + "loss": 5.6556, "step": 500 }, { - "epoch": 0.05919707243569045, - "grad_norm": 1.6082617044448853, - "learning_rate": 0.0005970908307294472, - "loss": 5.5589, + "epoch": 0.05929919137466307, + "grad_norm": 1.202839970588684, + "learning_rate": 0.0005970858067997841, + "loss": 5.5734, "step": 550 }, { - "epoch": 0.06457862447529868, - "grad_norm": 0.9049023389816284, - "learning_rate": 0.0005967675896993858, - "loss": 5.5049, + "epoch": 0.0646900269541779, + "grad_norm": 1.3976365327835083, + "learning_rate": 0.0005967620075553157, + "loss": 5.5046, "step": 600 }, { - "epoch": 0.0699601765149069, - "grad_norm": 1.3293204307556152, - "learning_rate": 0.0005964443486693243, - "loss": 5.4266, + "epoch": 0.07008086253369272, + "grad_norm": 1.0246344804763794, + "learning_rate": 0.0005964382083108472, + "loss": 5.4208, "step": 650 }, { - "epoch": 0.07534172855451512, - "grad_norm": 1.7935289144515991, - "learning_rate": 0.000596121107639263, - "loss": 5.3506, + "epoch": 0.07547169811320754, + "grad_norm": 1.313137412071228, + "learning_rate": 0.0005961144090663788, + "loss": 5.367, "step": 700 }, { - "epoch": 0.08072328059412334, - "grad_norm": 1.6690304279327393, - "learning_rate": 0.0005957978666092015, - "loss": 5.3179, + "epoch": 0.08086253369272237, + "grad_norm": 1.0808387994766235, + "learning_rate": 0.0005957906098219104, + "loss": 5.3003, "step": 750 }, { - "epoch": 0.08610483263373157, - "grad_norm": 1.4270386695861816, - "learning_rate": 0.0005954746255791401, - "loss": 5.2698, + "epoch": 0.0862533692722372, + "grad_norm": 0.8700747489929199, + "learning_rate": 0.0005954668105774419, + "loss": 5.2478, "step": 800 }, { - "epoch": 0.09148638467333979, - "grad_norm": 1.5141105651855469, - "learning_rate": 0.0005951513845490787, - "loss": 5.1827, + "epoch": 0.09164420485175202, + "grad_norm": 1.0296642780303955, + "learning_rate": 0.0005951430113329735, + "loss": 5.1893, "step": 850 }, { - "epoch": 0.09686793671294801, - "grad_norm": 1.6831254959106445, - "learning_rate": 0.0005948281435190174, - "loss": 5.1898, + "epoch": 0.09703504043126684, + "grad_norm": 1.3226677179336548, + "learning_rate": 0.0005948192120885051, + "loss": 5.1688, "step": 900 }, { - "epoch": 0.10224948875255624, - "grad_norm": 1.1933035850524902, - "learning_rate": 0.0005945049024889559, - "loss": 5.1376, + "epoch": 0.10242587601078167, + "grad_norm": 1.0636179447174072, + "learning_rate": 0.0005944954128440366, + "loss": 5.1311, "step": 950 }, { - "epoch": 0.10763104079216446, - "grad_norm": 1.2600319385528564, - "learning_rate": 0.0005941816614588944, - "loss": 5.0756, + "epoch": 0.1078167115902965, + "grad_norm": 1.4769413471221924, + "learning_rate": 0.0005941716135995682, + "loss": 5.0765, "step": 1000 }, { - "epoch": 0.10763104079216446, - "eval_accuracy": 0.22684951933592223, - "eval_loss": 5.017084121704102, - "eval_runtime": 185.5208, - "eval_samples_per_second": 97.083, - "eval_steps_per_second": 6.069, + "epoch": 0.1078167115902965, + "eval_accuracy": 0.22730021151457672, + "eval_loss": 5.021658420562744, + "eval_runtime": 183.6554, + "eval_samples_per_second": 98.07, + "eval_steps_per_second": 6.131, "step": 1000 }, { - "epoch": 0.11301259283177269, - "grad_norm": 1.1238996982574463, - "learning_rate": 0.000593858420428833, - "loss": 5.0471, + "epoch": 0.11320754716981132, + "grad_norm": 1.1359593868255615, + "learning_rate": 0.0005938478143550997, + "loss": 5.0369, "step": 1050 }, { - "epoch": 0.1183941448713809, - "grad_norm": 1.0484765768051147, - "learning_rate": 0.0005935351793987716, - "loss": 5.0295, + "epoch": 0.11859838274932614, + "grad_norm": 0.8879361748695374, + "learning_rate": 0.0005935240151106314, + "loss": 5.0082, "step": 1100 }, { - "epoch": 0.12377569691098914, - "grad_norm": 1.4683005809783936, - "learning_rate": 0.0005932119383687103, - "loss": 4.9958, + "epoch": 0.12398921832884097, + "grad_norm": 0.9153700470924377, + "learning_rate": 0.0005932002158661629, + "loss": 4.9873, "step": 1150 }, { - "epoch": 0.12915724895059735, - "grad_norm": 1.0093375444412231, - "learning_rate": 0.0005928886973386488, - "loss": 4.9523, + "epoch": 0.1293800539083558, + "grad_norm": 1.2399190664291382, + "learning_rate": 0.0005928764166216945, + "loss": 4.9252, "step": 1200 }, { - "epoch": 0.13453880099020557, - "grad_norm": 1.2035621404647827, - "learning_rate": 0.0005925654563085874, - "loss": 4.8916, + "epoch": 0.1347708894878706, + "grad_norm": 0.8370887041091919, + "learning_rate": 0.000592552617377226, + "loss": 4.8993, "step": 1250 }, { - "epoch": 0.1399203530298138, - "grad_norm": 1.0326842069625854, - "learning_rate": 0.000592242215278526, - "loss": 4.8852, + "epoch": 0.14016172506738545, + "grad_norm": 1.5546677112579346, + "learning_rate": 0.0005922288181327577, + "loss": 4.8775, "step": 1300 }, { - "epoch": 0.14530190506942203, - "grad_norm": 0.9776148200035095, - "learning_rate": 0.0005919189742484645, - "loss": 4.8661, + "epoch": 0.14555256064690028, + "grad_norm": 0.8524234294891357, + "learning_rate": 0.0005919050188882893, + "loss": 4.841, "step": 1350 }, { - "epoch": 0.15068345710903025, - "grad_norm": 1.0485109090805054, - "learning_rate": 0.0005915957332184032, - "loss": 4.8206, + "epoch": 0.1509433962264151, + "grad_norm": 0.998586893081665, + "learning_rate": 0.0005915812196438207, + "loss": 4.8349, "step": 1400 }, { - "epoch": 0.15606500914863847, - "grad_norm": 0.931476891040802, - "learning_rate": 0.0005912724921883417, - "loss": 4.8047, + "epoch": 0.15633423180592992, + "grad_norm": 1.1706006526947021, + "learning_rate": 0.0005912574203993524, + "loss": 4.8439, "step": 1450 }, { - "epoch": 0.16144656118824668, - "grad_norm": 1.7174251079559326, - "learning_rate": 0.0005909492511582803, - "loss": 4.8148, + "epoch": 0.16172506738544473, + "grad_norm": 1.0023614168167114, + "learning_rate": 0.0005909336211548839, + "loss": 4.7998, "step": 1500 }, { - "epoch": 0.1668281132278549, - "grad_norm": 0.9541407823562622, - "learning_rate": 0.0005906260101282189, - "loss": 4.7619, + "epoch": 0.16711590296495957, + "grad_norm": 0.874588668346405, + "learning_rate": 0.0005906098219104155, + "loss": 4.7577, "step": 1550 }, { - "epoch": 0.17220966526746315, - "grad_norm": 0.8106948733329773, - "learning_rate": 0.0005903027690981575, - "loss": 4.7593, + "epoch": 0.1725067385444744, + "grad_norm": 1.079401969909668, + "learning_rate": 0.000590286022665947, + "loss": 4.7449, "step": 1600 }, { - "epoch": 0.17759121730707136, - "grad_norm": 0.8909491896629333, - "learning_rate": 0.000589979528068096, - "loss": 4.7116, + "epoch": 0.1778975741239892, + "grad_norm": 0.9815456867218018, + "learning_rate": 0.0005899622234214787, + "loss": 4.7201, "step": 1650 }, { - "epoch": 0.18297276934667958, - "grad_norm": 0.911844789981842, - "learning_rate": 0.0005896562870380347, - "loss": 4.6914, + "epoch": 0.18328840970350405, + "grad_norm": 0.9003214836120605, + "learning_rate": 0.0005896384241770102, + "loss": 4.696, "step": 1700 }, { - "epoch": 0.1883543213862878, - "grad_norm": 1.3206818103790283, - "learning_rate": 0.0005893330460079732, - "loss": 4.6934, + "epoch": 0.18867924528301888, + "grad_norm": 0.9949226975440979, + "learning_rate": 0.0005893146249325418, + "loss": 4.7044, "step": 1750 }, { - "epoch": 0.19373587342589602, - "grad_norm": 0.8717551231384277, - "learning_rate": 0.0005890098049779118, - "loss": 4.6662, + "epoch": 0.1940700808625337, + "grad_norm": 0.9833788871765137, + "learning_rate": 0.0005889908256880733, + "loss": 4.672, "step": 1800 }, { - "epoch": 0.19911742546550426, - "grad_norm": 0.8639872670173645, - "learning_rate": 0.0005886865639478504, - "loss": 4.6283, + "epoch": 0.19946091644204852, + "grad_norm": 0.9863656163215637, + "learning_rate": 0.0005886670264436049, + "loss": 4.6412, "step": 1850 }, { - "epoch": 0.20449897750511248, - "grad_norm": 1.275295615196228, - "learning_rate": 0.0005883633229177889, - "loss": 4.6063, + "epoch": 0.20485175202156333, + "grad_norm": 0.9831191897392273, + "learning_rate": 0.0005883432271991365, + "loss": 4.6256, "step": 1900 }, { - "epoch": 0.2098805295447207, - "grad_norm": 0.9782142043113708, - "learning_rate": 0.0005880400818877276, - "loss": 4.6013, + "epoch": 0.21024258760107817, + "grad_norm": 0.9612619280815125, + "learning_rate": 0.0005880194279546681, + "loss": 4.6301, "step": 1950 }, { - "epoch": 0.2152620815843289, - "grad_norm": 0.7876592874526978, - "learning_rate": 0.0005877168408576662, - "loss": 4.5802, + "epoch": 0.215633423180593, + "grad_norm": 0.8835451006889343, + "learning_rate": 0.0005876956287101996, + "loss": 4.5772, "step": 2000 }, { - "epoch": 0.2152620815843289, - "eval_accuracy": 0.27130006765815323, - "eval_loss": 4.498872756958008, - "eval_runtime": 185.1447, - "eval_samples_per_second": 97.281, - "eval_steps_per_second": 6.082, + "epoch": 0.215633423180593, + "eval_accuracy": 0.2714264309666815, + "eval_loss": 4.5014872550964355, + "eval_runtime": 183.6725, + "eval_samples_per_second": 98.06, + "eval_steps_per_second": 6.13, "step": 2000 }, { - "epoch": 0.22064363362393713, - "grad_norm": 0.9489641785621643, - "learning_rate": 0.0005873935998276048, - "loss": 4.576, + "epoch": 0.2210242587601078, + "grad_norm": 1.0137361288070679, + "learning_rate": 0.0005873718294657312, + "loss": 4.5575, "step": 2050 }, { - "epoch": 0.22602518566354537, - "grad_norm": 0.8717491030693054, - "learning_rate": 0.0005870703587975433, - "loss": 4.5329, + "epoch": 0.22641509433962265, + "grad_norm": 0.7878215312957764, + "learning_rate": 0.0005870480302212628, + "loss": 4.5434, "step": 2100 }, { - "epoch": 0.2314067377031536, - "grad_norm": 0.6650089025497437, - "learning_rate": 0.0005867471177674818, - "loss": 4.5276, + "epoch": 0.23180592991913745, + "grad_norm": 0.8975843787193298, + "learning_rate": 0.0005867242309767943, + "loss": 4.5286, "step": 2150 }, { - "epoch": 0.2367882897427618, - "grad_norm": 0.9660583138465881, - "learning_rate": 0.0005864238767374205, - "loss": 4.5157, + "epoch": 0.2371967654986523, + "grad_norm": 0.9283952116966248, + "learning_rate": 0.0005864004317323259, + "loss": 4.4883, "step": 2200 }, { - "epoch": 0.24216984178237003, - "grad_norm": 0.8200618028640747, - "learning_rate": 0.0005861006357073591, - "loss": 4.4902, + "epoch": 0.24258760107816713, + "grad_norm": 0.7984836101531982, + "learning_rate": 0.0005860766324878575, + "loss": 4.5028, "step": 2250 }, { - "epoch": 0.24755139382197827, - "grad_norm": 0.832756519317627, - "learning_rate": 0.0005857773946772977, - "loss": 4.4648, + "epoch": 0.24797843665768193, + "grad_norm": 0.8196074366569519, + "learning_rate": 0.000585752833243389, + "loss": 4.4778, "step": 2300 }, { - "epoch": 0.2529329458615865, - "grad_norm": 1.736657977104187, - "learning_rate": 0.0005854541536472362, - "loss": 4.451, + "epoch": 0.25336927223719674, + "grad_norm": 1.0101298093795776, + "learning_rate": 0.0005854290339989206, + "loss": 4.4509, "step": 2350 }, { - "epoch": 0.2583144979011947, - "grad_norm": 0.8690645694732666, - "learning_rate": 0.0005851309126171749, - "loss": 4.442, + "epoch": 0.2587601078167116, + "grad_norm": 0.9420535564422607, + "learning_rate": 0.0005851052347544521, + "loss": 4.4503, "step": 2400 }, { - "epoch": 0.2636960499408029, - "grad_norm": 0.9058188199996948, - "learning_rate": 0.0005848076715871134, - "loss": 4.4145, + "epoch": 0.2641509433962264, + "grad_norm": 0.8534466028213501, + "learning_rate": 0.0005847814355099838, + "loss": 4.4329, "step": 2450 }, { - "epoch": 0.26907760198041114, - "grad_norm": 0.7224371433258057, - "learning_rate": 0.000584484430557052, - "loss": 4.4354, + "epoch": 0.2695417789757412, + "grad_norm": 0.7836869359016418, + "learning_rate": 0.0005844576362655154, + "loss": 4.4255, "step": 2500 }, { - "epoch": 0.27445915402001936, - "grad_norm": 0.9763414263725281, - "learning_rate": 0.0005841611895269906, - "loss": 4.3947, + "epoch": 0.2749326145552561, + "grad_norm": 1.1862176656723022, + "learning_rate": 0.0005841338370210469, + "loss": 4.3987, "step": 2550 }, { - "epoch": 0.2798407060596276, - "grad_norm": 0.8446633219718933, - "learning_rate": 0.0005838379484969291, - "loss": 4.3857, + "epoch": 0.2803234501347709, + "grad_norm": 0.9727432131767273, + "learning_rate": 0.0005838100377765785, + "loss": 4.3739, "step": 2600 }, { - "epoch": 0.2852222580992358, - "grad_norm": 0.7831347584724426, - "learning_rate": 0.0005835147074668678, - "loss": 4.3803, + "epoch": 0.2857142857142857, + "grad_norm": 0.9045615196228027, + "learning_rate": 0.0005834862385321101, + "loss": 4.3749, "step": 2650 }, { - "epoch": 0.29060381013884407, - "grad_norm": 0.9835968613624573, - "learning_rate": 0.0005831914664368063, - "loss": 4.3489, + "epoch": 0.29110512129380056, + "grad_norm": 0.6496215462684631, + "learning_rate": 0.0005831624392876417, + "loss": 4.3646, "step": 2700 }, { - "epoch": 0.2959853621784523, - "grad_norm": 0.7985646724700928, - "learning_rate": 0.0005828682254067449, - "loss": 4.3591, + "epoch": 0.29649595687331537, + "grad_norm": 0.943100094795227, + "learning_rate": 0.0005828386400431731, + "loss": 4.3741, "step": 2750 }, { - "epoch": 0.3013669142180605, - "grad_norm": 0.760099470615387, - "learning_rate": 0.0005825449843766835, - "loss": 4.3337, + "epoch": 0.3018867924528302, + "grad_norm": 0.7881677150726318, + "learning_rate": 0.0005825148407987048, + "loss": 4.3681, "step": 2800 }, { - "epoch": 0.3067484662576687, - "grad_norm": 0.7309588193893433, - "learning_rate": 0.0005822217433466221, - "loss": 4.3136, + "epoch": 0.30727762803234504, + "grad_norm": 0.8434391021728516, + "learning_rate": 0.0005821910415542363, + "loss": 4.3316, "step": 2850 }, { - "epoch": 0.31213001829727693, - "grad_norm": 0.749220609664917, - "learning_rate": 0.0005818985023165607, - "loss": 4.3327, + "epoch": 0.31266846361185985, + "grad_norm": 0.8027449250221252, + "learning_rate": 0.0005818672423097679, + "loss": 4.3199, "step": 2900 }, { - "epoch": 0.31751157033688515, - "grad_norm": 0.7987422943115234, - "learning_rate": 0.0005815752612864992, - "loss": 4.3193, + "epoch": 0.31805929919137466, + "grad_norm": 0.7706471681594849, + "learning_rate": 0.0005815434430652994, + "loss": 4.348, "step": 2950 }, { - "epoch": 0.32289312237649337, - "grad_norm": 0.9373783469200134, - "learning_rate": 0.0005812520202564378, - "loss": 4.2854, + "epoch": 0.32345013477088946, + "grad_norm": 0.7823014855384827, + "learning_rate": 0.0005812196438208311, + "loss": 4.3085, "step": 3000 }, { - "epoch": 0.32289312237649337, - "eval_accuracy": 0.2990188318271689, - "eval_loss": 4.231717109680176, - "eval_runtime": 187.2992, - "eval_samples_per_second": 96.162, - "eval_steps_per_second": 6.012, + "epoch": 0.32345013477088946, + "eval_accuracy": 0.2989201750050333, + "eval_loss": 4.233241081237793, + "eval_runtime": 183.5455, + "eval_samples_per_second": 98.128, + "eval_steps_per_second": 6.135, "step": 3000 }, { - "epoch": 0.3282746744161016, - "grad_norm": 0.967170774936676, - "learning_rate": 0.0005809287792263764, - "loss": 4.286, + "epoch": 0.3288409703504043, + "grad_norm": 0.7678599953651428, + "learning_rate": 0.0005808958445763626, + "loss": 4.3041, "step": 3050 }, { - "epoch": 0.3336562264557098, - "grad_norm": 0.8400319218635559, - "learning_rate": 0.0005806055381963151, - "loss": 4.2771, + "epoch": 0.33423180592991913, + "grad_norm": 0.7595804333686829, + "learning_rate": 0.0005805720453318942, + "loss": 4.2839, "step": 3100 }, { - "epoch": 0.3390377784953181, - "grad_norm": 0.7334919571876526, - "learning_rate": 0.0005802822971662536, - "loss": 4.2781, + "epoch": 0.33962264150943394, + "grad_norm": 0.8765571117401123, + "learning_rate": 0.0005802482460874257, + "loss": 4.2752, "step": 3150 }, { - "epoch": 0.3444193305349263, - "grad_norm": 0.744314432144165, - "learning_rate": 0.0005799590561361922, - "loss": 4.279, + "epoch": 0.3450134770889488, + "grad_norm": 0.6764340996742249, + "learning_rate": 0.0005799244468429573, + "loss": 4.2784, "step": 3200 }, { - "epoch": 0.3498008825745345, - "grad_norm": 0.8924934267997742, - "learning_rate": 0.0005796358151061307, - "loss": 4.2584, + "epoch": 0.3504043126684636, + "grad_norm": 1.5989835262298584, + "learning_rate": 0.0005796006475984889, + "loss": 4.2516, "step": 3250 }, { - "epoch": 0.35518243461414273, - "grad_norm": 0.6779358983039856, - "learning_rate": 0.0005793125740760694, - "loss": 4.2424, + "epoch": 0.3557951482479784, + "grad_norm": 0.8266173601150513, + "learning_rate": 0.0005792768483540205, + "loss": 4.2636, "step": 3300 }, { - "epoch": 0.36056398665375095, - "grad_norm": 0.6709272861480713, - "learning_rate": 0.0005789893330460079, - "loss": 4.233, + "epoch": 0.3611859838274933, + "grad_norm": 0.9941731095314026, + "learning_rate": 0.000578953049109552, + "loss": 4.2564, "step": 3350 }, { - "epoch": 0.36594553869335916, - "grad_norm": 0.7489773035049438, - "learning_rate": 0.0005786660920159465, - "loss": 4.2351, + "epoch": 0.3665768194070081, + "grad_norm": 0.6393300890922546, + "learning_rate": 0.0005786292498650836, + "loss": 4.2346, "step": 3400 }, { - "epoch": 0.3713270907329674, - "grad_norm": 0.8585236668586731, - "learning_rate": 0.0005783428509858851, - "loss": 4.2377, + "epoch": 0.3719676549865229, + "grad_norm": 0.8488381505012512, + "learning_rate": 0.0005783054506206152, + "loss": 4.2266, "step": 3450 }, { - "epoch": 0.3767086427725756, - "grad_norm": 0.7472572922706604, - "learning_rate": 0.0005780196099558237, + "epoch": 0.37735849056603776, + "grad_norm": 0.7094395756721497, + "learning_rate": 0.0005779816513761467, "loss": 4.2177, "step": 3500 }, { - "epoch": 0.3820901948121838, - "grad_norm": 0.6559500694274902, - "learning_rate": 0.0005776963689257623, - "loss": 4.2103, + "epoch": 0.38274932614555257, + "grad_norm": 0.756734311580658, + "learning_rate": 0.0005776578521316782, + "loss": 4.1968, "step": 3550 }, { - "epoch": 0.38747174685179203, - "grad_norm": 0.8734597563743591, - "learning_rate": 0.0005773731278957008, - "loss": 4.2007, + "epoch": 0.3881401617250674, + "grad_norm": 0.803911030292511, + "learning_rate": 0.0005773340528872099, + "loss": 4.2054, "step": 3600 }, { - "epoch": 0.3928532988914003, - "grad_norm": 0.8117654323577881, - "learning_rate": 0.0005770498868656394, - "loss": 4.194, + "epoch": 0.3935309973045822, + "grad_norm": 0.6941730976104736, + "learning_rate": 0.0005770102536427414, + "loss": 4.2234, "step": 3650 }, { - "epoch": 0.3982348509310085, - "grad_norm": 0.8064318895339966, - "learning_rate": 0.000576726645835578, - "loss": 4.1933, + "epoch": 0.39892183288409705, + "grad_norm": 0.6912753582000732, + "learning_rate": 0.000576686454398273, + "loss": 4.2084, "step": 3700 }, { - "epoch": 0.40361640297061674, - "grad_norm": 0.6744760274887085, - "learning_rate": 0.0005764034048055167, - "loss": 4.1869, + "epoch": 0.40431266846361186, + "grad_norm": 0.7011223435401917, + "learning_rate": 0.0005763626551538045, + "loss": 4.1927, "step": 3750 }, { - "epoch": 0.40899795501022496, - "grad_norm": 0.7638756036758423, - "learning_rate": 0.0005760801637754552, - "loss": 4.1967, + "epoch": 0.40970350404312667, + "grad_norm": 0.6987757086753845, + "learning_rate": 0.0005760388559093362, + "loss": 4.1824, "step": 3800 }, { - "epoch": 0.4143795070498332, - "grad_norm": 0.8815849423408508, - "learning_rate": 0.0005757569227453937, - "loss": 4.162, + "epoch": 0.41509433962264153, + "grad_norm": 0.6419551372528076, + "learning_rate": 0.0005757150566648678, + "loss": 4.1796, "step": 3850 }, { - "epoch": 0.4197610590894414, - "grad_norm": 0.7192302346229553, - "learning_rate": 0.0005754336817153324, - "loss": 4.1899, + "epoch": 0.42048517520215634, + "grad_norm": 0.7884727120399475, + "learning_rate": 0.0005753912574203993, + "loss": 4.1707, "step": 3900 }, { - "epoch": 0.4251426111290496, - "grad_norm": 0.5958099365234375, - "learning_rate": 0.0005751104406852709, - "loss": 4.1567, + "epoch": 0.42587601078167114, + "grad_norm": 0.772879421710968, + "learning_rate": 0.0005750674581759309, + "loss": 4.1724, "step": 3950 }, { - "epoch": 0.4305241631686578, - "grad_norm": 0.7568697333335876, - "learning_rate": 0.0005747871996552096, - "loss": 4.167, + "epoch": 0.431266846361186, + "grad_norm": 0.7401405572891235, + "learning_rate": 0.0005747436589314624, + "loss": 4.1541, "step": 4000 }, { - "epoch": 0.4305241631686578, - "eval_accuracy": 0.31295736754044956, - "eval_loss": 4.088932514190674, - "eval_runtime": 185.3099, - "eval_samples_per_second": 97.194, - "eval_steps_per_second": 6.076, + "epoch": 0.431266846361186, + "eval_accuracy": 0.3129712751100898, + "eval_loss": 4.084288597106934, + "eval_runtime": 183.5875, + "eval_samples_per_second": 98.106, + "eval_steps_per_second": 6.133, "step": 4000 }, { - "epoch": 0.43590571520826604, - "grad_norm": 0.8409259915351868, - "learning_rate": 0.0005744639586251481, - "loss": 4.1443, + "epoch": 0.4366576819407008, + "grad_norm": 0.6693394780158997, + "learning_rate": 0.0005744198596869941, + "loss": 4.1653, "step": 4050 }, { - "epoch": 0.44128726724787426, - "grad_norm": 0.630840003490448, - "learning_rate": 0.0005741407175950867, - "loss": 4.1604, + "epoch": 0.4420485175202156, + "grad_norm": 0.6576082110404968, + "learning_rate": 0.0005740960604425255, + "loss": 4.1609, "step": 4100 }, { - "epoch": 0.44666881928748253, - "grad_norm": 0.5622649788856506, - "learning_rate": 0.0005738174765650253, - "loss": 4.1458, + "epoch": 0.4474393530997305, + "grad_norm": 0.6224762201309204, + "learning_rate": 0.0005737722611980572, + "loss": 4.141, "step": 4150 }, { - "epoch": 0.45205037132709075, - "grad_norm": 0.681736171245575, - "learning_rate": 0.0005734942355349638, - "loss": 4.1354, + "epoch": 0.4528301886792453, + "grad_norm": 0.6698539853096008, + "learning_rate": 0.0005734484619535887, + "loss": 4.1189, "step": 4200 }, { - "epoch": 0.45743192336669897, - "grad_norm": 0.7544794082641602, - "learning_rate": 0.0005731709945049025, - "loss": 4.1296, + "epoch": 0.4582210242587601, + "grad_norm": 0.6492160558700562, + "learning_rate": 0.0005731246627091203, + "loss": 4.1501, "step": 4250 }, { - "epoch": 0.4628134754063072, - "grad_norm": 0.7159223556518555, - "learning_rate": 0.000572847753474841, - "loss": 4.1233, + "epoch": 0.4636118598382749, + "grad_norm": 0.6496559977531433, + "learning_rate": 0.0005728008634646518, + "loss": 4.1251, "step": 4300 }, { - "epoch": 0.4681950274459154, - "grad_norm": 0.7304403781890869, - "learning_rate": 0.0005725245124447796, - "loss": 4.1195, + "epoch": 0.46900269541778977, + "grad_norm": 0.6596023440361023, + "learning_rate": 0.0005724770642201835, + "loss": 4.1301, "step": 4350 }, { - "epoch": 0.4735765794855236, - "grad_norm": 0.6582966446876526, - "learning_rate": 0.0005722012714147182, - "loss": 4.122, + "epoch": 0.4743935309973046, + "grad_norm": 0.7114306688308716, + "learning_rate": 0.000572153264975715, + "loss": 4.1263, "step": 4400 }, { - "epoch": 0.47895813152513184, - "grad_norm": 0.6851973533630371, - "learning_rate": 0.0005718780303846568, - "loss": 4.1075, + "epoch": 0.4797843665768194, + "grad_norm": 0.6327182054519653, + "learning_rate": 0.0005718294657312466, + "loss": 4.0962, "step": 4450 }, { - "epoch": 0.48433968356474005, - "grad_norm": 0.629666805267334, - "learning_rate": 0.0005715547893545953, - "loss": 4.1247, + "epoch": 0.48517520215633425, + "grad_norm": 0.5750563740730286, + "learning_rate": 0.0005715056664867781, + "loss": 4.0943, "step": 4500 }, { - "epoch": 0.48972123560434827, - "grad_norm": 0.6585856080055237, - "learning_rate": 0.000571231548324534, - "loss": 4.089, + "epoch": 0.49056603773584906, + "grad_norm": 0.6841105818748474, + "learning_rate": 0.0005711818672423097, + "loss": 4.09, "step": 4550 }, { - "epoch": 0.49510278764395654, - "grad_norm": 0.7275299429893494, - "learning_rate": 0.0005709083072944725, - "loss": 4.102, + "epoch": 0.49595687331536387, + "grad_norm": 0.6326912641525269, + "learning_rate": 0.0005708580679978413, + "loss": 4.0963, "step": 4600 }, { - "epoch": 0.5004843396835648, - "grad_norm": 0.7020848393440247, - "learning_rate": 0.0005705850662644111, - "loss": 4.0871, + "epoch": 0.5013477088948787, + "grad_norm": 0.856378972530365, + "learning_rate": 0.0005705342687533729, + "loss": 4.0893, "step": 4650 }, { - "epoch": 0.505865891723173, - "grad_norm": 0.7129533290863037, - "learning_rate": 0.0005702618252343497, - "loss": 4.0816, + "epoch": 0.5067385444743935, + "grad_norm": 0.5741315484046936, + "learning_rate": 0.0005702104695089044, + "loss": 4.0698, "step": 4700 }, { - "epoch": 0.5112474437627812, - "grad_norm": 0.7143609523773193, - "learning_rate": 0.0005699385842042882, - "loss": 4.0851, + "epoch": 0.5121293800539084, + "grad_norm": 0.632841944694519, + "learning_rate": 0.000569886670264436, + "loss": 4.0845, "step": 4750 }, { - "epoch": 0.5166289958023894, - "grad_norm": 0.5908857583999634, - "learning_rate": 0.0005696153431742269, - "loss": 4.0737, + "epoch": 0.5175202156334232, + "grad_norm": 0.6167107224464417, + "learning_rate": 0.0005695628710199675, + "loss": 4.0719, "step": 4800 }, { - "epoch": 0.5220105478419976, - "grad_norm": 0.6978116035461426, - "learning_rate": 0.0005692921021441655, - "loss": 4.0704, + "epoch": 0.522911051212938, + "grad_norm": 0.6984403133392334, + "learning_rate": 0.0005692390717754991, + "loss": 4.0659, "step": 4850 }, { - "epoch": 0.5273920998816058, - "grad_norm": 0.5921167731285095, - "learning_rate": 0.0005689688611141041, - "loss": 4.0677, + "epoch": 0.5283018867924528, + "grad_norm": 0.7294288277626038, + "learning_rate": 0.0005689152725310306, + "loss": 4.0596, "step": 4900 }, { - "epoch": 0.5327736519212141, - "grad_norm": 0.6112943887710571, - "learning_rate": 0.0005686456200840426, - "loss": 4.0591, + "epoch": 0.5336927223719676, + "grad_norm": 0.74382084608078, + "learning_rate": 0.0005685914732865623, + "loss": 4.0633, "step": 4950 }, { - "epoch": 0.5381552039608223, - "grad_norm": 0.7539244890213013, - "learning_rate": 0.0005683223790539811, - "loss": 4.0613, + "epoch": 0.5390835579514824, + "grad_norm": 0.6517612934112549, + "learning_rate": 0.0005682676740420939, + "loss": 4.0732, "step": 5000 }, { - "epoch": 0.5381552039608223, - "eval_accuracy": 0.32120564286599806, - "eval_loss": 3.9930038452148438, - "eval_runtime": 185.4107, - "eval_samples_per_second": 97.141, - "eval_steps_per_second": 6.073, + "epoch": 0.5390835579514824, + "eval_accuracy": 0.3223035802973634, + "eval_loss": 3.9919276237487793, + "eval_runtime": 183.6973, + "eval_samples_per_second": 98.047, + "eval_steps_per_second": 6.13, "step": 5000 }, { - "epoch": 0.5435367560004305, - "grad_norm": 0.6740825176239014, - "learning_rate": 0.0005679991380239198, - "loss": 4.047, + "epoch": 0.5444743935309974, + "grad_norm": 0.7119221687316895, + "learning_rate": 0.0005679438747976254, + "loss": 4.0443, "step": 5050 }, { - "epoch": 0.5489183080400387, - "grad_norm": 0.755363941192627, - "learning_rate": 0.0005676758969938584, - "loss": 4.0381, + "epoch": 0.5498652291105122, + "grad_norm": 0.6482653617858887, + "learning_rate": 0.000567620075553157, + "loss": 4.045, "step": 5100 }, { - "epoch": 0.5542998600796469, - "grad_norm": 0.5839517116546631, - "learning_rate": 0.000567352655963797, - "loss": 4.0421, + "epoch": 0.555256064690027, + "grad_norm": 0.6632248163223267, + "learning_rate": 0.0005672962763086886, + "loss": 4.0499, "step": 5150 }, { - "epoch": 0.5596814121192552, - "grad_norm": 0.7238830924034119, - "learning_rate": 0.0005670294149337355, - "loss": 4.0415, + "epoch": 0.5606469002695418, + "grad_norm": 0.6758072376251221, + "learning_rate": 0.0005669724770642202, + "loss": 4.0323, "step": 5200 }, { - "epoch": 0.5650629641588634, - "grad_norm": 0.6979734301567078, - "learning_rate": 0.0005667061739036742, - "loss": 4.0334, + "epoch": 0.5660377358490566, + "grad_norm": 0.6546369194984436, + "learning_rate": 0.0005666486778197517, + "loss": 4.03, "step": 5250 }, { - "epoch": 0.5704445161984716, - "grad_norm": 0.8152750134468079, - "learning_rate": 0.0005663829328736127, - "loss": 4.0427, + "epoch": 0.5714285714285714, + "grad_norm": 0.6808826327323914, + "learning_rate": 0.0005663248785752833, + "loss": 4.0423, "step": 5300 }, { - "epoch": 0.5758260682380799, - "grad_norm": 0.6354012489318848, - "learning_rate": 0.0005660596918435512, - "loss": 4.0428, + "epoch": 0.5768194070080862, + "grad_norm": 0.5758659839630127, + "learning_rate": 0.0005660010793308148, + "loss": 4.0272, "step": 5350 }, { - "epoch": 0.5812076202776881, - "grad_norm": 0.7258121371269226, - "learning_rate": 0.0005657364508134899, - "loss": 4.0439, + "epoch": 0.5822102425876011, + "grad_norm": 0.601265013217926, + "learning_rate": 0.0005656772800863465, + "loss": 4.0451, "step": 5400 }, { - "epoch": 0.5865891723172963, - "grad_norm": 0.5233718752861023, - "learning_rate": 0.0005654132097834284, - "loss": 4.0143, + "epoch": 0.5876010781671159, + "grad_norm": 0.5763106942176819, + "learning_rate": 0.0005653534808418779, + "loss": 4.0201, "step": 5450 }, { - "epoch": 0.5919707243569046, - "grad_norm": 0.578235924243927, - "learning_rate": 0.0005650899687533671, - "loss": 4.0224, + "epoch": 0.5929919137466307, + "grad_norm": 0.6029739379882812, + "learning_rate": 0.0005650296815974096, + "loss": 4.0235, "step": 5500 }, { - "epoch": 0.5973522763965128, - "grad_norm": 0.5696157217025757, - "learning_rate": 0.0005647667277233056, - "loss": 4.012, + "epoch": 0.5983827493261455, + "grad_norm": 0.5782645344734192, + "learning_rate": 0.0005647058823529411, + "loss": 4.0153, "step": 5550 }, { - "epoch": 0.602733828436121, - "grad_norm": 0.7277674674987793, - "learning_rate": 0.0005644434866932442, - "loss": 4.0144, + "epoch": 0.6037735849056604, + "grad_norm": 0.5374035835266113, + "learning_rate": 0.0005643820831084727, + "loss": 4.0068, "step": 5600 }, { - "epoch": 0.6081153804757292, - "grad_norm": 0.760840117931366, - "learning_rate": 0.0005641202456631828, - "loss": 4.0055, + "epoch": 0.6091644204851752, + "grad_norm": 0.6709702610969543, + "learning_rate": 0.0005640582838640042, + "loss": 4.0187, "step": 5650 }, { - "epoch": 0.6134969325153374, - "grad_norm": 0.6378624439239502, - "learning_rate": 0.0005637970046331214, - "loss": 4.0246, + "epoch": 0.6145552560646901, + "grad_norm": 0.661410391330719, + "learning_rate": 0.0005637344846195358, + "loss": 4.0118, "step": 5700 }, { - "epoch": 0.6188784845549457, - "grad_norm": 0.6083015203475952, - "learning_rate": 0.00056347376360306, - "loss": 4.0077, + "epoch": 0.6199460916442049, + "grad_norm": 0.5730959177017212, + "learning_rate": 0.0005634106853750674, + "loss": 4.0212, "step": 5750 }, { - "epoch": 0.6242600365945539, - "grad_norm": 0.6895670890808105, - "learning_rate": 0.0005631505225729985, - "loss": 3.9889, + "epoch": 0.6253369272237197, + "grad_norm": 0.7666971683502197, + "learning_rate": 0.000563086886130599, + "loss": 3.9882, "step": 5800 }, { - "epoch": 0.6296415886341621, - "grad_norm": 0.7159281969070435, - "learning_rate": 0.0005628272815429371, - "loss": 3.9946, + "epoch": 0.6307277628032345, + "grad_norm": 0.7750036120414734, + "learning_rate": 0.0005627630868861305, + "loss": 3.9926, "step": 5850 }, { - "epoch": 0.6350231406737703, - "grad_norm": 0.7747655510902405, - "learning_rate": 0.0005625040405128757, - "loss": 3.9819, + "epoch": 0.6361185983827493, + "grad_norm": 0.6507880687713623, + "learning_rate": 0.0005624392876416621, + "loss": 3.9778, "step": 5900 }, { - "epoch": 0.6404046927133785, - "grad_norm": 0.6201274991035461, - "learning_rate": 0.0005621807994828143, - "loss": 3.9892, + "epoch": 0.6415094339622641, + "grad_norm": 0.6215813159942627, + "learning_rate": 0.0005621154883971937, + "loss": 3.9808, "step": 5950 }, { - "epoch": 0.6457862447529867, - "grad_norm": 0.5920886397361755, - "learning_rate": 0.0005618575584527529, - "loss": 3.9942, + "epoch": 0.6469002695417789, + "grad_norm": 0.7452011108398438, + "learning_rate": 0.0005617916891527253, + "loss": 3.9623, "step": 6000 }, { - "epoch": 0.6457862447529867, - "eval_accuracy": 0.3285595962719456, - "eval_loss": 3.9178080558776855, - "eval_runtime": 185.1703, - "eval_samples_per_second": 97.267, - "eval_steps_per_second": 6.081, + "epoch": 0.6469002695417789, + "eval_accuracy": 0.3290048558062093, + "eval_loss": 3.9154105186462402, + "eval_runtime": 183.6461, + "eval_samples_per_second": 98.074, + "eval_steps_per_second": 6.131, "step": 6000 }, { - "epoch": 0.651167796792595, - "grad_norm": 0.6048468351364136, - "learning_rate": 0.0005615343174226915, - "loss": 3.9991, + "epoch": 0.6522911051212938, + "grad_norm": 0.6668406128883362, + "learning_rate": 0.0005614678899082568, + "loss": 3.9958, "step": 6050 }, { - "epoch": 0.6565493488322032, - "grad_norm": 0.6038753986358643, - "learning_rate": 0.00056121107639263, - "loss": 3.971, + "epoch": 0.6576819407008087, + "grad_norm": 0.5859930515289307, + "learning_rate": 0.0005611440906637884, + "loss": 3.9842, "step": 6100 }, { - "epoch": 0.6619309008718114, - "grad_norm": 0.6248626708984375, - "learning_rate": 0.0005608878353625687, - "loss": 3.9687, + "epoch": 0.6630727762803235, + "grad_norm": 0.6970394253730774, + "learning_rate": 0.00056082029141932, + "loss": 3.9757, "step": 6150 }, { - "epoch": 0.6673124529114196, - "grad_norm": 0.7638702988624573, - "learning_rate": 0.0005605645943325072, - "loss": 3.9686, + "epoch": 0.6684636118598383, + "grad_norm": 0.6556830406188965, + "learning_rate": 0.0005604964921748515, + "loss": 3.9884, "step": 6200 }, { - "epoch": 0.6726940049510278, - "grad_norm": 0.5385346412658691, - "learning_rate": 0.0005602413533024458, - "loss": 3.9689, + "epoch": 0.6738544474393531, + "grad_norm": 0.6606318950653076, + "learning_rate": 0.000560172692930383, + "loss": 3.9484, "step": 6250 }, { - "epoch": 0.6780755569906362, - "grad_norm": 0.6341848373413086, - "learning_rate": 0.0005599181122723844, - "loss": 3.9662, + "epoch": 0.6792452830188679, + "grad_norm": 0.607824444770813, + "learning_rate": 0.0005598488936859147, + "loss": 3.959, "step": 6300 }, { - "epoch": 0.6834571090302444, - "grad_norm": 0.5963843464851379, - "learning_rate": 0.000559594871242323, - "loss": 3.9602, + "epoch": 0.6846361185983828, + "grad_norm": 0.6594722270965576, + "learning_rate": 0.0005595250944414463, + "loss": 3.9519, "step": 6350 }, { - "epoch": 0.6888386610698526, - "grad_norm": 0.5414633750915527, - "learning_rate": 0.0005592716302122616, - "loss": 3.9664, + "epoch": 0.6900269541778976, + "grad_norm": 0.6482193470001221, + "learning_rate": 0.0005592012951969778, + "loss": 3.9433, "step": 6400 }, { - "epoch": 0.6942202131094608, - "grad_norm": 0.581794023513794, - "learning_rate": 0.0005589483891822001, - "loss": 3.9706, + "epoch": 0.6954177897574124, + "grad_norm": 0.7418032884597778, + "learning_rate": 0.0005588774959525094, + "loss": 3.9704, "step": 6450 }, { - "epoch": 0.699601765149069, - "grad_norm": 0.588293731212616, - "learning_rate": 0.0005586251481521387, - "loss": 3.9601, + "epoch": 0.7008086253369272, + "grad_norm": 0.6005130410194397, + "learning_rate": 0.000558553696708041, + "loss": 3.9547, "step": 6500 }, { - "epoch": 0.7049833171886772, - "grad_norm": 0.6920768618583679, - "learning_rate": 0.0005583019071220773, - "loss": 3.9606, + "epoch": 0.706199460916442, + "grad_norm": 0.566909909248352, + "learning_rate": 0.0005582298974635726, + "loss": 3.9509, "step": 6550 }, { - "epoch": 0.7103648692282855, - "grad_norm": 0.6583566665649414, - "learning_rate": 0.000557978666092016, - "loss": 3.9327, + "epoch": 0.7115902964959568, + "grad_norm": 0.5656232833862305, + "learning_rate": 0.0005579060982191041, + "loss": 3.9505, "step": 6600 }, { - "epoch": 0.7157464212678937, - "grad_norm": 0.5119813680648804, - "learning_rate": 0.0005576554250619545, - "loss": 3.9518, + "epoch": 0.7169811320754716, + "grad_norm": 0.6370468139648438, + "learning_rate": 0.0005575822989746357, + "loss": 3.9435, "step": 6650 }, { - "epoch": 0.7211279733075019, - "grad_norm": 0.7030820250511169, - "learning_rate": 0.000557332184031893, - "loss": 3.9553, + "epoch": 0.7223719676549866, + "grad_norm": 0.5794790387153625, + "learning_rate": 0.0005572584997301672, + "loss": 3.9366, "step": 6700 }, { - "epoch": 0.7265095253471101, - "grad_norm": 0.640282928943634, - "learning_rate": 0.0005570089430018317, - "loss": 3.944, + "epoch": 0.7277628032345014, + "grad_norm": 0.6862756609916687, + "learning_rate": 0.0005569347004856989, + "loss": 3.94, "step": 6750 }, { - "epoch": 0.7318910773867183, - "grad_norm": 0.6724056005477905, - "learning_rate": 0.0005566857019717702, - "loss": 3.9515, + "epoch": 0.7331536388140162, + "grad_norm": 0.524553120136261, + "learning_rate": 0.0005566109012412303, + "loss": 3.9485, "step": 6800 }, { - "epoch": 0.7372726294263265, - "grad_norm": 0.5751603245735168, - "learning_rate": 0.0005563624609417089, - "loss": 3.955, + "epoch": 0.738544474393531, + "grad_norm": 0.5408486723899841, + "learning_rate": 0.000556287101996762, + "loss": 3.9322, "step": 6850 }, { - "epoch": 0.7426541814659348, - "grad_norm": 0.5616862177848816, - "learning_rate": 0.0005560456847322487, - "loss": 3.9535, + "epoch": 0.7439353099730458, + "grad_norm": 0.5416108965873718, + "learning_rate": 0.0005559633027522935, + "loss": 3.9263, "step": 6900 }, { - "epoch": 0.748035733505543, - "grad_norm": 0.5367492437362671, - "learning_rate": 0.0005557224437021872, - "loss": 3.9248, + "epoch": 0.7493261455525606, + "grad_norm": 0.6217844486236572, + "learning_rate": 0.0005556395035078251, + "loss": 3.9157, "step": 6950 }, { - "epoch": 0.7534172855451512, - "grad_norm": 0.5820096135139465, - "learning_rate": 0.0005553992026721258, - "loss": 3.9229, + "epoch": 0.7547169811320755, + "grad_norm": 0.5892395973205566, + "learning_rate": 0.000555322180248246, + "loss": 3.9291, "step": 7000 }, { - "epoch": 0.7534172855451512, - "eval_accuracy": 0.33421975980975316, - "eval_loss": 3.8591833114624023, - "eval_runtime": 185.366, - "eval_samples_per_second": 97.165, - "eval_steps_per_second": 6.074, + "epoch": 0.7547169811320755, + "eval_accuracy": 0.33351101702253927, + "eval_loss": 3.8597514629364014, + "eval_runtime": 183.7646, + "eval_samples_per_second": 98.011, + "eval_steps_per_second": 6.127, "step": 7000 }, { - "epoch": 0.7587988375847594, - "grad_norm": 0.6244399547576904, - "learning_rate": 0.0005550759616420644, - "loss": 3.9139, + "epoch": 0.7601078167115903, + "grad_norm": 0.5709213018417358, + "learning_rate": 0.0005549983810037776, + "loss": 3.932, "step": 7050 }, { - "epoch": 0.7641803896243676, - "grad_norm": 0.5721721649169922, - "learning_rate": 0.000554752720612003, - "loss": 3.9179, + "epoch": 0.7654986522911051, + "grad_norm": 0.5382469296455383, + "learning_rate": 0.0005546745817593091, + "loss": 3.9325, "step": 7100 }, { - "epoch": 0.7695619416639758, - "grad_norm": 0.5274028182029724, - "learning_rate": 0.0005544294795819415, - "loss": 3.939, + "epoch": 0.77088948787062, + "grad_norm": 0.7448933720588684, + "learning_rate": 0.0005543507825148408, + "loss": 3.9099, "step": 7150 }, { - "epoch": 0.7749434937035841, - "grad_norm": 0.6537976861000061, - "learning_rate": 0.0005541062385518801, - "loss": 3.9104, + "epoch": 0.7762803234501348, + "grad_norm": 0.6360498070716858, + "learning_rate": 0.0005540269832703723, + "loss": 3.9173, "step": 7200 }, { - "epoch": 0.7803250457431924, - "grad_norm": 0.5737007260322571, - "learning_rate": 0.0005537829975218188, - "loss": 3.8995, + "epoch": 0.7816711590296496, + "grad_norm": 0.6259989738464355, + "learning_rate": 0.0005537031840259039, + "loss": 3.9175, "step": 7250 }, { - "epoch": 0.7857065977828006, - "grad_norm": 0.655914306640625, - "learning_rate": 0.0005534597564917573, - "loss": 3.9133, + "epoch": 0.7870619946091644, + "grad_norm": 0.6026500463485718, + "learning_rate": 0.0005533793847814354, + "loss": 3.9054, "step": 7300 }, { - "epoch": 0.7910881498224088, - "grad_norm": 0.5855588912963867, - "learning_rate": 0.0005531365154616959, - "loss": 3.8965, + "epoch": 0.7924528301886793, + "grad_norm": 0.587363600730896, + "learning_rate": 0.000553055585536967, + "loss": 3.9147, "step": 7350 }, { - "epoch": 0.796469701862017, - "grad_norm": 0.5390224456787109, - "learning_rate": 0.0005528132744316344, - "loss": 3.9107, + "epoch": 0.7978436657681941, + "grad_norm": 0.647449791431427, + "learning_rate": 0.0005527317862924987, + "loss": 3.8796, "step": 7400 }, { - "epoch": 0.8018512539016253, - "grad_norm": 0.739325225353241, - "learning_rate": 0.0005524900334015731, - "loss": 3.9033, + "epoch": 0.8032345013477089, + "grad_norm": 0.587364137172699, + "learning_rate": 0.0005524079870480301, + "loss": 3.8893, "step": 7450 }, { - "epoch": 0.8072328059412335, - "grad_norm": 0.5827202796936035, - "learning_rate": 0.0005521667923715117, - "loss": 3.9216, + "epoch": 0.8086253369272237, + "grad_norm": 0.7309133410453796, + "learning_rate": 0.0005520841878035618, + "loss": 3.9045, "step": 7500 }, { - "epoch": 0.8126143579808417, - "grad_norm": 0.5299261212348938, - "learning_rate": 0.0005518435513414502, - "loss": 3.9062, + "epoch": 0.8140161725067385, + "grad_norm": 0.6132270693778992, + "learning_rate": 0.0005517603885590933, + "loss": 3.8994, "step": 7550 }, { - "epoch": 0.8179959100204499, - "grad_norm": 0.7677913308143616, - "learning_rate": 0.0005515203103113888, - "loss": 3.8891, + "epoch": 0.8194070080862533, + "grad_norm": 0.5545886158943176, + "learning_rate": 0.0005514365893146249, + "loss": 3.917, "step": 7600 }, { - "epoch": 0.8233774620600581, - "grad_norm": 0.5630691647529602, - "learning_rate": 0.0005511970692813274, - "loss": 3.8984, + "epoch": 0.8247978436657682, + "grad_norm": 0.6114673614501953, + "learning_rate": 0.0005511127900701564, + "loss": 3.9014, "step": 7650 }, { - "epoch": 0.8287590140996663, - "grad_norm": 0.6013116240501404, - "learning_rate": 0.000550873828251266, - "loss": 3.8819, + "epoch": 0.8301886792452831, + "grad_norm": 0.5956445336341858, + "learning_rate": 0.000550788990825688, + "loss": 3.8704, "step": 7700 }, { - "epoch": 0.8341405661392746, - "grad_norm": 0.6272456049919128, - "learning_rate": 0.0005505505872212045, - "loss": 3.8984, + "epoch": 0.8355795148247979, + "grad_norm": 0.6900692582130432, + "learning_rate": 0.0005504651915812196, + "loss": 3.8857, "step": 7750 }, { - "epoch": 0.8395221181788828, - "grad_norm": 0.6356287002563477, - "learning_rate": 0.0005502273461911432, - "loss": 3.9032, + "epoch": 0.8409703504043127, + "grad_norm": 0.5441706776618958, + "learning_rate": 0.0005501413923367512, + "loss": 3.9005, "step": 7800 }, { - "epoch": 0.844903670218491, - "grad_norm": 0.650209367275238, - "learning_rate": 0.0005499041051610817, - "loss": 3.8769, + "epoch": 0.8463611859838275, + "grad_norm": 0.7785384654998779, + "learning_rate": 0.0005498175930922827, + "loss": 3.8865, "step": 7850 }, { - "epoch": 0.8502852222580992, - "grad_norm": 0.5765166878700256, - "learning_rate": 0.0005495808641310204, - "loss": 3.8887, + "epoch": 0.8517520215633423, + "grad_norm": 0.679541289806366, + "learning_rate": 0.0005494937938478143, + "loss": 3.8872, "step": 7900 }, { - "epoch": 0.8556667742977074, - "grad_norm": 0.6685440540313721, - "learning_rate": 0.0005492576231009589, - "loss": 3.9028, + "epoch": 0.8571428571428571, + "grad_norm": 0.5554518103599548, + "learning_rate": 0.0005491699946033459, + "loss": 3.8874, "step": 7950 }, { - "epoch": 0.8610483263373157, - "grad_norm": 0.5335162281990051, - "learning_rate": 0.0005489343820708974, - "loss": 3.8881, + "epoch": 0.862533692722372, + "grad_norm": 0.5628073215484619, + "learning_rate": 0.0005488461953588775, + "loss": 3.8652, "step": 8000 }, { - "epoch": 0.8610483263373157, - "eval_accuracy": 0.3379374270192134, - "eval_loss": 3.8148393630981445, - "eval_runtime": 184.805, - "eval_samples_per_second": 97.459, - "eval_steps_per_second": 6.093, + "epoch": 0.862533692722372, + "eval_accuracy": 0.337994469785316, + "eval_loss": 3.814110040664673, + "eval_runtime": 183.5196, + "eval_samples_per_second": 98.142, + "eval_steps_per_second": 6.136, "step": 8000 }, { - "epoch": 0.8664298783769239, - "grad_norm": 0.5469226241111755, - "learning_rate": 0.0005486111410408361, - "loss": 3.8857, + "epoch": 0.8679245283018868, + "grad_norm": 0.5652970671653748, + "learning_rate": 0.000548522396114409, + "loss": 3.8841, "step": 8050 }, { - "epoch": 0.8718114304165321, - "grad_norm": 0.5885823369026184, - "learning_rate": 0.0005482879000107746, - "loss": 3.8642, + "epoch": 0.8733153638814016, + "grad_norm": 0.7923115491867065, + "learning_rate": 0.0005481985968699406, + "loss": 3.8611, "step": 8100 }, { - "epoch": 0.8771929824561403, - "grad_norm": 0.5075781941413879, - "learning_rate": 0.0005479646589807133, - "loss": 3.8996, + "epoch": 0.8787061994609164, + "grad_norm": 0.5407063364982605, + "learning_rate": 0.0005478747976254721, + "loss": 3.8725, "step": 8150 }, { - "epoch": 0.8825745344957485, - "grad_norm": 0.5800351500511169, - "learning_rate": 0.0005476414179506518, - "loss": 3.8807, + "epoch": 0.8840970350404312, + "grad_norm": 0.5948967933654785, + "learning_rate": 0.0005475509983810037, + "loss": 3.8625, "step": 8200 }, { - "epoch": 0.8879560865353568, - "grad_norm": 0.5522460341453552, - "learning_rate": 0.0005473181769205904, - "loss": 3.8747, + "epoch": 0.889487870619946, + "grad_norm": 0.6480773091316223, + "learning_rate": 0.0005472271991365352, + "loss": 3.8802, "step": 8250 }, { - "epoch": 0.8933376385749651, - "grad_norm": 0.5287204384803772, - "learning_rate": 0.000546994935890529, - "loss": 3.861, + "epoch": 0.894878706199461, + "grad_norm": 0.5645371675491333, + "learning_rate": 0.0005469033998920669, + "loss": 3.852, "step": 8300 }, { - "epoch": 0.8987191906145733, - "grad_norm": 0.5514733791351318, - "learning_rate": 0.0005466716948604677, - "loss": 3.8641, + "epoch": 0.9002695417789758, + "grad_norm": 0.5577579140663147, + "learning_rate": 0.0005465796006475984, + "loss": 3.8686, "step": 8350 }, { - "epoch": 0.9041007426541815, - "grad_norm": 0.504178524017334, - "learning_rate": 0.0005463484538304062, - "loss": 3.862, + "epoch": 0.9056603773584906, + "grad_norm": 0.5483347773551941, + "learning_rate": 0.00054625580140313, + "loss": 3.8684, "step": 8400 }, { - "epoch": 0.9094822946937897, - "grad_norm": 0.5593713521957397, - "learning_rate": 0.0005460252128003447, - "loss": 3.8712, + "epoch": 0.9110512129380054, + "grad_norm": 0.6329666972160339, + "learning_rate": 0.0005459320021586615, + "loss": 3.8403, "step": 8450 }, { - "epoch": 0.9148638467333979, - "grad_norm": 0.6000622510910034, - "learning_rate": 0.0005457019717702833, - "loss": 3.8649, + "epoch": 0.9164420485175202, + "grad_norm": 0.5311110019683838, + "learning_rate": 0.0005456082029141932, + "loss": 3.852, "step": 8500 }, { - "epoch": 0.9202453987730062, - "grad_norm": 0.5310674905776978, - "learning_rate": 0.0005453787307402219, - "loss": 3.8597, + "epoch": 0.921832884097035, + "grad_norm": 0.5692155957221985, + "learning_rate": 0.0005452844036697248, + "loss": 3.8527, "step": 8550 }, { - "epoch": 0.9256269508126144, - "grad_norm": 0.5901429057121277, - "learning_rate": 0.0005450554897101605, - "loss": 3.8501, + "epoch": 0.9272237196765498, + "grad_norm": 0.6471691131591797, + "learning_rate": 0.0005449606044252563, + "loss": 3.8798, "step": 8600 }, { - "epoch": 0.9310085028522226, - "grad_norm": 0.5769229531288147, - "learning_rate": 0.0005447322486800991, - "loss": 3.8616, + "epoch": 0.9326145552560647, + "grad_norm": 0.6338027119636536, + "learning_rate": 0.0005446368051807879, + "loss": 3.8554, "step": 8650 }, { - "epoch": 0.9363900548918308, - "grad_norm": 0.5501662492752075, - "learning_rate": 0.0005444090076500377, - "loss": 3.8685, + "epoch": 0.9380053908355795, + "grad_norm": 0.593163013458252, + "learning_rate": 0.0005443130059363194, + "loss": 3.863, "step": 8700 }, { - "epoch": 0.941771606931439, - "grad_norm": 0.5846860408782959, - "learning_rate": 0.0005440857666199763, - "loss": 3.8416, + "epoch": 0.9433962264150944, + "grad_norm": 0.5378130674362183, + "learning_rate": 0.0005439892066918511, + "loss": 3.845, "step": 8750 }, { - "epoch": 0.9471531589710472, - "grad_norm": 0.5883833765983582, - "learning_rate": 0.0005437625255899148, - "loss": 3.8386, + "epoch": 0.9487870619946092, + "grad_norm": 0.6089158654212952, + "learning_rate": 0.0005436654074473825, + "loss": 3.8478, "step": 8800 }, { - "epoch": 0.9525347110106555, - "grad_norm": 0.5724109411239624, - "learning_rate": 0.0005434392845598534, - "loss": 3.8617, + "epoch": 0.954177897574124, + "grad_norm": 0.5586256980895996, + "learning_rate": 0.0005433416082029142, + "loss": 3.8464, "step": 8850 }, { - "epoch": 0.9579162630502637, - "grad_norm": 0.5604748129844666, - "learning_rate": 0.000543116043529792, - "loss": 3.8494, + "epoch": 0.9595687331536388, + "grad_norm": 0.6633445620536804, + "learning_rate": 0.0005430178089584457, + "loss": 3.8466, "step": 8900 }, { - "epoch": 0.9632978150898719, - "grad_norm": 0.6778799295425415, - "learning_rate": 0.0005427928024997306, - "loss": 3.85, + "epoch": 0.9649595687331537, + "grad_norm": 0.6361255049705505, + "learning_rate": 0.0005426940097139773, + "loss": 3.8484, "step": 8950 }, { - "epoch": 0.9686793671294801, - "grad_norm": 0.5963719487190247, - "learning_rate": 0.0005424695614696692, - "loss": 3.8573, + "epoch": 0.9703504043126685, + "grad_norm": 0.6028106808662415, + "learning_rate": 0.0005423702104695088, + "loss": 3.8655, "step": 9000 }, { - "epoch": 0.9686793671294801, - "eval_accuracy": 0.34179547375973, - "eval_loss": 3.7812108993530273, - "eval_runtime": 184.5862, - "eval_samples_per_second": 97.575, - "eval_steps_per_second": 6.1, + "epoch": 0.9703504043126685, + "eval_accuracy": 0.3412723101049033, + "eval_loss": 3.778677463531494, + "eval_runtime": 183.8289, + "eval_samples_per_second": 97.977, + "eval_steps_per_second": 6.125, "step": 9000 }, { - "epoch": 0.9740609191690883, - "grad_norm": 0.5469352006912231, - "learning_rate": 0.0005421463204396078, - "loss": 3.8486, + "epoch": 0.9757412398921833, + "grad_norm": 0.6262213587760925, + "learning_rate": 0.0005420528872099298, + "loss": 3.8415, "step": 9050 }, { - "epoch": 0.9794424712086965, - "grad_norm": 0.6023728251457214, - "learning_rate": 0.0005418230794095463, - "loss": 3.8536, + "epoch": 0.9811320754716981, + "grad_norm": 0.5979717373847961, + "learning_rate": 0.0005417290879654613, + "loss": 3.8327, "step": 9100 }, { - "epoch": 0.9848240232483048, - "grad_norm": 0.5906918048858643, - "learning_rate": 0.000541499838379485, - "loss": 3.8355, + "epoch": 0.9865229110512129, + "grad_norm": 0.5795934200286865, + "learning_rate": 0.000541405288720993, + "loss": 3.845, "step": 9150 }, { - "epoch": 0.9902055752879131, - "grad_norm": 0.5566977858543396, - "learning_rate": 0.0005411765973494235, - "loss": 3.8161, + "epoch": 0.9919137466307277, + "grad_norm": 0.5323654413223267, + "learning_rate": 0.0005410814894765245, + "loss": 3.8479, "step": 9200 }, { - "epoch": 0.9955871273275213, - "grad_norm": 0.6707938313484192, - "learning_rate": 0.0005408533563193621, - "loss": 3.8442, + "epoch": 0.9973045822102425, + "grad_norm": 0.5347334146499634, + "learning_rate": 0.0005407576902320561, + "loss": 3.8378, "step": 9250 }, { - "epoch": 1.0009686793671295, - "grad_norm": 0.5935998558998108, - "learning_rate": 0.0005405301152893007, - "loss": 3.8425, + "epoch": 1.0026954177897573, + "grad_norm": 0.5488175749778748, + "learning_rate": 0.0005404338909875876, + "loss": 3.8159, "step": 9300 }, { - "epoch": 1.0063502314067376, - "grad_norm": 0.7090808153152466, - "learning_rate": 0.0005402068742592392, - "loss": 3.7753, + "epoch": 1.0080862533692723, + "grad_norm": 0.5864335298538208, + "learning_rate": 0.0005401100917431192, + "loss": 3.7809, "step": 9350 }, { - "epoch": 1.011731783446346, - "grad_norm": 0.5787277221679688, - "learning_rate": 0.0005398836332291779, - "loss": 3.7816, + "epoch": 1.013477088948787, + "grad_norm": 0.5702503323554993, + "learning_rate": 0.0005397862924986508, + "loss": 3.79, "step": 9400 }, { - "epoch": 1.017113335485954, - "grad_norm": 0.5593772530555725, - "learning_rate": 0.0005395603921991164, - "loss": 3.7644, + "epoch": 1.0188679245283019, + "grad_norm": 0.549822986125946, + "learning_rate": 0.0005394624932541824, + "loss": 3.7797, "step": 9450 }, { - "epoch": 1.0224948875255624, - "grad_norm": 0.616612434387207, - "learning_rate": 0.0005392371511690551, - "loss": 3.7622, + "epoch": 1.0242587601078168, + "grad_norm": 0.5751377940177917, + "learning_rate": 0.0005391386940097139, + "loss": 3.7775, "step": 9500 }, { - "epoch": 1.0278764395651705, - "grad_norm": 0.6132011413574219, - "learning_rate": 0.0005389139101389936, - "loss": 3.7698, + "epoch": 1.0296495956873315, + "grad_norm": 0.5424072742462158, + "learning_rate": 0.0005388148947652455, + "loss": 3.7885, "step": 9550 }, { - "epoch": 1.0332579916047788, - "grad_norm": 0.583960771560669, - "learning_rate": 0.0005385906691089321, - "loss": 3.7711, + "epoch": 1.0350404312668464, + "grad_norm": 0.6139684915542603, + "learning_rate": 0.000538491095520777, + "loss": 3.7689, "step": 9600 }, { - "epoch": 1.0386395436443872, - "grad_norm": 0.5626878142356873, - "learning_rate": 0.0005382674280788708, - "loss": 3.7675, + "epoch": 1.0404312668463611, + "grad_norm": 0.507134199142456, + "learning_rate": 0.0005381672962763086, + "loss": 3.7627, "step": 9650 }, { - "epoch": 1.0440210956839953, - "grad_norm": 0.6028419733047485, - "learning_rate": 0.0005379441870488093, - "loss": 3.7545, + "epoch": 1.045822102425876, + "grad_norm": 0.5824893712997437, + "learning_rate": 0.0005378434970318403, + "loss": 3.7732, "step": 9700 }, { - "epoch": 1.0494026477236036, - "grad_norm": 0.5416905283927917, - "learning_rate": 0.0005376209460187479, - "loss": 3.7607, + "epoch": 1.0512129380053907, + "grad_norm": 0.6038205027580261, + "learning_rate": 0.0005375196977873718, + "loss": 3.766, "step": 9750 }, { - "epoch": 1.0547841997632117, - "grad_norm": 0.571316123008728, - "learning_rate": 0.0005372977049886865, - "loss": 3.7879, + "epoch": 1.0566037735849056, + "grad_norm": 0.5839976668357849, + "learning_rate": 0.0005371958985429034, + "loss": 3.7708, "step": 9800 }, { - "epoch": 1.06016575180282, - "grad_norm": 0.6111052632331848, - "learning_rate": 0.0005369744639586251, - "loss": 3.7666, + "epoch": 1.0619946091644206, + "grad_norm": 0.6339398622512817, + "learning_rate": 0.0005368720992984349, + "loss": 3.7547, "step": 9850 }, { - "epoch": 1.0655473038424281, - "grad_norm": 0.7614707350730896, - "learning_rate": 0.0005366512229285637, - "loss": 3.7689, + "epoch": 1.0673854447439353, + "grad_norm": 0.6098514199256897, + "learning_rate": 0.0005365483000539665, + "loss": 3.7632, "step": 9900 }, { - "epoch": 1.0709288558820365, - "grad_norm": 0.541038990020752, - "learning_rate": 0.0005363279818985022, - "loss": 3.756, + "epoch": 1.0727762803234502, + "grad_norm": 0.6129767298698425, + "learning_rate": 0.0005362245008094981, + "loss": 3.7648, "step": 9950 }, { - "epoch": 1.0763104079216446, - "grad_norm": 0.5469908714294434, - "learning_rate": 0.0005360047408684408, - "loss": 3.7641, + "epoch": 1.0781671159029649, + "grad_norm": 0.6090075969696045, + "learning_rate": 0.0005359007015650297, + "loss": 3.7691, "step": 10000 }, { - "epoch": 1.0763104079216446, - "eval_accuracy": 0.3449987781982765, - "eval_loss": 3.7476563453674316, - "eval_runtime": 184.5322, - "eval_samples_per_second": 97.604, - "eval_steps_per_second": 6.102, + "epoch": 1.0781671159029649, + "eval_accuracy": 0.3452738873102228, + "eval_loss": 3.7464487552642822, + "eval_runtime": 183.622, + "eval_samples_per_second": 98.087, + "eval_steps_per_second": 6.132, "step": 10000 }, { - "epoch": 1.081691959961253, - "grad_norm": 0.5756956338882446, - "learning_rate": 0.0005356814998383794, - "loss": 3.7808, + "epoch": 1.0835579514824798, + "grad_norm": 0.5005014538764954, + "learning_rate": 0.0005355769023205612, + "loss": 3.7581, "step": 10050 }, { - "epoch": 1.087073512000861, - "grad_norm": 0.5653090476989746, - "learning_rate": 0.0005353582588083181, - "loss": 3.7728, + "epoch": 1.0889487870619945, + "grad_norm": 0.6306165456771851, + "learning_rate": 0.0005352531030760928, + "loss": 3.7484, "step": 10100 }, { - "epoch": 1.0924550640404693, - "grad_norm": 0.5604647994041443, - "learning_rate": 0.0005350350177782566, - "loss": 3.7572, + "epoch": 1.0943396226415094, + "grad_norm": 0.5631089806556702, + "learning_rate": 0.0005349293038316244, + "loss": 3.7815, "step": 10150 }, { - "epoch": 1.0978366160800774, - "grad_norm": 0.6427847743034363, - "learning_rate": 0.0005347117767481952, - "loss": 3.7671, + "epoch": 1.0997304582210243, + "grad_norm": 0.57806396484375, + "learning_rate": 0.0005346055045871559, + "loss": 3.7813, "step": 10200 }, { - "epoch": 1.1032181681196858, - "grad_norm": 0.5688785910606384, - "learning_rate": 0.000534395000538735, - "loss": 3.7718, + "epoch": 1.105121293800539, + "grad_norm": 0.5878326296806335, + "learning_rate": 0.0005342817053426874, + "loss": 3.7473, "step": 10250 }, { - "epoch": 1.1085997201592939, - "grad_norm": 0.6266538500785828, - "learning_rate": 0.0005340717595086736, - "loss": 3.7582, + "epoch": 1.110512129380054, + "grad_norm": 0.6222975850105286, + "learning_rate": 0.0005339579060982191, + "loss": 3.7746, "step": 10300 }, { - "epoch": 1.1139812721989022, - "grad_norm": 0.6398711800575256, - "learning_rate": 0.0005337485184786122, - "loss": 3.7626, + "epoch": 1.1159029649595686, + "grad_norm": 0.5526645183563232, + "learning_rate": 0.0005336341068537506, + "loss": 3.7748, "step": 10350 }, { - "epoch": 1.1193628242385103, - "grad_norm": 0.5167144536972046, - "learning_rate": 0.0005334252774485507, - "loss": 3.7692, + "epoch": 1.1212938005390836, + "grad_norm": 0.5977159142494202, + "learning_rate": 0.0005333103076092822, + "loss": 3.759, "step": 10400 }, { - "epoch": 1.1247443762781186, - "grad_norm": 0.5785194039344788, - "learning_rate": 0.0005331020364184894, - "loss": 3.7633, + "epoch": 1.1266846361185983, + "grad_norm": 0.5398628115653992, + "learning_rate": 0.0005329865083648137, + "loss": 3.7247, "step": 10450 }, { - "epoch": 1.1301259283177267, - "grad_norm": 0.5897461175918579, - "learning_rate": 0.0005327787953884279, - "loss": 3.7521, + "epoch": 1.1320754716981132, + "grad_norm": 0.5337214469909668, + "learning_rate": 0.0005326627091203454, + "loss": 3.7499, "step": 10500 }, { - "epoch": 1.135507480357335, - "grad_norm": 0.5663477182388306, - "learning_rate": 0.0005324555543583665, - "loss": 3.7543, + "epoch": 1.137466307277628, + "grad_norm": 0.5868701338768005, + "learning_rate": 0.0005323389098758769, + "loss": 3.7506, "step": 10550 }, { - "epoch": 1.1408890323969434, - "grad_norm": 0.565779983997345, - "learning_rate": 0.0005321323133283051, - "loss": 3.7576, + "epoch": 1.1428571428571428, + "grad_norm": 0.6092649698257446, + "learning_rate": 0.0005320151106314085, + "loss": 3.7578, "step": 10600 }, { - "epoch": 1.1462705844365515, - "grad_norm": 0.5521111488342285, - "learning_rate": 0.0005318090722982436, - "loss": 3.7509, + "epoch": 1.1482479784366577, + "grad_norm": 0.5305042862892151, + "learning_rate": 0.00053169131138694, + "loss": 3.7657, "step": 10650 }, { - "epoch": 1.1516521364761596, - "grad_norm": 0.5536676645278931, - "learning_rate": 0.0005314858312681823, - "loss": 3.7664, + "epoch": 1.1536388140161726, + "grad_norm": 0.5675718784332275, + "learning_rate": 0.0005313675121424716, + "loss": 3.7481, "step": 10700 }, { - "epoch": 1.157033688515768, - "grad_norm": 0.5922389030456543, - "learning_rate": 0.0005311625902381209, - "loss": 3.7486, + "epoch": 1.1590296495956873, + "grad_norm": 0.601825475692749, + "learning_rate": 0.0005310437128980032, + "loss": 3.7453, "step": 10750 }, { - "epoch": 1.1624152405553763, - "grad_norm": 0.5422095656394958, - "learning_rate": 0.0005308393492080595, - "loss": 3.7454, + "epoch": 1.1644204851752022, + "grad_norm": 0.5961639881134033, + "learning_rate": 0.0005307199136535348, + "loss": 3.7255, "step": 10800 }, { - "epoch": 1.1677967925949844, - "grad_norm": 0.5556425452232361, - "learning_rate": 0.000530516108177998, - "loss": 3.7493, + "epoch": 1.169811320754717, + "grad_norm": 0.5785391330718994, + "learning_rate": 0.0005303961144090663, + "loss": 3.7645, "step": 10850 }, { - "epoch": 1.1731783446345927, - "grad_norm": 0.589709997177124, - "learning_rate": 0.0005301928671479365, - "loss": 3.7442, + "epoch": 1.1752021563342319, + "grad_norm": 0.5923433303833008, + "learning_rate": 0.0005300723151645979, + "loss": 3.7614, "step": 10900 }, { - "epoch": 1.1785598966742008, - "grad_norm": 0.6388732194900513, - "learning_rate": 0.0005298696261178752, - "loss": 3.7656, + "epoch": 1.1805929919137466, + "grad_norm": 0.610974133014679, + "learning_rate": 0.0005297485159201295, + "loss": 3.7204, "step": 10950 }, { - "epoch": 1.1839414487138091, - "grad_norm": 0.6585466265678406, - "learning_rate": 0.0005295463850878138, - "loss": 3.754, + "epoch": 1.1859838274932615, + "grad_norm": 0.537177562713623, + "learning_rate": 0.000529424716675661, + "loss": 3.7393, "step": 11000 }, { - "epoch": 1.1839414487138091, - "eval_accuracy": 0.3477161869225167, - "eval_loss": 3.722252130508423, - "eval_runtime": 184.2851, - "eval_samples_per_second": 97.734, - "eval_steps_per_second": 6.11, + "epoch": 1.1859838274932615, + "eval_accuracy": 0.34715510340984274, + "eval_loss": 3.7238121032714844, + "eval_runtime": 183.3902, + "eval_samples_per_second": 98.211, + "eval_steps_per_second": 6.14, "step": 11000 }, { - "epoch": 1.1893230007534172, - "grad_norm": 0.5404320359230042, - "learning_rate": 0.0005292231440577524, - "loss": 3.7624, + "epoch": 1.1913746630727764, + "grad_norm": 0.5162882208824158, + "learning_rate": 0.000529107393416082, + "loss": 3.749, "step": 11050 }, { - "epoch": 1.1947045527930256, - "grad_norm": 0.5776370167732239, - "learning_rate": 0.0005288999030276909, - "loss": 3.7492, + "epoch": 1.196765498652291, + "grad_norm": 0.5799063444137573, + "learning_rate": 0.0005287835941716135, + "loss": 3.7425, "step": 11100 }, { - "epoch": 1.2000861048326337, - "grad_norm": 0.6025657653808594, - "learning_rate": 0.0005285766619976295, - "loss": 3.7407, + "epoch": 1.202156334231806, + "grad_norm": 0.5635416507720947, + "learning_rate": 0.0005284597949271452, + "loss": 3.7357, "step": 11150 }, { - "epoch": 1.205467656872242, - "grad_norm": 0.5973086357116699, - "learning_rate": 0.0005282534209675681, - "loss": 3.736, + "epoch": 1.2075471698113207, + "grad_norm": 0.6384708881378174, + "learning_rate": 0.0005281359956826767, + "loss": 3.7503, "step": 11200 }, { - "epoch": 1.21084920891185, - "grad_norm": 0.5211036801338196, - "learning_rate": 0.0005279301799375066, - "loss": 3.7491, + "epoch": 1.2129380053908356, + "grad_norm": 0.5484032034873962, + "learning_rate": 0.0005278121964382083, + "loss": 3.7588, "step": 11250 }, { - "epoch": 1.2162307609514584, - "grad_norm": 0.5768703818321228, - "learning_rate": 0.0005276069389074453, - "loss": 3.7408, + "epoch": 1.2183288409703503, + "grad_norm": 0.6434171795845032, + "learning_rate": 0.0005274883971937398, + "loss": 3.7291, "step": 11300 }, { - "epoch": 1.2216123129910665, - "grad_norm": 0.5568981766700745, - "learning_rate": 0.0005272836978773838, - "loss": 3.7406, + "epoch": 1.2237196765498652, + "grad_norm": 0.5906100869178772, + "learning_rate": 0.0005271645979492714, + "loss": 3.7504, "step": 11350 }, { - "epoch": 1.2269938650306749, - "grad_norm": 0.5531343221664429, - "learning_rate": 0.0005269604568473225, - "loss": 3.7345, + "epoch": 1.2291105121293802, + "grad_norm": 0.58035808801651, + "learning_rate": 0.000526840798704803, + "loss": 3.7506, "step": 11400 }, { - "epoch": 1.232375417070283, - "grad_norm": 0.6032938957214355, - "learning_rate": 0.000526637215817261, - "loss": 3.7292, + "epoch": 1.2345013477088949, + "grad_norm": 0.5390593409538269, + "learning_rate": 0.0005265169994603346, + "loss": 3.7528, "step": 11450 }, { - "epoch": 1.2377569691098913, - "grad_norm": 0.5232219099998474, - "learning_rate": 0.0005263139747871996, - "loss": 3.7334, + "epoch": 1.2398921832884098, + "grad_norm": 0.5824751257896423, + "learning_rate": 0.0005261932002158661, + "loss": 3.7185, "step": 11500 }, { - "epoch": 1.2431385211494996, - "grad_norm": 0.5911790728569031, - "learning_rate": 0.0005259907337571381, - "loss": 3.7533, + "epoch": 1.2452830188679245, + "grad_norm": 0.5806322693824768, + "learning_rate": 0.0005258694009713977, + "loss": 3.7404, "step": 11550 }, { - "epoch": 1.2485200731891077, - "grad_norm": 0.5355823636054993, - "learning_rate": 0.000525673957547678, - "loss": 3.7246, + "epoch": 1.2506738544474394, + "grad_norm": 0.5735841989517212, + "learning_rate": 0.0005255456017269292, + "loss": 3.7371, "step": 11600 }, { - "epoch": 1.2539016252287158, - "grad_norm": 0.4860752522945404, - "learning_rate": 0.0005253507165176167, - "loss": 3.752, + "epoch": 1.256064690026954, + "grad_norm": 0.5399126410484314, + "learning_rate": 0.0005252218024824608, + "loss": 3.7348, "step": 11650 }, { - "epoch": 1.2592831772683242, - "grad_norm": 0.5607491731643677, - "learning_rate": 0.0005250274754875552, - "loss": 3.731, + "epoch": 1.261455525606469, + "grad_norm": 0.581797182559967, + "learning_rate": 0.0005248980032379924, + "loss": 3.7357, "step": 11700 }, { - "epoch": 1.2646647293079325, - "grad_norm": 0.5975137948989868, - "learning_rate": 0.0005247042344574938, - "loss": 3.7511, + "epoch": 1.266846361185984, + "grad_norm": 0.6158117651939392, + "learning_rate": 0.000524574203993524, + "loss": 3.7435, "step": 11750 }, { - "epoch": 1.2700462813475406, - "grad_norm": 0.5998445749282837, - "learning_rate": 0.0005243809934274323, - "loss": 3.7465, + "epoch": 1.2722371967654986, + "grad_norm": 0.5718212723731995, + "learning_rate": 0.0005242504047490555, + "loss": 3.7316, "step": 11800 }, { - "epoch": 1.275427833387149, - "grad_norm": 0.6529081463813782, - "learning_rate": 0.0005240577523973709, - "loss": 3.7262, + "epoch": 1.2776280323450135, + "grad_norm": 0.5981943011283875, + "learning_rate": 0.0005239266055045871, + "loss": 3.7131, "step": 11850 }, { - "epoch": 1.280809385426757, - "grad_norm": 0.5797929167747498, - "learning_rate": 0.0005237345113673095, - "loss": 3.7591, + "epoch": 1.2830188679245282, + "grad_norm": 0.48539048433303833, + "learning_rate": 0.0005236028062601186, + "loss": 3.7193, "step": 11900 }, { - "epoch": 1.2861909374663654, - "grad_norm": 0.5823287963867188, - "learning_rate": 0.0005234112703372481, - "loss": 3.722, + "epoch": 1.2884097035040432, + "grad_norm": 0.5573267936706543, + "learning_rate": 0.0005232790070156503, + "loss": 3.7284, "step": 11950 }, { - "epoch": 1.2915724895059735, - "grad_norm": 0.7930464148521423, - "learning_rate": 0.0005230880293071867, - "loss": 3.7378, + "epoch": 1.2938005390835579, + "grad_norm": 0.5594388246536255, + "learning_rate": 0.0005229552077711818, + "loss": 3.7248, "step": 12000 }, { - "epoch": 1.2915724895059735, - "eval_accuracy": 0.34940867295600286, - "eval_loss": 3.700629234313965, - "eval_runtime": 184.3929, - "eval_samples_per_second": 97.677, - "eval_steps_per_second": 6.107, + "epoch": 1.2938005390835579, + "eval_accuracy": 0.3495350362645311, + "eval_loss": 3.6991188526153564, + "eval_runtime": 183.7777, + "eval_samples_per_second": 98.004, + "eval_steps_per_second": 6.127, "step": 12000 }, { - "epoch": 1.2969540415455818, - "grad_norm": 0.49873632192611694, - "learning_rate": 0.0005227647882771253, - "loss": 3.7239, + "epoch": 1.2991913746630728, + "grad_norm": 0.5452162623405457, + "learning_rate": 0.0005226314085267134, + "loss": 3.7048, "step": 12050 }, { - "epoch": 1.30233559358519, - "grad_norm": 0.5773414373397827, - "learning_rate": 0.0005224415472470639, - "loss": 3.7392, + "epoch": 1.3045822102425877, + "grad_norm": 0.6047404408454895, + "learning_rate": 0.000522307609282245, + "loss": 3.7436, "step": 12100 }, { - "epoch": 1.3077171456247982, - "grad_norm": 0.6267489194869995, - "learning_rate": 0.0005221183062170024, - "loss": 3.7183, + "epoch": 1.3099730458221024, + "grad_norm": 0.5165073871612549, + "learning_rate": 0.0005219838100377766, + "loss": 3.7241, "step": 12150 }, { - "epoch": 1.3130986976644063, - "grad_norm": 0.5878105759620667, - "learning_rate": 0.0005217950651869409, - "loss": 3.7342, + "epoch": 1.3153638814016173, + "grad_norm": 0.5657083988189697, + "learning_rate": 0.000521660010793308, + "loss": 3.7097, "step": 12200 }, { - "epoch": 1.3184802497040147, - "grad_norm": 0.5103281736373901, - "learning_rate": 0.0005214718241568796, - "loss": 3.7391, + "epoch": 1.320754716981132, + "grad_norm": 0.6331221461296082, + "learning_rate": 0.0005213362115488396, + "loss": 3.7363, "step": 12250 }, { - "epoch": 1.3238618017436228, - "grad_norm": 0.5685731768608093, - "learning_rate": 0.0005211485831268182, - "loss": 3.7023, + "epoch": 1.326145552560647, + "grad_norm": 0.5557056069374084, + "learning_rate": 0.0005210124123043713, + "loss": 3.7024, "step": 12300 }, { - "epoch": 1.329243353783231, - "grad_norm": 0.5740435123443604, - "learning_rate": 0.0005208253420967568, - "loss": 3.7269, + "epoch": 1.3315363881401616, + "grad_norm": 0.5646718144416809, + "learning_rate": 0.0005206886130599028, + "loss": 3.7284, "step": 12350 }, { - "epoch": 1.3346249058228392, - "grad_norm": 0.5537181496620178, - "learning_rate": 0.0005205021010666953, - "loss": 3.7228, + "epoch": 1.3369272237196765, + "grad_norm": 0.5577690601348877, + "learning_rate": 0.0005203648138154344, + "loss": 3.699, "step": 12400 }, { - "epoch": 1.3400064578624475, - "grad_norm": 0.6004145741462708, - "learning_rate": 0.0005201788600366339, - "loss": 3.7306, + "epoch": 1.3423180592991915, + "grad_norm": 0.532707154750824, + "learning_rate": 0.0005200410145709659, + "loss": 3.7212, "step": 12450 }, { - "epoch": 1.3453880099020559, - "grad_norm": 0.5789647698402405, - "learning_rate": 0.0005198556190065725, - "loss": 3.7216, + "epoch": 1.3477088948787062, + "grad_norm": 0.5770918130874634, + "learning_rate": 0.0005197172153264976, + "loss": 3.7258, "step": 12500 }, { - "epoch": 1.350769561941664, - "grad_norm": 0.5887240767478943, - "learning_rate": 0.0005195323779765112, - "loss": 3.7198, + "epoch": 1.353099730458221, + "grad_norm": 0.548803448677063, + "learning_rate": 0.0005193934160820291, + "loss": 3.7393, "step": 12550 }, { - "epoch": 1.356151113981272, - "grad_norm": 0.5683675408363342, - "learning_rate": 0.0005192091369464497, - "loss": 3.7304, + "epoch": 1.3584905660377358, + "grad_norm": 0.5411361455917358, + "learning_rate": 0.0005190696168375607, + "loss": 3.7124, "step": 12600 }, { - "epoch": 1.3615326660208804, - "grad_norm": 0.5594485402107239, - "learning_rate": 0.0005188858959163882, - "loss": 3.7167, + "epoch": 1.3638814016172507, + "grad_norm": 0.5823306441307068, + "learning_rate": 0.0005187458175930922, + "loss": 3.7102, "step": 12650 }, { - "epoch": 1.3669142180604887, - "grad_norm": 0.6306636333465576, - "learning_rate": 0.0005185626548863269, - "loss": 3.7207, + "epoch": 1.3692722371967654, + "grad_norm": 0.5538209676742554, + "learning_rate": 0.0005184220183486238, + "loss": 3.7083, "step": 12700 }, { - "epoch": 1.3722957701000968, - "grad_norm": 0.5611990094184875, - "learning_rate": 0.0005182394138562654, - "loss": 3.723, + "epoch": 1.3746630727762803, + "grad_norm": 0.5278885960578918, + "learning_rate": 0.0005180982191041554, + "loss": 3.7216, "step": 12750 }, { - "epoch": 1.3776773221397052, - "grad_norm": 0.5362935662269592, - "learning_rate": 0.0005179161728262041, - "loss": 3.7171, + "epoch": 1.3800539083557952, + "grad_norm": 0.570948600769043, + "learning_rate": 0.000517774419859687, + "loss": 3.7147, "step": 12800 }, { - "epoch": 1.3830588741793133, - "grad_norm": 0.5498529672622681, - "learning_rate": 0.0005175929317961426, - "loss": 3.72, + "epoch": 1.38544474393531, + "grad_norm": 0.6416262984275818, + "learning_rate": 0.0005174506206152185, + "loss": 3.7362, "step": 12850 }, { - "epoch": 1.3884404262189216, - "grad_norm": 0.5442519783973694, - "learning_rate": 0.0005172696907660812, - "loss": 3.7128, + "epoch": 1.3908355795148248, + "grad_norm": 0.658841073513031, + "learning_rate": 0.0005171268213707501, + "loss": 3.7334, "step": 12900 }, { - "epoch": 1.3938219782585297, - "grad_norm": 0.5849413871765137, - "learning_rate": 0.0005169464497360198, - "loss": 3.7118, + "epoch": 1.3962264150943398, + "grad_norm": 0.5554807782173157, + "learning_rate": 0.0005168030221262816, + "loss": 3.7339, "step": 12950 }, { - "epoch": 1.399203530298138, - "grad_norm": 0.5864673852920532, - "learning_rate": 0.0005166232087059583, - "loss": 3.7056, + "epoch": 1.4016172506738545, + "grad_norm": 0.5637690424919128, + "learning_rate": 0.0005164792228818132, + "loss": 3.7151, "step": 13000 }, { - "epoch": 1.399203530298138, - "eval_accuracy": 0.3516231274625498, - "eval_loss": 3.67989444732666, - "eval_runtime": 184.826, - "eval_samples_per_second": 97.448, - "eval_steps_per_second": 6.092, + "epoch": 1.4016172506738545, + "eval_accuracy": 0.3519373516141093, + "eval_loss": 3.6800601482391357, + "eval_runtime": 183.4932, + "eval_samples_per_second": 98.156, + "eval_steps_per_second": 6.136, "step": 13000 }, { - "epoch": 1.4045850823377461, - "grad_norm": 0.5450125336647034, - "learning_rate": 0.0005162999676758969, - "loss": 3.7177, + "epoch": 1.4070080862533692, + "grad_norm": 0.6130073666572571, + "learning_rate": 0.0005161554236373448, + "loss": 3.6973, "step": 13050 }, { - "epoch": 1.4099666343773545, - "grad_norm": 0.5438733100891113, - "learning_rate": 0.0005159767266458355, - "loss": 3.7222, + "epoch": 1.412398921832884, + "grad_norm": 0.5765049457550049, + "learning_rate": 0.0005158381003777657, + "loss": 3.7145, "step": 13100 }, { - "epoch": 1.4153481864169626, - "grad_norm": 0.5453888773918152, - "learning_rate": 0.0005156534856157741, - "loss": 3.708, + "epoch": 1.417789757412399, + "grad_norm": 0.5363691449165344, + "learning_rate": 0.0005155143011332973, + "loss": 3.7161, "step": 13150 }, { - "epoch": 1.420729738456571, - "grad_norm": 0.572364330291748, - "learning_rate": 0.0005153302445857127, - "loss": 3.7001, + "epoch": 1.4231805929919137, + "grad_norm": 0.5675169229507446, + "learning_rate": 0.0005151905018888289, + "loss": 3.7403, "step": 13200 }, { - "epoch": 1.426111290496179, - "grad_norm": 0.5122117400169373, - "learning_rate": 0.0005150070035556513, - "loss": 3.7032, + "epoch": 1.4285714285714286, + "grad_norm": 0.594634473323822, + "learning_rate": 0.0005148667026443604, + "loss": 3.6966, "step": 13250 }, { - "epoch": 1.4314928425357873, - "grad_norm": 0.566405713558197, - "learning_rate": 0.0005146837625255898, - "loss": 3.7114, + "epoch": 1.4339622641509435, + "grad_norm": 0.5782607793807983, + "learning_rate": 0.000514542903399892, + "loss": 3.7007, "step": 13300 }, { - "epoch": 1.4368743945753955, - "grad_norm": 0.5422384142875671, - "learning_rate": 0.0005143605214955285, - "loss": 3.7076, + "epoch": 1.4393530997304582, + "grad_norm": 0.6014611721038818, + "learning_rate": 0.0005142191041554237, + "loss": 3.6826, "step": 13350 }, { - "epoch": 1.4422559466150038, - "grad_norm": 0.5637435913085938, - "learning_rate": 0.0005140372804654671, - "loss": 3.6984, + "epoch": 1.444743935309973, + "grad_norm": 0.5752432346343994, + "learning_rate": 0.0005138953049109552, + "loss": 3.7229, "step": 13400 }, { - "epoch": 1.447637498654612, - "grad_norm": 0.5512182712554932, - "learning_rate": 0.0005137140394354056, - "loss": 3.6969, + "epoch": 1.4501347708894878, + "grad_norm": 0.6054044961929321, + "learning_rate": 0.0005135715056664868, + "loss": 3.7175, "step": 13450 }, { - "epoch": 1.4530190506942202, - "grad_norm": 0.5623869299888611, - "learning_rate": 0.0005133907984053442, - "loss": 3.6991, + "epoch": 1.4555256064690028, + "grad_norm": 0.6118409037590027, + "learning_rate": 0.0005132477064220183, + "loss": 3.7031, "step": 13500 }, { - "epoch": 1.4584006027338283, - "grad_norm": 0.5506566762924194, - "learning_rate": 0.0005130675573752827, - "loss": 3.7273, + "epoch": 1.4609164420485174, + "grad_norm": 0.5970998406410217, + "learning_rate": 0.0005129239071775499, + "loss": 3.7209, "step": 13550 }, { - "epoch": 1.4637821547734367, - "grad_norm": 0.5795466303825378, - "learning_rate": 0.0005127443163452214, - "loss": 3.7172, + "epoch": 1.4663072776280324, + "grad_norm": 0.5053044557571411, + "learning_rate": 0.0005126001079330814, + "loss": 3.7031, "step": 13600 }, { - "epoch": 1.469163706813045, - "grad_norm": 0.5493975877761841, - "learning_rate": 0.00051242107531516, - "loss": 3.6948, + "epoch": 1.4716981132075473, + "grad_norm": 0.5769761204719543, + "learning_rate": 0.000512276308688613, + "loss": 3.7068, "step": 13650 }, { - "epoch": 1.474545258852653, - "grad_norm": 0.5260751247406006, - "learning_rate": 0.0005120978342850986, - "loss": 3.7049, + "epoch": 1.477088948787062, + "grad_norm": 0.5561441779136658, + "learning_rate": 0.0005119525094441446, + "loss": 3.7206, "step": 13700 }, { - "epoch": 1.4799268108922612, - "grad_norm": 0.5563033223152161, - "learning_rate": 0.0005117745932550371, - "loss": 3.7076, + "epoch": 1.482479784366577, + "grad_norm": 0.5928412079811096, + "learning_rate": 0.0005116287101996762, + "loss": 3.7031, "step": 13750 }, { - "epoch": 1.4853083629318695, - "grad_norm": 0.5787496566772461, - "learning_rate": 0.0005114513522249758, - "loss": 3.71, + "epoch": 1.4878706199460916, + "grad_norm": 0.5242429971694946, + "learning_rate": 0.0005113049109552077, + "loss": 3.7018, "step": 13800 }, { - "epoch": 1.4906899149714778, - "grad_norm": 0.5059001445770264, - "learning_rate": 0.0005111281111949143, - "loss": 3.684, + "epoch": 1.4932614555256065, + "grad_norm": 0.5548174977302551, + "learning_rate": 0.0005109811117107393, + "loss": 3.697, "step": 13850 }, { - "epoch": 1.496071467011086, - "grad_norm": 0.5558372735977173, - "learning_rate": 0.0005108048701648528, - "loss": 3.7065, + "epoch": 1.4986522911051212, + "grad_norm": 0.6115781664848328, + "learning_rate": 0.0005106573124662708, + "loss": 3.6875, "step": 13900 }, { - "epoch": 1.501453019050694, - "grad_norm": 0.5342027544975281, - "learning_rate": 0.0005104816291347915, - "loss": 3.6999, + "epoch": 1.5040431266846361, + "grad_norm": 0.536693811416626, + "learning_rate": 0.0005103335132218025, + "loss": 3.704, "step": 13950 }, { - "epoch": 1.5068345710903024, - "grad_norm": 0.5543688535690308, - "learning_rate": 0.00051015838810473, - "loss": 3.7016, + "epoch": 1.509433962264151, + "grad_norm": 0.5926096439361572, + "learning_rate": 0.000510009713977334, + "loss": 3.6969, "step": 14000 }, { - "epoch": 1.5068345710903024, - "eval_accuracy": 0.35396144626120524, - "eval_loss": 3.656766176223755, - "eval_runtime": 184.2793, - "eval_samples_per_second": 97.737, - "eval_steps_per_second": 6.11, + "epoch": 1.509433962264151, + "eval_accuracy": 0.3541569128063835, + "eval_loss": 3.6548423767089844, + "eval_runtime": 183.7963, + "eval_samples_per_second": 97.994, + "eval_steps_per_second": 6.126, "step": 14000 }, { - "epoch": 1.5122161231299107, - "grad_norm": 0.5722943544387817, - "learning_rate": 0.0005098351470746687, - "loss": 3.6839, + "epoch": 1.5148247978436657, + "grad_norm": 0.5280314087867737, + "learning_rate": 0.0005096859147328656, + "loss": 3.6741, "step": 14050 }, { - "epoch": 1.5175976751695188, - "grad_norm": 0.5564660429954529, - "learning_rate": 0.0005095119060446072, - "loss": 3.6961, + "epoch": 1.5202156334231804, + "grad_norm": 0.5069935321807861, + "learning_rate": 0.0005093621154883971, + "loss": 3.707, "step": 14100 }, { - "epoch": 1.5229792272091272, - "grad_norm": 0.5130824446678162, - "learning_rate": 0.0005091886650145458, - "loss": 3.6933, + "epoch": 1.5256064690026954, + "grad_norm": 0.5404360890388489, + "learning_rate": 0.0005090383162439288, + "loss": 3.6867, "step": 14150 }, { - "epoch": 1.5283607792487355, - "grad_norm": 0.5731927156448364, - "learning_rate": 0.0005088654239844844, - "loss": 3.6865, + "epoch": 1.5309973045822103, + "grad_norm": 0.5273117423057556, + "learning_rate": 0.0005087145169994602, + "loss": 3.6862, "step": 14200 }, { - "epoch": 1.5337423312883436, - "grad_norm": 0.5487149357795715, - "learning_rate": 0.0005085421829544229, - "loss": 3.675, + "epoch": 1.536388140161725, + "grad_norm": 0.5154308676719666, + "learning_rate": 0.0005083907177549918, + "loss": 3.6916, "step": 14250 }, { - "epoch": 1.5391238833279517, - "grad_norm": 0.5173348188400269, - "learning_rate": 0.0005082189419243616, - "loss": 3.7095, + "epoch": 1.54177897574124, + "grad_norm": 0.5643147230148315, + "learning_rate": 0.0005080669185105234, + "loss": 3.701, "step": 14300 }, { - "epoch": 1.54450543536756, - "grad_norm": 0.5780784487724304, - "learning_rate": 0.0005078957008943001, - "loss": 3.6955, + "epoch": 1.5471698113207548, + "grad_norm": 0.5398867130279541, + "learning_rate": 0.000507743119266055, + "loss": 3.708, "step": 14350 }, { - "epoch": 1.5498869874071683, - "grad_norm": 0.5526981949806213, - "learning_rate": 0.0005075724598642387, - "loss": 3.6764, + "epoch": 1.5525606469002695, + "grad_norm": 0.5693142414093018, + "learning_rate": 0.0005074193200215865, + "loss": 3.6898, "step": 14400 }, { - "epoch": 1.5552685394467765, - "grad_norm": 0.550183117389679, - "learning_rate": 0.0005072492188341773, - "loss": 3.7052, + "epoch": 1.5579514824797842, + "grad_norm": 0.5457934141159058, + "learning_rate": 0.0005070955207771181, + "loss": 3.7126, "step": 14450 }, { - "epoch": 1.5606500914863846, - "grad_norm": 0.5323998332023621, - "learning_rate": 0.000506925977804116, - "loss": 3.6719, + "epoch": 1.5633423180592994, + "grad_norm": 0.539290726184845, + "learning_rate": 0.0005067717215326498, + "loss": 3.6751, "step": 14500 }, { - "epoch": 1.566031643525993, - "grad_norm": 0.5508608818054199, - "learning_rate": 0.0005066027367740545, - "loss": 3.6832, + "epoch": 1.568733153638814, + "grad_norm": 0.5418549180030823, + "learning_rate": 0.0005064479222881813, + "loss": 3.6877, "step": 14550 }, { - "epoch": 1.5714131955656012, - "grad_norm": 0.5756356716156006, - "learning_rate": 0.000506279495743993, - "loss": 3.6806, + "epoch": 1.5741239892183287, + "grad_norm": 0.4996977746486664, + "learning_rate": 0.0005061241230437129, + "loss": 3.6781, "step": 14600 }, { - "epoch": 1.5767947476052093, - "grad_norm": 0.5558910369873047, - "learning_rate": 0.0005059562547139316, - "loss": 3.6743, + "epoch": 1.5795148247978437, + "grad_norm": 0.545849621295929, + "learning_rate": 0.0005058003237992444, + "loss": 3.6945, "step": 14650 }, { - "epoch": 1.5821762996448174, - "grad_norm": 0.5734641551971436, - "learning_rate": 0.0005056330136838702, - "loss": 3.6983, + "epoch": 1.5849056603773586, + "grad_norm": 0.5146830081939697, + "learning_rate": 0.000505476524554776, + "loss": 3.7012, "step": 14700 }, { - "epoch": 1.5875578516844258, - "grad_norm": 0.5970582962036133, - "learning_rate": 0.0005053097726538088, - "loss": 3.698, + "epoch": 1.5902964959568733, + "grad_norm": 0.5644547939300537, + "learning_rate": 0.0005051527253103076, + "loss": 3.6647, "step": 14750 }, { - "epoch": 1.592939403724034, - "grad_norm": 0.5290464162826538, - "learning_rate": 0.0005049865316237474, - "loss": 3.6823, + "epoch": 1.595687331536388, + "grad_norm": 0.5948972105979919, + "learning_rate": 0.0005048289260658392, + "loss": 3.6893, "step": 14800 }, { - "epoch": 1.5983209557636422, - "grad_norm": 0.5535150766372681, - "learning_rate": 0.000504663290593686, - "loss": 3.69, + "epoch": 1.6010781671159031, + "grad_norm": 0.5488345623016357, + "learning_rate": 0.0005045051268213707, + "loss": 3.6793, "step": 14850 }, { - "epoch": 1.6037025078032503, - "grad_norm": 0.5767305493354797, - "learning_rate": 0.0005043400495636246, - "loss": 3.6766, + "epoch": 1.6064690026954178, + "grad_norm": 0.5010797381401062, + "learning_rate": 0.0005041813275769023, + "loss": 3.6759, "step": 14900 }, { - "epoch": 1.6090840598428586, - "grad_norm": 0.5387904047966003, - "learning_rate": 0.0005040168085335632, - "loss": 3.6813, + "epoch": 1.6118598382749325, + "grad_norm": 0.5631289482116699, + "learning_rate": 0.0005038640043173232, + "loss": 3.675, "step": 14950 }, { - "epoch": 1.614465611882467, - "grad_norm": 0.5717957019805908, - "learning_rate": 0.0005036935675035017, - "loss": 3.6979, + "epoch": 1.6172506738544474, + "grad_norm": 0.6699851751327515, + "learning_rate": 0.0005035402050728548, + "loss": 3.6822, "step": 15000 }, { - "epoch": 1.614465611882467, - "eval_accuracy": 0.35539251344660977, - "eval_loss": 3.63720703125, - "eval_runtime": 184.8407, - "eval_samples_per_second": 97.441, - "eval_steps_per_second": 6.092, + "epoch": 1.6172506738544474, + "eval_accuracy": 0.35562035855235674, + "eval_loss": 3.640406370162964, + "eval_runtime": 184.0918, + "eval_samples_per_second": 97.837, + "eval_steps_per_second": 6.117, "step": 15000 }, { - "epoch": 1.619847163922075, - "grad_norm": 0.5999611020088196, - "learning_rate": 0.0005033703264734402, - "loss": 3.6837, + "epoch": 1.6226415094339623, + "grad_norm": 0.5111995339393616, + "learning_rate": 0.0005032164058283863, + "loss": 3.6909, "step": 15050 }, { - "epoch": 1.6252287159616834, - "grad_norm": 0.5812547206878662, - "learning_rate": 0.0005030470854433789, - "loss": 3.688, + "epoch": 1.628032345013477, + "grad_norm": 0.5510700941085815, + "learning_rate": 0.0005028926065839179, + "loss": 3.6663, "step": 15100 }, { - "epoch": 1.6306102680012917, - "grad_norm": 0.5843793749809265, - "learning_rate": 0.0005027238444133175, - "loss": 3.6731, + "epoch": 1.633423180592992, + "grad_norm": 0.5378367900848389, + "learning_rate": 0.0005025688073394495, + "loss": 3.6848, "step": 15150 }, { - "epoch": 1.6359918200408998, - "grad_norm": 0.5559870600700378, - "learning_rate": 0.0005024006033832561, - "loss": 3.6801, + "epoch": 1.6388140161725069, + "grad_norm": 0.6216373443603516, + "learning_rate": 0.0005022450080949811, + "loss": 3.6618, "step": 15200 }, { - "epoch": 1.641373372080508, - "grad_norm": 0.5399496555328369, - "learning_rate": 0.0005020773623531946, - "loss": 3.6836, + "epoch": 1.6442048517520216, + "grad_norm": 0.5785908102989197, + "learning_rate": 0.0005019212088505126, + "loss": 3.7039, "step": 15250 }, { - "epoch": 1.6467549241201163, - "grad_norm": 0.5161964297294617, - "learning_rate": 0.0005017541213231333, - "loss": 3.6662, + "epoch": 1.6495956873315363, + "grad_norm": 0.5875208377838135, + "learning_rate": 0.0005015974096060442, + "loss": 3.6864, "step": 15300 }, { - "epoch": 1.6521364761597246, - "grad_norm": 0.571781575679779, - "learning_rate": 0.0005014308802930718, - "loss": 3.6923, + "epoch": 1.6549865229110512, + "grad_norm": 0.5718501806259155, + "learning_rate": 0.0005012736103615758, + "loss": 3.6807, "step": 15350 }, { - "epoch": 1.6575180281993327, - "grad_norm": 0.54901522397995, - "learning_rate": 0.0005011076392630105, - "loss": 3.6763, + "epoch": 1.6603773584905661, + "grad_norm": 0.5843344926834106, + "learning_rate": 0.0005009498111171074, + "loss": 3.6706, "step": 15400 }, { - "epoch": 1.6628995802389408, - "grad_norm": 0.5774824619293213, - "learning_rate": 0.000500784398232949, - "loss": 3.675, + "epoch": 1.6657681940700808, + "grad_norm": 0.508507251739502, + "learning_rate": 0.0005006260118726389, + "loss": 3.6828, "step": 15450 }, { - "epoch": 1.6682811322785491, - "grad_norm": 0.5709084868431091, - "learning_rate": 0.0005004611572028875, - "loss": 3.6655, + "epoch": 1.6711590296495957, + "grad_norm": 0.5351792573928833, + "learning_rate": 0.0005003022126281705, + "loss": 3.6937, "step": 15500 }, { - "epoch": 1.6736626843181575, - "grad_norm": 0.5834586024284363, - "learning_rate": 0.0005001379161728262, - "loss": 3.6644, + "epoch": 1.6765498652291106, + "grad_norm": 0.5323431491851807, + "learning_rate": 0.000499978413383702, + "loss": 3.667, "step": 15550 }, { - "epoch": 1.6790442363577656, - "grad_norm": 0.5875089764595032, - "learning_rate": 0.0004998146751427647, - "loss": 3.6837, + "epoch": 1.6819407008086253, + "grad_norm": 0.5795451402664185, + "learning_rate": 0.0004996546141392336, + "loss": 3.6673, "step": 15600 }, { - "epoch": 1.6844257883973737, - "grad_norm": 0.5730432868003845, - "learning_rate": 0.0004994914341127034, - "loss": 3.6606, + "epoch": 1.68733153638814, + "grad_norm": 0.5577372908592224, + "learning_rate": 0.0004993308148947651, + "loss": 3.6636, "step": 15650 }, { - "epoch": 1.689807340436982, - "grad_norm": 0.598292887210846, - "learning_rate": 0.0004991681930826419, - "loss": 3.6657, + "epoch": 1.692722371967655, + "grad_norm": 0.5510057210922241, + "learning_rate": 0.0004990070156502968, + "loss": 3.6775, "step": 15700 }, { - "epoch": 1.6951888924765903, - "grad_norm": 0.5262405872344971, - "learning_rate": 0.0004988449520525805, - "loss": 3.6675, + "epoch": 1.6981132075471699, + "grad_norm": 0.6013910174369812, + "learning_rate": 0.0004986832164058284, + "loss": 3.6531, "step": 15750 }, { - "epoch": 1.7005704445161984, - "grad_norm": 0.562152624130249, - "learning_rate": 0.0004985217110225191, - "loss": 3.661, + "epoch": 1.7035040431266846, + "grad_norm": 0.49710965156555176, + "learning_rate": 0.0004983594171613599, + "loss": 3.674, "step": 15800 }, { - "epoch": 1.7059519965558065, - "grad_norm": 0.5154584646224976, - "learning_rate": 0.0004981984699924576, - "loss": 3.6626, + "epoch": 1.7088948787061995, + "grad_norm": 0.553176760673523, + "learning_rate": 0.0004980356179168915, + "loss": 3.6648, "step": 15850 }, { - "epoch": 1.7113335485954149, - "grad_norm": 0.5356432795524597, - "learning_rate": 0.0004978752289623962, - "loss": 3.652, + "epoch": 1.7142857142857144, + "grad_norm": 0.5849853157997131, + "learning_rate": 0.000497711818672423, + "loss": 3.6667, "step": 15900 }, { - "epoch": 1.7167151006350232, - "grad_norm": 0.5371306538581848, - "learning_rate": 0.0004975519879323348, - "loss": 3.6717, + "epoch": 1.719676549865229, + "grad_norm": 0.5109348297119141, + "learning_rate": 0.0004973880194279547, + "loss": 3.6563, "step": 15950 }, { - "epoch": 1.7220966526746313, - "grad_norm": 0.5752100944519043, - "learning_rate": 0.0004972352117228746, - "loss": 3.6631, + "epoch": 1.7250673854447438, + "grad_norm": 0.5275253653526306, + "learning_rate": 0.0004970642201834862, + "loss": 3.6547, "step": 16000 }, { - "epoch": 1.7220966526746313, - "eval_accuracy": 0.35704088640764325, - "eval_loss": 3.6225626468658447, - "eval_runtime": 183.8276, - "eval_samples_per_second": 97.978, - "eval_steps_per_second": 6.125, + "epoch": 1.7250673854447438, + "eval_accuracy": 0.3572827477359183, + "eval_loss": 3.619877338409424, + "eval_runtime": 184.0128, + "eval_samples_per_second": 97.879, + "eval_steps_per_second": 6.119, "step": 16000 }, { - "epoch": 1.7274782047142396, - "grad_norm": 0.49519824981689453, - "learning_rate": 0.0004969119706928133, - "loss": 3.649, + "epoch": 1.7304582210242587, + "grad_norm": 0.524674117565155, + "learning_rate": 0.0004967404209390178, + "loss": 3.6765, "step": 16050 }, { - "epoch": 1.732859756753848, - "grad_norm": 0.5395165085792542, - "learning_rate": 0.0004965887296627518, - "loss": 3.6648, + "epoch": 1.7358490566037736, + "grad_norm": 0.5455240607261658, + "learning_rate": 0.0004964166216945493, + "loss": 3.6762, "step": 16100 }, { - "epoch": 1.738241308793456, - "grad_norm": 0.5537828207015991, - "learning_rate": 0.0004962654886326904, - "loss": 3.6808, + "epoch": 1.7412398921832883, + "grad_norm": 0.6462677717208862, + "learning_rate": 0.000496092822450081, + "loss": 3.6584, "step": 16150 }, { - "epoch": 1.7436228608330642, - "grad_norm": 0.5962070226669312, - "learning_rate": 0.000495942247602629, - "loss": 3.6507, + "epoch": 1.7466307277628033, + "grad_norm": 0.49305227398872375, + "learning_rate": 0.0004957690232056125, + "loss": 3.6656, "step": 16200 }, { - "epoch": 1.7490044128726725, - "grad_norm": 0.6224369406700134, - "learning_rate": 0.0004956190065725676, - "loss": 3.6441, + "epoch": 1.7520215633423182, + "grad_norm": 0.5830181837081909, + "learning_rate": 0.0004954452239611441, + "loss": 3.6647, "step": 16250 }, { - "epoch": 1.7543859649122808, - "grad_norm": 0.5751168727874756, - "learning_rate": 0.0004952957655425062, - "loss": 3.6686, + "epoch": 1.7574123989218329, + "grad_norm": 0.5238760709762573, + "learning_rate": 0.0004951214247166756, + "loss": 3.6659, "step": 16300 }, { - "epoch": 1.759767516951889, - "grad_norm": 0.6183793544769287, - "learning_rate": 0.0004949725245124448, - "loss": 3.6619, + "epoch": 1.7628032345013476, + "grad_norm": 0.5242082476615906, + "learning_rate": 0.0004947976254722072, + "loss": 3.6653, "step": 16350 }, { - "epoch": 1.765149068991497, - "grad_norm": 0.5693207383155823, - "learning_rate": 0.0004946492834823833, - "loss": 3.6391, + "epoch": 1.7681940700808625, + "grad_norm": 0.5468643307685852, + "learning_rate": 0.0004944738262277387, + "loss": 3.6704, "step": 16400 }, { - "epoch": 1.7705306210311054, - "grad_norm": 0.8416994214057922, - "learning_rate": 0.0004943260424523219, - "loss": 3.6685, + "epoch": 1.7735849056603774, + "grad_norm": 0.5299263000488281, + "learning_rate": 0.0004941500269832703, + "loss": 3.671, "step": 16450 }, { - "epoch": 1.7759121730707137, - "grad_norm": 0.5473074913024902, - "learning_rate": 0.0004940028014222605, - "loss": 3.651, + "epoch": 1.778975741239892, + "grad_norm": 0.5592052340507507, + "learning_rate": 0.0004938262277388019, + "loss": 3.6673, "step": 16500 }, { - "epoch": 1.7812937251103218, - "grad_norm": 0.6257918477058411, - "learning_rate": 0.000493679560392199, - "loss": 3.647, + "epoch": 1.784366576819407, + "grad_norm": 0.5448793768882751, + "learning_rate": 0.0004935024284943335, + "loss": 3.6462, "step": 16550 }, { - "epoch": 1.78667527714993, - "grad_norm": 0.6103613972663879, - "learning_rate": 0.0004933563193621377, - "loss": 3.6511, + "epoch": 1.789757412398922, + "grad_norm": 0.5522679686546326, + "learning_rate": 0.000493178629249865, + "loss": 3.6596, "step": 16600 }, { - "epoch": 1.7920568291895382, - "grad_norm": 0.53249591588974, - "learning_rate": 0.0004930330783320762, - "loss": 3.6511, + "epoch": 1.7951482479784366, + "grad_norm": 0.5675358772277832, + "learning_rate": 0.0004928548300053966, + "loss": 3.6647, "step": 16650 }, { - "epoch": 1.7974383812291466, - "grad_norm": 0.5363080501556396, - "learning_rate": 0.0004927098373020149, - "loss": 3.6559, + "epoch": 1.8005390835579513, + "grad_norm": 0.5482249855995178, + "learning_rate": 0.0004925310307609282, + "loss": 3.6587, "step": 16700 }, { - "epoch": 1.8028199332687547, - "grad_norm": 0.5322283506393433, - "learning_rate": 0.0004923865962719534, - "loss": 3.6635, + "epoch": 1.8059299191374663, + "grad_norm": 0.588553249835968, + "learning_rate": 0.0004922072315164598, + "loss": 3.6471, "step": 16750 }, { - "epoch": 1.8082014853083628, - "grad_norm": 0.5365887880325317, - "learning_rate": 0.0004920633552418919, - "loss": 3.653, + "epoch": 1.8113207547169812, + "grad_norm": 0.5706552863121033, + "learning_rate": 0.0004918834322719913, + "loss": 3.6521, "step": 16800 }, { - "epoch": 1.813583037347971, - "grad_norm": 0.5999295711517334, - "learning_rate": 0.0004917401142118306, - "loss": 3.6537, + "epoch": 1.8167115902964959, + "grad_norm": 0.543329656124115, + "learning_rate": 0.0004915596330275229, + "loss": 3.649, "step": 16850 }, { - "epoch": 1.8189645893875794, - "grad_norm": 0.5953883528709412, - "learning_rate": 0.0004914168731817692, - "loss": 3.6539, + "epoch": 1.8221024258760108, + "grad_norm": 0.5550561547279358, + "learning_rate": 0.0004912358337830544, + "loss": 3.6587, "step": 16900 }, { - "epoch": 1.8243461414271875, - "grad_norm": 0.597101628780365, - "learning_rate": 0.0004910936321517078, - "loss": 3.6471, + "epoch": 1.8274932614555257, + "grad_norm": 0.565937876701355, + "learning_rate": 0.000490912034538586, + "loss": 3.6419, "step": 16950 }, { - "epoch": 1.8297276934667959, - "grad_norm": 0.5295618772506714, - "learning_rate": 0.0004907703911216463, - "loss": 3.6326, + "epoch": 1.8328840970350404, + "grad_norm": 0.5782588720321655, + "learning_rate": 0.0004905882352941175, + "loss": 3.6355, "step": 17000 }, { - "epoch": 1.8297276934667959, - "eval_accuracy": 0.35910942008585534, - "eval_loss": 3.6050846576690674, - "eval_runtime": 184.1833, - "eval_samples_per_second": 97.788, - "eval_steps_per_second": 6.113, + "epoch": 1.8328840970350404, + "eval_accuracy": 0.35895860987756883, + "eval_loss": 3.604949474334717, + "eval_runtime": 183.9848, + "eval_samples_per_second": 97.894, + "eval_steps_per_second": 6.12, "step": 17000 }, { - "epoch": 1.8351092455064042, - "grad_norm": 0.6159751415252686, - "learning_rate": 0.0004904471500915849, - "loss": 3.6271, + "epoch": 1.838274932614555, + "grad_norm": 0.5248723030090332, + "learning_rate": 0.0004902644360496492, + "loss": 3.6454, "step": 17050 }, { - "epoch": 1.8404907975460123, - "grad_norm": 0.6205544471740723, - "learning_rate": 0.0004901239090615235, - "loss": 3.6651, + "epoch": 1.8436657681940702, + "grad_norm": 0.5723742246627808, + "learning_rate": 0.0004899406368051808, + "loss": 3.6462, "step": 17100 }, { - "epoch": 1.8458723495856204, - "grad_norm": 0.5399280786514282, - "learning_rate": 0.000489800668031462, - "loss": 3.6452, + "epoch": 1.849056603773585, + "grad_norm": 0.5356608033180237, + "learning_rate": 0.0004896168375607123, + "loss": 3.6337, "step": 17150 }, { - "epoch": 1.8512539016252287, - "grad_norm": 0.5837388038635254, - "learning_rate": 0.0004894774270014007, - "loss": 3.6575, + "epoch": 1.8544474393530996, + "grad_norm": 0.5415787100791931, + "learning_rate": 0.0004892930383162439, + "loss": 3.6609, "step": 17200 }, { - "epoch": 1.856635453664837, - "grad_norm": 0.5471201539039612, - "learning_rate": 0.0004891541859713392, - "loss": 3.6555, + "epoch": 1.8598382749326146, + "grad_norm": 0.5791612863540649, + "learning_rate": 0.0004889692390717754, + "loss": 3.67, "step": 17250 }, { - "epoch": 1.8620170057044452, - "grad_norm": 0.5392946600914001, - "learning_rate": 0.0004888309449412779, - "loss": 3.6453, + "epoch": 1.8652291105121295, + "grad_norm": 0.5022833347320557, + "learning_rate": 0.0004886454398273071, + "loss": 3.6535, "step": 17300 }, { - "epoch": 1.8673985577440533, - "grad_norm": 0.5226132869720459, - "learning_rate": 0.0004885077039112164, - "loss": 3.6449, + "epoch": 1.8706199460916442, + "grad_norm": 0.5456568598747253, + "learning_rate": 0.0004883216405828386, + "loss": 3.6421, "step": 17350 }, { - "epoch": 1.8727801097836616, - "grad_norm": 0.5569941401481628, - "learning_rate": 0.00048818446288115497, - "loss": 3.629, + "epoch": 1.8760107816711589, + "grad_norm": 0.576461672782898, + "learning_rate": 0.00048799784133837017, + "loss": 3.6569, "step": 17400 }, { - "epoch": 1.87816166182327, - "grad_norm": 0.5523909330368042, - "learning_rate": 0.0004878612218510936, - "loss": 3.6612, + "epoch": 1.881401617250674, + "grad_norm": 0.5383881330490112, + "learning_rate": 0.0004876740420939017, + "loss": 3.6289, "step": 17450 }, { - "epoch": 1.883543213862878, - "grad_norm": 0.5485296845436096, - "learning_rate": 0.0004875379808210322, - "loss": 3.6438, + "epoch": 1.8867924528301887, + "grad_norm": 0.5738885402679443, + "learning_rate": 0.0004873502428494333, + "loss": 3.6522, "step": 17500 }, { - "epoch": 1.8889247659024861, - "grad_norm": 0.5691823959350586, - "learning_rate": 0.00048721473979097075, - "loss": 3.6349, + "epoch": 1.8921832884097034, + "grad_norm": 0.5352233052253723, + "learning_rate": 0.0004870264436049649, + "loss": 3.6572, "step": 17550 }, { - "epoch": 1.8943063179420945, - "grad_norm": 0.5617777705192566, - "learning_rate": 0.00048689149876090935, - "loss": 3.6613, + "epoch": 1.8975741239892183, + "grad_norm": 0.5754483342170715, + "learning_rate": 0.00048670264436049643, + "loss": 3.6438, "step": 17600 }, { - "epoch": 1.8996878699817028, - "grad_norm": 0.6149376034736633, - "learning_rate": 0.0004865682577308479, - "loss": 3.6477, + "epoch": 1.9029649595687332, + "grad_norm": 0.5442389249801636, + "learning_rate": 0.00048637884511602803, + "loss": 3.6423, "step": 17650 }, { - "epoch": 1.905069422021311, - "grad_norm": 0.5364314913749695, - "learning_rate": 0.0004862450167007865, - "loss": 3.6471, + "epoch": 1.908355795148248, + "grad_norm": 0.5781295299530029, + "learning_rate": 0.0004860550458715596, + "loss": 3.6568, "step": 17700 }, { - "epoch": 1.910450974060919, - "grad_norm": 0.5560828447341919, - "learning_rate": 0.00048592177567072513, - "loss": 3.6449, + "epoch": 1.9137466307277629, + "grad_norm": 0.5495123863220215, + "learning_rate": 0.0004857312466270912, + "loss": 3.6336, "step": 17750 }, { - "epoch": 1.9158325261005273, - "grad_norm": 0.562412440776825, - "learning_rate": 0.00048559853464066367, - "loss": 3.6439, + "epoch": 1.9191374663072778, + "grad_norm": 0.5798534750938416, + "learning_rate": 0.00048540744738262274, + "loss": 3.6362, "step": 17800 }, { - "epoch": 1.9212140781401357, - "grad_norm": 0.5733065605163574, - "learning_rate": 0.00048527529361060227, - "loss": 3.6473, + "epoch": 1.9245283018867925, + "grad_norm": 0.4734729826450348, + "learning_rate": 0.00048508364813815434, + "loss": 3.6268, "step": 17850 }, { - "epoch": 1.9265956301797438, - "grad_norm": 0.590917706489563, - "learning_rate": 0.00048495205258054086, - "loss": 3.6377, + "epoch": 1.9299191374663072, + "grad_norm": 0.5227976441383362, + "learning_rate": 0.00048475984889368584, + "loss": 3.6317, "step": 17900 }, { - "epoch": 1.931977182219352, - "grad_norm": 0.6305859684944153, - "learning_rate": 0.0004846288115504794, - "loss": 3.6313, + "epoch": 1.935309973045822, + "grad_norm": 0.5543326735496521, + "learning_rate": 0.0004844360496492175, + "loss": 3.641, "step": 17950 }, { - "epoch": 1.9373587342589604, - "grad_norm": 0.5712413787841797, - "learning_rate": 0.000484305570520418, - "loss": 3.6246, + "epoch": 1.940700808625337, + "grad_norm": 0.5557667016983032, + "learning_rate": 0.000484112250404749, + "loss": 3.6474, "step": 18000 }, { - "epoch": 1.9373587342589604, - "eval_accuracy": 0.36066337368737805, - "eval_loss": 3.5890228748321533, - "eval_runtime": 184.3111, - "eval_samples_per_second": 97.721, - "eval_steps_per_second": 6.109, + "epoch": 1.940700808625337, + "eval_accuracy": 0.360239953383565, + "eval_loss": 3.5897815227508545, + "eval_runtime": 183.5998, + "eval_samples_per_second": 98.099, + "eval_steps_per_second": 6.133, "step": 18000 }, { - "epoch": 1.9427402862985685, - "grad_norm": 0.5944366455078125, - "learning_rate": 0.00048398232949035665, - "loss": 3.6456, + "epoch": 1.9460916442048517, + "grad_norm": 0.5602201223373413, + "learning_rate": 0.00048378845116028055, + "loss": 3.6239, "step": 18050 }, { - "epoch": 1.9481218383381766, - "grad_norm": 0.5790172815322876, - "learning_rate": 0.0004836590884602952, - "loss": 3.6318, + "epoch": 1.9514824797843666, + "grad_norm": 0.5105127096176147, + "learning_rate": 0.00048346465191581215, + "loss": 3.6487, "step": 18100 }, { - "epoch": 1.953503390377785, - "grad_norm": 0.5425114631652832, - "learning_rate": 0.000483342312250835, - "loss": 3.6534, + "epoch": 1.9568733153638815, + "grad_norm": 0.5635515451431274, + "learning_rate": 0.0004831408526713437, + "loss": 3.6442, "step": 18150 }, { - "epoch": 1.9588849424173933, - "grad_norm": 0.5681806802749634, - "learning_rate": 0.0004830190712207736, - "loss": 3.6524, + "epoch": 1.9622641509433962, + "grad_norm": 0.5971663594245911, + "learning_rate": 0.0004828170534268753, + "loss": 3.6348, "step": 18200 }, { - "epoch": 1.9642664944570014, - "grad_norm": 0.6699956655502319, - "learning_rate": 0.0004826958301907122, - "loss": 3.6431, + "epoch": 1.967654986522911, + "grad_norm": 0.538390040397644, + "learning_rate": 0.00048249325418240686, + "loss": 3.6496, "step": 18250 }, { - "epoch": 1.9696480464966095, - "grad_norm": 0.5629168152809143, - "learning_rate": 0.0004823725891606507, - "loss": 3.6236, + "epoch": 1.9730458221024259, + "grad_norm": 0.5383585095405579, + "learning_rate": 0.00048216945493793846, + "loss": 3.6451, "step": 18300 }, { - "epoch": 1.9750295985362178, - "grad_norm": 0.5990884900093079, - "learning_rate": 0.0004820493481305893, - "loss": 3.6318, + "epoch": 1.9784366576819408, + "grad_norm": 0.6232275366783142, + "learning_rate": 0.00048184565569347, + "loss": 3.63, "step": 18350 }, { - "epoch": 1.9804111505758262, - "grad_norm": 0.560914158821106, - "learning_rate": 0.00048172610710052797, - "loss": 3.6473, + "epoch": 1.9838274932614555, + "grad_norm": 0.5328205823898315, + "learning_rate": 0.0004815218564490016, + "loss": 3.6627, "step": 18400 }, { - "epoch": 1.9857927026154343, - "grad_norm": 0.6116918325424194, - "learning_rate": 0.0004814028660704665, - "loss": 3.6316, + "epoch": 1.9892183288409704, + "grad_norm": 0.6225987672805786, + "learning_rate": 0.00048119805720453317, + "loss": 3.6153, "step": 18450 }, { - "epoch": 1.9911742546550424, - "grad_norm": 0.5204751491546631, - "learning_rate": 0.0004810796250404051, - "loss": 3.6317, + "epoch": 1.9946091644204853, + "grad_norm": 0.577847957611084, + "learning_rate": 0.0004808742579600647, + "loss": 3.6233, "step": 18500 }, { - "epoch": 1.9965558066946507, - "grad_norm": 0.5705627799034119, - "learning_rate": 0.00048075638401034364, - "loss": 3.6266, + "epoch": 2.0, + "grad_norm": 1.1217093467712402, + "learning_rate": 0.0004805504587155963, + "loss": 3.6389, "step": 18550 }, { - "epoch": 2.001937358734259, - "grad_norm": 0.5522881746292114, - "learning_rate": 0.00048043314298028224, - "loss": 3.6035, + "epoch": 2.0053908355795147, + "grad_norm": 0.5509372353553772, + "learning_rate": 0.0004802266594711278, + "loss": 3.5494, "step": 18600 }, { - "epoch": 2.007318910773867, - "grad_norm": 0.5946189165115356, - "learning_rate": 0.00048010990195022083, - "loss": 3.5188, + "epoch": 2.01078167115903, + "grad_norm": 0.5721856355667114, + "learning_rate": 0.0004799028602266594, + "loss": 3.5514, "step": 18650 }, { - "epoch": 2.0127004628134753, - "grad_norm": 0.5753241777420044, - "learning_rate": 0.0004797866609201594, - "loss": 3.5602, + "epoch": 2.0161725067385445, + "grad_norm": 0.5371805429458618, + "learning_rate": 0.000479579060982191, + "loss": 3.5437, "step": 18700 }, { - "epoch": 2.018082014853084, - "grad_norm": 0.5759482979774475, - "learning_rate": 0.000479463419890098, - "loss": 3.5473, + "epoch": 2.0215633423180592, + "grad_norm": 0.5734500288963318, + "learning_rate": 0.0004792552617377226, + "loss": 3.5601, "step": 18750 }, { - "epoch": 2.023463566892692, - "grad_norm": 0.5465207695960999, - "learning_rate": 0.0004791401788600366, - "loss": 3.5444, + "epoch": 2.026954177897574, + "grad_norm": 0.5779300928115845, + "learning_rate": 0.00047893146249325413, + "loss": 3.5493, "step": 18800 }, { - "epoch": 2.0288451189323, - "grad_norm": 0.5923919081687927, - "learning_rate": 0.00047881693782997515, - "loss": 3.5464, + "epoch": 2.032345013477089, + "grad_norm": 0.5518189072608948, + "learning_rate": 0.0004786076632487857, + "loss": 3.5319, "step": 18850 }, { - "epoch": 2.034226670971908, - "grad_norm": 0.5646787881851196, - "learning_rate": 0.00047849369679991375, - "loss": 3.5536, + "epoch": 2.0377358490566038, + "grad_norm": 0.5561443567276001, + "learning_rate": 0.0004782838640043173, + "loss": 3.5588, "step": 18900 }, { - "epoch": 2.0396082230115167, - "grad_norm": 0.5978553295135498, - "learning_rate": 0.0004781704557698523, - "loss": 3.5624, + "epoch": 2.0431266846361185, + "grad_norm": 0.5284110307693481, + "learning_rate": 0.00047796006475984883, + "loss": 3.5496, "step": 18950 }, { - "epoch": 2.044989775051125, - "grad_norm": 0.5860423445701599, - "learning_rate": 0.00047784721473979094, - "loss": 3.5539, + "epoch": 2.0485175202156336, + "grad_norm": 0.5990350246429443, + "learning_rate": 0.0004776427415002698, + "loss": 3.5646, "step": 19000 }, { - "epoch": 2.044989775051125, - "eval_accuracy": 0.36169546746872777, - "eval_loss": 3.579106092453003, - "eval_runtime": 184.1309, - "eval_samples_per_second": 97.816, - "eval_steps_per_second": 6.115, + "epoch": 2.0485175202156336, + "eval_accuracy": 0.3613797221767389, + "eval_loss": 3.5839173793792725, + "eval_runtime": 183.9308, + "eval_samples_per_second": 97.923, + "eval_steps_per_second": 6.122, "step": 19000 }, { - "epoch": 2.050371327090733, - "grad_norm": 0.5495768785476685, - "learning_rate": 0.00047752397370972953, - "loss": 3.5449, + "epoch": 2.0539083557951483, + "grad_norm": 0.5714938640594482, + "learning_rate": 0.0004773189422558014, + "loss": 3.5236, "step": 19050 }, { - "epoch": 2.055752879130341, - "grad_norm": 0.6042999625205994, - "learning_rate": 0.0004772007326796681, - "loss": 3.554, + "epoch": 2.059299191374663, + "grad_norm": 0.5469087958335876, + "learning_rate": 0.00047699514301133294, + "loss": 3.564, "step": 19100 }, { - "epoch": 2.0611344311699495, - "grad_norm": 0.6665642857551575, - "learning_rate": 0.00047687749164960667, - "loss": 3.5502, + "epoch": 2.0646900269541777, + "grad_norm": 0.5344583988189697, + "learning_rate": 0.00047667134376686455, + "loss": 3.5494, "step": 19150 }, { - "epoch": 2.0665159832095576, - "grad_norm": 0.5724649429321289, - "learning_rate": 0.00047655425061954526, - "loss": 3.5659, + "epoch": 2.070080862533693, + "grad_norm": 0.5535686612129211, + "learning_rate": 0.0004763475445223961, + "loss": 3.573, "step": 19200 }, { - "epoch": 2.0718975352491658, - "grad_norm": 0.5169804692268372, - "learning_rate": 0.00047623100958948386, - "loss": 3.548, + "epoch": 2.0754716981132075, + "grad_norm": 0.5836669206619263, + "learning_rate": 0.0004760237452779276, + "loss": 3.5682, "step": 19250 }, { - "epoch": 2.0772790872887743, - "grad_norm": 0.5063441395759583, - "learning_rate": 0.00047590776855942245, - "loss": 3.5655, + "epoch": 2.0808625336927222, + "grad_norm": 0.5648289918899536, + "learning_rate": 0.0004756999460334592, + "loss": 3.5661, "step": 19300 }, { - "epoch": 2.0826606393283824, - "grad_norm": 0.5674535632133484, - "learning_rate": 0.00047558452752936105, - "loss": 3.5459, + "epoch": 2.0862533692722374, + "grad_norm": 0.5942755341529846, + "learning_rate": 0.00047537614678899075, + "loss": 3.5567, "step": 19350 }, { - "epoch": 2.0880421913679905, - "grad_norm": 0.6405752301216125, - "learning_rate": 0.0004752612864992996, - "loss": 3.5313, + "epoch": 2.091644204851752, + "grad_norm": 0.5261635780334473, + "learning_rate": 0.00047505234754452235, + "loss": 3.5605, "step": 19400 }, { - "epoch": 2.0934237434075986, - "grad_norm": 0.586518406867981, - "learning_rate": 0.0004749380454692382, - "loss": 3.5814, + "epoch": 2.0970350404312668, + "grad_norm": 0.5810580849647522, + "learning_rate": 0.0004747285483000539, + "loss": 3.548, "step": 19450 }, { - "epoch": 2.098805295447207, - "grad_norm": 0.597720742225647, - "learning_rate": 0.0004746148044391767, - "loss": 3.5498, + "epoch": 2.1024258760107815, + "grad_norm": 0.5799669623374939, + "learning_rate": 0.0004744047490555855, + "loss": 3.5427, "step": 19500 }, { - "epoch": 2.1041868474868153, - "grad_norm": 0.6323477625846863, - "learning_rate": 0.00047429156340911537, - "loss": 3.5417, + "epoch": 2.1078167115902966, + "grad_norm": 0.6052512526512146, + "learning_rate": 0.00047408094981111706, + "loss": 3.5616, "step": 19550 }, { - "epoch": 2.1095683995264234, - "grad_norm": 0.6271188259124756, - "learning_rate": 0.00047396832237905397, - "loss": 3.5692, + "epoch": 2.1132075471698113, + "grad_norm": 0.5334910750389099, + "learning_rate": 0.00047375715056664866, + "loss": 3.5573, "step": 19600 }, { - "epoch": 2.1149499515660315, - "grad_norm": 0.5441372394561768, - "learning_rate": 0.0004736450813489925, - "loss": 3.5634, + "epoch": 2.118598382749326, + "grad_norm": 0.5874738097190857, + "learning_rate": 0.0004734333513221802, + "loss": 3.5585, "step": 19650 }, { - "epoch": 2.12033150360564, - "grad_norm": 0.5787696838378906, - "learning_rate": 0.0004733218403189311, - "loss": 3.5525, + "epoch": 2.123989218328841, + "grad_norm": 0.5650529861450195, + "learning_rate": 0.00047310955207771177, + "loss": 3.5481, "step": 19700 }, { - "epoch": 2.125713055645248, - "grad_norm": 0.6129936575889587, - "learning_rate": 0.0004729985992888697, - "loss": 3.5492, + "epoch": 2.129380053908356, + "grad_norm": 0.5615851283073425, + "learning_rate": 0.00047278575283324337, + "loss": 3.5418, "step": 19750 }, { - "epoch": 2.1310946076848563, - "grad_norm": 0.585528552532196, - "learning_rate": 0.00047267535825880824, - "loss": 3.5742, + "epoch": 2.1347708894878705, + "grad_norm": 0.546328604221344, + "learning_rate": 0.0004724619535887749, + "loss": 3.5744, "step": 19800 }, { - "epoch": 2.1364761597244644, - "grad_norm": 0.8197336792945862, - "learning_rate": 0.0004723521172287469, - "loss": 3.5695, + "epoch": 2.1401617250673857, + "grad_norm": 0.5430625081062317, + "learning_rate": 0.0004721381543443065, + "loss": 3.5731, "step": 19850 }, { - "epoch": 2.141857711764073, - "grad_norm": 0.5523197054862976, - "learning_rate": 0.0004720288761986855, - "loss": 3.5365, + "epoch": 2.1455525606469004, + "grad_norm": 0.9113521575927734, + "learning_rate": 0.000471814355099838, + "loss": 3.5625, "step": 19900 }, { - "epoch": 2.147239263803681, - "grad_norm": 0.5677591562271118, - "learning_rate": 0.000471705635168624, - "loss": 3.5599, + "epoch": 2.150943396226415, + "grad_norm": 0.5765600800514221, + "learning_rate": 0.0004714905558553697, + "loss": 3.5601, "step": 19950 }, { - "epoch": 2.152620815843289, - "grad_norm": 0.5868083238601685, - "learning_rate": 0.0004713823941385626, - "loss": 3.562, + "epoch": 2.1563342318059298, + "grad_norm": 0.5884748101234436, + "learning_rate": 0.0004711667566109012, + "loss": 3.5599, "step": 20000 }, { - "epoch": 2.152620815843289, - "eval_accuracy": 0.3632018311053878, - "eval_loss": 3.570688009262085, - "eval_runtime": 184.0571, - "eval_samples_per_second": 97.856, - "eval_steps_per_second": 6.118, + "epoch": 2.1563342318059298, + "eval_accuracy": 0.3625327466222262, + "eval_loss": 3.5735793113708496, + "eval_runtime": 183.4843, + "eval_samples_per_second": 98.161, + "eval_steps_per_second": 6.137, "step": 20000 }, { - "epoch": 2.1580023678828972, - "grad_norm": 0.561229944229126, - "learning_rate": 0.00047105915310850116, - "loss": 3.5775, + "epoch": 2.161725067385445, + "grad_norm": 0.5843698382377625, + "learning_rate": 0.00047084295736643273, + "loss": 3.5608, "step": 20050 }, { - "epoch": 2.163383919922506, - "grad_norm": 0.5729883909225464, - "learning_rate": 0.0004707359120784398, - "loss": 3.5575, + "epoch": 2.1671159029649596, + "grad_norm": 0.5710905194282532, + "learning_rate": 0.00047051915812196433, + "loss": 3.571, "step": 20100 }, { - "epoch": 2.168765471962114, - "grad_norm": 0.6092100739479065, - "learning_rate": 0.0004704126710483784, - "loss": 3.5663, + "epoch": 2.1725067385444743, + "grad_norm": 0.5336669683456421, + "learning_rate": 0.0004701953588774959, + "loss": 3.5473, "step": 20150 }, { - "epoch": 2.174147024001722, - "grad_norm": 0.5391753911972046, - "learning_rate": 0.0004700958948389182, - "loss": 3.5662, + "epoch": 2.177897574123989, + "grad_norm": 0.5899220705032349, + "learning_rate": 0.0004698715596330275, + "loss": 3.572, "step": 20200 }, { - "epoch": 2.1795285760413305, - "grad_norm": 0.5216943621635437, - "learning_rate": 0.0004697726538088568, - "loss": 3.5703, + "epoch": 2.183288409703504, + "grad_norm": 0.5227386355400085, + "learning_rate": 0.00046954776038855904, + "loss": 3.5711, "step": 20250 }, { - "epoch": 2.1849101280809387, - "grad_norm": 0.5354718565940857, - "learning_rate": 0.00046944941277879534, - "loss": 3.5427, + "epoch": 2.188679245283019, + "grad_norm": 0.5975064039230347, + "learning_rate": 0.00046922396114409064, + "loss": 3.546, "step": 20300 }, { - "epoch": 2.1902916801205468, - "grad_norm": 0.6368331909179688, - "learning_rate": 0.00046912617174873394, - "loss": 3.5627, + "epoch": 2.1940700808625335, + "grad_norm": 0.5341865420341492, + "learning_rate": 0.0004689001618996222, + "loss": 3.5638, "step": 20350 }, { - "epoch": 2.195673232160155, - "grad_norm": 0.5471709370613098, - "learning_rate": 0.0004688029307186725, - "loss": 3.5501, + "epoch": 2.1994609164420487, + "grad_norm": 0.5187112092971802, + "learning_rate": 0.0004685763626551538, + "loss": 3.5612, "step": 20400 }, { - "epoch": 2.2010547841997634, - "grad_norm": 0.5695568919181824, - "learning_rate": 0.00046847968968861107, - "loss": 3.5601, + "epoch": 2.2048517520215634, + "grad_norm": 0.5640769600868225, + "learning_rate": 0.00046825256341068535, + "loss": 3.5616, "step": 20450 }, { - "epoch": 2.2064363362393715, - "grad_norm": 0.6059097647666931, - "learning_rate": 0.0004681564486585497, - "loss": 3.5678, + "epoch": 2.210242587601078, + "grad_norm": 0.49423614144325256, + "learning_rate": 0.0004679287641662169, + "loss": 3.5668, "step": 20500 }, { - "epoch": 2.2118178882789796, - "grad_norm": 0.5702983140945435, - "learning_rate": 0.00046783320762848826, - "loss": 3.5725, + "epoch": 2.215633423180593, + "grad_norm": 0.6228650808334351, + "learning_rate": 0.0004676049649217485, + "loss": 3.5672, "step": 20550 }, { - "epoch": 2.2171994403185877, - "grad_norm": 0.5607845783233643, - "learning_rate": 0.00046750996659842685, - "loss": 3.5553, + "epoch": 2.221024258760108, + "grad_norm": 0.5874422192573547, + "learning_rate": 0.00046728116567728, + "loss": 3.5723, "step": 20600 }, { - "epoch": 2.2225809923581963, - "grad_norm": 0.5489804148674011, - "learning_rate": 0.00046718672556836545, - "loss": 3.5429, + "epoch": 2.2264150943396226, + "grad_norm": 0.5931791663169861, + "learning_rate": 0.0004669573664328116, + "loss": 3.5598, "step": 20650 }, { - "epoch": 2.2279625443978044, - "grad_norm": 0.5686173439025879, - "learning_rate": 0.000466863484538304, - "loss": 3.5843, + "epoch": 2.2318059299191373, + "grad_norm": 0.5493602156639099, + "learning_rate": 0.00046663356718834316, + "loss": 3.5606, "step": 20700 }, { - "epoch": 2.2333440964374125, - "grad_norm": 0.6201581358909607, - "learning_rate": 0.0004665402435082426, - "loss": 3.5601, + "epoch": 2.2371967654986524, + "grad_norm": 0.5716580748558044, + "learning_rate": 0.00046630976794387476, + "loss": 3.5628, "step": 20750 }, { - "epoch": 2.2387256484770206, - "grad_norm": 0.5640190243721008, - "learning_rate": 0.00046621700247818123, - "loss": 3.5754, + "epoch": 2.242587601078167, + "grad_norm": 0.5286273956298828, + "learning_rate": 0.0004659859686994063, + "loss": 3.5669, "step": 20800 }, { - "epoch": 2.244107200516629, - "grad_norm": 0.6988780498504639, - "learning_rate": 0.0004658937614481198, - "loss": 3.5475, + "epoch": 2.247978436657682, + "grad_norm": 0.5628600120544434, + "learning_rate": 0.0004656621694549379, + "loss": 3.5479, "step": 20850 }, { - "epoch": 2.2494887525562373, - "grad_norm": 0.616712749004364, - "learning_rate": 0.00046557052041805837, - "loss": 3.5727, + "epoch": 2.2533692722371965, + "grad_norm": 0.6254209280014038, + "learning_rate": 0.00046533837021046947, + "loss": 3.5606, "step": 20900 }, { - "epoch": 2.2548703045958454, - "grad_norm": 0.5751016139984131, - "learning_rate": 0.0004652472793879969, - "loss": 3.5648, + "epoch": 2.2587601078167117, + "grad_norm": 0.5592355132102966, + "learning_rate": 0.000465014570966001, + "loss": 3.565, "step": 20950 }, { - "epoch": 2.2602518566354535, - "grad_norm": 0.551199734210968, - "learning_rate": 0.0004649240383579355, - "loss": 3.5603, + "epoch": 2.2641509433962264, + "grad_norm": 0.559055507183075, + "learning_rate": 0.0004646907717215326, + "loss": 3.5482, "step": 21000 }, { - "epoch": 2.2602518566354535, - "eval_accuracy": 0.36398217614567135, - "eval_loss": 3.5630135536193848, - "eval_runtime": 184.4108, - "eval_samples_per_second": 97.668, - "eval_steps_per_second": 6.106, + "epoch": 2.2641509433962264, + "eval_accuracy": 0.36398869531894024, + "eval_loss": 3.559917688369751, + "eval_runtime": 183.7189, + "eval_samples_per_second": 98.036, + "eval_steps_per_second": 6.129, "step": 21000 }, { - "epoch": 2.265633408675062, - "grad_norm": 0.6206423044204712, - "learning_rate": 0.00046460079732787415, - "loss": 3.5794, + "epoch": 2.269541778975741, + "grad_norm": 0.5590160489082336, + "learning_rate": 0.0004643669724770642, + "loss": 3.5418, "step": 21050 }, { - "epoch": 2.27101496071467, - "grad_norm": 0.6502292156219482, - "learning_rate": 0.0004642775562978127, - "loss": 3.5543, + "epoch": 2.274932614555256, + "grad_norm": 0.5151308178901672, + "learning_rate": 0.0004640431732325958, + "loss": 3.5741, "step": 21100 }, { - "epoch": 2.2763965127542782, - "grad_norm": 0.5624255537986755, - "learning_rate": 0.0004639543152677513, - "loss": 3.5674, + "epoch": 2.280323450134771, + "grad_norm": 0.5813835859298706, + "learning_rate": 0.00046372584997301673, + "loss": 3.5675, "step": 21150 }, { - "epoch": 2.281778064793887, - "grad_norm": 0.6220043897628784, - "learning_rate": 0.0004636310742376899, - "loss": 3.5522, + "epoch": 2.2857142857142856, + "grad_norm": 0.5855079889297485, + "learning_rate": 0.0004634020507285483, + "loss": 3.5566, "step": 21200 }, { - "epoch": 2.287159616833495, - "grad_norm": 0.5274709463119507, - "learning_rate": 0.0004633078332076284, - "loss": 3.5407, + "epoch": 2.2911051212938007, + "grad_norm": 0.5452547073364258, + "learning_rate": 0.0004630782514840798, + "loss": 3.5307, "step": 21250 }, { - "epoch": 2.292541168873103, - "grad_norm": 0.5348024368286133, - "learning_rate": 0.000462984592177567, - "loss": 3.5461, + "epoch": 2.2964959568733154, + "grad_norm": 0.6098089814186096, + "learning_rate": 0.0004627544522396114, + "loss": 3.5531, "step": 21300 }, { - "epoch": 2.297922720912711, - "grad_norm": 0.5884036421775818, - "learning_rate": 0.00046266135114750567, - "loss": 3.5643, + "epoch": 2.30188679245283, + "grad_norm": 0.5738231539726257, + "learning_rate": 0.00046243065299514293, + "loss": 3.5683, "step": 21350 }, { - "epoch": 2.303304272952319, - "grad_norm": 0.5820002555847168, - "learning_rate": 0.0004623381101174442, - "loss": 3.5638, + "epoch": 2.3072776280323453, + "grad_norm": 0.5771190524101257, + "learning_rate": 0.00046210685375067454, + "loss": 3.5461, "step": 21400 }, { - "epoch": 2.3086858249919278, - "grad_norm": 0.6022721529006958, - "learning_rate": 0.0004620148690873828, - "loss": 3.5642, + "epoch": 2.31266846361186, + "grad_norm": 0.5646674633026123, + "learning_rate": 0.0004617830545062061, + "loss": 3.5582, "step": 21450 }, { - "epoch": 2.314067377031536, - "grad_norm": 0.5931360125541687, - "learning_rate": 0.00046169162805732134, - "loss": 3.5376, + "epoch": 2.3180592991913747, + "grad_norm": 0.5868499875068665, + "learning_rate": 0.0004614592552617377, + "loss": 3.5613, "step": 21500 }, { - "epoch": 2.319448929071144, - "grad_norm": 0.6036872863769531, - "learning_rate": 0.00046136838702725994, - "loss": 3.5702, + "epoch": 2.3234501347708894, + "grad_norm": 0.5620684027671814, + "learning_rate": 0.00046113545601726924, + "loss": 3.5472, "step": 21550 }, { - "epoch": 2.3248304811107525, - "grad_norm": 0.5146020650863647, - "learning_rate": 0.00046104514599719853, - "loss": 3.5693, + "epoch": 2.3288409703504045, + "grad_norm": 0.5650961995124817, + "learning_rate": 0.00046081165677280085, + "loss": 3.5579, "step": 21600 }, { - "epoch": 2.3302120331503606, - "grad_norm": 0.6073507070541382, - "learning_rate": 0.0004607219049671371, - "loss": 3.5649, + "epoch": 2.334231805929919, + "grad_norm": 0.5434009432792664, + "learning_rate": 0.0004604878575283324, + "loss": 3.5593, "step": 21650 }, { - "epoch": 2.3355935851899687, - "grad_norm": 0.57147216796875, - "learning_rate": 0.0004603986639370757, - "loss": 3.5451, + "epoch": 2.339622641509434, + "grad_norm": 0.512363851070404, + "learning_rate": 0.00046016405828386395, + "loss": 3.5606, "step": 21700 }, { - "epoch": 2.340975137229577, - "grad_norm": 0.6783694624900818, - "learning_rate": 0.0004600754229070143, - "loss": 3.5448, + "epoch": 2.3450134770889486, + "grad_norm": 0.6248358488082886, + "learning_rate": 0.00045984025903939555, + "loss": 3.5592, "step": 21750 }, { - "epoch": 2.3463566892691854, - "grad_norm": 0.5610417723655701, - "learning_rate": 0.00045975218187695286, - "loss": 3.5461, + "epoch": 2.3504043126684637, + "grad_norm": 0.5577827095985413, + "learning_rate": 0.0004595164597949271, + "loss": 3.5685, "step": 21800 }, { - "epoch": 2.3517382413087935, - "grad_norm": 0.5262538194656372, - "learning_rate": 0.00045942894084689145, + "epoch": 2.3557951482479784, + "grad_norm": 0.5624589920043945, + "learning_rate": 0.0004591926605504587, "loss": 3.5627, "step": 21850 }, { - "epoch": 2.3571197933484016, - "grad_norm": 0.5298727750778198, - "learning_rate": 0.0004591056998168301, - "loss": 3.5745, + "epoch": 2.361185983827493, + "grad_norm": 0.54544597864151, + "learning_rate": 0.0004588688613059902, + "loss": 3.5481, "step": 21900 }, { - "epoch": 2.3625013453880097, - "grad_norm": 0.5683722496032715, - "learning_rate": 0.00045878245878676864, - "loss": 3.5601, + "epoch": 2.3665768194070083, + "grad_norm": 0.5536805391311646, + "learning_rate": 0.00045854506206152186, + "loss": 3.5621, "step": 21950 }, { - "epoch": 2.3678828974276183, - "grad_norm": 0.5847147703170776, - "learning_rate": 0.00045845921775670723, - "loss": 3.546, + "epoch": 2.371967654986523, + "grad_norm": 0.5469595789909363, + "learning_rate": 0.00045822126281705336, + "loss": 3.562, "step": 22000 }, { - "epoch": 2.3678828974276183, - "eval_accuracy": 0.3645442375343357, - "eval_loss": 3.5517847537994385, - "eval_runtime": 183.8946, - "eval_samples_per_second": 97.942, - "eval_steps_per_second": 6.123, + "epoch": 2.371967654986523, + "eval_accuracy": 0.3649076814440751, + "eval_loss": 3.5507798194885254, + "eval_runtime": 183.8207, + "eval_samples_per_second": 97.981, + "eval_steps_per_second": 6.126, "step": 22000 }, { - "epoch": 2.3732644494672264, - "grad_norm": 0.5403057932853699, - "learning_rate": 0.0004581359767266458, - "loss": 3.5433, + "epoch": 2.3773584905660377, + "grad_norm": 0.5918774604797363, + "learning_rate": 0.00045789746357258497, + "loss": 3.538, "step": 22050 }, { - "epoch": 2.3786460015068345, - "grad_norm": 0.5787127614021301, - "learning_rate": 0.00045781273569658437, - "loss": 3.5425, + "epoch": 2.382749326145553, + "grad_norm": 0.5457159876823425, + "learning_rate": 0.0004575736643281165, + "loss": 3.5609, "step": 22100 }, { - "epoch": 2.384027553546443, - "grad_norm": 0.5755758881568909, - "learning_rate": 0.00045748949466652296, - "loss": 3.5561, + "epoch": 2.3881401617250675, + "grad_norm": 0.5094021558761597, + "learning_rate": 0.00045724986508364807, + "loss": 3.5564, "step": 22150 }, { - "epoch": 2.389409105586051, - "grad_norm": 0.6487693786621094, - "learning_rate": 0.00045716625363646156, - "loss": 3.5523, + "epoch": 2.393530997304582, + "grad_norm": 0.6784635186195374, + "learning_rate": 0.00045692606583917967, + "loss": 3.5821, "step": 22200 }, { - "epoch": 2.3947906576256592, - "grad_norm": 0.531506359577179, - "learning_rate": 0.00045684301260640015, - "loss": 3.5629, + "epoch": 2.398921832884097, + "grad_norm": 0.5692277550697327, + "learning_rate": 0.0004566022665947112, + "loss": 3.5521, "step": 22250 }, { - "epoch": 2.4001722096652673, - "grad_norm": 0.6475794315338135, - "learning_rate": 0.00045651977157633875, - "loss": 3.569, + "epoch": 2.404312668463612, + "grad_norm": 0.616717517375946, + "learning_rate": 0.0004562784673502428, + "loss": 3.5324, "step": 22300 }, { - "epoch": 2.4055537617048754, - "grad_norm": 0.5457855463027954, - "learning_rate": 0.0004561965305462773, - "loss": 3.5683, + "epoch": 2.4097035040431267, + "grad_norm": 0.5351888537406921, + "learning_rate": 0.0004559546681057744, + "loss": 3.5557, "step": 22350 }, { - "epoch": 2.410935313744484, - "grad_norm": 0.5556653141975403, - "learning_rate": 0.0004558732895162159, - "loss": 3.5537, + "epoch": 2.4150943396226414, + "grad_norm": 0.5345177054405212, + "learning_rate": 0.000455630868861306, + "loss": 3.5455, "step": 22400 }, { - "epoch": 2.416316865784092, - "grad_norm": 0.596947968006134, - "learning_rate": 0.0004555565133067557, - "loss": 3.552, + "epoch": 2.420485175202156, + "grad_norm": 0.5832816958427429, + "learning_rate": 0.00045530706961683753, + "loss": 3.5731, "step": 22450 }, { - "epoch": 2.4216984178237, - "grad_norm": 0.552823543548584, - "learning_rate": 0.0004552332722766943, - "loss": 3.5577, + "epoch": 2.4258760107816713, + "grad_norm": 0.6277952194213867, + "learning_rate": 0.00045498327037236914, + "loss": 3.5471, "step": 22500 }, { - "epoch": 2.4270799698633088, - "grad_norm": 0.5624712705612183, - "learning_rate": 0.0004549100312466328, - "loss": 3.5463, + "epoch": 2.431266846361186, + "grad_norm": 0.5768845677375793, + "learning_rate": 0.0004546594711279007, + "loss": 3.553, "step": 22550 }, { - "epoch": 2.432461521902917, - "grad_norm": 0.5931119322776794, - "learning_rate": 0.0004545867902165715, - "loss": 3.5592, + "epoch": 2.4366576819407006, + "grad_norm": 0.5368415713310242, + "learning_rate": 0.0004543356718834322, + "loss": 3.5538, "step": 22600 }, { - "epoch": 2.437843073942525, - "grad_norm": 0.6037808656692505, - "learning_rate": 0.00045426354918651007, - "loss": 3.547, + "epoch": 2.442048517520216, + "grad_norm": 0.6223293542861938, + "learning_rate": 0.0004540118726389638, + "loss": 3.5499, "step": 22650 }, { - "epoch": 2.443224625982133, - "grad_norm": 0.5643433928489685, - "learning_rate": 0.0004539403081564486, - "loss": 3.5637, + "epoch": 2.4474393530997305, + "grad_norm": 0.5658764243125916, + "learning_rate": 0.00045368807339449534, + "loss": 3.5387, "step": 22700 }, { - "epoch": 2.4486061780217416, - "grad_norm": 0.5612767934799194, - "learning_rate": 0.0004536170671263872, - "loss": 3.5429, + "epoch": 2.452830188679245, + "grad_norm": 0.58261638879776, + "learning_rate": 0.00045336427415002694, + "loss": 3.5648, "step": 22750 }, { - "epoch": 2.4539877300613497, - "grad_norm": 0.5697504878044128, - "learning_rate": 0.00045329382609632574, - "loss": 3.5538, + "epoch": 2.4582210242587603, + "grad_norm": 0.5521255731582642, + "learning_rate": 0.0004530404749055585, + "loss": 3.5745, "step": 22800 }, { - "epoch": 2.459369282100958, - "grad_norm": 0.5525127053260803, - "learning_rate": 0.0004529705850662644, - "loss": 3.5529, + "epoch": 2.463611859838275, + "grad_norm": 0.5698715448379517, + "learning_rate": 0.0004527166756610901, + "loss": 3.5392, "step": 22850 }, { - "epoch": 2.464750834140566, - "grad_norm": 0.6191689968109131, - "learning_rate": 0.000452647344036203, - "loss": 3.5386, + "epoch": 2.4690026954177897, + "grad_norm": 0.5453882813453674, + "learning_rate": 0.00045239287641662165, + "loss": 3.5493, "step": 22900 }, { - "epoch": 2.4701323861801745, - "grad_norm": 0.5639016032218933, - "learning_rate": 0.00045232410300614153, - "loss": 3.5754, + "epoch": 2.4743935309973044, + "grad_norm": 0.5743011832237244, + "learning_rate": 0.0004520690771721532, + "loss": 3.553, "step": 22950 }, { - "epoch": 2.4755139382197826, - "grad_norm": 0.6116729378700256, - "learning_rate": 0.0004520008619760801, - "loss": 3.5561, + "epoch": 2.4797843665768196, + "grad_norm": 0.5805160999298096, + "learning_rate": 0.0004517452779276848, + "loss": 3.5459, "step": 23000 }, { - "epoch": 2.4755139382197826, - "eval_accuracy": 0.3660388666591117, - "eval_loss": 3.5395596027374268, - "eval_runtime": 184.1301, - "eval_samples_per_second": 97.817, - "eval_steps_per_second": 6.115, + "epoch": 2.4797843665768196, + "eval_accuracy": 0.3661536041086438, + "eval_loss": 3.5383236408233643, + "eval_runtime": 183.8471, + "eval_samples_per_second": 97.967, + "eval_steps_per_second": 6.125, "step": 23000 }, { - "epoch": 2.4808954902593907, - "grad_norm": 0.5503861904144287, - "learning_rate": 0.0004516776209460187, - "loss": 3.5527, + "epoch": 2.4851752021563343, + "grad_norm": 0.5620664954185486, + "learning_rate": 0.00045142147868321636, + "loss": 3.5437, "step": 23050 }, { - "epoch": 2.4862770422989993, - "grad_norm": 0.5716922283172607, - "learning_rate": 0.00045135437991595726, - "loss": 3.5307, + "epoch": 2.490566037735849, + "grad_norm": 0.5519572496414185, + "learning_rate": 0.00045109767943874796, + "loss": 3.5415, "step": 23100 }, { - "epoch": 2.4916585943386074, - "grad_norm": 0.5459892153739929, - "learning_rate": 0.0004510311388858959, - "loss": 3.5607, + "epoch": 2.4959568733153636, + "grad_norm": 0.5925480723381042, + "learning_rate": 0.0004507738801942795, + "loss": 3.538, "step": 23150 }, { - "epoch": 2.4970401463782155, - "grad_norm": 0.6022793054580688, - "learning_rate": 0.0004507078978558345, - "loss": 3.5287, + "epoch": 2.501347708894879, + "grad_norm": 0.5759234428405762, + "learning_rate": 0.0004504500809498111, + "loss": 3.5515, "step": 23200 }, { - "epoch": 2.5024216984178236, - "grad_norm": 0.586172342300415, - "learning_rate": 0.00045038465682577304, - "loss": 3.5393, + "epoch": 2.5067385444743935, + "grad_norm": 0.5272903442382812, + "learning_rate": 0.0004501262817053426, + "loss": 3.5644, "step": 23250 }, { - "epoch": 2.5078032504574317, - "grad_norm": 0.5649908185005188, - "learning_rate": 0.00045006141579571164, - "loss": 3.5369, + "epoch": 2.512129380053908, + "grad_norm": 0.567111074924469, + "learning_rate": 0.00044980248246087427, + "loss": 3.5624, "step": 23300 }, { - "epoch": 2.5131848024970402, - "grad_norm": 0.6035097241401672, - "learning_rate": 0.0004497381747656502, - "loss": 3.5708, + "epoch": 2.5175202156334233, + "grad_norm": 0.5605776906013489, + "learning_rate": 0.00044947868321640577, + "loss": 3.5369, "step": 23350 }, { - "epoch": 2.5185663545366483, - "grad_norm": 0.5504043698310852, - "learning_rate": 0.00044941493373558877, - "loss": 3.5433, + "epoch": 2.522911051212938, + "grad_norm": 0.5629092454910278, + "learning_rate": 0.0004491548839719373, + "loss": 3.5407, "step": 23400 }, { - "epoch": 2.5239479065762565, - "grad_norm": 0.5592088103294373, - "learning_rate": 0.0004490916927055274, - "loss": 3.5428, + "epoch": 2.5283018867924527, + "grad_norm": 0.5650578141212463, + "learning_rate": 0.00044883756071235827, + "loss": 3.5449, "step": 23450 }, { - "epoch": 2.529329458615865, - "grad_norm": 0.5921136140823364, - "learning_rate": 0.00044876845167546596, - "loss": 3.5478, + "epoch": 2.533692722371968, + "grad_norm": 0.5151700973510742, + "learning_rate": 0.0004485137614678899, + "loss": 3.5247, "step": 23500 }, { - "epoch": 2.534711010655473, - "grad_norm": 0.5895003080368042, - "learning_rate": 0.00044844521064540455, - "loss": 3.5392, + "epoch": 2.5390835579514826, + "grad_norm": 0.6077624559402466, + "learning_rate": 0.0004481899622234214, + "loss": 3.5611, "step": 23550 }, { - "epoch": 2.540092562695081, - "grad_norm": 0.5717202425003052, - "learning_rate": 0.00044812196961534315, - "loss": 3.5425, + "epoch": 2.5444743935309972, + "grad_norm": 0.572530210018158, + "learning_rate": 0.00044786616297895303, + "loss": 3.5533, "step": 23600 }, { - "epoch": 2.5454741147346893, - "grad_norm": 0.5879724025726318, - "learning_rate": 0.0004477987285852817, - "loss": 3.5676, + "epoch": 2.5498652291105124, + "grad_norm": 0.5659880638122559, + "learning_rate": 0.0004475423637344846, + "loss": 3.5459, "step": 23650 }, { - "epoch": 2.550855666774298, - "grad_norm": 0.5525007247924805, - "learning_rate": 0.00044747548755522034, - "loss": 3.5493, + "epoch": 2.555256064690027, + "grad_norm": 0.5345480442047119, + "learning_rate": 0.00044721856449001613, + "loss": 3.5601, "step": 23700 }, { - "epoch": 2.556237218813906, - "grad_norm": 0.6007914543151855, - "learning_rate": 0.00044715224652515893, - "loss": 3.5563, + "epoch": 2.560646900269542, + "grad_norm": 0.5903283953666687, + "learning_rate": 0.00044689476524554774, + "loss": 3.5685, "step": 23750 }, { - "epoch": 2.561618770853514, - "grad_norm": 0.5593198537826538, - "learning_rate": 0.0004468290054950975, - "loss": 3.5568, + "epoch": 2.5660377358490565, + "grad_norm": 0.6142652630805969, + "learning_rate": 0.0004465709660010793, + "loss": 3.5422, "step": 23800 }, { - "epoch": 2.567000322893122, - "grad_norm": 0.5726816058158875, - "learning_rate": 0.00044650576446503607, - "loss": 3.5613, + "epoch": 2.571428571428571, + "grad_norm": 0.5753684043884277, + "learning_rate": 0.0004462471667566109, + "loss": 3.5405, "step": 23850 }, { - "epoch": 2.5723818749327307, - "grad_norm": 0.5541634559631348, - "learning_rate": 0.0004461825234349746, - "loss": 3.5451, + "epoch": 2.5768194070080863, + "grad_norm": 0.6161198616027832, + "learning_rate": 0.00044592336751214244, + "loss": 3.539, "step": 23900 }, { - "epoch": 2.577763426972339, - "grad_norm": 0.545291543006897, - "learning_rate": 0.0004458592824049132, - "loss": 3.5541, + "epoch": 2.582210242587601, + "grad_norm": 0.5361372828483582, + "learning_rate": 0.00044559956826767405, + "loss": 3.5651, "step": 23950 }, { - "epoch": 2.583144979011947, - "grad_norm": 0.5981674194335938, - "learning_rate": 0.00044553604137485185, - "loss": 3.5528, + "epoch": 2.5876010781671157, + "grad_norm": 0.5892603397369385, + "learning_rate": 0.00044527576902320554, + "loss": 3.5462, "step": 24000 }, { - "epoch": 2.583144979011947, - "eval_accuracy": 0.36709540734021967, - "eval_loss": 3.530630588531494, - "eval_runtime": 184.0185, - "eval_samples_per_second": 97.876, - "eval_steps_per_second": 6.119, + "epoch": 2.5876010781671157, + "eval_accuracy": 0.3668596305736623, + "eval_loss": 3.531897783279419, + "eval_runtime": 183.6864, + "eval_samples_per_second": 98.053, + "eval_steps_per_second": 6.13, "step": 24000 }, { - "epoch": 2.5885265310515555, - "grad_norm": 0.6674214005470276, - "learning_rate": 0.0004452128003447904, - "loss": 3.5395, + "epoch": 2.592991913746631, + "grad_norm": 0.6340603828430176, + "learning_rate": 0.00044495196977873715, + "loss": 3.5501, "step": 24050 }, { - "epoch": 2.5939080830911636, - "grad_norm": 0.5705569982528687, - "learning_rate": 0.000444889559314729, - "loss": 3.5585, + "epoch": 2.5983827493261455, + "grad_norm": 0.597061812877655, + "learning_rate": 0.0004446281705342687, + "loss": 3.5263, "step": 24100 }, { - "epoch": 2.5992896351307717, - "grad_norm": 0.5629080533981323, - "learning_rate": 0.0004445663182846676, - "loss": 3.5464, + "epoch": 2.6037735849056602, + "grad_norm": 0.5808542370796204, + "learning_rate": 0.00044430437128980025, + "loss": 3.5526, "step": 24150 }, { - "epoch": 2.60467118717038, - "grad_norm": 0.5345271229743958, - "learning_rate": 0.0004442430772546061, - "loss": 3.547, + "epoch": 2.6091644204851754, + "grad_norm": 0.5798895955085754, + "learning_rate": 0.00044398057204533185, + "loss": 3.5424, "step": 24200 }, { - "epoch": 2.610052739209988, - "grad_norm": 0.6864102482795715, - "learning_rate": 0.0004439198362245447, - "loss": 3.5398, + "epoch": 2.61455525606469, + "grad_norm": 0.5814908742904663, + "learning_rate": 0.0004436567728008634, + "loss": 3.5475, "step": 24250 }, { - "epoch": 2.6154342912495965, - "grad_norm": 0.5583524107933044, - "learning_rate": 0.00044359659519448337, - "loss": 3.534, + "epoch": 2.6199460916442048, + "grad_norm": 0.5761340856552124, + "learning_rate": 0.000443332973556395, + "loss": 3.5534, "step": 24300 }, { - "epoch": 2.6208158432892046, - "grad_norm": 0.5800455808639526, - "learning_rate": 0.0004432733541644219, - "loss": 3.5389, + "epoch": 2.62533692722372, + "grad_norm": 0.5729596614837646, + "learning_rate": 0.00044300917431192656, + "loss": 3.5558, "step": 24350 }, { - "epoch": 2.6261973953288127, - "grad_norm": 0.577061653137207, - "learning_rate": 0.0004429501131343605, - "loss": 3.5386, + "epoch": 2.6307277628032346, + "grad_norm": 0.5785598754882812, + "learning_rate": 0.00044268537506745816, + "loss": 3.5308, "step": 24400 }, { - "epoch": 2.6315789473684212, - "grad_norm": 0.5940808653831482, - "learning_rate": 0.00044262687210429904, - "loss": 3.5421, + "epoch": 2.6361185983827493, + "grad_norm": 0.5517538785934448, + "learning_rate": 0.0004423615758229897, + "loss": 3.5567, "step": 24450 }, { - "epoch": 2.6369604994080293, - "grad_norm": 0.55911785364151, - "learning_rate": 0.00044230363107423764, - "loss": 3.5452, + "epoch": 2.641509433962264, + "grad_norm": 0.5444765686988831, + "learning_rate": 0.0004420377765785213, + "loss": 3.5214, "step": 24500 }, { - "epoch": 2.6423420514476375, - "grad_norm": 0.6087802648544312, - "learning_rate": 0.0004419803900441762, - "loss": 3.5462, + "epoch": 2.6469002695417787, + "grad_norm": 0.5362924933433533, + "learning_rate": 0.00044171397733405287, + "loss": 3.5284, "step": 24550 }, { - "epoch": 2.6477236034872456, - "grad_norm": 0.5640580654144287, - "learning_rate": 0.0004416571490141148, - "loss": 3.5278, + "epoch": 2.652291105121294, + "grad_norm": 0.5639698505401611, + "learning_rate": 0.00044139017808958437, + "loss": 3.5345, "step": 24600 }, { - "epoch": 2.653105155526854, - "grad_norm": 0.6379339098930359, - "learning_rate": 0.0004413339079840534, - "loss": 3.5637, + "epoch": 2.6576819407008085, + "grad_norm": 0.5087600350379944, + "learning_rate": 0.00044106637884511597, + "loss": 3.5388, "step": 24650 }, { - "epoch": 2.658486707566462, - "grad_norm": 0.5877749919891357, - "learning_rate": 0.00044101066695399196, - "loss": 3.5535, + "epoch": 2.6630727762803232, + "grad_norm": 0.520511269569397, + "learning_rate": 0.0004407425796006475, + "loss": 3.5433, "step": 24700 }, { - "epoch": 2.6638682596060703, - "grad_norm": 0.5544137954711914, - "learning_rate": 0.0004406938907445318, - "loss": 3.5375, + "epoch": 2.6684636118598384, + "grad_norm": 0.5917512774467468, + "learning_rate": 0.00044041878035617913, + "loss": 3.5557, "step": 24750 }, { - "epoch": 2.6692498116456784, - "grad_norm": 0.5193669199943542, - "learning_rate": 0.00044037064971447036, - "loss": 3.5324, + "epoch": 2.673854447439353, + "grad_norm": 0.5649670958518982, + "learning_rate": 0.0004400949811117107, + "loss": 3.5326, "step": 24800 }, { - "epoch": 2.674631363685287, - "grad_norm": 0.5993558764457703, - "learning_rate": 0.00044004740868440896, - "loss": 3.5171, + "epoch": 2.6792452830188678, + "grad_norm": 0.5855710506439209, + "learning_rate": 0.0004397711818672423, + "loss": 3.5349, "step": 24850 }, { - "epoch": 2.680012915724895, - "grad_norm": 0.606824517250061, - "learning_rate": 0.00043972416765434755, - "loss": 3.5454, + "epoch": 2.684636118598383, + "grad_norm": 0.5762926936149597, + "learning_rate": 0.00043944738262277383, + "loss": 3.5423, "step": 24900 }, { - "epoch": 2.685394467764503, - "grad_norm": 0.5386514067649841, - "learning_rate": 0.00043940092662428615, - "loss": 3.5478, + "epoch": 2.6900269541778976, + "grad_norm": 0.5336632132530212, + "learning_rate": 0.00043912358337830544, + "loss": 3.5393, "step": 24950 }, { - "epoch": 2.6907760198041117, - "grad_norm": 0.5610313415527344, - "learning_rate": 0.00043907768559422474, - "loss": 3.5447, + "epoch": 2.6954177897574123, + "grad_norm": 0.5700528621673584, + "learning_rate": 0.000438799784133837, + "loss": 3.5523, "step": 25000 }, { - "epoch": 2.6907760198041117, - "eval_accuracy": 0.36763606410998456, - "eval_loss": 3.5236897468566895, - "eval_runtime": 184.5139, - "eval_samples_per_second": 97.613, - "eval_steps_per_second": 6.103, + "epoch": 2.6954177897574123, + "eval_accuracy": 0.367980167805693, + "eval_loss": 3.522153615951538, + "eval_runtime": 184.4043, + "eval_samples_per_second": 97.671, + "eval_steps_per_second": 6.106, "step": 25000 }, { - "epoch": 2.69615757184372, - "grad_norm": 0.5900009274482727, - "learning_rate": 0.00043875444456416334, - "loss": 3.5374, + "epoch": 2.7008086253369274, + "grad_norm": 0.6027398705482483, + "learning_rate": 0.00043847598488936854, + "loss": 3.5516, "step": 25050 }, { - "epoch": 2.701539123883328, - "grad_norm": 0.5528219938278198, - "learning_rate": 0.0004384312035341019, - "loss": 3.5508, + "epoch": 2.706199460916442, + "grad_norm": 0.5879538059234619, + "learning_rate": 0.00043815218564490014, + "loss": 3.5622, "step": 25100 }, { - "epoch": 2.706920675922936, - "grad_norm": 0.5879670977592468, - "learning_rate": 0.00043810796250404047, - "loss": 3.5379, + "epoch": 2.711590296495957, + "grad_norm": 0.5743058323860168, + "learning_rate": 0.0004378283864004317, + "loss": 3.5443, "step": 25150 }, { - "epoch": 2.712302227962544, - "grad_norm": 0.6659033894538879, - "learning_rate": 0.000437784721473979, - "loss": 3.539, + "epoch": 2.7169811320754715, + "grad_norm": 0.5928617715835571, + "learning_rate": 0.0004375045871559633, + "loss": 3.5291, "step": 25200 }, { - "epoch": 2.7176837800021527, - "grad_norm": 0.5466210246086121, - "learning_rate": 0.00043746148044391766, - "loss": 3.5477, + "epoch": 2.7223719676549867, + "grad_norm": 0.5669053196907043, + "learning_rate": 0.00043718078791149485, + "loss": 3.5318, "step": 25250 }, { - "epoch": 2.723065332041761, - "grad_norm": 0.5327200889587402, - "learning_rate": 0.00043713823941385625, - "loss": 3.546, + "epoch": 2.7277628032345014, + "grad_norm": 0.578813374042511, + "learning_rate": 0.00043685698866702645, + "loss": 3.5408, "step": 25300 }, { - "epoch": 2.728446884081369, - "grad_norm": 0.5748312473297119, - "learning_rate": 0.0004368149983837948, - "loss": 3.5431, + "epoch": 2.733153638814016, + "grad_norm": 0.538439929485321, + "learning_rate": 0.00043653318942255795, + "loss": 3.5313, "step": 25350 }, { - "epoch": 2.7338284361209775, - "grad_norm": 0.5589819550514221, - "learning_rate": 0.0004364917573537334, - "loss": 3.5408, + "epoch": 2.7385444743935308, + "grad_norm": 0.5391579270362854, + "learning_rate": 0.00043620939017808956, + "loss": 3.5237, "step": 25400 }, { - "epoch": 2.7392099881605856, - "grad_norm": 0.5818286538124084, - "learning_rate": 0.00043616851632367193, - "loss": 3.541, + "epoch": 2.743935309973046, + "grad_norm": 0.5517474412918091, + "learning_rate": 0.0004358855909336211, + "loss": 3.5417, "step": 25450 }, { - "epoch": 2.7445915402001937, - "grad_norm": 0.6057955026626587, - "learning_rate": 0.0004358452752936106, - "loss": 3.5448, + "epoch": 2.7493261455525606, + "grad_norm": 0.5709769129753113, + "learning_rate": 0.00043556179168915266, + "loss": 3.5577, "step": 25500 }, { - "epoch": 2.749973092239802, - "grad_norm": 0.5754573345184326, - "learning_rate": 0.0004355220342635492, - "loss": 3.5289, + "epoch": 2.7547169811320753, + "grad_norm": 0.6115706562995911, + "learning_rate": 0.00043523799244468426, + "loss": 3.5518, "step": 25550 }, { - "epoch": 2.7553546442794103, - "grad_norm": 0.5715060830116272, - "learning_rate": 0.00043519879323348777, - "loss": 3.5369, + "epoch": 2.7601078167115904, + "grad_norm": 0.58238685131073, + "learning_rate": 0.0004349141932002158, + "loss": 3.5414, "step": 25600 }, { - "epoch": 2.7607361963190185, - "grad_norm": 0.6356561779975891, - "learning_rate": 0.0004348755522034263, - "loss": 3.5337, + "epoch": 2.765498652291105, + "grad_norm": 0.5621404051780701, + "learning_rate": 0.0004345903939557474, + "loss": 3.5288, "step": 25650 }, { - "epoch": 2.7661177483586266, - "grad_norm": 0.5628809928894043, - "learning_rate": 0.0004345523111733649, - "loss": 3.5237, + "epoch": 2.77088948787062, + "grad_norm": 0.5885303616523743, + "learning_rate": 0.00043426659471127897, + "loss": 3.5514, "step": 25700 }, { - "epoch": 2.7714993003982347, - "grad_norm": 0.5270907282829285, - "learning_rate": 0.00043422907014330344, - "loss": 3.5477, + "epoch": 2.776280323450135, + "grad_norm": 0.5688592195510864, + "learning_rate": 0.00043394279546681057, + "loss": 3.5349, "step": 25750 }, { - "epoch": 2.776880852437843, - "grad_norm": 0.6290378570556641, - "learning_rate": 0.0004339058291132421, - "loss": 3.5433, + "epoch": 2.7816711590296497, + "grad_norm": 0.5848641395568848, + "learning_rate": 0.0004336189962223421, + "loss": 3.5355, "step": 25800 }, { - "epoch": 2.7822624044774513, - "grad_norm": 0.5788700580596924, - "learning_rate": 0.0004335825880831807, - "loss": 3.527, + "epoch": 2.7870619946091644, + "grad_norm": 0.5531324744224548, + "learning_rate": 0.0004332951969778737, + "loss": 3.523, "step": 25850 }, { - "epoch": 2.7876439565170594, - "grad_norm": 0.5836464166641235, - "learning_rate": 0.00043325934705311923, - "loss": 3.5346, + "epoch": 2.7924528301886795, + "grad_norm": 0.5385251045227051, + "learning_rate": 0.0004329713977334053, + "loss": 3.5402, "step": 25900 }, { - "epoch": 2.793025508556668, - "grad_norm": 0.5945377349853516, - "learning_rate": 0.0004329361060230578, - "loss": 3.5364, + "epoch": 2.797843665768194, + "grad_norm": 0.5430009961128235, + "learning_rate": 0.0004326475984889368, + "loss": 3.5146, "step": 25950 }, { - "epoch": 2.798407060596276, - "grad_norm": 0.5505304336547852, - "learning_rate": 0.00043261286499299636, - "loss": 3.5386, + "epoch": 2.803234501347709, + "grad_norm": 0.562092661857605, + "learning_rate": 0.0004323237992444684, + "loss": 3.5339, "step": 26000 }, { - "epoch": 2.798407060596276, - "eval_accuracy": 0.3685904710765469, - "eval_loss": 3.5148777961730957, - "eval_runtime": 184.2338, - "eval_samples_per_second": 97.762, - "eval_steps_per_second": 6.112, + "epoch": 2.803234501347709, + "eval_accuracy": 0.36839402665537835, + "eval_loss": 3.515779733657837, + "eval_runtime": 183.9113, + "eval_samples_per_second": 97.933, + "eval_steps_per_second": 6.123, "step": 26000 }, { - "epoch": 2.803788612635884, - "grad_norm": 0.6029397249221802, - "learning_rate": 0.00043228962396293496, - "loss": 3.5251, + "epoch": 2.8086253369272236, + "grad_norm": 0.5559754967689514, + "learning_rate": 0.00043199999999999993, + "loss": 3.539, "step": 26050 }, { - "epoch": 2.8091701646754923, - "grad_norm": 0.640298068523407, - "learning_rate": 0.0004319663829328736, - "loss": 3.5283, + "epoch": 2.8140161725067383, + "grad_norm": 0.5079193711280823, + "learning_rate": 0.00043167620075553153, + "loss": 3.5246, "step": 26100 }, { - "epoch": 2.8145517167151004, - "grad_norm": 0.6309469938278198, - "learning_rate": 0.00043164314190281215, - "loss": 3.5271, + "epoch": 2.8194070080862534, + "grad_norm": 0.572603702545166, + "learning_rate": 0.0004313524015110631, + "loss": 3.5364, "step": 26150 }, { - "epoch": 2.819933268754709, - "grad_norm": 0.5581336617469788, - "learning_rate": 0.00043131990087275074, - "loss": 3.5351, + "epoch": 2.824797843665768, + "grad_norm": 0.6666261553764343, + "learning_rate": 0.0004310286022665947, + "loss": 3.5467, "step": 26200 }, { - "epoch": 2.825314820794317, - "grad_norm": 0.6591759324073792, - "learning_rate": 0.00043099665984268934, - "loss": 3.5215, + "epoch": 2.830188679245283, + "grad_norm": 0.5429426431655884, + "learning_rate": 0.00043070480302212624, + "loss": 3.5337, "step": 26250 }, { - "epoch": 2.830696372833925, - "grad_norm": 0.5295543074607849, - "learning_rate": 0.0004306734188126279, - "loss": 3.5443, + "epoch": 2.835579514824798, + "grad_norm": 0.5463466048240662, + "learning_rate": 0.0004303874797625472, + "loss": 3.5309, "step": 26300 }, { - "epoch": 2.8360779248735337, - "grad_norm": 0.564895749092102, - "learning_rate": 0.00043035017778256647, - "loss": 3.5219, + "epoch": 2.8409703504043127, + "grad_norm": 0.5213525295257568, + "learning_rate": 0.00043006368051807874, + "loss": 3.5448, "step": 26350 }, { - "epoch": 2.841459476913142, - "grad_norm": 0.578912079334259, - "learning_rate": 0.0004300269367525051, - "loss": 3.5432, + "epoch": 2.8463611859838274, + "grad_norm": 0.5590488910675049, + "learning_rate": 0.00042973988127361035, + "loss": 3.5269, "step": 26400 }, { - "epoch": 2.84684102895275, - "grad_norm": 1.0079190731048584, - "learning_rate": 0.00042970369572244366, - "loss": 3.5434, + "epoch": 2.8517520215633425, + "grad_norm": 0.5565256476402283, + "learning_rate": 0.0004294160820291419, + "loss": 3.5379, "step": 26450 }, { - "epoch": 2.852222580992358, - "grad_norm": 0.5640878081321716, - "learning_rate": 0.00042938045469238226, - "loss": 3.5186, + "epoch": 2.857142857142857, + "grad_norm": 0.5384224057197571, + "learning_rate": 0.0004290922827846735, + "loss": 3.5254, "step": 26500 }, { - "epoch": 2.857604133031966, - "grad_norm": 0.5524346828460693, - "learning_rate": 0.0004290572136623208, - "loss": 3.5352, + "epoch": 2.862533692722372, + "grad_norm": 0.5617730617523193, + "learning_rate": 0.00042876848354020505, + "loss": 3.5092, "step": 26550 }, { - "epoch": 2.8629856850715747, - "grad_norm": 0.568709671497345, - "learning_rate": 0.0004287339726322594, - "loss": 3.5349, + "epoch": 2.867924528301887, + "grad_norm": 0.5675151944160461, + "learning_rate": 0.00042844468429573655, + "loss": 3.5319, "step": 26600 }, { - "epoch": 2.868367237111183, - "grad_norm": 0.533467710018158, - "learning_rate": 0.00042841073160219804, - "loss": 3.5455, + "epoch": 2.8733153638814017, + "grad_norm": 0.5250852108001709, + "learning_rate": 0.00042812088505126815, + "loss": 3.526, "step": 26650 }, { - "epoch": 2.873748789150791, - "grad_norm": 0.5956966280937195, - "learning_rate": 0.0004280874905721366, - "loss": 3.5321, + "epoch": 2.8787061994609164, + "grad_norm": 0.5593746304512024, + "learning_rate": 0.0004277970858067997, + "loss": 3.5234, "step": 26700 }, { - "epoch": 2.8791303411903995, - "grad_norm": 0.5749139785766602, - "learning_rate": 0.0004277642495420752, - "loss": 3.5357, + "epoch": 2.884097035040431, + "grad_norm": 0.5367515683174133, + "learning_rate": 0.0004274732865623313, + "loss": 3.527, "step": 26750 }, { - "epoch": 2.8845118932300076, - "grad_norm": 0.6081565022468567, - "learning_rate": 0.00042744100851201377, - "loss": 3.514, + "epoch": 2.889487870619946, + "grad_norm": 0.588120698928833, + "learning_rate": 0.00042714948731786286, + "loss": 3.5074, "step": 26800 }, { - "epoch": 2.8898934452696157, - "grad_norm": 0.5851724743843079, - "learning_rate": 0.0004271177674819523, - "loss": 3.5226, + "epoch": 2.894878706199461, + "grad_norm": 0.5879933834075928, + "learning_rate": 0.00042682568807339447, + "loss": 3.5276, "step": 26850 }, { - "epoch": 2.895274997309224, - "grad_norm": 0.6097344160079956, - "learning_rate": 0.00042680099127249217, - "loss": 3.5233, + "epoch": 2.9002695417789757, + "grad_norm": 0.5280848741531372, + "learning_rate": 0.000426501888828926, + "loss": 3.5137, "step": 26900 }, { - "epoch": 2.9006565493488323, - "grad_norm": 0.575410008430481, - "learning_rate": 0.0004264777502424307, - "loss": 3.5413, + "epoch": 2.9056603773584904, + "grad_norm": 0.6295096278190613, + "learning_rate": 0.0004261780895844576, + "loss": 3.5142, "step": 26950 }, { - "epoch": 2.9060381013884404, - "grad_norm": 0.6111117601394653, - "learning_rate": 0.0004261545092123693, - "loss": 3.5145, + "epoch": 2.9110512129380055, + "grad_norm": 0.6104944944381714, + "learning_rate": 0.00042585429033998917, + "loss": 3.5378, "step": 27000 }, { - "epoch": 2.9060381013884404, - "eval_accuracy": 0.36941308209019036, - "eval_loss": 3.5066769123077393, - "eval_runtime": 184.6661, - "eval_samples_per_second": 97.533, - "eval_steps_per_second": 6.097, + "epoch": 2.9110512129380055, + "eval_accuracy": 0.3697847836194037, + "eval_loss": 3.5046472549438477, + "eval_runtime": 184.1441, + "eval_samples_per_second": 97.809, + "eval_steps_per_second": 6.115, "step": 27000 }, { - "epoch": 2.9114196534280485, - "grad_norm": 0.5597397089004517, - "learning_rate": 0.00042583126818230795, - "loss": 3.5104, + "epoch": 2.91644204851752, + "grad_norm": 0.5822877883911133, + "learning_rate": 0.0004255304910955207, + "loss": 3.5514, "step": 27050 }, { - "epoch": 2.9168012054676566, - "grad_norm": 0.5836156010627747, - "learning_rate": 0.0004255080271522465, - "loss": 3.5324, + "epoch": 2.921832884097035, + "grad_norm": 0.5345104336738586, + "learning_rate": 0.0004252066918510523, + "loss": 3.5334, "step": 27100 }, { - "epoch": 2.922182757507265, - "grad_norm": 0.5847307443618774, - "learning_rate": 0.0004251847861221851, - "loss": 3.5126, + "epoch": 2.92722371967655, + "grad_norm": 0.5554478168487549, + "learning_rate": 0.0004248828926065839, + "loss": 3.5461, "step": 27150 }, { - "epoch": 2.9275643095468733, - "grad_norm": 0.5847127437591553, - "learning_rate": 0.00042486154509212363, - "loss": 3.5196, + "epoch": 2.9326145552560647, + "grad_norm": 0.5697243213653564, + "learning_rate": 0.0004245590933621155, + "loss": 3.5349, "step": 27200 }, { - "epoch": 2.9329458615864814, - "grad_norm": 0.5771723985671997, - "learning_rate": 0.0004245383040620622, - "loss": 3.5229, + "epoch": 2.9380053908355794, + "grad_norm": 0.570512056350708, + "learning_rate": 0.00042423529411764703, + "loss": 3.5493, "step": 27250 }, { - "epoch": 2.93832741362609, - "grad_norm": 0.5682346224784851, - "learning_rate": 0.0004242150630320009, - "loss": 3.5362, + "epoch": 2.9433962264150946, + "grad_norm": 0.623167872428894, + "learning_rate": 0.00042391149487317864, + "loss": 3.5298, "step": 27300 }, { - "epoch": 2.943708965665698, - "grad_norm": 0.600759744644165, - "learning_rate": 0.0004238918220019394, - "loss": 3.5464, + "epoch": 2.9487870619946093, + "grad_norm": 0.5901476144790649, + "learning_rate": 0.00042358769562871013, + "loss": 3.5171, "step": 27350 }, { - "epoch": 2.949090517705306, - "grad_norm": 0.5634221434593201, - "learning_rate": 0.000423568580971878, - "loss": 3.5436, + "epoch": 2.954177897574124, + "grad_norm": 0.5781039595603943, + "learning_rate": 0.00042326389638424174, + "loss": 3.5172, "step": 27400 }, { - "epoch": 2.9544720697449143, - "grad_norm": 0.5330511331558228, - "learning_rate": 0.00042324533994181655, - "loss": 3.5156, + "epoch": 2.9595687331536387, + "grad_norm": 0.5595638751983643, + "learning_rate": 0.0004229400971397733, + "loss": 3.5319, "step": 27450 }, { - "epoch": 2.9598536217845224, - "grad_norm": 0.6540337800979614, - "learning_rate": 0.00042292209891175514, - "loss": 3.5174, + "epoch": 2.964959568733154, + "grad_norm": 0.5504574179649353, + "learning_rate": 0.00042261629789530484, + "loss": 3.5351, "step": 27500 }, { - "epoch": 2.965235173824131, - "grad_norm": 0.5485448241233826, - "learning_rate": 0.00042259885788169374, - "loss": 3.5411, + "epoch": 2.9703504043126685, + "grad_norm": 0.5639024376869202, + "learning_rate": 0.00042229249865083644, + "loss": 3.5209, "step": 27550 }, { - "epoch": 2.970616725863739, - "grad_norm": 0.5691707134246826, - "learning_rate": 0.00042227561685163233, - "loss": 3.5295, + "epoch": 2.975741239892183, + "grad_norm": 0.5995092391967773, + "learning_rate": 0.000421968699406368, + "loss": 3.5227, "step": 27600 }, { - "epoch": 2.975998277903347, - "grad_norm": 0.5650212168693542, - "learning_rate": 0.00042195237582157093, - "loss": 3.5074, + "epoch": 2.981132075471698, + "grad_norm": 0.5549217462539673, + "learning_rate": 0.0004216449001618996, + "loss": 3.5435, "step": 27650 }, { - "epoch": 2.9813798299429557, - "grad_norm": 0.6257863640785217, - "learning_rate": 0.0004216291347915095, - "loss": 3.5218, + "epoch": 2.986522911051213, + "grad_norm": 0.6059816479682922, + "learning_rate": 0.00042132110091743115, + "loss": 3.5094, "step": 27700 }, { - "epoch": 2.986761381982564, - "grad_norm": 0.5358322262763977, - "learning_rate": 0.00042130589376144806, - "loss": 3.527, + "epoch": 2.9919137466307277, + "grad_norm": 0.5828582048416138, + "learning_rate": 0.00042099730167296275, + "loss": 3.5018, "step": 27750 }, { - "epoch": 2.992142934022172, - "grad_norm": 0.5799550414085388, - "learning_rate": 0.00042098265273138666, - "loss": 3.5273, + "epoch": 2.9973045822102424, + "grad_norm": 0.6414007544517517, + "learning_rate": 0.0004206735024284943, + "loss": 3.5228, "step": 27800 }, { - "epoch": 2.9975244860617805, - "grad_norm": 0.5868208408355713, - "learning_rate": 0.0004206594117013252, - "loss": 3.539, + "epoch": 3.0026954177897576, + "grad_norm": 0.5754533410072327, + "learning_rate": 0.0004203497031840259, + "loss": 3.4827, "step": 27850 }, { - "epoch": 3.0029060381013886, - "grad_norm": 0.599449872970581, - "learning_rate": 0.00042033617067126385, - "loss": 3.4554, + "epoch": 3.0080862533692723, + "grad_norm": 0.5822769403457642, + "learning_rate": 0.00042002590393955746, + "loss": 3.428, "step": 27900 }, { - "epoch": 3.0082875901409967, - "grad_norm": 0.6563495993614197, - "learning_rate": 0.00042001292964120244, - "loss": 3.4225, + "epoch": 3.013477088948787, + "grad_norm": 0.6208178997039795, + "learning_rate": 0.00041970210469508896, + "loss": 3.4073, "step": 27950 }, { - "epoch": 3.0136691421806048, - "grad_norm": 0.6105541586875916, - "learning_rate": 0.000419689688611141, - "loss": 3.4242, + "epoch": 3.018867924528302, + "grad_norm": 0.5660999417304993, + "learning_rate": 0.00041937830545062056, + "loss": 3.4285, "step": 28000 }, { - "epoch": 3.0136691421806048, - "eval_accuracy": 0.3703614044950352, - "eval_loss": 3.4996602535247803, - "eval_runtime": 184.2896, - "eval_samples_per_second": 97.732, - "eval_steps_per_second": 6.11, + "epoch": 3.018867924528302, + "eval_accuracy": 0.37032391924873914, + "eval_loss": 3.5003163814544678, + "eval_runtime": 184.587, + "eval_samples_per_second": 97.575, + "eval_steps_per_second": 6.1, "step": 28000 }, { - "epoch": 3.0190506942202133, - "grad_norm": 0.6090134978294373, - "learning_rate": 0.0004193664475810796, - "loss": 3.4247, + "epoch": 3.024258760107817, + "grad_norm": 0.6071078181266785, + "learning_rate": 0.0004190545062061521, + "loss": 3.4211, "step": 28050 }, { - "epoch": 3.0244322462598214, - "grad_norm": 0.6373844742774963, - "learning_rate": 0.00041904320655101817, - "loss": 3.4486, + "epoch": 3.0296495956873315, + "grad_norm": 0.5652373433113098, + "learning_rate": 0.0004187307069616837, + "loss": 3.4348, "step": 28100 }, { - "epoch": 3.0298137982994295, - "grad_norm": 0.6366713643074036, - "learning_rate": 0.0004187199655209567, - "loss": 3.4378, + "epoch": 3.035040431266846, + "grad_norm": 0.5970408916473389, + "learning_rate": 0.00041840690771721527, + "loss": 3.4384, "step": 28150 }, { - "epoch": 3.0351953503390376, - "grad_norm": 0.5975115895271301, - "learning_rate": 0.00041839672449089536, - "loss": 3.4535, + "epoch": 3.0404312668463613, + "grad_norm": 0.5694037079811096, + "learning_rate": 0.00041808310847274687, + "loss": 3.4361, "step": 28200 }, { - "epoch": 3.040576902378646, - "grad_norm": 0.6184483170509338, - "learning_rate": 0.00041807348346083395, - "loss": 3.4623, + "epoch": 3.045822102425876, + "grad_norm": 0.6052969694137573, + "learning_rate": 0.0004177593092282784, + "loss": 3.4426, "step": 28250 }, { - "epoch": 3.0459584544182543, - "grad_norm": 0.5912527441978455, - "learning_rate": 0.0004177502424307725, - "loss": 3.4439, + "epoch": 3.0512129380053907, + "grad_norm": 0.6220088005065918, + "learning_rate": 0.0004174419859686994, + "loss": 3.442, "step": 28300 }, { - "epoch": 3.0513400064578624, - "grad_norm": 0.6498454809188843, - "learning_rate": 0.0004174270014007111, - "loss": 3.4491, + "epoch": 3.056603773584906, + "grad_norm": 0.6056026220321655, + "learning_rate": 0.0004171181867242309, + "loss": 3.4497, "step": 28350 }, { - "epoch": 3.0567215584974705, - "grad_norm": 0.574679434299469, - "learning_rate": 0.00041710376037064963, - "loss": 3.4435, + "epoch": 3.0619946091644206, + "grad_norm": 0.588292121887207, + "learning_rate": 0.00041679438747976253, + "loss": 3.4249, "step": 28400 }, { - "epoch": 3.062103110537079, - "grad_norm": 0.601494312286377, - "learning_rate": 0.0004167805193405883, - "loss": 3.4275, + "epoch": 3.0673854447439353, + "grad_norm": 0.6074293851852417, + "learning_rate": 0.0004164705882352941, + "loss": 3.4462, "step": 28450 }, { - "epoch": 3.067484662576687, - "grad_norm": 0.5695728063583374, - "learning_rate": 0.0004164572783105269, - "loss": 3.4594, + "epoch": 3.07277628032345, + "grad_norm": 0.587044894695282, + "learning_rate": 0.0004161467889908257, + "loss": 3.4316, "step": 28500 }, { - "epoch": 3.0728662146162953, - "grad_norm": 0.5873692035675049, - "learning_rate": 0.0004161340372804654, - "loss": 3.4718, + "epoch": 3.078167115902965, + "grad_norm": 0.5824921727180481, + "learning_rate": 0.00041582298974635724, + "loss": 3.429, "step": 28550 }, { - "epoch": 3.0782477666559034, - "grad_norm": 0.5661360025405884, - "learning_rate": 0.000415810796250404, - "loss": 3.4408, + "epoch": 3.08355795148248, + "grad_norm": 0.5803788304328918, + "learning_rate": 0.00041549919050188884, + "loss": 3.4512, "step": 28600 }, { - "epoch": 3.083629318695512, - "grad_norm": 0.6276664137840271, - "learning_rate": 0.0004154875552203426, - "loss": 3.4421, + "epoch": 3.0889487870619945, + "grad_norm": 0.6797330379486084, + "learning_rate": 0.00041517539125742034, + "loss": 3.4525, "step": 28650 }, { - "epoch": 3.08901087073512, - "grad_norm": 0.5694606900215149, - "learning_rate": 0.00041516431419028114, - "loss": 3.4527, + "epoch": 3.0943396226415096, + "grad_norm": 0.5927037000656128, + "learning_rate": 0.0004148515920129519, + "loss": 3.4539, "step": 28700 }, { - "epoch": 3.094392422774728, - "grad_norm": 0.6117746829986572, - "learning_rate": 0.0004148410731602198, - "loss": 3.4553, + "epoch": 3.0997304582210243, + "grad_norm": 0.5757393836975098, + "learning_rate": 0.0004145277927684835, + "loss": 3.4479, "step": 28750 }, { - "epoch": 3.0997739748143363, - "grad_norm": 0.5743106603622437, - "learning_rate": 0.0004145178321301584, - "loss": 3.4538, + "epoch": 3.105121293800539, + "grad_norm": 0.6510079503059387, + "learning_rate": 0.00041420399352401504, + "loss": 3.4463, "step": 28800 }, { - "epoch": 3.105155526853945, - "grad_norm": 0.6104516983032227, - "learning_rate": 0.00041419459110009693, - "loss": 3.4521, + "epoch": 3.1105121293800537, + "grad_norm": 0.6105151176452637, + "learning_rate": 0.00041388019427954665, + "loss": 3.4525, "step": 28850 }, { - "epoch": 3.110537078893553, - "grad_norm": 0.6102493405342102, - "learning_rate": 0.0004138713500700355, - "loss": 3.4463, + "epoch": 3.115902964959569, + "grad_norm": 0.6211234927177429, + "learning_rate": 0.0004135563950350782, + "loss": 3.4499, "step": 28900 }, { - "epoch": 3.115918630933161, - "grad_norm": 0.6518812775611877, - "learning_rate": 0.00041354810903997406, - "loss": 3.4416, + "epoch": 3.1212938005390836, + "grad_norm": 0.5854358673095703, + "learning_rate": 0.0004132325957906098, + "loss": 3.4444, "step": 28950 }, { - "epoch": 3.121300182972769, - "grad_norm": 0.5711073875427246, - "learning_rate": 0.00041322486800991266, - "loss": 3.4594, + "epoch": 3.1266846361185983, + "grad_norm": 0.5977933406829834, + "learning_rate": 0.00041290879654614135, + "loss": 3.4526, "step": 29000 }, { - "epoch": 3.121300182972769, - "eval_accuracy": 0.37099832772340363, - "eval_loss": 3.499441385269165, - "eval_runtime": 184.2527, - "eval_samples_per_second": 97.752, - "eval_steps_per_second": 6.111, + "epoch": 3.1266846361185983, + "eval_accuracy": 0.3711602205262472, + "eval_loss": 3.493985652923584, + "eval_runtime": 183.758, + "eval_samples_per_second": 98.015, + "eval_steps_per_second": 6.128, "step": 29000 }, { - "epoch": 3.1266817350123777, - "grad_norm": 0.5713046789169312, - "learning_rate": 0.00041290809180045246, - "loss": 3.4588, + "epoch": 3.1320754716981134, + "grad_norm": 0.5728035569190979, + "learning_rate": 0.00041258499730167296, + "loss": 3.4309, "step": 29050 }, { - "epoch": 3.132063287051986, - "grad_norm": 0.6167808175086975, - "learning_rate": 0.0004125848507703911, - "loss": 3.4449, + "epoch": 3.137466307277628, + "grad_norm": 0.5746806263923645, + "learning_rate": 0.0004122611980572045, + "loss": 3.4581, "step": 29100 }, { - "epoch": 3.137444839091594, - "grad_norm": 0.5783477425575256, - "learning_rate": 0.0004122616097403297, - "loss": 3.4579, + "epoch": 3.142857142857143, + "grad_norm": 0.5982994437217712, + "learning_rate": 0.00041193739881273606, + "loss": 3.4529, "step": 29150 }, { - "epoch": 3.1428263911312024, - "grad_norm": 0.6156630516052246, - "learning_rate": 0.00041193836871026825, - "loss": 3.4503, + "epoch": 3.1482479784366575, + "grad_norm": 0.5874792337417603, + "learning_rate": 0.00041161359956826766, + "loss": 3.4685, "step": 29200 }, { - "epoch": 3.1482079431708105, - "grad_norm": 0.5773206353187561, - "learning_rate": 0.00041161512768020684, - "loss": 3.457, + "epoch": 3.1536388140161726, + "grad_norm": 0.6152310967445374, + "learning_rate": 0.0004112962763086886, + "loss": 3.4688, "step": 29250 }, { - "epoch": 3.1535894952104186, - "grad_norm": 0.638312816619873, - "learning_rate": 0.0004112918866501454, - "loss": 3.4327, + "epoch": 3.1590296495956873, + "grad_norm": 0.6202549338340759, + "learning_rate": 0.0004109724770642201, + "loss": 3.4607, "step": 29300 }, { - "epoch": 3.1589710472500268, - "grad_norm": 0.5940792560577393, - "learning_rate": 0.000410968645620084, - "loss": 3.4464, + "epoch": 3.164420485175202, + "grad_norm": 0.5819991827011108, + "learning_rate": 0.00041064867781975177, + "loss": 3.4491, "step": 29350 }, { - "epoch": 3.1643525992896353, - "grad_norm": 0.6411643624305725, - "learning_rate": 0.0004106454045900226, - "loss": 3.4502, + "epoch": 3.169811320754717, + "grad_norm": 0.6437973976135254, + "learning_rate": 0.00041032487857528327, + "loss": 3.4589, "step": 29400 }, { - "epoch": 3.1697341513292434, - "grad_norm": 0.627984881401062, - "learning_rate": 0.00041032216355996117, - "loss": 3.4583, + "epoch": 3.175202156334232, + "grad_norm": 0.6172235012054443, + "learning_rate": 0.0004100010793308148, + "loss": 3.4705, "step": 29450 }, { - "epoch": 3.1751157033688515, - "grad_norm": 0.6114353537559509, - "learning_rate": 0.00040999892252989976, - "loss": 3.4434, + "epoch": 3.1805929919137466, + "grad_norm": 0.6386169791221619, + "learning_rate": 0.0004096772800863464, + "loss": 3.4415, "step": 29500 }, { - "epoch": 3.1804972554084596, - "grad_norm": 0.6601144671440125, - "learning_rate": 0.00040967568149983836, - "loss": 3.4406, + "epoch": 3.1859838274932613, + "grad_norm": 0.6090947985649109, + "learning_rate": 0.000409353480841878, + "loss": 3.4567, "step": 29550 }, { - "epoch": 3.185878807448068, - "grad_norm": 0.5516364574432373, - "learning_rate": 0.0004093524404697769, - "loss": 3.4536, + "epoch": 3.1913746630727764, + "grad_norm": 0.6054538488388062, + "learning_rate": 0.0004090296815974096, + "loss": 3.4675, "step": 29600 }, { - "epoch": 3.1912603594876763, - "grad_norm": 0.5681900382041931, - "learning_rate": 0.0004090291994397155, - "loss": 3.4596, + "epoch": 3.196765498652291, + "grad_norm": 0.5757823586463928, + "learning_rate": 0.00040870588235294113, + "loss": 3.4519, "step": 29650 }, { - "epoch": 3.1966419115272844, - "grad_norm": 0.6558794975280762, - "learning_rate": 0.00040870595840965414, - "loss": 3.4553, + "epoch": 3.202156334231806, + "grad_norm": 0.5926768779754639, + "learning_rate": 0.00040838208310847273, + "loss": 3.471, "step": 29700 }, { - "epoch": 3.2020234635668925, - "grad_norm": 0.6110433340072632, - "learning_rate": 0.0004083827173795927, - "loss": 3.4507, + "epoch": 3.207547169811321, + "grad_norm": 0.6421563029289246, + "learning_rate": 0.0004080582838640043, + "loss": 3.4675, "step": 29750 }, { - "epoch": 3.207405015606501, - "grad_norm": 0.6141265630722046, - "learning_rate": 0.0004080594763495313, - "loss": 3.4408, + "epoch": 3.2129380053908356, + "grad_norm": 0.6215593218803406, + "learning_rate": 0.0004077344846195359, + "loss": 3.4446, "step": 29800 }, { - "epoch": 3.212786567646109, - "grad_norm": 0.6038244962692261, - "learning_rate": 0.0004077362353194698, - "loss": 3.4284, + "epoch": 3.2183288409703503, + "grad_norm": 0.6261037588119507, + "learning_rate": 0.00040741068537506744, + "loss": 3.4644, "step": 29850 }, { - "epoch": 3.2181681196857173, - "grad_norm": 0.6001572608947754, - "learning_rate": 0.0004074129942894084, - "loss": 3.4569, + "epoch": 3.223719676549865, + "grad_norm": 0.5810261964797974, + "learning_rate": 0.000407086886130599, + "loss": 3.4587, "step": 29900 }, { - "epoch": 3.2235496717253254, - "grad_norm": 0.6228058338165283, - "learning_rate": 0.000407089753259347, - "loss": 3.4854, + "epoch": 3.22911051212938, + "grad_norm": 0.5672079920768738, + "learning_rate": 0.0004067630868861306, + "loss": 3.4528, "step": 29950 }, { - "epoch": 3.228931223764934, - "grad_norm": 0.6055977940559387, - "learning_rate": 0.0004067665122292856, - "loss": 3.4337, + "epoch": 3.234501347708895, + "grad_norm": 0.605877697467804, + "learning_rate": 0.0004064392876416621, + "loss": 3.4631, "step": 30000 }, { - "epoch": 3.228931223764934, - "eval_accuracy": 0.37154919786462304, - "eval_loss": 3.493218421936035, - "eval_runtime": 184.7775, - "eval_samples_per_second": 97.474, - "eval_steps_per_second": 6.094, + "epoch": 3.234501347708895, + "eval_accuracy": 0.3715522401454819, + "eval_loss": 3.4904942512512207, + "eval_runtime": 183.4308, + "eval_samples_per_second": 98.19, + "eval_steps_per_second": 6.139, "step": 30000 }, { - "epoch": 3.234312775804542, - "grad_norm": 0.6261052489280701, - "learning_rate": 0.0004064432711992242, - "loss": 3.4797, + "epoch": 3.2398921832884096, + "grad_norm": 0.5602097511291504, + "learning_rate": 0.0004061154883971937, + "loss": 3.4415, "step": 30050 }, { - "epoch": 3.23969432784415, - "grad_norm": 0.5575237274169922, - "learning_rate": 0.0004061200301691628, - "loss": 3.4505, + "epoch": 3.2452830188679247, + "grad_norm": 0.594542384147644, + "learning_rate": 0.00040579168915272525, + "loss": 3.4567, "step": 30100 }, { - "epoch": 3.2450758798837587, - "grad_norm": 0.5761787295341492, - "learning_rate": 0.00040579678913910133, - "loss": 3.4694, + "epoch": 3.2506738544474394, + "grad_norm": 0.5522177219390869, + "learning_rate": 0.00040546788990825685, + "loss": 3.4605, "step": 30150 }, { - "epoch": 3.250457431923367, - "grad_norm": 0.5979387760162354, - "learning_rate": 0.0004054735481090399, - "loss": 3.4579, + "epoch": 3.256064690026954, + "grad_norm": 0.6020456552505493, + "learning_rate": 0.0004051440906637884, + "loss": 3.4754, "step": 30200 }, { - "epoch": 3.255838983962975, - "grad_norm": 0.5906158089637756, - "learning_rate": 0.0004051503070789786, - "loss": 3.4587, + "epoch": 3.2614555256064692, + "grad_norm": 0.6003113389015198, + "learning_rate": 0.00040482029141931995, + "loss": 3.4614, "step": 30250 }, { - "epoch": 3.261220536002583, - "grad_norm": 0.6434659361839294, - "learning_rate": 0.0004048270660489171, - "loss": 3.4295, + "epoch": 3.266846361185984, + "grad_norm": 0.6419681310653687, + "learning_rate": 0.00040449649217485156, + "loss": 3.4401, "step": 30300 }, { - "epoch": 3.2666020880421915, - "grad_norm": 0.5845122337341309, - "learning_rate": 0.0004045038250188557, - "loss": 3.4527, + "epoch": 3.2722371967654986, + "grad_norm": 0.5756829380989075, + "learning_rate": 0.0004041726929303831, + "loss": 3.4567, "step": 30350 }, { - "epoch": 3.2719836400817996, - "grad_norm": 0.608493447303772, - "learning_rate": 0.00040418058398879425, - "loss": 3.4514, + "epoch": 3.2776280323450133, + "grad_norm": 0.6107509732246399, + "learning_rate": 0.0004038488936859147, + "loss": 3.4674, "step": 30400 }, { - "epoch": 3.2773651921214078, - "grad_norm": 0.5747212171554565, - "learning_rate": 0.00040385734295873284, - "loss": 3.4804, + "epoch": 3.2830188679245285, + "grad_norm": 0.6611049771308899, + "learning_rate": 0.00040352509444144626, + "loss": 3.4556, "step": 30450 }, { - "epoch": 3.282746744161016, - "grad_norm": 0.6562155485153198, - "learning_rate": 0.00040354056674927265, - "loss": 3.4651, + "epoch": 3.288409703504043, + "grad_norm": 0.6525335311889648, + "learning_rate": 0.00040320129519697787, + "loss": 3.4683, "step": 30500 }, { - "epoch": 3.2881282962006244, - "grad_norm": 0.6439652442932129, - "learning_rate": 0.00040321732571921124, - "loss": 3.4652, + "epoch": 3.293800539083558, + "grad_norm": 0.6149199604988098, + "learning_rate": 0.0004028774959525094, + "loss": 3.4623, "step": 30550 }, { - "epoch": 3.2935098482402325, - "grad_norm": 0.6053032279014587, - "learning_rate": 0.0004028940846891498, - "loss": 3.4706, + "epoch": 3.2991913746630726, + "grad_norm": 0.5852982997894287, + "learning_rate": 0.000402553696708041, + "loss": 3.4567, "step": 30600 }, { - "epoch": 3.2988914002798406, - "grad_norm": 0.6160561442375183, - "learning_rate": 0.00040257084365908843, - "loss": 3.447, + "epoch": 3.3045822102425877, + "grad_norm": 0.631427526473999, + "learning_rate": 0.0004022298974635726, + "loss": 3.4527, "step": 30650 }, { - "epoch": 3.304272952319449, - "grad_norm": 0.6740018129348755, - "learning_rate": 0.00040224760262902703, - "loss": 3.4561, + "epoch": 3.3099730458221024, + "grad_norm": 0.6369751691818237, + "learning_rate": 0.00040190609821910407, + "loss": 3.466, "step": 30700 }, { - "epoch": 3.3096545043590573, - "grad_norm": 0.6057961583137512, - "learning_rate": 0.00040192436159896557, - "loss": 3.4598, + "epoch": 3.315363881401617, + "grad_norm": 0.6102563738822937, + "learning_rate": 0.0004015822989746357, + "loss": 3.4459, "step": 30750 }, { - "epoch": 3.3150360563986654, - "grad_norm": 0.5896490216255188, - "learning_rate": 0.00040160112056890416, - "loss": 3.452, + "epoch": 3.3207547169811322, + "grad_norm": 0.5564303398132324, + "learning_rate": 0.0004012584997301672, + "loss": 3.4402, "step": 30800 }, { - "epoch": 3.3204176084382735, - "grad_norm": 0.6008713245391846, - "learning_rate": 0.00040127787953884276, - "loss": 3.4669, + "epoch": 3.326145552560647, + "grad_norm": 0.5567905902862549, + "learning_rate": 0.00040093470048569883, + "loss": 3.442, "step": 30850 }, { - "epoch": 3.3257991604778816, - "grad_norm": 0.6577219367027283, - "learning_rate": 0.00040095463850878135, - "loss": 3.4536, + "epoch": 3.3315363881401616, + "grad_norm": 0.6052920818328857, + "learning_rate": 0.0004006109012412304, + "loss": 3.4722, "step": 30900 }, { - "epoch": 3.33118071251749, - "grad_norm": 0.581386923789978, - "learning_rate": 0.00040063139747871995, - "loss": 3.4546, + "epoch": 3.3369272237196768, + "grad_norm": 0.6521329879760742, + "learning_rate": 0.000400287101996762, + "loss": 3.4698, "step": 30950 }, { - "epoch": 3.3365622645570983, - "grad_norm": 0.6249150037765503, - "learning_rate": 0.00040030815644865854, - "loss": 3.4576, + "epoch": 3.3423180592991915, + "grad_norm": 0.557958722114563, + "learning_rate": 0.00039996330275229354, + "loss": 3.4485, "step": 31000 }, { - "epoch": 3.3365622645570983, - "eval_accuracy": 0.3725115364919959, - "eval_loss": 3.485410213470459, - "eval_runtime": 184.3657, - "eval_samples_per_second": 97.692, - "eval_steps_per_second": 6.107, + "epoch": 3.3423180592991915, + "eval_accuracy": 0.3723944086789319, + "eval_loss": 3.4821529388427734, + "eval_runtime": 183.7959, + "eval_samples_per_second": 97.995, + "eval_steps_per_second": 6.126, "step": 31000 }, { - "epoch": 3.3419438165967064, - "grad_norm": 0.60781329870224, - "learning_rate": 0.0003999849154185971, - "loss": 3.4657, + "epoch": 3.347708894878706, + "grad_norm": 0.597503125667572, + "learning_rate": 0.00039963950350782514, + "loss": 3.4578, "step": 31050 }, { - "epoch": 3.347325368636315, - "grad_norm": 0.6397443413734436, - "learning_rate": 0.0003996616743885357, - "loss": 3.4562, + "epoch": 3.353099730458221, + "grad_norm": 0.6289362907409668, + "learning_rate": 0.0003993157042633567, + "loss": 3.4768, "step": 31100 }, { - "epoch": 3.352706920675923, - "grad_norm": 0.6920917630195618, - "learning_rate": 0.0003993384333584742, - "loss": 3.4667, + "epoch": 3.358490566037736, + "grad_norm": 0.5845758318901062, + "learning_rate": 0.00039899190501888824, + "loss": 3.4587, "step": 31150 }, { - "epoch": 3.358088472715531, - "grad_norm": 0.6365469098091125, - "learning_rate": 0.00039901519232841287, - "loss": 3.4531, + "epoch": 3.3638814016172507, + "grad_norm": 0.6318169236183167, + "learning_rate": 0.00039866810577441985, + "loss": 3.4795, "step": 31200 }, { - "epoch": 3.3634700247551392, - "grad_norm": 0.6367911696434021, - "learning_rate": 0.00039869195129835146, - "loss": 3.4559, + "epoch": 3.3692722371967654, + "grad_norm": 0.6276087760925293, + "learning_rate": 0.0003983443065299514, + "loss": 3.4764, "step": 31250 }, { - "epoch": 3.368851576794748, - "grad_norm": 0.6479049921035767, - "learning_rate": 0.00039836871026829, - "loss": 3.4599, + "epoch": 3.37466307277628, + "grad_norm": 0.5823773741722107, + "learning_rate": 0.000398020507285483, + "loss": 3.4665, "step": 31300 }, { - "epoch": 3.374233128834356, - "grad_norm": 0.6077846884727478, - "learning_rate": 0.0003980454692382286, - "loss": 3.4647, + "epoch": 3.3800539083557952, + "grad_norm": 0.5732158422470093, + "learning_rate": 0.0003976967080410145, + "loss": 3.4629, "step": 31350 }, { - "epoch": 3.379614680873964, - "grad_norm": 0.610712468624115, - "learning_rate": 0.0003977222282081672, - "loss": 3.4588, + "epoch": 3.38544474393531, + "grad_norm": 0.5848298072814941, + "learning_rate": 0.0003973729087965461, + "loss": 3.4703, "step": 31400 }, { - "epoch": 3.384996232913572, - "grad_norm": 0.6273388862609863, - "learning_rate": 0.00039739898717810573, - "loss": 3.4545, + "epoch": 3.3908355795148246, + "grad_norm": 0.6035614609718323, + "learning_rate": 0.00039704910955207765, + "loss": 3.4623, "step": 31450 }, { - "epoch": 3.3903777849531807, - "grad_norm": 0.5932088494300842, - "learning_rate": 0.0003970757461480444, - "loss": 3.4425, + "epoch": 3.3962264150943398, + "grad_norm": 0.6049636006355286, + "learning_rate": 0.00039672531030760926, + "loss": 3.4645, "step": 31500 }, { - "epoch": 3.3957593369927888, - "grad_norm": 0.5899380445480347, - "learning_rate": 0.000396752505117983, - "loss": 3.4601, + "epoch": 3.4016172506738545, + "grad_norm": 0.6059445738792419, + "learning_rate": 0.0003964015110631408, + "loss": 3.4707, "step": 31550 }, { - "epoch": 3.401140889032397, - "grad_norm": 0.6221842765808105, - "learning_rate": 0.0003964292640879215, - "loss": 3.4652, + "epoch": 3.407008086253369, + "grad_norm": 0.617137610912323, + "learning_rate": 0.00039607771181867236, + "loss": 3.4582, "step": 31600 }, { - "epoch": 3.4065224410720054, - "grad_norm": 0.596818745136261, - "learning_rate": 0.0003961060230578601, - "loss": 3.4464, + "epoch": 3.4123989218328843, + "grad_norm": 0.5670456290245056, + "learning_rate": 0.00039575391257420397, + "loss": 3.4462, "step": 31650 }, { - "epoch": 3.4119039931116135, - "grad_norm": 0.6313436627388, - "learning_rate": 0.00039578278202779865, - "loss": 3.4746, + "epoch": 3.417789757412399, + "grad_norm": 0.6105981469154358, + "learning_rate": 0.0003954301133297355, + "loss": 3.4664, "step": 31700 }, { - "epoch": 3.4172855451512216, - "grad_norm": 0.7080163359642029, - "learning_rate": 0.00039545954099773725, - "loss": 3.4722, + "epoch": 3.4231805929919137, + "grad_norm": 0.5884623527526855, + "learning_rate": 0.0003951063140852671, + "loss": 3.47, "step": 31750 }, { - "epoch": 3.4226670971908297, - "grad_norm": 0.5776293277740479, - "learning_rate": 0.0003951362999676759, - "loss": 3.4666, + "epoch": 3.4285714285714284, + "grad_norm": 0.6073600053787231, + "learning_rate": 0.00039478251484079867, + "loss": 3.4509, "step": 31800 }, { - "epoch": 3.428048649230438, - "grad_norm": 0.6150648593902588, - "learning_rate": 0.00039481305893761444, - "loss": 3.4449, + "epoch": 3.4339622641509435, + "grad_norm": 0.6530110836029053, + "learning_rate": 0.0003944587155963303, + "loss": 3.4647, "step": 31850 }, { - "epoch": 3.4334302012700464, - "grad_norm": 0.654681384563446, - "learning_rate": 0.00039448981790755303, - "loss": 3.4583, + "epoch": 3.439353099730458, + "grad_norm": 0.5859715342521667, + "learning_rate": 0.0003941349163518618, + "loss": 3.4745, "step": 31900 }, { - "epoch": 3.4388117533096545, - "grad_norm": 0.6297449469566345, - "learning_rate": 0.0003941665768774916, - "loss": 3.4802, + "epoch": 3.444743935309973, + "grad_norm": 0.5740375518798828, + "learning_rate": 0.00039381111710739343, + "loss": 3.4645, "step": 31950 }, { - "epoch": 3.4441933053492626, - "grad_norm": 0.5857325792312622, - "learning_rate": 0.00039384333584743016, - "loss": 3.4854, + "epoch": 3.450134770889488, + "grad_norm": 0.5447757244110107, + "learning_rate": 0.000393487317862925, + "loss": 3.4613, "step": 32000 }, { - "epoch": 3.4441933053492626, - "eval_accuracy": 0.3729826554135595, - "eval_loss": 3.477412700653076, - "eval_runtime": 184.1469, - "eval_samples_per_second": 97.808, - "eval_steps_per_second": 6.115, + "epoch": 3.450134770889488, + "eval_accuracy": 0.3732764528222099, + "eval_loss": 3.4755747318267822, + "eval_runtime": 183.5947, + "eval_samples_per_second": 98.102, + "eval_steps_per_second": 6.133, "step": 32000 }, { - "epoch": 3.449574857388871, - "grad_norm": 0.5666436553001404, - "learning_rate": 0.0003935200948173688, - "loss": 3.4641, + "epoch": 3.4555256064690028, + "grad_norm": 0.634827733039856, + "learning_rate": 0.0003931635186184565, + "loss": 3.4625, "step": 32050 }, { - "epoch": 3.4549564094284793, - "grad_norm": 0.6132357120513916, - "learning_rate": 0.0003931968537873074, - "loss": 3.4594, + "epoch": 3.4609164420485174, + "grad_norm": 0.5631187558174133, + "learning_rate": 0.0003928397193739881, + "loss": 3.4694, "step": 32100 }, { - "epoch": 3.4603379614680874, - "grad_norm": 0.6949991583824158, - "learning_rate": 0.00039287361275724595, - "loss": 3.4698, + "epoch": 3.466307277628032, + "grad_norm": 0.6147528290748596, + "learning_rate": 0.00039251592012951963, + "loss": 3.4653, "step": 32150 }, { - "epoch": 3.4657195135076955, - "grad_norm": 0.5902615785598755, - "learning_rate": 0.00039255037172718454, - "loss": 3.4537, + "epoch": 3.4716981132075473, + "grad_norm": 0.5503348708152771, + "learning_rate": 0.00039219212088505124, + "loss": 3.4459, "step": 32200 }, { - "epoch": 3.471101065547304, - "grad_norm": 0.6425248980522156, - "learning_rate": 0.0003922271306971231, - "loss": 3.4657, + "epoch": 3.477088948787062, + "grad_norm": 0.5855104923248291, + "learning_rate": 0.0003918683216405828, + "loss": 3.46, "step": 32250 }, { - "epoch": 3.476482617586912, - "grad_norm": 0.6459230184555054, - "learning_rate": 0.0003919038896670617, - "loss": 3.4617, + "epoch": 3.4824797843665767, + "grad_norm": 0.5776777267456055, + "learning_rate": 0.0003915445223961144, + "loss": 3.4835, "step": 32300 }, { - "epoch": 3.4818641696265202, - "grad_norm": 0.8496313691139221, - "learning_rate": 0.00039158064863700033, - "loss": 3.4397, + "epoch": 3.487870619946092, + "grad_norm": 0.6286254525184631, + "learning_rate": 0.00039122072315164594, + "loss": 3.4538, "step": 32350 }, { - "epoch": 3.4872457216661283, - "grad_norm": 0.6361708641052246, - "learning_rate": 0.00039125740760693887, - "loss": 3.4406, + "epoch": 3.4932614555256065, + "grad_norm": 0.5974652767181396, + "learning_rate": 0.0003908969239071775, + "loss": 3.4655, "step": 32400 }, { - "epoch": 3.492627273705737, - "grad_norm": 0.6062164902687073, - "learning_rate": 0.00039093416657687746, - "loss": 3.4758, + "epoch": 3.498652291105121, + "grad_norm": 0.6419471502304077, + "learning_rate": 0.0003905731246627091, + "loss": 3.4534, "step": 32450 }, { - "epoch": 3.498008825745345, - "grad_norm": 0.6049349904060364, - "learning_rate": 0.00039061092554681606, - "loss": 3.4733, + "epoch": 3.5040431266846364, + "grad_norm": 0.5857613682746887, + "learning_rate": 0.00039024932541824065, + "loss": 3.4636, "step": 32500 }, { - "epoch": 3.503390377784953, - "grad_norm": 0.6517959833145142, - "learning_rate": 0.0003902876845167546, - "loss": 3.4654, + "epoch": 3.509433962264151, + "grad_norm": 0.591844916343689, + "learning_rate": 0.00038992552617377225, + "loss": 3.4575, "step": 32550 }, { - "epoch": 3.5087719298245617, - "grad_norm": 0.5791226625442505, - "learning_rate": 0.0003899644434866932, - "loss": 3.4739, + "epoch": 3.5148247978436657, + "grad_norm": 0.5561016201972961, + "learning_rate": 0.0003896017269293038, + "loss": 3.4629, "step": 32600 }, { - "epoch": 3.5141534818641698, - "grad_norm": 0.6079575419425964, - "learning_rate": 0.00038964120245663184, - "loss": 3.4733, + "epoch": 3.5202156334231804, + "grad_norm": 0.5965845584869385, + "learning_rate": 0.0003892779276848354, + "loss": 3.4548, "step": 32650 }, { - "epoch": 3.519535033903778, - "grad_norm": 0.5950577855110168, - "learning_rate": 0.0003893179614265704, - "loss": 3.4655, + "epoch": 3.525606469002695, + "grad_norm": 0.586627185344696, + "learning_rate": 0.0003889541284403669, + "loss": 3.4661, "step": 32700 }, { - "epoch": 3.524916585943386, - "grad_norm": 0.8927932381629944, - "learning_rate": 0.000388994720396509, - "loss": 3.4671, + "epoch": 3.5309973045822103, + "grad_norm": 0.658444344997406, + "learning_rate": 0.0003886303291958985, + "loss": 3.4555, "step": 32750 }, { - "epoch": 3.530298137982994, - "grad_norm": 0.5713189840316772, - "learning_rate": 0.0003886714793664475, - "loss": 3.4672, + "epoch": 3.536388140161725, + "grad_norm": 0.6075886487960815, + "learning_rate": 0.00038830652995143006, + "loss": 3.4609, "step": 32800 }, { - "epoch": 3.5356796900226026, - "grad_norm": 0.6249322891235352, - "learning_rate": 0.0003883482383363861, - "loss": 3.449, + "epoch": 3.5417789757412397, + "grad_norm": 0.6294785737991333, + "learning_rate": 0.0003879827307069616, + "loss": 3.4389, "step": 32850 }, { - "epoch": 3.5410612420622107, - "grad_norm": 0.635231077671051, - "learning_rate": 0.00038802499730632476, - "loss": 3.4726, + "epoch": 3.547169811320755, + "grad_norm": 0.621010422706604, + "learning_rate": 0.0003876589314624932, + "loss": 3.4549, "step": 32900 }, { - "epoch": 3.546442794101819, - "grad_norm": 0.6587965488433838, - "learning_rate": 0.0003877017562762633, - "loss": 3.4602, + "epoch": 3.5525606469002695, + "grad_norm": 0.5769633054733276, + "learning_rate": 0.00038733513221802477, + "loss": 3.4456, "step": 32950 }, { - "epoch": 3.5518243461414274, - "grad_norm": 0.6618245840072632, - "learning_rate": 0.0003873785152462019, - "loss": 3.4504, + "epoch": 3.557951482479784, + "grad_norm": 0.6038095951080322, + "learning_rate": 0.00038701133297355637, + "loss": 3.4479, "step": 33000 }, { - "epoch": 3.5518243461414274, - "eval_accuracy": 0.3737593062556574, - "eval_loss": 3.4713170528411865, - "eval_runtime": 184.6965, - "eval_samples_per_second": 97.517, - "eval_steps_per_second": 6.096, + "epoch": 3.557951482479784, + "eval_accuracy": 0.37366271383839034, + "eval_loss": 3.4711806774139404, + "eval_runtime": 183.6637, + "eval_samples_per_second": 98.065, + "eval_steps_per_second": 6.131, "step": 33000 }, { - "epoch": 3.5572058981810355, - "grad_norm": 0.5909431576728821, - "learning_rate": 0.0003870552742161405, - "loss": 3.4404, + "epoch": 3.5633423180592994, + "grad_norm": 0.6252723336219788, + "learning_rate": 0.0003866875337290879, + "loss": 3.4537, "step": 33050 }, { - "epoch": 3.5625874502206436, - "grad_norm": 0.6251736879348755, - "learning_rate": 0.00038673203318607903, - "loss": 3.4581, + "epoch": 3.568733153638814, + "grad_norm": 0.6400101780891418, + "learning_rate": 0.00038636373448461953, + "loss": 3.4645, "step": 33100 }, { - "epoch": 3.5679690022602517, - "grad_norm": 0.6399909853935242, - "learning_rate": 0.0003864087921560176, - "loss": 3.4428, + "epoch": 3.5741239892183287, + "grad_norm": 0.5554074645042419, + "learning_rate": 0.0003860399352401511, + "loss": 3.4393, "step": 33150 }, { - "epoch": 3.57335055429986, - "grad_norm": 0.6242807507514954, - "learning_rate": 0.0003860855511259563, - "loss": 3.4589, + "epoch": 3.579514824797844, + "grad_norm": 0.5511495471000671, + "learning_rate": 0.0003857161359956827, + "loss": 3.447, "step": 33200 }, { - "epoch": 3.5787321063394684, - "grad_norm": 0.6046002507209778, - "learning_rate": 0.0003857623100958948, - "loss": 3.4656, + "epoch": 3.5849056603773586, + "grad_norm": 0.6984949111938477, + "learning_rate": 0.00038539233675121423, + "loss": 3.4765, "step": 33250 }, { - "epoch": 3.5841136583790765, - "grad_norm": 0.6714318990707397, - "learning_rate": 0.0003854390690658334, - "loss": 3.4618, + "epoch": 3.5902964959568733, + "grad_norm": 0.667104184627533, + "learning_rate": 0.0003850750134916352, + "loss": 3.4662, "step": 33300 }, { - "epoch": 3.5894952104186846, - "grad_norm": 0.5967142581939697, - "learning_rate": 0.00038511582803577195, - "loss": 3.4555, + "epoch": 3.595687331536388, + "grad_norm": 0.5665805339813232, + "learning_rate": 0.0003847512142471667, + "loss": 3.4545, "step": 33350 }, { - "epoch": 3.594876762458293, - "grad_norm": 0.6154034733772278, - "learning_rate": 0.00038479258700571054, - "loss": 3.4456, + "epoch": 3.601078167115903, + "grad_norm": 0.6430594325065613, + "learning_rate": 0.0003844274150026983, + "loss": 3.4523, "step": 33400 }, { - "epoch": 3.6002583144979012, - "grad_norm": 0.5872713923454285, - "learning_rate": 0.00038446934597564914, - "loss": 3.4508, + "epoch": 3.606469002695418, + "grad_norm": 0.5975932478904724, + "learning_rate": 0.00038410361575822984, + "loss": 3.4644, "step": 33450 }, { - "epoch": 3.6056398665375093, - "grad_norm": 0.6076340079307556, - "learning_rate": 0.00038414610494558773, - "loss": 3.456, + "epoch": 3.6118598382749325, + "grad_norm": 0.6220806837081909, + "learning_rate": 0.00038377981651376144, + "loss": 3.4455, "step": 33500 }, { - "epoch": 3.611021418577118, - "grad_norm": 0.5723451972007751, - "learning_rate": 0.00038382286391552633, - "loss": 3.4651, + "epoch": 3.617250673854447, + "grad_norm": 0.6150628924369812, + "learning_rate": 0.000383456017269293, + "loss": 3.4532, "step": 33550 }, { - "epoch": 3.616402970616726, - "grad_norm": 0.5870938301086426, - "learning_rate": 0.0003834996228854649, - "loss": 3.4636, + "epoch": 3.6226415094339623, + "grad_norm": 0.638692319393158, + "learning_rate": 0.00038313221802482454, + "loss": 3.4592, "step": 33600 }, { - "epoch": 3.621784522656334, - "grad_norm": 0.6072779893875122, - "learning_rate": 0.00038317638185540346, - "loss": 3.4644, + "epoch": 3.628032345013477, + "grad_norm": 0.6110245585441589, + "learning_rate": 0.00038280841878035615, + "loss": 3.4702, "step": 33650 }, { - "epoch": 3.627166074695942, - "grad_norm": 0.6000068187713623, - "learning_rate": 0.00038285960564594327, - "loss": 3.4671, + "epoch": 3.6334231805929917, + "grad_norm": 0.5673446655273438, + "learning_rate": 0.0003824846195358877, + "loss": 3.4648, "step": 33700 }, { - "epoch": 3.6325476267355503, - "grad_norm": 0.6160212159156799, - "learning_rate": 0.00038253636461588186, - "loss": 3.463, + "epoch": 3.638814016172507, + "grad_norm": 0.6043810844421387, + "learning_rate": 0.0003821608202914193, + "loss": 3.4476, "step": 33750 }, { - "epoch": 3.637929178775159, - "grad_norm": 0.7063570618629456, - "learning_rate": 0.00038221312358582046, - "loss": 3.4378, + "epoch": 3.6442048517520216, + "grad_norm": 0.5578144788742065, + "learning_rate": 0.00038183702104695085, + "loss": 3.4637, "step": 33800 }, { - "epoch": 3.643310730814767, - "grad_norm": 0.6322859525680542, - "learning_rate": 0.00038188988255575905, - "loss": 3.4596, + "epoch": 3.6495956873315363, + "grad_norm": 0.6210846304893494, + "learning_rate": 0.00038151322180248246, + "loss": 3.4607, "step": 33850 }, { - "epoch": 3.648692282854375, - "grad_norm": 0.6268704533576965, - "learning_rate": 0.00038156664152569765, - "loss": 3.4784, + "epoch": 3.6549865229110514, + "grad_norm": 0.6444474458694458, + "learning_rate": 0.000381189422558014, + "loss": 3.454, "step": 33900 }, { - "epoch": 3.6540738348939836, - "grad_norm": 0.6058254837989807, - "learning_rate": 0.00038124340049563624, - "loss": 3.455, + "epoch": 3.660377358490566, + "grad_norm": 0.6198441982269287, + "learning_rate": 0.0003808656233135456, + "loss": 3.4499, "step": 33950 }, { - "epoch": 3.6594553869335917, - "grad_norm": 0.6425801515579224, - "learning_rate": 0.0003809201594655748, - "loss": 3.449, + "epoch": 3.665768194070081, + "grad_norm": 0.6694725155830383, + "learning_rate": 0.00038054182406907716, + "loss": 3.464, "step": 34000 }, { - "epoch": 3.6594553869335917, - "eval_accuracy": 0.3744735903401498, - "eval_loss": 3.4654529094696045, - "eval_runtime": 184.3767, - "eval_samples_per_second": 97.686, - "eval_steps_per_second": 6.107, + "epoch": 3.665768194070081, + "eval_accuracy": 0.374257371093399, + "eval_loss": 3.4641661643981934, + "eval_runtime": 183.6771, + "eval_samples_per_second": 98.058, + "eval_steps_per_second": 6.13, "step": 34000 }, { - "epoch": 3.6648369389732, - "grad_norm": 0.6552558541297913, - "learning_rate": 0.0003805969184355134, - "loss": 3.4635, + "epoch": 3.671159029649596, + "grad_norm": 0.6388247609138489, + "learning_rate": 0.00038021802482460866, + "loss": 3.4471, "step": 34050 }, { - "epoch": 3.670218491012808, - "grad_norm": 0.6297048926353455, - "learning_rate": 0.0003802736774054519, - "loss": 3.4479, + "epoch": 3.6765498652291106, + "grad_norm": 0.5763834714889526, + "learning_rate": 0.00037989422558014027, + "loss": 3.4612, "step": 34100 }, { - "epoch": 3.675600043052416, - "grad_norm": 0.6090021729469299, - "learning_rate": 0.00037995043637539057, - "loss": 3.4711, + "epoch": 3.6819407008086253, + "grad_norm": 0.6030428409576416, + "learning_rate": 0.0003795704263356718, + "loss": 3.4607, "step": 34150 }, { - "epoch": 3.6809815950920246, - "grad_norm": 0.6067349314689636, - "learning_rate": 0.00037962719534532916, - "loss": 3.4607, + "epoch": 3.68733153638814, + "grad_norm": 0.6248852610588074, + "learning_rate": 0.0003792466270912034, + "loss": 3.4441, "step": 34200 }, { - "epoch": 3.6863631471316327, - "grad_norm": 0.6618533730506897, - "learning_rate": 0.0003793039543152677, - "loss": 3.4597, + "epoch": 3.6927223719676547, + "grad_norm": 0.6712538003921509, + "learning_rate": 0.00037892282784673497, + "loss": 3.4588, "step": 34250 }, { - "epoch": 3.691744699171241, - "grad_norm": 0.5938194990158081, - "learning_rate": 0.0003789807132852063, - "loss": 3.4435, + "epoch": 3.69811320754717, + "grad_norm": 0.6329993605613708, + "learning_rate": 0.0003785990286022666, + "loss": 3.4463, "step": 34300 }, { - "epoch": 3.6971262512108494, - "grad_norm": 0.6445248126983643, - "learning_rate": 0.0003786574722551449, - "loss": 3.4287, + "epoch": 3.7035040431266846, + "grad_norm": 0.5744032859802246, + "learning_rate": 0.00037827522935779813, + "loss": 3.4697, "step": 34350 }, { - "epoch": 3.7025078032504575, - "grad_norm": 0.6069079041481018, - "learning_rate": 0.00037833423122508343, - "loss": 3.4655, + "epoch": 3.7088948787061993, + "grad_norm": 0.609049916267395, + "learning_rate": 0.00037795143011332973, + "loss": 3.4542, "step": 34400 }, { - "epoch": 3.7078893552900656, - "grad_norm": 0.6393512487411499, - "learning_rate": 0.0003780109901950221, - "loss": 3.457, + "epoch": 3.7142857142857144, + "grad_norm": 0.6319896578788757, + "learning_rate": 0.0003776276308688613, + "loss": 3.4407, "step": 34450 }, { - "epoch": 3.713270907329674, - "grad_norm": 0.5912173390388489, - "learning_rate": 0.0003776877491649607, - "loss": 3.4475, + "epoch": 3.719676549865229, + "grad_norm": 0.7728492617607117, + "learning_rate": 0.00037730383162439283, + "loss": 3.4583, "step": 34500 }, { - "epoch": 3.7186524593692822, - "grad_norm": 0.6467220187187195, - "learning_rate": 0.0003773645081348992, - "loss": 3.4497, + "epoch": 3.725067385444744, + "grad_norm": 0.6464096903800964, + "learning_rate": 0.00037698003237992444, + "loss": 3.4606, "step": 34550 }, { - "epoch": 3.7240340114088903, - "grad_norm": 0.6172649264335632, - "learning_rate": 0.0003770412671048378, - "loss": 3.4442, + "epoch": 3.730458221024259, + "grad_norm": 0.6643472909927368, + "learning_rate": 0.000376656233135456, + "loss": 3.4623, "step": 34600 }, { - "epoch": 3.7294155634484984, - "grad_norm": 0.7250891327857971, - "learning_rate": 0.00037671802607477635, - "loss": 3.4512, + "epoch": 3.7358490566037736, + "grad_norm": 0.5791902542114258, + "learning_rate": 0.0003763324338909876, + "loss": 3.4528, "step": 34650 }, { - "epoch": 3.7347971154881066, - "grad_norm": 0.6259332299232483, - "learning_rate": 0.000376394785044715, - "loss": 3.4547, + "epoch": 3.7412398921832883, + "grad_norm": 0.5891251564025879, + "learning_rate": 0.0003760086346465191, + "loss": 3.4573, "step": 34700 }, { - "epoch": 3.740178667527715, - "grad_norm": 0.6603428721427917, - "learning_rate": 0.0003760715440146536, - "loss": 3.4524, + "epoch": 3.7466307277628035, + "grad_norm": 0.6279162764549255, + "learning_rate": 0.0003756848354020507, + "loss": 3.4399, "step": 34750 }, { - "epoch": 3.745560219567323, - "grad_norm": 0.6384559273719788, - "learning_rate": 0.00037574830298459214, - "loss": 3.4597, + "epoch": 3.752021563342318, + "grad_norm": 0.6248940825462341, + "learning_rate": 0.00037536103615758224, + "loss": 3.474, "step": 34800 }, { - "epoch": 3.7509417716069313, - "grad_norm": 0.5936117172241211, - "learning_rate": 0.00037542506195453073, - "loss": 3.4618, + "epoch": 3.757412398921833, + "grad_norm": 0.6553635597229004, + "learning_rate": 0.0003750372369131138, + "loss": 3.46, "step": 34850 }, { - "epoch": 3.75632332364654, - "grad_norm": 0.6444550156593323, - "learning_rate": 0.0003751018209244693, - "loss": 3.4451, + "epoch": 3.7628032345013476, + "grad_norm": 0.6073916554450989, + "learning_rate": 0.0003747134376686454, + "loss": 3.459, "step": 34900 }, { - "epoch": 3.761704875686148, - "grad_norm": 0.6175785660743713, - "learning_rate": 0.00037477857989440787, - "loss": 3.4498, + "epoch": 3.7681940700808623, + "grad_norm": 0.580033004283905, + "learning_rate": 0.00037438963842417695, + "loss": 3.4599, "step": 34950 }, { - "epoch": 3.767086427725756, - "grad_norm": 0.6283439993858337, - "learning_rate": 0.0003744553388643465, - "loss": 3.4402, + "epoch": 3.7735849056603774, + "grad_norm": 0.6097987294197083, + "learning_rate": 0.00037406583917970856, + "loss": 3.447, "step": 35000 }, { - "epoch": 3.767086427725756, - "eval_accuracy": 0.37506064189301147, - "eval_loss": 3.4579710960388184, - "eval_runtime": 184.4841, - "eval_samples_per_second": 97.629, - "eval_steps_per_second": 6.104, + "epoch": 3.7735849056603774, + "eval_accuracy": 0.3745048823718404, + "eval_loss": 3.4596734046936035, + "eval_runtime": 183.8415, + "eval_samples_per_second": 97.97, + "eval_steps_per_second": 6.125, "step": 35000 }, { - "epoch": 3.772467979765364, - "grad_norm": 0.6437238454818726, - "learning_rate": 0.0003741320978342851, - "loss": 3.45, + "epoch": 3.778975741239892, + "grad_norm": 0.6718741655349731, + "learning_rate": 0.0003737420399352401, + "loss": 3.4594, "step": 35050 }, { - "epoch": 3.7778495318049723, - "grad_norm": 0.6655225157737732, - "learning_rate": 0.00037380885680422365, - "loss": 3.4558, + "epoch": 3.784366576819407, + "grad_norm": 0.5822587013244629, + "learning_rate": 0.0003734182406907717, + "loss": 3.4532, "step": 35100 }, { - "epoch": 3.783231083844581, - "grad_norm": 0.6094281673431396, - "learning_rate": 0.00037348561577416224, - "loss": 3.451, + "epoch": 3.789757412398922, + "grad_norm": 0.5875661373138428, + "learning_rate": 0.00037309444144630326, + "loss": 3.4483, "step": 35150 }, { - "epoch": 3.788612635884189, - "grad_norm": 0.6181848645210266, - "learning_rate": 0.0003731623747441008, - "loss": 3.443, + "epoch": 3.7951482479784366, + "grad_norm": 0.6691891551017761, + "learning_rate": 0.00037277064220183487, + "loss": 3.4729, "step": 35200 }, { - "epoch": 3.793994187923797, - "grad_norm": 0.6608075499534607, - "learning_rate": 0.0003728391337140394, - "loss": 3.4426, + "epoch": 3.8005390835579513, + "grad_norm": 0.6240682005882263, + "learning_rate": 0.0003724468429573664, + "loss": 3.4471, "step": 35250 }, { - "epoch": 3.7993757399634056, - "grad_norm": 0.6112875938415527, - "learning_rate": 0.00037251589268397803, - "loss": 3.4448, + "epoch": 3.8059299191374665, + "grad_norm": 0.6282724142074585, + "learning_rate": 0.0003721230437128979, + "loss": 3.4403, "step": 35300 }, { - "epoch": 3.8047572920030137, - "grad_norm": 0.650355339050293, - "learning_rate": 0.00037219265165391657, - "loss": 3.4504, + "epoch": 3.811320754716981, + "grad_norm": 0.6456165909767151, + "learning_rate": 0.00037180572045331887, + "loss": 3.4422, "step": 35350 }, { - "epoch": 3.810138844042622, - "grad_norm": 0.6517062187194824, - "learning_rate": 0.00037186941062385516, - "loss": 3.4615, + "epoch": 3.816711590296496, + "grad_norm": 0.6330288648605347, + "learning_rate": 0.00037148192120885047, + "loss": 3.4571, "step": 35400 }, { - "epoch": 3.8155203960822304, - "grad_norm": 0.7008894085884094, - "learning_rate": 0.0003715461695937937, - "loss": 3.4503, + "epoch": 3.822102425876011, + "grad_norm": 0.5821753740310669, + "learning_rate": 0.000371158121964382, + "loss": 3.4666, "step": 35450 }, { - "epoch": 3.8209019481218385, - "grad_norm": 0.6126015186309814, - "learning_rate": 0.0003712229285637323, - "loss": 3.4488, + "epoch": 3.8274932614555257, + "grad_norm": 0.6028748750686646, + "learning_rate": 0.0003708343227199136, + "loss": 3.4447, "step": 35500 }, { - "epoch": 3.8262835001614466, - "grad_norm": 0.6550661325454712, - "learning_rate": 0.0003708996875336709, - "loss": 3.4358, + "epoch": 3.8328840970350404, + "grad_norm": 0.5814579129219055, + "learning_rate": 0.0003705105234754452, + "loss": 3.4496, "step": 35550 }, { - "epoch": 3.8316650522010547, - "grad_norm": 0.6141082644462585, - "learning_rate": 0.0003705764465036095, - "loss": 3.4613, + "epoch": 3.838274932614555, + "grad_norm": 0.6393029093742371, + "learning_rate": 0.0003701867242309768, + "loss": 3.4504, "step": 35600 }, { - "epoch": 3.837046604240663, - "grad_norm": 0.6304882764816284, - "learning_rate": 0.0003702532054735481, - "loss": 3.4615, + "epoch": 3.8436657681940702, + "grad_norm": 0.6390960216522217, + "learning_rate": 0.00036986292498650833, + "loss": 3.4478, "step": 35650 }, { - "epoch": 3.8424281562802713, - "grad_norm": 0.6499120593070984, - "learning_rate": 0.0003699299644434867, - "loss": 3.4667, + "epoch": 3.849056603773585, + "grad_norm": 0.5880372524261475, + "learning_rate": 0.0003695391257420399, + "loss": 3.4499, "step": 35700 }, { - "epoch": 3.8478097083198795, - "grad_norm": 0.6515492796897888, - "learning_rate": 0.0003696067234134252, - "loss": 3.4591, + "epoch": 3.8544474393530996, + "grad_norm": 0.5923008918762207, + "learning_rate": 0.0003692153264975715, + "loss": 3.4565, "step": 35750 }, { - "epoch": 3.8531912603594876, - "grad_norm": 0.6420202255249023, - "learning_rate": 0.0003692834823833638, - "loss": 3.4544, + "epoch": 3.8598382749326143, + "grad_norm": 0.585785984992981, + "learning_rate": 0.00036889152725310304, + "loss": 3.4454, "step": 35800 }, { - "epoch": 3.858572812399096, - "grad_norm": 0.5830044150352478, - "learning_rate": 0.00036896024135330246, - "loss": 3.4592, + "epoch": 3.8652291105121295, + "grad_norm": 0.600885808467865, + "learning_rate": 0.00036856772800863464, + "loss": 3.4475, "step": 35850 }, { - "epoch": 3.863954364438704, - "grad_norm": 0.61103355884552, - "learning_rate": 0.000368637000323241, - "loss": 3.4685, + "epoch": 3.870619946091644, + "grad_norm": 0.6213895678520203, + "learning_rate": 0.0003682439287641662, + "loss": 3.4587, "step": 35900 }, { - "epoch": 3.8693359164783123, - "grad_norm": 0.7067725658416748, - "learning_rate": 0.0003683137592931796, - "loss": 3.469, + "epoch": 3.876010781671159, + "grad_norm": 0.5899269580841064, + "learning_rate": 0.0003679201295196978, + "loss": 3.455, "step": 35950 }, { - "epoch": 3.8747174685179204, - "grad_norm": 0.5974989533424377, - "learning_rate": 0.00036799051826311814, - "loss": 3.4581, + "epoch": 3.881401617250674, + "grad_norm": 0.6595240831375122, + "learning_rate": 0.00036759633027522935, + "loss": 3.453, "step": 36000 }, { - "epoch": 3.8747174685179204, - "eval_accuracy": 0.37549394960961563, - "eval_loss": 3.4535152912139893, - "eval_runtime": 184.307, - "eval_samples_per_second": 97.723, - "eval_steps_per_second": 6.109, + "epoch": 3.881401617250674, + "eval_accuracy": 0.37545374804112425, + "eval_loss": 3.4512643814086914, + "eval_runtime": 183.5211, + "eval_samples_per_second": 98.141, + "eval_steps_per_second": 6.136, "step": 36000 }, { - "epoch": 3.8800990205575285, - "grad_norm": 0.568252682685852, - "learning_rate": 0.00036766727723305673, - "loss": 3.4568, + "epoch": 3.8867924528301887, + "grad_norm": 0.6180174350738525, + "learning_rate": 0.00036727253103076084, + "loss": 3.4486, "step": 36050 }, { - "epoch": 3.885480572597137, - "grad_norm": 0.6623275279998779, - "learning_rate": 0.0003673440362029953, - "loss": 3.4443, + "epoch": 3.8921832884097034, + "grad_norm": 0.5822926163673401, + "learning_rate": 0.00036694873178629245, + "loss": 3.4582, "step": 36100 }, { - "epoch": 3.890862124636745, - "grad_norm": 0.6388083696365356, - "learning_rate": 0.0003670207951729339, - "loss": 3.456, + "epoch": 3.8975741239892185, + "grad_norm": 0.6376808285713196, + "learning_rate": 0.000366624932541824, + "loss": 3.435, "step": 36150 }, { - "epoch": 3.8962436766763533, - "grad_norm": 0.634536862373352, - "learning_rate": 0.0003666975541428725, - "loss": 3.4427, + "epoch": 3.9029649595687332, + "grad_norm": 0.6071598529815674, + "learning_rate": 0.0003663011332973556, + "loss": 3.4454, "step": 36200 }, { - "epoch": 3.901625228715962, - "grad_norm": 0.5885773301124573, - "learning_rate": 0.0003663743131128111, - "loss": 3.4392, + "epoch": 3.908355795148248, + "grad_norm": 0.5952266454696655, + "learning_rate": 0.00036597733405288715, + "loss": 3.4652, "step": 36250 }, { - "epoch": 3.90700678075557, - "grad_norm": 0.7272632718086243, - "learning_rate": 0.00036605107208274965, - "loss": 3.4446, + "epoch": 3.913746630727763, + "grad_norm": 0.6375381946563721, + "learning_rate": 0.00036565353480841876, + "loss": 3.4503, "step": 36300 }, { - "epoch": 3.912388332795178, - "grad_norm": 0.590799868106842, - "learning_rate": 0.00036572783105268824, - "loss": 3.4486, + "epoch": 3.9191374663072778, + "grad_norm": 0.6298218965530396, + "learning_rate": 0.0003653297355639503, + "loss": 3.4557, "step": 36350 }, { - "epoch": 3.9177698848347866, - "grad_norm": 0.6622004508972168, - "learning_rate": 0.0003654045900226268, - "loss": 3.4538, + "epoch": 3.9245283018867925, + "grad_norm": 0.6288528442382812, + "learning_rate": 0.0003650059363194819, + "loss": 3.445, "step": 36400 }, { - "epoch": 3.9231514368743947, - "grad_norm": 0.6048895120620728, - "learning_rate": 0.00036508134899256543, - "loss": 3.4494, + "epoch": 3.929919137466307, + "grad_norm": 0.5978701710700989, + "learning_rate": 0.00036468213707501347, + "loss": 3.4672, "step": 36450 }, { - "epoch": 3.928532988914003, - "grad_norm": 0.6244843602180481, - "learning_rate": 0.00036475810796250403, - "loss": 3.4429, + "epoch": 3.935309973045822, + "grad_norm": 0.6203653812408447, + "learning_rate": 0.000364358337830545, + "loss": 3.4593, "step": 36500 }, { - "epoch": 3.933914540953611, - "grad_norm": 0.7105559706687927, - "learning_rate": 0.00036443486693244257, - "loss": 3.4539, + "epoch": 3.940700808625337, + "grad_norm": 0.5815897583961487, + "learning_rate": 0.0003640345385860766, + "loss": 3.4576, "step": 36550 }, { - "epoch": 3.939296092993219, - "grad_norm": 0.5961714386940002, - "learning_rate": 0.00036411162590238116, - "loss": 3.4572, + "epoch": 3.9460916442048517, + "grad_norm": 0.5865249633789062, + "learning_rate": 0.00036371073934160817, + "loss": 3.4599, "step": 36600 }, { - "epoch": 3.9446776450328276, - "grad_norm": 0.6044503450393677, - "learning_rate": 0.00036378838487231976, - "loss": 3.4577, + "epoch": 3.9514824797843664, + "grad_norm": 0.6294233202934265, + "learning_rate": 0.0003633869400971398, + "loss": 3.4563, "step": 36650 }, { - "epoch": 3.9500591970724357, - "grad_norm": 0.6206450462341309, - "learning_rate": 0.00036346514384225835, - "loss": 3.4502, + "epoch": 3.9568733153638815, + "grad_norm": 0.7033206224441528, + "learning_rate": 0.00036306314085267127, + "loss": 3.4494, "step": 36700 }, { - "epoch": 3.955440749112044, - "grad_norm": 0.6197695732116699, - "learning_rate": 0.00036314190281219695, - "loss": 3.4515, + "epoch": 3.9622641509433962, + "grad_norm": 0.6106424331665039, + "learning_rate": 0.00036273934160820293, + "loss": 3.482, "step": 36750 }, { - "epoch": 3.9608223011516523, - "grad_norm": 0.6754399538040161, - "learning_rate": 0.00036281866178213554, - "loss": 3.4478, + "epoch": 3.967654986522911, + "grad_norm": 0.6779366731643677, + "learning_rate": 0.00036241554236373443, + "loss": 3.4663, "step": 36800 }, { - "epoch": 3.9662038531912605, - "grad_norm": 0.6515491604804993, - "learning_rate": 0.0003624954207520741, - "loss": 3.4414, + "epoch": 3.973045822102426, + "grad_norm": 0.6342257261276245, + "learning_rate": 0.00036209174311926603, + "loss": 3.4406, "step": 36850 }, { - "epoch": 3.9715854052308686, - "grad_norm": 0.7095997929573059, - "learning_rate": 0.0003621721797220127, - "loss": 3.4407, + "epoch": 3.9784366576819408, + "grad_norm": 0.6370275020599365, + "learning_rate": 0.0003617679438747976, + "loss": 3.4522, "step": 36900 }, { - "epoch": 3.9769669572704767, - "grad_norm": 0.6265909671783447, - "learning_rate": 0.0003618489386919512, - "loss": 3.4681, + "epoch": 3.9838274932614555, + "grad_norm": 0.6269297003746033, + "learning_rate": 0.00036144414463032913, + "loss": 3.4313, "step": 36950 }, { - "epoch": 3.9823485093100848, - "grad_norm": 0.6624922156333923, - "learning_rate": 0.00036152569766188987, - "loss": 3.4399, + "epoch": 3.9892183288409706, + "grad_norm": 0.6149274110794067, + "learning_rate": 0.00036112034538586074, + "loss": 3.4401, "step": 37000 }, { - "epoch": 3.9823485093100848, - "eval_accuracy": 0.3764659583440039, - "eval_loss": 3.4450950622558594, - "eval_runtime": 184.7248, - "eval_samples_per_second": 97.502, - "eval_steps_per_second": 6.096, + "epoch": 3.9892183288409706, + "eval_accuracy": 0.3759343197639277, + "eval_loss": 3.4460744857788086, + "eval_runtime": 183.9017, + "eval_samples_per_second": 97.938, + "eval_steps_per_second": 6.123, "step": 37000 }, { - "epoch": 3.9877300613496933, - "grad_norm": 0.675195574760437, - "learning_rate": 0.00036120245663182846, - "loss": 3.4563, + "epoch": 3.9946091644204853, + "grad_norm": 0.6133876442909241, + "learning_rate": 0.0003607965461413923, + "loss": 3.4475, "step": 37050 }, { - "epoch": 3.9931116133893014, - "grad_norm": 0.6407047510147095, - "learning_rate": 0.000360879215601767, - "loss": 3.4663, + "epoch": 4.0, + "grad_norm": 1.1652570962905884, + "learning_rate": 0.0003604727468969239, + "loss": 3.4314, "step": 37100 }, { - "epoch": 3.9984931654289095, - "grad_norm": 0.6153064966201782, - "learning_rate": 0.0003605559745717056, - "loss": 3.457, + "epoch": 4.005390835579515, + "grad_norm": 0.6168572306632996, + "learning_rate": 0.00036014894765245544, + "loss": 3.3569, "step": 37150 }, { - "epoch": 4.003874717468518, - "grad_norm": 0.6592976450920105, - "learning_rate": 0.0003602327335416442, - "loss": 3.3949, + "epoch": 4.010781671159029, + "grad_norm": 0.6085106134414673, + "learning_rate": 0.00035982514840798705, + "loss": 3.3585, "step": 37200 }, { - "epoch": 4.009256269508126, - "grad_norm": 0.6417883634567261, - "learning_rate": 0.00035990949251158273, - "loss": 3.3379, + "epoch": 4.0161725067385445, + "grad_norm": 0.5930397510528564, + "learning_rate": 0.00035950782514840795, + "loss": 3.3546, "step": 37250 }, { - "epoch": 4.014637821547734, - "grad_norm": 0.6505351066589355, - "learning_rate": 0.0003595862514815214, - "loss": 3.3506, + "epoch": 4.02156334231806, + "grad_norm": 0.6373165249824524, + "learning_rate": 0.00035918402590393955, + "loss": 3.3747, "step": 37300 }, { - "epoch": 4.020019373587343, - "grad_norm": 0.622450053691864, - "learning_rate": 0.00035926301045146, - "loss": 3.354, + "epoch": 4.026954177897574, + "grad_norm": 0.5819648504257202, + "learning_rate": 0.00035886022665947105, + "loss": 3.3607, "step": 37350 }, { - "epoch": 4.0254009256269505, - "grad_norm": 0.6620802283287048, - "learning_rate": 0.0003589397694213985, - "loss": 3.3948, + "epoch": 4.032345013477089, + "grad_norm": 0.596740186214447, + "learning_rate": 0.0003585364274150027, + "loss": 3.3562, "step": 37400 }, { - "epoch": 4.030782477666559, - "grad_norm": 0.674140214920044, - "learning_rate": 0.0003586165283913371, - "loss": 3.374, + "epoch": 4.037735849056604, + "grad_norm": 0.6327202916145325, + "learning_rate": 0.0003582126281705342, + "loss": 3.3695, "step": 37450 }, { - "epoch": 4.036164029706168, - "grad_norm": 0.6449016332626343, - "learning_rate": 0.00035829328736127565, - "loss": 3.3668, + "epoch": 4.0431266846361185, + "grad_norm": 0.6567413210868835, + "learning_rate": 0.0003578888289260658, + "loss": 3.3703, "step": 37500 }, { - "epoch": 4.041545581745775, - "grad_norm": 0.6092488169670105, - "learning_rate": 0.00035797004633121425, - "loss": 3.3746, + "epoch": 4.048517520215634, + "grad_norm": 0.6062192916870117, + "learning_rate": 0.00035756502968159736, + "loss": 3.368, "step": 37550 }, { - "epoch": 4.046927133785384, - "grad_norm": 0.655305027961731, - "learning_rate": 0.0003576468053011529, - "loss": 3.352, + "epoch": 4.053908355795148, + "grad_norm": 0.668158769607544, + "learning_rate": 0.00035724123043712896, + "loss": 3.356, "step": 37600 }, { - "epoch": 4.0523086858249915, - "grad_norm": 0.650629460811615, - "learning_rate": 0.00035732356427109143, - "loss": 3.3727, + "epoch": 4.059299191374663, + "grad_norm": 0.5983052849769592, + "learning_rate": 0.0003569174311926605, + "loss": 3.3772, "step": 37650 }, { - "epoch": 4.0576902378646, - "grad_norm": 0.6936136484146118, - "learning_rate": 0.00035700032324103003, - "loss": 3.3571, + "epoch": 4.064690026954178, + "grad_norm": 0.5878796577453613, + "learning_rate": 0.00035659363194819206, + "loss": 3.3821, "step": 37700 }, { - "epoch": 4.063071789904209, - "grad_norm": 0.6292705535888672, - "learning_rate": 0.00035668354703156984, - "loss": 3.3797, + "epoch": 4.070080862533692, + "grad_norm": 0.6337990760803223, + "learning_rate": 0.00035626983270372367, + "loss": 3.3793, "step": 37750 }, { - "epoch": 4.068453341943816, - "grad_norm": 0.6342557072639465, - "learning_rate": 0.00035636030600150843, - "loss": 3.3791, + "epoch": 4.0754716981132075, + "grad_norm": 0.594143271446228, + "learning_rate": 0.0003559460334592552, + "loss": 3.3521, "step": 37800 }, { - "epoch": 4.073834893983425, - "grad_norm": 0.6861465573310852, - "learning_rate": 0.00035603706497144697, - "loss": 3.3849, + "epoch": 4.080862533692723, + "grad_norm": 0.6610286831855774, + "learning_rate": 0.0003556222342147868, + "loss": 3.3667, "step": 37850 }, { - "epoch": 4.079216446023033, - "grad_norm": 0.6176989674568176, - "learning_rate": 0.00035571382394138557, - "loss": 3.3603, + "epoch": 4.086253369272237, + "grad_norm": 0.6618052124977112, + "learning_rate": 0.0003552984349703184, + "loss": 3.3683, "step": 37900 }, { - "epoch": 4.084597998062641, - "grad_norm": 0.6110954880714417, - "learning_rate": 0.0003553905829113242, - "loss": 3.3471, + "epoch": 4.091644204851752, + "grad_norm": 0.6083658933639526, + "learning_rate": 0.00035497463572585, + "loss": 3.3693, "step": 37950 }, { - "epoch": 4.08997955010225, - "grad_norm": 0.6518476009368896, - "learning_rate": 0.00035506734188126275, - "loss": 3.3962, + "epoch": 4.097035040431267, + "grad_norm": 0.6676791906356812, + "learning_rate": 0.00035465083648138153, + "loss": 3.3826, "step": 38000 }, { - "epoch": 4.08997955010225, - "eval_accuracy": 0.37652365302743346, - "eval_loss": 3.4501006603240967, - "eval_runtime": 184.0449, - "eval_samples_per_second": 97.862, - "eval_steps_per_second": 6.118, + "epoch": 4.097035040431267, + "eval_accuracy": 0.3765557056293387, + "eval_loss": 3.4486758708953857, + "eval_runtime": 183.4657, + "eval_samples_per_second": 98.171, + "eval_steps_per_second": 6.137, "step": 38000 }, { - "epoch": 4.095361102141858, - "grad_norm": 0.7071831822395325, - "learning_rate": 0.00035474410085120135, - "loss": 3.3712, + "epoch": 4.1024258760107815, + "grad_norm": 0.6280378103256226, + "learning_rate": 0.00035432703723691314, + "loss": 3.3745, "step": 38050 }, { - "epoch": 4.100742654181466, - "grad_norm": 0.6040788292884827, - "learning_rate": 0.00035442085982113994, - "loss": 3.3925, + "epoch": 4.107816711590297, + "grad_norm": 0.639232873916626, + "learning_rate": 0.00035400323799244463, + "loss": 3.383, "step": 38100 }, { - "epoch": 4.106124206221074, - "grad_norm": 0.6902562975883484, - "learning_rate": 0.0003540976187910785, - "loss": 3.3507, + "epoch": 4.113207547169812, + "grad_norm": 0.6617724299430847, + "learning_rate": 0.0003536794387479762, + "loss": 3.3792, "step": 38150 }, { - "epoch": 4.111505758260682, - "grad_norm": 0.6432631015777588, - "learning_rate": 0.0003537743777610171, - "loss": 3.3778, + "epoch": 4.118598382749326, + "grad_norm": 0.5961510539054871, + "learning_rate": 0.0003533556395035078, + "loss": 3.3645, "step": 38200 }, { - "epoch": 4.1168873103002905, - "grad_norm": 0.6424291729927063, - "learning_rate": 0.00035345113673095573, - "loss": 3.3656, + "epoch": 4.123989218328841, + "grad_norm": 0.6423681378364563, + "learning_rate": 0.00035303184025903934, + "loss": 3.3716, "step": 38250 }, { - "epoch": 4.122268862339899, - "grad_norm": 0.6857510209083557, - "learning_rate": 0.00035312789570089427, - "loss": 3.3442, + "epoch": 4.129380053908355, + "grad_norm": 0.5851473808288574, + "learning_rate": 0.00035270804101457094, + "loss": 3.3768, "step": 38300 }, { - "epoch": 4.127650414379507, - "grad_norm": 0.6548146605491638, - "learning_rate": 0.00035280465467083286, - "loss": 3.3866, + "epoch": 4.1347708894878705, + "grad_norm": 0.6345853805541992, + "learning_rate": 0.0003523842417701025, + "loss": 3.3602, "step": 38350 }, { - "epoch": 4.133031966419115, - "grad_norm": 0.6027295589447021, - "learning_rate": 0.0003524814136407714, - "loss": 3.3592, + "epoch": 4.140161725067386, + "grad_norm": 0.6322952508926392, + "learning_rate": 0.0003520604425256341, + "loss": 3.3742, "step": 38400 }, { - "epoch": 4.138413518458724, - "grad_norm": 0.6444876194000244, - "learning_rate": 0.00035215817261071, - "loss": 3.3592, + "epoch": 4.1455525606469, + "grad_norm": 0.6358514428138733, + "learning_rate": 0.00035173664328116565, + "loss": 3.3609, "step": 38450 }, { - "epoch": 4.1437950704983315, - "grad_norm": 0.6224494576454163, - "learning_rate": 0.00035183493158064865, - "loss": 3.3752, + "epoch": 4.150943396226415, + "grad_norm": 0.5968437194824219, + "learning_rate": 0.00035141284403669725, + "loss": 3.3902, "step": 38500 }, { - "epoch": 4.14917662253794, - "grad_norm": 0.5903575420379639, - "learning_rate": 0.0003515116905505872, - "loss": 3.3655, + "epoch": 4.15633423180593, + "grad_norm": 0.6028518676757812, + "learning_rate": 0.0003510890447922288, + "loss": 3.3678, "step": 38550 }, { - "epoch": 4.154558174577549, - "grad_norm": 0.7473770976066589, - "learning_rate": 0.0003511884495205258, - "loss": 3.3896, + "epoch": 4.1617250673854445, + "grad_norm": 0.6527324914932251, + "learning_rate": 0.00035076524554776035, + "loss": 3.3751, "step": 38600 }, { - "epoch": 4.159939726617156, - "grad_norm": 0.6288912296295166, - "learning_rate": 0.0003508652084904644, - "loss": 3.3903, + "epoch": 4.16711590296496, + "grad_norm": 0.6300586462020874, + "learning_rate": 0.00035044144630329196, + "loss": 3.3759, "step": 38650 }, { - "epoch": 4.165321278656765, - "grad_norm": 0.6335214376449585, - "learning_rate": 0.0003505419674604029, - "loss": 3.3766, + "epoch": 4.172506738544475, + "grad_norm": 0.6409329771995544, + "learning_rate": 0.00035011764705882346, + "loss": 3.397, "step": 38700 }, { - "epoch": 4.1707028306963725, - "grad_norm": 0.6742566823959351, - "learning_rate": 0.0003502187264303415, - "loss": 3.3673, + "epoch": 4.177897574123989, + "grad_norm": 0.6493109464645386, + "learning_rate": 0.0003497938478143551, + "loss": 3.3677, "step": 38750 }, { - "epoch": 4.176084382735981, - "grad_norm": 0.6295595169067383, - "learning_rate": 0.00034989548540028016, - "loss": 3.384, + "epoch": 4.183288409703504, + "grad_norm": 0.6340633630752563, + "learning_rate": 0.0003494700485698866, + "loss": 3.3919, "step": 38800 }, { - "epoch": 4.18146593477559, - "grad_norm": 0.6516152024269104, - "learning_rate": 0.0003495787091908199, - "loss": 3.3958, + "epoch": 4.188679245283019, + "grad_norm": 0.6204349994659424, + "learning_rate": 0.0003491462493254182, + "loss": 3.3822, "step": 38850 }, { - "epoch": 4.186847486815197, - "grad_norm": 0.6820975542068481, - "learning_rate": 0.0003492554681607585, - "loss": 3.3867, + "epoch": 4.1940700808625335, + "grad_norm": 0.6226798892021179, + "learning_rate": 0.00034882245008094977, + "loss": 3.3894, "step": 38900 }, { - "epoch": 4.192229038854806, - "grad_norm": 0.650514543056488, - "learning_rate": 0.0003489322271306971, - "loss": 3.3893, + "epoch": 4.199460916442049, + "grad_norm": 0.6485829949378967, + "learning_rate": 0.0003484986508364813, + "loss": 3.385, "step": 38950 }, { - "epoch": 4.197610590894414, - "grad_norm": 0.6839084625244141, - "learning_rate": 0.0003486089861006357, - "loss": 3.3787, + "epoch": 4.204851752021563, + "grad_norm": 0.710810661315918, + "learning_rate": 0.0003481748515920129, + "loss": 3.3797, "step": 39000 }, { - "epoch": 4.197610590894414, - "eval_accuracy": 0.37696271934709175, - "eval_loss": 3.4459009170532227, - "eval_runtime": 184.0684, - "eval_samples_per_second": 97.849, - "eval_steps_per_second": 6.117, + "epoch": 4.204851752021563, + "eval_accuracy": 0.3766247002131009, + "eval_loss": 3.4453039169311523, + "eval_runtime": 183.5571, + "eval_samples_per_second": 98.122, + "eval_steps_per_second": 6.134, "step": 39000 }, { - "epoch": 4.202992142934022, - "grad_norm": 0.6607173085212708, - "learning_rate": 0.00034828574507057424, - "loss": 3.3867, + "epoch": 4.210242587601078, + "grad_norm": 0.6208673715591431, + "learning_rate": 0.00034785105234754447, + "loss": 3.3822, "step": 39050 }, { - "epoch": 4.208373694973631, - "grad_norm": 0.6593530774116516, - "learning_rate": 0.00034796250404051283, - "loss": 3.3728, + "epoch": 4.215633423180593, + "grad_norm": 0.6665465235710144, + "learning_rate": 0.0003475272531030761, + "loss": 3.385, "step": 39100 }, { - "epoch": 4.213755247013238, - "grad_norm": 0.6216228008270264, - "learning_rate": 0.00034763926301045137, - "loss": 3.3738, + "epoch": 4.2210242587601075, + "grad_norm": 0.6304270029067993, + "learning_rate": 0.00034720345385860763, + "loss": 3.3941, "step": 39150 }, { - "epoch": 4.219136799052847, - "grad_norm": 0.6555355191230774, - "learning_rate": 0.00034731602198039, - "loss": 3.3772, + "epoch": 4.226415094339623, + "grad_norm": 0.6752989888191223, + "learning_rate": 0.00034687965461413923, + "loss": 3.3906, "step": 39200 }, { - "epoch": 4.224518351092455, - "grad_norm": 0.6763815879821777, - "learning_rate": 0.0003469927809503286, - "loss": 3.3861, + "epoch": 4.231805929919138, + "grad_norm": 0.5948718786239624, + "learning_rate": 0.0003465558553696708, + "loss": 3.387, "step": 39250 }, { - "epoch": 4.229899903132063, - "grad_norm": 0.6993557810783386, - "learning_rate": 0.00034666953992026716, - "loss": 3.3736, + "epoch": 4.237196765498652, + "grad_norm": 0.6458513736724854, + "learning_rate": 0.0003462320561252024, + "loss": 3.3823, "step": 39300 }, { - "epoch": 4.2352814551716715, - "grad_norm": 0.6681185960769653, - "learning_rate": 0.00034634629889020575, - "loss": 3.3867, + "epoch": 4.242587601078167, + "grad_norm": 0.6506422758102417, + "learning_rate": 0.00034590825688073394, + "loss": 3.3977, "step": 39350 }, { - "epoch": 4.24066300721128, - "grad_norm": 0.673267126083374, - "learning_rate": 0.00034602305786014435, - "loss": 3.3843, + "epoch": 4.247978436657682, + "grad_norm": 0.6215380430221558, + "learning_rate": 0.00034558445763626543, + "loss": 3.3828, "step": 39400 }, { - "epoch": 4.246044559250888, - "grad_norm": 0.6191767454147339, - "learning_rate": 0.00034569981683008294, - "loss": 3.3884, + "epoch": 4.2533692722371965, + "grad_norm": 0.6403311491012573, + "learning_rate": 0.00034526065839179704, + "loss": 3.3864, "step": 39450 }, { - "epoch": 4.251426111290496, - "grad_norm": 0.6691135764122009, - "learning_rate": 0.00034537657580002154, - "loss": 3.3979, + "epoch": 4.258760107816712, + "grad_norm": 0.6085329651832581, + "learning_rate": 0.0003449368591473286, + "loss": 3.3863, "step": 39500 }, { - "epoch": 4.256807663330104, - "grad_norm": 0.6780727505683899, - "learning_rate": 0.00034505333476996013, - "loss": 3.3776, + "epoch": 4.264150943396227, + "grad_norm": 0.6336445212364197, + "learning_rate": 0.00034461953588774954, + "loss": 3.3869, "step": 39550 }, { - "epoch": 4.2621892153697125, - "grad_norm": 0.6386967897415161, - "learning_rate": 0.00034473009373989867, - "loss": 3.3767, + "epoch": 4.269541778975741, + "grad_norm": 0.6452109217643738, + "learning_rate": 0.00034429573664328115, + "loss": 3.3841, "step": 39600 }, { - "epoch": 4.267570767409321, - "grad_norm": 0.6862613558769226, - "learning_rate": 0.00034440685270983727, - "loss": 3.3865, + "epoch": 4.274932614555256, + "grad_norm": 0.6085174083709717, + "learning_rate": 0.0003439719373988127, + "loss": 3.3809, "step": 39650 }, { - "epoch": 4.272952319448929, - "grad_norm": 0.6203247904777527, - "learning_rate": 0.0003440836116797758, - "loss": 3.3937, + "epoch": 4.280323450134771, + "grad_norm": 0.6353808641433716, + "learning_rate": 0.00034364813815434425, + "loss": 3.3975, "step": 39700 }, { - "epoch": 4.278333871488537, - "grad_norm": 0.6543366312980652, - "learning_rate": 0.00034376037064971445, - "loss": 3.3948, + "epoch": 4.285714285714286, + "grad_norm": 0.6164100170135498, + "learning_rate": 0.00034332433890987585, + "loss": 3.3868, "step": 39750 }, { - "epoch": 4.283715423528146, - "grad_norm": 0.6546263098716736, - "learning_rate": 0.00034343712961965305, - "loss": 3.375, + "epoch": 4.291105121293801, + "grad_norm": 0.6289504170417786, + "learning_rate": 0.0003430005396654074, + "loss": 3.3913, "step": 39800 }, { - "epoch": 4.2890969755677535, - "grad_norm": 0.6535509824752808, - "learning_rate": 0.0003431138885895916, - "loss": 3.3772, + "epoch": 4.296495956873315, + "grad_norm": 0.6531320810317993, + "learning_rate": 0.000342676740420939, + "loss": 3.3607, "step": 39850 }, { - "epoch": 4.294478527607362, - "grad_norm": 0.6452800035476685, - "learning_rate": 0.0003427906475595302, - "loss": 3.3722, + "epoch": 4.30188679245283, + "grad_norm": 0.598456621170044, + "learning_rate": 0.00034235294117647056, + "loss": 3.4004, "step": 39900 }, { - "epoch": 4.299860079646971, - "grad_norm": 0.6130262613296509, - "learning_rate": 0.0003424674065294688, - "loss": 3.3759, + "epoch": 4.307277628032345, + "grad_norm": 0.6940332651138306, + "learning_rate": 0.00034202914193200216, + "loss": 3.3772, "step": 39950 }, { - "epoch": 4.305241631686578, - "grad_norm": 0.6329742670059204, - "learning_rate": 0.0003421441654994073, - "loss": 3.3976, + "epoch": 4.3126684636118595, + "grad_norm": 0.6354912519454956, + "learning_rate": 0.0003417053426875337, + "loss": 3.397, "step": 40000 }, { - "epoch": 4.305241631686578, - "eval_accuracy": 0.37801404468958466, - "eval_loss": 3.4401395320892334, - "eval_runtime": 184.1282, - "eval_samples_per_second": 97.818, - "eval_steps_per_second": 6.115, + "epoch": 4.3126684636118595, + "eval_accuracy": 0.3773185575546842, + "eval_loss": 3.440795660018921, + "eval_runtime": 183.676, + "eval_samples_per_second": 98.059, + "eval_steps_per_second": 6.13, "step": 40000 }, { - "epoch": 4.310623183726187, - "grad_norm": 0.6644855737686157, - "learning_rate": 0.00034182092446934597, - "loss": 3.3697, + "epoch": 4.318059299191375, + "grad_norm": 0.6281748414039612, + "learning_rate": 0.0003413815434430653, + "loss": 3.4086, "step": 40050 }, { - "epoch": 4.3160047357657945, - "grad_norm": 0.6772639751434326, - "learning_rate": 0.00034149768343928456, - "loss": 3.389, + "epoch": 4.32345013477089, + "grad_norm": 0.6316752433776855, + "learning_rate": 0.0003410577441985968, + "loss": 3.3781, "step": 40100 }, { - "epoch": 4.321386287805403, - "grad_norm": 0.6445088386535645, - "learning_rate": 0.0003411744424092231, - "loss": 3.39, + "epoch": 4.328840970350404, + "grad_norm": 0.6012856364250183, + "learning_rate": 0.00034073394495412837, + "loss": 3.3941, "step": 40150 }, { - "epoch": 4.326767839845012, - "grad_norm": 0.6840226054191589, - "learning_rate": 0.0003408512013791617, - "loss": 3.3781, + "epoch": 4.334231805929919, + "grad_norm": 0.6168755888938904, + "learning_rate": 0.00034041014570965997, + "loss": 3.4104, "step": 40200 }, { - "epoch": 4.332149391884619, - "grad_norm": 0.7001984715461731, - "learning_rate": 0.00034052796034910024, - "loss": 3.4049, + "epoch": 4.339622641509434, + "grad_norm": 0.6107414960861206, + "learning_rate": 0.0003400863464651915, + "loss": 3.3907, "step": 40250 }, { - "epoch": 4.337530943924228, - "grad_norm": 0.6908742189407349, - "learning_rate": 0.0003402047193190389, - "loss": 3.3838, + "epoch": 4.345013477088949, + "grad_norm": 0.6335568428039551, + "learning_rate": 0.0003397625472207231, + "loss": 3.4018, "step": 40300 }, { - "epoch": 4.342912495963836, - "grad_norm": 0.6147873401641846, - "learning_rate": 0.0003398814782889775, - "loss": 3.3824, + "epoch": 4.350404312668464, + "grad_norm": 0.6497905254364014, + "learning_rate": 0.0003394387479762547, + "loss": 3.4083, "step": 40350 }, { - "epoch": 4.348294048003444, - "grad_norm": 0.6918706893920898, - "learning_rate": 0.000339558237258916, - "loss": 3.4047, + "epoch": 4.355795148247978, + "grad_norm": 0.6541007161140442, + "learning_rate": 0.0003391149487317863, + "loss": 3.397, "step": 40400 }, { - "epoch": 4.3536756000430525, - "grad_norm": 0.6723856925964355, - "learning_rate": 0.0003392349962288546, - "loss": 3.4004, + "epoch": 4.361185983827493, + "grad_norm": 0.6449550986289978, + "learning_rate": 0.00033879114948731783, + "loss": 3.4032, "step": 40450 }, { - "epoch": 4.359057152082661, - "grad_norm": 0.6632176637649536, - "learning_rate": 0.0003389117551987932, - "loss": 3.3876, + "epoch": 4.366576819407008, + "grad_norm": 0.6576782464981079, + "learning_rate": 0.00033846735024284944, + "loss": 3.3723, "step": 40500 }, { - "epoch": 4.364438704122269, - "grad_norm": 0.6662275791168213, - "learning_rate": 0.00033858851416873175, - "loss": 3.396, + "epoch": 4.3719676549865225, + "grad_norm": 0.6394531726837158, + "learning_rate": 0.000338143550998381, + "loss": 3.3806, "step": 40550 }, { - "epoch": 4.369820256161877, - "grad_norm": 0.6214112043380737, - "learning_rate": 0.0003382652731386704, - "loss": 3.3953, + "epoch": 4.377358490566038, + "grad_norm": 0.6244447827339172, + "learning_rate": 0.00033781975175391254, + "loss": 3.3881, "step": 40600 }, { - "epoch": 4.375201808201485, - "grad_norm": 0.6457570195198059, - "learning_rate": 0.000337942032108609, - "loss": 3.3698, + "epoch": 4.382749326145553, + "grad_norm": 0.6317362189292908, + "learning_rate": 0.00033749595250944414, + "loss": 3.3891, "step": 40650 }, { - "epoch": 4.3805833602410935, - "grad_norm": 0.7015320062637329, - "learning_rate": 0.00033761879107854754, - "loss": 3.4086, + "epoch": 4.388140161725067, + "grad_norm": 0.6562482118606567, + "learning_rate": 0.00033717215326497564, + "loss": 3.3921, "step": 40700 }, { - "epoch": 4.385964912280702, - "grad_norm": 0.6032451391220093, - "learning_rate": 0.00033729555004848613, - "loss": 3.3934, + "epoch": 4.393530997304582, + "grad_norm": 0.6376356482505798, + "learning_rate": 0.0003368483540205073, + "loss": 3.3925, "step": 40750 }, { - "epoch": 4.39134646432031, - "grad_norm": 0.7044788002967834, - "learning_rate": 0.00033697230901842467, - "loss": 3.387, + "epoch": 4.398921832884097, + "grad_norm": 0.5876014232635498, + "learning_rate": 0.0003365245547760388, + "loss": 3.3929, "step": 40800 }, { - "epoch": 4.396728016359918, - "grad_norm": 0.6239883899688721, - "learning_rate": 0.00033664906798836327, - "loss": 3.4018, + "epoch": 4.404312668463612, + "grad_norm": 0.6881380081176758, + "learning_rate": 0.0003362007555315704, + "loss": 3.3851, "step": 40850 }, { - "epoch": 4.402109568399527, - "grad_norm": 0.6300894021987915, - "learning_rate": 0.0003363258269583019, - "loss": 3.3962, + "epoch": 4.409703504043127, + "grad_norm": 0.6236041188240051, + "learning_rate": 0.00033587695628710195, + "loss": 3.39, "step": 40900 }, { - "epoch": 4.4074911204391345, - "grad_norm": 0.6542284488677979, - "learning_rate": 0.00033600258592824046, - "loss": 3.3824, + "epoch": 4.415094339622642, + "grad_norm": 0.6073125600814819, + "learning_rate": 0.00033555315704263355, + "loss": 3.3982, "step": 40950 }, { - "epoch": 4.412872672478743, - "grad_norm": 0.6408656239509583, - "learning_rate": 0.00033567934489817905, - "loss": 3.3844, + "epoch": 4.420485175202156, + "grad_norm": 0.6230391263961792, + "learning_rate": 0.0003352293577981651, + "loss": 3.3907, "step": 41000 }, { - "epoch": 4.412872672478743, - "eval_accuracy": 0.37776924973333864, - "eval_loss": 3.43493390083313, - "eval_runtime": 184.3217, - "eval_samples_per_second": 97.715, - "eval_steps_per_second": 6.109, + "epoch": 4.420485175202156, + "eval_accuracy": 0.37776414304761136, + "eval_loss": 3.4336507320404053, + "eval_runtime": 183.8326, + "eval_samples_per_second": 97.975, + "eval_steps_per_second": 6.125, "step": 41000 }, { - "epoch": 4.418254224518351, - "grad_norm": 0.6970384120941162, - "learning_rate": 0.00033535610386811764, - "loss": 3.3984, + "epoch": 4.425876010781671, + "grad_norm": 0.6647446155548096, + "learning_rate": 0.00033490555855369665, + "loss": 3.3783, "step": 41050 }, { - "epoch": 4.423635776557959, - "grad_norm": 0.719208300113678, - "learning_rate": 0.0003350328628380562, - "loss": 3.4049, + "epoch": 4.431266846361186, + "grad_norm": 0.7069098353385925, + "learning_rate": 0.00033458175930922826, + "loss": 3.3957, "step": 41100 }, { - "epoch": 4.429017328597568, - "grad_norm": 0.6588863134384155, - "learning_rate": 0.0003347096218079948, - "loss": 3.3783, + "epoch": 4.436657681940701, + "grad_norm": 0.6159655451774597, + "learning_rate": 0.0003342579600647598, + "loss": 3.3931, "step": 41150 }, { - "epoch": 4.4343988806371755, - "grad_norm": 0.6663377285003662, - "learning_rate": 0.00033438638077793343, - "loss": 3.3826, + "epoch": 4.442048517520216, + "grad_norm": 0.6176379323005676, + "learning_rate": 0.0003339341608202914, + "loss": 3.376, "step": 41200 }, { - "epoch": 4.439780432676784, - "grad_norm": 0.649978518486023, - "learning_rate": 0.00033406313974787197, - "loss": 3.3947, + "epoch": 4.44743935309973, + "grad_norm": 0.6525024771690369, + "learning_rate": 0.00033361036157582297, + "loss": 3.3894, "step": 41250 }, { - "epoch": 4.445161984716393, - "grad_norm": 0.6662973761558533, - "learning_rate": 0.00033373989871781056, - "loss": 3.3922, + "epoch": 4.452830188679245, + "grad_norm": 0.6509463787078857, + "learning_rate": 0.00033328656233135457, + "loss": 3.3807, "step": 41300 }, { - "epoch": 4.450543536756, - "grad_norm": 0.6409628391265869, - "learning_rate": 0.0003334166576877491, - "loss": 3.3832, + "epoch": 4.45822102425876, + "grad_norm": 0.6493340134620667, + "learning_rate": 0.0003329627630868861, + "loss": 3.4081, "step": 41350 }, { - "epoch": 4.455925088795609, - "grad_norm": 0.6468245387077332, - "learning_rate": 0.0003330934166576877, - "loss": 3.3892, + "epoch": 4.463611859838275, + "grad_norm": 0.623526930809021, + "learning_rate": 0.0003326389638424177, + "loss": 3.4001, "step": 41400 }, { - "epoch": 4.461306640835216, - "grad_norm": 0.6183198690414429, - "learning_rate": 0.00033277017562762635, - "loss": 3.4058, + "epoch": 4.46900269541779, + "grad_norm": 0.6079691052436829, + "learning_rate": 0.0003323151645979492, + "loss": 3.3872, "step": 41450 }, { - "epoch": 4.466688192874825, - "grad_norm": 0.6112544536590576, - "learning_rate": 0.0003324469345975649, - "loss": 3.4, + "epoch": 4.474393530997305, + "grad_norm": 0.6355500817298889, + "learning_rate": 0.00033199136535348077, + "loss": 3.3767, "step": 41500 }, { - "epoch": 4.4720697449144335, - "grad_norm": 0.6665729880332947, - "learning_rate": 0.0003321236935675035, - "loss": 3.4103, + "epoch": 4.479784366576819, + "grad_norm": 0.6550726294517517, + "learning_rate": 0.0003316675661090124, + "loss": 3.3942, "step": 41550 }, { - "epoch": 4.477451296954041, - "grad_norm": 0.6289717555046082, - "learning_rate": 0.0003318004525374421, - "loss": 3.3997, + "epoch": 4.485175202156334, + "grad_norm": 0.6663205027580261, + "learning_rate": 0.00033134376686454393, + "loss": 3.3968, "step": 41600 }, { - "epoch": 4.48283284899365, - "grad_norm": 0.6453819274902344, - "learning_rate": 0.0003314772115073806, - "loss": 3.3909, + "epoch": 4.490566037735849, + "grad_norm": 0.6686147451400757, + "learning_rate": 0.00033101996762007553, + "loss": 3.3851, "step": 41650 }, { - "epoch": 4.488214401033258, - "grad_norm": 0.7051796913146973, - "learning_rate": 0.0003311539704773192, - "loss": 3.4105, + "epoch": 4.495956873315364, + "grad_norm": 0.6377660632133484, + "learning_rate": 0.0003306961683756071, + "loss": 3.4061, "step": 41700 }, { - "epoch": 4.493595953072866, - "grad_norm": 0.7093035578727722, - "learning_rate": 0.00033083072944725786, - "loss": 3.3901, + "epoch": 4.501347708894879, + "grad_norm": 0.6469005942344666, + "learning_rate": 0.0003303723691311387, + "loss": 3.3871, "step": 41750 }, { - "epoch": 4.4989775051124745, - "grad_norm": 0.6689607501029968, - "learning_rate": 0.0003305074884171964, - "loss": 3.3928, + "epoch": 4.506738544474393, + "grad_norm": 0.6090621948242188, + "learning_rate": 0.00033004856988667024, + "loss": 3.3872, "step": 41800 }, { - "epoch": 4.504359057152083, - "grad_norm": 0.6262616515159607, - "learning_rate": 0.000330184247387135, - "loss": 3.3853, + "epoch": 4.512129380053908, + "grad_norm": 0.6723948121070862, + "learning_rate": 0.0003297247706422018, + "loss": 3.3949, "step": 41850 }, { - "epoch": 4.509740609191691, - "grad_norm": 0.663016676902771, - "learning_rate": 0.00032986100635707354, - "loss": 3.3819, + "epoch": 4.517520215633423, + "grad_norm": 0.6774519085884094, + "learning_rate": 0.0003294009713977334, + "loss": 3.3961, "step": 41900 }, { - "epoch": 4.515122161231299, - "grad_norm": 0.6628140807151794, - "learning_rate": 0.00032953776532701213, - "loss": 3.3863, + "epoch": 4.5229110512129385, + "grad_norm": 0.61894690990448, + "learning_rate": 0.00032907717215326494, + "loss": 3.3959, "step": 41950 }, { - "epoch": 4.520503713270907, - "grad_norm": 0.6778246760368347, - "learning_rate": 0.00032921452429695067, - "loss": 3.3913, + "epoch": 4.528301886792453, + "grad_norm": 0.6208569407463074, + "learning_rate": 0.00032875337290879655, + "loss": 3.3865, "step": 42000 }, { - "epoch": 4.520503713270907, - "eval_accuracy": 0.37859772800292407, - "eval_loss": 3.43190598487854, - "eval_runtime": 183.9779, - "eval_samples_per_second": 97.898, - "eval_steps_per_second": 6.12, + "epoch": 4.528301886792453, + "eval_accuracy": 0.37887283711487035, + "eval_loss": 3.42965030670166, + "eval_runtime": 184.057, + "eval_samples_per_second": 97.856, + "eval_steps_per_second": 6.118, "step": 42000 }, { - "epoch": 4.5258852653105155, - "grad_norm": 0.6411513686180115, - "learning_rate": 0.0003288912832668893, - "loss": 3.3865, + "epoch": 4.533692722371968, + "grad_norm": 0.6180669069290161, + "learning_rate": 0.00032842957366432805, + "loss": 3.3838, "step": 42050 }, { - "epoch": 4.531266817350124, - "grad_norm": 0.6528933644294739, - "learning_rate": 0.0003285680422368279, - "loss": 3.4013, + "epoch": 4.539083557951482, + "grad_norm": 0.6204472184181213, + "learning_rate": 0.0003281057744198597, + "loss": 3.4125, "step": 42100 }, { - "epoch": 4.536648369389732, - "grad_norm": 0.6158764362335205, - "learning_rate": 0.0003282448012067665, - "loss": 3.3782, + "epoch": 4.544474393530997, + "grad_norm": 0.6484192609786987, + "learning_rate": 0.0003277819751753912, + "loss": 3.3988, "step": 42150 }, { - "epoch": 4.54202992142934, - "grad_norm": 0.7035838961601257, - "learning_rate": 0.00032792156017670505, - "loss": 3.4084, + "epoch": 4.549865229110512, + "grad_norm": 0.5903990864753723, + "learning_rate": 0.0003274581759309228, + "loss": 3.3797, "step": 42200 }, { - "epoch": 4.547411473468949, - "grad_norm": 0.6502971053123474, - "learning_rate": 0.00032759831914664365, - "loss": 3.3929, + "epoch": 4.555256064690027, + "grad_norm": 0.8267539739608765, + "learning_rate": 0.00032713437668645436, + "loss": 3.4023, "step": 42250 }, { - "epoch": 4.5527930255085565, - "grad_norm": 0.6222167611122131, - "learning_rate": 0.0003272750781165823, - "loss": 3.3799, + "epoch": 4.560646900269542, + "grad_norm": 0.645702600479126, + "learning_rate": 0.0003268105774419859, + "loss": 3.4002, "step": 42300 }, { - "epoch": 4.558174577548165, - "grad_norm": 0.7282003164291382, - "learning_rate": 0.00032695183708652083, - "loss": 3.3734, + "epoch": 4.566037735849057, + "grad_norm": 0.6652170419692993, + "learning_rate": 0.0003264867781975175, + "loss": 3.3839, "step": 42350 }, { - "epoch": 4.563556129587774, - "grad_norm": 0.6520505547523499, - "learning_rate": 0.00032662859605645943, - "loss": 3.3722, + "epoch": 4.571428571428571, + "grad_norm": 0.7304674983024597, + "learning_rate": 0.00032616297895304906, + "loss": 3.4082, "step": 42400 }, { - "epoch": 4.568937681627381, - "grad_norm": 0.6786864399909973, - "learning_rate": 0.00032630535502639797, - "loss": 3.4029, + "epoch": 4.576819407008086, + "grad_norm": 0.590006411075592, + "learning_rate": 0.00032583917970858067, + "loss": 3.3774, "step": 42450 }, { - "epoch": 4.57431923366699, - "grad_norm": 0.651889979839325, - "learning_rate": 0.00032598211399633656, - "loss": 3.3934, + "epoch": 4.5822102425876015, + "grad_norm": 0.6259329915046692, + "learning_rate": 0.0003255153804641122, + "loss": 3.3776, "step": 42500 }, { - "epoch": 4.579700785706597, - "grad_norm": 0.662212610244751, - "learning_rate": 0.0003256588729662751, - "loss": 3.3881, + "epoch": 4.587601078167116, + "grad_norm": 0.6422986388206482, + "learning_rate": 0.0003251915812196438, + "loss": 3.3924, "step": 42550 }, { - "epoch": 4.585082337746206, - "grad_norm": 0.6370143294334412, - "learning_rate": 0.00032533563193621375, - "loss": 3.3962, + "epoch": 4.592991913746631, + "grad_norm": 0.6286759972572327, + "learning_rate": 0.00032486778197517537, + "loss": 3.391, "step": 42600 }, { - "epoch": 4.5904638897858145, - "grad_norm": 0.6888731718063354, - "learning_rate": 0.00032501239090615235, - "loss": 3.3792, + "epoch": 4.598382749326145, + "grad_norm": 0.6902345418930054, + "learning_rate": 0.000324543982730707, + "loss": 3.4023, "step": 42650 }, { - "epoch": 4.595845441825422, - "grad_norm": 0.6885330677032471, - "learning_rate": 0.0003246891498760909, - "loss": 3.3741, + "epoch": 4.60377358490566, + "grad_norm": 0.6514720320701599, + "learning_rate": 0.00032422018348623853, + "loss": 3.3967, "step": 42700 }, { - "epoch": 4.601226993865031, - "grad_norm": 0.6344414949417114, - "learning_rate": 0.0003243659088460295, - "loss": 3.3869, + "epoch": 4.609164420485175, + "grad_norm": 0.6290830373764038, + "learning_rate": 0.0003239028602266595, + "loss": 3.3837, "step": 42750 }, { - "epoch": 4.606608545904638, - "grad_norm": 0.680259108543396, - "learning_rate": 0.0003240426678159681, - "loss": 3.3875, + "epoch": 4.6145552560646905, + "grad_norm": 0.6454638838768005, + "learning_rate": 0.000323579060982191, + "loss": 3.4053, "step": 42800 }, { - "epoch": 4.611990097944247, - "grad_norm": 0.6663812398910522, - "learning_rate": 0.0003237258916065079, - "loss": 3.415, + "epoch": 4.619946091644205, + "grad_norm": 0.6117614507675171, + "learning_rate": 0.0003232552617377226, + "loss": 3.3996, "step": 42850 }, { - "epoch": 4.6173716499838555, - "grad_norm": 0.6414598822593689, - "learning_rate": 0.0003234026505764465, - "loss": 3.3925, + "epoch": 4.62533692722372, + "grad_norm": 0.5953809022903442, + "learning_rate": 0.00032293146249325413, + "loss": 3.3871, "step": 42900 }, { - "epoch": 4.622753202023463, - "grad_norm": 0.657367467880249, - "learning_rate": 0.000323079409546385, - "loss": 3.3902, + "epoch": 4.630727762803234, + "grad_norm": 0.6086660027503967, + "learning_rate": 0.00032260766324878574, + "loss": 3.3894, "step": 42950 }, { - "epoch": 4.628134754063072, - "grad_norm": 0.6666000485420227, - "learning_rate": 0.00032275616851632367, - "loss": 3.3802, + "epoch": 4.636118598382749, + "grad_norm": 0.6196882128715515, + "learning_rate": 0.0003222838640043173, + "loss": 3.4048, "step": 43000 }, { - "epoch": 4.628134754063072, - "eval_accuracy": 0.378789283044141, - "eval_loss": 3.4260668754577637, - "eval_runtime": 184.1527, - "eval_samples_per_second": 97.805, - "eval_steps_per_second": 6.114, + "epoch": 4.636118598382749, + "eval_accuracy": 0.37893466060803677, + "eval_loss": 3.424194097518921, + "eval_runtime": 183.5359, + "eval_samples_per_second": 98.133, + "eval_steps_per_second": 6.135, "step": 43000 }, { - "epoch": 4.63351630610268, - "grad_norm": 0.6748586893081665, - "learning_rate": 0.00032243292748626226, - "loss": 3.4022, + "epoch": 4.6415094339622645, + "grad_norm": 0.5985151529312134, + "learning_rate": 0.00032196006475984884, + "loss": 3.3852, "step": 43050 }, { - "epoch": 4.638897858142288, - "grad_norm": 0.6140267252922058, - "learning_rate": 0.0003221096864562008, - "loss": 3.3837, + "epoch": 4.646900269541779, + "grad_norm": 0.6808710694313049, + "learning_rate": 0.00032163626551538044, + "loss": 3.3863, "step": 43100 }, { - "epoch": 4.6442794101818965, - "grad_norm": 0.6773905754089355, - "learning_rate": 0.0003217864454261394, - "loss": 3.3804, + "epoch": 4.652291105121294, + "grad_norm": 0.6364196538925171, + "learning_rate": 0.000321312466270912, + "loss": 3.3832, "step": 43150 }, { - "epoch": 4.649660962221505, - "grad_norm": 0.6858988404273987, - "learning_rate": 0.00032146320439607794, - "loss": 3.3782, + "epoch": 4.657681940700809, + "grad_norm": 0.6553201675415039, + "learning_rate": 0.0003209886670264436, + "loss": 3.3917, "step": 43200 }, { - "epoch": 4.655042514261113, - "grad_norm": 0.6207734942436218, - "learning_rate": 0.0003211399633660166, - "loss": 3.3924, + "epoch": 4.663072776280323, + "grad_norm": 0.6401380300521851, + "learning_rate": 0.00032066486778197515, + "loss": 3.3898, "step": 43250 }, { - "epoch": 4.660424066300721, - "grad_norm": 0.6594465374946594, - "learning_rate": 0.0003208167223359552, - "loss": 3.4044, + "epoch": 4.668463611859838, + "grad_norm": 0.6377171277999878, + "learning_rate": 0.00032034106853750675, + "loss": 3.398, "step": 43300 }, { - "epoch": 4.665805618340329, - "grad_norm": 0.7123188972473145, - "learning_rate": 0.0003204934813058937, - "loss": 3.3888, + "epoch": 4.6738544474393535, + "grad_norm": 0.6544540524482727, + "learning_rate": 0.0003200172692930383, + "loss": 3.3757, "step": 43350 }, { - "epoch": 4.6711871703799375, - "grad_norm": 0.6485627889633179, - "learning_rate": 0.0003201702402758323, - "loss": 3.4016, + "epoch": 4.679245283018868, + "grad_norm": 0.7005414366722107, + "learning_rate": 0.0003196934700485699, + "loss": 3.3839, "step": 43400 }, { - "epoch": 4.676568722419546, - "grad_norm": 0.6207435727119446, - "learning_rate": 0.0003198469992457709, - "loss": 3.3982, + "epoch": 4.684636118598383, + "grad_norm": 0.6461729407310486, + "learning_rate": 0.0003193696708041014, + "loss": 3.3907, "step": 43450 }, { - "epoch": 4.681950274459154, - "grad_norm": 0.6076232194900513, - "learning_rate": 0.00031952375821570945, - "loss": 3.3853, + "epoch": 4.690026954177897, + "grad_norm": 0.6113434433937073, + "learning_rate": 0.00031904587155963296, + "loss": 3.3896, "step": 43500 }, { - "epoch": 4.687331826498762, - "grad_norm": 0.6661508679389954, - "learning_rate": 0.0003192005171856481, - "loss": 3.397, + "epoch": 4.695417789757412, + "grad_norm": 0.6722586154937744, + "learning_rate": 0.00031872207231516456, + "loss": 3.377, "step": 43550 }, { - "epoch": 4.692713378538371, - "grad_norm": 0.6532484889030457, - "learning_rate": 0.0003188772761555867, - "loss": 3.3898, + "epoch": 4.7008086253369274, + "grad_norm": 0.6458831429481506, + "learning_rate": 0.0003183982730706961, + "loss": 3.3819, "step": 43600 }, { - "epoch": 4.6980949305779784, - "grad_norm": 0.6694145798683167, - "learning_rate": 0.00031855403512552524, - "loss": 3.4124, + "epoch": 4.706199460916442, + "grad_norm": 0.6122461557388306, + "learning_rate": 0.0003180744738262277, + "loss": 3.3987, "step": 43650 }, { - "epoch": 4.703476482617587, - "grad_norm": 0.7081990838050842, - "learning_rate": 0.00031823079409546383, - "loss": 3.3859, + "epoch": 4.711590296495957, + "grad_norm": 0.6254520416259766, + "learning_rate": 0.00031775067458175927, + "loss": 3.3896, "step": 43700 }, { - "epoch": 4.7088580346571955, - "grad_norm": 0.6278550028800964, - "learning_rate": 0.00031790755306540237, - "loss": 3.3846, + "epoch": 4.716981132075472, + "grad_norm": 0.6644843816757202, + "learning_rate": 0.00031742687533729087, + "loss": 3.39, "step": 43750 }, { - "epoch": 4.714239586696803, - "grad_norm": 0.6548607349395752, - "learning_rate": 0.00031758431203534097, - "loss": 3.388, + "epoch": 4.722371967654986, + "grad_norm": 0.6221049427986145, + "learning_rate": 0.0003171030760928224, + "loss": 3.3815, "step": 43800 }, { - "epoch": 4.719621138736412, - "grad_norm": 0.6434149742126465, - "learning_rate": 0.0003172610710052796, - "loss": 3.3952, + "epoch": 4.727762803234501, + "grad_norm": 0.6321914792060852, + "learning_rate": 0.000316779276848354, + "loss": 3.402, "step": 43850 }, { - "epoch": 4.725002690776019, - "grad_norm": 0.6363683938980103, - "learning_rate": 0.00031693782997521816, - "loss": 3.3814, + "epoch": 4.7331536388140165, + "grad_norm": 0.6218891143798828, + "learning_rate": 0.0003164554776038856, + "loss": 3.4087, "step": 43900 }, { - "epoch": 4.730384242815628, - "grad_norm": 0.6362830996513367, - "learning_rate": 0.00031661458894515675, - "loss": 3.3889, + "epoch": 4.738544474393531, + "grad_norm": 0.6218807101249695, + "learning_rate": 0.00031613167835941713, + "loss": 3.4077, "step": 43950 }, { - "epoch": 4.7357657948552365, - "grad_norm": 0.6621768474578857, - "learning_rate": 0.0003162913479150953, - "loss": 3.393, + "epoch": 4.743935309973046, + "grad_norm": 0.6768582463264465, + "learning_rate": 0.00031580787911494873, + "loss": 3.3723, "step": 44000 }, { - "epoch": 4.7357657948552365, - "eval_accuracy": 0.3792806114028381, - "eval_loss": 3.420133352279663, - "eval_runtime": 184.4795, - "eval_samples_per_second": 97.631, - "eval_steps_per_second": 6.104, + "epoch": 4.743935309973046, + "eval_accuracy": 0.37957755974523505, + "eval_loss": 3.418135166168213, + "eval_runtime": 183.6046, + "eval_samples_per_second": 98.097, + "eval_steps_per_second": 6.133, "step": 44000 }, { - "epoch": 4.741147346894844, - "grad_norm": 0.6609659194946289, - "learning_rate": 0.0003159681068850339, - "loss": 3.3955, + "epoch": 4.74932614555256, + "grad_norm": 0.6679436564445496, + "learning_rate": 0.00031548407987048023, + "loss": 3.4082, "step": 44050 }, { - "epoch": 4.746528898934453, - "grad_norm": 0.6639893054962158, - "learning_rate": 0.00031564486585497253, - "loss": 3.3899, + "epoch": 4.754716981132075, + "grad_norm": 0.698415994644165, + "learning_rate": 0.0003151602806260119, + "loss": 3.3843, "step": 44100 }, { - "epoch": 4.751910450974061, - "grad_norm": 0.6364026069641113, - "learning_rate": 0.0003153216248249111, - "loss": 3.3913, + "epoch": 4.7601078167115904, + "grad_norm": 0.6204515695571899, + "learning_rate": 0.0003148364813815434, + "loss": 3.3792, "step": 44150 }, { - "epoch": 4.757292003013669, - "grad_norm": 0.6532016396522522, - "learning_rate": 0.00031499838379484967, - "loss": 3.4, + "epoch": 4.765498652291106, + "grad_norm": 0.60456782579422, + "learning_rate": 0.000314512682137075, + "loss": 3.401, "step": 44200 }, { - "epoch": 4.7626735550532775, - "grad_norm": 0.6646419167518616, - "learning_rate": 0.00031467514276478826, - "loss": 3.3959, + "epoch": 4.77088948787062, + "grad_norm": 0.6760085225105286, + "learning_rate": 0.00031418888289260654, + "loss": 3.3948, "step": 44250 }, { - "epoch": 4.768055107092886, - "grad_norm": 0.6860271096229553, - "learning_rate": 0.0003143519017347268, - "loss": 3.4054, + "epoch": 4.776280323450135, + "grad_norm": 0.6306468844413757, + "learning_rate": 0.0003138650836481381, + "loss": 3.3888, "step": 44300 }, { - "epoch": 4.773436659132494, - "grad_norm": 0.6953567862510681, - "learning_rate": 0.0003140286607046654, - "loss": 3.3912, + "epoch": 4.781671159029649, + "grad_norm": 0.618640124797821, + "learning_rate": 0.0003135412844036697, + "loss": 3.3987, "step": 44350 }, { - "epoch": 4.778818211172102, - "grad_norm": 0.6766008734703064, - "learning_rate": 0.00031370541967460405, - "loss": 3.3885, + "epoch": 4.787061994609164, + "grad_norm": 0.7838083505630493, + "learning_rate": 0.00031321748515920124, + "loss": 3.3795, "step": 44400 }, { - "epoch": 4.78419976321171, - "grad_norm": 0.6261332631111145, - "learning_rate": 0.0003133821786445426, - "loss": 3.3869, + "epoch": 4.7924528301886795, + "grad_norm": 0.6274157166481018, + "learning_rate": 0.00031289368591473285, + "loss": 3.3961, "step": 44450 }, { - "epoch": 4.7895813152513185, - "grad_norm": 0.6700854897499084, - "learning_rate": 0.0003130589376144812, - "loss": 3.4018, + "epoch": 4.797843665768194, + "grad_norm": 0.6056527495384216, + "learning_rate": 0.0003125698866702644, + "loss": 3.379, "step": 44500 }, { - "epoch": 4.794962867290927, - "grad_norm": 0.680160641670227, - "learning_rate": 0.0003127356965844197, - "loss": 3.3795, + "epoch": 4.803234501347709, + "grad_norm": 0.6391441226005554, + "learning_rate": 0.000312246087425796, + "loss": 3.3785, "step": 44550 }, { - "epoch": 4.800344419330535, - "grad_norm": 0.6283779740333557, - "learning_rate": 0.0003124124555543583, - "loss": 3.3899, + "epoch": 4.808625336927224, + "grad_norm": 0.6809561252593994, + "learning_rate": 0.00031192228818132756, + "loss": 3.3926, "step": 44600 }, { - "epoch": 4.805725971370143, - "grad_norm": 0.6881921887397766, - "learning_rate": 0.0003120892145242969, - "loss": 3.3931, + "epoch": 4.814016172506738, + "grad_norm": 0.6451635360717773, + "learning_rate": 0.00031159848893685916, + "loss": 3.3838, "step": 44650 }, { - "epoch": 4.811107523409751, - "grad_norm": 0.7671242356300354, - "learning_rate": 0.0003117659734942355, - "loss": 3.4, + "epoch": 4.819407008086253, + "grad_norm": 0.6554741859436035, + "learning_rate": 0.0003112746896923907, + "loss": 3.3749, "step": 44700 }, { - "epoch": 4.8164890754493594, - "grad_norm": 0.6230366826057434, - "learning_rate": 0.00031144919728477526, - "loss": 3.3955, + "epoch": 4.824797843665769, + "grad_norm": 0.6644231677055359, + "learning_rate": 0.0003109508904479222, + "loss": 3.3767, "step": 44750 }, { - "epoch": 4.821870627488968, - "grad_norm": 0.6817081570625305, - "learning_rate": 0.0003111259562547139, - "loss": 3.3913, + "epoch": 4.830188679245283, + "grad_norm": 0.6254944205284119, + "learning_rate": 0.0003106270912034538, + "loss": 3.3938, "step": 44800 }, { - "epoch": 4.827252179528576, - "grad_norm": 0.6694238185882568, - "learning_rate": 0.0003108027152246525, - "loss": 3.3856, + "epoch": 4.835579514824798, + "grad_norm": 0.6484498977661133, + "learning_rate": 0.00031030329195898536, + "loss": 3.3877, "step": 44850 }, { - "epoch": 4.832633731568184, - "grad_norm": 0.6804167628288269, - "learning_rate": 0.0003104794741945911, - "loss": 3.3836, + "epoch": 4.840970350404312, + "grad_norm": 0.6299556493759155, + "learning_rate": 0.00030997949271451697, + "loss": 3.3888, "step": 44900 }, { - "epoch": 4.838015283607793, - "grad_norm": 0.7400542497634888, - "learning_rate": 0.00031015623316452964, - "loss": 3.3945, + "epoch": 4.846361185983827, + "grad_norm": 0.6518325209617615, + "learning_rate": 0.0003096556934700485, + "loss": 3.3854, "step": 44950 }, { - "epoch": 4.8433968356474, - "grad_norm": 0.6841332912445068, - "learning_rate": 0.00030983299213446823, - "loss": 3.4061, + "epoch": 4.8517520215633425, + "grad_norm": 0.6268826723098755, + "learning_rate": 0.0003093318942255801, + "loss": 3.3969, "step": 45000 }, { - "epoch": 4.8433968356474, - "eval_accuracy": 0.3799427421011795, - "eval_loss": 3.4151318073272705, - "eval_runtime": 184.0085, - "eval_samples_per_second": 97.881, - "eval_steps_per_second": 6.119, + "epoch": 4.8517520215633425, + "eval_accuracy": 0.38006736696350274, + "eval_loss": 3.4140915870666504, + "eval_runtime": 183.6741, + "eval_samples_per_second": 98.06, + "eval_steps_per_second": 6.13, "step": 45000 }, { - "epoch": 4.848778387687009, - "grad_norm": 0.6875273585319519, - "learning_rate": 0.0003095097511044069, - "loss": 3.381, + "epoch": 4.857142857142857, + "grad_norm": 0.6254715919494629, + "learning_rate": 0.0003090080949811117, + "loss": 3.3722, "step": 45050 }, { - "epoch": 4.8541599397266175, - "grad_norm": 0.630673348903656, - "learning_rate": 0.0003091865100743454, - "loss": 3.3863, + "epoch": 4.862533692722372, + "grad_norm": 0.6830570697784424, + "learning_rate": 0.0003086842957366433, + "loss": 3.3984, "step": 45100 }, { - "epoch": 4.859541491766225, - "grad_norm": 0.6930129528045654, - "learning_rate": 0.000308863269044284, - "loss": 3.3922, + "epoch": 4.867924528301887, + "grad_norm": 0.701416015625, + "learning_rate": 0.00030836049649217483, + "loss": 3.3715, "step": 45150 }, { - "epoch": 4.864923043805834, - "grad_norm": 0.6872902512550354, - "learning_rate": 0.00030854002801422256, - "loss": 3.4027, + "epoch": 4.873315363881401, + "grad_norm": 0.6478974223136902, + "learning_rate": 0.0003080366972477064, + "loss": 3.3933, "step": 45200 }, { - "epoch": 4.870304595845441, - "grad_norm": 0.6678783297538757, - "learning_rate": 0.00030821678698416115, - "loss": 3.3917, + "epoch": 4.878706199460916, + "grad_norm": 0.6832784414291382, + "learning_rate": 0.000307712898003238, + "loss": 3.3898, "step": 45250 }, { - "epoch": 4.87568614788505, - "grad_norm": 0.7070086598396301, - "learning_rate": 0.0003078935459540997, - "loss": 3.3919, + "epoch": 4.884097035040432, + "grad_norm": 0.6223458647727966, + "learning_rate": 0.00030738909875876953, + "loss": 3.3962, "step": 45300 }, { - "epoch": 4.8810676999246585, - "grad_norm": 0.7027028203010559, - "learning_rate": 0.00030757030492403834, - "loss": 3.3948, + "epoch": 4.889487870619946, + "grad_norm": 0.6796822547912598, + "learning_rate": 0.00030706529951430114, + "loss": 3.3877, "step": 45350 }, { - "epoch": 4.886449251964266, - "grad_norm": 0.6884028911590576, - "learning_rate": 0.00030724706389397694, - "loss": 3.4086, + "epoch": 4.894878706199461, + "grad_norm": 0.6090751886367798, + "learning_rate": 0.0003067415002698327, + "loss": 3.3837, "step": 45400 }, { - "epoch": 4.891830804003875, - "grad_norm": 0.7701573967933655, - "learning_rate": 0.0003069238228639155, - "loss": 3.3962, + "epoch": 4.900269541778976, + "grad_norm": 0.6062365174293518, + "learning_rate": 0.0003064177010253643, + "loss": 3.3928, "step": 45450 }, { - "epoch": 4.897212356043483, - "grad_norm": 0.6813535094261169, - "learning_rate": 0.00030660058183385407, - "loss": 3.4013, + "epoch": 4.90566037735849, + "grad_norm": 0.6820232272148132, + "learning_rate": 0.0003060939017808958, + "loss": 3.391, "step": 45500 }, { - "epoch": 4.902593908083091, - "grad_norm": 0.6463482975959778, - "learning_rate": 0.00030627734080379267, - "loss": 3.3926, + "epoch": 4.9110512129380055, + "grad_norm": 0.641365647315979, + "learning_rate": 0.0003057701025364274, + "loss": 3.3799, "step": 45550 }, { - "epoch": 4.9079754601226995, - "grad_norm": 0.6464250683784485, - "learning_rate": 0.0003059540997737312, - "loss": 3.3822, + "epoch": 4.916442048517521, + "grad_norm": 0.6937678456306458, + "learning_rate": 0.00030544630329195895, + "loss": 3.4047, "step": 45600 }, { - "epoch": 4.913357012162308, - "grad_norm": 0.6345609426498413, - "learning_rate": 0.00030563085874366986, - "loss": 3.393, + "epoch": 4.921832884097035, + "grad_norm": 0.6434536576271057, + "learning_rate": 0.0003051225040474905, + "loss": 3.3787, "step": 45650 }, { - "epoch": 4.918738564201916, - "grad_norm": 0.6901547908782959, - "learning_rate": 0.00030530761771360845, - "loss": 3.3725, + "epoch": 4.92722371967655, + "grad_norm": 0.655083417892456, + "learning_rate": 0.0003047987048030221, + "loss": 3.3902, "step": 45700 }, { - "epoch": 4.924120116241524, - "grad_norm": 0.6914418935775757, - "learning_rate": 0.000304984376683547, - "loss": 3.3789, + "epoch": 4.932614555256064, + "grad_norm": 0.6419015526771545, + "learning_rate": 0.00030447490555855365, + "loss": 3.3935, "step": 45750 }, { - "epoch": 4.929501668281132, - "grad_norm": 0.734369695186615, - "learning_rate": 0.0003046611356534856, - "loss": 3.3904, + "epoch": 4.938005390835579, + "grad_norm": 0.6669827103614807, + "learning_rate": 0.00030415110631408526, + "loss": 3.3875, "step": 45800 }, { - "epoch": 4.9348832203207404, - "grad_norm": 0.6802307963371277, - "learning_rate": 0.0003043378946234241, - "loss": 3.3965, + "epoch": 4.943396226415095, + "grad_norm": 0.6806350350379944, + "learning_rate": 0.0003038273070696168, + "loss": 3.3902, "step": 45850 }, { - "epoch": 4.940264772360349, - "grad_norm": 0.6808255314826965, - "learning_rate": 0.0003040146535933628, - "loss": 3.3812, + "epoch": 4.948787061994609, + "grad_norm": 0.662300705909729, + "learning_rate": 0.0003035035078251484, + "loss": 3.3905, "step": 45900 }, { - "epoch": 4.945646324399957, - "grad_norm": 0.6597874760627747, - "learning_rate": 0.00030369141256330137, - "loss": 3.3646, + "epoch": 4.954177897574124, + "grad_norm": 0.6265142560005188, + "learning_rate": 0.00030317970858067996, + "loss": 3.3757, "step": 45950 }, { - "epoch": 4.951027876439565, - "grad_norm": 0.6720778346061707, - "learning_rate": 0.0003033681715332399, - "loss": 3.3894, + "epoch": 4.959568733153639, + "grad_norm": 0.6365964412689209, + "learning_rate": 0.00030285590933621157, + "loss": 3.3867, "step": 46000 }, { - "epoch": 4.951027876439565, - "eval_accuracy": 0.38033237135688225, - "eval_loss": 3.410205841064453, - "eval_runtime": 184.0, - "eval_samples_per_second": 97.886, - "eval_steps_per_second": 6.12, + "epoch": 4.959568733153639, + "eval_accuracy": 0.3805977017089252, + "eval_loss": 3.410641670227051, + "eval_runtime": 183.6273, + "eval_samples_per_second": 98.085, + "eval_steps_per_second": 6.132, "step": 46000 }, { - "epoch": 4.956409428479174, - "grad_norm": 0.6535595059394836, - "learning_rate": 0.0003030449305031785, - "loss": 3.3842, + "epoch": 4.964959568733153, + "grad_norm": 0.5964719653129578, + "learning_rate": 0.0003025321100917431, + "loss": 3.3936, "step": 46050 }, { - "epoch": 4.961790980518781, - "grad_norm": 0.6358091235160828, - "learning_rate": 0.0003027216894731171, - "loss": 3.3774, + "epoch": 4.9703504043126685, + "grad_norm": 0.6579270958900452, + "learning_rate": 0.0003022083108472746, + "loss": 3.3882, "step": 46100 }, { - "epoch": 4.96717253255839, - "grad_norm": 1.3344215154647827, - "learning_rate": 0.00030239844844305564, - "loss": 3.3972, + "epoch": 4.975741239892184, + "grad_norm": 0.6702588200569153, + "learning_rate": 0.0003018845116028062, + "loss": 3.3865, "step": 46150 }, { - "epoch": 4.9725540845979985, - "grad_norm": 0.6452608108520508, - "learning_rate": 0.0003020752074129943, - "loss": 3.3968, + "epoch": 4.981132075471698, + "grad_norm": 0.6525739431381226, + "learning_rate": 0.00030156071235833777, + "loss": 3.3865, "step": 46200 }, { - "epoch": 4.977935636637606, - "grad_norm": 0.7212709188461304, - "learning_rate": 0.0003017519663829329, - "loss": 3.3917, + "epoch": 4.986522911051213, + "grad_norm": 0.6661361455917358, + "learning_rate": 0.0003012369131138694, + "loss": 3.4074, "step": 46250 }, { - "epoch": 4.983317188677215, - "grad_norm": 0.6865209341049194, - "learning_rate": 0.0003014287253528714, - "loss": 3.3774, + "epoch": 4.991913746630727, + "grad_norm": 0.6791821122169495, + "learning_rate": 0.0003009131138694009, + "loss": 3.3957, "step": 46300 }, { - "epoch": 4.988698740716822, - "grad_norm": 0.7008549571037292, - "learning_rate": 0.00030110548432281, - "loss": 3.384, + "epoch": 4.997304582210242, + "grad_norm": 0.6901088953018188, + "learning_rate": 0.00030058931462493253, + "loss": 3.3834, "step": 46350 }, { - "epoch": 4.994080292756431, - "grad_norm": 0.6504687666893005, - "learning_rate": 0.00030078224329274856, - "loss": 3.3863, + "epoch": 5.002695417789758, + "grad_norm": 0.6481421589851379, + "learning_rate": 0.0003002655153804641, + "loss": 3.34, "step": 46400 }, { - "epoch": 4.9994618447960395, - "grad_norm": 0.6690630912780762, - "learning_rate": 0.00030045900226268715, - "loss": 3.3824, + "epoch": 5.008086253369272, + "grad_norm": 0.6943140625953674, + "learning_rate": 0.00029994171613599563, + "loss": 3.2891, "step": 46450 }, { - "epoch": 5.004843396835647, - "grad_norm": 0.6338120102882385, - "learning_rate": 0.0003001357612326258, - "loss": 3.2995, + "epoch": 5.013477088948787, + "grad_norm": 0.638863205909729, + "learning_rate": 0.00029961791689152724, + "loss": 3.2942, "step": 46500 }, { - "epoch": 5.010224948875256, - "grad_norm": 0.6364915370941162, - "learning_rate": 0.00029981252020256434, - "loss": 3.2997, + "epoch": 5.018867924528302, + "grad_norm": 0.6785563230514526, + "learning_rate": 0.0002992941176470588, + "loss": 3.3015, "step": 46550 }, { - "epoch": 5.015606500914864, - "grad_norm": 0.6910309791564941, - "learning_rate": 0.00029948927917250294, - "loss": 3.3019, + "epoch": 5.024258760107816, + "grad_norm": 0.6411895751953125, + "learning_rate": 0.0002989703184025904, + "loss": 3.2999, "step": 46600 }, { - "epoch": 5.020988052954472, - "grad_norm": 0.6775645613670349, - "learning_rate": 0.00029916603814244153, - "loss": 3.2952, + "epoch": 5.0296495956873315, + "grad_norm": 0.7144386768341064, + "learning_rate": 0.00029864651915812194, + "loss": 3.3046, "step": 46650 }, { - "epoch": 5.0263696049940805, - "grad_norm": 0.7527064681053162, - "learning_rate": 0.0002988427971123801, - "loss": 3.3052, + "epoch": 5.035040431266847, + "grad_norm": 0.657825767993927, + "learning_rate": 0.0002983227199136535, + "loss": 3.2974, "step": 46700 }, { - "epoch": 5.031751157033688, - "grad_norm": 0.6929546594619751, - "learning_rate": 0.00029851955608231867, - "loss": 3.3145, + "epoch": 5.040431266846361, + "grad_norm": 0.6665033102035522, + "learning_rate": 0.0002979989206691851, + "loss": 3.313, "step": 46750 }, { - "epoch": 5.037132709073297, - "grad_norm": 0.624110996723175, - "learning_rate": 0.00029819631505225726, - "loss": 3.3169, + "epoch": 5.045822102425876, + "grad_norm": 0.6799901127815247, + "learning_rate": 0.000297681597409606, + "loss": 3.2983, "step": 46800 }, { - "epoch": 5.042514261112905, - "grad_norm": 0.698697566986084, - "learning_rate": 0.00029787307402219586, - "loss": 3.3072, + "epoch": 5.051212938005391, + "grad_norm": 0.7204418778419495, + "learning_rate": 0.0002973577981651376, + "loss": 3.315, "step": 46850 }, { - "epoch": 5.047895813152513, - "grad_norm": 0.760403573513031, - "learning_rate": 0.00029754983299213445, - "loss": 3.3023, + "epoch": 5.056603773584905, + "grad_norm": 0.679721474647522, + "learning_rate": 0.00029703399892066915, + "loss": 3.3029, "step": 46900 }, { - "epoch": 5.0532773651921215, - "grad_norm": 0.7195192575454712, - "learning_rate": 0.000297226591962073, - "loss": 3.3267, + "epoch": 5.061994609164421, + "grad_norm": 0.6594417095184326, + "learning_rate": 0.00029671019967620076, + "loss": 3.2934, "step": 46950 }, { - "epoch": 5.05865891723173, - "grad_norm": 0.640288770198822, - "learning_rate": 0.00029690335093201164, - "loss": 3.2988, + "epoch": 5.067385444743936, + "grad_norm": 0.7225853204727173, + "learning_rate": 0.0002963864004317323, + "loss": 3.3111, "step": 47000 }, { - "epoch": 5.05865891723173, - "eval_accuracy": 0.38089660580330287, - "eval_loss": 3.4140713214874268, - "eval_runtime": 184.5126, - "eval_samples_per_second": 97.614, - "eval_steps_per_second": 6.103, + "epoch": 5.067385444743936, + "eval_accuracy": 0.3804139696756309, + "eval_loss": 3.4154088497161865, + "eval_runtime": 183.2512, + "eval_samples_per_second": 98.286, + "eval_steps_per_second": 6.145, "step": 47000 }, { - "epoch": 5.064040469271338, - "grad_norm": 0.7123163938522339, - "learning_rate": 0.0002965801099019502, - "loss": 3.3039, + "epoch": 5.07277628032345, + "grad_norm": 0.6604564785957336, + "learning_rate": 0.00029606260118726386, + "loss": 3.313, "step": 47050 }, { - "epoch": 5.069422021310946, - "grad_norm": 0.6437656879425049, - "learning_rate": 0.0002962568688718888, - "loss": 3.308, + "epoch": 5.078167115902965, + "grad_norm": 0.647045910358429, + "learning_rate": 0.0002957388019427954, + "loss": 3.3159, "step": 47100 }, { - "epoch": 5.074803573350554, - "grad_norm": 0.6438406109809875, - "learning_rate": 0.00029593362784182737, - "loss": 3.3086, + "epoch": 5.083557951482479, + "grad_norm": 0.6736243367195129, + "learning_rate": 0.000295415002698327, + "loss": 3.3109, "step": 47150 }, { - "epoch": 5.080185125390162, - "grad_norm": 0.6933695673942566, - "learning_rate": 0.00029561038681176596, - "loss": 3.3286, + "epoch": 5.0889487870619945, + "grad_norm": 0.6551802754402161, + "learning_rate": 0.00029509120345385856, + "loss": 3.3092, "step": 47200 }, { - "epoch": 5.085566677429771, - "grad_norm": 0.6499897241592407, - "learning_rate": 0.00029528714578170456, - "loss": 3.3282, + "epoch": 5.09433962264151, + "grad_norm": 0.6889127492904663, + "learning_rate": 0.00029476740420939017, + "loss": 3.3115, "step": 47250 }, { - "epoch": 5.090948229469379, - "grad_norm": 0.6867937445640564, - "learning_rate": 0.0002949639047516431, - "loss": 3.3086, + "epoch": 5.099730458221024, + "grad_norm": 0.6516887545585632, + "learning_rate": 0.0002944436049649217, + "loss": 3.3085, "step": 47300 }, { - "epoch": 5.096329781508987, - "grad_norm": 0.7074191570281982, - "learning_rate": 0.0002946406637215817, - "loss": 3.2997, + "epoch": 5.105121293800539, + "grad_norm": 0.6999820470809937, + "learning_rate": 0.0002941198057204533, + "loss": 3.2985, "step": 47350 }, { - "epoch": 5.101711333548596, - "grad_norm": 0.6474207043647766, - "learning_rate": 0.0002943174226915203, - "loss": 3.3096, + "epoch": 5.110512129380054, + "grad_norm": 0.6833156943321228, + "learning_rate": 0.00029379600647598487, + "loss": 3.3138, "step": 47400 }, { - "epoch": 5.107092885588203, - "grad_norm": 0.7220308780670166, - "learning_rate": 0.0002939941816614589, - "loss": 3.3151, + "epoch": 5.115902964959568, + "grad_norm": 0.6783242225646973, + "learning_rate": 0.0002934722072315164, + "loss": 3.3288, "step": 47450 }, { - "epoch": 5.112474437627812, - "grad_norm": 0.6929433941841125, - "learning_rate": 0.0002936709406313974, - "loss": 3.3169, + "epoch": 5.121293800539084, + "grad_norm": 0.6660628914833069, + "learning_rate": 0.000293148407987048, + "loss": 3.3228, "step": 47500 }, { - "epoch": 5.1178559896674205, - "grad_norm": 0.6843468546867371, - "learning_rate": 0.0002933476996013361, - "loss": 3.3147, + "epoch": 5.126684636118599, + "grad_norm": 0.6789077520370483, + "learning_rate": 0.0002928246087425796, + "loss": 3.3296, "step": 47550 }, { - "epoch": 5.123237541707028, - "grad_norm": 0.6644719243049622, - "learning_rate": 0.0002930244585712746, - "loss": 3.3146, + "epoch": 5.132075471698113, + "grad_norm": 0.6862949132919312, + "learning_rate": 0.00029250080949811113, + "loss": 3.3219, "step": 47600 }, { - "epoch": 5.128619093746637, - "grad_norm": 0.7787826657295227, - "learning_rate": 0.0002927012175412132, - "loss": 3.3084, + "epoch": 5.137466307277628, + "grad_norm": 0.6389700770378113, + "learning_rate": 0.00029217701025364273, + "loss": 3.3121, "step": 47650 }, { - "epoch": 5.134000645786244, - "grad_norm": 0.7502739429473877, - "learning_rate": 0.0002923779765111518, - "loss": 3.32, + "epoch": 5.142857142857143, + "grad_norm": 0.6830697655677795, + "learning_rate": 0.0002918532110091743, + "loss": 3.3252, "step": 47700 }, { - "epoch": 5.139382197825853, - "grad_norm": 0.6958211660385132, - "learning_rate": 0.0002920547354810904, - "loss": 3.3128, + "epoch": 5.1482479784366575, + "grad_norm": 0.64373779296875, + "learning_rate": 0.0002915294117647059, + "loss": 3.3187, "step": 47750 }, { - "epoch": 5.1447637498654615, - "grad_norm": 0.7047584652900696, - "learning_rate": 0.00029173149445102894, - "loss": 3.344, + "epoch": 5.153638814016173, + "grad_norm": 0.6759216785430908, + "learning_rate": 0.00029120561252023744, + "loss": 3.308, "step": 47800 }, { - "epoch": 5.150145301905069, - "grad_norm": 0.6888059377670288, - "learning_rate": 0.00029140825342096753, - "loss": 3.3247, + "epoch": 5.159029649595688, + "grad_norm": 0.651127278804779, + "learning_rate": 0.000290881813275769, + "loss": 3.3319, "step": 47850 }, { - "epoch": 5.155526853944678, - "grad_norm": 0.7417131066322327, - "learning_rate": 0.0002910850123909061, - "loss": 3.3113, + "epoch": 5.164420485175202, + "grad_norm": 0.6850906610488892, + "learning_rate": 0.00029055801403130054, + "loss": 3.3238, "step": 47900 }, { - "epoch": 5.160908405984286, - "grad_norm": 0.6905604600906372, - "learning_rate": 0.0002907617713608447, - "loss": 3.3091, + "epoch": 5.169811320754717, + "grad_norm": 0.9405094981193542, + "learning_rate": 0.00029023421478683215, + "loss": 3.3112, "step": 47950 }, { - "epoch": 5.166289958023894, - "grad_norm": 0.6897056698799133, - "learning_rate": 0.0002904385303307833, - "loss": 3.3063, + "epoch": 5.175202156334231, + "grad_norm": 0.6840382218360901, + "learning_rate": 0.0002899104155423637, + "loss": 3.3174, "step": 48000 }, { - "epoch": 5.166289958023894, - "eval_accuracy": 0.3808204401289449, - "eval_loss": 3.41259765625, - "eval_runtime": 184.1998, - "eval_samples_per_second": 97.78, - "eval_steps_per_second": 6.113, + "epoch": 5.175202156334231, + "eval_accuracy": 0.38094006695842864, + "eval_loss": 3.411670446395874, + "eval_runtime": 183.7964, + "eval_samples_per_second": 97.994, + "eval_steps_per_second": 6.126, "step": 48000 }, { - "epoch": 5.1716715100635025, - "grad_norm": 0.7102882862091064, - "learning_rate": 0.00029011528930072186, - "loss": 3.3235, + "epoch": 5.180592991913747, + "grad_norm": 0.6287639737129211, + "learning_rate": 0.0002895866162978953, + "loss": 3.3208, "step": 48050 }, { - "epoch": 5.17705306210311, - "grad_norm": 0.6777679920196533, - "learning_rate": 0.00028979204827066045, - "loss": 3.325, + "epoch": 5.185983827493262, + "grad_norm": 0.6605691909790039, + "learning_rate": 0.00028926281705342685, + "loss": 3.3316, "step": 48100 }, { - "epoch": 5.182434614142719, - "grad_norm": 0.7214860916137695, - "learning_rate": 0.00028946880724059905, - "loss": 3.3411, + "epoch": 5.191374663072776, + "grad_norm": 0.6740576028823853, + "learning_rate": 0.0002889390178089584, + "loss": 3.3262, "step": 48150 }, { - "epoch": 5.187816166182327, - "grad_norm": 0.6319332718849182, - "learning_rate": 0.00028914556621053764, - "loss": 3.3318, + "epoch": 5.196765498652291, + "grad_norm": 0.6887629628181458, + "learning_rate": 0.00028861521856449, + "loss": 3.3363, "step": 48200 }, { - "epoch": 5.193197718221935, - "grad_norm": 0.707127034664154, - "learning_rate": 0.00028882232518047624, - "loss": 3.3286, + "epoch": 5.202156334231806, + "grad_norm": 0.6479724645614624, + "learning_rate": 0.00028829141932002156, + "loss": 3.3306, "step": 48250 }, { - "epoch": 5.198579270261543, - "grad_norm": 0.7037991285324097, - "learning_rate": 0.00028849908415041483, - "loss": 3.3115, + "epoch": 5.2075471698113205, + "grad_norm": 0.6248446106910706, + "learning_rate": 0.0002879676200755531, + "loss": 3.3056, "step": 48300 }, { - "epoch": 5.203960822301152, - "grad_norm": 0.6779250502586365, - "learning_rate": 0.00028817584312035337, - "loss": 3.3145, + "epoch": 5.212938005390836, + "grad_norm": 0.6646822690963745, + "learning_rate": 0.0002876438208310847, + "loss": 3.3394, "step": 48350 }, { - "epoch": 5.20934237434076, - "grad_norm": 0.6654755473136902, - "learning_rate": 0.00028785260209029197, - "loss": 3.3269, + "epoch": 5.218328840970351, + "grad_norm": 0.6973866820335388, + "learning_rate": 0.00028732002158661626, + "loss": 3.3275, "step": 48400 }, { - "epoch": 5.214723926380368, - "grad_norm": 0.713239848613739, - "learning_rate": 0.00028752936106023056, - "loss": 3.3167, + "epoch": 5.223719676549865, + "grad_norm": 0.6720141768455505, + "learning_rate": 0.00028699622234214787, + "loss": 3.3337, "step": 48450 }, { - "epoch": 5.220105478419977, - "grad_norm": 0.6545668244361877, - "learning_rate": 0.00028720612003016915, - "loss": 3.3313, + "epoch": 5.22911051212938, + "grad_norm": 0.686879575252533, + "learning_rate": 0.0002866724230976794, + "loss": 3.3398, "step": 48500 }, { - "epoch": 5.225487030459584, - "grad_norm": 0.6693907380104065, - "learning_rate": 0.00028688287900010775, - "loss": 3.3347, + "epoch": 5.234501347708895, + "grad_norm": 0.7400215268135071, + "learning_rate": 0.00028634862385321097, + "loss": 3.3177, "step": 48550 }, { - "epoch": 5.230868582499193, - "grad_norm": 0.7082215547561646, - "learning_rate": 0.0002865596379700463, - "loss": 3.3182, + "epoch": 5.2398921832884096, + "grad_norm": 0.6599448323249817, + "learning_rate": 0.0002860248246087426, + "loss": 3.3204, "step": 48600 }, { - "epoch": 5.236250134538801, - "grad_norm": 0.6843087673187256, - "learning_rate": 0.0002862363969399849, - "loss": 3.3212, + "epoch": 5.245283018867925, + "grad_norm": 0.6918583512306213, + "learning_rate": 0.0002857010253642741, + "loss": 3.3422, "step": 48650 }, { - "epoch": 5.241631686578409, - "grad_norm": 0.6862715482711792, - "learning_rate": 0.0002859131559099235, - "loss": 3.3275, + "epoch": 5.250673854447439, + "grad_norm": 0.693386971950531, + "learning_rate": 0.0002853772261198057, + "loss": 3.3288, "step": 48700 }, { - "epoch": 5.247013238618018, - "grad_norm": 0.6855129599571228, - "learning_rate": 0.0002855963797004633, - "loss": 3.33, + "epoch": 5.256064690026954, + "grad_norm": 0.7383105158805847, + "learning_rate": 0.0002850534268753373, + "loss": 3.3309, "step": 48750 }, { - "epoch": 5.252394790657625, - "grad_norm": 0.6837905049324036, - "learning_rate": 0.0002852731386704019, - "loss": 3.3324, + "epoch": 5.261455525606469, + "grad_norm": 0.6746464371681213, + "learning_rate": 0.0002847361036157582, + "loss": 3.3401, "step": 48800 }, { - "epoch": 5.257776342697234, - "grad_norm": 0.6358972787857056, - "learning_rate": 0.0002849498976403405, - "loss": 3.3167, + "epoch": 5.2668463611859835, + "grad_norm": 0.6412659287452698, + "learning_rate": 0.0002844123043712898, + "loss": 3.3324, "step": 48850 }, { - "epoch": 5.2631578947368425, - "grad_norm": 0.6836221218109131, - "learning_rate": 0.000284626656610279, - "loss": 3.3239, + "epoch": 5.272237196765499, + "grad_norm": 0.7231650948524475, + "learning_rate": 0.00028408850512682133, + "loss": 3.3225, "step": 48900 }, { - "epoch": 5.26853944677645, - "grad_norm": 0.7453120946884155, - "learning_rate": 0.0002843034155802176, - "loss": 3.3433, + "epoch": 5.277628032345014, + "grad_norm": 0.6406319737434387, + "learning_rate": 0.00028376470588235294, + "loss": 3.3355, "step": 48950 }, { - "epoch": 5.273920998816059, - "grad_norm": 0.6284498572349548, - "learning_rate": 0.0002839801745501562, - "loss": 3.3432, + "epoch": 5.283018867924528, + "grad_norm": 0.6728077530860901, + "learning_rate": 0.0002834409066378845, + "loss": 3.3123, "step": 49000 }, { - "epoch": 5.273920998816059, - "eval_accuracy": 0.3816899891901242, - "eval_loss": 3.4081289768218994, - "eval_runtime": 183.9045, - "eval_samples_per_second": 97.937, - "eval_steps_per_second": 6.123, + "epoch": 5.283018867924528, + "eval_accuracy": 0.38115900252737484, + "eval_loss": 3.409712314605713, + "eval_runtime": 183.6676, + "eval_samples_per_second": 98.063, + "eval_steps_per_second": 6.131, "step": 49000 }, { - "epoch": 5.279302550855666, - "grad_norm": 0.6980060338973999, - "learning_rate": 0.000283663398340696, - "loss": 3.3392, + "epoch": 5.288409703504043, + "grad_norm": 0.7094541788101196, + "learning_rate": 0.00028311710739341604, + "loss": 3.3339, "step": 49050 }, { - "epoch": 5.284684102895275, - "grad_norm": 0.6563853621482849, - "learning_rate": 0.0002833401573106346, - "loss": 3.3385, + "epoch": 5.293800539083558, + "grad_norm": 0.6673089265823364, + "learning_rate": 0.00028279330814894764, + "loss": 3.3366, "step": 49100 }, { - "epoch": 5.2900656549348835, - "grad_norm": 0.7191060781478882, - "learning_rate": 0.0002830169162805732, - "loss": 3.3377, + "epoch": 5.2991913746630726, + "grad_norm": 0.7256841659545898, + "learning_rate": 0.0002824695089044792, + "loss": 3.3322, "step": 49150 }, { - "epoch": 5.295447206974491, - "grad_norm": 0.6789143085479736, - "learning_rate": 0.0002826936752505118, - "loss": 3.3321, + "epoch": 5.304582210242588, + "grad_norm": 0.7026752233505249, + "learning_rate": 0.00028214570966001075, + "loss": 3.3275, "step": 49200 }, { - "epoch": 5.3008287590141, - "grad_norm": 0.6590706706047058, - "learning_rate": 0.00028237043422045034, - "loss": 3.343, + "epoch": 5.309973045822103, + "grad_norm": 0.6675985455513, + "learning_rate": 0.00028182191041554235, + "loss": 3.309, "step": 49250 }, { - "epoch": 5.306210311053708, - "grad_norm": 0.6632896661758423, - "learning_rate": 0.00028204719319038893, - "loss": 3.314, + "epoch": 5.315363881401617, + "grad_norm": 0.6676135063171387, + "learning_rate": 0.0002814981111710739, + "loss": 3.3123, "step": 49300 }, { - "epoch": 5.311591863093316, - "grad_norm": 0.7267322540283203, - "learning_rate": 0.0002817239521603275, - "loss": 3.3426, + "epoch": 5.320754716981132, + "grad_norm": 0.6714651584625244, + "learning_rate": 0.0002811743119266055, + "loss": 3.3421, "step": 49350 }, { - "epoch": 5.316973415132924, - "grad_norm": 0.6905198693275452, - "learning_rate": 0.0002814007111302661, - "loss": 3.3185, + "epoch": 5.3261455525606465, + "grad_norm": 0.6427444815635681, + "learning_rate": 0.00028085051268213706, + "loss": 3.3131, "step": 49400 }, { - "epoch": 5.322354967172533, - "grad_norm": 0.6762885451316833, - "learning_rate": 0.0002810774701002047, - "loss": 3.3412, + "epoch": 5.331536388140162, + "grad_norm": 0.6853386163711548, + "learning_rate": 0.0002805267134376686, + "loss": 3.3307, "step": 49450 }, { - "epoch": 5.327736519212141, - "grad_norm": 0.7283837795257568, - "learning_rate": 0.00028075422907014325, - "loss": 3.3324, + "epoch": 5.336927223719677, + "grad_norm": 0.7251453995704651, + "learning_rate": 0.00028020291419320016, + "loss": 3.3406, "step": 49500 }, { - "epoch": 5.333118071251749, - "grad_norm": 0.7100688219070435, - "learning_rate": 0.00028043098804008185, - "loss": 3.3433, + "epoch": 5.342318059299191, + "grad_norm": 0.6374843120574951, + "learning_rate": 0.00027987911494873176, + "loss": 3.3277, "step": 49550 }, { - "epoch": 5.338499623291357, - "grad_norm": 0.6812211275100708, - "learning_rate": 0.00028010774701002044, - "loss": 3.3305, + "epoch": 5.347708894878706, + "grad_norm": 0.6895403861999512, + "learning_rate": 0.0002795553157042633, + "loss": 3.3241, "step": 49600 }, { - "epoch": 5.343881175330965, - "grad_norm": 0.6936942934989929, - "learning_rate": 0.00027978450597995904, - "loss": 3.3567, + "epoch": 5.353099730458221, + "grad_norm": 0.6677640676498413, + "learning_rate": 0.0002792315164597949, + "loss": 3.3268, "step": 49650 }, { - "epoch": 5.349262727370574, - "grad_norm": 0.6920641660690308, - "learning_rate": 0.0002794612649498976, - "loss": 3.3213, + "epoch": 5.3584905660377355, + "grad_norm": 0.6784765720367432, + "learning_rate": 0.00027890771721532647, + "loss": 3.3029, "step": 49700 }, { - "epoch": 5.354644279410182, - "grad_norm": 0.7323217988014221, - "learning_rate": 0.00027913802391983623, - "loss": 3.3312, + "epoch": 5.363881401617251, + "grad_norm": 0.642457902431488, + "learning_rate": 0.00027858391797085807, + "loss": 3.3357, "step": 49750 }, { - "epoch": 5.36002583144979, - "grad_norm": 0.6813651919364929, - "learning_rate": 0.00027881478288977477, - "loss": 3.3503, + "epoch": 5.369272237196766, + "grad_norm": 0.6742371320724487, + "learning_rate": 0.0002782601187263896, + "loss": 3.3265, "step": 49800 }, { - "epoch": 5.365407383489399, - "grad_norm": 0.697881281375885, - "learning_rate": 0.00027849154185971336, - "loss": 3.3306, + "epoch": 5.37466307277628, + "grad_norm": 0.6685590147972107, + "learning_rate": 0.0002779363194819212, + "loss": 3.3281, "step": 49850 }, { - "epoch": 5.370788935529006, - "grad_norm": 0.73151034116745, - "learning_rate": 0.00027816830082965196, - "loss": 3.3337, + "epoch": 5.380053908355795, + "grad_norm": 0.692225992679596, + "learning_rate": 0.0002776125202374527, + "loss": 3.3313, "step": 49900 }, { - "epoch": 5.376170487568615, - "grad_norm": 0.6968307495117188, - "learning_rate": 0.00027784505979959055, - "loss": 3.3471, + "epoch": 5.38544474393531, + "grad_norm": 0.686076283454895, + "learning_rate": 0.00027728872099298433, + "loss": 3.3262, "step": 49950 }, { - "epoch": 5.3815520396082235, - "grad_norm": 0.6323778033256531, - "learning_rate": 0.00027752181876952915, - "loss": 3.3325, + "epoch": 5.390835579514825, + "grad_norm": 0.6989368200302124, + "learning_rate": 0.0002769649217485159, + "loss": 3.3394, "step": 50000 }, { - "epoch": 5.3815520396082235, - "eval_accuracy": 0.3820180122584361, - "eval_loss": 3.4034645557403564, - "eval_runtime": 184.1386, - "eval_samples_per_second": 97.812, - "eval_steps_per_second": 6.115, + "epoch": 5.390835579514825, + "eval_accuracy": 0.38204593605060444, + "eval_loss": 3.4039955139160156, + "eval_runtime": 183.4202, + "eval_samples_per_second": 98.195, + "eval_steps_per_second": 6.139, "step": 50000 }, { - "epoch": 5.386933591647831, - "grad_norm": 0.6919659376144409, - "learning_rate": 0.0002771985777394677, - "loss": 3.3212, + "epoch": 5.39622641509434, + "grad_norm": 0.6822784543037415, + "learning_rate": 0.0002766411225040475, + "loss": 3.3313, "step": 50050 }, { - "epoch": 5.39231514368744, - "grad_norm": 0.6490268707275391, - "learning_rate": 0.0002768753367094063, - "loss": 3.3219, + "epoch": 5.401617250673855, + "grad_norm": 0.6851001381874084, + "learning_rate": 0.00027631732325957903, + "loss": 3.3418, "step": 50100 }, { - "epoch": 5.397696695727047, - "grad_norm": 0.7063648700714111, - "learning_rate": 0.0002765520956793449, - "loss": 3.3245, + "epoch": 5.407008086253369, + "grad_norm": 0.6685195565223694, + "learning_rate": 0.0002759935240151106, + "loss": 3.313, "step": 50150 }, { - "epoch": 5.403078247766656, - "grad_norm": 0.7237613201141357, - "learning_rate": 0.00027622885464928347, - "loss": 3.3175, + "epoch": 5.412398921832884, + "grad_norm": 0.7843024730682373, + "learning_rate": 0.0002756697247706422, + "loss": 3.3129, "step": 50200 }, { - "epoch": 5.4084597998062645, - "grad_norm": 0.6712755560874939, - "learning_rate": 0.000275905613619222, - "loss": 3.3331, + "epoch": 5.4177897574123985, + "grad_norm": 0.7111794352531433, + "learning_rate": 0.00027534592552617374, + "loss": 3.3371, "step": 50250 }, { - "epoch": 5.413841351845872, - "grad_norm": 0.7124558091163635, - "learning_rate": 0.00027558237258916066, - "loss": 3.3226, + "epoch": 5.423180592991914, + "grad_norm": 0.6508558988571167, + "learning_rate": 0.00027502212628170535, + "loss": 3.3195, "step": 50300 }, { - "epoch": 5.419222903885481, - "grad_norm": 0.7070554494857788, - "learning_rate": 0.0002752591315590992, - "loss": 3.3364, + "epoch": 5.428571428571429, + "grad_norm": 0.658149242401123, + "learning_rate": 0.0002746983270372369, + "loss": 3.3327, "step": 50350 }, { - "epoch": 5.424604455925088, - "grad_norm": 0.7024772763252258, - "learning_rate": 0.0002749358905290378, - "loss": 3.3117, + "epoch": 5.433962264150943, + "grad_norm": 0.6759116649627686, + "learning_rate": 0.00027437452779276845, + "loss": 3.3227, "step": 50400 }, { - "epoch": 5.429986007964697, - "grad_norm": 0.6279031038284302, - "learning_rate": 0.0002746126494989764, - "loss": 3.328, + "epoch": 5.439353099730458, + "grad_norm": 0.6558474898338318, + "learning_rate": 0.00027405072854830005, + "loss": 3.3348, "step": 50450 }, { - "epoch": 5.435367560004305, - "grad_norm": 0.6562391519546509, - "learning_rate": 0.000274289408468915, - "loss": 3.3276, + "epoch": 5.444743935309973, + "grad_norm": 0.6985653042793274, + "learning_rate": 0.0002737269293038316, + "loss": 3.335, "step": 50500 }, { - "epoch": 5.440749112043913, - "grad_norm": 0.7366046905517578, - "learning_rate": 0.0002739661674388535, - "loss": 3.3354, + "epoch": 5.450134770889488, + "grad_norm": 0.6962571144104004, + "learning_rate": 0.00027340313005936315, + "loss": 3.3645, "step": 50550 }, { - "epoch": 5.446130664083522, - "grad_norm": 0.6988213062286377, - "learning_rate": 0.0002736429264087921, - "loss": 3.335, + "epoch": 5.455525606469003, + "grad_norm": 0.6763041019439697, + "learning_rate": 0.00027307933081489476, + "loss": 3.3216, "step": 50600 }, { - "epoch": 5.45151221612313, - "grad_norm": 0.6984215974807739, - "learning_rate": 0.0002733196853787307, - "loss": 3.3239, + "epoch": 5.460916442048518, + "grad_norm": 0.6788709759712219, + "learning_rate": 0.00027276200755531565, + "loss": 3.3401, "step": 50650 }, { - "epoch": 5.456893768162738, - "grad_norm": 0.6875210404396057, - "learning_rate": 0.0002729964443486693, - "loss": 3.3494, + "epoch": 5.466307277628032, + "grad_norm": 0.6613984107971191, + "learning_rate": 0.00027243820831084726, + "loss": 3.3386, "step": 50700 }, { - "epoch": 5.462275320202346, - "grad_norm": 0.7105006575584412, - "learning_rate": 0.0002726732033186079, - "loss": 3.3254, + "epoch": 5.471698113207547, + "grad_norm": 0.7615039348602295, + "learning_rate": 0.0002721144090663788, + "loss": 3.3294, "step": 50750 }, { - "epoch": 5.467656872241955, - "grad_norm": 0.772357702255249, - "learning_rate": 0.00027234996228854644, - "loss": 3.3215, + "epoch": 5.4770889487870615, + "grad_norm": 0.6573368906974792, + "learning_rate": 0.00027179060982191036, + "loss": 3.3422, "step": 50800 }, { - "epoch": 5.473038424281563, - "grad_norm": 0.7547757625579834, - "learning_rate": 0.0002720267212584851, - "loss": 3.3237, + "epoch": 5.482479784366577, + "grad_norm": 0.6838216781616211, + "learning_rate": 0.00027146681057744197, + "loss": 3.3442, "step": 50850 }, { - "epoch": 5.478419976321171, - "grad_norm": 0.6670495271682739, - "learning_rate": 0.00027170348022842363, - "loss": 3.3262, + "epoch": 5.487870619946092, + "grad_norm": 0.6906365156173706, + "learning_rate": 0.0002711430113329735, + "loss": 3.3355, "step": 50900 }, { - "epoch": 5.483801528360779, - "grad_norm": 0.6717911958694458, - "learning_rate": 0.00027138023919836223, - "loss": 3.3375, + "epoch": 5.493261455525606, + "grad_norm": 0.6942813992500305, + "learning_rate": 0.0002708192120885051, + "loss": 3.3432, "step": 50950 }, { - "epoch": 5.489183080400387, - "grad_norm": 0.6847409009933472, - "learning_rate": 0.0002710569981683008, - "loss": 3.3373, + "epoch": 5.498652291105121, + "grad_norm": 0.6663623452186584, + "learning_rate": 0.00027049541284403667, + "loss": 3.3221, "step": 51000 }, { - "epoch": 5.489183080400387, - "eval_accuracy": 0.38237319854870166, - "eval_loss": 3.399523973464966, - "eval_runtime": 184.134, - "eval_samples_per_second": 97.815, - "eval_steps_per_second": 6.115, + "epoch": 5.498652291105121, + "eval_accuracy": 0.38235624869820256, + "eval_loss": 3.3973066806793213, + "eval_runtime": 183.6193, + "eval_samples_per_second": 98.089, + "eval_steps_per_second": 6.132, "step": 51000 }, { - "epoch": 5.494564632439996, - "grad_norm": 0.7280586957931519, - "learning_rate": 0.0002707337571382394, - "loss": 3.3331, + "epoch": 5.504043126684636, + "grad_norm": 0.7017855644226074, + "learning_rate": 0.0002701716135995683, + "loss": 3.341, "step": 51050 }, { - "epoch": 5.499946184479604, - "grad_norm": 0.7104876637458801, - "learning_rate": 0.00027041051610817796, - "loss": 3.346, + "epoch": 5.509433962264151, + "grad_norm": 0.6982112526893616, + "learning_rate": 0.0002698478143550998, + "loss": 3.3139, "step": 51100 }, { - "epoch": 5.505327736519212, - "grad_norm": 0.692229688167572, - "learning_rate": 0.00027008727507811655, - "loss": 3.3201, + "epoch": 5.514824797843666, + "grad_norm": 0.7699517011642456, + "learning_rate": 0.0002695240151106314, + "loss": 3.3275, "step": 51150 }, { - "epoch": 5.510709288558821, - "grad_norm": 0.658868134021759, - "learning_rate": 0.00026976403404805515, - "loss": 3.3207, + "epoch": 5.520215633423181, + "grad_norm": 0.703709065914154, + "learning_rate": 0.00026920021586616293, + "loss": 3.3352, "step": 51200 }, { - "epoch": 5.516090840598428, - "grad_norm": 0.698153555393219, - "learning_rate": 0.00026944079301799374, - "loss": 3.3452, + "epoch": 5.525606469002695, + "grad_norm": 0.6830039620399475, + "learning_rate": 0.00026887641662169453, + "loss": 3.3385, "step": 51250 }, { - "epoch": 5.521472392638037, - "grad_norm": 0.7251303791999817, - "learning_rate": 0.00026911755198793234, - "loss": 3.3477, + "epoch": 5.53099730458221, + "grad_norm": 0.7063279151916504, + "learning_rate": 0.0002685526173772261, + "loss": 3.3502, "step": 51300 }, { - "epoch": 5.5268539446776455, - "grad_norm": 0.6895780563354492, - "learning_rate": 0.0002687943109578709, - "loss": 3.3188, + "epoch": 5.536388140161725, + "grad_norm": 0.675649881362915, + "learning_rate": 0.0002682288181327577, + "loss": 3.3169, "step": 51350 }, { - "epoch": 5.532235496717253, - "grad_norm": 0.7204925417900085, - "learning_rate": 0.00026847106992780947, - "loss": 3.3279, + "epoch": 5.54177897574124, + "grad_norm": 0.8147872686386108, + "learning_rate": 0.00026790501888828924, + "loss": 3.338, "step": 51400 }, { - "epoch": 5.537617048756862, - "grad_norm": 0.7200186252593994, - "learning_rate": 0.00026814782889774807, - "loss": 3.3376, + "epoch": 5.547169811320755, + "grad_norm": 0.7457138895988464, + "learning_rate": 0.00026758121964382084, + "loss": 3.3357, "step": 51450 }, { - "epoch": 5.542998600796469, - "grad_norm": 0.7235830426216125, - "learning_rate": 0.00026782458786768666, - "loss": 3.3367, + "epoch": 5.55256064690027, + "grad_norm": 0.6662501692771912, + "learning_rate": 0.00026725742039935234, + "loss": 3.3231, "step": 51500 }, { - "epoch": 5.548380152836078, - "grad_norm": 0.6808749437332153, - "learning_rate": 0.0002675013468376252, - "loss": 3.3245, + "epoch": 5.557951482479784, + "grad_norm": 0.6656699776649475, + "learning_rate": 0.00026693362115488394, + "loss": 3.3468, "step": 51550 }, { - "epoch": 5.553761704875686, - "grad_norm": 0.7232927680015564, - "learning_rate": 0.00026717810580756385, - "loss": 3.3472, + "epoch": 5.563342318059299, + "grad_norm": 0.6908094882965088, + "learning_rate": 0.0002666098219104155, + "loss": 3.3343, "step": 51600 }, { - "epoch": 5.559143256915294, - "grad_norm": 0.7236559987068176, - "learning_rate": 0.0002668548647775024, - "loss": 3.3412, + "epoch": 5.568733153638814, + "grad_norm": 0.719346284866333, + "learning_rate": 0.0002662860226659471, + "loss": 3.3265, "step": 51650 }, { - "epoch": 5.564524808954903, - "grad_norm": 0.7104244232177734, - "learning_rate": 0.000266531623747441, - "loss": 3.3445, + "epoch": 5.574123989218329, + "grad_norm": 0.657321572303772, + "learning_rate": 0.00026596222342147865, + "loss": 3.3462, "step": 51700 }, { - "epoch": 5.569906360994511, - "grad_norm": 0.7107536792755127, - "learning_rate": 0.0002662083827173796, - "loss": 3.3265, + "epoch": 5.579514824797844, + "grad_norm": 0.6842498183250427, + "learning_rate": 0.00026563842417701026, + "loss": 3.3369, "step": 51750 }, { - "epoch": 5.575287913034119, - "grad_norm": 0.7243977189064026, - "learning_rate": 0.0002658851416873182, - "loss": 3.3212, + "epoch": 5.584905660377358, + "grad_norm": 0.7126151323318481, + "learning_rate": 0.0002653146249325418, + "loss": 3.3398, "step": 51800 }, { - "epoch": 5.580669465073727, - "grad_norm": 0.8111730217933655, - "learning_rate": 0.00026556190065725677, - "loss": 3.3283, + "epoch": 5.590296495956873, + "grad_norm": 0.6378708481788635, + "learning_rate": 0.00026499082568807336, + "loss": 3.3478, "step": 51850 }, { - "epoch": 5.586051017113336, - "grad_norm": 0.7091906070709229, - "learning_rate": 0.0002652386596271953, - "loss": 3.3271, + "epoch": 5.595687331536388, + "grad_norm": 0.7025904655456543, + "learning_rate": 0.00026466702644360496, + "loss": 3.3434, "step": 51900 }, { - "epoch": 5.591432569152944, - "grad_norm": 0.7432578802108765, - "learning_rate": 0.0002649154185971339, - "loss": 3.3401, + "epoch": 5.601078167115903, + "grad_norm": 0.6371275782585144, + "learning_rate": 0.0002643432271991365, + "loss": 3.3528, "step": 51950 }, { - "epoch": 5.596814121192552, - "grad_norm": 0.7412343621253967, - "learning_rate": 0.0002645921775670725, - "loss": 3.3506, + "epoch": 5.606469002695418, + "grad_norm": 0.714966356754303, + "learning_rate": 0.00026401942795466806, + "loss": 3.3428, "step": 52000 }, { - "epoch": 5.596814121192552, - "eval_accuracy": 0.38275261443294983, - "eval_loss": 3.394073486328125, - "eval_runtime": 183.9682, - "eval_samples_per_second": 97.903, - "eval_steps_per_second": 6.121, + "epoch": 5.606469002695418, + "eval_accuracy": 0.3825655141601333, + "eval_loss": 3.3958473205566406, + "eval_runtime": 183.6356, + "eval_samples_per_second": 98.08, + "eval_steps_per_second": 6.132, "step": 52000 }, { - "epoch": 5.60219567323216, - "grad_norm": 0.6942436695098877, - "learning_rate": 0.0002642689365370111, - "loss": 3.3277, + "epoch": 5.611859838274933, + "grad_norm": 0.8294630646705627, + "learning_rate": 0.00026369562871019967, + "loss": 3.3444, "step": 52050 }, { - "epoch": 5.607577225271768, - "grad_norm": 0.7447067499160767, - "learning_rate": 0.00026394569550694963, - "loss": 3.3234, + "epoch": 5.617250673854447, + "grad_norm": 0.6631956100463867, + "learning_rate": 0.0002633718294657312, + "loss": 3.3405, "step": 52100 }, { - "epoch": 5.612958777311377, - "grad_norm": 0.7501024603843689, - "learning_rate": 0.00026362245447688823, - "loss": 3.3183, + "epoch": 5.622641509433962, + "grad_norm": 0.689293622970581, + "learning_rate": 0.0002630480302212628, + "loss": 3.3411, "step": 52150 }, { - "epoch": 5.618340329350985, - "grad_norm": 0.6899356842041016, - "learning_rate": 0.0002632992134468268, - "loss": 3.3369, + "epoch": 5.628032345013477, + "grad_norm": 0.7070659399032593, + "learning_rate": 0.00026272423097679437, + "loss": 3.3278, "step": 52200 }, { - "epoch": 5.623721881390593, - "grad_norm": 0.7304120659828186, - "learning_rate": 0.0002629759724167654, - "loss": 3.329, + "epoch": 5.633423180592992, + "grad_norm": 0.7154769897460938, + "learning_rate": 0.0002624004317323259, + "loss": 3.3291, "step": 52250 }, { - "epoch": 5.629103433430201, - "grad_norm": 0.6875249147415161, - "learning_rate": 0.000262652731386704, - "loss": 3.3431, + "epoch": 5.638814016172507, + "grad_norm": 0.6829545497894287, + "learning_rate": 0.00026207663248785753, + "loss": 3.3489, "step": 52300 }, { - "epoch": 5.634484985469809, - "grad_norm": 0.770589292049408, - "learning_rate": 0.00026232949035664255, - "loss": 3.3448, + "epoch": 5.644204851752022, + "grad_norm": 0.7409862875938416, + "learning_rate": 0.0002617528332433891, + "loss": 3.3226, "step": 52350 }, { - "epoch": 5.639866537509418, - "grad_norm": 0.6867998838424683, - "learning_rate": 0.00026200624932658115, - "loss": 3.3254, + "epoch": 5.649595687331536, + "grad_norm": 0.6577402949333191, + "learning_rate": 0.00026142903399892063, + "loss": 3.3248, "step": 52400 }, { - "epoch": 5.645248089549026, - "grad_norm": 0.701833188533783, - "learning_rate": 0.00026168300829651974, - "loss": 3.3515, + "epoch": 5.654986522911051, + "grad_norm": 0.7112260460853577, + "learning_rate": 0.00026110523475445223, + "loss": 3.3286, "step": 52450 }, { - "epoch": 5.650629641588634, - "grad_norm": 0.6975635290145874, - "learning_rate": 0.00026135976726645834, - "loss": 3.3351, + "epoch": 5.660377358490566, + "grad_norm": 0.6959606409072876, + "learning_rate": 0.0002607814355099838, + "loss": 3.3431, "step": 52500 }, { - "epoch": 5.656011193628243, - "grad_norm": 0.7372641563415527, - "learning_rate": 0.0002610365262363969, - "loss": 3.329, + "epoch": 5.665768194070081, + "grad_norm": 0.7187190055847168, + "learning_rate": 0.00026045763626551534, + "loss": 3.3478, "step": 52550 }, { - "epoch": 5.66139274566785, - "grad_norm": 0.7018235921859741, - "learning_rate": 0.00026071328520633553, - "loss": 3.3242, + "epoch": 5.671159029649596, + "grad_norm": 0.7454319000244141, + "learning_rate": 0.00026013383702104694, + "loss": 3.3464, "step": 52600 }, { - "epoch": 5.666774297707459, - "grad_norm": 0.7098267674446106, - "learning_rate": 0.00026039004417627407, - "loss": 3.3422, + "epoch": 5.67654986522911, + "grad_norm": 0.6750582456588745, + "learning_rate": 0.0002598100377765785, + "loss": 3.3393, "step": 52650 }, { - "epoch": 5.672155849747067, - "grad_norm": 0.7544245719909668, - "learning_rate": 0.00026006680314621266, - "loss": 3.349, + "epoch": 5.681940700808625, + "grad_norm": 0.7068287134170532, + "learning_rate": 0.0002594862385321101, + "loss": 3.3378, "step": 52700 }, { - "epoch": 5.677537401786675, - "grad_norm": 0.7348368167877197, - "learning_rate": 0.00025974356211615126, - "loss": 3.33, + "epoch": 5.6873315363881405, + "grad_norm": 0.687258780002594, + "learning_rate": 0.00025916243928764165, + "loss": 3.335, "step": 52750 }, { - "epoch": 5.682918953826284, - "grad_norm": 0.6879133582115173, - "learning_rate": 0.00025942032108608985, - "loss": 3.3439, + "epoch": 5.692722371967655, + "grad_norm": 0.6578938364982605, + "learning_rate": 0.0002588386400431732, + "loss": 3.3315, "step": 52800 }, { - "epoch": 5.688300505865891, - "grad_norm": 0.7134944796562195, - "learning_rate": 0.00025909708005602845, - "loss": 3.3526, + "epoch": 5.69811320754717, + "grad_norm": 0.6887868642807007, + "learning_rate": 0.00025851484079870475, + "loss": 3.333, "step": 52850 }, { - "epoch": 5.6936820579055, - "grad_norm": 0.7157636880874634, - "learning_rate": 0.000258773839025967, - "loss": 3.3244, + "epoch": 5.703504043126685, + "grad_norm": 0.6832111477851868, + "learning_rate": 0.00025819104155423635, + "loss": 3.328, "step": 52900 }, { - "epoch": 5.699063609945108, - "grad_norm": 0.6918929815292358, - "learning_rate": 0.0002584505979959056, - "loss": 3.3486, + "epoch": 5.708894878706199, + "grad_norm": 0.6999159455299377, + "learning_rate": 0.0002578672423097679, + "loss": 3.3221, "step": 52950 }, { - "epoch": 5.704445161984716, - "grad_norm": 0.7037851214408875, - "learning_rate": 0.0002581273569658442, - "loss": 3.323, + "epoch": 5.714285714285714, + "grad_norm": 0.6764535307884216, + "learning_rate": 0.0002575434430652995, + "loss": 3.35, "step": 53000 }, { - "epoch": 5.704445161984716, - "eval_accuracy": 0.3833323862423279, - "eval_loss": 3.3892478942871094, - "eval_runtime": 184.0857, - "eval_samples_per_second": 97.84, - "eval_steps_per_second": 6.117, + "epoch": 5.714285714285714, + "eval_accuracy": 0.3829353685902538, + "eval_loss": 3.392476797103882, + "eval_runtime": 183.5207, + "eval_samples_per_second": 98.142, + "eval_steps_per_second": 6.136, "step": 53000 }, { - "epoch": 5.709826714024325, - "grad_norm": 0.730811595916748, - "learning_rate": 0.00025780411593578277, - "loss": 3.3242, + "epoch": 5.719676549865229, + "grad_norm": 0.7244840860366821, + "learning_rate": 0.00025721964382083106, + "loss": 3.3289, "step": 53050 }, { - "epoch": 5.715208266063933, - "grad_norm": 0.6601924300193787, - "learning_rate": 0.0002574873397263226, - "loss": 3.3345, + "epoch": 5.725067385444744, + "grad_norm": 0.6984117031097412, + "learning_rate": 0.00025689584457636266, + "loss": 3.3212, "step": 53100 }, { - "epoch": 5.720589818103541, - "grad_norm": 0.7023354768753052, - "learning_rate": 0.00025716409869626117, - "loss": 3.3334, + "epoch": 5.730458221024259, + "grad_norm": 0.6968520879745483, + "learning_rate": 0.0002565720453318942, + "loss": 3.3479, "step": 53150 }, { - "epoch": 5.725971370143149, - "grad_norm": 0.7079684734344482, - "learning_rate": 0.0002568408576661997, - "loss": 3.3024, + "epoch": 5.735849056603773, + "grad_norm": 0.6894665956497192, + "learning_rate": 0.00025624824608742576, + "loss": 3.3145, "step": 53200 }, { - "epoch": 5.731352922182758, - "grad_norm": 0.7269954681396484, - "learning_rate": 0.00025651761663613836, - "loss": 3.3144, + "epoch": 5.741239892183288, + "grad_norm": 0.7057806849479675, + "learning_rate": 0.0002559244468429573, + "loss": 3.3325, "step": 53250 }, { - "epoch": 5.736734474222366, - "grad_norm": 0.6841776967048645, - "learning_rate": 0.0002561943756060769, - "loss": 3.327, + "epoch": 5.7466307277628035, + "grad_norm": 0.6735665798187256, + "learning_rate": 0.0002556006475984889, + "loss": 3.3367, "step": 53300 }, { - "epoch": 5.742116026261974, - "grad_norm": Infinity, - "learning_rate": 0.0002558775993966167, - "loss": 3.359, + "epoch": 5.752021563342318, + "grad_norm": 0.6803260445594788, + "learning_rate": 0.00025527684835402047, + "loss": 3.3264, "step": 53350 }, { - "epoch": 5.747497578301582, - "grad_norm": 0.6885505318641663, - "learning_rate": 0.0002555543583665553, - "loss": 3.3354, + "epoch": 5.757412398921833, + "grad_norm": 0.6724193692207336, + "learning_rate": 0.0002549530491095521, + "loss": 3.3538, "step": 53400 }, { - "epoch": 5.75287913034119, - "grad_norm": 0.6598957180976868, - "learning_rate": 0.0002552311173364939, - "loss": 3.3288, + "epoch": 5.762803234501348, + "grad_norm": 0.6589038372039795, + "learning_rate": 0.0002546292498650836, + "loss": 3.3275, "step": 53450 }, { - "epoch": 5.758260682380799, - "grad_norm": 0.7532528638839722, - "learning_rate": 0.0002549078763064325, - "loss": 3.324, + "epoch": 5.768194070080862, + "grad_norm": 0.6979022026062012, + "learning_rate": 0.00025430545062061523, + "loss": 3.3243, "step": 53500 }, { - "epoch": 5.763642234420407, - "grad_norm": 0.7090945839881897, - "learning_rate": 0.00025458463527637103, - "loss": 3.3497, + "epoch": 5.773584905660377, + "grad_norm": 0.7001596093177795, + "learning_rate": 0.0002539816513761468, + "loss": 3.3527, "step": 53550 }, { - "epoch": 5.769023786460015, - "grad_norm": 0.7386473417282104, - "learning_rate": 0.0002542613942463097, - "loss": 3.3253, + "epoch": 5.7789757412398925, + "grad_norm": 0.6990877389907837, + "learning_rate": 0.00025365785213167833, + "loss": 3.3337, "step": 53600 }, { - "epoch": 5.774405338499624, - "grad_norm": 0.6926794052124023, - "learning_rate": 0.0002539381532162482, - "loss": 3.3503, + "epoch": 5.784366576819407, + "grad_norm": 0.716033935546875, + "learning_rate": 0.0002533340528872099, + "loss": 3.3509, "step": 53650 }, { - "epoch": 5.779786890539231, - "grad_norm": 0.6957927942276001, - "learning_rate": 0.0002536149121861868, - "loss": 3.3296, + "epoch": 5.789757412398922, + "grad_norm": 0.7423722147941589, + "learning_rate": 0.0002530102536427415, + "loss": 3.3522, "step": 53700 }, { - "epoch": 5.78516844257884, - "grad_norm": 0.6737018823623657, - "learning_rate": 0.0002532916711561254, - "loss": 3.3303, + "epoch": 5.795148247978437, + "grad_norm": 0.681389331817627, + "learning_rate": 0.00025268645439827304, + "loss": 3.3322, "step": 53750 }, { - "epoch": 5.790549994618448, - "grad_norm": 0.6735786199569702, - "learning_rate": 0.000252968430126064, - "loss": 3.3267, + "epoch": 5.800539083557951, + "grad_norm": 0.728777289390564, + "learning_rate": 0.00025236265515380464, + "loss": 3.3393, "step": 53800 }, { - "epoch": 5.795931546658056, - "grad_norm": 0.7254409790039062, - "learning_rate": 0.00025264518909600255, - "loss": 3.317, + "epoch": 5.8059299191374665, + "grad_norm": 0.6941030621528625, + "learning_rate": 0.0002520388559093362, + "loss": 3.3219, "step": 53850 }, { - "epoch": 5.801313098697665, - "grad_norm": 0.7492697834968567, - "learning_rate": 0.00025232194806594114, - "loss": 3.3346, + "epoch": 5.811320754716981, + "grad_norm": 0.6763263940811157, + "learning_rate": 0.00025171505666486774, + "loss": 3.3551, "step": 53900 }, { - "epoch": 5.806694650737272, - "grad_norm": 0.7044953107833862, - "learning_rate": 0.00025199870703587974, - "loss": 3.3272, + "epoch": 5.816711590296496, + "grad_norm": 0.6964560151100159, + "learning_rate": 0.00025139125742039935, + "loss": 3.3294, "step": 53950 }, { - "epoch": 5.812076202776881, - "grad_norm": 0.7152127623558044, - "learning_rate": 0.00025167546600581833, - "loss": 3.3388, + "epoch": 5.822102425876011, + "grad_norm": 0.6470865607261658, + "learning_rate": 0.0002510674581759309, + "loss": 3.3319, "step": 54000 }, { - "epoch": 5.812076202776881, - "eval_accuracy": 0.3838037224696671, - "eval_loss": 3.383706569671631, - "eval_runtime": 184.3385, - "eval_samples_per_second": 97.706, - "eval_steps_per_second": 6.108, + "epoch": 5.822102425876011, + "eval_accuracy": 0.38318559619089054, + "eval_loss": 3.387439727783203, + "eval_runtime": 185.3555, + "eval_samples_per_second": 97.17, + "eval_steps_per_second": 6.075, "step": 54000 }, { - "epoch": 5.817457754816489, - "grad_norm": 0.6760851740837097, - "learning_rate": 0.0002513522249757569, - "loss": 3.3252, + "epoch": 5.827493261455525, + "grad_norm": 0.7239425182342529, + "learning_rate": 0.0002507436589314625, + "loss": 3.3272, "step": 54050 }, { - "epoch": 5.822839306856097, - "grad_norm": 0.7422081232070923, - "learning_rate": 0.00025102898394569547, - "loss": 3.3384, + "epoch": 5.83288409703504, + "grad_norm": 0.6678457856178284, + "learning_rate": 0.00025041985968699405, + "loss": 3.3339, "step": 54100 }, { - "epoch": 5.828220858895706, - "grad_norm": 0.7860513925552368, - "learning_rate": 0.00025070574291563406, - "loss": 3.3348, + "epoch": 5.8382749326145555, + "grad_norm": 0.6725283265113831, + "learning_rate": 0.0002500960604425256, + "loss": 3.3496, "step": 54150 }, { - "epoch": 5.833602410935313, - "grad_norm": 0.705047070980072, - "learning_rate": 0.00025038250188557265, - "loss": 3.3267, + "epoch": 5.84366576819407, + "grad_norm": 0.7176863551139832, + "learning_rate": 0.00024977226119805715, + "loss": 3.3397, "step": 54200 }, { - "epoch": 5.838983962974922, - "grad_norm": 0.7171577215194702, - "learning_rate": 0.00025005926085551125, - "loss": 3.3154, + "epoch": 5.849056603773585, + "grad_norm": 0.6935498714447021, + "learning_rate": 0.00024944846195358876, + "loss": 3.3478, "step": 54250 }, { - "epoch": 5.84436551501453, - "grad_norm": 0.7100509405136108, - "learning_rate": 0.0002497360198254498, - "loss": 3.3315, + "epoch": 5.8544474393531, + "grad_norm": 0.7223532795906067, + "learning_rate": 0.0002491246627091203, + "loss": 3.3461, "step": 54300 }, { - "epoch": 5.849747067054138, - "grad_norm": 0.7151844501495361, - "learning_rate": 0.00024941277879538844, - "loss": 3.336, + "epoch": 5.859838274932614, + "grad_norm": 0.673150897026062, + "learning_rate": 0.0002488008634646519, + "loss": 3.3365, "step": 54350 }, { - "epoch": 5.855128619093747, - "grad_norm": 0.7535050511360168, - "learning_rate": 0.000249089537765327, - "loss": 3.3343, + "epoch": 5.8652291105121295, + "grad_norm": 0.6913243532180786, + "learning_rate": 0.00024847706422018346, + "loss": 3.3225, "step": 54400 }, { - "epoch": 5.860510171133355, - "grad_norm": 0.6949968934059143, - "learning_rate": 0.0002487662967352656, - "loss": 3.3246, + "epoch": 5.870619946091644, + "grad_norm": 0.6828725934028625, + "learning_rate": 0.00024815326497571507, + "loss": 3.318, "step": 54450 }, { - "epoch": 5.865891723172963, - "grad_norm": 0.7528766989707947, - "learning_rate": 0.00024844305570520417, - "loss": 3.3319, + "epoch": 5.876010781671159, + "grad_norm": 0.7479648590087891, + "learning_rate": 0.00024782946573124657, + "loss": 3.332, "step": 54500 }, { - "epoch": 5.871273275212571, - "grad_norm": 0.6865386962890625, - "learning_rate": 0.00024811981467514276, - "loss": 3.3407, + "epoch": 5.881401617250674, + "grad_norm": 0.7425312399864197, + "learning_rate": 0.00024750566648677817, + "loss": 3.3284, "step": 54550 }, { - "epoch": 5.87665482725218, - "grad_norm": 0.721444308757782, - "learning_rate": 0.00024779657364508136, - "loss": 3.3203, + "epoch": 5.886792452830189, + "grad_norm": 0.6899173855781555, + "learning_rate": 0.0002471818672423097, + "loss": 3.3596, "step": 54600 }, { - "epoch": 5.882036379291788, - "grad_norm": 0.7100372910499573, - "learning_rate": 0.0002474733326150199, - "loss": 3.3361, + "epoch": 5.892183288409703, + "grad_norm": 0.7049915194511414, + "learning_rate": 0.0002468645439827307, + "loss": 3.3453, "step": 54650 }, { - "epoch": 5.887417931331396, - "grad_norm": 0.6949239373207092, - "learning_rate": 0.0002471500915849585, - "loss": 3.3499, + "epoch": 5.8975741239892185, + "grad_norm": 0.7107455730438232, + "learning_rate": 0.0002465407447382623, + "loss": 3.3435, "step": 54700 }, { - "epoch": 5.892799483371004, - "grad_norm": 0.7012357115745544, - "learning_rate": 0.0002468268505548971, - "loss": 3.3406, + "epoch": 5.902964959568733, + "grad_norm": 0.7119571566581726, + "learning_rate": 0.00024621694549379383, + "loss": 3.3442, "step": 54750 }, { - "epoch": 5.898181035410612, - "grad_norm": 0.6897000074386597, - "learning_rate": 0.0002465036095248357, - "loss": 3.3447, + "epoch": 5.908355795148248, + "grad_norm": 0.6925951838493347, + "learning_rate": 0.00024589314624932543, + "loss": 3.3476, "step": 54800 }, { - "epoch": 5.903562587450221, - "grad_norm": 0.726473867893219, - "learning_rate": 0.0002461803684947742, - "loss": 3.3388, + "epoch": 5.913746630727763, + "grad_norm": 0.6913626790046692, + "learning_rate": 0.00024556934700485693, + "loss": 3.3439, "step": 54850 }, { - "epoch": 5.9089441394898286, - "grad_norm": 0.7095021605491638, - "learning_rate": 0.0002458571274647128, - "loss": 3.3416, + "epoch": 5.919137466307277, + "grad_norm": 0.7057418823242188, + "learning_rate": 0.00024524554776038853, + "loss": 3.3232, "step": 54900 }, { - "epoch": 5.914325691529437, - "grad_norm": 0.7249286770820618, - "learning_rate": 0.0002455338864346514, - "loss": 3.3427, + "epoch": 5.9245283018867925, + "grad_norm": 0.7266544699668884, + "learning_rate": 0.0002449217485159201, + "loss": 3.3287, "step": 54950 }, { - "epoch": 5.919707243569046, - "grad_norm": 0.7552708983421326, - "learning_rate": 0.00024521064540459, - "loss": 3.3396, + "epoch": 5.929919137466308, + "grad_norm": 0.661906898021698, + "learning_rate": 0.0002445979492714517, + "loss": 3.3318, "step": 55000 }, { - "epoch": 5.919707243569046, - "eval_accuracy": 0.38397126522267705, - "eval_loss": 3.3809568881988525, - "eval_runtime": 184.0462, - "eval_samples_per_second": 97.861, - "eval_steps_per_second": 6.118, + "epoch": 5.929919137466308, + "eval_accuracy": 0.3839206329769555, + "eval_loss": 3.3820478916168213, + "eval_runtime": 185.4571, + "eval_samples_per_second": 97.117, + "eval_steps_per_second": 6.071, "step": 55000 }, { - "epoch": 5.925088795608653, - "grad_norm": 0.7218322157859802, - "learning_rate": 0.0002448874043745286, - "loss": 3.3392, + "epoch": 5.935309973045822, + "grad_norm": 0.6756860017776489, + "learning_rate": 0.00024427415002698324, + "loss": 3.3462, "step": 55050 }, { - "epoch": 5.930470347648262, - "grad_norm": 0.7611395120620728, - "learning_rate": 0.00024456416334446714, - "loss": 3.3307, + "epoch": 5.940700808625337, + "grad_norm": 0.7302395105361938, + "learning_rate": 0.00024395035078251482, + "loss": 3.3318, "step": 55100 }, { - "epoch": 5.93585189968787, - "grad_norm": 0.7061910033226013, - "learning_rate": 0.00024424092231440574, - "loss": 3.3327, + "epoch": 5.946091644204852, + "grad_norm": 0.7301084399223328, + "learning_rate": 0.0002436265515380464, + "loss": 3.3298, "step": 55150 }, { - "epoch": 5.941233451727478, - "grad_norm": 0.6887156367301941, - "learning_rate": 0.00024391768128434436, - "loss": 3.35, + "epoch": 5.951482479784366, + "grad_norm": 0.7300922870635986, + "learning_rate": 0.00024330275229357797, + "loss": 3.3517, "step": 55200 }, { - "epoch": 5.946615003767087, - "grad_norm": 0.7248396277427673, - "learning_rate": 0.00024359444025428293, - "loss": 3.3488, + "epoch": 5.9568733153638815, + "grad_norm": 0.6984791159629822, + "learning_rate": 0.00024297895304910952, + "loss": 3.3235, "step": 55250 }, { - "epoch": 5.951996555806694, - "grad_norm": 0.7237182855606079, - "learning_rate": 0.0002432711992242215, - "loss": 3.3265, + "epoch": 5.962264150943396, + "grad_norm": 0.6925639510154724, + "learning_rate": 0.0002426551538046411, + "loss": 3.3236, "step": 55300 }, { - "epoch": 5.957378107846303, - "grad_norm": 0.7203229069709778, - "learning_rate": 0.0002429479581941601, - "loss": 3.3224, + "epoch": 5.967654986522911, + "grad_norm": 0.7360100746154785, + "learning_rate": 0.00024233135456017265, + "loss": 3.3322, "step": 55350 }, { - "epoch": 5.962759659885911, - "grad_norm": 0.7003002762794495, - "learning_rate": 0.00024262471716409868, - "loss": 3.3244, + "epoch": 5.973045822102426, + "grad_norm": 0.782108724117279, + "learning_rate": 0.00024200755531570423, + "loss": 3.337, "step": 55400 }, { - "epoch": 5.968141211925519, - "grad_norm": 0.725002110004425, - "learning_rate": 0.00024230147613403728, - "loss": 3.3313, + "epoch": 5.97843665768194, + "grad_norm": 0.6809326410293579, + "learning_rate": 0.0002416837560712358, + "loss": 3.3307, "step": 55450 }, { - "epoch": 5.973522763965128, - "grad_norm": 0.7069681286811829, - "learning_rate": 0.00024197823510397584, - "loss": 3.3428, + "epoch": 5.9838274932614555, + "grad_norm": 0.725308895111084, + "learning_rate": 0.00024135995682676739, + "loss": 3.3075, "step": 55500 }, { - "epoch": 5.978904316004736, - "grad_norm": 0.7072627544403076, - "learning_rate": 0.0002416549940739144, + "epoch": 5.989218328840971, + "grad_norm": 0.6809728741645813, + "learning_rate": 0.00024103615758229896, "loss": 3.3378, "step": 55550 }, { - "epoch": 5.984285868044344, - "grad_norm": 0.7657013535499573, - "learning_rate": 0.00024133175304385303, - "loss": 3.3343, + "epoch": 5.994609164420485, + "grad_norm": 0.7342197299003601, + "learning_rate": 0.00024071235833783054, + "loss": 3.3266, "step": 55600 }, { - "epoch": 5.989667420083952, - "grad_norm": 0.7375352382659912, - "learning_rate": 0.0002410085120137916, - "loss": 3.3289, + "epoch": 6.0, + "grad_norm": 1.4940729141235352, + "learning_rate": 0.00024038855909336212, + "loss": 3.3575, "step": 55650 }, { - "epoch": 5.995048972123561, - "grad_norm": 0.7755512595176697, - "learning_rate": 0.00024068527098373017, - "loss": 3.3562, + "epoch": 6.005390835579515, + "grad_norm": 0.6556679606437683, + "learning_rate": 0.00024006475984889364, + "loss": 3.2374, "step": 55700 }, { - "epoch": 6.000430524163169, - "grad_norm": 0.7189819812774658, - "learning_rate": 0.0002403620299536688, - "loss": 3.3257, + "epoch": 6.010781671159029, + "grad_norm": 0.7130705118179321, + "learning_rate": 0.00023974096060442522, + "loss": 3.2386, "step": 55750 }, { - "epoch": 6.005812076202777, - "grad_norm": 0.713076114654541, - "learning_rate": 0.00024003878892360736, - "loss": 3.2465, + "epoch": 6.0161725067385445, + "grad_norm": 0.7809520363807678, + "learning_rate": 0.0002394171613599568, + "loss": 3.251, "step": 55800 }, { - "epoch": 6.011193628242385, - "grad_norm": 0.7407640814781189, - "learning_rate": 0.00023971554789354593, - "loss": 3.2446, + "epoch": 6.02156334231806, + "grad_norm": 0.700687825679779, + "learning_rate": 0.00023909336211548837, + "loss": 3.2536, "step": 55850 }, { - "epoch": 6.016575180281993, - "grad_norm": 0.7104265093803406, - "learning_rate": 0.00023939230686348452, - "loss": 3.2488, + "epoch": 6.026954177897574, + "grad_norm": 0.7130185961723328, + "learning_rate": 0.00023876956287101995, + "loss": 3.2534, "step": 55900 }, { - "epoch": 6.021956732321602, - "grad_norm": 0.7195842266082764, - "learning_rate": 0.0002390690658334231, - "loss": 3.2401, + "epoch": 6.032345013477089, + "grad_norm": 0.7143113613128662, + "learning_rate": 0.00023844576362655153, + "loss": 3.2509, "step": 55950 }, { - "epoch": 6.0273382843612096, - "grad_norm": 0.7403207421302795, - "learning_rate": 0.00023874582480336168, - "loss": 3.237, + "epoch": 6.037735849056604, + "grad_norm": 0.7439408302307129, + "learning_rate": 0.0002381219643820831, + "loss": 3.2649, "step": 56000 }, { - "epoch": 6.0273382843612096, - "eval_accuracy": 0.3839623556858762, - "eval_loss": 3.3840363025665283, - "eval_runtime": 183.8506, - "eval_samples_per_second": 97.965, - "eval_steps_per_second": 6.125, + "epoch": 6.037735849056604, + "eval_accuracy": 0.383912158051706, + "eval_loss": 3.386408805847168, + "eval_runtime": 185.4382, + "eval_samples_per_second": 97.127, + "eval_steps_per_second": 6.072, "step": 56000 }, { - "epoch": 6.032719836400818, - "grad_norm": 0.6788797378540039, - "learning_rate": 0.00023842258377330028, - "loss": 3.2471, + "epoch": 6.0431266846361185, + "grad_norm": 0.7821211814880371, + "learning_rate": 0.00023779816513761466, + "loss": 3.2624, "step": 56050 }, { - "epoch": 6.038101388440427, - "grad_norm": 0.6888821125030518, - "learning_rate": 0.00023809934274323885, - "loss": 3.2575, + "epoch": 6.048517520215634, + "grad_norm": 0.6759442687034607, + "learning_rate": 0.00023748084187803558, + "loss": 3.2505, "step": 56100 }, { - "epoch": 6.043482940480034, - "grad_norm": 0.7463328838348389, - "learning_rate": 0.0002377761017131774, - "loss": 3.2439, + "epoch": 6.053908355795148, + "grad_norm": 0.7412766218185425, + "learning_rate": 0.00023715704263356716, + "loss": 3.2657, "step": 56150 }, { - "epoch": 6.048864492519643, - "grad_norm": 0.690376341342926, - "learning_rate": 0.00023745286068311603, - "loss": 3.2494, + "epoch": 6.059299191374663, + "grad_norm": 0.7447103261947632, + "learning_rate": 0.00023683324338909874, + "loss": 3.259, "step": 56200 }, { - "epoch": 6.0542460445592505, - "grad_norm": 0.7049649953842163, - "learning_rate": 0.0002371296196530546, - "loss": 3.2616, + "epoch": 6.064690026954178, + "grad_norm": 0.6834601759910583, + "learning_rate": 0.00023650944414463032, + "loss": 3.2416, "step": 56250 }, { - "epoch": 6.059627596598859, - "grad_norm": 0.7419145703315735, - "learning_rate": 0.00023680637862299317, - "loss": 3.2423, + "epoch": 6.070080862533692, + "grad_norm": 0.7556888461112976, + "learning_rate": 0.0002361856449001619, + "loss": 3.2681, "step": 56300 }, { - "epoch": 6.065009148638468, - "grad_norm": 0.732083261013031, - "learning_rate": 0.0002364831375929318, - "loss": 3.2486, + "epoch": 6.0754716981132075, + "grad_norm": 0.7148715853691101, + "learning_rate": 0.00023586184565569347, + "loss": 3.2538, "step": 56350 }, { - "epoch": 6.070390700678075, - "grad_norm": 0.7247629165649414, - "learning_rate": 0.00023615989656287036, - "loss": 3.2325, + "epoch": 6.080862533692723, + "grad_norm": 0.6975698471069336, + "learning_rate": 0.00023553804641122502, + "loss": 3.2559, "step": 56400 }, { - "epoch": 6.075772252717684, - "grad_norm": 0.7294711470603943, - "learning_rate": 0.00023583665553280895, - "loss": 3.2576, + "epoch": 6.086253369272237, + "grad_norm": 0.7379735112190247, + "learning_rate": 0.00023521424716675657, + "loss": 3.2518, "step": 56450 }, { - "epoch": 6.081153804757292, - "grad_norm": 0.6983347535133362, - "learning_rate": 0.00023551341450274752, - "loss": 3.2536, + "epoch": 6.091644204851752, + "grad_norm": 0.7769717574119568, + "learning_rate": 0.00023489044792228815, + "loss": 3.2717, "step": 56500 }, { - "epoch": 6.0865353567969, - "grad_norm": 0.7216185927391052, - "learning_rate": 0.00023519017347268612, - "loss": 3.2575, + "epoch": 6.097035040431267, + "grad_norm": 0.7235531806945801, + "learning_rate": 0.00023456664867781973, + "loss": 3.2488, "step": 56550 }, { - "epoch": 6.091916908836509, - "grad_norm": 0.738471269607544, - "learning_rate": 0.0002348669324426247, - "loss": 3.2558, + "epoch": 6.1024258760107815, + "grad_norm": 0.7254287600517273, + "learning_rate": 0.0002342428494333513, + "loss": 3.255, "step": 56600 }, { - "epoch": 6.097298460876116, - "grad_norm": 0.7062331438064575, - "learning_rate": 0.00023454369141256328, - "loss": 3.2474, + "epoch": 6.107816711590297, + "grad_norm": 0.7020230889320374, + "learning_rate": 0.00023391905018888288, + "loss": 3.254, "step": 56650 }, { - "epoch": 6.102680012915725, - "grad_norm": 0.7130937576293945, - "learning_rate": 0.00023422045038250185, - "loss": 3.2571, + "epoch": 6.113207547169812, + "grad_norm": 0.7128614187240601, + "learning_rate": 0.00023359525094441443, + "loss": 3.2763, "step": 56700 }, { - "epoch": 6.108061564955333, - "grad_norm": 0.7500676512718201, - "learning_rate": 0.00023389720935244047, - "loss": 3.2526, + "epoch": 6.118598382749326, + "grad_norm": 0.72292160987854, + "learning_rate": 0.000233271451699946, + "loss": 3.2677, "step": 56750 }, { - "epoch": 6.113443116994941, - "grad_norm": 0.7299767136573792, - "learning_rate": 0.00023357396832237903, - "loss": 3.2539, + "epoch": 6.123989218328841, + "grad_norm": 0.7195286750793457, + "learning_rate": 0.0002329476524554776, + "loss": 3.2702, "step": 56800 }, { - "epoch": 6.11882466903455, - "grad_norm": 0.7288519740104675, - "learning_rate": 0.0002332507272923176, - "loss": 3.286, + "epoch": 6.129380053908355, + "grad_norm": 0.727357804775238, + "learning_rate": 0.00023262385321100917, + "loss": 3.27, "step": 56850 }, { - "epoch": 6.124206221074158, - "grad_norm": 0.7326245903968811, - "learning_rate": 0.00023292748626225622, - "loss": 3.2707, + "epoch": 6.1347708894878705, + "grad_norm": 0.7259669899940491, + "learning_rate": 0.00023230005396654072, + "loss": 3.2769, "step": 56900 }, { - "epoch": 6.129587773113766, - "grad_norm": 0.7590323686599731, - "learning_rate": 0.0002326042452321948, - "loss": 3.2554, + "epoch": 6.140161725067386, + "grad_norm": 0.7762994766235352, + "learning_rate": 0.0002319762547220723, + "loss": 3.2559, "step": 56950 }, { - "epoch": 6.134969325153374, - "grad_norm": 0.732631266117096, - "learning_rate": 0.00023228100420213336, - "loss": 3.2561, + "epoch": 6.1455525606469, + "grad_norm": 0.7135004997253418, + "learning_rate": 0.00023165245547760387, + "loss": 3.2537, "step": 57000 }, { - "epoch": 6.134969325153374, - "eval_accuracy": 0.3845425621068056, - "eval_loss": 3.3839833736419678, - "eval_runtime": 184.6599, - "eval_samples_per_second": 97.536, - "eval_steps_per_second": 6.098, + "epoch": 6.1455525606469, + "eval_accuracy": 0.3839402991496499, + "eval_loss": 3.385230779647827, + "eval_runtime": 185.6484, + "eval_samples_per_second": 97.017, + "eval_steps_per_second": 6.065, "step": 57000 }, { - "epoch": 6.140350877192983, - "grad_norm": 0.7149685621261597, - "learning_rate": 0.00023195776317207195, - "loss": 3.2676, + "epoch": 6.150943396226415, + "grad_norm": 0.7256219983100891, + "learning_rate": 0.00023132865623313542, + "loss": 3.2828, "step": 57050 }, { - "epoch": 6.1457324292325906, - "grad_norm": 0.7450768351554871, - "learning_rate": 0.00023163452214201055, - "loss": 3.267, + "epoch": 6.15633423180593, + "grad_norm": 0.7467809319496155, + "learning_rate": 0.000231004856988667, + "loss": 3.2713, "step": 57100 }, { - "epoch": 6.151113981272199, - "grad_norm": 0.71061110496521, - "learning_rate": 0.00023131128111194912, - "loss": 3.2632, + "epoch": 6.1617250673854445, + "grad_norm": 0.6944103837013245, + "learning_rate": 0.00023068105774419858, + "loss": 3.2551, "step": 57150 }, { - "epoch": 6.156495533311807, - "grad_norm": 0.8081583976745605, - "learning_rate": 0.0002309880400818877, - "loss": 3.265, + "epoch": 6.16711590296496, + "grad_norm": 0.6952575445175171, + "learning_rate": 0.00023035725849973016, + "loss": 3.2618, "step": 57200 }, { - "epoch": 6.161877085351415, - "grad_norm": 0.6962241530418396, - "learning_rate": 0.00023066479905182628, - "loss": 3.2733, + "epoch": 6.172506738544475, + "grad_norm": 0.7093820571899414, + "learning_rate": 0.00023003345925526173, + "loss": 3.2645, "step": 57250 }, { - "epoch": 6.167258637391024, - "grad_norm": 0.7313778400421143, - "learning_rate": 0.00023034155802176487, - "loss": 3.2848, + "epoch": 6.177897574123989, + "grad_norm": 0.7516891360282898, + "learning_rate": 0.00022970966001079328, + "loss": 3.263, "step": 57300 }, { - "epoch": 6.1726401894306315, - "grad_norm": 0.7355571985244751, - "learning_rate": 0.00023001831699170347, - "loss": 3.2765, + "epoch": 6.183288409703504, + "grad_norm": 0.7763015627861023, + "learning_rate": 0.00022938586076632484, + "loss": 3.2802, "step": 57350 }, { - "epoch": 6.17802174147024, - "grad_norm": 0.7399037480354309, - "learning_rate": 0.00022970154078224327, - "loss": 3.2481, + "epoch": 6.188679245283019, + "grad_norm": 0.6774693131446838, + "learning_rate": 0.0002290620615218564, + "loss": 3.2703, "step": 57400 }, { - "epoch": 6.183403293509849, - "grad_norm": 0.7529656291007996, - "learning_rate": 0.00022937829975218187, - "loss": 3.2717, + "epoch": 6.1940700808625335, + "grad_norm": 0.7566379308700562, + "learning_rate": 0.000228738262277388, + "loss": 3.276, "step": 57450 }, { - "epoch": 6.188784845549456, - "grad_norm": 0.6955074071884155, - "learning_rate": 0.00022905505872212044, - "loss": 3.2703, + "epoch": 6.199460916442049, + "grad_norm": 0.7387679815292358, + "learning_rate": 0.00022841446303291957, + "loss": 3.273, "step": 57500 }, { - "epoch": 6.194166397589065, - "grad_norm": 0.704826831817627, - "learning_rate": 0.00022873181769205903, - "loss": 3.2711, + "epoch": 6.204851752021563, + "grad_norm": 0.7531148195266724, + "learning_rate": 0.00022809066378845115, + "loss": 3.2638, "step": 57550 }, { - "epoch": 6.1995479496286725, - "grad_norm": 0.75359708070755, - "learning_rate": 0.0002284085766619976, - "loss": 3.2582, + "epoch": 6.210242587601078, + "grad_norm": 0.8101642727851868, + "learning_rate": 0.00022776686454398272, + "loss": 3.2498, "step": 57600 }, { - "epoch": 6.204929501668281, - "grad_norm": 0.7706862688064575, - "learning_rate": 0.0002280853356319362, - "loss": 3.2637, + "epoch": 6.215633423180593, + "grad_norm": 0.7339323163032532, + "learning_rate": 0.0002274430652995143, + "loss": 3.2721, "step": 57650 }, { - "epoch": 6.21031105370789, - "grad_norm": 0.7804993391036987, - "learning_rate": 0.0002277620946018748, - "loss": 3.2706, + "epoch": 6.2210242587601075, + "grad_norm": 0.7086048126220703, + "learning_rate": 0.00022711926605504588, + "loss": 3.2757, "step": 57700 }, { - "epoch": 6.215692605747497, - "grad_norm": 0.7386913299560547, - "learning_rate": 0.00022743885357181336, - "loss": 3.2786, + "epoch": 6.226415094339623, + "grad_norm": 0.7538915276527405, + "learning_rate": 0.0002267954668105774, + "loss": 3.2926, "step": 57750 }, { - "epoch": 6.221074157787106, - "grad_norm": 0.7626097202301025, - "learning_rate": 0.00022711561254175192, - "loss": 3.2787, + "epoch": 6.231805929919138, + "grad_norm": 0.7328342199325562, + "learning_rate": 0.00022647166756610898, + "loss": 3.2866, "step": 57800 }, { - "epoch": 6.226455709826714, - "grad_norm": 0.7361026406288147, - "learning_rate": 0.00022679237151169054, - "loss": 3.2744, + "epoch": 6.237196765498652, + "grad_norm": 0.7042235136032104, + "learning_rate": 0.00022614786832164056, + "loss": 3.2587, "step": 57850 }, { - "epoch": 6.231837261866322, - "grad_norm": 0.7097683548927307, - "learning_rate": 0.0002264691304816291, - "loss": 3.2767, + "epoch": 6.242587601078167, + "grad_norm": 0.7145513296127319, + "learning_rate": 0.00022582406907717214, + "loss": 3.2803, "step": 57900 }, { - "epoch": 6.237218813905931, - "grad_norm": 0.792580783367157, - "learning_rate": 0.00022614588945156768, - "loss": 3.2917, + "epoch": 6.247978436657682, + "grad_norm": 0.7408495545387268, + "learning_rate": 0.0002255002698327037, + "loss": 3.2699, "step": 57950 }, { - "epoch": 6.242600365945538, - "grad_norm": 0.6779635548591614, - "learning_rate": 0.0002258226484215063, - "loss": 3.2553, + "epoch": 6.2533692722371965, + "grad_norm": 0.7923481464385986, + "learning_rate": 0.0002251764705882353, + "loss": 3.2566, "step": 58000 }, { - "epoch": 6.242600365945538, - "eval_accuracy": 0.3850510576217773, - "eval_loss": 3.3806025981903076, - "eval_runtime": 184.4045, - "eval_samples_per_second": 97.671, - "eval_steps_per_second": 6.106, + "epoch": 6.2533692722371965, + "eval_accuracy": 0.3843151516126099, + "eval_loss": 3.3821535110473633, + "eval_runtime": 185.4378, + "eval_samples_per_second": 97.127, + "eval_steps_per_second": 6.072, "step": 58000 }, { - "epoch": 6.247981917985147, - "grad_norm": 0.7419663667678833, - "learning_rate": 0.00022549940739144487, - "loss": 3.259, + "epoch": 6.258760107816712, + "grad_norm": 0.7456364631652832, + "learning_rate": 0.00022485267134376687, + "loss": 3.2721, "step": 58050 }, { - "epoch": 6.253363470024755, - "grad_norm": 0.7085452675819397, - "learning_rate": 0.00022517616636138344, - "loss": 3.2813, + "epoch": 6.264150943396227, + "grad_norm": 0.7139208912849426, + "learning_rate": 0.00022452887209929842, + "loss": 3.2846, "step": 58100 }, { - "epoch": 6.258745022064363, - "grad_norm": 0.7902224659919739, - "learning_rate": 0.00022485292533132203, - "loss": 3.2794, + "epoch": 6.269541778975741, + "grad_norm": 0.7725496292114258, + "learning_rate": 0.00022420507285482997, + "loss": 3.2611, "step": 58150 }, { - "epoch": 6.264126574103972, - "grad_norm": 0.7155488133430481, - "learning_rate": 0.00022452968430126063, - "loss": 3.2745, + "epoch": 6.274932614555256, + "grad_norm": 0.7034215927124023, + "learning_rate": 0.00022388127361036155, + "loss": 3.2786, "step": 58200 }, { - "epoch": 6.26950812614358, - "grad_norm": 1.2818570137023926, - "learning_rate": 0.00022420644327119922, - "loss": 3.268, + "epoch": 6.280323450134771, + "grad_norm": 0.7483360767364502, + "learning_rate": 0.0002235639503507825, + "loss": 3.2824, "step": 58250 }, { - "epoch": 6.274889678183188, - "grad_norm": 0.750627875328064, - "learning_rate": 0.0002238832022411378, - "loss": 3.2734, + "epoch": 6.285714285714286, + "grad_norm": 0.7298248410224915, + "learning_rate": 0.00022324015110631408, + "loss": 3.268, "step": 58300 }, { - "epoch": 6.280271230222796, - "grad_norm": 0.7265231013298035, - "learning_rate": 0.00022355996121107636, - "loss": 3.274, + "epoch": 6.291105121293801, + "grad_norm": 0.7724031805992126, + "learning_rate": 0.00022291635186184565, + "loss": 3.2586, "step": 58350 }, { - "epoch": 6.285652782262405, - "grad_norm": 0.679812490940094, - "learning_rate": 0.00022323672018101498, - "loss": 3.29, + "epoch": 6.296495956873315, + "grad_norm": 0.7413439750671387, + "learning_rate": 0.0002225925526173772, + "loss": 3.2683, "step": 58400 }, { - "epoch": 6.2910343343020125, - "grad_norm": 0.7458456754684448, - "learning_rate": 0.00022291347915095355, - "loss": 3.2672, + "epoch": 6.30188679245283, + "grad_norm": 0.7157818675041199, + "learning_rate": 0.00022226875337290878, + "loss": 3.2867, "step": 58450 }, { - "epoch": 6.296415886341621, - "grad_norm": 0.7149653434753418, - "learning_rate": 0.0002225902381208921, - "loss": 3.2806, + "epoch": 6.307277628032345, + "grad_norm": 0.688450276851654, + "learning_rate": 0.00022194495412844033, + "loss": 3.2592, "step": 58500 }, { - "epoch": 6.301797438381229, - "grad_norm": 0.7718719244003296, - "learning_rate": 0.00022226699709083073, - "loss": 3.2599, + "epoch": 6.3126684636118595, + "grad_norm": 0.7490639686584473, + "learning_rate": 0.0002216211548839719, + "loss": 3.2803, "step": 58550 }, { - "epoch": 6.307178990420837, - "grad_norm": 0.7340017557144165, - "learning_rate": 0.0002219437560607693, - "loss": 3.2805, + "epoch": 6.318059299191375, + "grad_norm": 0.7094032764434814, + "learning_rate": 0.0002212973556395035, + "loss": 3.287, "step": 58600 }, { - "epoch": 6.312560542460446, - "grad_norm": 0.7550706267356873, - "learning_rate": 0.00022162051503070787, - "loss": 3.2808, + "epoch": 6.32345013477089, + "grad_norm": 0.708122193813324, + "learning_rate": 0.00022097355639503507, + "loss": 3.2805, "step": 58650 }, { - "epoch": 6.3179420945000535, - "grad_norm": 0.7718273401260376, - "learning_rate": 0.00022129727400064646, - "loss": 3.2815, + "epoch": 6.328840970350404, + "grad_norm": 0.7124499678611755, + "learning_rate": 0.00022064975715056664, + "loss": 3.2957, "step": 58700 }, { - "epoch": 6.323323646539662, - "grad_norm": 0.7422057390213013, - "learning_rate": 0.00022097403297058506, - "loss": 3.2822, + "epoch": 6.334231805929919, + "grad_norm": 0.7175976037979126, + "learning_rate": 0.0002203259579060982, + "loss": 3.2933, "step": 58750 }, { - "epoch": 6.328705198579271, - "grad_norm": 0.7373623847961426, - "learning_rate": 0.00022065079194052363, - "loss": 3.2781, + "epoch": 6.339622641509434, + "grad_norm": 0.7456777095794678, + "learning_rate": 0.00022000215866162977, + "loss": 3.279, "step": 58800 }, { - "epoch": 6.334086750618878, - "grad_norm": 0.716949462890625, - "learning_rate": 0.00022032755091046222, - "loss": 3.2824, + "epoch": 6.345013477088949, + "grad_norm": 0.728626012802124, + "learning_rate": 0.00021967835941716135, + "loss": 3.2823, "step": 58850 }, { - "epoch": 6.339468302658487, - "grad_norm": 0.7594158053398132, - "learning_rate": 0.0002200043098804008, - "loss": 3.2713, + "epoch": 6.350404312668464, + "grad_norm": 0.7576169967651367, + "learning_rate": 0.0002193545601726929, + "loss": 3.2701, "step": 58900 }, { - "epoch": 6.344849854698095, - "grad_norm": 0.7525697350502014, - "learning_rate": 0.00021968106885033938, - "loss": 3.2712, + "epoch": 6.355795148247978, + "grad_norm": 0.6919223070144653, + "learning_rate": 0.00021903076092822448, + "loss": 3.2899, "step": 58950 }, { - "epoch": 6.350231406737703, - "grad_norm": 0.8482348918914795, - "learning_rate": 0.00021935782782027798, - "loss": 3.2965, + "epoch": 6.361185983827493, + "grad_norm": 0.7274947166442871, + "learning_rate": 0.00021870696168375606, + "loss": 3.2887, "step": 59000 }, { - "epoch": 6.350231406737703, - "eval_accuracy": 0.3853254061635089, - "eval_loss": 3.375702142715454, - "eval_runtime": 184.5908, - "eval_samples_per_second": 97.573, - "eval_steps_per_second": 6.1, + "epoch": 6.361185983827493, + "eval_accuracy": 0.3848371200856706, + "eval_loss": 3.378706932067871, + "eval_runtime": 183.6452, + "eval_samples_per_second": 98.075, + "eval_steps_per_second": 6.131, "step": 59000 }, { - "epoch": 6.355612958777312, - "grad_norm": 0.7227912545204163, - "learning_rate": 0.00021903458679021655, - "loss": 3.299, + "epoch": 6.366576819407008, + "grad_norm": 0.7238966822624207, + "learning_rate": 0.0002183831624392876, + "loss": 3.2926, "step": 59050 }, { - "epoch": 6.360994510816919, - "grad_norm": 0.7208841443061829, - "learning_rate": 0.0002187113457601551, - "loss": 3.2816, + "epoch": 6.3719676549865225, + "grad_norm": 0.7809830904006958, + "learning_rate": 0.00021805936319481918, + "loss": 3.2823, "step": 59100 }, { - "epoch": 6.366376062856528, - "grad_norm": 0.6990681290626526, - "learning_rate": 0.00021838810473009373, - "loss": 3.2922, + "epoch": 6.377358490566038, + "grad_norm": 0.7627469897270203, + "learning_rate": 0.00021773556395035076, + "loss": 3.2846, "step": 59150 }, { - "epoch": 6.371757614896136, - "grad_norm": 0.7942614555358887, - "learning_rate": 0.0002180648637000323, - "loss": 3.2885, + "epoch": 6.382749326145553, + "grad_norm": 0.7125155925750732, + "learning_rate": 0.00021741176470588234, + "loss": 3.3025, "step": 59200 }, { - "epoch": 6.377139166935744, - "grad_norm": 0.7575851678848267, - "learning_rate": 0.0002177416226699709, - "loss": 3.2699, + "epoch": 6.388140161725067, + "grad_norm": 0.7636107206344604, + "learning_rate": 0.00021708796546141392, + "loss": 3.2771, "step": 59250 }, { - "epoch": 6.382520718975353, - "grad_norm": 0.7590829133987427, - "learning_rate": 0.00021741838163990946, - "loss": 3.2947, + "epoch": 6.393530997304582, + "grad_norm": 0.7409188151359558, + "learning_rate": 0.0002167641662169455, + "loss": 3.2668, "step": 59300 }, { - "epoch": 6.387902271014961, - "grad_norm": 0.7357906103134155, - "learning_rate": 0.00021709514060984806, - "loss": 3.2785, + "epoch": 6.398921832884097, + "grad_norm": 0.7753366231918335, + "learning_rate": 0.00021644036697247702, + "loss": 3.2823, "step": 59350 }, { - "epoch": 6.393283823054569, - "grad_norm": 0.7962828278541565, - "learning_rate": 0.00021677189957978665, - "loss": 3.2822, + "epoch": 6.404312668463612, + "grad_norm": 0.7260041236877441, + "learning_rate": 0.0002161165677280086, + "loss": 3.3024, "step": 59400 }, { - "epoch": 6.398665375094177, - "grad_norm": 0.750723659992218, - "learning_rate": 0.00021645512337032643, - "loss": 3.295, + "epoch": 6.409703504043127, + "grad_norm": 0.7569220662117004, + "learning_rate": 0.00021579276848354017, + "loss": 3.2825, "step": 59450 }, { - "epoch": 6.404046927133785, - "grad_norm": 0.7317831516265869, - "learning_rate": 0.00021613188234026506, - "loss": 3.287, + "epoch": 6.415094339622642, + "grad_norm": 0.7487280964851379, + "learning_rate": 0.00021546896923907175, + "loss": 3.2704, "step": 59500 }, { - "epoch": 6.4094284791733935, - "grad_norm": 0.7471715211868286, - "learning_rate": 0.00021580864131020362, - "loss": 3.2852, + "epoch": 6.420485175202156, + "grad_norm": 0.7295806407928467, + "learning_rate": 0.00021514516999460333, + "loss": 3.3035, "step": 59550 }, { - "epoch": 6.414810031213002, - "grad_norm": 0.759760320186615, - "learning_rate": 0.0002154854002801422, - "loss": 3.2947, + "epoch": 6.425876010781671, + "grad_norm": 0.7379623055458069, + "learning_rate": 0.0002148213707501349, + "loss": 3.2726, "step": 59600 }, { - "epoch": 6.42019158325261, - "grad_norm": 0.8137605786323547, - "learning_rate": 0.0002151621592500808, - "loss": 3.2968, + "epoch": 6.431266846361186, + "grad_norm": 0.7402395009994507, + "learning_rate": 0.00021449757150566648, + "loss": 3.276, "step": 59650 }, { - "epoch": 6.425573135292218, - "grad_norm": 0.7391119599342346, - "learning_rate": 0.00021483891822001938, - "loss": 3.2797, + "epoch": 6.436657681940701, + "grad_norm": 0.7622630000114441, + "learning_rate": 0.00021417377226119806, + "loss": 3.3128, "step": 59700 }, { - "epoch": 6.430954687331827, - "grad_norm": 0.7140395641326904, - "learning_rate": 0.00021451567718995795, - "loss": 3.2755, + "epoch": 6.442048517520216, + "grad_norm": 0.7114800810813904, + "learning_rate": 0.0002138499730167296, + "loss": 3.294, "step": 59750 }, { - "epoch": 6.4363362393714345, - "grad_norm": 0.7186073064804077, - "learning_rate": 0.00021419243615989654, - "loss": 3.295, + "epoch": 6.44743935309973, + "grad_norm": 0.7696911692619324, + "learning_rate": 0.00021352617377226116, + "loss": 3.2828, "step": 59800 }, { - "epoch": 6.441717791411043, - "grad_norm": 0.7441525459289551, - "learning_rate": 0.00021386919512983514, - "loss": 3.2836, + "epoch": 6.452830188679245, + "grad_norm": 0.7608886361122131, + "learning_rate": 0.00021320237452779274, + "loss": 3.2768, "step": 59850 }, { - "epoch": 6.447099343450651, - "grad_norm": 0.743151843547821, - "learning_rate": 0.0002135459540997737, - "loss": 3.2637, + "epoch": 6.45822102425876, + "grad_norm": 0.7601315379142761, + "learning_rate": 0.00021287857528332432, + "loss": 3.2739, "step": 59900 }, { - "epoch": 6.452480895490259, - "grad_norm": 0.7524924278259277, - "learning_rate": 0.0002132227130697123, - "loss": 3.2825, + "epoch": 6.463611859838275, + "grad_norm": 0.719896674156189, + "learning_rate": 0.0002125547760388559, + "loss": 3.2707, "step": 59950 }, { - "epoch": 6.457862447529868, - "grad_norm": 0.747959554195404, - "learning_rate": 0.00021289947203965087, - "loss": 3.2712, + "epoch": 6.46900269541779, + "grad_norm": 0.7298495173454285, + "learning_rate": 0.00021223097679438747, + "loss": 3.2905, "step": 60000 }, { - "epoch": 6.457862447529868, - "eval_accuracy": 0.3854671981821068, - "eval_loss": 3.372908353805542, - "eval_runtime": 184.6808, - "eval_samples_per_second": 97.525, - "eval_steps_per_second": 6.097, + "epoch": 6.46900269541779, + "eval_accuracy": 0.385286725735447, + "eval_loss": 3.3724496364593506, + "eval_runtime": 184.0657, + "eval_samples_per_second": 97.851, + "eval_steps_per_second": 6.117, "step": 60000 }, { - "epoch": 6.4632439995694755, - "grad_norm": 0.8120265007019043, - "learning_rate": 0.0002125762310095895, - "loss": 3.263, + "epoch": 6.474393530997305, + "grad_norm": 0.7553102374076843, + "learning_rate": 0.00021190717754991905, + "loss": 3.2652, "step": 60050 }, { - "epoch": 6.468625551609084, - "grad_norm": 0.7335713505744934, - "learning_rate": 0.00021225298997952806, - "loss": 3.282, + "epoch": 6.479784366576819, + "grad_norm": 0.7708149552345276, + "learning_rate": 0.0002115833783054506, + "loss": 3.2889, "step": 60100 }, { - "epoch": 6.474007103648693, - "grad_norm": 0.744031548500061, - "learning_rate": 0.00021192974894946662, - "loss": 3.2892, + "epoch": 6.485175202156334, + "grad_norm": 0.754406213760376, + "learning_rate": 0.00021125957906098218, + "loss": 3.2904, "step": 60150 }, { - "epoch": 6.4793886556883, - "grad_norm": 0.7717245221138, - "learning_rate": 0.00021160650791940524, - "loss": 3.3058, + "epoch": 6.490566037735849, + "grad_norm": 0.7574535012245178, + "learning_rate": 0.00021093577981651373, + "loss": 3.2602, "step": 60200 }, { - "epoch": 6.484770207727909, - "grad_norm": 0.7391660213470459, - "learning_rate": 0.0002112832668893438, - "loss": 3.2783, + "epoch": 6.495956873315364, + "grad_norm": 0.7688772678375244, + "learning_rate": 0.0002106119805720453, + "loss": 3.2718, "step": 60250 }, { - "epoch": 6.490151759767517, - "grad_norm": 0.7157920598983765, - "learning_rate": 0.00021096002585928238, - "loss": 3.2907, + "epoch": 6.501347708894879, + "grad_norm": 0.7309618592262268, + "learning_rate": 0.00021028818132757689, + "loss": 3.3053, "step": 60300 }, { - "epoch": 6.495533311807125, - "grad_norm": 0.7572237253189087, - "learning_rate": 0.00021063678482922097, - "loss": 3.28, + "epoch": 6.506738544474393, + "grad_norm": 0.7908754944801331, + "learning_rate": 0.00020996438208310846, + "loss": 3.2638, "step": 60350 }, { - "epoch": 6.500914863846734, - "grad_norm": 0.7682546377182007, - "learning_rate": 0.00021031354379915957, - "loss": 3.2659, + "epoch": 6.512129380053908, + "grad_norm": 0.7500775456428528, + "learning_rate": 0.00020964058283864001, + "loss": 3.28, "step": 60400 }, { - "epoch": 6.506296415886341, - "grad_norm": 0.7918459177017212, - "learning_rate": 0.00020999030276909814, - "loss": 3.2761, + "epoch": 6.517520215633423, + "grad_norm": 0.7242320775985718, + "learning_rate": 0.0002093167835941716, + "loss": 3.2828, "step": 60450 }, { - "epoch": 6.51167796792595, - "grad_norm": 0.7500776052474976, - "learning_rate": 0.00020966706173903673, - "loss": 3.2804, + "epoch": 6.5229110512129385, + "grad_norm": 0.7319316267967224, + "learning_rate": 0.00020899946033459254, + "loss": 3.2878, "step": 60500 }, { - "epoch": 6.517059519965558, - "grad_norm": 0.7752882838249207, - "learning_rate": 0.0002093438207089753, - "loss": 3.287, + "epoch": 6.528301886792453, + "grad_norm": 0.7341902256011963, + "learning_rate": 0.0002086756610901241, + "loss": 3.2869, "step": 60550 }, { - "epoch": 6.522441072005166, - "grad_norm": 0.7291586399078369, - "learning_rate": 0.00020902057967891387, - "loss": 3.2863, + "epoch": 6.533692722371968, + "grad_norm": 0.729345440864563, + "learning_rate": 0.00020835186184565567, + "loss": 3.2844, "step": 60600 }, { - "epoch": 6.5278226240447745, - "grad_norm": 0.7302510738372803, - "learning_rate": 0.0002086973386488525, - "loss": 3.3077, + "epoch": 6.539083557951482, + "grad_norm": 0.7409000992774963, + "learning_rate": 0.00020802806260118725, + "loss": 3.2876, "step": 60650 }, { - "epoch": 6.533204176084383, - "grad_norm": 0.7242251038551331, - "learning_rate": 0.00020837409761879106, - "loss": 3.2746, + "epoch": 6.544474393530997, + "grad_norm": 0.7650253772735596, + "learning_rate": 0.00020770426335671883, + "loss": 3.2808, "step": 60700 }, { - "epoch": 6.538585728123991, - "grad_norm": 0.7295466661453247, - "learning_rate": 0.00020805085658872962, - "loss": 3.2869, + "epoch": 6.549865229110512, + "grad_norm": 0.8125891089439392, + "learning_rate": 0.00020738046411225038, + "loss": 3.2828, "step": 60750 }, { - "epoch": 6.543967280163599, - "grad_norm": 0.7399840354919434, - "learning_rate": 0.00020772761555866825, - "loss": 3.2958, + "epoch": 6.555256064690027, + "grad_norm": 0.7584858536720276, + "learning_rate": 0.00020705666486778196, + "loss": 3.2762, "step": 60800 }, { - "epoch": 6.549348832203208, - "grad_norm": 0.8592695593833923, - "learning_rate": 0.0002074043745286068, - "loss": 3.2937, + "epoch": 6.560646900269542, + "grad_norm": 0.7711278796195984, + "learning_rate": 0.00020673286562331353, + "loss": 3.2765, "step": 60850 }, { - "epoch": 6.5547303842428155, - "grad_norm": 0.7525405883789062, - "learning_rate": 0.00020708113349854538, - "loss": 3.2769, + "epoch": 6.566037735849057, + "grad_norm": 0.7544257044792175, + "learning_rate": 0.0002064090663788451, + "loss": 3.2738, "step": 60900 }, { - "epoch": 6.560111936282424, - "grad_norm": 0.719514787197113, - "learning_rate": 0.00020675789246848397, - "loss": 3.2716, + "epoch": 6.571428571428571, + "grad_norm": 0.7313734889030457, + "learning_rate": 0.00020608526713437666, + "loss": 3.2841, "step": 60950 }, { - "epoch": 6.565493488322032, - "grad_norm": 0.7651938796043396, - "learning_rate": 0.00020643465143842257, - "loss": 3.2857, + "epoch": 6.576819407008086, + "grad_norm": 0.7042865753173828, + "learning_rate": 0.00020576146788990824, + "loss": 3.2855, "step": 61000 }, { - "epoch": 6.565493488322032, - "eval_accuracy": 0.38612998079777516, - "eval_loss": 3.3699281215667725, - "eval_runtime": 184.6571, - "eval_samples_per_second": 97.538, - "eval_steps_per_second": 6.098, + "epoch": 6.576819407008086, + "eval_accuracy": 0.38551544006429644, + "eval_loss": 3.3711304664611816, + "eval_runtime": 183.4425, + "eval_samples_per_second": 98.183, + "eval_steps_per_second": 6.138, "step": 61000 }, { - "epoch": 6.57087504036164, - "grad_norm": 0.8320351243019104, - "learning_rate": 0.00020611141040836116, - "loss": 3.2741, + "epoch": 6.5822102425876015, + "grad_norm": 0.7999841570854187, + "learning_rate": 0.0002054376686454398, + "loss": 3.2767, "step": 61050 }, { - "epoch": 6.576256592401249, - "grad_norm": 0.7015548944473267, - "learning_rate": 0.00020578816937829973, - "loss": 3.3026, + "epoch": 6.587601078167116, + "grad_norm": 0.7251748442649841, + "learning_rate": 0.00020511386940097137, + "loss": 3.297, "step": 61100 }, { - "epoch": 6.5816381444408565, - "grad_norm": 0.7299136519432068, - "learning_rate": 0.0002054649283482383, - "loss": 3.2858, + "epoch": 6.592991913746631, + "grad_norm": 0.7549523115158081, + "learning_rate": 0.00020479007015650294, + "loss": 3.2981, "step": 61150 }, { - "epoch": 6.587019696480465, - "grad_norm": 0.7206780910491943, - "learning_rate": 0.00020514168731817692, - "loss": 3.2945, + "epoch": 6.598382749326145, + "grad_norm": 0.7553712725639343, + "learning_rate": 0.00020446627091203452, + "loss": 3.2911, "step": 61200 }, { - "epoch": 6.592401248520073, - "grad_norm": 0.7657489776611328, - "learning_rate": 0.0002048184462881155, - "loss": 3.2834, + "epoch": 6.60377358490566, + "grad_norm": 0.7162251472473145, + "learning_rate": 0.0002041424716675661, + "loss": 3.2905, "step": 61250 }, { - "epoch": 6.597782800559681, - "grad_norm": 0.786909282207489, - "learning_rate": 0.00020449520525805406, - "loss": 3.2806, + "epoch": 6.609164420485175, + "grad_norm": 0.7627036571502686, + "learning_rate": 0.00020381867242309768, + "loss": 3.2762, "step": 61300 }, { - "epoch": 6.60316435259929, - "grad_norm": 0.6916054487228394, - "learning_rate": 0.00020417196422799268, - "loss": 3.2892, + "epoch": 6.6145552560646905, + "grad_norm": 0.7106635570526123, + "learning_rate": 0.00020349487317862926, + "loss": 3.3057, "step": 61350 }, { - "epoch": 6.608545904638898, - "grad_norm": 0.8359130620956421, - "learning_rate": 0.00020384872319793125, - "loss": 3.2798, + "epoch": 6.619946091644205, + "grad_norm": 0.8031513094902039, + "learning_rate": 0.00020317107393416078, + "loss": 3.2917, "step": 61400 }, { - "epoch": 6.613927456678506, - "grad_norm": 0.7628897428512573, - "learning_rate": 0.00020353194698847105, - "loss": 3.2831, + "epoch": 6.62533692722372, + "grad_norm": 0.7408685088157654, + "learning_rate": 0.00020284727468969236, + "loss": 3.3023, "step": 61450 }, { - "epoch": 6.619309008718115, - "grad_norm": 0.7674877643585205, - "learning_rate": 0.00020321517077901086, - "loss": 3.2636, + "epoch": 6.630727762803234, + "grad_norm": 0.7966960668563843, + "learning_rate": 0.00020252347544522393, + "loss": 3.2897, "step": 61500 }, { - "epoch": 6.624690560757722, - "grad_norm": 0.7460737228393555, - "learning_rate": 0.00020289192974894945, - "loss": 3.2839, + "epoch": 6.636118598382749, + "grad_norm": 0.7547671794891357, + "learning_rate": 0.0002021996762007555, + "loss": 3.2943, "step": 61550 }, { - "epoch": 6.630072112797331, - "grad_norm": 0.7583357691764832, - "learning_rate": 0.00020256868871888802, - "loss": 3.2861, + "epoch": 6.6415094339622645, + "grad_norm": 0.7418311834335327, + "learning_rate": 0.0002018758769562871, + "loss": 3.2906, "step": 61600 }, { - "epoch": 6.635453664836939, - "grad_norm": 0.8294389843940735, - "learning_rate": 0.00020224544768882664, - "loss": 3.2915, + "epoch": 6.646900269541779, + "grad_norm": 0.7400493621826172, + "learning_rate": 0.00020155207771181867, + "loss": 3.2906, "step": 61650 }, { - "epoch": 6.640835216876547, - "grad_norm": 0.7348142266273499, - "learning_rate": 0.0002019222066587652, - "loss": 3.2809, + "epoch": 6.652291105121294, + "grad_norm": 0.7756510376930237, + "learning_rate": 0.00020122827846735024, + "loss": 3.2716, "step": 61700 }, { - "epoch": 6.6462167689161555, - "grad_norm": 0.726068913936615, - "learning_rate": 0.00020159896562870378, - "loss": 3.2742, + "epoch": 6.657681940700809, + "grad_norm": 0.7837494611740112, + "learning_rate": 0.00020090447922288182, + "loss": 3.2932, "step": 61750 }, { - "epoch": 6.651598320955763, - "grad_norm": 0.7640007734298706, - "learning_rate": 0.00020127572459864237, - "loss": 3.294, + "epoch": 6.663072776280323, + "grad_norm": 0.7266611456871033, + "learning_rate": 0.00020058067997841335, + "loss": 3.3023, "step": 61800 }, { - "epoch": 6.656979872995372, - "grad_norm": 0.7816165685653687, - "learning_rate": 0.00020095248356858097, - "loss": 3.2971, + "epoch": 6.668463611859838, + "grad_norm": 0.818396806716919, + "learning_rate": 0.00020025688073394492, + "loss": 3.2976, "step": 61850 }, { - "epoch": 6.66236142503498, - "grad_norm": 0.7704948782920837, - "learning_rate": 0.00020062924253851953, - "loss": 3.2863, + "epoch": 6.6738544474393535, + "grad_norm": 0.7452014088630676, + "learning_rate": 0.0001999330814894765, + "loss": 3.272, "step": 61900 }, { - "epoch": 6.667742977074588, - "grad_norm": 0.7741549611091614, - "learning_rate": 0.00020030600150845813, - "loss": 3.2754, + "epoch": 6.679245283018868, + "grad_norm": 0.7601395845413208, + "learning_rate": 0.00019960928224500808, + "loss": 3.2741, "step": 61950 }, { - "epoch": 6.6731245291141965, - "grad_norm": 0.7607588768005371, - "learning_rate": 0.0001999827604783967, - "loss": 3.2798, + "epoch": 6.684636118598383, + "grad_norm": 0.8315152525901794, + "learning_rate": 0.00019928548300053966, + "loss": 3.2744, "step": 62000 }, { - "epoch": 6.6731245291141965, - "eval_accuracy": 0.386048056520363, - "eval_loss": 3.3638477325439453, - "eval_runtime": 184.5657, - "eval_samples_per_second": 97.586, - "eval_steps_per_second": 6.101, + "epoch": 6.684636118598383, + "eval_accuracy": 0.3863460913916381, + "eval_loss": 3.3641045093536377, + "eval_runtime": 183.6686, + "eval_samples_per_second": 98.062, + "eval_steps_per_second": 6.131, "step": 62000 }, { - "epoch": 6.678506081153805, - "grad_norm": 0.7352211475372314, - "learning_rate": 0.0001996595194483353, - "loss": 3.2716, + "epoch": 6.690026954177897, + "grad_norm": 0.7635255455970764, + "learning_rate": 0.00019896168375607123, + "loss": 3.2993, "step": 62050 }, { - "epoch": 6.683887633193413, - "grad_norm": 0.7325161695480347, - "learning_rate": 0.00019933627841827389, - "loss": 3.2848, + "epoch": 6.695417789757412, + "grad_norm": 0.7203477621078491, + "learning_rate": 0.00019863788451160278, + "loss": 3.2907, "step": 62100 }, { - "epoch": 6.689269185233021, - "grad_norm": 0.7613785266876221, - "learning_rate": 0.00019901303738821245, - "loss": 3.2751, + "epoch": 6.7008086253369274, + "grad_norm": 0.7851729989051819, + "learning_rate": 0.00019831408526713436, + "loss": 3.278, "step": 62150 }, { - "epoch": 6.69465073727263, - "grad_norm": 0.7282025814056396, - "learning_rate": 0.00019868979635815102, - "loss": 3.2849, + "epoch": 6.706199460916442, + "grad_norm": 0.7568328380584717, + "learning_rate": 0.00019799028602266594, + "loss": 3.2825, "step": 62200 }, { - "epoch": 6.7000322893122375, - "grad_norm": 0.7329760789871216, - "learning_rate": 0.00019836655532808964, - "loss": 3.2808, + "epoch": 6.711590296495957, + "grad_norm": 0.8196280598640442, + "learning_rate": 0.0001976664867781975, + "loss": 3.2929, "step": 62250 }, { - "epoch": 6.705413841351846, - "grad_norm": 0.7544186115264893, - "learning_rate": 0.0001980433142980282, - "loss": 3.2799, + "epoch": 6.716981132075472, + "grad_norm": 0.7875428795814514, + "learning_rate": 0.00019734268753372907, + "loss": 3.2781, "step": 62300 }, { - "epoch": 6.710795393391454, - "grad_norm": 0.7261156439781189, - "learning_rate": 0.00019772007326796678, - "loss": 3.2821, + "epoch": 6.722371967654986, + "grad_norm": 0.7377816438674927, + "learning_rate": 0.00019701888828926065, + "loss": 3.2958, "step": 62350 }, { - "epoch": 6.716176945431062, - "grad_norm": 0.722864031791687, - "learning_rate": 0.0001973968322379054, - "loss": 3.2838, + "epoch": 6.727762803234501, + "grad_norm": 0.7832558155059814, + "learning_rate": 0.0001966950890447922, + "loss": 3.2779, "step": 62400 }, { - "epoch": 6.721558497470671, - "grad_norm": 0.7303403615951538, - "learning_rate": 0.00019707359120784397, - "loss": 3.307, + "epoch": 6.7331536388140165, + "grad_norm": 0.7229846715927124, + "learning_rate": 0.00019637128980032377, + "loss": 3.2829, "step": 62450 }, { - "epoch": 6.7269400495102785, - "grad_norm": 0.7062789797782898, - "learning_rate": 0.00019675035017778253, - "loss": 3.2812, + "epoch": 6.738544474393531, + "grad_norm": 0.7550538778305054, + "learning_rate": 0.00019604749055585535, + "loss": 3.2874, "step": 62500 }, { - "epoch": 6.732321601549887, - "grad_norm": 0.748625636100769, - "learning_rate": 0.00019642710914772113, - "loss": 3.2913, + "epoch": 6.743935309973046, + "grad_norm": 0.75005704164505, + "learning_rate": 0.00019572369131138693, + "loss": 3.2933, "step": 62550 }, { - "epoch": 6.737703153589496, - "grad_norm": 0.7621287703514099, - "learning_rate": 0.00019610386811765972, - "loss": 3.2881, + "epoch": 6.74932614555256, + "grad_norm": 0.7449986934661865, + "learning_rate": 0.0001953998920669185, + "loss": 3.2773, "step": 62600 }, { - "epoch": 6.743084705629103, - "grad_norm": 0.7435240149497986, - "learning_rate": 0.00019578062708759832, - "loss": 3.2807, + "epoch": 6.754716981132075, + "grad_norm": 0.7422336339950562, + "learning_rate": 0.00019507609282245006, + "loss": 3.2822, "step": 62650 }, { - "epoch": 6.748466257668712, - "grad_norm": 0.7503770589828491, - "learning_rate": 0.00019545738605753689, - "loss": 3.3027, + "epoch": 6.7601078167115904, + "grad_norm": 0.754895806312561, + "learning_rate": 0.00019475229357798164, + "loss": 3.2715, "step": 62700 }, { - "epoch": 6.75384780970832, - "grad_norm": 0.7563770413398743, - "learning_rate": 0.00019513414502747545, - "loss": 3.2875, + "epoch": 6.765498652291106, + "grad_norm": 0.7931267619132996, + "learning_rate": 0.00019442849433351319, + "loss": 3.3003, "step": 62750 }, { - "epoch": 6.759229361747928, - "grad_norm": 0.8158665299415588, - "learning_rate": 0.00019481090399741408, - "loss": 3.2725, + "epoch": 6.77088948787062, + "grad_norm": 0.783345639705658, + "learning_rate": 0.00019410469508904476, + "loss": 3.2854, "step": 62800 }, { - "epoch": 6.7646109137875365, - "grad_norm": 0.7590814828872681, - "learning_rate": 0.00019448766296735264, - "loss": 3.3002, + "epoch": 6.776280323450135, + "grad_norm": 0.7650436162948608, + "learning_rate": 0.00019378089584457634, + "loss": 3.2992, "step": 62850 }, { - "epoch": 6.769992465827144, - "grad_norm": 0.7146130800247192, - "learning_rate": 0.0001941644219372912, - "loss": 3.2843, + "epoch": 6.781671159029649, + "grad_norm": 0.7753744125366211, + "learning_rate": 0.00019345709660010792, + "loss": 3.2989, "step": 62900 }, { - "epoch": 6.775374017866753, - "grad_norm": 0.7704887986183167, - "learning_rate": 0.00019384118090722983, - "loss": 3.2762, + "epoch": 6.787061994609164, + "grad_norm": 0.7236669063568115, + "learning_rate": 0.0001931332973556395, + "loss": 3.272, "step": 62950 }, { - "epoch": 6.780755569906361, - "grad_norm": 0.748637855052948, - "learning_rate": 0.0001935179398771684, - "loss": 3.2641, + "epoch": 6.7924528301886795, + "grad_norm": 0.7842093110084534, + "learning_rate": 0.00019280949811117107, + "loss": 3.2813, "step": 63000 }, { - "epoch": 6.780755569906361, - "eval_accuracy": 0.3869664993810588, - "eval_loss": 3.359816551208496, - "eval_runtime": 184.8195, - "eval_samples_per_second": 97.452, - "eval_steps_per_second": 6.092, + "epoch": 6.7924528301886795, + "eval_accuracy": 0.3865612241095108, + "eval_loss": 3.3622243404388428, + "eval_runtime": 183.9001, + "eval_samples_per_second": 97.939, + "eval_steps_per_second": 6.123, "step": 63000 }, { - "epoch": 6.786137121945969, - "grad_norm": 0.8034060597419739, - "learning_rate": 0.00019319469884710697, - "loss": 3.2973, + "epoch": 6.797843665768194, + "grad_norm": 0.7585256099700928, + "learning_rate": 0.00019248569886670265, + "loss": 3.2818, "step": 63050 }, { - "epoch": 6.7915186739855775, - "grad_norm": 0.7926182150840759, - "learning_rate": 0.00019287145781704556, - "loss": 3.2821, + "epoch": 6.803234501347709, + "grad_norm": 0.7767728567123413, + "learning_rate": 0.00019216189962223418, + "loss": 3.2819, "step": 63100 }, { - "epoch": 6.796900226025185, - "grad_norm": 0.7208576798439026, - "learning_rate": 0.00019254821678698416, - "loss": 3.283, + "epoch": 6.808625336927224, + "grad_norm": 0.7501540184020996, + "learning_rate": 0.00019183810037776575, + "loss": 3.2829, "step": 63150 }, { - "epoch": 6.802281778064794, - "grad_norm": 0.7681335210800171, - "learning_rate": 0.00019222497575692272, - "loss": 3.2823, + "epoch": 6.814016172506738, + "grad_norm": 0.7633841037750244, + "learning_rate": 0.00019151430113329733, + "loss": 3.2847, "step": 63200 }, { - "epoch": 6.807663330104402, - "grad_norm": 0.7635934352874756, - "learning_rate": 0.00019190173472686132, - "loss": 3.3013, + "epoch": 6.819407008086253, + "grad_norm": 0.7208578586578369, + "learning_rate": 0.0001911905018888289, + "loss": 3.2791, "step": 63250 }, { - "epoch": 6.813044882144011, - "grad_norm": 0.7212764024734497, - "learning_rate": 0.0001915784936967999, - "loss": 3.299, + "epoch": 6.824797843665769, + "grad_norm": 0.7506759166717529, + "learning_rate": 0.00019086670264436049, + "loss": 3.269, "step": 63300 }, { - "epoch": 6.8184264341836185, - "grad_norm": 0.7291616201400757, - "learning_rate": 0.00019125525266673845, - "loss": 3.2863, + "epoch": 6.830188679245283, + "grad_norm": 0.7481265664100647, + "learning_rate": 0.00019054290339989206, + "loss": 3.2873, "step": 63350 }, { - "epoch": 6.823807986223227, - "grad_norm": 0.7780003547668457, - "learning_rate": 0.00019093201163667708, - "loss": 3.2753, + "epoch": 6.835579514824798, + "grad_norm": 0.7677626609802246, + "learning_rate": 0.00019021910415542364, + "loss": 3.277, "step": 63400 }, { - "epoch": 6.829189538262835, - "grad_norm": 0.7213042378425598, - "learning_rate": 0.00019060877060661564, - "loss": 3.2894, + "epoch": 6.840970350404312, + "grad_norm": 0.85811847448349, + "learning_rate": 0.0001898953049109552, + "loss": 3.2981, "step": 63450 }, { - "epoch": 6.834571090302443, - "grad_norm": 0.7561571002006531, - "learning_rate": 0.0001902855295765542, - "loss": 3.2773, + "epoch": 6.846361185983827, + "grad_norm": 0.7475490570068359, + "learning_rate": 0.00018957150566648677, + "loss": 3.2839, "step": 63500 }, { - "epoch": 6.839952642342052, - "grad_norm": 0.8355714678764343, - "learning_rate": 0.00018996228854649283, - "loss": 3.2936, + "epoch": 6.8517520215633425, + "grad_norm": 0.7986608147621155, + "learning_rate": 0.00018924770642201832, + "loss": 3.2864, "step": 63550 }, { - "epoch": 6.8453341943816595, - "grad_norm": 0.7631604075431824, - "learning_rate": 0.0001896390475164314, - "loss": 3.2791, + "epoch": 6.857142857142857, + "grad_norm": 0.8158076405525208, + "learning_rate": 0.0001889239071775499, + "loss": 3.28, "step": 63600 }, { - "epoch": 6.850715746421268, - "grad_norm": 0.81952303647995, - "learning_rate": 0.0001893222713069712, - "loss": 3.2909, + "epoch": 6.862533692722372, + "grad_norm": 0.7751656174659729, + "learning_rate": 0.00018860010793308148, + "loss": 3.2886, "step": 63650 }, { - "epoch": 6.856097298460876, - "grad_norm": 0.7389697432518005, - "learning_rate": 0.0001889990302769098, - "loss": 3.292, + "epoch": 6.867924528301887, + "grad_norm": 0.7646748423576355, + "learning_rate": 0.00018827630868861305, + "loss": 3.2876, "step": 63700 }, { - "epoch": 6.861478850500484, - "grad_norm": 0.7638382315635681, - "learning_rate": 0.0001886757892468484, - "loss": 3.2677, + "epoch": 6.873315363881401, + "grad_norm": 0.7480393648147583, + "learning_rate": 0.0001879525094441446, + "loss": 3.2891, "step": 63750 }, { - "epoch": 6.866860402540093, - "grad_norm": 0.7346477508544922, - "learning_rate": 0.00018835254821678696, - "loss": 3.2803, + "epoch": 6.878706199460916, + "grad_norm": 0.7870710492134094, + "learning_rate": 0.00018762871019967618, + "loss": 3.2732, "step": 63800 }, { - "epoch": 6.8722419545797, - "grad_norm": 0.7897524237632751, - "learning_rate": 0.00018802930718672553, - "loss": 3.2859, + "epoch": 6.884097035040432, + "grad_norm": 0.7654228210449219, + "learning_rate": 0.00018730491095520776, + "loss": 3.2743, "step": 63850 }, { - "epoch": 6.877623506619309, - "grad_norm": 0.791483461856842, - "learning_rate": 0.00018770606615666415, - "loss": 3.2651, + "epoch": 6.889487870619946, + "grad_norm": 0.7906088829040527, + "learning_rate": 0.00018698111171073934, + "loss": 3.3025, "step": 63900 }, { - "epoch": 6.8830050586589175, - "grad_norm": 0.7509163022041321, - "learning_rate": 0.00018738282512660272, - "loss": 3.2808, + "epoch": 6.894878706199461, + "grad_norm": 0.7547808289527893, + "learning_rate": 0.0001866573124662709, + "loss": 3.2894, "step": 63950 }, { - "epoch": 6.888386610698525, - "grad_norm": 0.8272793889045715, - "learning_rate": 0.0001870595840965413, - "loss": 3.2795, + "epoch": 6.900269541778976, + "grad_norm": 0.7568020224571228, + "learning_rate": 0.00018633351322180246, + "loss": 3.2983, "step": 64000 }, { - "epoch": 6.888386610698525, - "eval_accuracy": 0.3871891291481907, - "eval_loss": 3.356895923614502, - "eval_runtime": 184.5849, - "eval_samples_per_second": 97.576, - "eval_steps_per_second": 6.1, + "epoch": 6.900269541778976, + "eval_accuracy": 0.3869987692887397, + "eval_loss": 3.3588764667510986, + "eval_runtime": 183.5221, + "eval_samples_per_second": 98.141, + "eval_steps_per_second": 6.136, "step": 64000 }, { - "epoch": 6.893768162738134, - "grad_norm": 0.7592813968658447, - "learning_rate": 0.0001867363430664799, - "loss": 3.2996, + "epoch": 6.90566037735849, + "grad_norm": 0.7180277109146118, + "learning_rate": 0.00018600971397733404, + "loss": 3.282, "step": 64050 }, { - "epoch": 6.899149714777742, - "grad_norm": 0.7349041700363159, - "learning_rate": 0.00018641310203641848, - "loss": 3.2695, + "epoch": 6.9110512129380055, + "grad_norm": 0.8041961193084717, + "learning_rate": 0.0001856859147328656, + "loss": 3.2673, "step": 64100 }, { - "epoch": 6.90453126681735, - "grad_norm": 0.7781025767326355, - "learning_rate": 0.00018608986100635705, - "loss": 3.2748, + "epoch": 6.916442048517521, + "grad_norm": 0.732753574848175, + "learning_rate": 0.00018536859147328655, + "loss": 3.2776, "step": 64150 }, { - "epoch": 6.9099128188569585, - "grad_norm": 0.7788296937942505, - "learning_rate": 0.00018576661997629564, - "loss": 3.2818, + "epoch": 6.921832884097035, + "grad_norm": 0.8041583299636841, + "learning_rate": 0.00018504479222881812, + "loss": 3.2846, "step": 64200 }, { - "epoch": 6.915294370896566, - "grad_norm": 0.8603556752204895, - "learning_rate": 0.00018544337894623423, - "loss": 3.2798, + "epoch": 6.92722371967655, + "grad_norm": 0.7492255568504333, + "learning_rate": 0.0001847209929843497, + "loss": 3.2804, "step": 64250 }, { - "epoch": 6.920675922936175, - "grad_norm": 0.7542293667793274, - "learning_rate": 0.0001851201379161728, - "loss": 3.2944, + "epoch": 6.932614555256064, + "grad_norm": 0.8601753115653992, + "learning_rate": 0.00018439719373988125, + "loss": 3.2752, "step": 64300 }, { - "epoch": 6.926057474975783, - "grad_norm": 0.7522438764572144, - "learning_rate": 0.0001847968968861114, - "loss": 3.3009, + "epoch": 6.938005390835579, + "grad_norm": 0.7451220154762268, + "learning_rate": 0.00018407339449541283, + "loss": 3.2889, "step": 64350 }, { - "epoch": 6.931439027015391, - "grad_norm": 0.7493849992752075, - "learning_rate": 0.00018447365585604996, - "loss": 3.2968, + "epoch": 6.943396226415095, + "grad_norm": 0.778103768825531, + "learning_rate": 0.00018374959525094438, + "loss": 3.2795, "step": 64400 }, { - "epoch": 6.9368205790549995, - "grad_norm": 0.7522703409194946, - "learning_rate": 0.00018415041482598859, - "loss": 3.2705, + "epoch": 6.948787061994609, + "grad_norm": 0.7740045785903931, + "learning_rate": 0.00018342579600647596, + "loss": 3.2769, "step": 64450 }, { - "epoch": 6.942202131094608, - "grad_norm": 0.7539739012718201, - "learning_rate": 0.00018382717379592715, - "loss": 3.2899, + "epoch": 6.954177897574124, + "grad_norm": 0.737163782119751, + "learning_rate": 0.00018310199676200753, + "loss": 3.2916, "step": 64500 }, { - "epoch": 6.947583683134216, - "grad_norm": 0.8078305125236511, - "learning_rate": 0.00018350393276586572, - "loss": 3.3006, + "epoch": 6.959568733153639, + "grad_norm": 0.7356468439102173, + "learning_rate": 0.0001827781975175391, + "loss": 3.2844, "step": 64550 }, { - "epoch": 6.952965235173824, - "grad_norm": 0.7467326521873474, - "learning_rate": 0.00018318069173580434, - "loss": 3.2725, + "epoch": 6.964959568733153, + "grad_norm": 0.7236778736114502, + "learning_rate": 0.0001824543982730707, + "loss": 3.2871, "step": 64600 }, { - "epoch": 6.958346787213433, - "grad_norm": 0.7241435050964355, - "learning_rate": 0.0001828574507057429, - "loss": 3.29, + "epoch": 6.9703504043126685, + "grad_norm": 0.7288910150527954, + "learning_rate": 0.00018213059902860227, + "loss": 3.2924, "step": 64650 }, { - "epoch": 6.9637283392530405, - "grad_norm": 0.7517207860946655, - "learning_rate": 0.00018253420967568148, - "loss": 3.2657, + "epoch": 6.975741239892184, + "grad_norm": 0.7726614475250244, + "learning_rate": 0.00018180679978413382, + "loss": 3.2981, "step": 64700 }, { - "epoch": 6.969109891292649, - "grad_norm": 0.8167835474014282, - "learning_rate": 0.00018221096864562007, - "loss": 3.291, + "epoch": 6.981132075471698, + "grad_norm": 0.8178704977035522, + "learning_rate": 0.00018148300053966537, + "loss": 3.2902, "step": 64750 }, { - "epoch": 6.974491443332257, - "grad_norm": 0.7494839429855347, - "learning_rate": 0.00018188772761555867, - "loss": 3.3016, + "epoch": 6.986522911051213, + "grad_norm": 0.8220640420913696, + "learning_rate": 0.00018115920129519695, + "loss": 3.288, "step": 64800 }, { - "epoch": 6.979872995371865, - "grad_norm": 0.7137006521224976, - "learning_rate": 0.00018156448658549723, - "loss": 3.2964, + "epoch": 6.991913746630727, + "grad_norm": 0.7409705519676208, + "learning_rate": 0.00018083540205072852, + "loss": 3.2857, "step": 64850 }, { - "epoch": 6.985254547411474, - "grad_norm": 0.7802633047103882, - "learning_rate": 0.00018124124555543583, - "loss": 3.2759, + "epoch": 6.997304582210242, + "grad_norm": 0.79615318775177, + "learning_rate": 0.0001805116028062601, + "loss": 3.2914, "step": 64900 }, { - "epoch": 6.990636099451081, - "grad_norm": 0.7316725254058838, - "learning_rate": 0.0001809180045253744, - "loss": 3.2743, + "epoch": 7.002695417789758, + "grad_norm": 0.835076093673706, + "learning_rate": 0.00018018780356179168, + "loss": 3.2366, "step": 64950 }, { - "epoch": 6.99601765149069, - "grad_norm": 0.7353840470314026, - "learning_rate": 0.00018059476349531296, - "loss": 3.2994, + "epoch": 7.008086253369272, + "grad_norm": 0.794971764087677, + "learning_rate": 0.00017986400431732326, + "loss": 3.2014, "step": 65000 }, { - "epoch": 6.99601765149069, - "eval_accuracy": 0.38757897570966904, - "eval_loss": 3.3525335788726807, - "eval_runtime": 184.903, - "eval_samples_per_second": 97.408, - "eval_steps_per_second": 6.09, + "epoch": 7.008086253369272, + "eval_accuracy": 0.3871302392829952, + "eval_loss": 3.3600101470947266, + "eval_runtime": 183.7138, + "eval_samples_per_second": 98.038, + "eval_steps_per_second": 6.129, "step": 65000 }, { - "epoch": 7.0013992035302985, - "grad_norm": 0.7722090482711792, - "learning_rate": 0.00018027152246525159, - "loss": 3.2508, + "epoch": 7.013477088948787, + "grad_norm": 0.7871414422988892, + "learning_rate": 0.00017954020507285483, + "loss": 3.1982, "step": 65050 }, { - "epoch": 7.006780755569906, - "grad_norm": 0.745133638381958, - "learning_rate": 0.00017994828143519015, - "loss": 3.185, + "epoch": 7.018867924528302, + "grad_norm": 0.8027901649475098, + "learning_rate": 0.0001792164058283864, + "loss": 3.1985, "step": 65100 }, { - "epoch": 7.012162307609515, - "grad_norm": 0.7634187936782837, - "learning_rate": 0.00017962504040512872, - "loss": 3.185, + "epoch": 7.024258760107816, + "grad_norm": 0.767401933670044, + "learning_rate": 0.00017889260658391794, + "loss": 3.2055, "step": 65150 }, { - "epoch": 7.017543859649122, - "grad_norm": 0.7494111657142639, - "learning_rate": 0.00017930179937506734, - "loss": 3.21, + "epoch": 7.0296495956873315, + "grad_norm": 0.7891222834587097, + "learning_rate": 0.00017856880733944951, + "loss": 3.184, "step": 65200 }, { - "epoch": 7.022925411688731, - "grad_norm": 0.798454999923706, - "learning_rate": 0.0001789785583450059, - "loss": 3.2116, + "epoch": 7.035040431266847, + "grad_norm": 0.8046509027481079, + "learning_rate": 0.0001782450080949811, + "loss": 3.1978, "step": 65250 }, { - "epoch": 7.0283069637283395, - "grad_norm": 0.7529196739196777, - "learning_rate": 0.00017865531731494448, - "loss": 3.2022, + "epoch": 7.040431266846361, + "grad_norm": 0.791054904460907, + "learning_rate": 0.00017792120885051267, + "loss": 3.2013, "step": 65300 }, { - "epoch": 7.033688515767947, - "grad_norm": 0.7898669242858887, - "learning_rate": 0.00017833207628488307, - "loss": 3.1969, + "epoch": 7.045822102425876, + "grad_norm": 0.763992965221405, + "learning_rate": 0.00017759740960604425, + "loss": 3.1919, "step": 65350 }, { - "epoch": 7.039070067807556, - "grad_norm": 0.7746036052703857, - "learning_rate": 0.00017800883525482167, - "loss": 3.1942, + "epoch": 7.051212938005391, + "grad_norm": 0.889952540397644, + "learning_rate": 0.00017727361036157582, + "loss": 3.2239, "step": 65400 }, { - "epoch": 7.044451619847164, - "grad_norm": 0.8117377161979675, - "learning_rate": 0.00017768559422476026, - "loss": 3.2071, + "epoch": 7.056603773584905, + "grad_norm": 0.7865025401115417, + "learning_rate": 0.00017694981111710737, + "loss": 3.2114, "step": 65450 }, { - "epoch": 7.049833171886772, - "grad_norm": 0.7499861121177673, - "learning_rate": 0.00017736235319469883, - "loss": 3.2166, + "epoch": 7.061994609164421, + "grad_norm": 0.7957040667533875, + "learning_rate": 0.00017662601187263895, + "loss": 3.202, "step": 65500 }, { - "epoch": 7.0552147239263805, - "grad_norm": 0.7656523585319519, - "learning_rate": 0.0001770391121646374, - "loss": 3.1925, + "epoch": 7.067385444743936, + "grad_norm": 0.7598467469215393, + "learning_rate": 0.0001763022126281705, + "loss": 3.2151, "step": 65550 }, { - "epoch": 7.060596275965988, - "grad_norm": 0.7908286452293396, - "learning_rate": 0.00017671587113457602, - "loss": 3.2114, + "epoch": 7.07277628032345, + "grad_norm": 0.7556450963020325, + "learning_rate": 0.00017597841338370208, + "loss": 3.1908, "step": 65600 }, { - "epoch": 7.065977828005597, - "grad_norm": 0.7967495322227478, - "learning_rate": 0.0001763926301045146, - "loss": 3.2132, + "epoch": 7.078167115902965, + "grad_norm": 0.7661687135696411, + "learning_rate": 0.00017565461413923366, + "loss": 3.215, "step": 65650 }, { - "epoch": 7.071359380045205, - "grad_norm": 0.7433943748474121, - "learning_rate": 0.00017606938907445315, - "loss": 3.2086, + "epoch": 7.083557951482479, + "grad_norm": 0.7638933658599854, + "learning_rate": 0.00017533081489476524, + "loss": 3.2194, "step": 65700 }, { - "epoch": 7.076740932084813, - "grad_norm": 0.7751272320747375, - "learning_rate": 0.00017574614804439178, - "loss": 3.2388, + "epoch": 7.0889487870619945, + "grad_norm": 0.8166995644569397, + "learning_rate": 0.0001750070156502968, + "loss": 3.2183, "step": 65750 }, { - "epoch": 7.0821224841244215, - "grad_norm": 0.7907826900482178, - "learning_rate": 0.00017542290701433034, - "loss": 3.2292, + "epoch": 7.09433962264151, + "grad_norm": 0.832233190536499, + "learning_rate": 0.00017468321640582836, + "loss": 3.2143, "step": 65800 }, { - "epoch": 7.08750403616403, - "grad_norm": 0.7536311745643616, - "learning_rate": 0.0001750996659842689, - "loss": 3.2051, + "epoch": 7.099730458221024, + "grad_norm": 0.7524721026420593, + "learning_rate": 0.00017435941716135994, + "loss": 3.198, "step": 65850 }, { - "epoch": 7.092885588203638, - "grad_norm": 0.8473614454269409, - "learning_rate": 0.0001747764249542075, - "loss": 3.1845, + "epoch": 7.105121293800539, + "grad_norm": 0.7654711008071899, + "learning_rate": 0.00017403561791689152, + "loss": 3.2277, "step": 65900 }, { - "epoch": 7.098267140243246, - "grad_norm": 0.764302134513855, - "learning_rate": 0.0001744531839241461, - "loss": 3.193, + "epoch": 7.110512129380054, + "grad_norm": 0.7651508450508118, + "learning_rate": 0.0001737118186724231, + "loss": 3.1887, "step": 65950 }, { - "epoch": 7.103648692282855, - "grad_norm": 0.7599833607673645, - "learning_rate": 0.00017412994289408467, - "loss": 3.2226, + "epoch": 7.115902964959568, + "grad_norm": 0.7957179546356201, + "learning_rate": 0.00017338801942795465, + "loss": 3.2214, "step": 66000 }, { - "epoch": 7.103648692282855, - "eval_accuracy": 0.38751063304323374, - "eval_loss": 3.3595468997955322, - "eval_runtime": 184.5489, - "eval_samples_per_second": 97.595, - "eval_steps_per_second": 6.101, + "epoch": 7.115902964959568, + "eval_accuracy": 0.3870713494177998, + "eval_loss": 3.360347032546997, + "eval_runtime": 183.8356, + "eval_samples_per_second": 97.973, + "eval_steps_per_second": 6.125, "step": 66000 }, { - "epoch": 7.109030244322462, - "grad_norm": 0.7426865100860596, - "learning_rate": 0.00017380670186402326, - "loss": 3.2087, + "epoch": 7.121293800539084, + "grad_norm": 0.7835813164710999, + "learning_rate": 0.00017306422018348623, + "loss": 3.2142, "step": 66050 }, { - "epoch": 7.114411796362071, - "grad_norm": 0.7286989688873291, - "learning_rate": 0.00017348346083396183, - "loss": 3.2345, + "epoch": 7.126684636118599, + "grad_norm": 0.7825093865394592, + "learning_rate": 0.00017274042093901778, + "loss": 3.2036, "step": 66100 }, { - "epoch": 7.119793348401679, - "grad_norm": 0.7323970794677734, - "learning_rate": 0.00017316021980390042, - "loss": 3.2154, + "epoch": 7.132075471698113, + "grad_norm": 0.7593418955802917, + "learning_rate": 0.00017241662169454935, + "loss": 3.2382, "step": 66150 }, { - "epoch": 7.125174900441287, - "grad_norm": 0.8056437373161316, - "learning_rate": 0.00017283697877383902, - "loss": 3.2244, + "epoch": 7.137466307277628, + "grad_norm": 0.8279274702072144, + "learning_rate": 0.00017209282245008093, + "loss": 3.2337, "step": 66200 }, { - "epoch": 7.130556452480896, - "grad_norm": 0.8104016184806824, - "learning_rate": 0.0001725137377437776, - "loss": 3.2237, + "epoch": 7.142857142857143, + "grad_norm": 0.7875446677207947, + "learning_rate": 0.0001717690232056125, + "loss": 3.223, "step": 66250 }, { - "epoch": 7.135938004520503, - "grad_norm": 0.7950676679611206, - "learning_rate": 0.00017219049671371615, - "loss": 3.2161, + "epoch": 7.1482479784366575, + "grad_norm": 0.822761058807373, + "learning_rate": 0.0001714452239611441, + "loss": 3.2112, "step": 66300 }, { - "epoch": 7.141319556560112, - "grad_norm": 0.7405356764793396, - "learning_rate": 0.00017186725568365478, - "loss": 3.2312, + "epoch": 7.153638814016173, + "grad_norm": 0.7847926616668701, + "learning_rate": 0.000171127900701565, + "loss": 3.2183, "step": 66350 }, { - "epoch": 7.1467011085997205, - "grad_norm": 0.7578369379043579, - "learning_rate": 0.00017154401465359334, - "loss": 3.2037, + "epoch": 7.159029649595688, + "grad_norm": 0.7854703664779663, + "learning_rate": 0.0001708041014570966, + "loss": 3.2161, "step": 66400 }, { - "epoch": 7.152082660639328, - "grad_norm": 0.7615239024162292, - "learning_rate": 0.00017122077362353194, - "loss": 3.2061, + "epoch": 7.164420485175202, + "grad_norm": 3.3990447521209717, + "learning_rate": 0.00017048030221262814, + "loss": 3.2148, "step": 66450 }, { - "epoch": 7.157464212678937, - "grad_norm": 0.7853385210037231, - "learning_rate": 0.00017089753259347053, - "loss": 3.2221, + "epoch": 7.169811320754717, + "grad_norm": 0.7549009323120117, + "learning_rate": 0.00017015650296815972, + "loss": 3.2194, "step": 66500 }, { - "epoch": 7.162845764718545, - "grad_norm": 0.7812248468399048, - "learning_rate": 0.0001705742915634091, - "loss": 3.2089, + "epoch": 7.175202156334231, + "grad_norm": 0.764918863773346, + "learning_rate": 0.0001698327037236913, + "loss": 3.2173, "step": 66550 }, { - "epoch": 7.168227316758153, - "grad_norm": 0.79538494348526, - "learning_rate": 0.0001702510505333477, - "loss": 3.2266, + "epoch": 7.180592991913747, + "grad_norm": 0.7957922220230103, + "learning_rate": 0.00016950890447922287, + "loss": 3.2251, "step": 66600 }, { - "epoch": 7.1736088687977615, - "grad_norm": 0.7920806407928467, - "learning_rate": 0.00016992780950328626, - "loss": 3.2159, + "epoch": 7.185983827493262, + "grad_norm": 0.7517008185386658, + "learning_rate": 0.00016918510523475445, + "loss": 3.224, "step": 66650 }, { - "epoch": 7.178990420837369, - "grad_norm": 0.8788477182388306, - "learning_rate": 0.00016960456847322486, - "loss": 3.2352, + "epoch": 7.191374663072776, + "grad_norm": 0.7847917079925537, + "learning_rate": 0.00016886130599028603, + "loss": 3.2224, "step": 66700 }, { - "epoch": 7.184371972876978, - "grad_norm": 0.787079930305481, - "learning_rate": 0.00016928132744316345, - "loss": 3.2172, + "epoch": 7.196765498652291, + "grad_norm": 0.8399714231491089, + "learning_rate": 0.00016853750674581755, + "loss": 3.2064, "step": 66750 }, { - "epoch": 7.189753524916586, - "grad_norm": 0.7809692025184631, - "learning_rate": 0.00016895808641310202, - "loss": 3.2311, + "epoch": 7.202156334231806, + "grad_norm": 0.7885647416114807, + "learning_rate": 0.00016821370750134913, + "loss": 3.215, "step": 66800 }, { - "epoch": 7.195135076956194, - "grad_norm": 0.7488144636154175, - "learning_rate": 0.0001686348453830406, - "loss": 3.2273, + "epoch": 7.2075471698113205, + "grad_norm": 0.7742317914962769, + "learning_rate": 0.0001678899082568807, + "loss": 3.2136, "step": 66850 }, { - "epoch": 7.2005166289958025, - "grad_norm": 0.7899309396743774, - "learning_rate": 0.0001683116043529792, - "loss": 3.2404, + "epoch": 7.212938005390836, + "grad_norm": 0.7558757662773132, + "learning_rate": 0.00016756610901241228, + "loss": 3.2134, "step": 66900 }, { - "epoch": 7.205898181035411, - "grad_norm": 0.7522909045219421, - "learning_rate": 0.00016798836332291778, - "loss": 3.2386, + "epoch": 7.218328840970351, + "grad_norm": 0.8128474950790405, + "learning_rate": 0.00016724230976794386, + "loss": 3.2153, "step": 66950 }, { - "epoch": 7.211279733075019, - "grad_norm": 0.7979075312614441, - "learning_rate": 0.00016766512229285634, - "loss": 3.236, + "epoch": 7.223719676549865, + "grad_norm": 0.7827127575874329, + "learning_rate": 0.00016691851052347544, + "loss": 3.2243, "step": 67000 }, { - "epoch": 7.211279733075019, - "eval_accuracy": 0.3877005582911335, - "eval_loss": 3.3552801609039307, - "eval_runtime": 184.2064, - "eval_samples_per_second": 97.776, - "eval_steps_per_second": 6.113, + "epoch": 7.223719676549865, + "eval_accuracy": 0.38756995751998047, + "eval_loss": 3.358355760574341, + "eval_runtime": 183.444, + "eval_samples_per_second": 98.183, + "eval_steps_per_second": 6.138, "step": 67000 }, { - "epoch": 7.216661285114627, - "grad_norm": 0.7617997527122498, - "learning_rate": 0.00016734188126279494, - "loss": 3.2404, + "epoch": 7.22911051212938, + "grad_norm": 0.8113185167312622, + "learning_rate": 0.00016659471127900702, + "loss": 3.2343, "step": 67050 }, { - "epoch": 7.222042837154235, - "grad_norm": 0.7634185552597046, - "learning_rate": 0.00016701864023273353, - "loss": 3.2352, + "epoch": 7.234501347708895, + "grad_norm": 0.7938553690910339, + "learning_rate": 0.0001662709120345386, + "loss": 3.2358, "step": 67100 }, { - "epoch": 7.2274243891938434, - "grad_norm": 0.8232079148292542, - "learning_rate": 0.0001666953992026721, - "loss": 3.2292, + "epoch": 7.2398921832884096, + "grad_norm": 0.8004481196403503, + "learning_rate": 0.00016594711279007015, + "loss": 3.1933, "step": 67150 }, { - "epoch": 7.232805941233452, - "grad_norm": 0.8213094472885132, - "learning_rate": 0.0001663721581726107, - "loss": 3.2231, + "epoch": 7.245283018867925, + "grad_norm": 0.8019503951072693, + "learning_rate": 0.0001656233135456017, + "loss": 3.2262, "step": 67200 }, { - "epoch": 7.23818749327306, - "grad_norm": 0.7654265761375427, - "learning_rate": 0.00016604891714254926, - "loss": 3.2287, + "epoch": 7.250673854447439, + "grad_norm": 0.756632387638092, + "learning_rate": 0.00016529951430113327, + "loss": 3.2188, "step": 67250 }, { - "epoch": 7.243569045312668, - "grad_norm": 0.7495639324188232, - "learning_rate": 0.00016572567611248786, - "loss": 3.2287, + "epoch": 7.256064690026954, + "grad_norm": 0.7605134844779968, + "learning_rate": 0.00016497571505666485, + "loss": 3.2257, "step": 67300 }, { - "epoch": 7.248950597352277, - "grad_norm": 0.7579506039619446, - "learning_rate": 0.00016540243508242645, - "loss": 3.2304, + "epoch": 7.261455525606469, + "grad_norm": 0.7907736897468567, + "learning_rate": 0.00016465191581219643, + "loss": 3.241, "step": 67350 }, { - "epoch": 7.254332149391884, - "grad_norm": 0.7823368906974792, - "learning_rate": 0.00016507919405236502, - "loss": 3.209, + "epoch": 7.2668463611859835, + "grad_norm": 0.7794830799102783, + "learning_rate": 0.000164328116567728, + "loss": 3.2353, "step": 67400 }, { - "epoch": 7.259713701431493, - "grad_norm": 0.7844114303588867, - "learning_rate": 0.00016475595302230364, - "loss": 3.2216, + "epoch": 7.272237196765499, + "grad_norm": 0.8242986798286438, + "learning_rate": 0.00016400431732325956, + "loss": 3.2464, "step": 67450 }, { - "epoch": 7.265095253471101, - "grad_norm": 0.7908133864402771, - "learning_rate": 0.0001644327119922422, - "loss": 3.2328, + "epoch": 7.277628032345014, + "grad_norm": 0.7823339700698853, + "learning_rate": 0.00016368051807879114, + "loss": 3.2166, "step": 67500 }, { - "epoch": 7.270476805510709, - "grad_norm": 0.8082903623580933, - "learning_rate": 0.00016410947096218078, - "loss": 3.2432, + "epoch": 7.283018867924528, + "grad_norm": 0.7602329254150391, + "learning_rate": 0.0001633567188343227, + "loss": 3.2096, "step": 67550 }, { - "epoch": 7.275858357550318, - "grad_norm": 0.7870227694511414, - "learning_rate": 0.00016378622993211937, - "loss": 3.2239, + "epoch": 7.288409703504043, + "grad_norm": 0.7734456062316895, + "learning_rate": 0.00016303291958985426, + "loss": 3.2457, "step": 67600 }, { - "epoch": 7.281239909589925, - "grad_norm": 0.7916932702064514, - "learning_rate": 0.00016346298890205797, - "loss": 3.2237, + "epoch": 7.293800539083558, + "grad_norm": 0.8424938917160034, + "learning_rate": 0.00016270912034538584, + "loss": 3.2314, "step": 67650 }, { - "epoch": 7.286621461629534, - "grad_norm": 0.7457194924354553, - "learning_rate": 0.00016314621269259777, - "loss": 3.2412, + "epoch": 7.2991913746630726, + "grad_norm": 0.7849023342132568, + "learning_rate": 0.00016238532110091742, + "loss": 3.2246, "step": 67700 }, { - "epoch": 7.2920030136691425, - "grad_norm": 0.8166563510894775, - "learning_rate": 0.00016282297166253634, - "loss": 3.2098, + "epoch": 7.304582210242588, + "grad_norm": 0.7933061122894287, + "learning_rate": 0.000162061521856449, + "loss": 3.2295, "step": 67750 }, { - "epoch": 7.29738456570875, - "grad_norm": 0.8328801393508911, - "learning_rate": 0.00016249973063247494, - "loss": 3.2359, + "epoch": 7.309973045822103, + "grad_norm": 0.8396109938621521, + "learning_rate": 0.00016173772261198055, + "loss": 3.2416, "step": 67800 }, { - "epoch": 7.302766117748359, - "grad_norm": 0.777811586856842, - "learning_rate": 0.00016217648960241353, - "loss": 3.2321, + "epoch": 7.315363881401617, + "grad_norm": 0.8306365013122559, + "learning_rate": 0.00016141392336751212, + "loss": 3.2286, "step": 67850 }, { - "epoch": 7.308147669787967, - "grad_norm": 0.8480938673019409, - "learning_rate": 0.0001618532485723521, - "loss": 3.2132, + "epoch": 7.320754716981132, + "grad_norm": 0.8018426895141602, + "learning_rate": 0.0001610901241230437, + "loss": 3.2235, "step": 67900 }, { - "epoch": 7.313529221827575, - "grad_norm": 0.7613167762756348, - "learning_rate": 0.00016153000754229067, - "loss": 3.235, + "epoch": 7.3261455525606465, + "grad_norm": 0.7814752459526062, + "learning_rate": 0.00016076632487857528, + "loss": 3.2307, "step": 67950 }, { - "epoch": 7.3189107738671835, - "grad_norm": 0.7387501001358032, - "learning_rate": 0.0001612067665122293, - "loss": 3.2177, + "epoch": 7.331536388140162, + "grad_norm": 0.8125154376029968, + "learning_rate": 0.00016044252563410686, + "loss": 3.226, "step": 68000 }, { - "epoch": 7.3189107738671835, - "eval_accuracy": 0.3881735243117899, - "eval_loss": 3.3549270629882812, - "eval_runtime": 183.9194, - "eval_samples_per_second": 97.929, - "eval_steps_per_second": 6.122, + "epoch": 7.331536388140162, + "eval_accuracy": 0.38801782472355173, + "eval_loss": 3.355308771133423, + "eval_runtime": 183.676, + "eval_samples_per_second": 98.059, + "eval_steps_per_second": 6.13, "step": 68000 }, { - "epoch": 7.324292325906791, - "grad_norm": 0.8005468845367432, - "learning_rate": 0.00016088352548216785, - "loss": 3.224, + "epoch": 7.336927223719677, + "grad_norm": 0.7748324275016785, + "learning_rate": 0.0001601187263896384, + "loss": 3.223, "step": 68050 }, { - "epoch": 7.3296738779464, - "grad_norm": 0.7966600060462952, - "learning_rate": 0.00016056028445210642, - "loss": 3.2397, + "epoch": 7.342318059299191, + "grad_norm": 0.7646241188049316, + "learning_rate": 0.00015979492714516996, + "loss": 3.2285, "step": 68100 }, { - "epoch": 7.335055429986008, - "grad_norm": 0.727301836013794, - "learning_rate": 0.00016023704342204504, - "loss": 3.2308, + "epoch": 7.347708894878706, + "grad_norm": 0.7891027331352234, + "learning_rate": 0.00015947112790070154, + "loss": 3.2304, "step": 68150 }, { - "epoch": 7.340436982025616, - "grad_norm": 0.7933337688446045, - "learning_rate": 0.0001599138023919836, - "loss": 3.2197, + "epoch": 7.353099730458221, + "grad_norm": 0.8173832297325134, + "learning_rate": 0.00015914732865623311, + "loss": 3.2261, "step": 68200 }, { - "epoch": 7.3458185340652244, - "grad_norm": 0.77813321352005, - "learning_rate": 0.0001595905613619222, - "loss": 3.2116, + "epoch": 7.3584905660377355, + "grad_norm": 0.8146997690200806, + "learning_rate": 0.0001588235294117647, + "loss": 3.2288, "step": 68250 }, { - "epoch": 7.351200086104833, - "grad_norm": 0.8122656345367432, - "learning_rate": 0.00015926732033186077, - "loss": 3.2054, + "epoch": 7.363881401617251, + "grad_norm": 0.8188638687133789, + "learning_rate": 0.00015849973016729627, + "loss": 3.2446, "step": 68300 }, { - "epoch": 7.356581638144441, - "grad_norm": 0.7639948725700378, - "learning_rate": 0.00015894407930179934, - "loss": 3.2372, + "epoch": 7.369272237196766, + "grad_norm": 0.7957025170326233, + "learning_rate": 0.00015817593092282785, + "loss": 3.2231, "step": 68350 }, { - "epoch": 7.361963190184049, - "grad_norm": 0.7615376710891724, - "learning_rate": 0.00015862083827173796, - "loss": 3.2182, + "epoch": 7.37466307277628, + "grad_norm": 0.7923418879508972, + "learning_rate": 0.00015785213167835942, + "loss": 3.2206, "step": 68400 }, { - "epoch": 7.367344742223658, - "grad_norm": 0.7874367833137512, - "learning_rate": 0.00015829759724167653, - "loss": 3.2262, + "epoch": 7.380053908355795, + "grad_norm": 0.839530885219574, + "learning_rate": 0.00015752833243389095, + "loss": 3.2353, "step": 68450 }, { - "epoch": 7.372726294263265, - "grad_norm": 0.7851904630661011, - "learning_rate": 0.0001579743562116151, - "loss": 3.2176, + "epoch": 7.38544474393531, + "grad_norm": 0.9345459938049316, + "learning_rate": 0.00015720453318942253, + "loss": 3.2093, "step": 68500 }, { - "epoch": 7.378107846302874, - "grad_norm": 0.7756659388542175, - "learning_rate": 0.00015765111518155372, - "loss": 3.2207, + "epoch": 7.390835579514825, + "grad_norm": 0.782017707824707, + "learning_rate": 0.0001568807339449541, + "loss": 3.201, "step": 68550 }, { - "epoch": 7.383489398342482, - "grad_norm": 0.7584941387176514, - "learning_rate": 0.0001573278741514923, - "loss": 3.2037, + "epoch": 7.39622641509434, + "grad_norm": 0.803848147392273, + "learning_rate": 0.00015655693470048568, + "loss": 3.2296, "step": 68600 }, { - "epoch": 7.38887095038209, - "grad_norm": 0.7656226754188538, - "learning_rate": 0.00015700463312143085, - "loss": 3.241, + "epoch": 7.401617250673855, + "grad_norm": 0.8133928775787354, + "learning_rate": 0.00015623313545601726, + "loss": 3.2408, "step": 68650 }, { - "epoch": 7.394252502421699, - "grad_norm": 0.814250648021698, - "learning_rate": 0.00015668139209136945, - "loss": 3.2298, + "epoch": 7.407008086253369, + "grad_norm": 0.7830129861831665, + "learning_rate": 0.00015590933621154884, + "loss": 3.2185, "step": 68700 }, { - "epoch": 7.399634054461306, - "grad_norm": 0.8023512363433838, - "learning_rate": 0.00015635815106130804, - "loss": 3.2192, + "epoch": 7.412398921832884, + "grad_norm": 0.8253718018531799, + "learning_rate": 0.00015558553696708041, + "loss": 3.2344, "step": 68750 }, { - "epoch": 7.405015606500915, - "grad_norm": 0.845595121383667, - "learning_rate": 0.0001560349100312466, - "loss": 3.2426, + "epoch": 7.4177897574123985, + "grad_norm": 0.8411740660667419, + "learning_rate": 0.00015526173772261196, + "loss": 3.225, "step": 68800 }, { - "epoch": 7.4103971585405235, - "grad_norm": 0.7891848087310791, - "learning_rate": 0.0001557116690011852, - "loss": 3.214, + "epoch": 7.423180592991914, + "grad_norm": 0.8184799551963806, + "learning_rate": 0.00015493793847814354, + "loss": 3.2399, "step": 68850 }, { - "epoch": 7.415778710580131, - "grad_norm": 0.8064113855361938, - "learning_rate": 0.00015538842797112377, - "loss": 3.2318, + "epoch": 7.428571428571429, + "grad_norm": 0.8034173250198364, + "learning_rate": 0.0001546141392336751, + "loss": 3.2341, "step": 68900 }, { - "epoch": 7.42116026261974, - "grad_norm": 0.7896233797073364, - "learning_rate": 0.00015506518694106237, - "loss": 3.2226, + "epoch": 7.433962264150943, + "grad_norm": 0.8255000114440918, + "learning_rate": 0.00015429033998920667, + "loss": 3.2394, "step": 68950 }, { - "epoch": 7.426541814659347, - "grad_norm": 0.7546834945678711, - "learning_rate": 0.00015474194591100096, - "loss": 3.2373, + "epoch": 7.439353099730458, + "grad_norm": 0.8404207825660706, + "learning_rate": 0.00015396654074473825, + "loss": 3.2498, "step": 69000 }, { - "epoch": 7.426541814659347, - "eval_accuracy": 0.3881545100564224, - "eval_loss": 3.350152015686035, - "eval_runtime": 183.5756, - "eval_samples_per_second": 98.112, - "eval_steps_per_second": 6.134, + "epoch": 7.439353099730458, + "eval_accuracy": 0.3881027912818227, + "eval_loss": 3.3517191410064697, + "eval_runtime": 183.9439, + "eval_samples_per_second": 97.916, + "eval_steps_per_second": 6.121, "step": 69000 }, { - "epoch": 7.431923366698956, - "grad_norm": 0.8021217584609985, - "learning_rate": 0.00015441870488093953, - "loss": 3.2293, + "epoch": 7.444743935309973, + "grad_norm": 0.8004981279373169, + "learning_rate": 0.00015364274150026983, + "loss": 3.2569, "step": 69050 }, { - "epoch": 7.4373049187385645, - "grad_norm": 0.7711283564567566, - "learning_rate": 0.0001540954638508781, - "loss": 3.2068, + "epoch": 7.450134770889488, + "grad_norm": 0.8153196573257446, + "learning_rate": 0.0001533189422558014, + "loss": 3.2388, "step": 69100 }, { - "epoch": 7.442686470778172, - "grad_norm": 0.7406951189041138, - "learning_rate": 0.00015377222282081672, - "loss": 3.2363, + "epoch": 7.455525606469003, + "grad_norm": 0.8217753767967224, + "learning_rate": 0.00015299514301133295, + "loss": 3.2183, "step": 69150 }, { - "epoch": 7.448068022817781, - "grad_norm": 0.7828607559204102, - "learning_rate": 0.0001534489817907553, - "loss": 3.23, + "epoch": 7.460916442048518, + "grad_norm": 0.8331478834152222, + "learning_rate": 0.00015267134376686453, + "loss": 3.2407, "step": 69200 }, { - "epoch": 7.453449574857389, - "grad_norm": 0.7775204181671143, - "learning_rate": 0.00015312574076069388, - "loss": 3.2188, + "epoch": 7.466307277628032, + "grad_norm": 0.8319252133369446, + "learning_rate": 0.0001523475445223961, + "loss": 3.234, "step": 69250 }, { - "epoch": 7.458831126896997, - "grad_norm": 0.8648480176925659, - "learning_rate": 0.00015280249973063248, - "loss": 3.2287, + "epoch": 7.471698113207547, + "grad_norm": 0.8192378878593445, + "learning_rate": 0.00015202374527792766, + "loss": 3.2284, "step": 69300 }, { - "epoch": 7.4642126789366054, - "grad_norm": 0.7523376941680908, - "learning_rate": 0.00015247925870057104, - "loss": 3.2364, + "epoch": 7.4770889487870615, + "grad_norm": 0.8338528871536255, + "learning_rate": 0.00015169994603345924, + "loss": 3.229, "step": 69350 }, { - "epoch": 7.469594230976213, - "grad_norm": 0.7940405607223511, - "learning_rate": 0.00015215601767050964, - "loss": 3.2329, + "epoch": 7.482479784366577, + "grad_norm": 0.8317149877548218, + "learning_rate": 0.00015137614678899082, + "loss": 3.2566, "step": 69400 }, { - "epoch": 7.474975783015822, - "grad_norm": 0.8218305706977844, - "learning_rate": 0.0001518327766404482, - "loss": 3.2279, + "epoch": 7.487870619946092, + "grad_norm": 0.8038789629936218, + "learning_rate": 0.00015105234754452237, + "loss": 3.2393, "step": 69450 }, { - "epoch": 7.48035733505543, - "grad_norm": 0.8601292967796326, - "learning_rate": 0.0001515095356103868, - "loss": 3.2287, + "epoch": 7.493261455525606, + "grad_norm": 0.789286732673645, + "learning_rate": 0.00015072854830005394, + "loss": 3.2284, "step": 69500 }, { - "epoch": 7.485738887095038, - "grad_norm": 0.7894765138626099, - "learning_rate": 0.0001511862945803254, - "loss": 3.2431, + "epoch": 7.498652291105121, + "grad_norm": 0.8185145258903503, + "learning_rate": 0.00015040474905558552, + "loss": 3.2413, "step": 69550 }, { - "epoch": 7.491120439134646, - "grad_norm": 0.7626713514328003, - "learning_rate": 0.00015086305355026396, - "loss": 3.239, + "epoch": 7.504043126684636, + "grad_norm": 0.8178480267524719, + "learning_rate": 0.0001500809498111171, + "loss": 3.2342, "step": 69600 }, { - "epoch": 7.496501991174255, - "grad_norm": 0.7660843133926392, - "learning_rate": 0.00015053981252020253, - "loss": 3.2452, + "epoch": 7.509433962264151, + "grad_norm": 0.7696956992149353, + "learning_rate": 0.00014975715056664865, + "loss": 3.2172, "step": 69650 }, { - "epoch": 7.501883543213863, - "grad_norm": 0.7898152470588684, - "learning_rate": 0.00015021657149014115, - "loss": 3.2364, + "epoch": 7.514824797843666, + "grad_norm": 0.792907178401947, + "learning_rate": 0.00014943335132218023, + "loss": 3.2548, "step": 69700 }, { - "epoch": 7.507265095253471, - "grad_norm": 0.7465683221817017, - "learning_rate": 0.00014989333046007972, - "loss": 3.2378, + "epoch": 7.520215633423181, + "grad_norm": 0.7962068915367126, + "learning_rate": 0.0001491095520777118, + "loss": 3.2337, "step": 69750 }, { - "epoch": 7.51264664729308, - "grad_norm": 0.8163168430328369, - "learning_rate": 0.00014957008943001832, - "loss": 3.2374, + "epoch": 7.525606469002695, + "grad_norm": 0.8143125176429749, + "learning_rate": 0.00014878575283324338, + "loss": 3.2427, "step": 69800 }, { - "epoch": 7.518028199332687, - "grad_norm": 0.7777751684188843, - "learning_rate": 0.00014925331322055812, - "loss": 3.2407, + "epoch": 7.53099730458221, + "grad_norm": 0.7968950271606445, + "learning_rate": 0.00014846195358877493, + "loss": 3.227, "step": 69850 }, { - "epoch": 7.523409751372296, - "grad_norm": 0.7856535911560059, - "learning_rate": 0.0001489300721904967, - "loss": 3.2332, + "epoch": 7.536388140161725, + "grad_norm": 0.8375698328018188, + "learning_rate": 0.0001481381543443065, + "loss": 3.2473, "step": 69900 }, { - "epoch": 7.528791303411904, - "grad_norm": 0.8025381565093994, - "learning_rate": 0.00014860683116043528, - "loss": 3.2268, + "epoch": 7.54177897574124, + "grad_norm": 0.8069729208946228, + "learning_rate": 0.0001478143550998381, + "loss": 3.2388, "step": 69950 }, { - "epoch": 7.534172855451512, - "grad_norm": 0.768328070640564, - "learning_rate": 0.00014828359013037385, - "loss": 3.2271, + "epoch": 7.547169811320755, + "grad_norm": 0.8065479397773743, + "learning_rate": 0.00014749055585536967, + "loss": 3.2323, "step": 70000 }, { - "epoch": 7.534172855451512, - "eval_accuracy": 0.3889133418249187, - "eval_loss": 3.347104549407959, - "eval_runtime": 186.753, - "eval_samples_per_second": 96.443, - "eval_steps_per_second": 6.029, + "epoch": 7.547169811320755, + "eval_accuracy": 0.3886169367469608, + "eval_loss": 3.3482158184051514, + "eval_runtime": 183.4915, + "eval_samples_per_second": 98.157, + "eval_steps_per_second": 6.137, "step": 70000 }, { - "epoch": 7.539554407491121, - "grad_norm": 0.7838550209999084, - "learning_rate": 0.00014796034910031245, - "loss": 3.2358, + "epoch": 7.55256064690027, + "grad_norm": 0.7838699221611023, + "learning_rate": 0.00014716675661090122, + "loss": 3.2376, "step": 70050 }, { - "epoch": 7.544935959530728, - "grad_norm": 0.8263210654258728, - "learning_rate": 0.00014763710807025104, - "loss": 3.2473, + "epoch": 7.557951482479784, + "grad_norm": 0.8455720543861389, + "learning_rate": 0.0001468429573664328, + "loss": 3.246, "step": 70100 }, { - "epoch": 7.550317511570337, - "grad_norm": 0.7420403361320496, - "learning_rate": 0.0001473138670401896, - "loss": 3.2405, + "epoch": 7.563342318059299, + "grad_norm": 0.8139489889144897, + "learning_rate": 0.00014651915812196437, + "loss": 3.2397, "step": 70150 }, { - "epoch": 7.5556990636099455, - "grad_norm": 0.8509701490402222, - "learning_rate": 0.0001469906260101282, - "loss": 3.2202, + "epoch": 7.568733153638814, + "grad_norm": 0.8418183326721191, + "learning_rate": 0.00014619535887749595, + "loss": 3.2382, "step": 70200 }, { - "epoch": 7.561080615649553, - "grad_norm": 0.7671039700508118, - "learning_rate": 0.0001466673849800668, - "loss": 3.2382, + "epoch": 7.574123989218329, + "grad_norm": 0.7932751178741455, + "learning_rate": 0.00014587155963302753, + "loss": 3.2411, "step": 70250 }, { - "epoch": 7.566462167689162, - "grad_norm": 0.7579676508903503, - "learning_rate": 0.0001463441439500054, - "loss": 3.2368, + "epoch": 7.579514824797844, + "grad_norm": 0.990576446056366, + "learning_rate": 0.00014554776038855908, + "loss": 3.2422, "step": 70300 }, { - "epoch": 7.57184371972877, - "grad_norm": 0.7678229212760925, - "learning_rate": 0.00014602090291994396, - "loss": 3.2319, + "epoch": 7.584905660377358, + "grad_norm": 0.8364869356155396, + "learning_rate": 0.00014523043712898003, + "loss": 3.2565, "step": 70350 }, { - "epoch": 7.577225271768378, - "grad_norm": 0.7729119658470154, - "learning_rate": 0.00014569766188988255, - "loss": 3.2431, + "epoch": 7.590296495956873, + "grad_norm": 0.8097447752952576, + "learning_rate": 0.00014490663788451158, + "loss": 3.2301, "step": 70400 }, { - "epoch": 7.5826068238079865, - "grad_norm": 0.8020555973052979, - "learning_rate": 0.00014537442085982112, - "loss": 3.2512, + "epoch": 7.595687331536388, + "grad_norm": 0.8373438119888306, + "learning_rate": 0.00014458283864004316, + "loss": 3.2483, "step": 70450 }, { - "epoch": 7.587988375847594, - "grad_norm": 0.8227038383483887, - "learning_rate": 0.00014505117982975972, - "loss": 3.2352, + "epoch": 7.601078167115903, + "grad_norm": 0.813815712928772, + "learning_rate": 0.00014425903939557474, + "loss": 3.2568, "step": 70500 }, { - "epoch": 7.593369927887203, - "grad_norm": 0.7413498163223267, - "learning_rate": 0.00014472793879969828, - "loss": 3.226, + "epoch": 7.606469002695418, + "grad_norm": 0.8149926662445068, + "learning_rate": 0.00014393524015110631, + "loss": 3.2454, "step": 70550 }, { - "epoch": 7.598751479926811, - "grad_norm": 0.8386573791503906, - "learning_rate": 0.00014440469776963688, - "loss": 3.2471, + "epoch": 7.611859838274933, + "grad_norm": 0.866629958152771, + "learning_rate": 0.00014361144090663786, + "loss": 3.2318, "step": 70600 }, { - "epoch": 7.604133031966419, - "grad_norm": 0.7619134783744812, - "learning_rate": 0.00014408145673957545, - "loss": 3.213, + "epoch": 7.617250673854447, + "grad_norm": 0.8428345918655396, + "learning_rate": 0.00014328764166216944, + "loss": 3.2296, "step": 70650 }, { - "epoch": 7.609514584006027, - "grad_norm": 0.7510246634483337, - "learning_rate": 0.00014375821570951404, - "loss": 3.2313, + "epoch": 7.622641509433962, + "grad_norm": 0.8193896412849426, + "learning_rate": 0.00014296384241770102, + "loss": 3.2552, "step": 70700 }, { - "epoch": 7.614896136045635, - "grad_norm": 0.795792818069458, - "learning_rate": 0.00014343497467945264, - "loss": 3.2269, + "epoch": 7.628032345013477, + "grad_norm": 0.794182538986206, + "learning_rate": 0.0001426400431732326, + "loss": 3.2357, "step": 70750 }, { - "epoch": 7.620277688085244, - "grad_norm": 0.803733766078949, - "learning_rate": 0.00014311173364939123, - "loss": 3.2285, + "epoch": 7.633423180592992, + "grad_norm": 0.8212337493896484, + "learning_rate": 0.00014231624392876417, + "loss": 3.2446, "step": 70800 }, { - "epoch": 7.625659240124852, - "grad_norm": 0.7954763770103455, - "learning_rate": 0.0001427884926193298, - "loss": 3.2346, + "epoch": 7.638814016172507, + "grad_norm": 0.778418242931366, + "learning_rate": 0.00014199244468429573, + "loss": 3.2301, "step": 70850 }, { - "epoch": 7.63104079216446, - "grad_norm": 0.7582803964614868, - "learning_rate": 0.0001424652515892684, - "loss": 3.2259, + "epoch": 7.644204851752022, + "grad_norm": 0.847078263759613, + "learning_rate": 0.0001416686454398273, + "loss": 3.2439, "step": 70900 }, { - "epoch": 7.636422344204068, - "grad_norm": 0.8451318740844727, - "learning_rate": 0.000142142010559207, - "loss": 3.2392, + "epoch": 7.649595687331536, + "grad_norm": 0.786853551864624, + "learning_rate": 0.00014134484619535888, + "loss": 3.2243, "step": 70950 }, { - "epoch": 7.641803896243677, - "grad_norm": 0.7700170278549194, - "learning_rate": 0.00014181876952914555, - "loss": 3.2571, + "epoch": 7.654986522911051, + "grad_norm": 0.7910891771316528, + "learning_rate": 0.00014102104695089043, + "loss": 3.2291, "step": 71000 }, { - "epoch": 7.641803896243677, - "eval_accuracy": 0.38935566773121144, - "eval_loss": 3.3431193828582764, - "eval_runtime": 184.6174, - "eval_samples_per_second": 97.559, - "eval_steps_per_second": 6.099, + "epoch": 7.654986522911051, + "eval_accuracy": 0.3887969745820694, + "eval_loss": 3.3445234298706055, + "eval_runtime": 183.6631, + "eval_samples_per_second": 98.065, + "eval_steps_per_second": 6.131, "step": 71000 }, { - "epoch": 7.647185448283285, - "grad_norm": 0.7822245955467224, - "learning_rate": 0.00014149552849908415, - "loss": 3.237, + "epoch": 7.660377358490566, + "grad_norm": 0.8126778602600098, + "learning_rate": 0.000140697247706422, + "loss": 3.2577, "step": 71050 }, { - "epoch": 7.652567000322893, - "grad_norm": 0.8479968905448914, - "learning_rate": 0.00014117228746902272, - "loss": 3.2428, + "epoch": 7.665768194070081, + "grad_norm": 0.8710419535636902, + "learning_rate": 0.0001403734484619536, + "loss": 3.2308, "step": 71100 }, { - "epoch": 7.657948552362502, - "grad_norm": 0.818569004535675, - "learning_rate": 0.0001408490464389613, - "loss": 3.2402, + "epoch": 7.671159029649596, + "grad_norm": 0.8106386065483093, + "learning_rate": 0.00014004964921748514, + "loss": 3.2225, "step": 71150 }, { - "epoch": 7.663330104402109, - "grad_norm": 0.8019593954086304, - "learning_rate": 0.00014052580540889988, - "loss": 3.2463, + "epoch": 7.67654986522911, + "grad_norm": 0.839568018913269, + "learning_rate": 0.00013972584997301671, + "loss": 3.2321, "step": 71200 }, { - "epoch": 7.668711656441718, - "grad_norm": 0.7790815234184265, - "learning_rate": 0.00014020256437883847, - "loss": 3.2355, + "epoch": 7.681940700808625, + "grad_norm": 0.8566945791244507, + "learning_rate": 0.0001394020507285483, + "loss": 3.2389, "step": 71250 }, { - "epoch": 7.674093208481326, - "grad_norm": 0.7565615177154541, - "learning_rate": 0.00013987932334877707, - "loss": 3.2197, + "epoch": 7.6873315363881405, + "grad_norm": 0.8562785387039185, + "learning_rate": 0.00013907825148407984, + "loss": 3.2319, "step": 71300 }, { - "epoch": 7.679474760520934, - "grad_norm": 0.782861590385437, - "learning_rate": 0.00013955608231871564, - "loss": 3.2444, + "epoch": 7.692722371967655, + "grad_norm": 0.8187358975410461, + "learning_rate": 0.00013875445223961142, + "loss": 3.2427, "step": 71350 }, { - "epoch": 7.684856312560543, - "grad_norm": 0.8096931576728821, - "learning_rate": 0.00013923284128865423, - "loss": 3.2483, + "epoch": 7.69811320754717, + "grad_norm": 0.8082362413406372, + "learning_rate": 0.000138430652995143, + "loss": 3.2448, "step": 71400 }, { - "epoch": 7.69023786460015, - "grad_norm": 0.7934433817863464, - "learning_rate": 0.00013890960025859283, - "loss": 3.2454, + "epoch": 7.703504043126685, + "grad_norm": 0.8942475318908691, + "learning_rate": 0.00013810685375067455, + "loss": 3.2331, "step": 71450 }, { - "epoch": 7.695619416639759, - "grad_norm": 0.7780919671058655, - "learning_rate": 0.0001385863592285314, - "loss": 3.236, + "epoch": 7.708894878706199, + "grad_norm": 0.8236886858940125, + "learning_rate": 0.00013778305450620613, + "loss": 3.2444, "step": 71500 }, { - "epoch": 7.7010009686793675, - "grad_norm": 0.7714535593986511, - "learning_rate": 0.00013826311819847, - "loss": 3.2354, + "epoch": 7.714285714285714, + "grad_norm": 0.860549807548523, + "learning_rate": 0.0001374592552617377, + "loss": 3.2318, "step": 71550 }, { - "epoch": 7.706382520718975, - "grad_norm": 0.7559123039245605, - "learning_rate": 0.00013793987716840858, - "loss": 3.2452, + "epoch": 7.719676549865229, + "grad_norm": 0.8225796818733215, + "learning_rate": 0.00013713545601726928, + "loss": 3.2357, "step": 71600 }, { - "epoch": 7.711764072758584, - "grad_norm": 0.7606051564216614, - "learning_rate": 0.00013761663613834715, - "loss": 3.2283, + "epoch": 7.725067385444744, + "grad_norm": 0.8761296272277832, + "learning_rate": 0.00013681165677280086, + "loss": 3.2266, "step": 71650 }, { - "epoch": 7.717145624798192, - "grad_norm": 0.7872258424758911, - "learning_rate": 0.00013729339510828572, - "loss": 3.2319, + "epoch": 7.730458221024259, + "grad_norm": 0.8161090612411499, + "learning_rate": 0.0001364878575283324, + "loss": 3.2535, "step": 71700 }, { - "epoch": 7.7225271768378, - "grad_norm": 0.7712429761886597, - "learning_rate": 0.0001369701540782243, - "loss": 3.2328, + "epoch": 7.735849056603773, + "grad_norm": 0.7882575392723083, + "learning_rate": 0.000136164058283864, + "loss": 3.2397, "step": 71750 }, { - "epoch": 7.727908728877408, - "grad_norm": 0.8140832185745239, - "learning_rate": 0.0001366469130481629, - "loss": 3.2216, + "epoch": 7.741239892183288, + "grad_norm": 0.8809931874275208, + "learning_rate": 0.00013584025903939557, + "loss": 3.2278, "step": 71800 }, { - "epoch": 7.733290280917016, - "grad_norm": 0.8168127536773682, - "learning_rate": 0.00013632367201810147, - "loss": 3.2295, + "epoch": 7.7466307277628035, + "grad_norm": 0.8188114762306213, + "learning_rate": 0.00013551645979492714, + "loss": 3.2244, "step": 71850 }, { - "epoch": 7.738671832956625, - "grad_norm": 0.8033936619758606, - "learning_rate": 0.0001360068958086413, - "loss": 3.2374, + "epoch": 7.752021563342318, + "grad_norm": 0.8257802128791809, + "learning_rate": 0.0001351926605504587, + "loss": 3.2463, "step": 71900 }, { - "epoch": 7.744053384996233, - "grad_norm": 0.9045777320861816, - "learning_rate": 0.00013568365477857988, - "loss": 3.2282, + "epoch": 7.757412398921833, + "grad_norm": 0.874371349811554, + "learning_rate": 0.00013486886130599027, + "loss": 3.24, "step": 71950 }, { - "epoch": 7.749434937035841, - "grad_norm": 0.7644913792610168, - "learning_rate": 0.00013536041374851847, - "loss": 3.2354, + "epoch": 7.762803234501348, + "grad_norm": 0.8590646982192993, + "learning_rate": 0.00013454506206152185, + "loss": 3.2425, "step": 72000 }, { - "epoch": 7.749434937035841, - "eval_accuracy": 0.38948757233701825, - "eval_loss": 3.3385231494903564, - "eval_runtime": 185.7025, - "eval_samples_per_second": 96.988, - "eval_steps_per_second": 6.063, + "epoch": 7.762803234501348, + "eval_accuracy": 0.38919529606879727, + "eval_loss": 3.338242292404175, + "eval_runtime": 183.9612, + "eval_samples_per_second": 97.907, + "eval_steps_per_second": 6.121, "step": 72000 }, { - "epoch": 7.754816489075449, - "grad_norm": 0.7797622680664062, - "learning_rate": 0.00013503717271845706, - "loss": 3.2282, + "epoch": 7.768194070080862, + "grad_norm": 0.800114631652832, + "learning_rate": 0.00013422126281705343, + "loss": 3.2321, "step": 72050 }, { - "epoch": 7.760198041115058, - "grad_norm": 0.7901070713996887, - "learning_rate": 0.00013471393168839563, - "loss": 3.2469, + "epoch": 7.773584905660377, + "grad_norm": 0.8323007822036743, + "learning_rate": 0.00013389746357258498, + "loss": 3.242, "step": 72100 }, { - "epoch": 7.765579593154666, - "grad_norm": 0.8164810538291931, - "learning_rate": 0.00013439069065833423, - "loss": 3.2479, + "epoch": 7.7789757412398925, + "grad_norm": 0.7954844236373901, + "learning_rate": 0.00013357366432811656, + "loss": 3.2409, "step": 72150 }, { - "epoch": 7.770961145194274, - "grad_norm": 0.8166574835777283, - "learning_rate": 0.0001340674496282728, - "loss": 3.2366, + "epoch": 7.784366576819407, + "grad_norm": 0.830142617225647, + "learning_rate": 0.00013324986508364813, + "loss": 3.2646, "step": 72200 }, { - "epoch": 7.776342697233883, - "grad_norm": 0.7946748733520508, - "learning_rate": 0.0001337442085982114, - "loss": 3.2279, + "epoch": 7.789757412398922, + "grad_norm": 0.8352818489074707, + "learning_rate": 0.0001329260658391797, + "loss": 3.2187, "step": 72250 }, { - "epoch": 7.78172424927349, - "grad_norm": 0.8163143396377563, - "learning_rate": 0.00013342096756814996, - "loss": 3.2278, + "epoch": 7.795148247978437, + "grad_norm": 0.8565665483474731, + "learning_rate": 0.00013260226659471126, + "loss": 3.2451, "step": 72300 }, { - "epoch": 7.787105801313099, - "grad_norm": 0.7848666906356812, - "learning_rate": 0.00013309772653808855, - "loss": 3.2493, + "epoch": 7.800539083557951, + "grad_norm": 0.8777026534080505, + "learning_rate": 0.00013227846735024284, + "loss": 3.238, "step": 72350 }, { - "epoch": 7.792487353352707, - "grad_norm": 0.7951918840408325, - "learning_rate": 0.00013277448550802715, - "loss": 3.2208, + "epoch": 7.8059299191374665, + "grad_norm": 0.9060974717140198, + "learning_rate": 0.00013195466810577442, + "loss": 3.2376, "step": 72400 }, { - "epoch": 7.797868905392315, - "grad_norm": 0.7784159779548645, - "learning_rate": 0.0001324512444779657, - "loss": 3.2259, + "epoch": 7.811320754716981, + "grad_norm": 0.7934736609458923, + "learning_rate": 0.000131630868861306, + "loss": 3.2418, "step": 72450 }, { - "epoch": 7.803250457431924, - "grad_norm": 0.8353852033615112, - "learning_rate": 0.0001321280034479043, - "loss": 3.2519, + "epoch": 7.816711590296496, + "grad_norm": 0.8556714653968811, + "learning_rate": 0.00013130706961683754, + "loss": 3.2286, "step": 72500 }, { - "epoch": 7.808632009471531, - "grad_norm": 0.7999012470245361, - "learning_rate": 0.0001318047624178429, - "loss": 3.2405, + "epoch": 7.822102425876011, + "grad_norm": 0.8331558704376221, + "learning_rate": 0.00013098327037236912, + "loss": 3.2158, "step": 72550 }, { - "epoch": 7.81401356151114, - "grad_norm": 0.8065581321716309, - "learning_rate": 0.0001314815213877815, - "loss": 3.2377, + "epoch": 7.827493261455525, + "grad_norm": 0.79069584608078, + "learning_rate": 0.0001306594711279007, + "loss": 3.2468, "step": 72600 }, { - "epoch": 7.819395113550748, - "grad_norm": 0.7664511799812317, - "learning_rate": 0.00013115828035772007, - "loss": 3.2396, + "epoch": 7.83288409703504, + "grad_norm": 0.8664618134498596, + "learning_rate": 0.00013033567188343225, + "loss": 3.2358, "step": 72650 }, { - "epoch": 7.824776665590356, - "grad_norm": 0.8212724924087524, - "learning_rate": 0.00013083503932765866, - "loss": 3.2414, + "epoch": 7.8382749326145555, + "grad_norm": 0.8485777378082275, + "learning_rate": 0.00013001187263896383, + "loss": 3.2413, "step": 72700 }, { - "epoch": 7.830158217629965, - "grad_norm": 0.8192585110664368, - "learning_rate": 0.00013051179829759723, - "loss": 3.2474, + "epoch": 7.84366576819407, + "grad_norm": 0.9091561436653137, + "learning_rate": 0.0001296880733944954, + "loss": 3.2381, "step": 72750 }, { - "epoch": 7.835539769669572, - "grad_norm": 0.7615439295768738, - "learning_rate": 0.00013018855726753582, - "loss": 3.2471, + "epoch": 7.849056603773585, + "grad_norm": 0.7862276434898376, + "learning_rate": 0.00012936427415002696, + "loss": 3.2335, "step": 72800 }, { - "epoch": 7.840921321709181, - "grad_norm": 0.8273650407791138, - "learning_rate": 0.0001298653162374744, - "loss": 3.2322, + "epoch": 7.8544474393531, + "grad_norm": 0.8281474113464355, + "learning_rate": 0.00012904047490555853, + "loss": 3.2286, "step": 72850 }, { - "epoch": 7.846302873748789, - "grad_norm": 0.8271594047546387, - "learning_rate": 0.00012954207520741298, - "loss": 3.2416, + "epoch": 7.859838274932614, + "grad_norm": 0.7945317625999451, + "learning_rate": 0.0001287166756610901, + "loss": 3.2301, "step": 72900 }, { - "epoch": 7.851684425788397, - "grad_norm": 0.7770468592643738, - "learning_rate": 0.00012921883417735155, - "loss": 3.2437, + "epoch": 7.8652291105121295, + "grad_norm": 0.8411368727684021, + "learning_rate": 0.0001283928764166217, + "loss": 3.2244, "step": 72950 }, { - "epoch": 7.857065977828006, - "grad_norm": 0.8396730422973633, - "learning_rate": 0.00012889559314729015, - "loss": 3.2343, + "epoch": 7.870619946091644, + "grad_norm": 0.8305996060371399, + "learning_rate": 0.00012806907717215324, + "loss": 3.2255, "step": 73000 }, { - "epoch": 7.857065977828006, - "eval_accuracy": 0.3899009965751523, - "eval_loss": 3.3356707096099854, - "eval_runtime": 185.9451, - "eval_samples_per_second": 96.862, - "eval_steps_per_second": 6.056, + "epoch": 7.870619946091644, + "eval_accuracy": 0.3896523987678328, + "eval_loss": 3.3369224071502686, + "eval_runtime": 183.4047, + "eval_samples_per_second": 98.204, + "eval_steps_per_second": 6.139, "step": 73000 }, { - "epoch": 7.862447529867614, - "grad_norm": 0.8383583426475525, - "learning_rate": 0.00012857235211722874, + "epoch": 7.876010781671159, + "grad_norm": 0.8752392530441284, + "learning_rate": 0.00012774527792768482, "loss": 3.2385, "step": 73050 }, { - "epoch": 7.867829081907222, - "grad_norm": 0.8135746121406555, - "learning_rate": 0.00012824911108716734, - "loss": 3.2319, + "epoch": 7.881401617250674, + "grad_norm": 0.9205421209335327, + "learning_rate": 0.0001274214786832164, + "loss": 3.2175, "step": 73100 }, { - "epoch": 7.87321063394683, - "grad_norm": 0.7756822109222412, - "learning_rate": 0.0001279258700571059, - "loss": 3.2414, + "epoch": 7.886792452830189, + "grad_norm": 0.8184296488761902, + "learning_rate": 0.00012709767943874795, + "loss": 3.2264, "step": 73150 }, { - "epoch": 7.878592185986438, - "grad_norm": 0.7989129424095154, - "learning_rate": 0.0001276026290270445, - "loss": 3.2258, + "epoch": 7.892183288409703, + "grad_norm": 0.84311443567276, + "learning_rate": 0.00012677388019427952, + "loss": 3.243, "step": 73200 }, { - "epoch": 7.883973738026047, - "grad_norm": 0.7954966425895691, - "learning_rate": 0.00012727938799698307, - "loss": 3.2598, + "epoch": 7.8975741239892185, + "grad_norm": 0.7987281084060669, + "learning_rate": 0.0001264500809498111, + "loss": 3.2351, "step": 73250 }, { - "epoch": 7.889355290065655, - "grad_norm": 0.8074511885643005, - "learning_rate": 0.00012696261178752287, - "loss": 3.2205, + "epoch": 7.902964959568733, + "grad_norm": 0.8131510615348816, + "learning_rate": 0.00012612628170534268, + "loss": 3.2291, "step": 73300 }, { - "epoch": 7.894736842105263, - "grad_norm": 0.8159090876579285, - "learning_rate": 0.00012663937075746147, - "loss": 3.2612, + "epoch": 7.908355795148248, + "grad_norm": 0.8380711674690247, + "learning_rate": 0.00012580248246087426, + "loss": 3.2148, "step": 73350 }, { - "epoch": 7.900118394144871, - "grad_norm": 0.8149001598358154, - "learning_rate": 0.00012631612972740006, - "loss": 3.2642, + "epoch": 7.913746630727763, + "grad_norm": 0.8311099410057068, + "learning_rate": 0.00012548515920129518, + "loss": 3.259, "step": 73400 }, { - "epoch": 7.90549994618448, - "grad_norm": 0.7865789532661438, - "learning_rate": 0.00012599288869733863, - "loss": 3.2372, + "epoch": 7.919137466307277, + "grad_norm": 0.8204582333564758, + "learning_rate": 0.00012516135995682676, + "loss": 3.2398, "step": 73450 }, { - "epoch": 7.910881498224088, - "grad_norm": 0.805672824382782, - "learning_rate": 0.00012566964766727722, - "loss": 3.232, + "epoch": 7.9245283018867925, + "grad_norm": 0.8196551203727722, + "learning_rate": 0.0001248375607123583, + "loss": 3.2513, "step": 73500 }, { - "epoch": 7.916263050263696, - "grad_norm": 0.7899633049964905, - "learning_rate": 0.00012534640663721582, - "loss": 3.2377, + "epoch": 7.929919137466308, + "grad_norm": 0.8708674311637878, + "learning_rate": 0.0001245137614678899, + "loss": 3.2424, "step": 73550 }, { - "epoch": 7.921644602303305, - "grad_norm": 0.795754611492157, - "learning_rate": 0.00012502316560715439, - "loss": 3.2179, + "epoch": 7.935309973045822, + "grad_norm": 0.8819312453269958, + "learning_rate": 0.00012418996222342147, + "loss": 3.241, "step": 73600 }, { - "epoch": 7.927026154342912, - "grad_norm": 0.7938458323478699, - "learning_rate": 0.00012469992457709298, - "loss": 3.2383, + "epoch": 7.940700808625337, + "grad_norm": 0.8691334128379822, + "learning_rate": 0.00012386616297895304, + "loss": 3.251, "step": 73650 }, { - "epoch": 7.932407706382521, - "grad_norm": 0.7996332049369812, - "learning_rate": 0.00012437668354703158, - "loss": 3.2522, + "epoch": 7.946091644204852, + "grad_norm": 0.8479558825492859, + "learning_rate": 0.0001235423637344846, + "loss": 3.2569, "step": 73700 }, { - "epoch": 7.937789258422129, - "grad_norm": 0.7663394808769226, - "learning_rate": 0.00012405344251697014, - "loss": 3.2147, + "epoch": 7.951482479784366, + "grad_norm": 0.824360191822052, + "learning_rate": 0.00012321856449001617, + "loss": 3.2141, "step": 73750 }, { - "epoch": 7.943170810461737, - "grad_norm": 0.7848943471908569, - "learning_rate": 0.00012373020148690874, - "loss": 3.2357, + "epoch": 7.9568733153638815, + "grad_norm": 0.8288909196853638, + "learning_rate": 0.00012289476524554775, + "loss": 3.2363, "step": 73800 }, { - "epoch": 7.948552362501346, - "grad_norm": 0.7804843187332153, - "learning_rate": 0.0001234069604568473, - "loss": 3.2358, + "epoch": 7.962264150943396, + "grad_norm": 0.8791062235832214, + "learning_rate": 0.00012257096600107933, + "loss": 3.2223, "step": 73850 }, { - "epoch": 7.953933914540953, - "grad_norm": 0.81059730052948, - "learning_rate": 0.0001230837194267859, - "loss": 3.2335, + "epoch": 7.967654986522911, + "grad_norm": 0.8680914044380188, + "learning_rate": 0.0001222471667566109, + "loss": 3.2292, "step": 73900 }, { - "epoch": 7.959315466580562, - "grad_norm": 0.8302901387214661, - "learning_rate": 0.00012276047839672447, - "loss": 3.2246, + "epoch": 7.973045822102426, + "grad_norm": 0.8580226898193359, + "learning_rate": 0.00012192336751214245, + "loss": 3.2387, "step": 73950 }, { - "epoch": 7.96469701862017, - "grad_norm": 0.793449878692627, - "learning_rate": 0.00012243723736666306, - "loss": 3.2224, + "epoch": 7.97843665768194, + "grad_norm": 0.8580893278121948, + "learning_rate": 0.00012159956826767403, + "loss": 3.2364, "step": 74000 }, { - "epoch": 7.96469701862017, - "eval_accuracy": 0.3902945373348164, - "eval_loss": 3.3304708003997803, - "eval_runtime": 185.3875, - "eval_samples_per_second": 97.153, - "eval_steps_per_second": 6.074, + "epoch": 7.97843665768194, + "eval_accuracy": 0.3899765103321834, + "eval_loss": 3.333406925201416, + "eval_runtime": 183.9362, + "eval_samples_per_second": 97.92, + "eval_steps_per_second": 6.122, "step": 74000 }, { - "epoch": 7.970078570659778, - "grad_norm": 0.8431436419487, - "learning_rate": 0.00012211399633660166, - "loss": 3.2265, + "epoch": 7.9838274932614555, + "grad_norm": 0.843823254108429, + "learning_rate": 0.00012127576902320561, + "loss": 3.2387, "step": 74050 }, { - "epoch": 7.975460122699387, - "grad_norm": 0.8346486687660217, - "learning_rate": 0.00012179075530654022, - "loss": 3.2429, + "epoch": 7.989218328840971, + "grad_norm": 0.8297356963157654, + "learning_rate": 0.00012095196977873717, + "loss": 3.2414, "step": 74100 }, { - "epoch": 7.980841674738995, - "grad_norm": 0.8443086743354797, - "learning_rate": 0.00012146751427647882, - "loss": 3.2296, + "epoch": 7.994609164420485, + "grad_norm": 0.8512176871299744, + "learning_rate": 0.00012062817053426874, + "loss": 3.2302, "step": 74150 }, { - "epoch": 7.986223226778603, - "grad_norm": 0.8074975609779358, - "learning_rate": 0.0001211442732464174, - "loss": 3.2325, + "epoch": 8.0, + "grad_norm": 1.710868239402771, + "learning_rate": 0.00012030437128980032, + "loss": 3.2452, "step": 74200 }, { - "epoch": 7.991604778818211, - "grad_norm": 0.7959332466125488, - "learning_rate": 0.00012082103221635598, - "loss": 3.2252, + "epoch": 8.005390835579515, + "grad_norm": 0.8137375712394714, + "learning_rate": 0.00011998057204533188, + "loss": 3.1626, "step": 74250 }, { - "epoch": 7.996986330857819, - "grad_norm": 0.7904613018035889, - "learning_rate": 0.00012049779118629456, - "loss": 3.2217, + "epoch": 8.01078167115903, + "grad_norm": 0.8332458138465881, + "learning_rate": 0.00011965677280086346, + "loss": 3.1565, "step": 74300 }, { - "epoch": 8.002367882897428, - "grad_norm": 0.830864429473877, - "learning_rate": 0.00012017455015623316, - "loss": 3.2092, + "epoch": 8.016172506738544, + "grad_norm": 0.8537135720252991, + "learning_rate": 0.00011933297355639502, + "loss": 3.1415, "step": 74350 }, { - "epoch": 8.007749434937036, - "grad_norm": 0.8585742712020874, - "learning_rate": 0.00011985130912617175, - "loss": 3.151, + "epoch": 8.021563342318059, + "grad_norm": 0.8167550563812256, + "learning_rate": 0.00011900917431192659, + "loss": 3.1471, "step": 74400 }, { - "epoch": 8.013130986976645, - "grad_norm": 0.7843531370162964, - "learning_rate": 0.00011952806809611032, - "loss": 3.1576, + "epoch": 8.026954177897574, + "grad_norm": 0.7993401288986206, + "learning_rate": 0.00011868537506745816, + "loss": 3.1652, "step": 74450 }, { - "epoch": 8.018512539016251, - "grad_norm": 0.8142103552818298, - "learning_rate": 0.00011920482706604891, - "loss": 3.1468, + "epoch": 8.032345013477089, + "grad_norm": 0.8277669548988342, + "learning_rate": 0.00011836157582298974, + "loss": 3.1505, "step": 74500 }, { - "epoch": 8.02389409105586, - "grad_norm": 0.7832826972007751, - "learning_rate": 0.0001188815860359875, - "loss": 3.1542, + "epoch": 8.037735849056604, + "grad_norm": 0.8260676860809326, + "learning_rate": 0.00011803777657852132, + "loss": 3.1681, "step": 74550 }, { - "epoch": 8.029275643095469, - "grad_norm": 0.825363278388977, - "learning_rate": 0.00011855834500592608, - "loss": 3.1713, + "epoch": 8.04312668463612, + "grad_norm": 0.8829268217086792, + "learning_rate": 0.00011771397733405287, + "loss": 3.1606, "step": 74600 }, { - "epoch": 8.034657195135077, - "grad_norm": 0.8069754242897034, - "learning_rate": 0.00011823510397586466, - "loss": 3.1709, + "epoch": 8.048517520215633, + "grad_norm": 0.8338451385498047, + "learning_rate": 0.00011739017808958445, + "loss": 3.1549, "step": 74650 }, { - "epoch": 8.040038747174686, - "grad_norm": 0.8154868483543396, - "learning_rate": 0.00011791186294580325, - "loss": 3.1677, + "epoch": 8.053908355795148, + "grad_norm": 0.8860074281692505, + "learning_rate": 0.00011706637884511602, + "loss": 3.1636, "step": 74700 }, { - "epoch": 8.045420299214294, - "grad_norm": 0.8420593738555908, - "learning_rate": 0.00011758862191574182, - "loss": 3.1661, + "epoch": 8.059299191374663, + "grad_norm": 0.8655481338500977, + "learning_rate": 0.0001167425796006476, + "loss": 3.1584, "step": 74750 }, { - "epoch": 8.050801851253901, - "grad_norm": 0.7894124388694763, - "learning_rate": 0.00011726538088568041, - "loss": 3.1623, + "epoch": 8.064690026954178, + "grad_norm": 0.8550511002540588, + "learning_rate": 0.00011641878035617915, + "loss": 3.1629, "step": 74800 }, { - "epoch": 8.05618340329351, - "grad_norm": 0.7732349038124084, - "learning_rate": 0.000116942139855619, - "loss": 3.1625, + "epoch": 8.070080862533693, + "grad_norm": 0.8494965434074402, + "learning_rate": 0.00011609498111171073, + "loss": 3.1723, "step": 74850 }, { - "epoch": 8.061564955333118, - "grad_norm": 0.7911288738250732, - "learning_rate": 0.00011661889882555759, - "loss": 3.1792, + "epoch": 8.075471698113208, + "grad_norm": 0.8265563249588013, + "learning_rate": 0.00011577118186724231, + "loss": 3.1505, "step": 74900 }, { - "epoch": 8.066946507372727, - "grad_norm": 0.7990174293518066, - "learning_rate": 0.00011629565779549616, - "loss": 3.1663, + "epoch": 8.080862533692722, + "grad_norm": 0.8405255675315857, + "learning_rate": 0.00011544738262277387, + "loss": 3.1527, "step": 74950 }, { - "epoch": 8.072328059412335, - "grad_norm": 0.8296521902084351, - "learning_rate": 0.00011597241676543475, - "loss": 3.1624, + "epoch": 8.086253369272237, + "grad_norm": 0.8647146821022034, + "learning_rate": 0.00011512358337830544, + "loss": 3.1645, "step": 75000 }, { - "epoch": 8.072328059412335, - "eval_accuracy": 0.390195771859793, - "eval_loss": 3.338721513748169, - "eval_runtime": 185.4374, - "eval_samples_per_second": 97.127, - "eval_steps_per_second": 6.072, + "epoch": 8.086253369272237, + "eval_accuracy": 0.3898099454551638, + "eval_loss": 3.3394393920898438, + "eval_runtime": 183.6112, + "eval_samples_per_second": 98.093, + "eval_steps_per_second": 6.133, "step": 75000 }, { - "epoch": 8.077709611451942, - "grad_norm": 0.8073539733886719, - "learning_rate": 0.00011564917573537335, - "loss": 3.1724, + "epoch": 8.091644204851752, + "grad_norm": 0.9160317182540894, + "learning_rate": 0.00011479978413383701, + "loss": 3.1715, "step": 75050 }, { - "epoch": 8.08309116349155, - "grad_norm": 0.8138803839683533, - "learning_rate": 0.00011532593470531191, - "loss": 3.1767, + "epoch": 8.097035040431267, + "grad_norm": 0.7973936200141907, + "learning_rate": 0.00011447598488936858, + "loss": 3.1577, "step": 75100 }, { - "epoch": 8.088472715531159, - "grad_norm": 0.8345963358879089, - "learning_rate": 0.0001150026936752505, - "loss": 3.1701, + "epoch": 8.102425876010782, + "grad_norm": 0.825989842414856, + "learning_rate": 0.00011415218564490016, + "loss": 3.178, "step": 75150 }, { - "epoch": 8.093854267570768, - "grad_norm": 0.8031169176101685, - "learning_rate": 0.00011467945264518909, - "loss": 3.1608, + "epoch": 8.107816711590296, + "grad_norm": 0.8714901208877563, + "learning_rate": 0.00011382838640043172, + "loss": 3.1699, "step": 75200 }, { - "epoch": 8.099235819610376, - "grad_norm": 0.8811976909637451, - "learning_rate": 0.00011435621161512766, - "loss": 3.1759, + "epoch": 8.11320754716981, + "grad_norm": 0.8460976481437683, + "learning_rate": 0.00011350458715596328, + "loss": 3.1599, "step": 75250 }, { - "epoch": 8.104617371649983, - "grad_norm": 0.8665459156036377, - "learning_rate": 0.00011403297058506625, - "loss": 3.1649, + "epoch": 8.118598382749326, + "grad_norm": 0.8299224972724915, + "learning_rate": 0.00011318078791149486, + "loss": 3.1705, "step": 75300 }, { - "epoch": 8.109998923689592, - "grad_norm": 0.8429995775222778, - "learning_rate": 0.00011370972955500485, - "loss": 3.1501, + "epoch": 8.123989218328841, + "grad_norm": 0.878569483757019, + "learning_rate": 0.00011285698866702644, + "loss": 3.1724, "step": 75350 }, { - "epoch": 8.1153804757292, - "grad_norm": 0.7925933599472046, - "learning_rate": 0.00011338648852494343, - "loss": 3.1473, + "epoch": 8.129380053908356, + "grad_norm": 0.8617807030677795, + "learning_rate": 0.00011253318942255802, + "loss": 3.1723, "step": 75400 }, { - "epoch": 8.120762027768809, - "grad_norm": 0.806531548500061, - "learning_rate": 0.00011306324749488201, - "loss": 3.1832, + "epoch": 8.134770889487871, + "grad_norm": 0.8983737826347351, + "learning_rate": 0.00011220939017808957, + "loss": 3.1759, "step": 75450 }, { - "epoch": 8.126143579808417, - "grad_norm": 0.8357629179954529, - "learning_rate": 0.00011274000646482059, - "loss": 3.1748, + "epoch": 8.140161725067385, + "grad_norm": 0.8343617916107178, + "learning_rate": 0.00011188559093362115, + "loss": 3.1481, "step": 75500 }, { - "epoch": 8.131525131848026, - "grad_norm": 0.8737144470214844, - "learning_rate": 0.00011241676543475918, - "loss": 3.1739, + "epoch": 8.1455525606469, + "grad_norm": 0.8640062808990479, + "learning_rate": 0.00011156179168915272, + "loss": 3.1828, "step": 75550 }, { - "epoch": 8.136906683887632, - "grad_norm": 0.8199858665466309, - "learning_rate": 0.00011209352440469775, - "loss": 3.1759, + "epoch": 8.150943396226415, + "grad_norm": 0.8575279116630554, + "learning_rate": 0.00011123799244468429, + "loss": 3.1583, "step": 75600 }, { - "epoch": 8.142288235927241, - "grad_norm": 0.7963452339172363, - "learning_rate": 0.00011177028337463635, - "loss": 3.1703, + "epoch": 8.15633423180593, + "grad_norm": 0.8382391929626465, + "learning_rate": 0.00011091419320021585, + "loss": 3.1735, "step": 75650 }, { - "epoch": 8.14766978796685, - "grad_norm": 0.8108802437782288, - "learning_rate": 0.00011144704234457493, - "loss": 3.1827, + "epoch": 8.161725067385445, + "grad_norm": 0.8406264185905457, + "learning_rate": 0.00011059039395574743, + "loss": 3.1957, "step": 75700 }, { - "epoch": 8.153051340006458, - "grad_norm": 0.8074478507041931, - "learning_rate": 0.00011112380131451351, - "loss": 3.1549, + "epoch": 8.167115902964959, + "grad_norm": 0.8683697581291199, + "learning_rate": 0.00011026659471127899, + "loss": 3.1723, "step": 75750 }, { - "epoch": 8.158432892046067, - "grad_norm": 0.8079409599304199, - "learning_rate": 0.00011080056028445209, - "loss": 3.1839, + "epoch": 8.172506738544474, + "grad_norm": 0.8310422897338867, + "learning_rate": 0.00010994279546681057, + "loss": 3.1679, "step": 75800 }, { - "epoch": 8.163814444085673, - "grad_norm": 0.8951064944267273, - "learning_rate": 0.00011047731925439068, - "loss": 3.1682, + "epoch": 8.177897574123989, + "grad_norm": 0.8195992708206177, + "learning_rate": 0.00010961899622234213, + "loss": 3.1575, "step": 75850 }, { - "epoch": 8.169195996125282, - "grad_norm": 0.8033889532089233, - "learning_rate": 0.00011015407822432928, - "loss": 3.1732, + "epoch": 8.183288409703504, + "grad_norm": 0.8489004373550415, + "learning_rate": 0.00010929519697787371, + "loss": 3.1935, "step": 75900 }, { - "epoch": 8.17457754816489, - "grad_norm": 0.8481862545013428, - "learning_rate": 0.00010983083719426785, - "loss": 3.1711, + "epoch": 8.18867924528302, + "grad_norm": 0.8514471054077148, + "learning_rate": 0.00010897139773340528, + "loss": 3.153, "step": 75950 }, { - "epoch": 8.1799591002045, - "grad_norm": 0.8201789855957031, - "learning_rate": 0.00010950759616420644, - "loss": 3.1815, + "epoch": 8.194070080862534, + "grad_norm": 0.8518810868263245, + "learning_rate": 0.00010864759848893685, + "loss": 3.1706, "step": 76000 }, { - "epoch": 8.1799591002045, - "eval_accuracy": 0.39044654272486884, - "eval_loss": 3.3347246646881104, - "eval_runtime": 185.7806, - "eval_samples_per_second": 96.948, - "eval_steps_per_second": 6.061, + "epoch": 8.194070080862534, + "eval_accuracy": 0.39030138246674867, + "eval_loss": 3.336826801300049, + "eval_runtime": 183.7013, + "eval_samples_per_second": 98.045, + "eval_steps_per_second": 6.13, "step": 76000 }, { - "epoch": 8.185340652244108, - "grad_norm": 0.7705520391464233, - "learning_rate": 0.00010918435513414502, - "loss": 3.1902, + "epoch": 8.199460916442048, + "grad_norm": 0.8850810527801514, + "learning_rate": 0.00010832379924446842, + "loss": 3.1794, "step": 76050 }, { - "epoch": 8.190722204283716, - "grad_norm": 0.8561078310012817, - "learning_rate": 0.00010886111410408359, - "loss": 3.188, + "epoch": 8.204851752021563, + "grad_norm": 0.8909291625022888, + "learning_rate": 0.00010799999999999998, + "loss": 3.1893, "step": 76100 }, { - "epoch": 8.196103756323323, - "grad_norm": 0.8287132382392883, - "learning_rate": 0.00010853787307402218, - "loss": 3.176, + "epoch": 8.210242587601078, + "grad_norm": 0.862022876739502, + "learning_rate": 0.00010767620075553156, + "loss": 3.1766, "step": 76150 }, { - "epoch": 8.201485308362932, - "grad_norm": 0.8220933675765991, - "learning_rate": 0.00010821463204396078, - "loss": 3.1587, + "epoch": 8.215633423180593, + "grad_norm": 0.8716922402381897, + "learning_rate": 0.00010735240151106314, + "loss": 3.1806, "step": 76200 }, { - "epoch": 8.20686686040254, - "grad_norm": 0.8482598066329956, - "learning_rate": 0.00010789139101389935, - "loss": 3.1814, + "epoch": 8.221024258760108, + "grad_norm": 0.8505301475524902, + "learning_rate": 0.00010702860226659472, + "loss": 3.1685, "step": 76250 }, { - "epoch": 8.212248412442149, - "grad_norm": 0.7827437520027161, - "learning_rate": 0.00010757461480443917, - "loss": 3.184, + "epoch": 8.226415094339623, + "grad_norm": 0.8627468943595886, + "learning_rate": 0.00010670480302212627, + "loss": 3.1764, "step": 76300 }, { - "epoch": 8.217629964481757, - "grad_norm": 0.8484346270561218, - "learning_rate": 0.00010725137377437776, - "loss": 3.176, + "epoch": 8.231805929919137, + "grad_norm": 0.8449823260307312, + "learning_rate": 0.00010638100377765784, + "loss": 3.1821, "step": 76350 }, { - "epoch": 8.223011516521364, - "grad_norm": 0.8320658206939697, - "learning_rate": 0.00010692813274431633, - "loss": 3.1593, + "epoch": 8.237196765498652, + "grad_norm": 0.9091751575469971, + "learning_rate": 0.00010605720453318942, + "loss": 3.19, "step": 76400 }, { - "epoch": 8.228393068560973, - "grad_norm": 0.8569603562355042, - "learning_rate": 0.00010660489171425492, - "loss": 3.1722, + "epoch": 8.242587601078167, + "grad_norm": 0.8580686450004578, + "learning_rate": 0.00010573340528872099, + "loss": 3.1607, "step": 76450 }, { - "epoch": 8.233774620600581, - "grad_norm": 0.8456915020942688, - "learning_rate": 0.0001062816506841935, - "loss": 3.1646, + "epoch": 8.247978436657682, + "grad_norm": 0.8650367856025696, + "learning_rate": 0.00010540960604425255, + "loss": 3.1667, "step": 76500 }, { - "epoch": 8.23915617264019, - "grad_norm": 0.8972082138061523, - "learning_rate": 0.00010595840965413209, - "loss": 3.1839, + "epoch": 8.253369272237197, + "grad_norm": 0.8627279996871948, + "learning_rate": 0.00010508580679978413, + "loss": 3.1607, "step": 76550 }, { - "epoch": 8.244537724679798, - "grad_norm": 0.7962077260017395, - "learning_rate": 0.00010563516862407067, - "loss": 3.1742, + "epoch": 8.25876010781671, + "grad_norm": 0.8591924905776978, + "learning_rate": 0.00010476200755531569, + "loss": 3.1841, "step": 76600 }, { - "epoch": 8.249919276719407, - "grad_norm": 0.8209040760993958, - "learning_rate": 0.00010531192759400926, - "loss": 3.1883, + "epoch": 8.264150943396226, + "grad_norm": 0.8910982012748718, + "learning_rate": 0.00010443820831084727, + "loss": 3.1651, "step": 76650 }, { - "epoch": 8.255300828759013, - "grad_norm": 0.8176620602607727, - "learning_rate": 0.00010498868656394784, - "loss": 3.1758, + "epoch": 8.269541778975741, + "grad_norm": 0.8738256692886353, + "learning_rate": 0.00010411440906637883, + "loss": 3.1976, "step": 76700 }, { - "epoch": 8.260682380798622, - "grad_norm": 0.8269286751747131, - "learning_rate": 0.00010466544553388642, - "loss": 3.1552, + "epoch": 8.274932614555256, + "grad_norm": 0.8481124639511108, + "learning_rate": 0.0001037906098219104, + "loss": 3.1799, "step": 76750 }, { - "epoch": 8.26606393283823, - "grad_norm": 0.8178252577781677, - "learning_rate": 0.000104342204503825, - "loss": 3.1911, + "epoch": 8.280323450134771, + "grad_norm": 0.8235103487968445, + "learning_rate": 0.00010346681057744197, + "loss": 3.1631, "step": 76800 }, { - "epoch": 8.27144548487784, - "grad_norm": 0.8506616950035095, - "learning_rate": 0.0001040189634737636, - "loss": 3.1894, + "epoch": 8.285714285714286, + "grad_norm": 0.8965120911598206, + "learning_rate": 0.00010314301133297355, + "loss": 3.2055, "step": 76850 }, { - "epoch": 8.276827036917448, - "grad_norm": 0.8493346571922302, - "learning_rate": 0.00010369572244370217, - "loss": 3.179, + "epoch": 8.2911051212938, + "grad_norm": 0.8587255477905273, + "learning_rate": 0.0001028192120885051, + "loss": 3.1898, "step": 76900 }, { - "epoch": 8.282208588957054, - "grad_norm": 0.826920211315155, - "learning_rate": 0.00010337248141364076, - "loss": 3.176, + "epoch": 8.296495956873315, + "grad_norm": 0.8255211114883423, + "learning_rate": 0.00010249541284403668, + "loss": 3.174, "step": 76950 }, { - "epoch": 8.287590140996663, - "grad_norm": 0.7999362349510193, - "learning_rate": 0.00010304924038357936, - "loss": 3.1758, + "epoch": 8.30188679245283, + "grad_norm": 0.8812569379806519, + "learning_rate": 0.00010217161359956826, + "loss": 3.184, "step": 77000 }, { - "epoch": 8.287590140996663, - "eval_accuracy": 0.39078890797437227, - "eval_loss": 3.3327534198760986, - "eval_runtime": 185.3503, - "eval_samples_per_second": 97.173, - "eval_steps_per_second": 6.075, + "epoch": 8.30188679245283, + "eval_accuracy": 0.39023510420518187, + "eval_loss": 3.3336145877838135, + "eval_runtime": 183.6293, + "eval_samples_per_second": 98.083, + "eval_steps_per_second": 6.132, "step": 77000 }, { - "epoch": 8.292971693036272, - "grad_norm": 0.9111582040786743, - "learning_rate": 0.00010272599935351792, + "epoch": 8.307277628032345, + "grad_norm": 0.8371959924697876, + "learning_rate": 0.00010184781435509984, "loss": 3.178, "step": 77050 }, { - "epoch": 8.29835324507588, - "grad_norm": 0.8611065745353699, - "learning_rate": 0.00010240275832345652, - "loss": 3.1736, + "epoch": 8.31266846361186, + "grad_norm": 0.8691743016242981, + "learning_rate": 0.00010152401511063141, + "loss": 3.1812, "step": 77100 }, { - "epoch": 8.303734797115489, - "grad_norm": 0.7752180099487305, - "learning_rate": 0.0001020795172933951, - "loss": 3.1778, + "epoch": 8.318059299191376, + "grad_norm": 0.8494551181793213, + "learning_rate": 0.00010120021586616296, + "loss": 3.1846, "step": 77150 }, { - "epoch": 8.309116349155097, - "grad_norm": 0.7995375394821167, - "learning_rate": 0.0001017562762633337, - "loss": 3.2014, + "epoch": 8.323450134770889, + "grad_norm": 0.8436992168426514, + "learning_rate": 0.00010087641662169454, + "loss": 3.1926, "step": 77200 }, { - "epoch": 8.314497901194704, - "grad_norm": 0.8144875168800354, - "learning_rate": 0.00010143303523327226, - "loss": 3.1808, + "epoch": 8.328840970350404, + "grad_norm": 0.8713855147361755, + "learning_rate": 0.00010055261737722612, + "loss": 3.1935, "step": 77250 }, { - "epoch": 8.319879453234313, - "grad_norm": 0.8051859140396118, - "learning_rate": 0.00010110979420321086, - "loss": 3.1826, + "epoch": 8.33423180592992, + "grad_norm": 0.8606102466583252, + "learning_rate": 0.00010022881813275768, + "loss": 3.1791, "step": 77300 }, { - "epoch": 8.325261005273921, - "grad_norm": 0.7994803786277771, - "learning_rate": 0.00010078655317314944, - "loss": 3.1801, + "epoch": 8.339622641509434, + "grad_norm": 0.8840133547782898, + "learning_rate": 9.990501888828925e-05, + "loss": 3.1774, "step": 77350 }, { - "epoch": 8.33064255731353, - "grad_norm": 0.8555433750152588, - "learning_rate": 0.00010046331214308802, - "loss": 3.166, + "epoch": 8.34501347708895, + "grad_norm": 0.8631689548492432, + "learning_rate": 9.958121964382083e-05, + "loss": 3.1958, "step": 77400 }, { - "epoch": 8.336024109353138, - "grad_norm": 0.8833298087120056, - "learning_rate": 0.0001001400711130266, - "loss": 3.2049, + "epoch": 8.350404312668463, + "grad_norm": 0.8530083298683167, + "learning_rate": 9.926389638424175e-05, + "loss": 3.1735, "step": 77450 }, { - "epoch": 8.341405661392745, - "grad_norm": 0.8662601113319397, - "learning_rate": 9.98168300829652e-05, - "loss": 3.1979, + "epoch": 8.355795148247978, + "grad_norm": 0.8499566316604614, + "learning_rate": 9.894009713977333e-05, + "loss": 3.1802, "step": 77500 }, { - "epoch": 8.346787213432354, - "grad_norm": 0.8390443325042725, - "learning_rate": 9.949358905290376e-05, - "loss": 3.1907, + "epoch": 8.361185983827493, + "grad_norm": 0.825239360332489, + "learning_rate": 9.86162978953049e-05, + "loss": 3.1909, "step": 77550 }, { - "epoch": 8.352168765471962, - "grad_norm": 0.8560362458229065, - "learning_rate": 9.917034802284236e-05, - "loss": 3.1891, + "epoch": 8.366576819407008, + "grad_norm": 0.8787322640419006, + "learning_rate": 9.829249865083647e-05, + "loss": 3.1879, "step": 77600 }, { - "epoch": 8.35755031751157, - "grad_norm": 0.8235760927200317, - "learning_rate": 9.884710699278094e-05, - "loss": 3.1708, + "epoch": 8.371967654986523, + "grad_norm": 0.903548538684845, + "learning_rate": 9.796869940636805e-05, + "loss": 3.178, "step": 77650 }, { - "epoch": 8.36293186955118, - "grad_norm": 0.8120622634887695, - "learning_rate": 9.852386596271953e-05, - "loss": 3.1958, + "epoch": 8.377358490566039, + "grad_norm": 0.8681001663208008, + "learning_rate": 9.764490016189961e-05, + "loss": 3.1936, "step": 77700 }, { - "epoch": 8.368313421590786, - "grad_norm": 0.7914016246795654, - "learning_rate": 9.82006249326581e-05, - "loss": 3.1867, + "epoch": 8.382749326145552, + "grad_norm": 0.8955147862434387, + "learning_rate": 9.732110091743119e-05, + "loss": 3.1737, "step": 77750 }, { - "epoch": 8.373694973630395, - "grad_norm": 0.86472088098526, - "learning_rate": 9.78773839025967e-05, - "loss": 3.1636, + "epoch": 8.388140161725067, + "grad_norm": 0.861054539680481, + "learning_rate": 9.699730167296275e-05, + "loss": 3.1939, "step": 77800 }, { - "epoch": 8.379076525670003, - "grad_norm": 0.8104182481765747, - "learning_rate": 9.755414287253529e-05, - "loss": 3.1801, + "epoch": 8.393530997304582, + "grad_norm": 0.90186607837677, + "learning_rate": 9.667350242849433e-05, + "loss": 3.1821, "step": 77850 }, { - "epoch": 8.384458077709612, - "grad_norm": 0.8004899621009827, - "learning_rate": 9.723090184247386e-05, - "loss": 3.2015, + "epoch": 8.398921832884097, + "grad_norm": 0.8513303399085999, + "learning_rate": 9.63497031840259e-05, + "loss": 3.184, "step": 77900 }, { - "epoch": 8.38983962974922, - "grad_norm": 0.8116545081138611, - "learning_rate": 9.690766081241245e-05, - "loss": 3.1839, + "epoch": 8.404312668463612, + "grad_norm": 0.8518136739730835, + "learning_rate": 9.602590393955746e-05, + "loss": 3.1806, "step": 77950 }, { - "epoch": 8.395221181788829, - "grad_norm": 0.8094545006752014, - "learning_rate": 9.658441978235103e-05, - "loss": 3.1992, + "epoch": 8.409703504043126, + "grad_norm": 0.8534890413284302, + "learning_rate": 9.570210469508904e-05, + "loss": 3.1718, "step": 78000 }, { - "epoch": 8.395221181788829, - "eval_accuracy": 0.3910330510132914, - "eval_loss": 3.3307039737701416, - "eval_runtime": 184.6192, - "eval_samples_per_second": 97.558, - "eval_steps_per_second": 6.099, + "epoch": 8.409703504043126, + "eval_accuracy": 0.3906564601041264, + "eval_loss": 3.3315069675445557, + "eval_runtime": 183.818, + "eval_samples_per_second": 97.983, + "eval_steps_per_second": 6.126, "step": 78000 }, { - "epoch": 8.400602733828435, - "grad_norm": 0.8134550452232361, - "learning_rate": 9.626117875228961e-05, - "loss": 3.1925, + "epoch": 8.415094339622641, + "grad_norm": 0.8671660423278809, + "learning_rate": 9.537830545062061e-05, + "loss": 3.1825, "step": 78050 }, { - "epoch": 8.405984285868044, - "grad_norm": 0.8060505390167236, - "learning_rate": 9.59379377222282e-05, - "loss": 3.1681, + "epoch": 8.420485175202156, + "grad_norm": 0.9098289012908936, + "learning_rate": 9.505450620615217e-05, + "loss": 3.1816, "step": 78100 }, { - "epoch": 8.411365837907653, - "grad_norm": 0.8342357873916626, - "learning_rate": 9.561469669216679e-05, - "loss": 3.1893, + "epoch": 8.425876010781671, + "grad_norm": 0.9419325590133667, + "learning_rate": 9.473070696168374e-05, + "loss": 3.1814, "step": 78150 }, { - "epoch": 8.416747389947261, - "grad_norm": 0.803632378578186, - "learning_rate": 9.529145566210537e-05, - "loss": 3.1694, + "epoch": 8.431266846361186, + "grad_norm": 0.8998332619667053, + "learning_rate": 9.440690771721532e-05, + "loss": 3.1702, "step": 78200 }, { - "epoch": 8.42212894198687, - "grad_norm": 0.8132495284080505, - "learning_rate": 9.496821463204395e-05, - "loss": 3.181, + "epoch": 8.436657681940702, + "grad_norm": 0.8600930571556091, + "learning_rate": 9.40831084727469e-05, + "loss": 3.1846, "step": 78250 }, { - "epoch": 8.427510494026476, - "grad_norm": 0.7954928874969482, - "learning_rate": 9.464497360198253e-05, - "loss": 3.1813, + "epoch": 8.442048517520215, + "grad_norm": 0.8776827454566956, + "learning_rate": 9.375930922827845e-05, + "loss": 3.1775, "step": 78300 }, { - "epoch": 8.432892046066085, - "grad_norm": 0.8573002815246582, - "learning_rate": 9.432173257192113e-05, - "loss": 3.1867, + "epoch": 8.44743935309973, + "grad_norm": 0.8739205002784729, + "learning_rate": 9.343550998381003e-05, + "loss": 3.2013, "step": 78350 }, { - "epoch": 8.438273598105694, - "grad_norm": 0.8724850416183472, - "learning_rate": 9.39984915418597e-05, - "loss": 3.1723, + "epoch": 8.452830188679245, + "grad_norm": 0.8829511404037476, + "learning_rate": 9.31117107393416e-05, + "loss": 3.1902, "step": 78400 }, { - "epoch": 8.443655150145302, - "grad_norm": 0.825176477432251, - "learning_rate": 9.367525051179829e-05, - "loss": 3.1915, + "epoch": 8.45822102425876, + "grad_norm": 0.8830351829528809, + "learning_rate": 9.278791149487317e-05, + "loss": 3.1743, "step": 78450 }, { - "epoch": 8.44903670218491, - "grad_norm": 0.8631423115730286, - "learning_rate": 9.335200948173688e-05, - "loss": 3.189, + "epoch": 8.463611859838275, + "grad_norm": 0.8151218891143799, + "learning_rate": 9.246411225040475e-05, + "loss": 3.2046, "step": 78500 }, { - "epoch": 8.45441825422452, - "grad_norm": 0.8290612697601318, - "learning_rate": 9.302876845167545e-05, - "loss": 3.1749, + "epoch": 8.46900269541779, + "grad_norm": 0.8532697558403015, + "learning_rate": 9.214031300593631e-05, + "loss": 3.1741, "step": 78550 }, { - "epoch": 8.459799806264126, - "grad_norm": 0.8389089703559875, - "learning_rate": 9.270552742161403e-05, - "loss": 3.1706, + "epoch": 8.474393530997304, + "grad_norm": 0.8145749568939209, + "learning_rate": 9.181651376146787e-05, + "loss": 3.1797, "step": 78600 }, { - "epoch": 8.465181358303735, - "grad_norm": 0.8537444472312927, - "learning_rate": 9.238228639155263e-05, - "loss": 3.1814, + "epoch": 8.479784366576819, + "grad_norm": 0.8687123656272888, + "learning_rate": 9.149271451699945e-05, + "loss": 3.179, "step": 78650 }, { - "epoch": 8.470562910343343, - "grad_norm": 0.8286445140838623, - "learning_rate": 9.205904536149122e-05, - "loss": 3.1737, + "epoch": 8.485175202156334, + "grad_norm": 0.8689339756965637, + "learning_rate": 9.116891527253103e-05, + "loss": 3.2056, "step": 78700 }, { - "epoch": 8.475944462382952, - "grad_norm": 0.8667829632759094, - "learning_rate": 9.173580433142979e-05, - "loss": 3.1634, + "epoch": 8.49056603773585, + "grad_norm": 0.8794228434562683, + "learning_rate": 9.084511602806258e-05, + "loss": 3.1902, "step": 78750 }, { - "epoch": 8.48132601442256, - "grad_norm": 0.8203412294387817, - "learning_rate": 9.141256330136838e-05, - "loss": 3.1728, + "epoch": 8.495956873315365, + "grad_norm": 0.8810113072395325, + "learning_rate": 9.052131678359416e-05, + "loss": 3.1825, "step": 78800 }, { - "epoch": 8.486707566462167, - "grad_norm": 0.8149620294570923, - "learning_rate": 9.108932227130697e-05, - "loss": 3.1789, + "epoch": 8.501347708894878, + "grad_norm": 0.8873330354690552, + "learning_rate": 9.019751753912574e-05, + "loss": 3.1851, "step": 78850 }, { - "epoch": 8.492089118501776, - "grad_norm": 0.8609354496002197, - "learning_rate": 9.076608124124555e-05, - "loss": 3.1664, + "epoch": 8.506738544474393, + "grad_norm": 0.8767502307891846, + "learning_rate": 8.987371829465731e-05, + "loss": 3.18, "step": 78900 }, { - "epoch": 8.497470670541384, - "grad_norm": 0.8484876751899719, - "learning_rate": 9.044284021118413e-05, - "loss": 3.1736, + "epoch": 8.512129380053908, + "grad_norm": 0.8605179786682129, + "learning_rate": 8.954991905018886e-05, + "loss": 3.201, "step": 78950 }, { - "epoch": 8.502852222580993, - "grad_norm": 0.817091166973114, - "learning_rate": 9.011959918112272e-05, - "loss": 3.183, + "epoch": 8.517520215633423, + "grad_norm": 0.8646233677864075, + "learning_rate": 8.922611980572044e-05, + "loss": 3.1856, "step": 79000 }, { - "epoch": 8.502852222580993, - "eval_accuracy": 0.3913540116438954, - "eval_loss": 3.326340913772583, - "eval_runtime": 183.8032, - "eval_samples_per_second": 97.991, - "eval_steps_per_second": 6.126, + "epoch": 8.517520215633423, + "eval_accuracy": 0.3910653209209723, + "eval_loss": 3.3277745246887207, + "eval_runtime": 183.8981, + "eval_samples_per_second": 97.94, + "eval_steps_per_second": 6.123, "step": 79000 }, { - "epoch": 8.508233774620601, - "grad_norm": 0.846644401550293, - "learning_rate": 8.979635815106129e-05, - "loss": 3.1844, + "epoch": 8.522911051212938, + "grad_norm": 0.8412624001502991, + "learning_rate": 8.890232056125202e-05, + "loss": 3.1946, "step": 79050 }, { - "epoch": 8.513615326660208, - "grad_norm": 0.8385283350944519, - "learning_rate": 8.947311712099989e-05, - "loss": 3.196, + "epoch": 8.528301886792454, + "grad_norm": 0.8897676467895508, + "learning_rate": 8.85785213167836e-05, + "loss": 3.2009, "step": 79100 }, { - "epoch": 8.518996878699816, - "grad_norm": 0.8298741579055786, - "learning_rate": 8.914987609093847e-05, - "loss": 3.1814, + "epoch": 8.533692722371967, + "grad_norm": 0.8410210013389587, + "learning_rate": 8.825472207231516e-05, + "loss": 3.1751, "step": 79150 }, { - "epoch": 8.524378430739425, - "grad_norm": 0.8278105854988098, - "learning_rate": 8.882663506087706e-05, - "loss": 3.1983, + "epoch": 8.539083557951482, + "grad_norm": 0.8973972201347351, + "learning_rate": 8.793092282784672e-05, + "loss": 3.1806, "step": 79200 }, { - "epoch": 8.529759982779034, - "grad_norm": 0.8638647198677063, - "learning_rate": 8.850339403081563e-05, - "loss": 3.1806, + "epoch": 8.544474393530997, + "grad_norm": 0.8442065715789795, + "learning_rate": 8.76071235833783e-05, + "loss": 3.1718, "step": 79250 }, { - "epoch": 8.535141534818642, - "grad_norm": 0.8197671175003052, - "learning_rate": 8.818015300075422e-05, - "loss": 3.1808, + "epoch": 8.549865229110512, + "grad_norm": 0.926730751991272, + "learning_rate": 8.728332433890987e-05, + "loss": 3.1895, "step": 79300 }, { - "epoch": 8.54052308685825, - "grad_norm": 0.8648247122764587, - "learning_rate": 8.785691197069282e-05, - "loss": 3.2081, + "epoch": 8.555256064690028, + "grad_norm": 0.8862857222557068, + "learning_rate": 8.695952509444144e-05, + "loss": 3.1906, "step": 79350 }, { - "epoch": 8.545904638897857, - "grad_norm": 0.8894992470741272, - "learning_rate": 8.753367094063139e-05, - "loss": 3.1711, + "epoch": 8.560646900269543, + "grad_norm": 0.8473867774009705, + "learning_rate": 8.663572584997301e-05, + "loss": 3.1616, "step": 79400 }, { - "epoch": 8.551286190937466, - "grad_norm": 0.8495647311210632, - "learning_rate": 8.721042991056998e-05, - "loss": 3.1824, + "epoch": 8.566037735849056, + "grad_norm": 0.8909599184989929, + "learning_rate": 8.631840259039395e-05, + "loss": 3.195, "step": 79450 }, { - "epoch": 8.556667742977075, - "grad_norm": 0.8385526537895203, - "learning_rate": 8.688718888050856e-05, - "loss": 3.1831, + "epoch": 8.571428571428571, + "grad_norm": 0.8786565065383911, + "learning_rate": 8.599460334592551e-05, + "loss": 3.1852, "step": 79500 }, { - "epoch": 8.562049295016683, - "grad_norm": 0.8313244581222534, - "learning_rate": 8.656394785044713e-05, - "loss": 3.1923, + "epoch": 8.576819407008086, + "grad_norm": 0.8280052542686462, + "learning_rate": 8.567080410145709e-05, + "loss": 3.1833, "step": 79550 }, { - "epoch": 8.567430847056292, - "grad_norm": 0.7992787957191467, - "learning_rate": 8.624070682038572e-05, - "loss": 3.1883, + "epoch": 8.582210242587601, + "grad_norm": 0.9174842834472656, + "learning_rate": 8.534700485698867e-05, + "loss": 3.1853, "step": 79600 }, { - "epoch": 8.572812399095898, - "grad_norm": 0.8515611290931702, - "learning_rate": 8.591746579032432e-05, - "loss": 3.1855, + "epoch": 8.587601078167117, + "grad_norm": 0.9096667170524597, + "learning_rate": 8.502320561252023e-05, + "loss": 3.1649, "step": 79650 }, { - "epoch": 8.578193951135507, - "grad_norm": 0.8336965441703796, - "learning_rate": 8.55942247602629e-05, - "loss": 3.1975, + "epoch": 8.59299191374663, + "grad_norm": 0.8842190504074097, + "learning_rate": 8.469940636805181e-05, + "loss": 3.1896, "step": 79700 }, { - "epoch": 8.583575503175116, - "grad_norm": 0.8350300192832947, - "learning_rate": 8.527098373020148e-05, - "loss": 3.1877, + "epoch": 8.598382749326145, + "grad_norm": 0.9274055361747742, + "learning_rate": 8.437560712358337e-05, + "loss": 3.1992, "step": 79750 }, { - "epoch": 8.588957055214724, - "grad_norm": 0.8040255308151245, - "learning_rate": 8.494774270014006e-05, - "loss": 3.1902, + "epoch": 8.60377358490566, + "grad_norm": 0.8860254883766174, + "learning_rate": 8.405180787911494e-05, + "loss": 3.1794, "step": 79800 }, { - "epoch": 8.594338607254333, - "grad_norm": 0.8291066884994507, - "learning_rate": 8.462450167007866e-05, - "loss": 3.1798, + "epoch": 8.609164420485175, + "grad_norm": 0.8451575040817261, + "learning_rate": 8.372800863464651e-05, + "loss": 3.1644, "step": 79850 }, { - "epoch": 8.599720159293941, - "grad_norm": 0.8915792107582092, - "learning_rate": 8.430126064001722e-05, - "loss": 3.1781, + "epoch": 8.61455525606469, + "grad_norm": 0.8715153932571411, + "learning_rate": 8.340420939017809e-05, + "loss": 3.1715, "step": 79900 }, { - "epoch": 8.605101711333548, - "grad_norm": 0.8477835655212402, - "learning_rate": 8.397801960995582e-05, - "loss": 3.1845, + "epoch": 8.619946091644206, + "grad_norm": 0.8940669894218445, + "learning_rate": 8.308041014570964e-05, + "loss": 3.1943, "step": 79950 }, { - "epoch": 8.610483263373157, - "grad_norm": 0.8423581123352051, - "learning_rate": 8.36547785798944e-05, - "loss": 3.1571, + "epoch": 8.625336927223719, + "grad_norm": 0.8656378984451294, + "learning_rate": 8.275661090124122e-05, + "loss": 3.183, "step": 80000 }, { - "epoch": 8.610483263373157, - "eval_accuracy": 0.39168312124108545, - "eval_loss": 3.323802947998047, - "eval_runtime": 184.6741, - "eval_samples_per_second": 97.529, - "eval_steps_per_second": 6.097, + "epoch": 8.625336927223719, + "eval_accuracy": 0.3916346620531202, + "eval_loss": 3.3251895904541016, + "eval_runtime": 183.6111, + "eval_samples_per_second": 98.093, + "eval_steps_per_second": 6.133, "step": 80000 } ], "logging_steps": 50, - "max_steps": 92910, + "max_steps": 92750, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, @@ -11946,7 +11946,7 @@ "attributes": {} } }, - "total_flos": 6.68874156539904e+17, + "total_flos": 6.688553435136e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null